diff options
Diffstat (limited to 'fs')
180 files changed, 5597 insertions, 4288 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 64d44efad7a..f8fccaaad62 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -6,6 +6,10 @@ menu "File systems" if BLOCK +config FS_JOURNAL_INFO + bool + default n + source "fs/ext2/Kconfig" source "fs/ext3/Kconfig" source "fs/ext4/Kconfig" diff --git a/fs/afs/write.c b/fs/afs/write.c index c63a3c8beb7..5e15a21dbf9 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -671,7 +671,6 @@ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov, struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); ssize_t result; size_t count = iov_length(iov, nr_segs); - int ret; _enter("{%x.%u},{%zu},%lu,", vnode->fid.vid, vnode->fid.vnode, count, nr_segs); @@ -691,13 +690,6 @@ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov, return result; } - /* return error values for O_SYNC and IS_SYNC() */ - if (IS_SYNC(&vnode->vfs_inode) || iocb->ki_filp->f_flags & O_SYNC) { - ret = afs_fsync(iocb->ki_filp, dentry, 1); - if (ret < 0) - result = ret; - } - _leave(" = %zd", result); return result; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 38502c67987..79d2b1aa389 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -380,7 +380,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, down_write(¤t->mm->mmap_sem); current->mm->start_brk = do_mmap(NULL, 0, stack_size, PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN, + MAP_PRIVATE | MAP_ANONYMOUS | + MAP_UNINITIALIZED | MAP_GROWSDOWN, 0); if (IS_ERR_VALUE(current->mm->start_brk)) { diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 7bb3c020e57..402afe0a0bf 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -4,6 +4,7 @@ config BTRFS_FS select LIBCRC32C select ZLIB_INFLATE select ZLIB_DEFLATE + select FS_JOURNAL_INFO help Btrfs is a new filesystem with extents, writable snapshotting, support for multiple devices and many more features. diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 06550affbd2..77f759302e1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -909,7 +909,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, unsigned long last_index; int will_write; - will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) || + will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || (file->f_flags & O_DIRECT)); nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, @@ -1076,7 +1076,7 @@ out_nolock: if (err) num_written = err; - if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { + if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { trans = btrfs_start_transaction(root, 1); ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 4618516dd99..c2413561ea7 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -21,6 +21,7 @@ #include <linux/mount.h> #include <linux/statfs.h> #include <linux/ctype.h> +#include <linux/string.h> #include <linux/fs_struct.h> #include "internal.h" @@ -257,8 +258,7 @@ static ssize_t cachefiles_daemon_write(struct file *file, if (args == data) goto error; *args = '\0'; - for (args++; isspace(*args); args++) - continue; + args = skip_spaces(++args); } /* run the appropriate command handler */ diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 1f42f772865..6ccf7262d1b 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -214,7 +214,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode, posix_flags |= SMB_O_EXCL; if (oflags & O_TRUNC) posix_flags |= SMB_O_TRUNC; - if (oflags & O_SYNC) + /* be safe and imply O_SYNC for O_DSYNC */ + if (oflags & O_DSYNC) posix_flags |= SMB_O_SYNC; if (oflags & O_DIRECTORY) posix_flags |= SMB_O_DIRECTORY; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 429337eb7af..057e1dae12a 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -76,8 +76,10 @@ static inline fmode_t cifs_posix_convert_flags(unsigned int flags) reopening a file. They had their effect on the original open */ if (flags & O_APPEND) posix_flags |= (fmode_t)O_APPEND; - if (flags & O_SYNC) - posix_flags |= (fmode_t)O_SYNC; + if (flags & O_DSYNC) + posix_flags |= (fmode_t)O_DSYNC; + if (flags & __O_SYNC) + posix_flags |= (fmode_t)__O_SYNC; if (flags & O_DIRECTORY) posix_flags |= (fmode_t)O_DIRECTORY; if (flags & O_NOFOLLOW) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 2346895b3a7..14cbc831422 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -111,43 +111,40 @@ #include <linux/dvb/frontend.h> #include <linux/dvb/video.h> +#include <linux/sort.h> + #ifdef CONFIG_SPARC #include <asm/fbio.h> #endif -static int do_ioctl32_pointer(unsigned int fd, unsigned int cmd, - unsigned long arg, struct file *f) -{ - return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg)); -} - -static int w_long(unsigned int fd, unsigned int cmd, unsigned long arg) +static int w_long(unsigned int fd, unsigned int cmd, + compat_ulong_t __user *argp) { mm_segment_t old_fs = get_fs(); int err; unsigned long val; - + set_fs (KERNEL_DS); err = sys_ioctl(fd, cmd, (unsigned long)&val); set_fs (old_fs); - if (!err && put_user(val, (u32 __user *)compat_ptr(arg))) + if (!err && put_user(val, argp)) return -EFAULT; return err; } - -static int rw_long(unsigned int fd, unsigned int cmd, unsigned long arg) + +static int rw_long(unsigned int fd, unsigned int cmd, + compat_ulong_t __user *argp) { mm_segment_t old_fs = get_fs(); - u32 __user *argptr = compat_ptr(arg); int err; unsigned long val; - - if(get_user(val, argptr)) + + if(get_user(val, argp)) return -EFAULT; set_fs (KERNEL_DS); err = sys_ioctl(fd, cmd, (unsigned long)&val); set_fs (old_fs); - if (!err && put_user(val, argptr)) + if (!err && put_user(val, argp)) return -EFAULT; return err; } @@ -161,7 +158,8 @@ struct compat_video_event { } u; }; -static int do_video_get_event(unsigned int fd, unsigned int cmd, unsigned long arg) +static int do_video_get_event(unsigned int fd, unsigned int cmd, + struct compat_video_event __user *up) { struct video_event kevent; mm_segment_t old_fs = get_fs(); @@ -172,8 +170,6 @@ static int do_video_get_event(unsigned int fd, unsigned int cmd, unsigned long a set_fs(old_fs); if (!err) { - struct compat_video_event __user *up = compat_ptr(arg); - err = put_user(kevent.type, &up->type); err |= put_user(kevent.timestamp, &up->timestamp); err |= put_user(kevent.u.size.w, &up->u.size.w); @@ -192,15 +188,14 @@ struct compat_video_still_picture { int32_t size; }; -static int do_video_stillpicture(unsigned int fd, unsigned int cmd, unsigned long arg) +static int do_video_stillpicture(unsigned int fd, unsigned int cmd, + struct compat_video_still_picture __user *up) { - struct compat_video_still_picture __user *up; struct video_still_picture __user *up_native; compat_uptr_t fp; int32_t size; int err; - up = (struct compat_video_still_picture __user *) arg; err = get_user(fp, &up->iFrame); err |= get_user(size, &up->size); if (err) @@ -224,14 +219,13 @@ struct compat_video_spu_palette { compat_uptr_t palette; }; -static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, unsigned long arg) +static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, + struct compat_video_spu_palette __user *up) { - struct compat_video_spu_palette __user *up; struct video_spu_palette __user *up_native; compat_uptr_t palp; int length, err; - up = (struct compat_video_spu_palette __user *) arg; err = get_user(palp, &up->palette); err |= get_user(length, &up->length); @@ -299,16 +293,15 @@ static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iov return 0; } -static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) +static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, + sg_io_hdr32_t __user *sgio32) { sg_io_hdr_t __user *sgio; - sg_io_hdr32_t __user *sgio32; u16 iovec_count; u32 data; void __user *dxferp; int err; - sgio32 = compat_ptr(arg); if (get_user(iovec_count, &sgio32->iovec_count)) return -EFAULT; @@ -398,11 +391,11 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ int unused; }; -static int sg_grt_trans(unsigned int fd, unsigned int cmd, unsigned long arg) +static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct + compat_sg_req_info __user *o) { int err, i; sg_req_info_t __user *r; - struct compat_sg_req_info __user *o = (void __user *)arg; r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); err = sys_ioctl(fd,cmd,(unsigned long)r); if (err < 0) @@ -430,9 +423,9 @@ struct sock_fprog32 { #define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32) #define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32) -static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) +static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, + struct sock_fprog32 __user *u_fprog32) { - struct sock_fprog32 __user *u_fprog32 = compat_ptr(arg); struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog)); void __user *fptr64; u32 fptr32; @@ -469,15 +462,14 @@ struct ppp_idle32 { }; #define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32) -static int ppp_gidle(unsigned int fd, unsigned int cmd, unsigned long arg) +static int ppp_gidle(unsigned int fd, unsigned int cmd, + struct ppp_idle32 __user *idle32) { struct ppp_idle __user *idle; - struct ppp_idle32 __user *idle32; __kernel_time_t xmit, recv; int err; idle = compat_alloc_user_space(sizeof(*idle)); - idle32 = compat_ptr(arg); err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle); @@ -491,15 +483,14 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd, unsigned long arg) return err; } -static int ppp_scompress(unsigned int fd, unsigned int cmd, unsigned long arg) +static int ppp_scompress(unsigned int fd, unsigned int cmd, + struct ppp_option_data32 __user *odata32) { struct ppp_option_data __user *odata; - struct ppp_option_data32 __user *odata32; __u32 data; void __user *datap; odata = compat_alloc_user_space(sizeof(*odata)); - odata32 = compat_ptr(arg); if (get_user(data, &odata32->ptr)) return -EFAULT; @@ -515,35 +506,6 @@ static int ppp_scompress(unsigned int fd, unsigned int cmd, unsigned long arg) return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata); } -static int ppp_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - int err; - - switch (cmd) { - case PPPIOCGIDLE32: - err = ppp_gidle(fd, cmd, arg); - break; - - case PPPIOCSCOMPRESS32: - err = ppp_scompress(fd, cmd, arg); - break; - - default: - do { - static int count; - if (++count <= 20) - printk("ppp_ioctl: Unknown cmd fd(%d) " - "cmd(%08x) arg(%08x)\n", - (int)fd, (unsigned int)cmd, (unsigned int)arg); - } while(0); - err = -EINVAL; - break; - }; - - return err; -} - - #ifdef CONFIG_BLOCK struct mtget32 { compat_long_t mt_type; @@ -561,7 +523,7 @@ struct mtpos32 { }; #define MTIOCPOS32 _IOR('m', 3, struct mtpos32) -static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) +static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) { mm_segment_t old_fs = get_fs(); struct mtget get; @@ -581,15 +543,6 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) kcmd = MTIOCGET; karg = &get; break; - default: - do { - static int count; - if (++count <= 20) - printk("mt_ioctl: Unknown cmd fd(%d) " - "cmd(%08x) arg(%08x)\n", - (int)fd, (unsigned int)cmd, (unsigned int)arg); - } while(0); - return -EINVAL; } set_fs (KERNEL_DS); err = sys_ioctl (fd, kcmd, (unsigned long)karg); @@ -598,11 +551,11 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) return err; switch (cmd) { case MTIOCPOS32: - upos32 = compat_ptr(arg); + upos32 = argp; err = __put_user(pos.mt_blkno, &upos32->mt_blkno); break; case MTIOCGET32: - umget32 = compat_ptr(arg); + umget32 = argp; err = __put_user(get.mt_type, &umget32->mt_type); err |= __put_user(get.mt_resid, &umget32->mt_resid); err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg); @@ -617,162 +570,8 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) #endif /* CONFIG_BLOCK */ -#ifdef CONFIG_VT - -static int vt_check(struct file *file) -{ - struct tty_struct *tty; - struct inode *inode = file->f_path.dentry->d_inode; - struct vc_data *vc; - - if (file->f_op->unlocked_ioctl != tty_ioctl) - return -EINVAL; - - tty = (struct tty_struct *)file->private_data; - if (tty_paranoia_check(tty, inode, "tty_ioctl")) - return -EINVAL; - - if (tty->ops->ioctl != vt_ioctl) - return -EINVAL; - - vc = (struct vc_data *)tty->driver_data; - if (!vc_cons_allocated(vc->vc_num)) /* impossible? */ - return -ENOIOCTLCMD; - - /* - * To have permissions to do most of the vt ioctls, we either have - * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG. - */ - if (current->signal->tty == tty || capable(CAP_SYS_TTY_CONFIG)) - return 1; - return 0; -} - -struct consolefontdesc32 { - unsigned short charcount; /* characters in font (256 or 512) */ - unsigned short charheight; /* scan lines per character (1-32) */ - compat_caddr_t chardata; /* font data in expanded form */ -}; - -static int do_fontx_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *file) -{ - struct consolefontdesc32 __user *user_cfd = compat_ptr(arg); - struct console_font_op op; - compat_caddr_t data; - int i, perm; - - perm = vt_check(file); - if (perm < 0) return perm; - - switch (cmd) { - case PIO_FONTX: - if (!perm) - return -EPERM; - op.op = KD_FONT_OP_SET; - op.flags = 0; - op.width = 8; - if (get_user(op.height, &user_cfd->charheight) || - get_user(op.charcount, &user_cfd->charcount) || - get_user(data, &user_cfd->chardata)) - return -EFAULT; - op.data = compat_ptr(data); - return con_font_op(vc_cons[fg_console].d, &op); - case GIO_FONTX: - op.op = KD_FONT_OP_GET; - op.flags = 0; - op.width = 8; - if (get_user(op.height, &user_cfd->charheight) || - get_user(op.charcount, &user_cfd->charcount) || - get_user(data, &user_cfd->chardata)) - return -EFAULT; - if (!data) - return 0; - op.data = compat_ptr(data); - i = con_font_op(vc_cons[fg_console].d, &op); - if (i) - return i; - if (put_user(op.height, &user_cfd->charheight) || - put_user(op.charcount, &user_cfd->charcount) || - put_user((compat_caddr_t)(unsigned long)op.data, - &user_cfd->chardata)) - return -EFAULT; - return 0; - } - return -EINVAL; -} - -struct console_font_op32 { - compat_uint_t op; /* operation code KD_FONT_OP_* */ - compat_uint_t flags; /* KD_FONT_FLAG_* */ - compat_uint_t width, height; /* font size */ - compat_uint_t charcount; - compat_caddr_t data; /* font data with height fixed to 32 */ -}; - -static int do_kdfontop_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *file) -{ - struct console_font_op op; - struct console_font_op32 __user *fontop = compat_ptr(arg); - int perm = vt_check(file), i; - struct vc_data *vc; - - if (perm < 0) return perm; - - if (copy_from_user(&op, fontop, sizeof(struct console_font_op32))) - return -EFAULT; - if (!perm && op.op != KD_FONT_OP_GET) - return -EPERM; - op.data = compat_ptr(((struct console_font_op32 *)&op)->data); - op.flags |= KD_FONT_FLAG_OLD; - vc = ((struct tty_struct *)file->private_data)->driver_data; - i = con_font_op(vc, &op); - if (i) - return i; - ((struct console_font_op32 *)&op)->data = (unsigned long)op.data; - if (copy_to_user(fontop, &op, sizeof(struct console_font_op32))) - return -EFAULT; - return 0; -} - -struct unimapdesc32 { - unsigned short entry_ct; - compat_caddr_t entries; -}; - -static int do_unimap_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *file) -{ - struct unimapdesc32 tmp; - struct unimapdesc32 __user *user_ud = compat_ptr(arg); - int perm = vt_check(file); - struct vc_data *vc; - - if (perm < 0) - return perm; - if (copy_from_user(&tmp, user_ud, sizeof tmp)) - return -EFAULT; - if (tmp.entries) - if (!access_ok(VERIFY_WRITE, compat_ptr(tmp.entries), - tmp.entry_ct*sizeof(struct unipair))) - return -EFAULT; - vc = ((struct tty_struct *)file->private_data)->driver_data; - switch (cmd) { - case PIO_UNIMAP: - if (!perm) - return -EPERM; - return con_set_unimap(vc, tmp.entry_ct, - compat_ptr(tmp.entries)); - case GIO_UNIMAP: - if (!perm && fg_console != vc->vc_num) - return -EPERM; - return con_get_unimap(vc, tmp.entry_ct, &(user_ud->entry_ct), - compat_ptr(tmp.entries)); - } - return 0; -} - -#endif /* CONFIG_VT */ - -static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, unsigned long arg) +static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, + compat_uid_t __user *argp) { mm_segment_t old_fs = get_fs(); __kernel_uid_t kuid; @@ -785,20 +584,15 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, unsigned long a set_fs(old_fs); if (err >= 0) - err = put_user(kuid, (compat_uid_t __user *)compat_ptr(arg)); + err = put_user(kuid, argp); return err; } -static __used int -ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - return -EINVAL; -} - -static int ioc_settimeout(unsigned int fd, unsigned int cmd, unsigned long arg) +static int ioc_settimeout(unsigned int fd, unsigned int cmd, + compat_ulong_t __user *argp) { - return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, arg); + return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, argp); } /* Bluetooth ioctls */ @@ -856,7 +650,8 @@ static int set_raw32_request(struct raw_config_request *req, struct raw32_config return ret ? -EFAULT : 0; } -static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg) +static int raw_ioctl(unsigned fd, unsigned cmd, + struct raw32_config_request __user *user_req) { int ret; @@ -864,7 +659,6 @@ static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg) case RAW_SETBIND: case RAW_GETBIND: { struct raw_config_request req; - struct raw32_config_request __user *user_req = compat_ptr(arg); mm_segment_t oldfs = get_fs(); if ((ret = get_raw32_request(&req, user_req))) @@ -879,9 +673,6 @@ static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg) } break; } - default: - ret = sys_ioctl(fd, cmd, arg); - break; } return ret; } @@ -909,11 +700,11 @@ struct serial_struct32 { compat_int_t reserved[1]; }; -static int serial_struct_ioctl(unsigned fd, unsigned cmd, unsigned long arg) +static int serial_struct_ioctl(unsigned fd, unsigned cmd, + struct serial_struct32 __user *ss32) { typedef struct serial_struct SS; typedef struct serial_struct32 SS32; - struct serial_struct32 __user *ss32 = compat_ptr(arg); int err; struct serial_struct ss; mm_segment_t oldseg = get_fs(); @@ -951,96 +742,6 @@ static int serial_struct_ioctl(unsigned fd, unsigned cmd, unsigned long arg) return err; } -struct usbdevfs_ctrltransfer32 { - u8 bRequestType; - u8 bRequest; - u16 wValue; - u16 wIndex; - u16 wLength; - u32 timeout; /* in milliseconds */ - compat_caddr_t data; -}; - -#define USBDEVFS_CONTROL32 _IOWR('U', 0, struct usbdevfs_ctrltransfer32) - -static int do_usbdevfs_control(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct usbdevfs_ctrltransfer32 __user *p32 = compat_ptr(arg); - struct usbdevfs_ctrltransfer __user *p; - __u32 udata; - p = compat_alloc_user_space(sizeof(*p)); - if (copy_in_user(p, p32, (sizeof(*p32) - sizeof(compat_caddr_t))) || - get_user(udata, &p32->data) || - put_user(compat_ptr(udata), &p->data)) - return -EFAULT; - return sys_ioctl(fd, USBDEVFS_CONTROL, (unsigned long)p); -} - - -struct usbdevfs_bulktransfer32 { - compat_uint_t ep; - compat_uint_t len; - compat_uint_t timeout; /* in milliseconds */ - compat_caddr_t data; -}; - -#define USBDEVFS_BULK32 _IOWR('U', 2, struct usbdevfs_bulktransfer32) - -static int do_usbdevfs_bulk(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct usbdevfs_bulktransfer32 __user *p32 = compat_ptr(arg); - struct usbdevfs_bulktransfer __user *p; - compat_uint_t n; - compat_caddr_t addr; - - p = compat_alloc_user_space(sizeof(*p)); - - if (get_user(n, &p32->ep) || put_user(n, &p->ep) || - get_user(n, &p32->len) || put_user(n, &p->len) || - get_user(n, &p32->timeout) || put_user(n, &p->timeout) || - get_user(addr, &p32->data) || put_user(compat_ptr(addr), &p->data)) - return -EFAULT; - - return sys_ioctl(fd, USBDEVFS_BULK, (unsigned long)p); -} - - -/* - * USBDEVFS_SUBMITURB, USBDEVFS_REAPURB and USBDEVFS_REAPURBNDELAY - * are handled in usbdevfs core. -Christopher Li - */ - -struct usbdevfs_disconnectsignal32 { - compat_int_t signr; - compat_caddr_t context; -}; - -#define USBDEVFS_DISCSIGNAL32 _IOR('U', 14, struct usbdevfs_disconnectsignal32) - -static int do_usbdevfs_discsignal(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct usbdevfs_disconnectsignal kdis; - struct usbdevfs_disconnectsignal32 __user *udis; - mm_segment_t old_fs; - u32 uctx; - int err; - - udis = compat_ptr(arg); - - if (get_user(kdis.signr, &udis->signr) || - __get_user(uctx, &udis->context)) - return -EFAULT; - - kdis.context = compat_ptr(uctx); - - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_ioctl(fd, USBDEVFS_DISCSIGNAL, (unsigned long) &kdis); - set_fs(old_fs); - - return err; -} - /* * I2C layer ioctls */ @@ -1069,9 +770,9 @@ struct i2c_rdwr_aligned { struct i2c_msg msgs[0]; }; -static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) +static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, + struct i2c_rdwr_ioctl_data32 __user *udata) { - struct i2c_rdwr_ioctl_data32 __user *udata = compat_ptr(arg); struct i2c_rdwr_aligned __user *tdata; struct i2c_msg __user *tmsgs; struct i2c_msg32 __user *umsgs; @@ -1105,10 +806,10 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, unsigned long ar return sys_ioctl(fd, cmd, (unsigned long)tdata); } -static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) +static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, + struct i2c_smbus_ioctl_data32 __user *udata) { struct i2c_smbus_ioctl_data __user *tdata; - struct i2c_smbus_ioctl_data32 __user *udata; compat_caddr_t datap; tdata = compat_alloc_user_space(sizeof(*tdata)); @@ -1117,7 +818,6 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a if (!access_ok(VERIFY_WRITE, tdata, sizeof(*tdata))) return -EFAULT; - udata = compat_ptr(arg); if (!access_ok(VERIFY_READ, udata, sizeof(*udata))) return -EFAULT; @@ -1137,7 +837,7 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a #define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) #define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) -static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg) +static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp) { mm_segment_t oldfs = get_fs(); compat_ulong_t val32; @@ -1155,29 +855,14 @@ static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg) if (ret) return ret; val32 = kval; - return put_user(val32, (unsigned int __user *)arg); + return put_user(val32, (unsigned int __user *)argp); case RTC_IRQP_SET32: - return sys_ioctl(fd, RTC_IRQP_SET, arg); + return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp); case RTC_EPOCH_SET32: - return sys_ioctl(fd, RTC_EPOCH_SET, arg); - default: - /* unreached */ - return -ENOIOCTLCMD; + return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp); } -} -static int -lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct compat_timeval __user *tc = (struct compat_timeval __user *)arg; - struct timeval __user *tn = compat_alloc_user_space(sizeof(struct timeval)); - struct timeval ts; - if (get_user(ts.tv_sec, &tc->tv_sec) || - get_user(ts.tv_usec, &tc->tv_usec) || - put_user(ts.tv_sec, &tn->tv_sec) || - put_user(ts.tv_usec, &tn->tv_usec)) - return -EFAULT; - return sys_ioctl(fd, cmd, (unsigned long)tn); + return -ENOIOCTLCMD; } /* on ia32 l_start is on a 32-bit boundary */ @@ -1197,9 +882,9 @@ struct space_resv_32 { #define FS_IOC_RESVSP64_32 _IOW ('X', 42, struct space_resv_32) /* just account for different alignment */ -static int compat_ioctl_preallocate(struct file *file, unsigned long arg) +static int compat_ioctl_preallocate(struct file *file, + struct space_resv_32 __user *p32) { - struct space_resv_32 __user *p32 = compat_ptr(arg); struct space_resv __user *p = compat_alloc_user_space(sizeof(*p)); if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || @@ -1215,27 +900,13 @@ static int compat_ioctl_preallocate(struct file *file, unsigned long arg) } #endif +/* + * simple reversible transform to make our table more evenly + * distributed after sorting. + */ +#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff) -typedef int (*ioctl_trans_handler_t)(unsigned int, unsigned int, - unsigned long, struct file *); - -struct ioctl_trans { - unsigned long cmd; - ioctl_trans_handler_t handler; - struct ioctl_trans *next; -}; - -#define HANDLE_IOCTL(cmd,handler) \ - { (cmd), (ioctl_trans_handler_t)(handler) }, - -/* pointer to compatible structure or no argument */ -#define COMPATIBLE_IOCTL(cmd) \ - { (cmd), do_ioctl32_pointer }, - -/* argument is an unsigned long integer, not a pointer */ -#define ULONG_IOCTL(cmd) \ - { (cmd), (ioctl_trans_handler_t)sys_ioctl }, - +#define COMPATIBLE_IOCTL(cmd) XFORM(cmd), /* ioctl should not be warned about even if it's not implemented. Valid reasons to use this: - It is implemented with ->compat_ioctl on some device, but programs @@ -1245,7 +916,7 @@ struct ioctl_trans { Most other reasons are not valid. */ #define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd) -static struct ioctl_trans ioctl_start[] = { +static unsigned int ioctl_pointer[] = { /* compatible ioctls first */ COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */ COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */ @@ -1256,7 +927,6 @@ COMPATIBLE_IOCTL(TCSETA) COMPATIBLE_IOCTL(TCSETAW) COMPATIBLE_IOCTL(TCSETAF) COMPATIBLE_IOCTL(TCSBRK) -ULONG_IOCTL(TCSBRKP) COMPATIBLE_IOCTL(TCXONC) COMPATIBLE_IOCTL(TCFLSH) COMPATIBLE_IOCTL(TCGETS) @@ -1266,7 +936,6 @@ COMPATIBLE_IOCTL(TCSETSF) COMPATIBLE_IOCTL(TIOCLINUX) COMPATIBLE_IOCTL(TIOCSBRK) COMPATIBLE_IOCTL(TIOCCBRK) -ULONG_IOCTL(TIOCMIWAIT) COMPATIBLE_IOCTL(TIOCGICOUNT) /* Little t */ COMPATIBLE_IOCTL(TIOCGETD) @@ -1288,7 +957,6 @@ COMPATIBLE_IOCTL(TIOCSTI) COMPATIBLE_IOCTL(TIOCOUTQ) COMPATIBLE_IOCTL(TIOCSPGRP) COMPATIBLE_IOCTL(TIOCGPGRP) -ULONG_IOCTL(TIOCSCTTY) COMPATIBLE_IOCTL(TIOCGPTN) COMPATIBLE_IOCTL(TIOCSPTLCK) COMPATIBLE_IOCTL(TIOCSERGETLSR) @@ -1311,44 +979,11 @@ COMPATIBLE_IOCTL(FIGETBSZ) /* 'X' - originally XFS but some now in the VFS */ COMPATIBLE_IOCTL(FIFREEZE) COMPATIBLE_IOCTL(FITHAW) -/* RAID */ -COMPATIBLE_IOCTL(RAID_VERSION) -COMPATIBLE_IOCTL(GET_ARRAY_INFO) -COMPATIBLE_IOCTL(GET_DISK_INFO) -COMPATIBLE_IOCTL(PRINT_RAID_DEBUG) -COMPATIBLE_IOCTL(RAID_AUTORUN) -COMPATIBLE_IOCTL(CLEAR_ARRAY) -COMPATIBLE_IOCTL(ADD_NEW_DISK) -ULONG_IOCTL(HOT_REMOVE_DISK) -COMPATIBLE_IOCTL(SET_ARRAY_INFO) -COMPATIBLE_IOCTL(SET_DISK_INFO) -COMPATIBLE_IOCTL(WRITE_RAID_INFO) -COMPATIBLE_IOCTL(UNPROTECT_ARRAY) -COMPATIBLE_IOCTL(PROTECT_ARRAY) -ULONG_IOCTL(HOT_ADD_DISK) -ULONG_IOCTL(SET_DISK_FAULTY) -COMPATIBLE_IOCTL(RUN_ARRAY) -COMPATIBLE_IOCTL(STOP_ARRAY) -COMPATIBLE_IOCTL(STOP_ARRAY_RO) -COMPATIBLE_IOCTL(RESTART_ARRAY_RW) -COMPATIBLE_IOCTL(GET_BITMAP_FILE) -ULONG_IOCTL(SET_BITMAP_FILE) -/* Big K */ -COMPATIBLE_IOCTL(PIO_FONT) -COMPATIBLE_IOCTL(GIO_FONT) -COMPATIBLE_IOCTL(PIO_CMAP) -COMPATIBLE_IOCTL(GIO_CMAP) -ULONG_IOCTL(KDSIGACCEPT) COMPATIBLE_IOCTL(KDGETKEYCODE) COMPATIBLE_IOCTL(KDSETKEYCODE) -ULONG_IOCTL(KIOCSOUND) -ULONG_IOCTL(KDMKTONE) COMPATIBLE_IOCTL(KDGKBTYPE) -ULONG_IOCTL(KDSETMODE) COMPATIBLE_IOCTL(KDGETMODE) -ULONG_IOCTL(KDSKBMODE) COMPATIBLE_IOCTL(KDGKBMODE) -ULONG_IOCTL(KDSKBMETA) COMPATIBLE_IOCTL(KDGKBMETA) COMPATIBLE_IOCTL(KDGKBENT) COMPATIBLE_IOCTL(KDSKBENT) @@ -1358,15 +993,7 @@ COMPATIBLE_IOCTL(KDGKBDIACR) COMPATIBLE_IOCTL(KDSKBDIACR) COMPATIBLE_IOCTL(KDKBDREP) COMPATIBLE_IOCTL(KDGKBLED) -ULONG_IOCTL(KDSKBLED) COMPATIBLE_IOCTL(KDGETLED) -ULONG_IOCTL(KDSETLED) -COMPATIBLE_IOCTL(GIO_SCRNMAP) -COMPATIBLE_IOCTL(PIO_SCRNMAP) -COMPATIBLE_IOCTL(GIO_UNISCRNMAP) -COMPATIBLE_IOCTL(PIO_UNISCRNMAP) -COMPATIBLE_IOCTL(PIO_FONTRESET) -COMPATIBLE_IOCTL(PIO_UNIMAPCLR) #ifdef CONFIG_BLOCK /* Big S */ COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) @@ -1378,20 +1005,6 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) #endif -/* Big V */ -COMPATIBLE_IOCTL(VT_SETMODE) -COMPATIBLE_IOCTL(VT_GETMODE) -COMPATIBLE_IOCTL(VT_GETSTATE) -COMPATIBLE_IOCTL(VT_OPENQRY) -ULONG_IOCTL(VT_ACTIVATE) -ULONG_IOCTL(VT_WAITACTIVE) -ULONG_IOCTL(VT_RELDISP) -ULONG_IOCTL(VT_DISALLOCATE) -COMPATIBLE_IOCTL(VT_RESIZE) -COMPATIBLE_IOCTL(VT_RESIZEX) -COMPATIBLE_IOCTL(VT_LOCKSWITCH) -COMPATIBLE_IOCTL(VT_UNLOCKSWITCH) -COMPATIBLE_IOCTL(VT_GETHIFONTMASK) /* Little p (/dev/rtc, /dev/envctrl, etc.) */ COMPATIBLE_IOCTL(RTC_AIE_ON) COMPATIBLE_IOCTL(RTC_AIE_OFF) @@ -1420,11 +1033,12 @@ COMPATIBLE_IOCTL(MTIOCTOP) /* Socket level stuff */ COMPATIBLE_IOCTL(FIOQSIZE) #ifdef CONFIG_BLOCK +/* loop */ +IGNORE_IOCTL(LOOP_CLR_FD) /* SG stuff */ COMPATIBLE_IOCTL(SG_SET_TIMEOUT) COMPATIBLE_IOCTL(SG_GET_TIMEOUT) COMPATIBLE_IOCTL(SG_EMULATED_HOST) -ULONG_IOCTL(SG_SET_TRANSFORM) COMPATIBLE_IOCTL(SG_GET_TRANSFORM) COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE) COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE) @@ -1478,8 +1092,6 @@ COMPATIBLE_IOCTL(PPPIOCGCHAN) /* PPPOX */ COMPATIBLE_IOCTL(PPPOEIOCSFWD) COMPATIBLE_IOCTL(PPPOEIOCDFWD) -/* LP */ -COMPATIBLE_IOCTL(LPGETSTATUS) /* ppdev */ COMPATIBLE_IOCTL(PPSETMODE) COMPATIBLE_IOCTL(PPRSTATUS) @@ -1661,8 +1273,6 @@ COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS) COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) COMPATIBLE_IOCTL(OSS_GETVERSION) /* AUTOFS */ -ULONG_IOCTL(AUTOFS_IOC_READY) -ULONG_IOCTL(AUTOFS_IOC_FAIL) COMPATIBLE_IOCTL(AUTOFS_IOC_CATATONIC) COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER) COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE) @@ -1755,30 +1365,11 @@ COMPATIBLE_IOCTL(PCIIOC_CONTROLLER) COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO) COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM) COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE) -/* USB */ -COMPATIBLE_IOCTL(USBDEVFS_RESETEP) -COMPATIBLE_IOCTL(USBDEVFS_SETINTERFACE) -COMPATIBLE_IOCTL(USBDEVFS_SETCONFIGURATION) -COMPATIBLE_IOCTL(USBDEVFS_GETDRIVER) -COMPATIBLE_IOCTL(USBDEVFS_DISCARDURB) -COMPATIBLE_IOCTL(USBDEVFS_CLAIMINTERFACE) -COMPATIBLE_IOCTL(USBDEVFS_RELEASEINTERFACE) -COMPATIBLE_IOCTL(USBDEVFS_CONNECTINFO) -COMPATIBLE_IOCTL(USBDEVFS_HUB_PORTINFO) -COMPATIBLE_IOCTL(USBDEVFS_RESET) -COMPATIBLE_IOCTL(USBDEVFS_SUBMITURB32) -COMPATIBLE_IOCTL(USBDEVFS_REAPURB32) -COMPATIBLE_IOCTL(USBDEVFS_REAPURBNDELAY32) -COMPATIBLE_IOCTL(USBDEVFS_CLEAR_HALT) /* NBD */ -ULONG_IOCTL(NBD_SET_SOCK) -ULONG_IOCTL(NBD_SET_BLKSIZE) -ULONG_IOCTL(NBD_SET_SIZE) COMPATIBLE_IOCTL(NBD_DO_IT) COMPATIBLE_IOCTL(NBD_CLEAR_SOCK) COMPATIBLE_IOCTL(NBD_CLEAR_QUE) COMPATIBLE_IOCTL(NBD_PRINT_DEBUG) -ULONG_IOCTL(NBD_SET_SIZE_BLOCKS) COMPATIBLE_IOCTL(NBD_DISCONNECT) /* i2c */ COMPATIBLE_IOCTL(I2C_SLAVE) @@ -1878,42 +1469,6 @@ COMPATIBLE_IOCTL(JSIOCGAXES) COMPATIBLE_IOCTL(JSIOCGBUTTONS) COMPATIBLE_IOCTL(JSIOCGNAME(0)) -/* now things that need handlers */ -#ifdef CONFIG_BLOCK -HANDLE_IOCTL(SG_IO,sg_ioctl_trans) -HANDLE_IOCTL(SG_GET_REQUEST_TABLE, sg_grt_trans) -#endif -HANDLE_IOCTL(PPPIOCGIDLE32, ppp_ioctl_trans) -HANDLE_IOCTL(PPPIOCSCOMPRESS32, ppp_ioctl_trans) -HANDLE_IOCTL(PPPIOCSPASS32, ppp_sock_fprog_ioctl_trans) -HANDLE_IOCTL(PPPIOCSACTIVE32, ppp_sock_fprog_ioctl_trans) -#ifdef CONFIG_BLOCK -HANDLE_IOCTL(MTIOCGET32, mt_ioctl_trans) -HANDLE_IOCTL(MTIOCPOS32, mt_ioctl_trans) -#endif -#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int) -HANDLE_IOCTL(AUTOFS_IOC_SETTIMEOUT32, ioc_settimeout) -#ifdef CONFIG_VT -HANDLE_IOCTL(PIO_FONTX, do_fontx_ioctl) -HANDLE_IOCTL(GIO_FONTX, do_fontx_ioctl) -HANDLE_IOCTL(PIO_UNIMAP, do_unimap_ioctl) -HANDLE_IOCTL(GIO_UNIMAP, do_unimap_ioctl) -HANDLE_IOCTL(KDFONTOP, do_kdfontop_ioctl) -#endif -/* One SMB ioctl needs translations. */ -#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t) -HANDLE_IOCTL(SMB_IOC_GETMOUNTUID_32, do_smb_getmountuid) -/* block stuff */ -#ifdef CONFIG_BLOCK -/* loop */ -IGNORE_IOCTL(LOOP_CLR_FD) -/* Raw devices */ -HANDLE_IOCTL(RAW_SETBIND, raw_ioctl) -HANDLE_IOCTL(RAW_GETBIND, raw_ioctl) -#endif -/* Serial */ -HANDLE_IOCTL(TIOCGSERIAL, serial_struct_ioctl) -HANDLE_IOCTL(TIOCSSERIAL, serial_struct_ioctl) #ifdef TIOCGLTC COMPATIBLE_IOCTL(TIOCGLTC) COMPATIBLE_IOCTL(TIOCSLTC) @@ -1928,39 +1483,6 @@ COMPATIBLE_IOCTL(TIOCSLTC) COMPATIBLE_IOCTL(TIOCSTART) COMPATIBLE_IOCTL(TIOCSTOP) #endif -/* Usbdevfs */ -HANDLE_IOCTL(USBDEVFS_CONTROL32, do_usbdevfs_control) -HANDLE_IOCTL(USBDEVFS_BULK32, do_usbdevfs_bulk) -HANDLE_IOCTL(USBDEVFS_DISCSIGNAL32, do_usbdevfs_discsignal) -COMPATIBLE_IOCTL(USBDEVFS_IOCTL32) -/* i2c */ -HANDLE_IOCTL(I2C_FUNCS, w_long) -HANDLE_IOCTL(I2C_RDWR, do_i2c_rdwr_ioctl) -HANDLE_IOCTL(I2C_SMBUS, do_i2c_smbus_ioctl) -/* Not implemented in the native kernel */ -HANDLE_IOCTL(RTC_IRQP_READ32, rtc_ioctl) -HANDLE_IOCTL(RTC_IRQP_SET32, rtc_ioctl) -HANDLE_IOCTL(RTC_EPOCH_READ32, rtc_ioctl) -HANDLE_IOCTL(RTC_EPOCH_SET32, rtc_ioctl) - -/* dvb */ -HANDLE_IOCTL(VIDEO_GET_EVENT, do_video_get_event) -HANDLE_IOCTL(VIDEO_STILLPICTURE, do_video_stillpicture) -HANDLE_IOCTL(VIDEO_SET_SPU_PALETTE, do_video_set_spu_palette) - -/* parport */ -COMPATIBLE_IOCTL(LPTIME) -COMPATIBLE_IOCTL(LPCHAR) -COMPATIBLE_IOCTL(LPABORTOPEN) -COMPATIBLE_IOCTL(LPCAREFUL) -COMPATIBLE_IOCTL(LPWAIT) -COMPATIBLE_IOCTL(LPSETIRQ) -COMPATIBLE_IOCTL(LPGETSTATUS) -COMPATIBLE_IOCTL(LPGETSTATUS) -COMPATIBLE_IOCTL(LPRESET) -/*LPGETSTATS not implemented, but no kernels seem to compile it in anyways*/ -COMPATIBLE_IOCTL(LPGETFLAGS) -HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans) /* fat 'r' ioctls. These are handled by fat with ->compat_ioctl, but we don't want warnings on other file systems. So declare @@ -1988,12 +1510,110 @@ IGNORE_IOCTL(FBIOGCURSOR32) #endif }; -#define IOCTL_HASHSIZE 256 -static struct ioctl_trans *ioctl32_hash_table[IOCTL_HASHSIZE]; - -static inline unsigned long ioctl32_hash(unsigned long cmd) +/* + * Convert common ioctl arguments based on their command number + * + * Please do not add any code in here. Instead, implement + * a compat_ioctl operation in the place that handleÑ• the + * ioctl for the native case. + */ +static long do_ioctl_trans(int fd, unsigned int cmd, + unsigned long arg, struct file *file) { - return (((cmd >> 6) ^ (cmd >> 4) ^ cmd)) % IOCTL_HASHSIZE; + void __user *argp = compat_ptr(arg); + + switch (cmd) { + case PPPIOCGIDLE32: + return ppp_gidle(fd, cmd, argp); + case PPPIOCSCOMPRESS32: + return ppp_scompress(fd, cmd, argp); + case PPPIOCSPASS32: + case PPPIOCSACTIVE32: + return ppp_sock_fprog_ioctl_trans(fd, cmd, argp); +#ifdef CONFIG_BLOCK + case SG_IO: + return sg_ioctl_trans(fd, cmd, argp); + case SG_GET_REQUEST_TABLE: + return sg_grt_trans(fd, cmd, argp); + case MTIOCGET32: + case MTIOCPOS32: + return mt_ioctl_trans(fd, cmd, argp); + /* Raw devices */ + case RAW_SETBIND: + case RAW_GETBIND: + return raw_ioctl(fd, cmd, argp); +#endif +#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int) + case AUTOFS_IOC_SETTIMEOUT32: + return ioc_settimeout(fd, cmd, argp); + /* One SMB ioctl needs translations. */ +#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t) + case SMB_IOC_GETMOUNTUID_32: + return do_smb_getmountuid(fd, cmd, argp); + /* Serial */ + case TIOCGSERIAL: + case TIOCSSERIAL: + return serial_struct_ioctl(fd, cmd, argp); + /* i2c */ + case I2C_FUNCS: + return w_long(fd, cmd, argp); + case I2C_RDWR: + return do_i2c_rdwr_ioctl(fd, cmd, argp); + case I2C_SMBUS: + return do_i2c_smbus_ioctl(fd, cmd, argp); + /* Not implemented in the native kernel */ + case RTC_IRQP_READ32: + case RTC_IRQP_SET32: + case RTC_EPOCH_READ32: + case RTC_EPOCH_SET32: + return rtc_ioctl(fd, cmd, argp); + + /* dvb */ + case VIDEO_GET_EVENT: + return do_video_get_event(fd, cmd, argp); + case VIDEO_STILLPICTURE: + return do_video_stillpicture(fd, cmd, argp); + case VIDEO_SET_SPU_PALETTE: + return do_video_set_spu_palette(fd, cmd, argp); + } + + /* + * These take an integer instead of a pointer as 'arg', + * so we must not do a compat_ptr() translation. + */ + switch (cmd) { + /* Big T */ + case TCSBRKP: + case TIOCMIWAIT: + case TIOCSCTTY: + /* RAID */ + case HOT_REMOVE_DISK: + case HOT_ADD_DISK: + case SET_DISK_FAULTY: + case SET_BITMAP_FILE: + /* Big K */ + case KDSIGACCEPT: + case KIOCSOUND: + case KDMKTONE: + case KDSETMODE: + case KDSKBMODE: + case KDSKBMETA: + case KDSKBLED: + case KDSETLED: + /* SG stuff */ + case SG_SET_TRANSFORM: + /* AUTOFS */ + case AUTOFS_IOC_READY: + case AUTOFS_IOC_FAIL: + /* NBD */ + case NBD_SET_SOCK: + case NBD_SET_BLKSIZE: + case NBD_SET_SIZE: + case NBD_SET_SIZE_BLOCKS: + return do_vfs_ioctl(file, fd, cmd, arg); + } + + return -ENOIOCTLCMD; } static void compat_ioctl_error(struct file *filp, unsigned int fd, @@ -2025,12 +1645,33 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd, free_page((unsigned long)path); } +static int compat_ioctl_check_table(unsigned int xcmd) +{ + int i; + const int max = ARRAY_SIZE(ioctl_pointer) - 1; + + BUILD_BUG_ON(max >= (1 << 16)); + + /* guess initial offset into table, assuming a + normalized distribution */ + i = ((xcmd >> 16) * max) >> 16; + + /* do linear search up first, until greater or equal */ + while (ioctl_pointer[i] < xcmd && i < max) + i++; + + /* then do linear search down */ + while (ioctl_pointer[i] > xcmd && i > 0) + i--; + + return ioctl_pointer[i] == xcmd; +} + asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) { struct file *filp; int error = -EBADF; - struct ioctl_trans *t; int fput_needed; filp = fget_light(fd, &fput_needed); @@ -2058,7 +1699,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, #if defined(CONFIG_IA64) || defined(CONFIG_X86_64) case FS_IOC_RESVSP_32: case FS_IOC_RESVSP64_32: - error = compat_ioctl_preallocate(filp, arg); + error = compat_ioctl_preallocate(filp, compat_ptr(arg)); goto out_fput; #else case FS_IOC_RESVSP: @@ -2087,12 +1728,11 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, break; } - for (t = ioctl32_hash_table[ioctl32_hash(cmd)]; t; t = t->next) { - if (t->cmd == cmd) - goto found_handler; - } + if (compat_ioctl_check_table(XFORM(cmd))) + goto found_handler; - { + error = do_ioctl_trans(fd, cmd, arg, filp); + if (error == -ENOIOCTLCMD) { static int count; if (++count <= 50) @@ -2103,13 +1743,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, goto out_fput; found_handler: - if (t->handler) { - lock_kernel(); - error = t->handler(fd, cmd, arg, filp); - unlock_kernel(); - goto out_fput; - } - + arg = (unsigned long)compat_ptr(arg); do_ioctl: error = do_vfs_ioctl(filp, fd, cmd, arg); out_fput: @@ -2118,35 +1752,22 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, return error; } -static void ioctl32_insert_translation(struct ioctl_trans *trans) +static int __init init_sys32_ioctl_cmp(const void *p, const void *q) { - unsigned long hash; - struct ioctl_trans *t; - - hash = ioctl32_hash (trans->cmd); - if (!ioctl32_hash_table[hash]) - ioctl32_hash_table[hash] = trans; - else { - t = ioctl32_hash_table[hash]; - while (t->next) - t = t->next; - trans->next = NULL; - t->next = trans; - } + unsigned int a, b; + a = *(unsigned int *)p; + b = *(unsigned int *)q; + if (a > b) + return 1; + if (a < b) + return -1; + return 0; } static int __init init_sys32_ioctl(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(ioctl_start); i++) { - if (ioctl_start[i].next) { - printk("ioctl translation %d bad\n",i); - return -1; - } - - ioctl32_insert_translation(&ioctl_start[i]); - } + sort(ioctl_pointer, ARRAY_SIZE(ioctl_pointer), sizeof(*ioctl_pointer), + init_sys32_ioctl_cmp, NULL); return 0; } __initcall(init_sys32_ioctl); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 0d23b52dd22..b486169f42b 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -32,7 +32,9 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; -static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev) +static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev, + void *data, const struct file_operations *fops) + { struct inode *inode = new_inode(sb); @@ -44,14 +46,18 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d init_special_inode(inode, mode, dev); break; case S_IFREG: - inode->i_fop = &debugfs_file_operations; + inode->i_fop = fops ? fops : &debugfs_file_operations; + inode->i_private = data; break; case S_IFLNK: inode->i_op = &debugfs_link_operations; + inode->i_fop = fops; + inode->i_private = data; break; case S_IFDIR: inode->i_op = &simple_dir_inode_operations; - inode->i_fop = &simple_dir_operations; + inode->i_fop = fops ? fops : &simple_dir_operations; + inode->i_private = data; /* directory inodes start off with i_nlink == 2 * (for "." entry) */ @@ -64,7 +70,8 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d /* SMP-safe */ static int debugfs_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t dev) + int mode, dev_t dev, void *data, + const struct file_operations *fops) { struct inode *inode; int error = -EPERM; @@ -72,7 +79,7 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry, if (dentry->d_inode) return -EEXIST; - inode = debugfs_get_inode(dir->i_sb, mode, dev); + inode = debugfs_get_inode(dir->i_sb, mode, dev, data, fops); if (inode) { d_instantiate(dentry, inode); dget(dentry); @@ -81,12 +88,13 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry, return error; } -static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode, + void *data, const struct file_operations *fops) { int res; mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR; - res = debugfs_mknod(dir, dentry, mode, 0); + res = debugfs_mknod(dir, dentry, mode, 0, data, fops); if (!res) { inc_nlink(dir); fsnotify_mkdir(dir, dentry); @@ -94,18 +102,20 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) return res; } -static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode) +static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode, + void *data, const struct file_operations *fops) { mode = (mode & S_IALLUGO) | S_IFLNK; - return debugfs_mknod(dir, dentry, mode, 0); + return debugfs_mknod(dir, dentry, mode, 0, data, fops); } -static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode) +static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode, + void *data, const struct file_operations *fops) { int res; mode = (mode & S_IALLUGO) | S_IFREG; - res = debugfs_mknod(dir, dentry, mode, 0); + res = debugfs_mknod(dir, dentry, mode, 0, data, fops); if (!res) fsnotify_create(dir, dentry); return res; @@ -139,7 +149,9 @@ static struct file_system_type debug_fs_type = { static int debugfs_create_by_name(const char *name, mode_t mode, struct dentry *parent, - struct dentry **dentry) + struct dentry **dentry, + void *data, + const struct file_operations *fops) { int error = 0; @@ -164,13 +176,16 @@ static int debugfs_create_by_name(const char *name, mode_t mode, if (!IS_ERR(*dentry)) { switch (mode & S_IFMT) { case S_IFDIR: - error = debugfs_mkdir(parent->d_inode, *dentry, mode); + error = debugfs_mkdir(parent->d_inode, *dentry, mode, + data, fops); break; case S_IFLNK: - error = debugfs_link(parent->d_inode, *dentry, mode); + error = debugfs_link(parent->d_inode, *dentry, mode, + data, fops); break; default: - error = debugfs_create(parent->d_inode, *dentry, mode); + error = debugfs_create(parent->d_inode, *dentry, mode, + data, fops); break; } dput(*dentry); @@ -221,19 +236,13 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode, if (error) goto exit; - error = debugfs_create_by_name(name, mode, parent, &dentry); + error = debugfs_create_by_name(name, mode, parent, &dentry, + data, fops); if (error) { dentry = NULL; simple_release_fs(&debugfs_mount, &debugfs_mount_count); goto exit; } - - if (dentry->d_inode) { - if (data) - dentry->d_inode->i_private = data; - if (fops) - dentry->d_inode->i_fop = fops; - } exit: return dentry; } diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index d5f8c96964b..8882ecc0f1b 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -517,11 +517,23 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) { + struct dentry *dentry; + struct tty_struct *tty; + BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); + /* Ensure dentry has not been deleted by devpts_pty_kill() */ + dentry = d_find_alias(pts_inode); + if (!dentry) + return NULL; + + tty = NULL; if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) - return (struct tty_struct *)pts_inode->i_private; - return NULL; + tty = (struct tty_struct *)pts_inode->i_private; + + dput(dentry); + + return tty; } void devpts_pty_kill(struct tty_struct *tty) diff --git a/fs/dlm/config.c b/fs/dlm/config.c index fd9859f92fa..0df24385081 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -410,10 +410,10 @@ static struct config_group *make_cluster(struct config_group *g, struct dlm_comms *cms = NULL; void *gps = NULL; - cl = kzalloc(sizeof(struct dlm_cluster), GFP_KERNEL); - gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); - sps = kzalloc(sizeof(struct dlm_spaces), GFP_KERNEL); - cms = kzalloc(sizeof(struct dlm_comms), GFP_KERNEL); + cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS); + gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS); + sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS); + cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS); if (!cl || !gps || !sps || !cms) goto fail; @@ -482,9 +482,9 @@ static struct config_group *make_space(struct config_group *g, const char *name) struct dlm_nodes *nds = NULL; void *gps = NULL; - sp = kzalloc(sizeof(struct dlm_space), GFP_KERNEL); - gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL); - nds = kzalloc(sizeof(struct dlm_nodes), GFP_KERNEL); + sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS); + gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS); + nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS); if (!sp || !gps || !nds) goto fail; @@ -536,7 +536,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name) { struct dlm_comm *cm; - cm = kzalloc(sizeof(struct dlm_comm), GFP_KERNEL); + cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS); if (!cm) return ERR_PTR(-ENOMEM); @@ -569,7 +569,7 @@ static struct config_item *make_node(struct config_group *g, const char *name) struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); struct dlm_node *nd; - nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL); + nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS); if (!nd) return ERR_PTR(-ENOMEM); @@ -705,7 +705,7 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len) if (cm->addr_count >= DLM_MAX_ADDR_COUNT) return -ENOSPC; - addr = kzalloc(sizeof(*addr), GFP_KERNEL); + addr = kzalloc(sizeof(*addr), GFP_NOFS); if (!addr) return -ENOMEM; @@ -868,7 +868,7 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, ids_count = sp->members_count; - ids = kcalloc(ids_count, sizeof(int), GFP_KERNEL); + ids = kcalloc(ids_count, sizeof(int), GFP_NOFS); if (!ids) { rv = -ENOMEM; goto out; @@ -886,7 +886,7 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, if (!new_count) goto out_ids; - new = kcalloc(new_count, sizeof(int), GFP_KERNEL); + new = kcalloc(new_count, sizeof(int), GFP_NOFS); if (!new) { kfree(ids); rv = -ENOMEM; diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 1c8bb8c3a82..375a2359b3b 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -404,7 +404,7 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos) if (bucket >= ls->ls_rsbtbl_size) return NULL; - ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_KERNEL); + ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_NOFS); if (!ri) return NULL; if (n == 0) diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index c4dfa1dcc86..7b84c1dbc82 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -49,8 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len) spin_unlock(&ls->ls_recover_list_lock); if (!found) - de = kzalloc(sizeof(struct dlm_direntry) + len, - ls->ls_allocation); + de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_NOFS); return de; } @@ -212,7 +211,7 @@ int dlm_recover_directory(struct dlm_ls *ls) dlm_dir_clear(ls); - last_name = kmalloc(DLM_RESNAME_MAXLEN, ls->ls_allocation); + last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS); if (!last_name) goto out; @@ -323,7 +322,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name, if (namelen > DLM_RESNAME_MAXLEN) return -EINVAL; - de = kzalloc(sizeof(struct dlm_direntry) + namelen, ls->ls_allocation); + de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_NOFS); if (!de) return -ENOMEM; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index d01ca0a711d..826d3dc6e0a 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -473,7 +473,6 @@ struct dlm_ls { int ls_low_nodeid; int ls_total_weight; int *ls_node_array; - gfp_t ls_allocation; struct dlm_rsb ls_stub_rsb; /* for returning errors */ struct dlm_lkb ls_stub_lkb; /* for returning errors */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index eb507c453c5..9c0c1db1e10 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -2689,7 +2689,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len, pass into lowcomms_commit and a message buffer (mb) that we write our data into */ - mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb); + mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb); if (!mh) return -ENOBUFS; @@ -4512,7 +4512,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, } if (flags & DLM_LKF_VALBLK) { - ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL); + ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS); if (!ua->lksb.sb_lvbptr) { kfree(ua); __put_lkb(ls, lkb); @@ -4582,7 +4582,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, ua = lkb->lkb_ua; if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) { - ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL); + ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS); if (!ua->lksb.sb_lvbptr) { error = -ENOMEM; goto out_put; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index d489fcc8671..c010ecfc0d2 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -430,7 +430,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, error = -ENOMEM; - ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL); + ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS); if (!ls) goto out; memcpy(ls->ls_name, name, namelen); @@ -443,11 +443,6 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, if (flags & DLM_LSFL_TIMEWARN) set_bit(LSFL_TIMEWARN, &ls->ls_flags); - if (flags & DLM_LSFL_FS) - ls->ls_allocation = GFP_NOFS; - else - ls->ls_allocation = GFP_KERNEL; - /* ls_exflags are forced to match among nodes, and we don't need to require all nodes to have some flags set */ ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | @@ -456,7 +451,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, size = dlm_config.ci_rsbtbl_size; ls->ls_rsbtbl_size = size; - ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL); + ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_NOFS); if (!ls->ls_rsbtbl) goto out_lsfree; for (i = 0; i < size; i++) { @@ -468,7 +463,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, size = dlm_config.ci_lkbtbl_size; ls->ls_lkbtbl_size = size; - ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL); + ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_NOFS); if (!ls->ls_lkbtbl) goto out_rsbfree; for (i = 0; i < size; i++) { @@ -480,7 +475,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, size = dlm_config.ci_dirtbl_size; ls->ls_dirtbl_size = size; - ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL); + ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_NOFS); if (!ls->ls_dirtbl) goto out_lkbfree; for (i = 0; i < size; i++) { @@ -527,7 +522,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, mutex_init(&ls->ls_requestqueue_mutex); mutex_init(&ls->ls_clear_proc_locks); - ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); + ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); if (!ls->ls_recover_buf) goto out_dirfree; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 70736eb4b51..52cab160893 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1060,7 +1060,7 @@ static void init_local(void) if (dlm_our_addr(&sas, i)) break; - addr = kmalloc(sizeof(*addr), GFP_KERNEL); + addr = kmalloc(sizeof(*addr), GFP_NOFS); if (!addr) break; memcpy(addr, &sas, sizeof(*addr)); @@ -1099,7 +1099,7 @@ static int sctp_listen_for_all(void) struct sockaddr_storage localaddr; struct sctp_event_subscribe subscribe; int result = -EINVAL, num = 1, i, addr_len; - struct connection *con = nodeid2con(0, GFP_KERNEL); + struct connection *con = nodeid2con(0, GFP_NOFS); int bufsize = NEEDED_RMEM; if (!con) @@ -1171,7 +1171,7 @@ out: static int tcp_listen_for_all(void) { struct socket *sock = NULL; - struct connection *con = nodeid2con(0, GFP_KERNEL); + struct connection *con = nodeid2con(0, GFP_NOFS); int result = -EINVAL; if (!con) diff --git a/fs/dlm/member.c b/fs/dlm/member.c index b128775913b..84f70bfb0ba 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -48,7 +48,7 @@ static int dlm_add_member(struct dlm_ls *ls, int nodeid) struct dlm_member *memb; int w, error; - memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation); + memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); if (!memb) return -ENOMEM; @@ -143,7 +143,7 @@ static void make_member_array(struct dlm_ls *ls) ls->ls_total_weight = total; - array = kmalloc(sizeof(int) * total, ls->ls_allocation); + array = kmalloc(sizeof(int) * total, GFP_NOFS); if (!array) return; @@ -226,7 +226,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) continue; log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]); - memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation); + memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); if (!memb) return -ENOMEM; memb->nodeid = rv->new[i]; @@ -341,7 +341,7 @@ int dlm_ls_start(struct dlm_ls *ls) int *ids = NULL, *new = NULL; int error, ids_count = 0, new_count = 0; - rv = kzalloc(sizeof(struct dlm_recover), ls->ls_allocation); + rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS); if (!rv) return -ENOMEM; diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index c1775b84eba..8e0d00db004 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls) { char *p; - p = kzalloc(ls->ls_lvblen, ls->ls_allocation); + p = kzalloc(ls->ls_lvblen, GFP_NOFS); return p; } @@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen) DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); - r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation); + r = kzalloc(sizeof(*r) + namelen, GFP_NOFS); return r; } @@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls) { struct dlm_lkb *lkb; - lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation); + lkb = kmem_cache_zalloc(lkb_cache, GFP_NOFS); return lkb; } diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 55ea369f43a..052095cd592 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -26,7 +26,7 @@ static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size) struct sk_buff *skb; void *data; - skb = genlmsg_new(size, GFP_KERNEL); + skb = genlmsg_new(size, GFP_NOFS); if (!skb) return -ENOMEM; diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index b540aa5d1f6..b5f89aef3b2 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -82,7 +82,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, if (!ls) return -EINVAL; - xop = kzalloc(sizeof(*xop), GFP_KERNEL); + xop = kzalloc(sizeof(*xop), GFP_NOFS); if (!xop) { rv = -ENOMEM; goto out; @@ -211,7 +211,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, if (!ls) return -EINVAL; - op = kzalloc(sizeof(*op), GFP_KERNEL); + op = kzalloc(sizeof(*op), GFP_NOFS); if (!op) { rv = -ENOMEM; goto out; @@ -266,7 +266,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, if (!ls) return -EINVAL; - op = kzalloc(sizeof(*op), GFP_KERNEL); + op = kzalloc(sizeof(*op), GFP_NOFS); if (!op) { rv = -ENOMEM; goto out; diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 67522c268c1..3c83a49a48a 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -38,7 +38,7 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, char *mb; int mb_len = sizeof(struct dlm_rcom) + len; - mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb); + mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb); if (!mh) { log_print("create_rcom to %d type %d len %d ENOBUFS", to_nodeid, type, len); diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index 7a2307c0891..a44fa22890e 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -35,7 +35,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) struct rq_entry *e; int length = ms->m_header.h_length - sizeof(struct dlm_message); - e = kmalloc(sizeof(struct rq_entry) + length, ls->ls_allocation); + e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS); if (!e) { log_print("dlm_add_requestqueue: out of memory len %d", length); return; diff --git a/fs/dlm/user.c b/fs/dlm/user.c index ebce994ab0b..e73a4bb572a 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -267,7 +267,7 @@ static int device_user_lock(struct dlm_user_proc *proc, goto out; } - ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL); + ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); if (!ua) goto out; ua->proc = proc; @@ -307,7 +307,7 @@ static int device_user_unlock(struct dlm_user_proc *proc, if (!ls) return -ENOENT; - ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL); + ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); if (!ua) goto out; ua->proc = proc; @@ -352,7 +352,7 @@ static int dlm_device_register(struct dlm_ls *ls, char *name) error = -ENOMEM; len = strlen(name) + strlen(name_prefix) + 2; - ls->ls_device.name = kzalloc(len, GFP_KERNEL); + ls->ls_device.name = kzalloc(len, GFP_NOFS); if (!ls->ls_device.name) goto fail; @@ -520,7 +520,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, #endif return -EINVAL; - kbuf = kzalloc(count + 1, GFP_KERNEL); + kbuf = kzalloc(count + 1, GFP_NOFS); if (!kbuf) return -ENOMEM; @@ -546,7 +546,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, /* add 1 after namelen so that the name string is terminated */ kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1, - GFP_KERNEL); + GFP_NOFS); if (!kbuf) { kfree(k32buf); return -ENOMEM; @@ -648,7 +648,7 @@ static int device_open(struct inode *inode, struct file *file) if (!ls) return -ENOENT; - proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL); + proc = kzalloc(sizeof(struct dlm_user_proc), GFP_NOFS); if (!proc) { dlm_put_lockspace(ls); return -ENOMEM; diff --git a/fs/exec.c b/fs/exec.c index c0c636e34f6..623a5cc3076 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -923,6 +923,15 @@ char *get_task_comm(char *buf, struct task_struct *tsk) void set_task_comm(struct task_struct *tsk, char *buf) { task_lock(tsk); + + /* + * Threads may access current->comm without holding + * the task lock, so write the string carefully. + * Readers without a lock may see incomplete new + * names but are safe from non-terminating string reads. + */ + memset(tsk->comm, 0, TASK_COMM_LEN); + wmb(); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); perf_event_comm(tsk); diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index cc2d22db119..2d0f757fda3 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -12,5 +12,5 @@ # Kbuild - Gets included from the Kernels Makefile and build system # -exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o +exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/common.h b/fs/exofs/common.h index c6718e4817f..b1b178e6171 100644 --- a/fs/exofs/common.h +++ b/fs/exofs/common.h @@ -49,6 +49,7 @@ #define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */ #define EXOFS_OBJ_OFF 0x10000 /* offset for objects */ #define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */ +#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */ #define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ /* exofs Application specific page/attribute */ @@ -78,17 +79,67 @@ enum { #define EXOFS_SUPER_MAGIC 0x5DF5 /* - * The file system control block - stored in an object's data (mainly, the one - * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored - * on disk. Right now it just has a magic value, which is basically a sanity - * check on our ability to communicate with the object store. + * The file system control block - stored in object EXOFS_SUPER_ID's data. + * This is where the in-memory superblock is stored on disk. */ +enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1}; struct exofs_fscb { __le64 s_nextid; /* Highest object ID used */ - __le32 s_numfiles; /* Number of files on fs */ + __le64 s_numfiles; /* Number of files on fs */ + __le32 s_version; /* == EXOFS_FSCB_VER */ __le16 s_magic; /* Magic signature */ __le16 s_newfs; /* Non-zero if this is a new fs */ -}; + + /* From here on it's a static part, only written by mkexofs */ + __le64 s_dev_table_oid; /* Resurved, not used */ + __le64 s_dev_table_count; /* == 0 means no dev_table */ +} __packed; + +/* + * Describes the raid used in the FS. It is part of the device table. + * This here is taken from the pNFS-objects definition. In exofs we + * use one raid policy through-out the filesystem. (NOTE: the funny + * alignment at begining. We take care of it at exofs_device_table. + */ +struct exofs_dt_data_map { + __le32 cb_num_comps; + __le64 cb_stripe_unit; + __le32 cb_group_width; + __le32 cb_group_depth; + __le32 cb_mirror_cnt; + __le32 cb_raid_algorithm; +} __packed; + +/* + * This is an osd device information descriptor. It is a single entry in + * the exofs device table. It describes an osd target lun which + * contains data belonging to this FS. (Same partition_id on all devices) + */ +struct exofs_dt_device_info { + __le32 systemid_len; + u8 systemid[OSD_SYSTEMID_LEN]; + __le64 long_name_offset; /* If !0 then offset-in-file */ + __le32 osdname_len; /* */ + u8 osdname[44]; /* Embbeded, Ususally an asci uuid */ +} __packed; + +/* + * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data. + * It contains the raid used for this multy-device FS and an array of + * participating devices. + */ +struct exofs_device_table { + __le32 dt_version; /* == EXOFS_DT_VER */ + struct exofs_dt_data_map dt_data_map; /* Raid policy to use */ + + /* Resurved space For future use. Total includeing this: + * (8 * sizeof(le64)) + */ + __le64 __Resurved[4]; + + __le64 dt_num_devices; /* Array size */ + struct exofs_dt_device_info dt_dev_table[]; /* Array of devices */ +} __packed; /**************************************************************************** * inode-related things @@ -155,22 +206,4 @@ enum { (((name_len) + offsetof(struct exofs_dir_entry, name) + \ EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND) -/************************* - * function declarations * - *************************/ -/* osd.c */ -void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], - const struct osd_obj_id *obj); - -int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid); -static inline int exofs_check_ok(struct osd_request *or) -{ - return exofs_check_ok_resid(or, NULL, NULL); -} -int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred); -int exofs_async_op(struct osd_request *or, - osd_req_done_fn *async_done, void *caller_context, u8 *cred); - -int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr); - #endif /*ifndef __EXOFS_COM_H__*/ diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 5ec72e020b2..c35fd462398 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -30,13 +30,17 @@ * along with exofs; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +#ifndef __EXOFS_H__ +#define __EXOFS_H__ #include <linux/fs.h> #include <linux/time.h> #include "common.h" -#ifndef __EXOFS_H__ -#define __EXOFS_H__ +/* FIXME: Remove once pnfs hits mainline + * #include <linux/exportfs/pnfs_osd_xdr.h> + */ +#include "pnfs.h" #define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) @@ -55,7 +59,7 @@ * our extension to the in-memory superblock */ struct exofs_sb_info { - struct osd_dev *s_dev; /* returned by get_osd_dev */ + struct exofs_fscb s_fscb; /* Written often, pre-allocate*/ osd_id s_pid; /* partition ID of file system*/ int s_timeout; /* timeout for OSD operations */ uint64_t s_nextid; /* highest object ID used */ @@ -63,7 +67,11 @@ struct exofs_sb_info { spinlock_t s_next_gen_lock; /* spinlock for gen # update */ u32 s_next_generation; /* next gen # to use */ atomic_t s_curr_pending; /* number of pending commands */ - uint8_t s_cred[OSD_CAP_LEN]; /* all-powerful credential */ + uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ + + struct pnfs_osd_data_map data_map; /* Default raid to use */ + unsigned s_numdevs; /* Num of devices in array */ + struct osd_dev *s_ods[1]; /* Variable length, minimum 1 */ }; /* @@ -79,6 +87,50 @@ struct exofs_i_info { struct inode vfs_inode; /* normal in-memory inode */ }; +static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) +{ + return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF; +} + +struct exofs_io_state; +typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private); + +struct exofs_io_state { + struct kref kref; + + void *private; + exofs_io_done_fn done; + + struct exofs_sb_info *sbi; + struct osd_obj_id obj; + u8 *cred; + + /* Global read/write IO*/ + loff_t offset; + unsigned long length; + void *kern_buff; + struct bio *bio; + + /* Attributes */ + unsigned in_attr_len; + struct osd_attr *in_attr; + unsigned out_attr_len; + struct osd_attr *out_attr; + + /* Variable array of size numdevs */ + unsigned numdevs; + struct exofs_per_dev_state { + struct osd_request *or; + struct bio *bio; + } per_dev[]; +}; + +static inline unsigned exofs_io_state_size(unsigned numdevs) +{ + return sizeof(struct exofs_io_state) + + sizeof(struct exofs_per_dev_state) * numdevs; +} + /* * our inode flags */ @@ -130,6 +182,42 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode) /************************* * function declarations * *************************/ + +/* ios.c */ +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], + const struct osd_obj_id *obj); +int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, + u64 offset, void *p, unsigned length); + +int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios); +void exofs_put_io_state(struct exofs_io_state *ios); + +int exofs_check_io(struct exofs_io_state *ios, u64 *resid); + +int exofs_sbi_create(struct exofs_io_state *ios); +int exofs_sbi_remove(struct exofs_io_state *ios); +int exofs_sbi_write(struct exofs_io_state *ios); +int exofs_sbi_read(struct exofs_io_state *ios); + +int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr); + +int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len); +static inline int exofs_oi_write(struct exofs_i_info *oi, + struct exofs_io_state *ios) +{ + ios->obj.id = exofs_oi_objno(oi); + ios->cred = oi->i_cred; + return exofs_sbi_write(ios); +} + +static inline int exofs_oi_read(struct exofs_i_info *oi, + struct exofs_io_state *ios) +{ + ios->obj.id = exofs_oi_objno(oi); + ios->cred = oi->i_cred; + return exofs_sbi_read(ios); +} + /* inode.c */ void exofs_truncate(struct inode *inode); int exofs_setattr(struct dentry *, struct iattr *); @@ -169,6 +257,7 @@ extern const struct file_operations exofs_file_operations; /* inode.c */ extern const struct address_space_operations exofs_aops; +extern const struct osd_attr g_attr_logical_length; /* namei.c */ extern const struct inode_operations exofs_dir_inode_operations; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 6c10f747669..698a8636d39 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -37,15 +37,18 @@ #include "exofs.h" -#ifdef CONFIG_EXOFS_DEBUG -# define EXOFS_DEBUG_OBJ_ISIZE 1 -#endif +#define EXOFS_DBGMSG2(M...) do {} while (0) + +enum { BIO_MAX_PAGES_KMALLOC = + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), +}; struct page_collect { struct exofs_sb_info *sbi; struct request_queue *req_q; struct inode *inode; unsigned expected_pages; + struct exofs_io_state *ios; struct bio *bio; unsigned nr_pages; @@ -54,22 +57,23 @@ struct page_collect { }; static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, - struct inode *inode) + struct inode *inode) { struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; pcol->sbi = sbi; - pcol->req_q = osd_request_queue(sbi->s_dev); + /* Create master bios on first Q, later on cloning, each clone will be + * allocated on it's destination Q + */ + pcol->req_q = osd_request_queue(sbi->s_ods[0]); pcol->inode = inode; pcol->expected_pages = expected_pages; + pcol->ios = NULL; pcol->bio = NULL; pcol->nr_pages = 0; pcol->length = 0; pcol->pg_first = -1; - - EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino, - expected_pages); } static void _pcol_reset(struct page_collect *pcol) @@ -80,35 +84,49 @@ static void _pcol_reset(struct page_collect *pcol) pcol->nr_pages = 0; pcol->length = 0; pcol->pg_first = -1; - EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n", - pcol->inode->i_ino, pcol->expected_pages); + pcol->ios = NULL; /* this is probably the end of the loop but in writes * it might not end here. don't be left with nothing */ if (!pcol->expected_pages) - pcol->expected_pages = 128; + pcol->expected_pages = BIO_MAX_PAGES_KMALLOC; } static int pcol_try_alloc(struct page_collect *pcol) { - int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES); + int pages = min_t(unsigned, pcol->expected_pages, + BIO_MAX_PAGES_KMALLOC); + + if (!pcol->ios) { /* First time allocate io_state */ + int ret = exofs_get_io_state(pcol->sbi, &pcol->ios); + + if (ret) + return ret; + } for (; pages; pages >>= 1) { - pcol->bio = bio_alloc(GFP_KERNEL, pages); + pcol->bio = bio_kmalloc(GFP_KERNEL, pages); if (likely(pcol->bio)) return 0; } - EXOFS_ERR("Failed to kcalloc expected_pages=%u\n", + EXOFS_ERR("Failed to bio_kmalloc expected_pages=%u\n", pcol->expected_pages); return -ENOMEM; } static void pcol_free(struct page_collect *pcol) { - bio_put(pcol->bio); - pcol->bio = NULL; + if (pcol->bio) { + bio_put(pcol->bio); + pcol->bio = NULL; + } + + if (pcol->ios) { + exofs_put_io_state(pcol->ios); + pcol->ios = NULL; + } } static int pcol_add_page(struct page_collect *pcol, struct page *page, @@ -161,22 +179,17 @@ static void update_write_page(struct page *page, int ret) /* Called at the end of reads, to optionally unlock pages and update their * status. */ -static int __readpages_done(struct osd_request *or, struct page_collect *pcol, - bool do_unlock) +static int __readpages_done(struct page_collect *pcol, bool do_unlock) { struct bio_vec *bvec; int i; u64 resid; u64 good_bytes; u64 length = 0; - int ret = exofs_check_ok_resid(or, &resid, NULL); - - osd_end_request(or); + int ret = exofs_check_io(pcol->ios, &resid); if (likely(!ret)) good_bytes = pcol->length; - else if (!resid) - good_bytes = 0; else good_bytes = pcol->length - resid; @@ -198,7 +211,7 @@ static int __readpages_done(struct osd_request *or, struct page_collect *pcol, else page_stat = ret; - EXOFS_DBGMSG(" readpages_done(0x%lx, 0x%lx) %s\n", + EXOFS_DBGMSG2(" readpages_done(0x%lx, 0x%lx) %s\n", inode->i_ino, page->index, page_stat ? "bad_bytes" : "good_bytes"); @@ -214,13 +227,13 @@ static int __readpages_done(struct osd_request *or, struct page_collect *pcol, } /* callback of async reads */ -static void readpages_done(struct osd_request *or, void *p) +static void readpages_done(struct exofs_io_state *ios, void *p) { struct page_collect *pcol = p; - __readpages_done(or, pcol, true); + __readpages_done(pcol, true); atomic_dec(&pcol->sbi->s_curr_pending); - kfree(p); + kfree(pcol); } static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) @@ -238,17 +251,13 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) unlock_page(page); } - pcol_free(pcol); } static int read_exec(struct page_collect *pcol, bool is_sync) { struct exofs_i_info *oi = exofs_i(pcol->inode); - struct osd_obj_id obj = {pcol->sbi->s_pid, - pcol->inode->i_ino + EXOFS_OBJ_OFF}; - struct osd_request *or = NULL; + struct exofs_io_state *ios = pcol->ios; struct page_collect *pcol_copy = NULL; - loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT; int ret; if (!pcol->bio) @@ -257,17 +266,13 @@ static int read_exec(struct page_collect *pcol, bool is_sync) /* see comment in _readpage() about sync reads */ WARN_ON(is_sync && (pcol->nr_pages != 1)); - or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - ret = -ENOMEM; - goto err; - } - - osd_req_read(or, &obj, i_start, pcol->bio, pcol->length); + ios->bio = pcol->bio; + ios->length = pcol->length; + ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; if (is_sync) { - exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred); - return __readpages_done(or, pcol, false); + exofs_oi_read(oi, pcol->ios); + return __readpages_done(pcol, false); } pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); @@ -277,14 +282,16 @@ static int read_exec(struct page_collect *pcol, bool is_sync) } *pcol_copy = *pcol; - ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred); + ios->done = readpages_done; + ios->private = pcol_copy; + ret = exofs_oi_read(oi, ios); if (unlikely(ret)) goto err; atomic_inc(&pcol->sbi->s_curr_pending); EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", - obj.id, _LLU(i_start), pcol->length); + ios->obj.id, _LLU(ios->offset), pcol->length); /* pages ownership was passed to pcol_copy */ _pcol_reset(pcol); @@ -293,12 +300,10 @@ static int read_exec(struct page_collect *pcol, bool is_sync) err: if (!is_sync) _unlock_pcol_pages(pcol, ret, READ); - else /* Pages unlocked by caller in sync mode only free bio */ - pcol_free(pcol); + + pcol_free(pcol); kfree(pcol_copy); - if (or) - osd_end_request(or); return ret; } @@ -370,12 +375,12 @@ try_again: if (len != PAGE_CACHE_SIZE) zero_user(page, len, PAGE_CACHE_SIZE - len); - EXOFS_DBGMSG(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", + EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", inode->i_ino, page->index, len); ret = pcol_add_page(pcol, page, len); if (ret) { - EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p " + EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p " "this_len=0x%zx nr_pages=%u length=0x%lx\n", page, len, pcol->nr_pages, pcol->length); @@ -419,9 +424,8 @@ static int _readpage(struct page *page, bool is_sync) _pcol_init(&pcol, 1, page->mapping->host); - /* readpage_strip might call read_exec(,async) inside at several places - * but this is safe for is_async=0 since read_exec will not do anything - * when we have a single page. + /* readpage_strip might call read_exec(,is_sync==false) at several + * places but not if we have a single page. */ ret = readpage_strip(&pcol, page); if (ret) { @@ -440,8 +444,8 @@ static int exofs_readpage(struct file *file, struct page *page) return _readpage(page, false); } -/* Callback for osd_write. All writes are asynchronouse */ -static void writepages_done(struct osd_request *or, void *p) +/* Callback for osd_write. All writes are asynchronous */ +static void writepages_done(struct exofs_io_state *ios, void *p) { struct page_collect *pcol = p; struct bio_vec *bvec; @@ -449,16 +453,12 @@ static void writepages_done(struct osd_request *or, void *p) u64 resid; u64 good_bytes; u64 length = 0; + int ret = exofs_check_io(ios, &resid); - int ret = exofs_check_ok_resid(or, NULL, &resid); - - osd_end_request(or); atomic_dec(&pcol->sbi->s_curr_pending); if (likely(!ret)) good_bytes = pcol->length; - else if (!resid) - good_bytes = 0; else good_bytes = pcol->length - resid; @@ -482,7 +482,7 @@ static void writepages_done(struct osd_request *or, void *p) update_write_page(page, page_stat); unlock_page(page); - EXOFS_DBGMSG(" writepages_done(0x%lx, 0x%lx) status=%d\n", + EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n", inode->i_ino, page->index, page_stat); length += bvec->bv_len; @@ -496,23 +496,13 @@ static void writepages_done(struct osd_request *or, void *p) static int write_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); - struct osd_obj_id obj = {pcol->sbi->s_pid, - pcol->inode->i_ino + EXOFS_OBJ_OFF}; - struct osd_request *or = NULL; + struct exofs_io_state *ios = pcol->ios; struct page_collect *pcol_copy = NULL; - loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT; int ret; if (!pcol->bio) return 0; - or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("write_exec: Faild to osd_start_request()\n"); - ret = -ENOMEM; - goto err; - } - pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); if (!pcol_copy) { EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n"); @@ -523,16 +513,22 @@ static int write_exec(struct page_collect *pcol) *pcol_copy = *pcol; pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */ - osd_req_write(or, &obj, i_start, pcol_copy->bio, pcol_copy->length); - ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred); + + ios->bio = pcol_copy->bio; + ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT; + ios->length = pcol_copy->length; + ios->done = writepages_done; + ios->private = pcol_copy; + + ret = exofs_oi_write(oi, ios); if (unlikely(ret)) { - EXOFS_ERR("write_exec: exofs_async_op() Faild\n"); + EXOFS_ERR("write_exec: exofs_oi_write() Faild\n"); goto err; } atomic_inc(&pcol->sbi->s_curr_pending); EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", - pcol->inode->i_ino, pcol->pg_first, _LLU(i_start), + pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), pcol->length); /* pages ownership was passed to pcol_copy */ _pcol_reset(pcol); @@ -540,9 +536,9 @@ static int write_exec(struct page_collect *pcol) err: _unlock_pcol_pages(pcol, ret, WRITE); + pcol_free(pcol); kfree(pcol_copy); - if (or) - osd_end_request(or); + return ret; } @@ -586,6 +582,9 @@ static int writepage_strip(struct page *page, if (PageError(page)) ClearPageError(page); unlock_page(page); + EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) " + "outside the limits\n", + inode->i_ino, page->index); return 0; } } @@ -600,6 +599,9 @@ try_again: ret = write_exec(pcol); if (unlikely(ret)) goto fail; + + EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n", + inode->i_ino, page->index); goto try_again; } @@ -609,7 +611,7 @@ try_again: goto fail; } - EXOFS_DBGMSG(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n", + EXOFS_DBGMSG2(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n", inode->i_ino, page->index, len); ret = pcol_add_page(pcol, page, len); @@ -634,6 +636,8 @@ try_again: return 0; fail: + EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n", + inode->i_ino, page->index, ret); set_bit(AS_EIO, &page->mapping->flags); unlock_page(page); return ret; @@ -652,14 +656,17 @@ static int exofs_writepages(struct address_space *mapping, wbc->range_end >> PAGE_CACHE_SHIFT; if (start || end) - expected_pages = min(end - start + 1, 32L); + expected_pages = end - start + 1; else expected_pages = mapping->nrpages; - EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx" - " m->nrpages=%lu start=0x%lx end=0x%lx\n", + if (expected_pages < 32L) + expected_pages = 32L; + + EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx " + "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n", mapping->host->i_ino, wbc->range_start, wbc->range_end, - mapping->nrpages, start, end); + mapping->nrpages, start, end, expected_pages); _pcol_init(&pcol, expected_pages, mapping->host); @@ -771,19 +778,28 @@ static int exofs_get_block(struct inode *inode, sector_t iblock, const struct osd_attr g_attr_logical_length = ATTR_DEF( OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); +static int _do_truncate(struct inode *inode) +{ + struct exofs_i_info *oi = exofs_i(inode); + loff_t isize = i_size_read(inode); + int ret; + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + nobh_truncate_page(inode->i_mapping, isize, exofs_get_block); + + ret = exofs_oi_truncate(oi, (u64)isize); + EXOFS_DBGMSG("(0x%lx) size=0x%llx\n", inode->i_ino, isize); + return ret; +} + /* * Truncate a file to the specified size - all we have to do is set the size * attribute. We make sure the object exists first. */ void exofs_truncate(struct inode *inode) { - struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; struct exofs_i_info *oi = exofs_i(inode); - struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF}; - struct osd_request *or; - struct osd_attr attr; - loff_t isize = i_size_read(inode); - __be64 newsize; int ret; if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) @@ -793,22 +809,6 @@ void exofs_truncate(struct inode *inode) return; if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - - nobh_truncate_page(inode->i_mapping, isize, exofs_get_block); - - or = osd_start_request(sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n"); - goto fail; - } - - osd_req_set_attributes(or, &obj); - - newsize = cpu_to_be64((u64)isize); - attr = g_attr_logical_length; - attr.val_ptr = &newsize; - osd_req_add_set_attr_list(or, &attr, 1); /* if we are about to truncate an object, and it hasn't been * created yet, wait @@ -816,8 +816,7 @@ void exofs_truncate(struct inode *inode) if (unlikely(wait_obj_created(oi))) goto fail; - ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); - osd_end_request(or); + ret = _do_truncate(inode); if (ret) goto fail; @@ -847,65 +846,62 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr) /* * Read an inode from the OSD, and return it as is. We also return the size - * attribute in the 'sanity' argument if we got compiled with debugging turned - * on. + * attribute in the 'obj_size' argument. */ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, - struct exofs_fcb *inode, uint64_t *sanity) + struct exofs_fcb *inode, uint64_t *obj_size) { struct exofs_sb_info *sbi = sb->s_fs_info; - struct osd_request *or; - struct osd_attr attr; - struct osd_obj_id obj = {sbi->s_pid, - oi->vfs_inode.i_ino + EXOFS_OBJ_OFF}; + struct osd_attr attrs[2]; + struct exofs_io_state *ios; int ret; - exofs_make_credential(oi->i_cred, &obj); - - or = osd_start_request(sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n"); - return -ENOMEM; + *obj_size = ~0; + ret = exofs_get_io_state(sbi, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + return ret; } - osd_req_get_attributes(or, &obj); - /* we need the inode attribute */ - osd_req_add_get_attr_list(or, &g_attr_inode_data, 1); + ios->obj.id = exofs_oi_objno(oi); + exofs_make_credential(oi->i_cred, &ios->obj); + ios->cred = oi->i_cred; -#ifdef EXOFS_DEBUG_OBJ_ISIZE - /* we get the size attributes to do a sanity check */ - osd_req_add_get_attr_list(or, &g_attr_logical_length, 1); -#endif + attrs[0] = g_attr_inode_data; + attrs[1] = g_attr_logical_length; + ios->in_attr = attrs; + ios->in_attr_len = ARRAY_SIZE(attrs); - ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); + ret = exofs_sbi_read(ios); if (ret) goto out; - attr = g_attr_inode_data; - ret = extract_attr_from_req(or, &attr); + ret = extract_attr_from_ios(ios, &attrs[0]); if (ret) { - EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n"); + EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); goto out; } + WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); + memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE); - WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE); - memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE); - -#ifdef EXOFS_DEBUG_OBJ_ISIZE - attr = g_attr_logical_length; - ret = extract_attr_from_req(or, &attr); + ret = extract_attr_from_ios(ios, &attrs[1]); if (ret) { - EXOFS_ERR("ERROR: extract attr from or failed\n"); + EXOFS_ERR("%s: extract_attr of logical_length failed\n", + __func__); goto out; } - *sanity = get_unaligned_be64(attr.val_ptr); -#endif + *obj_size = get_unaligned_be64(attrs[1].val_ptr); out: - osd_end_request(or); + exofs_put_io_state(ios); return ret; } +static void __oi_init(struct exofs_i_info *oi) +{ + init_waitqueue_head(&oi->i_wq); + oi->i_flags = 0; +} /* * Fill in an inode read from the OSD and set it up for use */ @@ -914,7 +910,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) struct exofs_i_info *oi; struct exofs_fcb fcb; struct inode *inode; - uint64_t uninitialized_var(sanity); + uint64_t obj_size; int ret; inode = iget_locked(sb, ino); @@ -923,13 +919,13 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) if (!(inode->i_state & I_NEW)) return inode; oi = exofs_i(inode); + __oi_init(oi); /* read the inode from the osd */ - ret = exofs_get_inode(sb, oi, &fcb, &sanity); + ret = exofs_get_inode(sb, oi, &fcb, &obj_size); if (ret) goto bad_inode; - init_waitqueue_head(&oi->i_wq); set_obj_created(oi); /* copy stuff from on-disk struct to in-memory struct */ @@ -947,14 +943,12 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) inode->i_blkbits = EXOFS_BLKSHIFT; inode->i_generation = le32_to_cpu(fcb.i_generation); -#ifdef EXOFS_DEBUG_OBJ_ISIZE - if ((inode->i_size != sanity) && + if ((inode->i_size != obj_size) && (!exofs_inode_is_fast_symlink(inode))) { - EXOFS_ERR("WARNING: Size of object from inode and " - "attributes differ (%lld != %llu)\n", - inode->i_size, _LLU(sanity)); + EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n", + inode->i_size, _LLU(obj_size)); + /* FIXME: call exofs_inode_recovery() */ } -#endif oi->i_dir_start_lookup = 0; @@ -1020,23 +1014,30 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi) * set the obj_created flag so that other methods know that the object exists on * the OSD. */ -static void create_done(struct osd_request *or, void *p) +static void create_done(struct exofs_io_state *ios, void *p) { struct inode *inode = p; struct exofs_i_info *oi = exofs_i(inode); struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; int ret; - ret = exofs_check_ok(or); - osd_end_request(or); + ret = exofs_check_io(ios, NULL); + exofs_put_io_state(ios); + atomic_dec(&sbi->s_curr_pending); if (unlikely(ret)) { EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx", - _LLU(sbi->s_pid), _LLU(inode->i_ino + EXOFS_OBJ_OFF)); - make_bad_inode(inode); - } else - set_obj_created(oi); + _LLU(exofs_oi_objno(oi)), _LLU(sbi->s_pid)); + /*TODO: When FS is corrupted creation can fail, object already + * exist. Get rid of this asynchronous creation, if exist + * increment the obj counter and try the next object. Until we + * succeed. All these dangling objects will be made into lost + * files by chkfs.exofs + */ + } + + set_obj_created(oi); atomic_dec(&inode->i_count); wake_up(&oi->i_wq); @@ -1051,8 +1052,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) struct inode *inode; struct exofs_i_info *oi; struct exofs_sb_info *sbi; - struct osd_request *or; - struct osd_obj_id obj; + struct exofs_io_state *ios; int ret; sb = dir->i_sb; @@ -1061,8 +1061,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) return ERR_PTR(-ENOMEM); oi = exofs_i(inode); + __oi_init(oi); - init_waitqueue_head(&oi->i_wq); set_obj_2bcreated(oi); sbi = sb->s_fs_info; @@ -1089,28 +1089,28 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) mark_inode_dirty(inode); - obj.partition = sbi->s_pid; - obj.id = inode->i_ino + EXOFS_OBJ_OFF; - exofs_make_credential(oi->i_cred, &obj); - - or = osd_start_request(sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("exofs_new_inode: osd_start_request failed\n"); - return ERR_PTR(-ENOMEM); + ret = exofs_get_io_state(sbi, &ios); + if (unlikely(ret)) { + EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); + return ERR_PTR(ret); } - osd_req_create_object(or, &obj); + ios->obj.id = exofs_oi_objno(oi); + exofs_make_credential(oi->i_cred, &ios->obj); /* increment the refcount so that the inode will still be around when we * reach the callback */ atomic_inc(&inode->i_count); - ret = exofs_async_op(or, create_done, inode, oi->i_cred); + ios->done = create_done; + ios->private = inode; + ios->cred = oi->i_cred; + ret = exofs_sbi_create(ios); if (ret) { atomic_dec(&inode->i_count); - osd_end_request(or); - return ERR_PTR(-EIO); + exofs_put_io_state(ios); + return ERR_PTR(ret); } atomic_inc(&sbi->s_curr_pending); @@ -1128,11 +1128,11 @@ struct updatei_args { /* * Callback function from exofs_update_inode(). */ -static void updatei_done(struct osd_request *or, void *p) +static void updatei_done(struct exofs_io_state *ios, void *p) { struct updatei_args *args = p; - osd_end_request(or); + exofs_put_io_state(ios); atomic_dec(&args->sbi->s_curr_pending); @@ -1148,8 +1148,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync) struct exofs_i_info *oi = exofs_i(inode); struct super_block *sb = inode->i_sb; struct exofs_sb_info *sbi = sb->s_fs_info; - struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF}; - struct osd_request *or; + struct exofs_io_state *ios; struct osd_attr attr; struct exofs_fcb *fcb; struct updatei_args *args; @@ -1186,18 +1185,16 @@ static int exofs_update_inode(struct inode *inode, int do_sync) } else memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); - or = osd_start_request(sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n"); - ret = -ENOMEM; + ret = exofs_get_io_state(sbi, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); goto free_args; } - osd_req_set_attributes(or, &obj); - attr = g_attr_inode_data; attr.val_ptr = fcb; - osd_req_add_set_attr_list(or, &attr, 1); + ios->out_attr_len = 1; + ios->out_attr = &attr; if (!obj_created(oi)) { EXOFS_DBGMSG("!obj_created\n"); @@ -1206,22 +1203,19 @@ static int exofs_update_inode(struct inode *inode, int do_sync) EXOFS_DBGMSG("wait_event done\n"); } - if (do_sync) { - ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); - osd_end_request(or); - goto free_args; - } else { + if (!do_sync) { args->sbi = sbi; + ios->done = updatei_done; + ios->private = args; + } - ret = exofs_async_op(or, updatei_done, args, oi->i_cred); - if (ret) { - osd_end_request(or); - goto free_args; - } + ret = exofs_oi_write(oi, ios); + if (!do_sync && !ret) { atomic_inc(&sbi->s_curr_pending); goto out; /* deallocation in updatei_done */ } + exofs_put_io_state(ios); free_args: kfree(args); out: @@ -1238,11 +1232,12 @@ int exofs_write_inode(struct inode *inode, int wait) * Callback function from exofs_delete_inode() - don't have much cleaning up to * do. */ -static void delete_done(struct osd_request *or, void *p) +static void delete_done(struct exofs_io_state *ios, void *p) { - struct exofs_sb_info *sbi; - osd_end_request(or); - sbi = p; + struct exofs_sb_info *sbi = p; + + exofs_put_io_state(ios); + atomic_dec(&sbi->s_curr_pending); } @@ -1256,8 +1251,7 @@ void exofs_delete_inode(struct inode *inode) struct exofs_i_info *oi = exofs_i(inode); struct super_block *sb = inode->i_sb; struct exofs_sb_info *sbi = sb->s_fs_info; - struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF}; - struct osd_request *or; + struct exofs_io_state *ios; int ret; truncate_inode_pages(&inode->i_data, 0); @@ -1274,25 +1268,26 @@ void exofs_delete_inode(struct inode *inode) clear_inode(inode); - or = osd_start_request(sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n"); + ret = exofs_get_io_state(sbi, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); return; } - osd_req_remove_object(or, &obj); - /* if we are deleting an obj that hasn't been created yet, wait */ if (!obj_created(oi)) { BUG_ON(!obj_2bcreated(oi)); wait_event(oi->i_wq, obj_created(oi)); } - ret = exofs_async_op(or, delete_done, sbi, oi->i_cred); + ios->obj.id = exofs_oi_objno(oi); + ios->done = delete_done; + ios->private = sbi; + ios->cred = oi->i_cred; + ret = exofs_sbi_remove(ios); if (ret) { - EXOFS_ERR( - "ERROR: @exofs_delete_inode exofs_async_op failed\n"); - osd_end_request(or); + EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__); + exofs_put_io_state(ios); return; } atomic_inc(&sbi->s_curr_pending); diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c new file mode 100644 index 00000000000..5bad01fa1f9 --- /dev/null +++ b/fs/exofs/ios.c @@ -0,0 +1,421 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <scsi/scsi_device.h> + +#include "exofs.h" + +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) +{ + osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); +} + +int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, + u64 offset, void *p, unsigned length) +{ + struct osd_request *or = osd_start_request(od, GFP_KERNEL); +/* struct osd_sense_info osi = {.key = 0};*/ + int ret; + + if (unlikely(!or)) { + EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); + return -ENOMEM; + } + ret = osd_req_read_kern(or, obj, offset, p, length); + if (unlikely(ret)) { + EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__); + goto out; + } + + ret = osd_finalize_request(or, 0, cred, NULL); + if (unlikely(ret)) { + EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret); + goto out; + } + + ret = osd_execute_request(or); + if (unlikely(ret)) + EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); + /* osd_req_decode_sense(or, ret); */ + +out: + osd_end_request(or); + return ret; +} + +int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios) +{ + struct exofs_io_state *ios; + + /*TODO: Maybe use kmem_cach per sbi of size + * exofs_io_state_size(sbi->s_numdevs) + */ + ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL); + if (unlikely(!ios)) { + *pios = NULL; + return -ENOMEM; + } + + ios->sbi = sbi; + ios->obj.partition = sbi->s_pid; + *pios = ios; + return 0; +} + +void exofs_put_io_state(struct exofs_io_state *ios) +{ + if (ios) { + unsigned i; + + for (i = 0; i < ios->numdevs; i++) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[i]; + + if (per_dev->or) + osd_end_request(per_dev->or); + if (per_dev->bio) + bio_put(per_dev->bio); + } + + kfree(ios); + } +} + +static void _sync_done(struct exofs_io_state *ios, void *p) +{ + struct completion *waiting = p; + + complete(waiting); +} + +static void _last_io(struct kref *kref) +{ + struct exofs_io_state *ios = container_of( + kref, struct exofs_io_state, kref); + + ios->done(ios, ios->private); +} + +static void _done_io(struct osd_request *or, void *p) +{ + struct exofs_io_state *ios = p; + + kref_put(&ios->kref, _last_io); +} + +static int exofs_io_execute(struct exofs_io_state *ios) +{ + DECLARE_COMPLETION_ONSTACK(wait); + bool sync = (ios->done == NULL); + int i, ret; + + if (sync) { + ios->done = _sync_done; + ios->private = &wait; + } + + for (i = 0; i < ios->numdevs; i++) { + struct osd_request *or = ios->per_dev[i].or; + if (unlikely(!or)) + continue; + + ret = osd_finalize_request(or, 0, ios->cred, NULL); + if (unlikely(ret)) { + EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", + ret); + return ret; + } + } + + kref_init(&ios->kref); + + for (i = 0; i < ios->numdevs; i++) { + struct osd_request *or = ios->per_dev[i].or; + if (unlikely(!or)) + continue; + + kref_get(&ios->kref); + osd_execute_request_async(or, _done_io, ios); + } + + kref_put(&ios->kref, _last_io); + ret = 0; + + if (sync) { + wait_for_completion(&wait); + ret = exofs_check_io(ios, NULL); + } + return ret; +} + +int exofs_check_io(struct exofs_io_state *ios, u64 *resid) +{ + enum osd_err_priority acumulated_osd_err = 0; + int acumulated_lin_err = 0; + int i; + + for (i = 0; i < ios->numdevs; i++) { + struct osd_sense_info osi; + int ret = osd_req_decode_sense(ios->per_dev[i].or, &osi); + + if (likely(!ret)) + continue; + + if (unlikely(ret == -EFAULT)) { + EXOFS_DBGMSG("%s: EFAULT Need page clear\n", __func__); + /*FIXME: All the pages in this device range should: + * clear_highpage(page); + */ + } + + if (osi.osd_err_pri >= acumulated_osd_err) { + acumulated_osd_err = osi.osd_err_pri; + acumulated_lin_err = ret; + } + } + + /* TODO: raid specific residual calculations */ + if (resid) { + if (likely(!acumulated_lin_err)) + *resid = 0; + else + *resid = ios->length; + } + + return acumulated_lin_err; +} + +int exofs_sbi_create(struct exofs_io_state *ios) +{ + int i, ret; + + for (i = 0; i < ios->sbi->s_numdevs; i++) { + struct osd_request *or; + + or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + osd_req_create_object(or, &ios->obj); + } + ret = exofs_io_execute(ios); + +out: + return ret; +} + +int exofs_sbi_remove(struct exofs_io_state *ios) +{ + int i, ret; + + for (i = 0; i < ios->sbi->s_numdevs; i++) { + struct osd_request *or; + + or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + osd_req_remove_object(or, &ios->obj); + } + ret = exofs_io_execute(ios); + +out: + return ret; +} + +int exofs_sbi_write(struct exofs_io_state *ios) +{ + int i, ret; + + for (i = 0; i < ios->sbi->s_numdevs; i++) { + struct osd_request *or; + + or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + if (ios->bio) { + struct bio *bio; + + if (i != 0) { + bio = bio_kmalloc(GFP_KERNEL, + ios->bio->bi_max_vecs); + if (unlikely(!bio)) { + ret = -ENOMEM; + goto out; + } + + __bio_clone(bio, ios->bio); + bio->bi_bdev = NULL; + bio->bi_next = NULL; + ios->per_dev[i].bio = bio; + } else { + bio = ios->bio; + } + + osd_req_write(or, &ios->obj, ios->offset, bio, + ios->length); +/* EXOFS_DBGMSG("write sync=%d\n", sync);*/ + } else if (ios->kern_buff) { + osd_req_write_kern(or, &ios->obj, ios->offset, + ios->kern_buff, ios->length); +/* EXOFS_DBGMSG("write_kern sync=%d\n", sync);*/ + } else { + osd_req_set_attributes(or, &ios->obj); +/* EXOFS_DBGMSG("set_attributes sync=%d\n", sync);*/ + } + + if (ios->out_attr) + osd_req_add_set_attr_list(or, ios->out_attr, + ios->out_attr_len); + + if (ios->in_attr) + osd_req_add_get_attr_list(or, ios->in_attr, + ios->in_attr_len); + } + ret = exofs_io_execute(ios); + +out: + return ret; +} + +int exofs_sbi_read(struct exofs_io_state *ios) +{ + int i, ret; + + for (i = 0; i < 1; i++) { + struct osd_request *or; + unsigned first_dev = (unsigned)ios->obj.id; + + first_dev %= ios->sbi->s_numdevs; + or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + if (ios->bio) { + osd_req_read(or, &ios->obj, ios->offset, ios->bio, + ios->length); +/* EXOFS_DBGMSG("read sync=%d\n", sync);*/ + } else if (ios->kern_buff) { + osd_req_read_kern(or, &ios->obj, ios->offset, + ios->kern_buff, ios->length); +/* EXOFS_DBGMSG("read_kern sync=%d\n", sync);*/ + } else { + osd_req_get_attributes(or, &ios->obj); +/* EXOFS_DBGMSG("get_attributes sync=%d\n", sync);*/ + } + + if (ios->out_attr) + osd_req_add_set_attr_list(or, ios->out_attr, + ios->out_attr_len); + + if (ios->in_attr) + osd_req_add_get_attr_list(or, ios->in_attr, + ios->in_attr_len); + } + ret = exofs_io_execute(ios); + +out: + return ret; +} + +int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) +{ + struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ + void *iter = NULL; + int nelem; + + do { + nelem = 1; + osd_req_decode_get_attr_list(ios->per_dev[0].or, + &cur_attr, &nelem, &iter); + if ((cur_attr.attr_page == attr->attr_page) && + (cur_attr.attr_id == attr->attr_id)) { + attr->len = cur_attr.len; + attr->val_ptr = cur_attr.val_ptr; + return 0; + } + } while (iter); + + return -EIO; +} + +int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) +{ + struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; + struct exofs_io_state *ios; + struct osd_attr attr; + __be64 newsize; + int i, ret; + + if (exofs_get_io_state(sbi, &ios)) + return -ENOMEM; + + ios->obj.id = exofs_oi_objno(oi); + ios->cred = oi->i_cred; + + newsize = cpu_to_be64(size); + attr = g_attr_logical_length; + attr.val_ptr = &newsize; + + for (i = 0; i < sbi->s_numdevs; i++) { + struct osd_request *or; + + or = osd_start_request(sbi->s_ods[i], GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + osd_req_set_attributes(or, &ios->obj); + osd_req_add_set_attr_list(or, &attr, 1); + } + ret = exofs_io_execute(ios); + +out: + exofs_put_io_state(ios); + return ret; +} diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c deleted file mode 100644 index 4372542df28..00000000000 --- a/fs/exofs/osd.c +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <scsi/scsi_device.h> -#include <scsi/osd_sense.h> - -#include "exofs.h" - -int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid) -{ - struct osd_sense_info osi; - int ret = osd_req_decode_sense(or, &osi); - - if (ret) { /* translate to Linux codes */ - if (osi.additional_code == scsi_invalid_field_in_cdb) { - if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE) - ret = -EFAULT; - if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID) - ret = -ENOENT; - else - ret = -EINVAL; - } else if (osi.additional_code == osd_quota_error) - ret = -ENOSPC; - else - ret = -EIO; - } - - /* FIXME: should be include in osd_sense_info */ - if (in_resid) - *in_resid = or->in.req ? or->in.req->resid_len : 0; - - if (out_resid) - *out_resid = or->out.req ? or->out.req->resid_len : 0; - - return ret; -} - -void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) -{ - osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); -} - -/* - * Perform a synchronous OSD operation. - */ -int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential) -{ - int ret; - - or->timeout = timeout; - ret = osd_finalize_request(or, 0, credential, NULL); - if (ret) { - EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret); - return ret; - } - - ret = osd_execute_request(or); - - if (ret) - EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); - /* osd_req_decode_sense(or, ret); */ - return ret; -} - -/* - * Perform an asynchronous OSD operation. - */ -int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done, - void *caller_context, u8 *cred) -{ - int ret; - - ret = osd_finalize_request(or, 0, cred, NULL); - if (ret) { - EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret); - return ret; - } - - ret = osd_execute_request_async(or, async_done, caller_context); - - if (ret) - EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret); - return ret; -} - -int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr) -{ - struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ - void *iter = NULL; - int nelem; - - do { - nelem = 1; - osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter); - if ((cur_attr.attr_page == attr->attr_page) && - (cur_attr.attr_id == attr->attr_id)) { - attr->len = cur_attr.len; - attr->val_ptr = cur_attr.val_ptr; - return 0; - } - } while (iter); - - return -EIO; -} diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h new file mode 100644 index 00000000000..423033addd1 --- /dev/null +++ b/fs/exofs/pnfs.h @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License version 2 as published by the Free + * Software Foundation. + * + */ + +/* FIXME: Remove this file once pnfs hits mainline */ + +#ifndef __EXOFS_PNFS_H__ +#define __EXOFS_PNFS_H__ + +#if defined(CONFIG_PNFS) + + +/* FIXME: move this file to: linux/exportfs/pnfs_osd_xdr.h */ +#include "../nfs/objlayout/pnfs_osd_xdr.h" + +#else /* defined(CONFIG_PNFS) */ + +enum pnfs_iomode { + IOMODE_READ = 1, + IOMODE_RW = 2, + IOMODE_ANY = 3, +}; + +/* Layout Structure */ +enum pnfs_osd_raid_algorithm4 { + PNFS_OSD_RAID_0 = 1, + PNFS_OSD_RAID_4 = 2, + PNFS_OSD_RAID_5 = 3, + PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */ +}; + +struct pnfs_osd_data_map { + u32 odm_num_comps; + u64 odm_stripe_unit; + u32 odm_group_width; + u32 odm_group_depth; + u32 odm_mirror_cnt; + u32 odm_raid_algorithm; +}; + +#endif /* else defined(CONFIG_PNFS) */ + +#endif /* __EXOFS_PNFS_H__ */ diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 9f500dec3b5..a1d1e77b12e 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -203,49 +203,45 @@ int exofs_sync_fs(struct super_block *sb, int wait) { struct exofs_sb_info *sbi; struct exofs_fscb *fscb; - struct osd_request *or; - struct osd_obj_id obj; + struct exofs_io_state *ios; int ret = -ENOMEM; - fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL); - if (!fscb) { - EXOFS_ERR("exofs_write_super: memory allocation failed.\n"); - return -ENOMEM; - } - lock_super(sb); sbi = sb->s_fs_info; + fscb = &sbi->s_fscb; + + ret = exofs_get_io_state(sbi, &ios); + if (ret) + goto out; + + /* Note: We only write the changing part of the fscb. .i.e upto the + * the fscb->s_dev_table_oid member. There is no read-modify-write + * here. + */ + ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); + memset(fscb, 0, ios->length); fscb->s_nextid = cpu_to_le64(sbi->s_nextid); fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles); fscb->s_magic = cpu_to_le16(sb->s_magic); fscb->s_newfs = 0; + fscb->s_version = EXOFS_FSCB_VER; - or = osd_start_request(sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("exofs_write_super: osd_start_request failed.\n"); - goto out; - } + ios->obj.id = EXOFS_SUPER_ID; + ios->offset = 0; + ios->kern_buff = fscb; + ios->cred = sbi->s_cred; - obj.partition = sbi->s_pid; - obj.id = EXOFS_SUPER_ID; - ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb)); + ret = exofs_sbi_write(ios); if (unlikely(ret)) { - EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n"); - goto out; - } - - ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred); - if (unlikely(ret)) { - EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n"); + EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); goto out; } sb->s_dirt = 0; out: - if (or) - osd_end_request(or); + EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); + exofs_put_io_state(ios); unlock_super(sb); - kfree(fscb); return ret; } @@ -257,6 +253,29 @@ static void exofs_write_super(struct super_block *sb) sb->s_dirt = 0; } +static void _exofs_print_device(const char *msg, const char *dev_path, + struct osd_dev *od, u64 pid) +{ + const struct osd_dev_info *odi = osduld_device_info(od); + + printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n", + msg, dev_path ?: "", odi->osdname, _LLU(pid)); +} + +void exofs_free_sbi(struct exofs_sb_info *sbi) +{ + while (sbi->s_numdevs) { + int i = --sbi->s_numdevs; + struct osd_dev *od = sbi->s_ods[i]; + + if (od) { + sbi->s_ods[i] = NULL; + osduld_put_device(od); + } + } + kfree(sbi); +} + /* * This function is called when the vfs is freeing the superblock. We just * need to free our own part. @@ -279,11 +298,182 @@ static void exofs_put_super(struct super_block *sb) msecs_to_jiffies(100)); } - osduld_put_device(sbi->s_dev); - kfree(sb->s_fs_info); + _exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid); + + exofs_free_sbi(sbi); sb->s_fs_info = NULL; } +static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, + struct exofs_device_table *dt) +{ + sbi->data_map.odm_num_comps = + le32_to_cpu(dt->dt_data_map.cb_num_comps); + sbi->data_map.odm_stripe_unit = + le64_to_cpu(dt->dt_data_map.cb_stripe_unit); + sbi->data_map.odm_group_width = + le32_to_cpu(dt->dt_data_map.cb_group_width); + sbi->data_map.odm_group_depth = + le32_to_cpu(dt->dt_data_map.cb_group_depth); + sbi->data_map.odm_mirror_cnt = + le32_to_cpu(dt->dt_data_map.cb_mirror_cnt); + sbi->data_map.odm_raid_algorithm = + le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); + +/* FIXME: Hard coded mirror only for now. if not so do not mount */ + if ((sbi->data_map.odm_num_comps != numdevs) || + (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) || + (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) || + (sbi->data_map.odm_mirror_cnt != (numdevs - 1))) + return -EINVAL; + else + return 0; +} + +/* @odi is valid only as long as @fscb_dev is valid */ +static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, + struct osd_dev_info *odi) +{ + odi->systemid_len = le32_to_cpu(dt_dev->systemid_len); + memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len); + + odi->osdname_len = le32_to_cpu(dt_dev->osdname_len); + odi->osdname = dt_dev->osdname; + + /* FIXME support long names. Will need a _put function */ + if (dt_dev->long_name_offset) + return -EINVAL; + + /* Make sure osdname is printable! + * mkexofs should give us space for a null-terminator else the + * device-table is invalid. + */ + if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname))) + odi->osdname_len = sizeof(dt_dev->osdname) - 1; + dt_dev->osdname[odi->osdname_len] = 0; + + /* If it's all zeros something is bad we read past end-of-obj */ + return !(odi->systemid_len || odi->osdname_len); +} + +static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, + unsigned table_count) +{ + struct exofs_sb_info *sbi = *psbi; + struct osd_dev *fscb_od; + struct osd_obj_id obj = {.partition = sbi->s_pid, + .id = EXOFS_DEVTABLE_ID}; + struct exofs_device_table *dt; + unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + + sizeof(*dt); + unsigned numdevs, i; + int ret; + + dt = kmalloc(table_bytes, GFP_KERNEL); + if (unlikely(!dt)) { + EXOFS_ERR("ERROR: allocating %x bytes for device table\n", + table_bytes); + return -ENOMEM; + } + + fscb_od = sbi->s_ods[0]; + sbi->s_ods[0] = NULL; + sbi->s_numdevs = 0; + ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes); + if (unlikely(ret)) { + EXOFS_ERR("ERROR: reading device table\n"); + goto out; + } + + numdevs = le64_to_cpu(dt->dt_num_devices); + if (unlikely(!numdevs)) { + ret = -EINVAL; + goto out; + } + WARN_ON(table_count != numdevs); + + ret = _read_and_match_data_map(sbi, numdevs, dt); + if (unlikely(ret)) + goto out; + + if (likely(numdevs > 1)) { + unsigned size = numdevs * sizeof(sbi->s_ods[0]); + + sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL); + if (unlikely(!sbi)) { + ret = -ENOMEM; + goto out; + } + memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0])); + *psbi = sbi; + } + + for (i = 0; i < numdevs; i++) { + struct exofs_fscb fscb; + struct osd_dev_info odi; + struct osd_dev *od; + + if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) { + EXOFS_ERR("ERROR: Read all-zeros device entry\n"); + ret = -EINVAL; + goto out; + } + + printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", + i, odi.osdname); + + /* On all devices the device table is identical. The user can + * specify any one of the participating devices on the command + * line. We always keep them in device-table order. + */ + if (fscb_od && osduld_device_same(fscb_od, &odi)) { + sbi->s_ods[i] = fscb_od; + ++sbi->s_numdevs; + fscb_od = NULL; + continue; + } + + od = osduld_info_lookup(&odi); + if (unlikely(IS_ERR(od))) { + ret = PTR_ERR(od); + EXOFS_ERR("ERROR: device requested is not found " + "osd_name-%s =>%d\n", odi.osdname, ret); + goto out; + } + + sbi->s_ods[i] = od; + ++sbi->s_numdevs; + + /* Read the fscb of the other devices to make sure the FS + * partition is there. + */ + ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, + sizeof(fscb)); + if (unlikely(ret)) { + EXOFS_ERR("ERROR: Malformed participating device " + "error reading fscb osd_name-%s\n", + odi.osdname); + goto out; + } + + /* TODO: verify other information is correct and FS-uuid + * matches. Benny what did you say about device table + * generation and old devices? + */ + } + +out: + kfree(dt); + if (unlikely(!ret && fscb_od)) { + EXOFS_ERR( + "ERROR: Bad device-table container device not present\n"); + osduld_put_device(fscb_od); + ret = -EINVAL; + } + + return ret; +} + /* * Read the superblock from the OSD and fill in the fields */ @@ -292,24 +482,25 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) struct inode *root; struct exofs_mountopt *opts = data; struct exofs_sb_info *sbi; /*extended info */ + struct osd_dev *od; /* Master device */ struct exofs_fscb fscb; /*on-disk superblock info */ - struct osd_request *or = NULL; struct osd_obj_id obj; + unsigned table_count; int ret; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; - sb->s_fs_info = sbi; /* use mount options to fill superblock */ - sbi->s_dev = osduld_path_lookup(opts->dev_name); - if (IS_ERR(sbi->s_dev)) { - ret = PTR_ERR(sbi->s_dev); - sbi->s_dev = NULL; + od = osduld_path_lookup(opts->dev_name); + if (IS_ERR(od)) { + ret = PTR_ERR(od); goto free_sbi; } + sbi->s_ods[0] = od; + sbi->s_numdevs = 1; sbi->s_pid = opts->pid; sbi->s_timeout = opts->timeout; @@ -323,35 +514,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sb->s_bdev = NULL; sb->s_dev = 0; - /* read data from on-disk superblock object */ obj.partition = sbi->s_pid; obj.id = EXOFS_SUPER_ID; exofs_make_credential(sbi->s_cred, &obj); - or = osd_start_request(sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - if (!silent) - EXOFS_ERR( - "exofs_fill_super: osd_start_request failed.\n"); - ret = -ENOMEM; - goto free_sbi; - } - ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb)); - if (unlikely(ret)) { - if (!silent) - EXOFS_ERR( - "exofs_fill_super: osd_req_read_kern failed.\n"); - ret = -ENOMEM; - goto free_sbi; - } - - ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred); - if (unlikely(ret)) { - if (!silent) - EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n"); - ret = -EIO; + ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb)); + if (unlikely(ret)) goto free_sbi; - } sb->s_magic = le16_to_cpu(fscb.s_magic); sbi->s_nextid = le64_to_cpu(fscb.s_nextid); @@ -364,12 +533,26 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) ret = -EINVAL; goto free_sbi; } + if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) { + EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n", + EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version)); + ret = -EINVAL; + goto free_sbi; + } /* start generation numbers from a random point */ get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); + table_count = le64_to_cpu(fscb.s_dev_table_count); + if (table_count) { + ret = exofs_read_lookup_dev_table(&sbi, table_count); + if (unlikely(ret)) + goto free_sbi; + } + /* set up operation vectors */ + sb->s_fs_info = sbi; sb->s_op = &exofs_sops; sb->s_export_op = &exofs_export_ops; root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); @@ -395,16 +578,15 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - ret = 0; -out: - if (or) - osd_end_request(or); - return ret; + _exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0], + sbi->s_pid); + return 0; free_sbi: - osduld_put_device(sbi->s_dev); /* NULL safe */ - kfree(sbi); - goto out; + EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", + opts->dev_name, sbi->s_pid, ret); + exofs_free_sbi(sbi); + return ret; } /* @@ -433,7 +615,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct exofs_sb_info *sbi = sb->s_fs_info; - struct osd_obj_id obj = {sbi->s_pid, 0}; + struct exofs_io_state *ios; struct osd_attr attrs[] = { ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), @@ -442,32 +624,33 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) }; uint64_t capacity = ULLONG_MAX; uint64_t used = ULLONG_MAX; - struct osd_request *or; uint8_t cred_a[OSD_CAP_LEN]; int ret; - /* get used/capacity attributes */ - exofs_make_credential(cred_a, &obj); - - or = osd_start_request(sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n"); - return -ENOMEM; + ret = exofs_get_io_state(sbi, &ios); + if (ret) { + EXOFS_DBGMSG("exofs_get_io_state failed.\n"); + return ret; } - osd_req_get_attributes(or, &obj); - osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs)); - ret = exofs_sync_op(or, sbi->s_timeout, cred_a); + exofs_make_credential(cred_a, &ios->obj); + ios->cred = sbi->s_cred; + ios->in_attr = attrs; + ios->in_attr_len = ARRAY_SIZE(attrs); + + ret = exofs_sbi_read(ios); if (unlikely(ret)) goto out; - ret = extract_attr_from_req(or, &attrs[0]); - if (likely(!ret)) + ret = extract_attr_from_ios(ios, &attrs[0]); + if (likely(!ret)) { capacity = get_unaligned_be64(attrs[0].val_ptr); - else + if (unlikely(!capacity)) + capacity = ULLONG_MAX; + } else EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n"); - ret = extract_attr_from_req(or, &attrs[1]); + ret = extract_attr_from_ios(ios, &attrs[1]); if (likely(!ret)) used = get_unaligned_be64(attrs[1].val_ptr); else @@ -476,15 +659,15 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) /* fill in the stats buffer */ buf->f_type = EXOFS_SUPER_MAGIC; buf->f_bsize = EXOFS_BLKSIZE; - buf->f_blocks = (capacity >> EXOFS_BLKSHIFT); - buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT); + buf->f_blocks = capacity >> 9; + buf->f_bfree = (capacity - used) >> 9; buf->f_bavail = buf->f_bfree; buf->f_files = sbi->s_numfiles; buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles; buf->f_namelen = EXOFS_NAME_LEN; out: - osd_end_request(or); + exofs_put_io_state(ios); return ret; } diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 6cde970b0a1..fc2bd05d355 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -353,8 +353,8 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) * ext2_find_entry() * * finds an entry in the specified directory with the wanted name. It - * returns the page in which the entry was found, and the entry itself - * (as a parameter - res_dir). Page is returned mapped and unlocked. + * returns the page in which the entry was found (as a parameter - res_page), + * and the entry itself. Page is returned mapped and unlocked. * Entry is guaranteed to be valid. */ struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir, diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 9a8a8e27a06..da318b0fa63 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -142,7 +142,7 @@ struct dentry *ext2_get_parent(struct dentry *child); /* super.c */ extern void ext2_error (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ext2_warning (struct super_block *, const char *, const char *, ...) +extern void ext2_msg(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); extern void ext2_update_dynamic_rev (struct super_block *sb); extern void ext2_write_super (struct super_block *); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index ade634076d0..71b032c65a0 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -137,7 +137,8 @@ static int ext2_block_to_path(struct inode *inode, int final = 0; if (i_block < 0) { - ext2_warning (inode->i_sb, "ext2_block_to_path", "block < 0"); + ext2_msg(inode->i_sb, KERN_WARNING, + "warning: %s: block < 0", __func__); } else if (i_block < direct_blocks) { offsets[n++] = i_block; final = direct_blocks; @@ -157,7 +158,8 @@ static int ext2_block_to_path(struct inode *inode, offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else { - ext2_warning (inode->i_sb, "ext2_block_to_path", "block > big"); + ext2_msg(inode->i_sb, KERN_WARNING, + "warning: %s: block is too big", __func__); } if (boundary) *boundary = final - 1 - (i_block & (ptrs - 1)); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 1a9ffee47d5..1388802b780 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -58,27 +58,27 @@ void ext2_error (struct super_block * sb, const char * function, } va_start(args, fmt); - printk(KERN_CRIT "EXT2-fs error (device %s): %s: ",sb->s_id, function); + printk(KERN_CRIT "EXT2-fs (%s): error: %s: ", sb->s_id, function); vprintk(fmt, args); printk("\n"); va_end(args); if (test_opt(sb, ERRORS_PANIC)) - panic("EXT2-fs panic from previous error\n"); + panic("EXT2-fs: panic from previous error\n"); if (test_opt(sb, ERRORS_RO)) { - printk("Remounting filesystem read-only\n"); + ext2_msg(sb, KERN_CRIT, + "error: remounting filesystem read-only"); sb->s_flags |= MS_RDONLY; } } -void ext2_warning (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext2_msg(struct super_block *sb, const char *prefix, + const char *fmt, ...) { va_list args; va_start(args, fmt); - printk(KERN_WARNING "EXT2-fs warning (device %s): %s: ", - sb->s_id, function); + printk("%sEXT2-fs (%s): ", prefix, sb->s_id); vprintk(fmt, args); printk("\n"); va_end(args); @@ -91,9 +91,9 @@ void ext2_update_dynamic_rev(struct super_block *sb) if (le32_to_cpu(es->s_rev_level) > EXT2_GOOD_OLD_REV) return; - ext2_warning(sb, __func__, - "updating to rev %d because of new feature flag, " - "running e2fsck is recommended", + ext2_msg(sb, KERN_WARNING, + "warning: updating to rev %d because of " + "new feature flag, running e2fsck is recommended", EXT2_DYNAMIC_REV); es->s_first_ino = cpu_to_le32(EXT2_GOOD_OLD_FIRST_INO); @@ -419,10 +419,10 @@ static const match_table_t tokens = { {Opt_err, NULL} }; -static int parse_options (char * options, - struct ext2_sb_info *sbi) +static int parse_options(char *options, struct super_block *sb) { - char * p; + char *p; + struct ext2_sb_info *sbi = EXT2_SB(sb); substring_t args[MAX_OPT_ARGS]; int option; @@ -505,7 +505,8 @@ static int parse_options (char * options, #else case Opt_user_xattr: case Opt_nouser_xattr: - printk("EXT2 (no)user_xattr options not supported\n"); + ext2_msg(sb, KERN_INFO, "(no)user_xattr options" + "not supported"); break; #endif #ifdef CONFIG_EXT2_FS_POSIX_ACL @@ -518,14 +519,15 @@ static int parse_options (char * options, #else case Opt_acl: case Opt_noacl: - printk("EXT2 (no)acl options not supported\n"); + ext2_msg(sb, KERN_INFO, + "(no)acl options not supported"); break; #endif case Opt_xip: #ifdef CONFIG_EXT2_FS_XIP set_opt (sbi->s_mount_opt, XIP); #else - printk("EXT2 xip option not supported\n"); + ext2_msg(sb, KERN_INFO, "xip option not supported"); #endif break; @@ -542,19 +544,18 @@ static int parse_options (char * options, case Opt_quota: case Opt_usrquota: case Opt_grpquota: - printk(KERN_ERR - "EXT2-fs: quota operations not supported.\n"); - + ext2_msg(sb, KERN_INFO, + "quota operations not supported"); break; #endif case Opt_reservation: set_opt(sbi->s_mount_opt, RESERVATION); - printk("reservations ON\n"); + ext2_msg(sb, KERN_INFO, "reservations ON"); break; case Opt_noreservation: clear_opt(sbi->s_mount_opt, RESERVATION); - printk("reservations OFF\n"); + ext2_msg(sb, KERN_INFO, "reservations OFF"); break; case Opt_ignore: break; @@ -573,34 +574,40 @@ static int ext2_setup_super (struct super_block * sb, struct ext2_sb_info *sbi = EXT2_SB(sb); if (le32_to_cpu(es->s_rev_level) > EXT2_MAX_SUPP_REV) { - printk ("EXT2-fs warning: revision level too high, " - "forcing read-only mode\n"); + ext2_msg(sb, KERN_ERR, + "error: revision level too high, " + "forcing read-only mode"); res = MS_RDONLY; } if (read_only) return res; if (!(sbi->s_mount_state & EXT2_VALID_FS)) - printk ("EXT2-fs warning: mounting unchecked fs, " - "running e2fsck is recommended\n"); + ext2_msg(sb, KERN_WARNING, + "warning: mounting unchecked fs, " + "running e2fsck is recommended"); else if ((sbi->s_mount_state & EXT2_ERROR_FS)) - printk ("EXT2-fs warning: mounting fs with errors, " - "running e2fsck is recommended\n"); + ext2_msg(sb, KERN_WARNING, + "warning: mounting fs with errors, " + "running e2fsck is recommended"); else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) - printk ("EXT2-fs warning: maximal mount count reached, " - "running e2fsck is recommended\n"); + ext2_msg(sb, KERN_WARNING, + "warning: maximal mount count reached, " + "running e2fsck is recommended"); else if (le32_to_cpu(es->s_checkinterval) && - (le32_to_cpu(es->s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= get_seconds())) - printk ("EXT2-fs warning: checktime reached, " - "running e2fsck is recommended\n"); + (le32_to_cpu(es->s_lastcheck) + + le32_to_cpu(es->s_checkinterval) <= get_seconds())) + ext2_msg(sb, KERN_WARNING, + "warning: checktime reached, " + "running e2fsck is recommended"); if (!le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext2_write_super(sb); if (test_opt (sb, DEBUG)) - printk ("[EXT II FS %s, %s, bs=%lu, fs=%lu, gc=%lu, " - "bpg=%lu, ipg=%lu, mo=%04lx]\n", + ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, " + "bpg=%lu, ipg=%lu, mo=%04lx]", EXT2FS_VERSION, EXT2FS_DATE, sb->s_blocksize, sbi->s_frag_size, sbi->s_groups_count, @@ -767,7 +774,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) */ blocksize = sb_min_blocksize(sb, BLOCK_SIZE); if (!blocksize) { - printk ("EXT2-fs: unable to set blocksize\n"); + ext2_msg(sb, KERN_ERR, "error: unable to set blocksize"); goto failed_sbi; } @@ -783,7 +790,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } if (!(bh = sb_bread(sb, logic_sb_block))) { - printk ("EXT2-fs: unable to read superblock\n"); + ext2_msg(sb, KERN_ERR, "error: unable to read superblock"); goto failed_sbi; } /* @@ -826,7 +833,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, RESERVATION); - if (!parse_options ((char *) data, sbi)) + if (!parse_options((char *) data, sb)) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -840,8 +847,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) || EXT2_HAS_INCOMPAT_FEATURE(sb, ~0U))) - printk("EXT2-fs warning: feature flags set on rev 0 fs, " - "running e2fsck is recommended\n"); + ext2_msg(sb, KERN_WARNING, + "warning: feature flags set on rev 0 fs, " + "running e2fsck is recommended"); /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, @@ -849,16 +857,16 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) */ features = EXT2_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP); if (features) { - printk("EXT2-fs: %s: couldn't mount because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(features)); + ext2_msg(sb, KERN_ERR, "error: couldn't mount because of " + "unsupported optional features (%x)", + le32_to_cpu(features)); goto failed_mount; } if (!(sb->s_flags & MS_RDONLY) && (features = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))){ - printk("EXT2-fs: %s: couldn't mount RDWR because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(features)); + ext2_msg(sb, KERN_ERR, "error: couldn't mount RDWR because of " + "unsupported optional features (%x)", + le32_to_cpu(features)); goto failed_mount; } @@ -866,7 +874,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) { if (!silent) - printk("XIP: Unsupported blocksize\n"); + ext2_msg(sb, KERN_ERR, + "error: unsupported blocksize for xip"); goto failed_mount; } @@ -875,7 +884,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); if (!sb_set_blocksize(sb, blocksize)) { - printk(KERN_ERR "EXT2-fs: blocksize too small for device.\n"); + ext2_msg(sb, KERN_ERR, "error: blocksize is too small"); goto failed_sbi; } @@ -883,14 +892,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) offset = (sb_block*BLOCK_SIZE) % blocksize; bh = sb_bread(sb, logic_sb_block); if(!bh) { - printk("EXT2-fs: Couldn't read superblock on " - "2nd try.\n"); + ext2_msg(sb, KERN_ERR, "error: couldn't read" + "superblock on 2nd try"); goto failed_sbi; } es = (struct ext2_super_block *) (((char *)bh->b_data) + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT2_SUPER_MAGIC)) { - printk ("EXT2-fs: Magic mismatch, very weird !\n"); + ext2_msg(sb, KERN_ERR, "error: magic mismatch"); goto failed_mount; } } @@ -906,7 +915,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if ((sbi->s_inode_size < EXT2_GOOD_OLD_INODE_SIZE) || !is_power_of_2(sbi->s_inode_size) || (sbi->s_inode_size > blocksize)) { - printk ("EXT2-fs: unsupported inode size: %d\n", + ext2_msg(sb, KERN_ERR, + "error: unsupported inode size: %d", sbi->s_inode_size); goto failed_mount; } @@ -943,29 +953,33 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_blocksize != bh->b_size) { if (!silent) - printk ("VFS: Unsupported blocksize on dev " - "%s.\n", sb->s_id); + ext2_msg(sb, KERN_ERR, "error: unsupported blocksize"); goto failed_mount; } if (sb->s_blocksize != sbi->s_frag_size) { - printk ("EXT2-fs: fragsize %lu != blocksize %lu (not supported yet)\n", + ext2_msg(sb, KERN_ERR, + "error: fragsize %lu != blocksize %lu" + "(not supported yet)", sbi->s_frag_size, sb->s_blocksize); goto failed_mount; } if (sbi->s_blocks_per_group > sb->s_blocksize * 8) { - printk ("EXT2-fs: #blocks per group too big: %lu\n", + ext2_msg(sb, KERN_ERR, + "error: #blocks per group too big: %lu", sbi->s_blocks_per_group); goto failed_mount; } if (sbi->s_frags_per_group > sb->s_blocksize * 8) { - printk ("EXT2-fs: #fragments per group too big: %lu\n", + ext2_msg(sb, KERN_ERR, + "error: #fragments per group too big: %lu", sbi->s_frags_per_group); goto failed_mount; } if (sbi->s_inodes_per_group > sb->s_blocksize * 8) { - printk ("EXT2-fs: #inodes per group too big: %lu\n", + ext2_msg(sb, KERN_ERR, + "error: #inodes per group too big: %lu", sbi->s_inodes_per_group); goto failed_mount; } @@ -979,13 +993,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) EXT2_DESC_PER_BLOCK(sb); sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { - printk ("EXT2-fs: not enough memory\n"); + ext2_msg(sb, KERN_ERR, "error: not enough memory"); goto failed_mount; } bgl_lock_init(sbi->s_blockgroup_lock); sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); if (!sbi->s_debts) { - printk ("EXT2-fs: not enough memory\n"); + ext2_msg(sb, KERN_ERR, "error: not enough memory"); goto failed_mount_group_desc; } for (i = 0; i < db_count; i++) { @@ -994,12 +1008,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (!sbi->s_group_desc[i]) { for (j = 0; j < i; j++) brelse (sbi->s_group_desc[j]); - printk ("EXT2-fs: unable to read group descriptors\n"); + ext2_msg(sb, KERN_ERR, + "error: unable to read group descriptors"); goto failed_mount_group_desc; } } if (!ext2_check_descriptors (sb)) { - printk ("EXT2-fs: group descriptors corrupted!\n"); + ext2_msg(sb, KERN_ERR, "group descriptors corrupted"); goto failed_mount2; } sbi->s_gdb_count = db_count; @@ -1032,7 +1047,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_count_dirs(sb)); } if (err) { - printk(KERN_ERR "EXT2-fs: insufficient memory\n"); + ext2_msg(sb, KERN_ERR, "error: insufficient memory"); goto failed_mount3; } /* @@ -1048,27 +1063,28 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { iput(root); - printk(KERN_ERR "EXT2-fs: corrupt root inode, run e2fsck\n"); + ext2_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); goto failed_mount3; } sb->s_root = d_alloc_root(root); if (!sb->s_root) { iput(root); - printk(KERN_ERR "EXT2-fs: get root inode failed\n"); + ext2_msg(sb, KERN_ERR, "error: get root inode failed"); ret = -ENOMEM; goto failed_mount3; } if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) - ext2_warning(sb, __func__, - "mounting ext3 filesystem as ext2"); + ext2_msg(sb, KERN_WARNING, + "warning: mounting ext3 filesystem as ext2"); ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY); return 0; cantfind_ext2: if (!silent) - printk("VFS: Can't find an ext2 filesystem on dev %s.\n", - sb->s_id); + ext2_msg(sb, KERN_ERR, + "error: can't find an ext2 filesystem on dev %s.", + sb->s_id); goto failed_mount; failed_mount3: percpu_counter_destroy(&sbi->s_freeblocks_counter); @@ -1121,8 +1137,24 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es) static int ext2_sync_fs(struct super_block *sb, int wait) { struct ext2_super_block *es = EXT2_SB(sb)->s_es; + struct buffer_head *sbh = EXT2_SB(sb)->s_sbh; lock_kernel(); + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + ext2_msg(sb, KERN_ERR, + "previous I/O error to superblock detected\n"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { ext2_debug("setting valid to 0\n"); es->s_state &= cpu_to_le16(~EXT2_VALID_FS); @@ -1170,7 +1202,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) /* * Allow the "check" option to be passed as a remount option. */ - if (!parse_options (data, sbi)) { + if (!parse_options(data, sb)) { err = -EINVAL; goto restore_opts; } @@ -1182,7 +1214,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) EXT2_MOUNT_XIP if not */ if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) { - printk("XIP: Unsupported blocksize\n"); + ext2_msg(sb, KERN_WARNING, + "warning: unsupported blocksize for xip"); err = -EINVAL; goto restore_opts; } @@ -1191,8 +1224,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != (old_mount_opt & EXT2_MOUNT_XIP)) && invalidate_inodes(sb)) { - ext2_warning(sb, __func__, "refusing change of xip flag " - "with busy inodes while remounting"); + ext2_msg(sb, KERN_WARNING, "warning: refusing change of " + "xip flag with busy inodes while remounting"); sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; } @@ -1216,9 +1249,10 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP); if (ret) { - printk("EXT2-fs: %s: couldn't remount RDWR because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(ret)); + ext2_msg(sb, KERN_WARNING, + "warning: couldn't remount RDWR because of " + "unsupported optional features (%x).", + le32_to_cpu(ret)); err = -EROFS; goto restore_opts; } diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c index c18fbf3e406..322a56b2dfb 100644 --- a/fs/ext2/xip.c +++ b/fs/ext2/xip.c @@ -69,8 +69,9 @@ void ext2_xip_verify_sb(struct super_block *sb) if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) && !sb->s_bdev->bd_disk->fops->direct_access) { sbi->s_mount_opt &= (~EXT2_MOUNT_XIP); - ext2_warning(sb, __func__, - "ignoring xip option - not supported by bdev"); + ext2_msg(sb, KERN_WARNING, + "warning: ignoring xip option - " + "not supported by bdev"); } } diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2db95777890..ad14227f509 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1151,6 +1151,16 @@ static int do_journal_get_write_access(handle_t *handle, return ext3_journal_get_write_access(handle, bh); } +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static void ext3_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext3_truncate(inode); +} + static int ext3_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1209,7 +1219,7 @@ write_begin_failed: unlock_page(page); page_cache_release(page); if (pos + len > inode->i_size) - ext3_truncate(inode); + ext3_truncate_failed_write(inode); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -1304,7 +1314,7 @@ static int ext3_ordered_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - ext3_truncate(inode); + ext3_truncate_failed_write(inode); return ret ? ret : copied; } @@ -1330,7 +1340,7 @@ static int ext3_writeback_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - ext3_truncate(inode); + ext3_truncate_failed_write(inode); return ret ? ret : copied; } @@ -1383,7 +1393,7 @@ static int ext3_journalled_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - ext3_truncate(inode); + ext3_truncate_failed_write(inode); return ret ? ret : copied; } diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 8359e7b3dc8..5f83b617917 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -266,7 +266,7 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_bh; if (IS_ERR(gdb = bclean(handle, sb, block))) { - err = PTR_ERR(bh); + err = PTR_ERR(gdb); goto exit_bh; } ext3_journal_dirty_metadata(handle, gdb); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 427496c4767..7ad1e8c30bd 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -135,12 +135,24 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn, if (is_handle_aborted(handle)) return; - printk(KERN_ERR "%s: aborting transaction: %s in %s\n", - caller, errstr, err_fn); + printk(KERN_ERR "EXT3-fs: %s: aborting transaction: %s in %s\n", + caller, errstr, err_fn); journal_abort_handle(handle); } +void ext3_msg(struct super_block *sb, const char *prefix, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + printk("%sEXT3-fs (%s): ", prefix, sb->s_id); + vprintk(fmt, args); + printk("\n"); + va_end(args); +} + /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * @@ -174,12 +186,13 @@ static void ext3_handle_error(struct super_block *sb) journal_abort(journal, -EIO); } if (test_opt (sb, ERRORS_RO)) { - printk (KERN_CRIT "Remounting filesystem read-only\n"); + ext3_msg(sb, KERN_CRIT, + "error: remounting filesystem read-only"); sb->s_flags |= MS_RDONLY; } ext3_commit_super(sb, es, 1); if (test_opt(sb, ERRORS_PANIC)) - panic("EXT3-fs (device %s): panic forced after error\n", + panic("EXT3-fs (%s): panic forced after error\n", sb->s_id); } @@ -247,8 +260,7 @@ void __ext3_std_error (struct super_block * sb, const char * function, return; errstr = ext3_decode_error(sb, errno, nbuf); - printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n", - sb->s_id, function, errstr); + ext3_msg(sb, KERN_CRIT, "error in %s: %s", function, errstr); ext3_handle_error(sb); } @@ -268,21 +280,20 @@ void ext3_abort (struct super_block * sb, const char * function, { va_list args; - printk (KERN_CRIT "ext3_abort called.\n"); - va_start(args, fmt); - printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function); + printk(KERN_CRIT "EXT3-fs (%s): error: %s: ", sb->s_id, function); vprintk(fmt, args); printk("\n"); va_end(args); if (test_opt(sb, ERRORS_PANIC)) - panic("EXT3-fs panic from previous error\n"); + panic("EXT3-fs: panic from previous error\n"); if (sb->s_flags & MS_RDONLY) return; - printk(KERN_CRIT "Remounting filesystem read-only\n"); + ext3_msg(sb, KERN_CRIT, + "error: remounting filesystem read-only"); EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; sb->s_flags |= MS_RDONLY; EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; @@ -296,7 +307,7 @@ void ext3_warning (struct super_block * sb, const char * function, va_list args; va_start(args, fmt); - printk(KERN_WARNING "EXT3-fs warning (device %s): %s: ", + printk(KERN_WARNING "EXT3-fs (%s): warning: %s: ", sb->s_id, function); vprintk(fmt, args); printk("\n"); @@ -310,10 +321,10 @@ void ext3_update_dynamic_rev(struct super_block *sb) if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV) return; - ext3_warning(sb, __func__, - "updating to rev %d because of new feature flag, " - "running e2fsck is recommended", - EXT3_DYNAMIC_REV); + ext3_msg(sb, KERN_WARNING, + "warning: updating to rev %d because of " + "new feature flag, running e2fsck is recommended", + EXT3_DYNAMIC_REV); es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO); es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE); @@ -331,7 +342,7 @@ void ext3_update_dynamic_rev(struct super_block *sb) /* * Open the external journal device */ -static struct block_device *ext3_blkdev_get(dev_t dev) +static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) { struct block_device *bdev; char b[BDEVNAME_SIZE]; @@ -342,8 +353,9 @@ static struct block_device *ext3_blkdev_get(dev_t dev) return bdev; fail: - printk(KERN_ERR "EXT3: failed to open journal device %s: %ld\n", - __bdevname(dev, b), PTR_ERR(bdev)); + ext3_msg(sb, "error: failed to open journal device %s: %ld", + __bdevname(dev, b), PTR_ERR(bdev)); + return NULL; } @@ -378,13 +390,13 @@ static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi) { struct list_head *l; - printk(KERN_ERR "sb orphan head is %d\n", + ext3_msg(sb, KERN_ERR, "error: sb orphan head is %d", le32_to_cpu(sbi->s_es->s_last_orphan)); - printk(KERN_ERR "sb_info orphan list:\n"); + ext3_msg(sb, KERN_ERR, "sb_info orphan list:"); list_for_each(l, &sbi->s_orphan) { struct inode *inode = orphan_list_entry(l); - printk(KERN_ERR " " + ext3_msg(sb, KERN_ERR, " " "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", inode->i_sb->s_id, inode->i_ino, inode, inode->i_mode, inode->i_nlink, @@ -527,9 +539,22 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl #if defined(CONFIG_QUOTA) struct ext3_sb_info *sbi = EXT3_SB(sb); - if (sbi->s_jquota_fmt) - seq_printf(seq, ",jqfmt=%s", - (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0"); + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } if (sbi->s_qf_names[USRQUOTA]) seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); @@ -636,6 +661,9 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, DATA_ERR_ABORT)) seq_puts(seq, ",data_err=abort"); + if (test_opt(sb, NOLOAD)) + seq_puts(seq, ",norecovery"); + ext3_show_quota_options(seq, sb); return 0; @@ -787,9 +815,9 @@ enum { Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_grpquota + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, + Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + Opt_usrquota, Opt_grpquota }; static const match_table_t tokens = { @@ -818,6 +846,7 @@ static const match_table_t tokens = { {Opt_reservation, "reservation"}, {Opt_noreservation, "noreservation"}, {Opt_noload, "noload"}, + {Opt_noload, "norecovery"}, {Opt_nobh, "nobh"}, {Opt_bh, "bh"}, {Opt_commit, "commit=%u"}, @@ -836,6 +865,7 @@ static const match_table_t tokens = { {Opt_grpjquota, "grpjquota=%s"}, {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_grpquota, "grpquota"}, {Opt_noquota, "noquota"}, {Opt_quota, "quota"}, @@ -845,7 +875,7 @@ static const match_table_t tokens = { {Opt_err, NULL}, }; -static ext3_fsblk_t get_sb_block(void **data) +static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb) { ext3_fsblk_t sb_block; char *options = (char *) *data; @@ -856,7 +886,7 @@ static ext3_fsblk_t get_sb_block(void **data) /*todo: use simple_strtoll with >32bit ext3 */ sb_block = simple_strtoul(options, &options, 0); if (*options && *options != ',') { - printk("EXT3-fs: Invalid sb specification: %s\n", + ext3_msg(sb, "error: invalid sb specification: %s", (char *) *data); return 1; } @@ -956,7 +986,8 @@ static int parse_options (char *options, struct super_block *sb, #else case Opt_user_xattr: case Opt_nouser_xattr: - printk("EXT3 (no)user_xattr options not supported\n"); + ext3_msg(sb, KERN_INFO, + "(no)user_xattr options not supported"); break; #endif #ifdef CONFIG_EXT3_FS_POSIX_ACL @@ -969,7 +1000,8 @@ static int parse_options (char *options, struct super_block *sb, #else case Opt_acl: case Opt_noacl: - printk("EXT3 (no)acl options not supported\n"); + ext3_msg(sb, KERN_INFO, + "(no)acl options not supported"); break; #endif case Opt_reservation: @@ -985,16 +1017,16 @@ static int parse_options (char *options, struct super_block *sb, user to specify an existing inode to be the journal file. */ if (is_remount) { - printk(KERN_ERR "EXT3-fs: cannot specify " - "journal on remount\n"); + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); return 0; } set_opt (sbi->s_mount_opt, UPDATE_JOURNAL); break; case Opt_journal_inum: if (is_remount) { - printk(KERN_ERR "EXT3-fs: cannot specify " - "journal on remount\n"); + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); return 0; } if (match_int(&args[0], &option)) @@ -1003,8 +1035,8 @@ static int parse_options (char *options, struct super_block *sb, break; case Opt_journal_dev: if (is_remount) { - printk(KERN_ERR "EXT3-fs: cannot specify " - "journal on remount\n"); + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); return 0; } if (match_int(&args[0], &option)) @@ -1036,12 +1068,11 @@ static int parse_options (char *options, struct super_block *sb, if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) == data_opt) break; - printk(KERN_ERR - "EXT3-fs (device %s): Cannot change " + ext3_msg(sb, KERN_ERR, + "error: cannot change " "data mode on remount. The filesystem " "is mounted in data=%s mode and you " - "try to remount it in data=%s mode.\n", - sb->s_id, + "try to remount it in data=%s mode.", data_mode_string(sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS), data_mode_string(data_opt)); @@ -1066,31 +1097,31 @@ static int parse_options (char *options, struct super_block *sb, set_qf_name: if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { - printk(KERN_ERR - "EXT3-fs: Cannot change journaled " - "quota options when quota turned on.\n"); + ext3_msg(sb, KERN_ERR, + "error: cannot change journaled " + "quota options when quota turned on."); return 0; } qname = match_strdup(&args[0]); if (!qname) { - printk(KERN_ERR - "EXT3-fs: not enough memory for " - "storing quotafile name.\n"); + ext3_msg(sb, KERN_ERR, + "error: not enough memory for " + "storing quotafile name."); return 0; } if (sbi->s_qf_names[qtype] && strcmp(sbi->s_qf_names[qtype], qname)) { - printk(KERN_ERR - "EXT3-fs: %s quota file already " - "specified.\n", QTYPE2NAME(qtype)); + ext3_msg(sb, KERN_ERR, + "error: %s quota file already " + "specified.", QTYPE2NAME(qtype)); kfree(qname); return 0; } sbi->s_qf_names[qtype] = qname; if (strchr(sbi->s_qf_names[qtype], '/')) { - printk(KERN_ERR - "EXT3-fs: quotafile must be on " - "filesystem root.\n"); + ext3_msg(sb, KERN_ERR, + "error: quotafile must be on " + "filesystem root."); kfree(sbi->s_qf_names[qtype]); sbi->s_qf_names[qtype] = NULL; return 0; @@ -1105,9 +1136,9 @@ set_qf_name: clear_qf_name: if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { - printk(KERN_ERR "EXT3-fs: Cannot change " + ext3_msg(sb, KERN_ERR, "error: cannot change " "journaled quota options when " - "quota turned on.\n"); + "quota turned on."); return 0; } /* @@ -1121,12 +1152,15 @@ clear_qf_name: goto set_qf_format; case Opt_jqfmt_vfsv0: qfmt = QFMT_VFS_V0; + goto set_qf_format; + case Opt_jqfmt_vfsv1: + qfmt = QFMT_VFS_V1; set_qf_format: if (sb_any_quota_loaded(sb) && sbi->s_jquota_fmt != qfmt) { - printk(KERN_ERR "EXT3-fs: Cannot change " + ext3_msg(sb, KERN_ERR, "error: cannot change " "journaled quota options when " - "quota turned on.\n"); + "quota turned on."); return 0; } sbi->s_jquota_fmt = qfmt; @@ -1142,8 +1176,8 @@ set_qf_format: break; case Opt_noquota: if (sb_any_quota_loaded(sb)) { - printk(KERN_ERR "EXT3-fs: Cannot change quota " - "options when quota turned on.\n"); + ext3_msg(sb, KERN_ERR, "error: cannot change " + "quota options when quota turned on."); return 0; } clear_opt(sbi->s_mount_opt, QUOTA); @@ -1154,8 +1188,8 @@ set_qf_format: case Opt_quota: case Opt_usrquota: case Opt_grpquota: - printk(KERN_ERR - "EXT3-fs: quota options not supported.\n"); + ext3_msg(sb, KERN_ERR, + "error: quota options not supported."); break; case Opt_usrjquota: case Opt_grpjquota: @@ -1163,9 +1197,10 @@ set_qf_format: case Opt_offgrpjquota: case Opt_jqfmt_vfsold: case Opt_jqfmt_vfsv0: - printk(KERN_ERR - "EXT3-fs: journaled quota options not " - "supported.\n"); + case Opt_jqfmt_vfsv1: + ext3_msg(sb, KERN_ERR, + "error: journaled quota options not " + "supported."); break; case Opt_noquota: break; @@ -1185,8 +1220,9 @@ set_qf_format: break; case Opt_resize: if (!is_remount) { - printk("EXT3-fs: resize option only available " - "for remount\n"); + ext3_msg(sb, KERN_ERR, + "error: resize option only available " + "for remount"); return 0; } if (match_int(&args[0], &option) != 0) @@ -1200,9 +1236,9 @@ set_qf_format: clear_opt(sbi->s_mount_opt, NOBH); break; default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " - "or missing value\n", p); + ext3_msg(sb, KERN_ERR, + "error: unrecognized mount option \"%s\" " + "or missing value", p); return 0; } } @@ -1220,21 +1256,21 @@ set_qf_format: (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) || (sbi->s_qf_names[GRPQUOTA] && (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) { - printk(KERN_ERR "EXT3-fs: old and new quota " - "format mixing.\n"); + ext3_msg(sb, KERN_ERR, "error: old and new quota " + "format mixing."); return 0; } if (!sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT3-fs: journaled quota format " - "not specified.\n"); + ext3_msg(sb, KERN_ERR, "error: journaled quota format " + "not specified."); return 0; } } else { if (sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT3-fs: journaled quota format " + ext3_msg(sb, KERN_ERR, "error: journaled quota format " "specified with no journaling " - "enabled.\n"); + "enabled."); return 0; } } @@ -1249,31 +1285,33 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, int res = 0; if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) { - printk (KERN_ERR "EXT3-fs warning: revision level too high, " - "forcing read-only mode\n"); + ext3_msg(sb, KERN_ERR, + "error: revision level too high, " + "forcing read-only mode"); res = MS_RDONLY; } if (read_only) return res; if (!(sbi->s_mount_state & EXT3_VALID_FS)) - printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, " - "running e2fsck is recommended\n"); + ext3_msg(sb, KERN_WARNING, + "warning: mounting unchecked fs, " + "running e2fsck is recommended"); else if ((sbi->s_mount_state & EXT3_ERROR_FS)) - printk (KERN_WARNING - "EXT3-fs warning: mounting fs with errors, " - "running e2fsck is recommended\n"); + ext3_msg(sb, KERN_WARNING, + "warning: mounting fs with errors, " + "running e2fsck is recommended"); else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) - printk (KERN_WARNING - "EXT3-fs warning: maximal mount count reached, " - "running e2fsck is recommended\n"); + ext3_msg(sb, KERN_WARNING, + "warning: maximal mount count reached, " + "running e2fsck is recommended"); else if (le32_to_cpu(es->s_checkinterval) && (le32_to_cpu(es->s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= get_seconds())) - printk (KERN_WARNING - "EXT3-fs warning: checktime reached, " - "running e2fsck is recommended\n"); + ext3_msg(sb, KERN_WARNING, + "warning: checktime reached, " + "running e2fsck is recommended"); #if 0 /* @@@ We _will_ want to clear the valid bit if we find inconsistencies, to force a fsck at reboot. But for @@ -1290,22 +1328,20 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, ext3_commit_super(sb, es, 1); if (test_opt(sb, DEBUG)) - printk(KERN_INFO "[EXT3 FS bs=%lu, gc=%lu, " - "bpg=%lu, ipg=%lu, mo=%04lx]\n", + ext3_msg(sb, KERN_INFO, "[bs=%lu, gc=%lu, " + "bpg=%lu, ipg=%lu, mo=%04lx]", sb->s_blocksize, sbi->s_groups_count, EXT3_BLOCKS_PER_GROUP(sb), EXT3_INODES_PER_GROUP(sb), sbi->s_mount_opt); - printk(KERN_INFO "EXT3 FS on %s, ", sb->s_id); if (EXT3_SB(sb)->s_journal->j_inode == NULL) { char b[BDEVNAME_SIZE]; - - printk("external journal on %s\n", + ext3_msg(sb, KERN_INFO, "using external journal on %s", bdevname(EXT3_SB(sb)->s_journal->j_dev, b)); } else { - printk("internal journal\n"); + ext3_msg(sb, KERN_INFO, "using internal journal"); } return res; } @@ -1399,8 +1435,8 @@ static void ext3_orphan_cleanup (struct super_block * sb, } if (bdev_read_only(sb->s_bdev)) { - printk(KERN_ERR "EXT3-fs: write access " - "unavailable, skipping orphan cleanup.\n"); + ext3_msg(sb, KERN_ERR, "error: write access " + "unavailable, skipping orphan cleanup."); return; } @@ -1414,8 +1450,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, } if (s_flags & MS_RDONLY) { - printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n", - sb->s_id); + ext3_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); sb->s_flags &= ~MS_RDONLY; } #ifdef CONFIG_QUOTA @@ -1426,9 +1461,9 @@ static void ext3_orphan_cleanup (struct super_block * sb, if (EXT3_SB(sb)->s_qf_names[i]) { int ret = ext3_quota_on_mount(sb, i); if (ret < 0) - printk(KERN_ERR - "EXT3-fs: Cannot turn on journaled " - "quota: error %d\n", ret); + ext3_msg(sb, KERN_ERR, + "error: cannot turn on journaled " + "quota: %d", ret); } } #endif @@ -1466,11 +1501,11 @@ static void ext3_orphan_cleanup (struct super_block * sb, #define PLURAL(x) (x), ((x)==1) ? "" : "s" if (nr_orphans) - printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n", - sb->s_id, PLURAL(nr_orphans)); + ext3_msg(sb, KERN_INFO, "%d orphan inode%s deleted", + PLURAL(nr_orphans)); if (nr_truncates) - printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n", - sb->s_id, PLURAL(nr_truncates)); + ext3_msg(sb, KERN_INFO, "%d truncate%s cleaned up", + PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA /* Turn quotas off */ for (i = 0; i < MAXQUOTAS; i++) { @@ -1554,7 +1589,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) struct ext3_super_block *es = NULL; struct ext3_sb_info *sbi; ext3_fsblk_t block; - ext3_fsblk_t sb_block = get_sb_block(&data); + ext3_fsblk_t sb_block = get_sb_block(&data, sb); ext3_fsblk_t logic_sb_block; unsigned long offset = 0; unsigned int journal_inum = 0; @@ -1590,7 +1625,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); if (!blocksize) { - printk(KERN_ERR "EXT3-fs: unable to set blocksize\n"); + ext3_msg(sb, KERN_ERR, "error: unable to set blocksize"); goto out_fail; } @@ -1606,7 +1641,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) } if (!(bh = sb_bread(sb, logic_sb_block))) { - printk (KERN_ERR "EXT3-fs: unable to read superblock\n"); + ext3_msg(sb, KERN_ERR, "error: unable to read superblock"); goto out_fail; } /* @@ -1665,9 +1700,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) || EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U))) - printk(KERN_WARNING - "EXT3-fs warning: feature flags set on rev 0 fs, " - "running e2fsck is recommended\n"); + ext3_msg(sb, KERN_WARNING, + "warning: feature flags set on rev 0 fs, " + "running e2fsck is recommended"); /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, @@ -1675,25 +1710,25 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) */ features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP); if (features) { - printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(features)); + ext3_msg(sb, KERN_ERR, + "error: couldn't mount because of unsupported " + "optional features (%x)", le32_to_cpu(features)); goto failed_mount; } features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP); if (!(sb->s_flags & MS_RDONLY) && features) { - printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(features)); + ext3_msg(sb, KERN_ERR, + "error: couldn't mount RDWR because of unsupported " + "optional features (%x)", le32_to_cpu(features)); goto failed_mount; } blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); if (blocksize < EXT3_MIN_BLOCK_SIZE || blocksize > EXT3_MAX_BLOCK_SIZE) { - printk(KERN_ERR - "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n", - blocksize, sb->s_id); + ext3_msg(sb, KERN_ERR, + "error: couldn't mount because of unsupported " + "filesystem blocksize %d", blocksize); goto failed_mount; } @@ -1704,30 +1739,31 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) * than the hardware sectorsize for the machine. */ if (blocksize < hblock) { - printk(KERN_ERR "EXT3-fs: blocksize %d too small for " - "device blocksize %d.\n", blocksize, hblock); + ext3_msg(sb, KERN_ERR, + "error: fsblocksize %d too small for " + "hardware sectorsize %d", blocksize, hblock); goto failed_mount; } brelse (bh); if (!sb_set_blocksize(sb, blocksize)) { - printk(KERN_ERR "EXT3-fs: bad blocksize %d.\n", - blocksize); + ext3_msg(sb, KERN_ERR, + "error: bad blocksize %d", blocksize); goto out_fail; } logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; bh = sb_bread(sb, logic_sb_block); if (!bh) { - printk(KERN_ERR - "EXT3-fs: Can't read superblock on 2nd try.\n"); + ext3_msg(sb, KERN_ERR, + "error: can't read superblock on 2nd try"); goto failed_mount; } es = (struct ext3_super_block *)(((char *)bh->b_data) + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) { - printk (KERN_ERR - "EXT3-fs: Magic mismatch, very weird !\n"); + ext3_msg(sb, KERN_ERR, + "error: magic mismatch"); goto failed_mount; } } @@ -1743,8 +1779,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > blocksize)) { - printk (KERN_ERR - "EXT3-fs: unsupported inode size: %d\n", + ext3_msg(sb, KERN_ERR, + "error: unsupported inode size: %d", sbi->s_inode_size); goto failed_mount; } @@ -1752,8 +1788,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << le32_to_cpu(es->s_log_frag_size); if (blocksize != sbi->s_frag_size) { - printk(KERN_ERR - "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n", + ext3_msg(sb, KERN_ERR, + "error: fragsize %lu != blocksize %u (unsupported)", sbi->s_frag_size, blocksize); goto failed_mount; } @@ -1789,31 +1825,31 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) } if (sbi->s_blocks_per_group > blocksize * 8) { - printk (KERN_ERR - "EXT3-fs: #blocks per group too big: %lu\n", + ext3_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", sbi->s_blocks_per_group); goto failed_mount; } if (sbi->s_frags_per_group > blocksize * 8) { - printk (KERN_ERR - "EXT3-fs: #fragments per group too big: %lu\n", + ext3_msg(sb, KERN_ERR, + "error: #fragments per group too big: %lu", sbi->s_frags_per_group); goto failed_mount; } if (sbi->s_inodes_per_group > blocksize * 8) { - printk (KERN_ERR - "EXT3-fs: #inodes per group too big: %lu\n", + ext3_msg(sb, KERN_ERR, + "error: #inodes per group too big: %lu", sbi->s_inodes_per_group); goto failed_mount; } if (le32_to_cpu(es->s_blocks_count) > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { - printk(KERN_ERR "EXT3-fs: filesystem on %s:" - " too large to mount safely\n", sb->s_id); + ext3_msg(sb, KERN_ERR, + "error: filesystem is too large to mount safely"); if (sizeof(sector_t) < 8) - printk(KERN_WARNING "EXT3-fs: CONFIG_LBDAF not " - "enabled\n"); + ext3_msg(sb, KERN_ERR, + "error: CONFIG_LBDAF not enabled"); goto failed_mount; } @@ -1827,7 +1863,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { - printk (KERN_ERR "EXT3-fs: not enough memory\n"); + ext3_msg(sb, KERN_ERR, + "error: not enough memory"); goto failed_mount; } @@ -1837,14 +1874,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) block = descriptor_loc(sb, logic_sb_block, i); sbi->s_group_desc[i] = sb_bread(sb, block); if (!sbi->s_group_desc[i]) { - printk (KERN_ERR "EXT3-fs: " - "can't read group descriptor %d\n", i); + ext3_msg(sb, KERN_ERR, + "error: can't read group descriptor %d", i); db_count = i; goto failed_mount2; } } if (!ext3_check_descriptors (sb)) { - printk(KERN_ERR "EXT3-fs: group descriptors corrupted!\n"); + ext3_msg(sb, KERN_ERR, + "error: group descriptors corrupted"); goto failed_mount2; } sbi->s_gdb_count = db_count; @@ -1862,7 +1900,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) ext3_count_dirs(sb)); } if (err) { - printk(KERN_ERR "EXT3-fs: insufficient memory\n"); + ext3_msg(sb, KERN_ERR, "error: insufficient memory"); goto failed_mount3; } @@ -1910,9 +1948,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount3; } else { if (!silent) - printk (KERN_ERR - "ext3: No journal on filesystem on %s\n", - sb->s_id); + ext3_msg(sb, KERN_ERR, + "error: no journal found. " + "mounting ext3 over ext2?"); goto failed_mount3; } @@ -1934,8 +1972,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) case EXT3_MOUNT_WRITEBACK_DATA: if (!journal_check_available_features (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { - printk(KERN_ERR "EXT3-fs: Journal does not support " - "requested data journaling mode\n"); + ext3_msg(sb, KERN_ERR, + "error: journal does not support " + "requested data journaling mode"); goto failed_mount4; } default: @@ -1944,8 +1983,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) if (test_opt(sb, NOBH)) { if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) { - printk(KERN_WARNING "EXT3-fs: Ignoring nobh option - " - "its supported only with writeback mode\n"); + ext3_msg(sb, KERN_WARNING, + "warning: ignoring nobh option - " + "it is supported only with writeback mode"); clear_opt(sbi->s_mount_opt, NOBH); } } @@ -1956,18 +1996,18 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) root = ext3_iget(sb, EXT3_ROOT_INO); if (IS_ERR(root)) { - printk(KERN_ERR "EXT3-fs: get root inode failed\n"); + ext3_msg(sb, KERN_ERR, "error: get root inode failed"); ret = PTR_ERR(root); goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { iput(root); - printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n"); + ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); goto failed_mount4; } sb->s_root = d_alloc_root(root); if (!sb->s_root) { - printk(KERN_ERR "EXT3-fs: get root dentry failed\n"); + ext3_msg(sb, KERN_ERR, "error: get root dentry failed"); iput(root); ret = -ENOMEM; goto failed_mount4; @@ -1986,9 +2026,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) ext3_orphan_cleanup(sb, es); EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; if (needs_recovery) - printk (KERN_INFO "EXT3-fs: recovery complete.\n"); + ext3_msg(sb, KERN_INFO, "recovery complete"); ext3_mark_recovery_complete(sb, es); - printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n", + ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode", test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": "writeback"); @@ -1998,7 +2038,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) cantfind_ext3: if (!silent) - printk(KERN_ERR "VFS: Can't find ext3 filesystem on dev %s.\n", + ext3_msg(sb, KERN_INFO, + "error: can't find ext3 filesystem on dev %s.", sb->s_id); goto failed_mount; @@ -2066,27 +2107,27 @@ static journal_t *ext3_get_journal(struct super_block *sb, journal_inode = ext3_iget(sb, journal_inum); if (IS_ERR(journal_inode)) { - printk(KERN_ERR "EXT3-fs: no journal found.\n"); + ext3_msg(sb, KERN_ERR, "error: no journal found"); return NULL; } if (!journal_inode->i_nlink) { make_bad_inode(journal_inode); iput(journal_inode); - printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n"); + ext3_msg(sb, KERN_ERR, "error: journal inode is deleted"); return NULL; } jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", journal_inode, journal_inode->i_size); if (!S_ISREG(journal_inode->i_mode)) { - printk(KERN_ERR "EXT3-fs: invalid journal inode.\n"); + ext3_msg(sb, KERN_ERR, "error: invalid journal inode"); iput(journal_inode); return NULL; } journal = journal_init_inode(journal_inode); if (!journal) { - printk(KERN_ERR "EXT3-fs: Could not load journal inode\n"); + ext3_msg(sb, KERN_ERR, "error: could not load journal inode"); iput(journal_inode); return NULL; } @@ -2108,13 +2149,13 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, struct ext3_super_block * es; struct block_device *bdev; - bdev = ext3_blkdev_get(j_dev); + bdev = ext3_blkdev_get(j_dev, sb); if (bdev == NULL) return NULL; if (bd_claim(bdev, sb)) { - printk(KERN_ERR - "EXT3: failed to claim external journal device.\n"); + ext3_msg(sb, KERN_ERR, + "error: failed to claim external journal device"); blkdev_put(bdev, FMODE_READ|FMODE_WRITE); return NULL; } @@ -2122,8 +2163,8 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { - printk(KERN_ERR - "EXT3-fs: blocksize too small for journal device.\n"); + ext3_msg(sb, KERN_ERR, + "error: blocksize too small for journal device"); goto out_bdev; } @@ -2131,8 +2172,8 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, offset = EXT3_MIN_BLOCK_SIZE % blocksize; set_blocksize(bdev, blocksize); if (!(bh = __bread(bdev, sb_block, blocksize))) { - printk(KERN_ERR "EXT3-fs: couldn't read superblock of " - "external journal\n"); + ext3_msg(sb, KERN_ERR, "error: couldn't read superblock of " + "external journal"); goto out_bdev; } @@ -2140,14 +2181,14 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { - printk(KERN_ERR "EXT3-fs: external journal has " - "bad superblock\n"); + ext3_msg(sb, KERN_ERR, "error: external journal has " + "bad superblock"); brelse(bh); goto out_bdev; } if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { - printk(KERN_ERR "EXT3-fs: journal UUID does not match\n"); + ext3_msg(sb, KERN_ERR, "error: journal UUID does not match"); brelse(bh); goto out_bdev; } @@ -2159,19 +2200,21 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, journal = journal_init_dev(bdev, sb->s_bdev, start, len, blocksize); if (!journal) { - printk(KERN_ERR "EXT3-fs: failed to create device journal\n"); + ext3_msg(sb, KERN_ERR, + "error: failed to create device journal"); goto out_bdev; } journal->j_private = sb; ll_rw_block(READ, 1, &journal->j_sb_buffer); wait_on_buffer(journal->j_sb_buffer); if (!buffer_uptodate(journal->j_sb_buffer)) { - printk(KERN_ERR "EXT3-fs: I/O error on journal device\n"); + ext3_msg(sb, KERN_ERR, "I/O error on journal device"); goto out_journal; } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { - printk(KERN_ERR "EXT3-fs: External journal has more than one " - "user (unsupported) - %d\n", + ext3_msg(sb, KERN_ERR, + "error: external journal has more than one " + "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); goto out_journal; } @@ -2197,8 +2240,8 @@ static int ext3_load_journal(struct super_block *sb, if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { - printk(KERN_INFO "EXT3-fs: external journal device major/minor " - "numbers have changed\n"); + ext3_msg(sb, KERN_INFO, "external journal device major/minor " + "numbers have changed"); journal_dev = new_decode_dev(journal_devnum); } else journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); @@ -2213,21 +2256,21 @@ static int ext3_load_journal(struct super_block *sb, if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) { if (sb->s_flags & MS_RDONLY) { - printk(KERN_INFO "EXT3-fs: INFO: recovery " - "required on readonly filesystem.\n"); + ext3_msg(sb, KERN_INFO, + "recovery required on readonly filesystem"); if (really_read_only) { - printk(KERN_ERR "EXT3-fs: write access " - "unavailable, cannot proceed.\n"); + ext3_msg(sb, KERN_ERR, "error: write access " + "unavailable, cannot proceed"); return -EROFS; } - printk (KERN_INFO "EXT3-fs: write access will " - "be enabled during recovery.\n"); + ext3_msg(sb, KERN_INFO, + "write access will be enabled during recovery"); } } if (journal_inum && journal_dev) { - printk(KERN_ERR "EXT3-fs: filesystem has both journal " - "and inode journals!\n"); + ext3_msg(sb, KERN_ERR, "error: filesystem has both journal " + "and inode journals"); return -EINVAL; } @@ -2242,7 +2285,7 @@ static int ext3_load_journal(struct super_block *sb, if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { err = journal_update_format(journal); if (err) { - printk(KERN_ERR "EXT3-fs: error updating journal.\n"); + ext3_msg(sb, KERN_ERR, "error updating journal"); journal_destroy(journal); return err; } @@ -2254,7 +2297,7 @@ static int ext3_load_journal(struct super_block *sb, err = journal_load(journal); if (err) { - printk(KERN_ERR "EXT3-fs: error loading journal.\n"); + ext3_msg(sb, KERN_ERR, "error loading journal"); journal_destroy(journal); return err; } @@ -2273,16 +2316,17 @@ static int ext3_load_journal(struct super_block *sb, return 0; } -static int ext3_create_journal(struct super_block * sb, - struct ext3_super_block * es, +static int ext3_create_journal(struct super_block *sb, + struct ext3_super_block *es, unsigned int journal_inum) { journal_t *journal; int err; if (sb->s_flags & MS_RDONLY) { - printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to " - "create journal.\n"); + ext3_msg(sb, KERN_ERR, + "error: readonly filesystem when trying to " + "create journal"); return -EROFS; } @@ -2290,12 +2334,12 @@ static int ext3_create_journal(struct super_block * sb, if (!journal) return -EINVAL; - printk(KERN_INFO "EXT3-fs: creating new journal on inode %u\n", + ext3_msg(sb, KERN_INFO, "creating new journal on inode %u", journal_inum); err = journal_create(journal); if (err) { - printk(KERN_ERR "EXT3-fs: error creating journal.\n"); + ext3_msg(sb, KERN_ERR, "error creating journal"); journal_destroy(journal); return -EIO; } @@ -2376,8 +2420,8 @@ out: * has recorded an error from a previous lifetime, move that error to the * main filesystem now. */ -static void ext3_clear_journal_err(struct super_block * sb, - struct ext3_super_block * es) +static void ext3_clear_journal_err(struct super_block *sb, + struct ext3_super_block *es) { journal_t *journal; int j_errno; @@ -2568,10 +2612,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) __le32 ret; if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))) { - printk(KERN_WARNING "EXT3-fs: %s: couldn't " - "remount RDWR because of unsupported " - "optional features (%x).\n", - sb->s_id, le32_to_cpu(ret)); + ext3_msg(sb, KERN_WARNING, + "warning: couldn't remount RDWR " + "because of unsupported optional " + "features (%x)", le32_to_cpu(ret)); err = -EROFS; goto restore_opts; } @@ -2582,11 +2626,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) * require a full umount/remount for now. */ if (es->s_last_orphan) { - printk(KERN_WARNING "EXT3-fs: %s: couldn't " + ext3_msg(sb, KERN_WARNING, "warning: couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " - "umount/remount instead.\n", - sb->s_id); + "umount/remount instead."); err = -EINVAL; goto restore_opts; } @@ -2686,13 +2729,11 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) buf->f_bsize = sb->s_blocksize; buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last; buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); - es->s_free_blocks_count = cpu_to_le32(buf->f_bfree); buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); - es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); buf->f_namelen = EXT3_NAME_LEN; fsid = le64_to_cpup((void *)es->s_uuid) ^ le64_to_cpup((void *)es->s_uuid + sizeof(u64)); @@ -2837,9 +2878,9 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, if (EXT3_SB(sb)->s_qf_names[type]) { /* Quotafile not of fs root? */ if (path.dentry->d_parent != sb->s_root) - printk(KERN_WARNING - "EXT3-fs: Quota file not on filesystem root. " - "Journaled quota will not work.\n"); + ext3_msg(sb, KERN_WARNING, + "warning: Quota file not on filesystem root. " + "Journaled quota will not work."); } /* @@ -2921,8 +2962,9 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, handle_t *handle = journal_current_handle(); if (!handle) { - printk(KERN_WARNING "EXT3-fs: Quota write (off=%Lu, len=%Lu)" - " cancelled because transaction is not started.\n", + ext3_msg(sb, KERN_WARNING, + "warning: quota write (off=%llu, len=%llu)" + " cancelled because transaction is not started.", (unsigned long long)off, (unsigned long long)len); return -EIO; } diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 545e37c4b91..387d92d00b9 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -960,6 +960,10 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (error) goto cleanup; + error = ext3_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto cleanup; + if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) { struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc); memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); @@ -985,9 +989,6 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (flags & XATTR_CREATE) goto cleanup; } - error = ext3_journal_get_write_access(handle, is.iloc.bh); - if (error) - goto cleanup; if (!value) { if (!is.s.not_found) error = ext3_xattr_ibody_set(handle, inode, &i, &is); diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 9f2d45d75b1..e5f6774846e 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -2,6 +2,7 @@ config EXT4_FS tristate "The Extended 4 (ext4) filesystem" select JBD2 select CRC16 + select FS_JOURNAL_INFO help This is the next generation of the ext3 filesystem. @@ -26,6 +27,16 @@ config EXT4_FS If unsure, say N. +config EXT4_USE_FOR_EXT23 + bool "Use ext4 for ext2/ext3 file systems" + depends on EXT3_FS=n || EXT2_FS=n + default y + help + Allow the ext4 file system driver code to be used for ext2 or + ext3 file system mounts. This allows users to reduce their + compiled kernel size by using one file system driver for + ext2, ext3, and ext4 file systems. + config EXT4_FS_XATTR bool "Ext4 extended attributes" depends on EXT4_FS diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1d0418980f8..22bc7435d91 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -499,44 +499,6 @@ error_return: } /** - * ext4_free_blocks() -- Free given blocks and update quota - * @handle: handle for this transaction - * @inode: inode - * @block: start physical block to free - * @count: number of blocks to count - * @metadata: Are these metadata blocks - */ -void ext4_free_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t block, unsigned long count, - int metadata) -{ - struct super_block *sb; - unsigned long dquot_freed_blocks; - - /* this isn't the right place to decide whether block is metadata - * inode.c/extents.c knows better, but for safety ... */ - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - metadata = 1; - - /* We need to make sure we don't reuse - * block released untill the transaction commit. - * writeback mode have weak data consistency so - * don't force data as metadata when freeing block - * for writeback mode. - */ - if (metadata == 0 && !ext4_should_writeback_data(inode)) - metadata = 1; - - sb = inode->i_sb; - - ext4_mb_free_blocks(handle, inode, block, count, - metadata, &dquot_freed_blocks); - if (dquot_freed_blocks) - vfs_dq_free_block(inode, dquot_freed_blocks); - return; -} - -/** * ext4_has_free_blocks() * @sbi: in-core super block structure. * @nblocks: number of needed blocks @@ -761,7 +723,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, ext4_group_t group) { - return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0; + if (!ext4_bg_has_super(sb, group)) + return 0; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG)) + return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); + else + return EXT4_SB(sb)->s_gdb_count; } /** diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 50784ef0756..4df8621ec31 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_block *sb) if (ext4_bg_has_super(sb, i) && ((i < 5) || ((i % flex_size) == 0))) add_system_zone(sbi, ext4_group_first_block_no(sb, i), - sbi->s_gdb_count + 1); + ext4_bg_num_gdb(sb, i) + 1); gdp = ext4_get_group_desc(sb, i, NULL); ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); if (ret) @@ -228,6 +228,7 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, struct rb_node *n = sbi->system_blks.rb_node; if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count < start_blk) || (start_blk + count > ext4_blocks_count(sbi->s_es))) return 0; while (n) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8825515eedd..ab31e65d46d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -376,6 +376,12 @@ struct ext4_new_group_data { EXT4_GET_BLOCKS_DIO_CREATE_EXT) /* + * Flags used by ext4_free_blocks + */ +#define EXT4_FREE_BLOCKS_METADATA 0x0001 +#define EXT4_FREE_BLOCKS_FORGET 0x0002 + +/* * ioctl commands */ #define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS @@ -703,6 +709,13 @@ struct ext4_inode_info { struct list_head i_aio_dio_complete_list; /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; }; /* @@ -750,6 +763,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt #define set_opt(o, opt) o |= EXT4_MOUNT_##opt @@ -1324,8 +1338,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); -extern void ext4_free_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t block, unsigned long count, int metadata); extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); @@ -1384,16 +1396,15 @@ extern int ext4_mb_reserve_blocks(struct super_block *, int); extern void ext4_discard_preallocations(struct inode *); extern int __init init_ext4_mballoc(void); extern void exit_ext4_mballoc(void); -extern void ext4_mb_free_blocks(handle_t *, struct inode *, - ext4_fsblk_t, unsigned long, int, unsigned long *); +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); extern void ext4_mb_put_buddy_cache_lock(struct super_block *, ext4_group_t, int); /* inode.c */ -int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t blocknr); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int, int *); struct buffer_head *ext4_bread(handle_t *, struct inode *, diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 6a9409920de..b57e5c711b6 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -4,6 +4,8 @@ #include "ext4_jbd2.h" +#include <trace/events/ext4.h> + int __ext4_journal_get_undo_access(const char *where, handle_t *handle, struct buffer_head *bh) { @@ -32,35 +34,69 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle, return err; } -int __ext4_journal_forget(const char *where, handle_t *handle, - struct buffer_head *bh) +/* + * The ext4 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. + * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + * + * If the handle isn't valid we're not journaling, but we still need to + * call into ext4_journal_revoke() to put the buffer head. + */ +int __ext4_forget(const char *where, handle_t *handle, int is_metadata, + struct inode *inode, struct buffer_head *bh, + ext4_fsblk_t blocknr) { - int err = 0; + int err; - if (ext4_handle_valid(handle)) { - err = jbd2_journal_forget(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, - handle, err); - } - else + might_sleep(); + + trace_ext4_forget(inode, is_metadata, blocknr); + BUFFER_TRACE(bh, "enter"); + + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " + "data mode %x\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* In the no journal case, we can just do a bforget and return */ + if (!ext4_handle_valid(handle)) { bforget(bh); - return err; -} + return 0; + } -int __ext4_journal_revoke(const char *where, handle_t *handle, - ext4_fsblk_t blocknr, struct buffer_head *bh) -{ - int err = 0; + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. */ - if (ext4_handle_valid(handle)) { - err = jbd2_journal_revoke(handle, blocknr, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, - handle, err); + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext4_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call jbd2_journal_forget"); + err = jbd2_journal_forget(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + return err; + } + return 0; } - else - bforget(bh); + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call jbd2_journal_revoke"); + err = jbd2_journal_revoke(handle, blocknr, bh); + if (err) { + ext4_journal_abort_handle(where, __func__, bh, handle, err); + ext4_abort(inode->i_sb, __func__, + "error %d when attempting revoke", err); + } + BUFFER_TRACE(bh, "exit"); return err; } diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index a2865980342..05eca817d70 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -49,7 +49,7 @@ #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ EXT4_XATTR_TRANS_BLOCKS - 2 + \ - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* * Define the number of metadata blocks we need to account to modify data. @@ -57,7 +57,7 @@ * This include super block, inode block, quota blocks and xattr blocks */ #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* Delete operations potentially hit one directory's namespace plus an * entire inode, plus arbitrary amounts of bitmap/indirection data. Be @@ -92,6 +92,7 @@ * but inode, sb and group updates are done only once */ #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) + #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) #else @@ -99,6 +100,9 @@ #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 #endif +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) int ext4_mark_iloc_dirty(handle_t *handle, @@ -116,12 +120,8 @@ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); /* - * Wrapper functions with which ext4 calls into JBD. The intent here is - * to allow these to be turned into appropriate stubs so ext4 can control - * ext2 filesystems, so ext2+ext4 systems only nee one fs. This work hasn't - * been done yet. + * Wrapper functions with which ext4 calls into JBD. */ - void ext4_journal_abort_handle(const char *caller, const char *err_fn, struct buffer_head *bh, handle_t *handle, int err); @@ -131,13 +131,9 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle, int __ext4_journal_get_write_access(const char *where, handle_t *handle, struct buffer_head *bh); -/* When called with an invalid handle, this will still do a put on the BH */ -int __ext4_journal_forget(const char *where, handle_t *handle, - struct buffer_head *bh); - -/* When called with an invalid handle, this will still do a put on the BH */ -int __ext4_journal_revoke(const char *where, handle_t *handle, - ext4_fsblk_t blocknr, struct buffer_head *bh); +int __ext4_forget(const char *where, handle_t *handle, int is_metadata, + struct inode *inode, struct buffer_head *bh, + ext4_fsblk_t blocknr); int __ext4_journal_get_create_access(const char *where, handle_t *handle, struct buffer_head *bh); @@ -149,12 +145,11 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, __ext4_journal_get_undo_access(__func__, (handle), (bh)) #define ext4_journal_get_write_access(handle, bh) \ __ext4_journal_get_write_access(__func__, (handle), (bh)) -#define ext4_journal_revoke(handle, blocknr, bh) \ - __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) +#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ + __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\ + (block_nr)) #define ext4_journal_get_create_access(handle, bh) \ __ext4_journal_get_create_access(__func__, (handle), (bh)) -#define ext4_journal_forget(handle, bh) \ - __ext4_journal_forget(__func__, (handle), (bh)) #define ext4_handle_dirty_metadata(handle, inode, bh) \ __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) @@ -254,6 +249,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) return 0; } +static inline void ext4_update_inode_fsync_trans(handle_t *handle, + struct inode *inode, + int datasync) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + ei->i_datasync_tid = handle->h_transaction->t_tid; + } +} + /* super.c */ int ext4_force_commit(struct super_block *sb); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 715264b4bae..3a7928f825e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1007,7 +1007,8 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; - ext4_free_blocks(handle, inode, ablocks[i], 1, 1); + ext4_free_blocks(handle, inode, 0, ablocks[i], 1, + EXT4_FREE_BLOCKS_METADATA); } } kfree(ablocks); @@ -1761,7 +1762,9 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, while (block < last && block != EXT_MAX_BLOCK) { num = last - block; /* find extent for this block */ + down_read(&EXT4_I(inode)->i_data_sem); path = ext4_ext_find_extent(inode, block, path); + up_read(&EXT4_I(inode)->i_data_sem); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; @@ -1957,7 +1960,6 @@ errout: static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { - struct buffer_head *bh; int err; ext4_fsblk_t leaf; @@ -1973,9 +1975,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, if (err) return err; ext_debug("index is empty, remove it, free block %llu\n", leaf); - bh = sb_find_get_block(inode->i_sb, leaf); - ext4_forget(handle, 1, inode, bh, leaf); - ext4_free_blocks(handle, inode, leaf, 1, 1); + ext4_free_blocks(handle, inode, 0, leaf, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return err; } @@ -2042,12 +2043,11 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, ext4_lblk_t from, ext4_lblk_t to) { - struct buffer_head *bh; unsigned short ee_len = ext4_ext_get_actual_len(ex); - int i, metadata = 0; + int flags = EXT4_FREE_BLOCKS_FORGET; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - metadata = 1; + flags |= EXT4_FREE_BLOCKS_METADATA; #ifdef EXTENTS_STATS { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2072,11 +2072,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, num = le32_to_cpu(ex->ee_block) + ee_len - from; start = ext_pblock(ex) + ee_len - num; ext_debug("free last %u blocks starting %llu\n", num, start); - for (i = 0; i < num; i++) { - bh = sb_find_get_block(inode->i_sb, start + i); - ext4_forget(handle, 0, inode, bh, start + i); - } - ext4_free_blocks(handle, inode, start, num, metadata); + ext4_free_blocks(handle, inode, 0, start, num, flags); } else if (from == le32_to_cpu(ex->ee_block) && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", @@ -2167,7 +2163,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, correct_index = 1; credits += (ext_depth(inode)) + 1; } - credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); err = ext4_ext_truncate_extend_restart(handle, inode, credits); if (err) @@ -3064,6 +3060,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { ret = ext4_convert_unwritten_extents_dio(handle, inode, path); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); goto out2; } /* buffered IO case */ @@ -3091,6 +3089,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ret = ext4_ext_convert_to_initialized(handle, inode, path, iblock, max_blocks); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); out: if (ret <= 0) { err = ret; @@ -3319,8 +3319,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ ext4_discard_preallocations(inode); - ext4_free_blocks(handle, inode, ext_pblock(&newex), - ext4_ext_get_actual_len(&newex), 0); + ext4_free_blocks(handle, inode, 0, ext_pblock(&newex), + ext4_ext_get_actual_len(&newex), 0); goto out2; } @@ -3329,10 +3329,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, allocated = ext4_ext_get_actual_len(&newex); set_buffer_new(bh_result); - /* Cache only when it is _not_ an uninitialized extent */ - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) + /* + * Cache the extent and update transaction to commit on fdatasync only + * when it is _not_ an uninitialized extent. + */ + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { ext4_ext_put_in_cache(inode, iblock, allocated, newblock, EXT4_EXT_CACHE_EXTENT); + ext4_update_inode_fsync_trans(handle, inode, 1); + } else + ext4_update_inode_fsync_trans(handle, inode, 0); out: if (allocated > max_blocks) allocated = max_blocks; @@ -3720,10 +3726,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * Walk the extent tree gathering extent information. * ext4_ext_fiemap_cb will push extents back to user. */ - down_read(&EXT4_I(inode)->i_data_sem); error = ext4_ext_walk_space(inode, start_blk, len_blks, ext4_ext_fiemap_cb, fieinfo); - up_read(&EXT4_I(inode)->i_data_sem); } return error; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 2b1531266ee..0b22497d92e 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -51,25 +51,30 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; + struct ext4_inode_info *ei = EXT4_I(inode); journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - int err, ret = 0; + int ret; + tid_t commit_tid; J_ASSERT(ext4_journal_current_handle() == NULL); trace_ext4_sync_file(file, dentry, datasync); + if (inode->i_sb->s_flags & MS_RDONLY) + return 0; + ret = flush_aio_dio_completed_IO(inode); if (ret < 0) - goto out; + return ret; + + if (!journal) + return simple_fsync(file, dentry, datasync); + /* - * data=writeback: + * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. - * sync_inode() will sync the metadata - * - * data=ordered: - * The caller's filemap_fdatawrite() will write the data and - * sync_inode() will write the inode if it is dirty. Then the caller's - * filemap_fdatawait() will wait on the pages. + * Metadata is in the journal, we wait for proper transaction to + * commit here. * * data=journal: * filemap_fdatawrite won't do anything (the buffers are clean). @@ -79,32 +84,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) * (they were dirtied by commit). But that's OK - the blocks are * safe in-journal, which is all fsync() needs to ensure. */ - if (ext4_should_journal_data(inode)) { - ret = ext4_force_commit(inode->i_sb); - goto out; - } + if (ext4_should_journal_data(inode)) + return ext4_force_commit(inode->i_sb); - if (!journal) - ret = sync_mapping_buffers(inode->i_mapping); - - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - goto out; - - /* - * The VFS has written the file data. If the inode is unaltered - * then we need not start a commit. - */ - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* sys_fsync did this */ - }; - err = sync_inode(inode, &wbc); - if (ret == 0) - ret = err; - } -out: - if (journal && (journal->j_flags & JBD2_BARRIER)) + commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; + if (jbd2_log_start_commit(journal, commit_tid)) + jbd2_log_wait_commit(journal, commit_tid); + else if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(inode->i_sb->s_bdev, NULL); return ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4e8e2f15b8b..5352db1a308 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -71,58 +71,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) } /* - * The ext4 forget function must perform a revoke if we are freeing data - * which has been journaled. Metadata (eg. indirect blocks) must be - * revoked in all cases. - * - * "bh" may be NULL: a metadata block may have been freed from memory - * but there may still be a record of it in the journal, and that record - * still needs to be revoked. - * - * If the handle isn't valid we're not journaling, but we still need to - * call into ext4_journal_revoke() to put the buffer head. - */ -int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t blocknr) -{ - int err; - - might_sleep(); - - BUFFER_TRACE(bh, "enter"); - - jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " - "data mode %x\n", - bh, is_metadata, inode->i_mode, - test_opt(inode->i_sb, DATA_FLAGS)); - - /* Never use the revoke function if we are doing full data - * journaling: there is no need to, and a V1 superblock won't - * support it. Otherwise, only skip the revoke on un-journaled - * data blocks. */ - - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || - (!is_metadata && !ext4_should_journal_data(inode))) { - if (bh) { - BUFFER_TRACE(bh, "call jbd2_journal_forget"); - return ext4_journal_forget(handle, bh); - } - return 0; - } - - /* - * data!=journal && (is_metadata || should_journal_data(inode)) - */ - BUFFER_TRACE(bh, "call ext4_journal_revoke"); - err = ext4_journal_revoke(handle, blocknr, bh); - if (err) - ext4_abort(inode->i_sb, __func__, - "error %d when attempting revoke", err); - BUFFER_TRACE(bh, "exit"); - return err; -} - -/* * Work out how many blocks we need to proceed with the next chunk of a * truncate transaction. */ @@ -721,7 +669,7 @@ allocated: return ret; failed_out: for (i = 0; i < index; i++) - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); return ret; } @@ -817,14 +765,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, return err; failed: /* Allocation failed, free what we already allocated */ + ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); for (i = 1; i <= n ; i++) { - BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); - ext4_journal_forget(handle, branch[i].bh); + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, + EXT4_FREE_BLOCKS_FORGET); } - for (i = 0; i < indirect_blks; i++) - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); + for (i = n+1; i < indirect_blks; i++) + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); - ext4_free_blocks(handle, inode, new_blocks[i], num, 0); + ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0); return err; } @@ -903,12 +857,16 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, err_out: for (i = 1; i <= num; i++) { - BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); - ext4_journal_forget(handle, where[i].bh); - ext4_free_blocks(handle, inode, - le32_to_cpu(where[i-1].key), 1, 0); + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, where[i].bh, 0, 1, + EXT4_FREE_BLOCKS_FORGET); } - ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); + ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key), + blks, 0); return err; } @@ -1021,10 +979,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, if (!err) err = ext4_splice_branch(handle, inode, iblock, partial, indirect_blks, count); - else + if (err) goto cleanup; set_buffer_new(bh_result); + + ext4_update_inode_fsync_trans(handle, inode, 1); got_it: map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); if (count > blocks_to_boundary) @@ -1052,7 +1012,7 @@ qsize_t ext4_get_reserved_space(struct inode *inode) EXT4_I(inode)->i_reserved_meta_blocks; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - return total; + return (total << inode->i_blkbits); } /* * Calculate the number of metadata blocks need to reserve @@ -1534,6 +1494,16 @@ static int do_journal_get_write_access(handle_t *handle, return ext4_journal_get_write_access(handle, bh); } +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static void ext4_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); +} + static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1599,7 +1569,7 @@ retry: ext4_journal_stop(handle); if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to @@ -1709,7 +1679,7 @@ static int ext4_ordered_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -1751,7 +1721,7 @@ static int ext4_writeback_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -1814,7 +1784,7 @@ static int ext4_journalled_write_end(struct file *file, if (!ret) ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -2600,7 +2570,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) } static int __ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc, unsigned int len) { struct address_space *mapping = page->mapping; @@ -2758,7 +2727,7 @@ static int ext4_writepage(struct page *page, * doesn't seem much point in redirtying the page here. */ ClearPageChecked(page); - return __ext4_journalled_writepage(page, wbc, len); + return __ext4_journalled_writepage(page, len); } if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) @@ -2788,7 +2757,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) * number of contiguous block. So we will limit * number of contiguous block to a sane value */ - if (!(inode->i_flags & EXT4_EXTENTS_FL) && + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && (max_blocks > EXT4_MAX_TRANS_DATA)) max_blocks = EXT4_MAX_TRANS_DATA; @@ -3091,7 +3060,7 @@ retry: * i_size_read because we hold i_mutex. */ if (pos + len > inode->i_size) - ext4_truncate(inode); + ext4_truncate_failed_write(inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -4120,6 +4089,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, __le32 *last) { __le32 *p; + int flags = EXT4_FREE_BLOCKS_FORGET; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); @@ -4134,27 +4108,10 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, } } - /* - * Any buffers which are on the journal will be in memory. We - * find them on the hash table so jbd2_journal_revoke() will - * run jbd2_journal_forget() on them. We've already detached - * each block from the file, so bforget() in - * jbd2_journal_forget() should be safe. - * - * AKPM: turn on bforget in jbd2_journal_forget()!!! - */ - for (p = first; p < last; p++) { - u32 nr = le32_to_cpu(*p); - if (nr) { - struct buffer_head *tbh; - - *p = 0; - tbh = sb_find_get_block(inode->i_sb, nr); - ext4_forget(handle, 0, inode, tbh, nr); - } - } + for (p = first; p < last; p++) + *p = 0; - ext4_free_blocks(handle, inode, block_to_free, count, 0); + ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); } /** @@ -4342,7 +4299,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, blocks_for_truncate(inode)); } - ext4_free_blocks(handle, inode, nr, 1, 1); + ext4_free_blocks(handle, inode, 0, nr, 1, + EXT4_FREE_BLOCKS_METADATA); if (parent_bh) { /* @@ -4781,8 +4739,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; - struct buffer_head *bh; struct inode *inode; + journal_t *journal = EXT4_SB(sb)->s_journal; long ret; int block; @@ -4793,11 +4751,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) return inode; ei = EXT4_I(inode); + iloc.bh = 0; ret = __ext4_get_inode_loc(inode, &iloc, 0); if (ret < 0) goto bad_inode; - bh = iloc.bh; raw_inode = ext4_raw_inode(&iloc); inode->i_mode = le16_to_cpu(raw_inode->i_mode); inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); @@ -4820,7 +4778,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (inode->i_mode == 0 || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { /* this inode is deleted */ - brelse(bh); ret = -ESTALE; goto bad_inode; } @@ -4848,11 +4805,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + transaction_t *transaction; + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + spin_unlock(&journal->j_state_lock); + ei->i_sync_tid = tid; + ei->i_datasync_tid = tid; + } + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb)) { - brelse(bh); ret = -EIO; goto bad_inode; } @@ -4884,10 +4865,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = 0; if (ei->i_file_acl && - ((ei->i_file_acl < - (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + - EXT4_SB(sb)->s_gdb_count)) || - (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { + !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { ext4_error(sb, __func__, "bad extended attribute block %llu in inode #%lu", ei->i_file_acl, inode->i_ino); @@ -4905,10 +4883,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) /* Validate block references which are part of inode */ ret = ext4_check_inode_blockref(inode); } - if (ret) { - brelse(bh); + if (ret) goto bad_inode; - } if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; @@ -4936,7 +4912,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else { - brelse(bh); ret = -EIO; ext4_error(inode->i_sb, __func__, "bogus i_mode (%o) for inode=%lu", @@ -4949,6 +4924,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) return inode; bad_inode: + brelse(iloc.bh); iget_failed(inode); return ERR_PTR(ret); } @@ -5108,6 +5084,7 @@ static int ext4_do_update_inode(handle_t *handle, err = rc; ei->i_state &= ~EXT4_STATE_NEW; + ext4_update_inode_fsync_trans(handle, inode, 0); out_brelse: brelse(bh); ext4_std_error(inode->i_sb, err); @@ -5227,8 +5204,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ - handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ - EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); + handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index c1cdf613e72..b63d193126d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -221,31 +221,38 @@ setversion_out: struct file *donor_filp; int err; + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + if (copy_from_user(&me, (struct move_extent __user *)arg, sizeof(me))) return -EFAULT; + me.moved_len = 0; donor_filp = fget(me.donor_fd); if (!donor_filp) return -EBADF; - if (!capable(CAP_DAC_OVERRIDE)) { - if ((current->real_cred->fsuid != inode->i_uid) || - !(inode->i_mode & S_IRUSR) || - !(donor_filp->f_dentry->d_inode->i_mode & - S_IRUSR)) { - fput(donor_filp); - return -EACCES; - } + if (!(donor_filp->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto mext_out; } + err = mnt_want_write(filp->f_path.mnt); + if (err) + goto mext_out; + err = ext4_move_extents(filp, donor_filp, me.orig_start, me.donor_start, me.len, &me.moved_len); - fput(donor_filp); + mnt_drop_write(filp->f_path.mnt); + if (me.moved_len > 0) + file_remove_suid(donor_filp); if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) - return -EFAULT; - + err = -EFAULT; +mext_out: + fput(donor_filp); return err; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 74e495dabe0..b1fd3daadc9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2529,7 +2529,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) struct ext4_group_info *db; int err, count = 0, count2 = 0; struct ext4_free_data *entry; - ext4_fsblk_t discard_block; struct list_head *l, *ltmp; list_for_each_safe(l, ltmp, &txn->t_private_list) { @@ -2559,13 +2558,19 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) page_cache_release(e4b.bd_bitmap_page); } ext4_unlock_group(sb, entry->group); - discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) - + entry->start_blk - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, - entry->count); - sb_issue_discard(sb, discard_block, entry->count); - + if (test_opt(sb, DISCARD)) { + ext4_fsblk_t discard_block; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + discard_block = (ext4_fsblk_t)entry->group * + EXT4_BLOCKS_PER_GROUP(sb) + + entry->start_blk + + le32_to_cpu(es->s_first_data_block); + trace_ext4_discard_blocks(sb, + (unsigned long long)discard_block, + entry->count); + sb_issue_discard(sb, discard_block, entry->count); + } kmem_cache_free(ext4_free_ext_cachep, entry); ext4_mb_release_desc(&e4b); } @@ -3006,6 +3011,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) } /* + * Called on failure; free up any blocks from the inode PA for this + * context. We don't need this for MB_GROUP_PA because we only change + * pa_free in ext4_mb_release_context(), but on failure, we've already + * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. + */ +static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + int len; + + if (pa && pa->pa_type == MB_INODE_PA) { + len = ac->ac_b_ex.fe_len; + pa->pa_free += len; + } + +} + +/* * use blocks preallocated to inode */ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, @@ -3932,7 +3955,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) * per cpu locality group is to reduce the contention between block * request from multiple CPUs. */ - ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id()); + ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups); /* we're going to use group allocation */ ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; @@ -4290,6 +4313,7 @@ repeat: ac->ac_status = AC_STATUS_CONTINUE; goto repeat; } else if (*errp) { + ext4_discard_allocated_blocks(ac); ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); @@ -4422,18 +4446,24 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, return 0; } -/* - * Main entry point into mballoc to free blocks +/** + * ext4_free_blocks() -- Free given blocks and update quota + * @handle: handle for this transaction + * @inode: inode + * @block: start physical block to free + * @count: number of blocks to count + * @metadata: Are these metadata blocks */ -void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t block, unsigned long count, - int metadata, unsigned long *freed) +void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags) { struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; struct ext4_allocation_context *ac = NULL; struct ext4_group_desc *gdp; struct ext4_super_block *es; + unsigned long freed = 0; unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; @@ -4443,13 +4473,16 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, int err = 0; int ret; - *freed = 0; + if (bh) { + if (block) + BUG_ON(block != bh->b_blocknr); + else + block = bh->b_blocknr; + } sbi = EXT4_SB(sb); es = EXT4_SB(sb)->s_es; - if (block < le32_to_cpu(es->s_first_data_block) || - block + count < block || - block + count > ext4_blocks_count(es)) { + if (!ext4_data_block_valid(sbi, block, count)) { ext4_error(sb, __func__, "Freeing blocks not in datazone - " "block = %llu, count = %lu", block, count); @@ -4457,7 +4490,32 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, } ext4_debug("freeing block %llu\n", block); - trace_ext4_free_blocks(inode, block, count, metadata); + trace_ext4_free_blocks(inode, block, count, flags); + + if (flags & EXT4_FREE_BLOCKS_FORGET) { + struct buffer_head *tbh = bh; + int i; + + BUG_ON(bh && (count > 1)); + + for (i = 0; i < count; i++) { + if (!bh) + tbh = sb_find_get_block(inode->i_sb, + block + i); + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, + inode, tbh, block + i); + } + } + + /* + * We need to make sure we don't reuse the freed block until + * after the transaction is committed, which we can do by + * treating the block as metadata, below. We make an + * exception if the inode is to be written in writeback mode + * since writeback mode has weak data consistency guarantees. + */ + if (!ext4_should_writeback_data(inode)) + flags |= EXT4_FREE_BLOCKS_METADATA; ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (ac) { @@ -4533,7 +4591,8 @@ do_more: err = ext4_mb_load_buddy(sb, block_group, &e4b); if (err) goto error_return; - if (metadata && ext4_handle_valid(handle)) { + + if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { struct ext4_free_data *new_entry; /* * blocks being freed are metadata. these blocks shouldn't @@ -4572,7 +4631,7 @@ do_more: ext4_mb_release_desc(&e4b); - *freed += count; + freed += count; /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); @@ -4592,6 +4651,8 @@ do_more: } sb->s_dirt = 1; error_return: + if (freed) + vfs_dq_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); if (ac) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index a93d5b80f3e..81415814b00 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) * So allocate a credit of 3. We may update * quota (user and group). */ - needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); + needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); if (ext4_journal_extend(handle, needed) != 0) retval = ext4_journal_restart(handle, needed); @@ -262,13 +262,17 @@ static int free_dind_blocks(handle_t *handle, for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, - le32_to_cpu(tmp_idata[i]), 1, 1); + ext4_free_blocks(handle, inode, 0, + le32_to_cpu(tmp_idata[i]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); } } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); + ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); return 0; } @@ -297,7 +301,9 @@ static int free_tind_blocks(handle_t *handle, } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); + ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); return 0; } @@ -308,8 +314,10 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) /* ei->i_data[EXT4_IND_BLOCK] */ if (i_data[0]) { extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, - le32_to_cpu(i_data[0]), 1, 1); + ext4_free_blocks(handle, inode, 0, + le32_to_cpu(i_data[0]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); } /* ei->i_data[EXT4_DIND_BLOCK] */ @@ -419,7 +427,8 @@ static int free_ext_idx(handle_t *handle, struct inode *inode, } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, block, 1, 1); + ext4_free_blocks(handle, inode, 0, block, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return retval; } @@ -477,7 +486,7 @@ int ext4_ext_migrate(struct inode *inode) handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb) + EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + 1); if (IS_ERR(handle)) { retval = PTR_ERR(handle); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 25b6b145736..82c415be87a 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -77,12 +77,14 @@ static int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent **extent) { + struct ext4_extent_header *eh; int ppos, leaf_ppos = path->p_depth; ppos = leaf_ppos; if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { /* leaf block */ *extent = ++path[ppos].p_ext; + path[ppos].p_block = ext_pblock(path[ppos].p_ext); return 0; } @@ -119,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, ext_block_hdr(path[cur_ppos+1].p_bh); } + path[leaf_ppos].p_ext = *extent = NULL; + + eh = path[leaf_ppos].p_hdr; + if (le16_to_cpu(eh->eh_entries) == 0) + /* empty leaf is found */ + return -ENODATA; + /* leaf block */ path[leaf_ppos].p_ext = *extent = EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); + path[leaf_ppos].p_block = + ext_pblock(path[leaf_ppos].p_ext); return 0; } } @@ -155,40 +166,15 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2, } /** - * mext_double_down_read - Acquire two inodes' read semaphore - * - * @orig_inode: original inode structure - * @donor_inode: donor inode structure - * Acquire read semaphore of the two inodes (orig and donor) by i_ino order. - */ -static void -mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode) -{ - struct inode *first = orig_inode, *second = donor_inode; - - /* - * Use the inode number to provide the stable locking order instead - * of its address, because the C language doesn't guarantee you can - * compare pointers that don't come from the same array. - */ - if (donor_inode->i_ino < orig_inode->i_ino) { - first = donor_inode; - second = orig_inode; - } - - down_read(&EXT4_I(first)->i_data_sem); - down_read(&EXT4_I(second)->i_data_sem); -} - -/** - * mext_double_down_write - Acquire two inodes' write semaphore + * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem * * @orig_inode: original inode structure * @donor_inode: donor inode structure - * Acquire write semaphore of the two inodes (orig and donor) by i_ino order. + * Acquire write lock of i_data_sem of the two inodes (orig and donor) by + * i_ino order. */ static void -mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) +double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) { struct inode *first = orig_inode, *second = donor_inode; @@ -203,32 +189,18 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) } down_write(&EXT4_I(first)->i_data_sem); - down_write(&EXT4_I(second)->i_data_sem); + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); } /** - * mext_double_up_read - Release two inodes' read semaphore + * double_up_write_data_sem - Release two inodes' write lock of i_data_sem * * @orig_inode: original inode structure to be released its lock first * @donor_inode: donor inode structure to be released its lock second - * Release read semaphore of two inodes (orig and donor). + * Release write lock of i_data_sem of two inodes (orig and donor). */ static void -mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) -{ - up_read(&EXT4_I(orig_inode)->i_data_sem); - up_read(&EXT4_I(donor_inode)->i_data_sem); -} - -/** - * mext_double_up_write - Release two inodes' write semaphore - * - * @orig_inode: original inode structure to be released its lock first - * @donor_inode: donor inode structure to be released its lock second - * Release write semaphore of two inodes (orig and donor). - */ -static void -mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) +double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) { up_write(&EXT4_I(orig_inode)->i_data_sem); up_write(&EXT4_I(donor_inode)->i_data_sem); @@ -596,7 +568,7 @@ out: * @tmp_oext: the extent that will belong to the donor inode * @orig_off: block offset of original inode * @donor_off: block offset of donor inode - * @max_count: the maximun length of extents + * @max_count: the maximum length of extents * * Return 0 on success, or a negative error value on failure. */ @@ -661,6 +633,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, * @donor_inode: donor inode * @from: block offset of orig_inode * @count: block count to be replaced + * @err: pointer to save return value * * Replace original inode extents and donor inode extents page by page. * We implement this replacement in the following three steps: @@ -671,33 +644,33 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, * 3. Change the block information of donor inode to point at the saved * original inode blocks in the dummy extents. * - * Return 0 on success, or a negative error value on failure. + * Return replaced block count. */ static int mext_replace_branches(handle_t *handle, struct inode *orig_inode, struct inode *donor_inode, ext4_lblk_t from, - ext4_lblk_t count) + ext4_lblk_t count, int *err) { struct ext4_ext_path *orig_path = NULL; struct ext4_ext_path *donor_path = NULL; struct ext4_extent *oext, *dext; struct ext4_extent tmp_dext, tmp_oext; ext4_lblk_t orig_off = from, donor_off = from; - int err = 0; int depth; int replaced_count = 0; int dext_alen; - mext_double_down_write(orig_inode, donor_inode); + /* Protect extent trees against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); /* Get the original extent for the block "orig_off" */ - err = get_ext_path(orig_inode, orig_off, &orig_path); - if (err) + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) goto out; /* Get the donor extent for the head */ - err = get_ext_path(donor_inode, donor_off, &donor_path); - if (err) + *err = get_ext_path(donor_inode, donor_off, &donor_path); + if (*err) goto out; depth = ext_depth(orig_inode); oext = orig_path[depth].p_ext; @@ -707,9 +680,9 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, dext = donor_path[depth].p_ext; tmp_dext = *dext; - err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, donor_off, count); - if (err) + if (*err) goto out; /* Loop for the donor extents */ @@ -718,7 +691,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, if (!dext) { ext4_error(donor_inode->i_sb, __func__, "The extent for donor must be found"); - err = -EIO; + *err = -EIO; goto out; } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { ext4_error(donor_inode->i_sb, __func__, @@ -726,20 +699,20 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, "extent(%u) should be equal", donor_off, le32_to_cpu(tmp_dext.ee_block)); - err = -EIO; + *err = -EIO; goto out; } /* Set donor extent to orig extent */ - err = mext_leaf_block(handle, orig_inode, + *err = mext_leaf_block(handle, orig_inode, orig_path, &tmp_dext, &orig_off); - if (err < 0) + if (*err) goto out; /* Set orig extent to donor extent */ - err = mext_leaf_block(handle, donor_inode, + *err = mext_leaf_block(handle, donor_inode, donor_path, &tmp_oext, &donor_off); - if (err < 0) + if (*err) goto out; dext_alen = ext4_ext_get_actual_len(&tmp_dext); @@ -753,35 +726,25 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, if (orig_path) ext4_ext_drop_refs(orig_path); - err = get_ext_path(orig_inode, orig_off, &orig_path); - if (err) + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) goto out; depth = ext_depth(orig_inode); oext = orig_path[depth].p_ext; - if (le32_to_cpu(oext->ee_block) + - ext4_ext_get_actual_len(oext) <= orig_off) { - err = 0; - goto out; - } tmp_oext = *oext; if (donor_path) ext4_ext_drop_refs(donor_path); - err = get_ext_path(donor_inode, donor_off, &donor_path); - if (err) + *err = get_ext_path(donor_inode, donor_off, &donor_path); + if (*err) goto out; depth = ext_depth(donor_inode); dext = donor_path[depth].p_ext; - if (le32_to_cpu(dext->ee_block) + - ext4_ext_get_actual_len(dext) <= donor_off) { - err = 0; - goto out; - } tmp_dext = *dext; - err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, donor_off, count - replaced_count); - if (err) + if (*err) goto out; } @@ -795,8 +758,12 @@ out: kfree(donor_path); } - mext_double_up_write(orig_inode, donor_inode); - return err; + ext4_ext_invalidate_cache(orig_inode); + ext4_ext_invalidate_cache(donor_inode); + + double_up_write_data_sem(orig_inode, donor_inode); + + return replaced_count; } /** @@ -808,16 +775,17 @@ out: * @data_offset_in_page: block index where data swapping starts * @block_len_in_page: the number of blocks to be swapped * @uninit: orig extent is uninitialized or not + * @err: pointer to save return value * * Save the data in original inode blocks and replace original inode extents * with donor inode extents by calling mext_replace_branches(). - * Finally, write out the saved data in new original inode blocks. Return 0 - * on success, or a negative error value on failure. + * Finally, write out the saved data in new original inode blocks. Return + * replaced block count. */ static int move_extent_per_page(struct file *o_filp, struct inode *donor_inode, pgoff_t orig_page_offset, int data_offset_in_page, - int block_len_in_page, int uninit) + int block_len_in_page, int uninit, int *err) { struct inode *orig_inode = o_filp->f_dentry->d_inode; struct address_space *mapping = orig_inode->i_mapping; @@ -829,9 +797,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, long long offs = orig_page_offset << PAGE_CACHE_SHIFT; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int w_flags = 0; - unsigned int tmp_data_len, data_len; + unsigned int tmp_data_size, data_size, replaced_size; void *fsdata; - int ret, i, jblocks; + int i, jblocks; + int err2 = 0; + int replaced_count = 0; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; /* @@ -841,8 +811,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; handle = ext4_journal_start(orig_inode, jblocks); if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - return ret; + *err = PTR_ERR(handle); + return 0; } if (segment_eq(get_fs(), KERNEL_DS)) @@ -858,39 +828,36 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, * Just swap data blocks between orig and donor. */ if (uninit) { - ret = mext_replace_branches(handle, orig_inode, - donor_inode, orig_blk_offset, - block_len_in_page); - - /* Clear the inode cache not to refer to the old data */ - ext4_ext_invalidate_cache(orig_inode); - ext4_ext_invalidate_cache(donor_inode); + replaced_count = mext_replace_branches(handle, orig_inode, + donor_inode, orig_blk_offset, + block_len_in_page, err); goto out2; } offs = (long long)orig_blk_offset << orig_inode->i_blkbits; - /* Calculate data_len */ + /* Calculate data_size */ if ((orig_blk_offset + block_len_in_page - 1) == ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { /* Replace the last block */ - tmp_data_len = orig_inode->i_size & (blocksize - 1); + tmp_data_size = orig_inode->i_size & (blocksize - 1); /* - * If data_len equal zero, it shows data_len is multiples of + * If data_size equal zero, it shows data_size is multiples of * blocksize. So we set appropriate value. */ - if (tmp_data_len == 0) - tmp_data_len = blocksize; + if (tmp_data_size == 0) + tmp_data_size = blocksize; - data_len = tmp_data_len + + data_size = tmp_data_size + ((block_len_in_page - 1) << orig_inode->i_blkbits); - } else { - data_len = block_len_in_page << orig_inode->i_blkbits; - } + } else + data_size = block_len_in_page << orig_inode->i_blkbits; + + replaced_size = data_size; - ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags, + *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, &page, &fsdata); - if (unlikely(ret < 0)) + if (unlikely(*err < 0)) goto out; if (!PageUptodate(page)) { @@ -911,14 +878,17 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, /* Release old bh and drop refs */ try_to_release_page(page, 0); - ret = mext_replace_branches(handle, orig_inode, donor_inode, - orig_blk_offset, block_len_in_page); - if (ret < 0) - goto out; - - /* Clear the inode cache not to refer to the old data */ - ext4_ext_invalidate_cache(orig_inode); - ext4_ext_invalidate_cache(donor_inode); + replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, + orig_blk_offset, block_len_in_page, + &err2); + if (err2) { + if (replaced_count) { + block_len_in_page = replaced_count; + replaced_size = + block_len_in_page << orig_inode->i_blkbits; + } else + goto out; + } if (!page_has_buffers(page)) create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); @@ -928,16 +898,16 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { - ret = ext4_get_block(orig_inode, + *err = ext4_get_block(orig_inode, (sector_t)(orig_blk_offset + i), bh, 0); - if (ret < 0) + if (*err < 0) goto out; if (bh->b_this_page != NULL) bh = bh->b_this_page; } - ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len, + *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, page, fsdata); page = NULL; @@ -951,7 +921,10 @@ out: out2: ext4_journal_stop(handle); - return ret < 0 ? ret : 0; + if (err2) + *err = err2; + + return replaced_count; } /** @@ -962,7 +935,6 @@ out2: * @orig_start: logical start offset in block for orig * @donor_start: logical start offset in block for donor * @len: the number of blocks to be moved - * @moved_len: moved block length * * Check the arguments of ext4_move_extents() whether the files can be * exchanged with each other. @@ -970,8 +942,8 @@ out2: */ static int mext_check_arguments(struct inode *orig_inode, - struct inode *donor_inode, __u64 orig_start, - __u64 donor_start, __u64 *len, __u64 moved_len) + struct inode *donor_inode, __u64 orig_start, + __u64 donor_start, __u64 *len) { ext4_lblk_t orig_blocks, donor_blocks; unsigned int blkbits = orig_inode->i_blkbits; @@ -985,6 +957,13 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { + ext4_debug("ext4 move extent: suid or sgid is set" + " to donor file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + /* Ext4 move extent does not support swapfile */ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { ext4_debug("ext4 move extent: The argument files should " @@ -1025,13 +1004,6 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } - if (moved_len) { - ext4_debug("ext4 move extent: moved_len should be 0 " - "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, - donor_inode->i_ino); - return -EINVAL; - } - if ((orig_start > EXT_MAX_BLOCK) || (donor_start > EXT_MAX_BLOCK) || (*len > EXT_MAX_BLOCK) || @@ -1088,7 +1060,7 @@ mext_check_arguments(struct inode *orig_inode, } if (!*len) { - ext4_debug("ext4 move extent: len shoudld not be 0 " + ext4_debug("ext4 move extent: len should not be 0 " "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; @@ -1232,16 +1204,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, return -EINVAL; } - /* protect orig and donor against a truncate */ + /* Protect orig and donor inodes against a truncate */ ret1 = mext_inode_double_lock(orig_inode, donor_inode); if (ret1 < 0) return ret1; - mext_double_down_read(orig_inode, donor_inode); + /* Protect extent tree against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, - donor_start, &len, *moved_len); - mext_double_up_read(orig_inode, donor_inode); + donor_start, &len); if (ret1) goto out; @@ -1355,36 +1327,39 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, seq_start = le32_to_cpu(ext_cur->ee_block); rest_blocks = seq_blocks; - /* Discard preallocations of two inodes */ - down_write(&EXT4_I(orig_inode)->i_data_sem); - ext4_discard_preallocations(orig_inode); - up_write(&EXT4_I(orig_inode)->i_data_sem); - - down_write(&EXT4_I(donor_inode)->i_data_sem); - ext4_discard_preallocations(donor_inode); - up_write(&EXT4_I(donor_inode)->i_data_sem); + /* + * Up semaphore to avoid following problems: + * a. transaction deadlock among ext4_journal_start, + * ->write_begin via pagefault, and jbd2_journal_commit + * b. racing with ->readpage, ->write_begin, and ext4_get_block + * in move_extent_per_page + */ + double_up_write_data_sem(orig_inode, donor_inode); while (orig_page_offset <= seq_end_page) { /* Swap original branches with new branches */ - ret1 = move_extent_per_page(o_filp, donor_inode, + block_len_in_page = move_extent_per_page( + o_filp, donor_inode, orig_page_offset, data_offset_in_page, - block_len_in_page, uninit); - if (ret1 < 0) - goto out; - orig_page_offset++; + block_len_in_page, uninit, + &ret1); + /* Count how many blocks we have exchanged */ *moved_len += block_len_in_page; + if (ret1 < 0) + break; if (*moved_len > len) { ext4_error(orig_inode->i_sb, __func__, "We replaced blocks too much! " "sum of replaced: %llu requested: %llu", *moved_len, len); ret1 = -EIO; - goto out; + break; } + orig_page_offset++; data_offset_in_page = 0; rest_blocks -= block_len_in_page; if (rest_blocks > blocks_per_page) @@ -1393,6 +1368,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, block_len_in_page = rest_blocks; } + double_down_write_data_sem(orig_inode, donor_inode); + if (ret1 < 0) + break; + /* Decrease buffer counter */ if (holecheck_path) ext4_ext_drop_refs(holecheck_path); @@ -1414,6 +1393,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, } out: + if (*moved_len) { + ext4_discard_preallocations(orig_inode); + ext4_discard_preallocations(donor_inode); + } + if (orig_path) { ext4_ext_drop_refs(orig_path); kfree(orig_path); @@ -1422,7 +1406,7 @@ out: ext4_ext_drop_refs(holecheck_path); kfree(holecheck_path); } - + double_up_write_data_sem(orig_inode, donor_inode); ret2 = mext_inode_double_unlock(orig_inode, donor_inode); if (ret1) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6d2c1b897fc..17a17e10dd6 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1292,9 +1292,6 @@ errout: * add_dirent_to_buf will attempt search the directory block for * space. It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. - * - * NOTE! bh is NOT released in the case where ENOSPC is returned. In - * all other cases bh is released. */ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *inode, struct ext4_dir_entry_2 *de, @@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, top = bh->b_data + blocksize - reclen; while ((char *) de <= top) { if (!ext4_check_dir_entry("ext4_add_entry", dir, de, - bh, offset)) { - brelse(bh); + bh, offset)) return -EIO; - } - if (ext4_match(namelen, name, de)) { - brelse(bh); + if (ext4_match(namelen, name, de)) return -EEXIST; - } nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); if ((de->inode? rlen - nlen: rlen) >= reclen) @@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, err = ext4_journal_get_write_access(handle, bh); if (err) { ext4_std_error(dir->i_sb, err); - brelse(bh); return err; } @@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, err = ext4_handle_dirty_metadata(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); - brelse(bh); return 0; } @@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, if (!(de)) return retval; - return add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + return retval; } /* @@ -1514,8 +1507,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if(!bh) return retval; retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (retval != -ENOSPC) + if (retval != -ENOSPC) { + brelse(bh); return retval; + } if (blocks == 1 && !dx_fallback && EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) @@ -1528,7 +1523,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); - return add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + return retval; } /* @@ -1561,10 +1558,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto journal_error; err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (err != -ENOSPC) { - bh = NULL; + if (err != -ENOSPC) goto cleanup; - } /* Block full, should compress but for now just split */ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", @@ -1657,7 +1652,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (!de) goto cleanup; err = add_dirent_to_buf(handle, dentry, inode, de, bh); - bh = NULL; goto cleanup; journal_error: @@ -1775,7 +1769,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1809,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1846,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2259,7 +2253,7 @@ static int ext4_symlink(struct inode *dir, retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 3cfc343c41b..3b2c5541d8a 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -247,7 +247,7 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_bh; if (IS_ERR(gdb = bclean(handle, sb, block))) { - err = PTR_ERR(bh); + err = PTR_ERR(gdb); goto exit_bh; } ext4_handle_dirty_metadata(handle, NULL, gdb); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d4ca92aab51..827bde1f259 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -603,10 +603,6 @@ static void ext4_put_super(struct super_block *sb) if (sb->s_dirt) ext4_commit_super(sb, 1); - ext4_release_system_zone(sb); - ext4_mb_release(sb); - ext4_ext_release(sb); - ext4_xattr_put_super(sb); if (sbi->s_journal) { err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -614,6 +610,12 @@ static void ext4_put_super(struct super_block *sb) ext4_abort(sb, __func__, "Couldn't clean up the journal"); } + + ext4_release_system_zone(sb); + ext4_mb_release(sb); + ext4_ext_release(sb); + ext4_xattr_put_super(sb); + if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); @@ -704,6 +706,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&(ei->i_block_reservation_lock)); INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); ei->cur_aio_dio = NULL; + ei->i_sync_tid = 0; + ei->i_datasync_tid = 0; return &ei->vfs_inode; } @@ -765,9 +769,22 @@ static inline void ext4_show_quota_options(struct seq_file *seq, #if defined(CONFIG_QUOTA) struct ext4_sb_info *sbi = EXT4_SB(sb); - if (sbi->s_jquota_fmt) - seq_printf(seq, ",jqfmt=%s", - (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold" : "vfsv0"); + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } if (sbi->s_qf_names[USRQUOTA]) seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); @@ -899,6 +916,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NO_AUTO_DA_ALLOC)) seq_puts(seq, ",noauto_da_alloc"); + if (test_opt(sb, DISCARD)) + seq_puts(seq, ",discard"); + + if (test_opt(sb, NOLOAD)) + seq_puts(seq, ",norecovery"); + ext4_show_quota_options(seq, sb); return 0; @@ -1074,12 +1097,13 @@ enum { Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, - Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, + Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, + Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_block_validity, Opt_noblock_validity, - Opt_inode_readahead_blks, Opt_journal_ioprio + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_discard, Opt_nodiscard, }; static const match_table_t tokens = { @@ -1104,6 +1128,7 @@ static const match_table_t tokens = { {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_noload, "noload"}, + {Opt_noload, "norecovery"}, {Opt_nobh, "nobh"}, {Opt_bh, "bh"}, {Opt_commit, "commit=%u"}, @@ -1125,6 +1150,7 @@ static const match_table_t tokens = { {Opt_grpjquota, "grpjquota=%s"}, {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_grpquota, "grpquota"}, {Opt_noquota, "noquota"}, {Opt_quota, "quota"}, @@ -1144,6 +1170,8 @@ static const match_table_t tokens = { {Opt_auto_da_alloc, "auto_da_alloc=%u"}, {Opt_auto_da_alloc, "auto_da_alloc"}, {Opt_noauto_da_alloc, "noauto_da_alloc"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_err, NULL}, }; @@ -1425,6 +1453,9 @@ clear_qf_name: goto set_qf_format; case Opt_jqfmt_vfsv0: qfmt = QFMT_VFS_V0; + goto set_qf_format; + case Opt_jqfmt_vfsv1: + qfmt = QFMT_VFS_V1; set_qf_format: if (sb_any_quota_loaded(sb) && sbi->s_jquota_fmt != qfmt) { @@ -1467,6 +1498,7 @@ set_qf_format: case Opt_offgrpjquota: case Opt_jqfmt_vfsold: case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: ext4_msg(sb, KERN_ERR, "journaled quota options not supported"); break; @@ -1565,6 +1597,12 @@ set_qf_format: else set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); break; + case Opt_discard: + set_opt(sbi->s_mount_opt, DISCARD); + break; + case Opt_nodiscard: + clear_opt(sbi->s_mount_opt, DISCARD); + break; default: ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " @@ -1673,14 +1711,14 @@ static int ext4_fill_flex_info(struct super_block *sb) size_t size; int i; - if (!sbi->s_es->s_log_groups_per_flex) { + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; + groups_per_flex = 1 << sbi->s_log_groups_per_flex; + + if (groups_per_flex < 2) { sbi->s_log_groups_per_flex = 0; return 1; } - sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; - groups_per_flex = 1 << sbi->s_log_groups_per_flex; - /* We allocate both existing and potentially added groups */ flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << @@ -2099,11 +2137,8 @@ static int parse_strtoul(const char *buf, { char *endp; - while (*buf && isspace(*buf)) - buf++; - *value = simple_strtoul(buf, &endp, 0); - while (*endp && isspace(*endp)) - endp++; + *value = simple_strtoul(skip_spaces(buf), &endp, 0); + endp = skip_spaces(endp); if (*endp || *value > max) return -EINVAL; @@ -2721,26 +2756,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { if (ext4_load_journal(sb, es, journal_devnum)) goto failed_mount3; - if (!(sb->s_flags & MS_RDONLY) && - EXT4_SB(sb)->s_journal->j_failed_commit) { - ext4_msg(sb, KERN_CRIT, "error: " - "ext4_fill_super: Journal transaction " - "%u is corrupt", - EXT4_SB(sb)->s_journal->j_failed_commit); - if (test_opt(sb, ERRORS_RO)) { - ext4_msg(sb, KERN_CRIT, - "Mounting filesystem read-only"); - sb->s_flags |= MS_RDONLY; - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - } - if (test_opt(sb, ERRORS_PANIC)) { - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb, 1); - goto failed_mount4; - } - } } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { ext4_msg(sb, KERN_ERR, "required journal recovery " @@ -3668,13 +3683,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); - ext4_free_blocks_count_set(es, buf->f_bfree); buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); if (buf->f_bfree < ext4_r_blocks_count(es)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); - es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); buf->f_namelen = EXT4_NAME_LEN; fsid = le64_to_cpup((void *)es->s_uuid) ^ le64_to_cpup((void *)es->s_uuid + sizeof(u64)); @@ -3966,6 +3979,58 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags, return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); } +#if !defined(CONTIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext2_fs_type = { + .owner = THIS_MODULE, + .name = "ext2", + .get_sb = ext4_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static inline void register_as_ext2(void) +{ + int err = register_filesystem(&ext2_fs_type); + if (err) + printk(KERN_WARNING + "EXT4-fs: Unable to register as ext2 (%d)\n", err); +} + +static inline void unregister_as_ext2(void) +{ + unregister_filesystem(&ext2_fs_type); +} +#else +static inline void register_as_ext2(void) { } +static inline void unregister_as_ext2(void) { } +#endif + +#if !defined(CONTIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext3_fs_type = { + .owner = THIS_MODULE, + .name = "ext3", + .get_sb = ext4_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static inline void register_as_ext3(void) +{ + int err = register_filesystem(&ext3_fs_type); + if (err) + printk(KERN_WARNING + "EXT4-fs: Unable to register as ext3 (%d)\n", err); +} + +static inline void unregister_as_ext3(void) +{ + unregister_filesystem(&ext3_fs_type); +} +#else +static inline void register_as_ext3(void) { } +static inline void unregister_as_ext3(void) { } +#endif + static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", @@ -3995,11 +4060,15 @@ static int __init init_ext4_fs(void) err = init_inodecache(); if (err) goto out1; + register_as_ext2(); + register_as_ext3(); err = register_filesystem(&ext4_fs_type); if (err) goto out; return 0; out: + unregister_as_ext2(); + unregister_as_ext3(); destroy_inodecache(); out1: exit_ext4_xattr(); @@ -4015,6 +4084,8 @@ out4: static void __exit exit_ext4_fs(void) { + unregister_as_ext2(); + unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); destroy_inodecache(); exit_ext4_xattr(); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index fed5b01d7a8..910bf9a59cb 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -482,9 +482,10 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, ea_bdebug(bh, "refcount now=0; freeing"); if (ce) mb_cache_entry_free(ce); - ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1); get_bh(bh); - ext4_forget(handle, 1, inode, bh, bh->b_blocknr); + ext4_free_blocks(handle, inode, bh, 0, 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); error = ext4_handle_dirty_metadata(handle, inode, bh); @@ -832,7 +833,8 @@ inserted: new_bh = sb_getblk(sb, block); if (!new_bh) { getblk_failed: - ext4_free_blocks(handle, inode, block, 1, 1); + ext4_free_blocks(handle, inode, 0, block, 1, + EXT4_FREE_BLOCKS_METADATA); error = -EIO; goto cleanup; } @@ -988,6 +990,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (error) goto cleanup; + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto cleanup; + if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); @@ -1013,9 +1019,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (flags & XATTR_CREATE) goto cleanup; } - error = ext4_journal_get_write_access(handle, is.iloc.bh); - if (error) - goto cleanup; if (!value) { if (!is.s.not_found) error = ext4_xattr_ibody_set(handle, inode, &i, &is); diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 4dcddf83326..b192c661caa 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -10,6 +10,7 @@ config GFS2_FS select SLOW_WORK select QUOTA select QUOTACTL + select FS_JOURNAL_INFO help A cluster filesystem. diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index c5dad1eb7b9..0dc34621f6a 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -85,11 +85,7 @@ static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf) buf[0] = '\0'; if (!gfs2_uuid_valid(uuid)) return 0; - return snprintf(buf, PAGE_SIZE, "%02X%02X%02X%02X-%02X%02X-" - "%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n", - uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5], - uuid[6], uuid[7], uuid[8], uuid[9], uuid[10], uuid[11], - uuid[12], uuid[13], uuid[14], uuid[15]); + return snprintf(buf, PAGE_SIZE, "%pUB\n", uuid); } static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) @@ -575,14 +571,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj, add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); if (!sdp->sd_args.ar_spectator) add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid); - if (gfs2_uuid_valid(uuid)) { - add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-" - "%02X%02X-%02X%02X%02X%02X%02X%02X", - uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], - uuid[5], uuid[6], uuid[7], uuid[8], uuid[9], - uuid[10], uuid[11], uuid[12], uuid[13], - uuid[14], uuid[15]); - } + if (gfs2_uuid_valid(uuid)) + add_uevent_var(env, "UUID=%pUB", uuid); return 0; } diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index 6d98f116ca0..424b0337f52 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -289,6 +289,10 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name, err = hfs_brec_find(&src_fd); if (err) goto out; + if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) { + err = -EIO; + goto out; + } hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset, src_fd.entrylength); diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 7c69b98a2e4..2b3b8611b41 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -79,6 +79,11 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) filp->f_pos++; /* fall through */ case 1: + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); if (entry.type != HFS_CDR_THD) { printk(KERN_ERR "hfs: bad catalog folder thread\n"); @@ -109,6 +114,12 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) err = -EIO; goto out; } + + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); type = entry.type; len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index f7fcbe49da7..5ed7252b7b2 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -409,8 +409,13 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) /* try to get the root inode */ hfs_find_init(HFS_SB(sb)->cat_tree, &fd); res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); - if (!res) + if (!res) { + if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) { + res = -EIO; + goto bail; + } hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength); + } if (res) { hfs_find_exit(&fd); goto bail_no_root; diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index defb932eee9..0b3fa7974fa 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -36,286 +36,323 @@ static void *zisofs_zlib_workspace; static DEFINE_MUTEX(zisofs_zlib_lock); /* - * When decompressing, we typically obtain more than one page - * per reference. We inject the additional pages into the page - * cache as a form of readahead. + * Read data of @inode from @block_start to @block_end and uncompress + * to one zisofs block. Store the data in the @pages array with @pcount + * entries. Start storing at offset @poffset of the first page. */ -static int zisofs_readpage(struct file *file, struct page *page) +static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, + loff_t block_end, int pcount, + struct page **pages, unsigned poffset, + int *errp) { - struct inode *inode = file->f_path.dentry->d_inode; - struct address_space *mapping = inode->i_mapping; - unsigned int maxpage, xpage, fpage, blockindex; - unsigned long offset; - unsigned long blockptr, blockendptr, cstart, cend, csize; - struct buffer_head *bh, *ptrbh[2]; - unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); - unsigned int bufshift = ISOFS_BUFFER_BITS(inode); - unsigned long bufmask = bufsize - 1; - int err = -EIO; - int i; - unsigned int header_size = ISOFS_I(inode)->i_format_parm[0]; unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1]; - /* unsigned long zisofs_block_size = 1UL << zisofs_block_shift; */ - unsigned int zisofs_block_page_shift = zisofs_block_shift-PAGE_CACHE_SHIFT; - unsigned long zisofs_block_pages = 1UL << zisofs_block_page_shift; - unsigned long zisofs_block_page_mask = zisofs_block_pages-1; - struct page *pages[zisofs_block_pages]; - unsigned long index = page->index; - int indexblocks; - - /* We have already been given one page, this is the one - we must do. */ - xpage = index & zisofs_block_page_mask; - pages[xpage] = page; - - /* The remaining pages need to be allocated and inserted */ - offset = index & ~zisofs_block_page_mask; - blockindex = offset >> zisofs_block_page_shift; - maxpage = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - - /* - * If this page is wholly outside i_size we just return zero; - * do_generic_file_read() will handle this for us - */ - if (page->index >= maxpage) { - SetPageUptodate(page); - unlock_page(page); + unsigned int bufsize = ISOFS_BUFFER_SIZE(inode); + unsigned int bufshift = ISOFS_BUFFER_BITS(inode); + unsigned int bufmask = bufsize - 1; + int i, block_size = block_end - block_start; + z_stream stream = { .total_out = 0, + .avail_in = 0, + .avail_out = 0, }; + int zerr; + int needblocks = (block_size + (block_start & bufmask) + bufmask) + >> bufshift; + int haveblocks; + blkcnt_t blocknum; + struct buffer_head *bhs[needblocks + 1]; + int curbh, curpage; + + if (block_size > deflateBound(1UL << zisofs_block_shift)) { + *errp = -EIO; return 0; } - - maxpage = min(zisofs_block_pages, maxpage-offset); - - for ( i = 0 ; i < maxpage ; i++, offset++ ) { - if ( i != xpage ) { - pages[i] = grab_cache_page_nowait(mapping, offset); - } - page = pages[i]; - if ( page ) { - ClearPageError(page); - kmap(page); + /* Empty block? */ + if (block_size == 0) { + for ( i = 0 ; i < pcount ; i++ ) { + if (!pages[i]) + continue; + memset(page_address(pages[i]), 0, PAGE_CACHE_SIZE); + flush_dcache_page(pages[i]); + SetPageUptodate(pages[i]); } + return ((loff_t)pcount) << PAGE_CACHE_SHIFT; } - /* This is the last page filled, plus one; used in case of abort. */ - fpage = 0; + /* Because zlib is not thread-safe, do all the I/O at the top. */ + blocknum = block_start >> bufshift; + memset(bhs, 0, (needblocks + 1) * sizeof(struct buffer_head *)); + haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks); + ll_rw_block(READ, haveblocks, bhs); - /* Find the pointer to this specific chunk */ - /* Note: we're not using isonum_731() here because the data is known aligned */ - /* Note: header_size is in 32-bit words (4 bytes) */ - blockptr = (header_size + blockindex) << 2; - blockendptr = blockptr + 4; + curbh = 0; + curpage = 0; + /* + * First block is special since it may be fractional. We also wait for + * it before grabbing the zlib mutex; odds are that the subsequent + * blocks are going to come in in short order so we don't hold the zlib + * mutex longer than necessary. + */ - indexblocks = ((blockptr^blockendptr) >> bufshift) ? 2 : 1; - ptrbh[0] = ptrbh[1] = NULL; + if (!bhs[0]) + goto b_eio; - if ( isofs_get_blocks(inode, blockptr >> bufshift, ptrbh, indexblocks) != indexblocks ) { - if ( ptrbh[0] ) brelse(ptrbh[0]); - printk(KERN_DEBUG "zisofs: Null buffer on reading block table, inode = %lu, block = %lu\n", - inode->i_ino, blockptr >> bufshift); - goto eio; - } - ll_rw_block(READ, indexblocks, ptrbh); - - bh = ptrbh[0]; - if ( !bh || (wait_on_buffer(bh), !buffer_uptodate(bh)) ) { - printk(KERN_DEBUG "zisofs: Failed to read block table, inode = %lu, block = %lu\n", - inode->i_ino, blockptr >> bufshift); - if ( ptrbh[1] ) - brelse(ptrbh[1]); - goto eio; - } - cstart = le32_to_cpu(*(__le32 *)(bh->b_data + (blockptr & bufmask))); - - if ( indexblocks == 2 ) { - /* We just crossed a block boundary. Switch to the next block */ - brelse(bh); - bh = ptrbh[1]; - if ( !bh || (wait_on_buffer(bh), !buffer_uptodate(bh)) ) { - printk(KERN_DEBUG "zisofs: Failed to read block table, inode = %lu, block = %lu\n", - inode->i_ino, blockendptr >> bufshift); - goto eio; - } + wait_on_buffer(bhs[0]); + if (!buffer_uptodate(bhs[0])) { + *errp = -EIO; + goto b_eio; } - cend = le32_to_cpu(*(__le32 *)(bh->b_data + (blockendptr & bufmask))); - brelse(bh); - if (cstart > cend) - goto eio; + stream.workspace = zisofs_zlib_workspace; + mutex_lock(&zisofs_zlib_lock); - csize = cend-cstart; - - if (csize > deflateBound(1UL << zisofs_block_shift)) - goto eio; - - /* Now page[] contains an array of pages, any of which can be NULL, - and the locks on which we hold. We should now read the data and - release the pages. If the pages are NULL the decompressed data - for that particular page should be discarded. */ - - if ( csize == 0 ) { - /* This data block is empty. */ - - for ( fpage = 0 ; fpage < maxpage ; fpage++ ) { - if ( (page = pages[fpage]) != NULL ) { - memset(page_address(page), 0, PAGE_CACHE_SIZE); - - flush_dcache_page(page); - SetPageUptodate(page); - kunmap(page); - unlock_page(page); - if ( fpage == xpage ) - err = 0; /* The critical page */ - else - page_cache_release(page); + zerr = zlib_inflateInit(&stream); + if (zerr != Z_OK) { + if (zerr == Z_MEM_ERROR) + *errp = -ENOMEM; + else + *errp = -EIO; + printk(KERN_DEBUG "zisofs: zisofs_inflateInit returned %d\n", + zerr); + goto z_eio; + } + + while (curpage < pcount && curbh < haveblocks && + zerr != Z_STREAM_END) { + if (!stream.avail_out) { + if (pages[curpage]) { + stream.next_out = page_address(pages[curpage]) + + poffset; + stream.avail_out = PAGE_CACHE_SIZE - poffset; + poffset = 0; + } else { + stream.next_out = (void *)&zisofs_sink_page; + stream.avail_out = PAGE_CACHE_SIZE; } } - } else { - /* This data block is compressed. */ - z_stream stream; - int bail = 0, left_out = -1; - int zerr; - int needblocks = (csize + (cstart & bufmask) + bufmask) >> bufshift; - int haveblocks; - struct buffer_head *bhs[needblocks+1]; - struct buffer_head **bhptr; - - /* Because zlib is not thread-safe, do all the I/O at the top. */ - - blockptr = cstart >> bufshift; - memset(bhs, 0, (needblocks+1)*sizeof(struct buffer_head *)); - haveblocks = isofs_get_blocks(inode, blockptr, bhs, needblocks); - ll_rw_block(READ, haveblocks, bhs); - - bhptr = &bhs[0]; - bh = *bhptr++; - - /* First block is special since it may be fractional. - We also wait for it before grabbing the zlib - mutex; odds are that the subsequent blocks are - going to come in in short order so we don't hold - the zlib mutex longer than necessary. */ - - if ( !bh || (wait_on_buffer(bh), !buffer_uptodate(bh)) ) { - printk(KERN_DEBUG "zisofs: Hit null buffer, fpage = %d, xpage = %d, csize = %ld\n", - fpage, xpage, csize); - goto b_eio; - } - stream.next_in = bh->b_data + (cstart & bufmask); - stream.avail_in = min(bufsize-(cstart & bufmask), csize); - csize -= stream.avail_in; - - stream.workspace = zisofs_zlib_workspace; - mutex_lock(&zisofs_zlib_lock); - - zerr = zlib_inflateInit(&stream); - if ( zerr != Z_OK ) { - if ( err && zerr == Z_MEM_ERROR ) - err = -ENOMEM; - printk(KERN_DEBUG "zisofs: zisofs_inflateInit returned %d\n", - zerr); - goto z_eio; + if (!stream.avail_in) { + wait_on_buffer(bhs[curbh]); + if (!buffer_uptodate(bhs[curbh])) { + *errp = -EIO; + break; + } + stream.next_in = bhs[curbh]->b_data + + (block_start & bufmask); + stream.avail_in = min_t(unsigned, bufsize - + (block_start & bufmask), + block_size); + block_size -= stream.avail_in; + block_start = 0; } - while ( !bail && fpage < maxpage ) { - page = pages[fpage]; - if ( page ) - stream.next_out = page_address(page); - else - stream.next_out = (void *)&zisofs_sink_page; - stream.avail_out = PAGE_CACHE_SIZE; - - while ( stream.avail_out ) { - int ao, ai; - if ( stream.avail_in == 0 && left_out ) { - if ( !csize ) { - printk(KERN_WARNING "zisofs: ZF read beyond end of input\n"); - bail = 1; - break; - } else { - bh = *bhptr++; - if ( !bh || - (wait_on_buffer(bh), !buffer_uptodate(bh)) ) { - /* Reached an EIO */ - printk(KERN_DEBUG "zisofs: Hit null buffer, fpage = %d, xpage = %d, csize = %ld\n", - fpage, xpage, csize); - - bail = 1; - break; - } - stream.next_in = bh->b_data; - stream.avail_in = min(csize,bufsize); - csize -= stream.avail_in; - } - } - ao = stream.avail_out; ai = stream.avail_in; - zerr = zlib_inflate(&stream, Z_SYNC_FLUSH); - left_out = stream.avail_out; - if ( zerr == Z_BUF_ERROR && stream.avail_in == 0 ) - continue; - if ( zerr != Z_OK ) { - /* EOF, error, or trying to read beyond end of input */ - if ( err && zerr == Z_MEM_ERROR ) - err = -ENOMEM; - if ( zerr != Z_STREAM_END ) - printk(KERN_DEBUG "zisofs: zisofs_inflate returned %d, inode = %lu, index = %lu, fpage = %d, xpage = %d, avail_in = %d, avail_out = %d, ai = %d, ao = %d\n", - zerr, inode->i_ino, index, - fpage, xpage, - stream.avail_in, stream.avail_out, - ai, ao); - bail = 1; - break; + while (stream.avail_out && stream.avail_in) { + zerr = zlib_inflate(&stream, Z_SYNC_FLUSH); + if (zerr == Z_BUF_ERROR && stream.avail_in == 0) + break; + if (zerr == Z_STREAM_END) + break; + if (zerr != Z_OK) { + /* EOF, error, or trying to read beyond end of input */ + if (zerr == Z_MEM_ERROR) + *errp = -ENOMEM; + else { + printk(KERN_DEBUG + "zisofs: zisofs_inflate returned" + " %d, inode = %lu," + " page idx = %d, bh idx = %d," + " avail_in = %d," + " avail_out = %d\n", + zerr, inode->i_ino, curpage, + curbh, stream.avail_in, + stream.avail_out); + *errp = -EIO; } + goto inflate_out; } + } - if ( stream.avail_out && zerr == Z_STREAM_END ) { - /* Fractional page written before EOF. This may - be the last page in the file. */ - memset(stream.next_out, 0, stream.avail_out); - stream.avail_out = 0; + if (!stream.avail_out) { + /* This page completed */ + if (pages[curpage]) { + flush_dcache_page(pages[curpage]); + SetPageUptodate(pages[curpage]); } + curpage++; + } + if (!stream.avail_in) + curbh++; + } +inflate_out: + zlib_inflateEnd(&stream); - if ( !stream.avail_out ) { - /* This page completed */ - if ( page ) { - flush_dcache_page(page); - SetPageUptodate(page); - kunmap(page); - unlock_page(page); - if ( fpage == xpage ) - err = 0; /* The critical page */ - else - page_cache_release(page); - } - fpage++; - } +z_eio: + mutex_unlock(&zisofs_zlib_lock); + +b_eio: + for (i = 0; i < haveblocks; i++) + brelse(bhs[i]); + return stream.total_out; +} + +/* + * Uncompress data so that pages[full_page] is fully uptodate and possibly + * fills in other pages if we have data for them. + */ +static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount, + struct page **pages) +{ + loff_t start_off, end_off; + loff_t block_start, block_end; + unsigned int header_size = ISOFS_I(inode)->i_format_parm[0]; + unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1]; + unsigned int blockptr; + loff_t poffset = 0; + blkcnt_t cstart_block, cend_block; + struct buffer_head *bh; + unsigned int blkbits = ISOFS_BUFFER_BITS(inode); + unsigned int blksize = 1 << blkbits; + int err; + loff_t ret; + + BUG_ON(!pages[full_page]); + + /* + * We want to read at least 'full_page' page. Because we have to + * uncompress the whole compression block anyway, fill the surrounding + * pages with the data we have anyway... + */ + start_off = page_offset(pages[full_page]); + end_off = min_t(loff_t, start_off + PAGE_CACHE_SIZE, inode->i_size); + + cstart_block = start_off >> zisofs_block_shift; + cend_block = (end_off + (1 << zisofs_block_shift) - 1) + >> zisofs_block_shift; + + WARN_ON(start_off - (full_page << PAGE_CACHE_SHIFT) != + ((cstart_block << zisofs_block_shift) & PAGE_CACHE_MASK)); + + /* Find the pointer to this specific chunk */ + /* Note: we're not using isonum_731() here because the data is known aligned */ + /* Note: header_size is in 32-bit words (4 bytes) */ + blockptr = (header_size + cstart_block) << 2; + bh = isofs_bread(inode, blockptr >> blkbits); + if (!bh) + return -EIO; + block_start = le32_to_cpu(*(__le32 *) + (bh->b_data + (blockptr & (blksize - 1)))); + + while (cstart_block < cend_block && pcount > 0) { + /* Load end of the compressed block in the file */ + blockptr += 4; + /* Traversed to next block? */ + if (!(blockptr & (blksize - 1))) { + brelse(bh); + + bh = isofs_bread(inode, blockptr >> blkbits); + if (!bh) + return -EIO; + } + block_end = le32_to_cpu(*(__le32 *) + (bh->b_data + (blockptr & (blksize - 1)))); + if (block_start > block_end) { + brelse(bh); + return -EIO; + } + err = 0; + ret = zisofs_uncompress_block(inode, block_start, block_end, + pcount, pages, poffset, &err); + poffset += ret; + pages += poffset >> PAGE_CACHE_SHIFT; + pcount -= poffset >> PAGE_CACHE_SHIFT; + full_page -= poffset >> PAGE_CACHE_SHIFT; + poffset &= ~PAGE_CACHE_MASK; + + if (err) { + brelse(bh); + /* + * Did we finish reading the page we really wanted + * to read? + */ + if (full_page < 0) + return 0; + return err; } - zlib_inflateEnd(&stream); - z_eio: - mutex_unlock(&zisofs_zlib_lock); + block_start = block_end; + cstart_block++; + } + + if (poffset && *pages) { + memset(page_address(*pages) + poffset, 0, + PAGE_CACHE_SIZE - poffset); + flush_dcache_page(*pages); + SetPageUptodate(*pages); + } + return 0; +} - b_eio: - for ( i = 0 ; i < haveblocks ; i++ ) { - if ( bhs[i] ) - brelse(bhs[i]); +/* + * When decompressing, we typically obtain more than one page + * per reference. We inject the additional pages into the page + * cache as a form of readahead. + */ +static int zisofs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + int err; + int i, pcount, full_page; + unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1]; + unsigned int zisofs_pages_per_cblock = + PAGE_CACHE_SHIFT <= zisofs_block_shift ? + (1 << (zisofs_block_shift - PAGE_CACHE_SHIFT)) : 0; + struct page *pages[max_t(unsigned, zisofs_pages_per_cblock, 1)]; + pgoff_t index = page->index, end_index; + + end_index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + /* + * If this page is wholly outside i_size we just return zero; + * do_generic_file_read() will handle this for us + */ + if (index >= end_index) { + SetPageUptodate(page); + unlock_page(page); + return 0; + } + + if (PAGE_CACHE_SHIFT <= zisofs_block_shift) { + /* We have already been given one page, this is the one + we must do. */ + full_page = index & (zisofs_pages_per_cblock - 1); + pcount = min_t(int, zisofs_pages_per_cblock, + end_index - (index & ~(zisofs_pages_per_cblock - 1))); + index -= full_page; + } else { + full_page = 0; + pcount = 1; + } + pages[full_page] = page; + + for (i = 0; i < pcount; i++, index++) { + if (i != full_page) + pages[i] = grab_cache_page_nowait(mapping, index); + if (pages[i]) { + ClearPageError(pages[i]); + kmap(pages[i]); } } -eio: + err = zisofs_fill_pages(inode, full_page, pcount, pages); /* Release any residual pages, do not SetPageUptodate */ - while ( fpage < maxpage ) { - page = pages[fpage]; - if ( page ) { - flush_dcache_page(page); - if ( fpage == xpage ) - SetPageError(page); - kunmap(page); - unlock_page(page); - if ( fpage != xpage ) - page_cache_release(page); + for (i = 0; i < pcount; i++) { + if (pages[i]) { + flush_dcache_page(pages[i]); + if (i == full_page && err) + SetPageError(pages[i]); + kunmap(pages[i]); + unlock_page(pages[i]); + if (i != full_page) + page_cache_release(pages[i]); } - fpage++; } /* At this point, err contains 0 or -EIO depending on the "critical" page */ diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index c2fb2dd0131..96a685c550f 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -518,8 +518,7 @@ repeat: if (algo == SIG('p', 'z')) { int block_shift = isonum_711(&rr->u.ZF.parms[1]); - if (block_shift < PAGE_CACHE_SHIFT - || block_shift > 17) { + if (block_shift > 17) { printk(KERN_WARNING "isofs: " "Can't handle ZF block " "size of 2^%d\n", diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig index 4e28beeed15..a8408983abd 100644 --- a/fs/jbd/Kconfig +++ b/fs/jbd/Kconfig @@ -1,5 +1,6 @@ config JBD tristate + select FS_JOURNAL_INFO help This is a generic journalling layer for block devices. It is currently used by the ext3 file system, but it could also be diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig index f32f346f4b0..0f7d1ceafdf 100644 --- a/fs/jbd2/Kconfig +++ b/fs/jbd2/Kconfig @@ -1,6 +1,7 @@ config JBD2 tristate select CRC32 + select FS_JOURNAL_INFO help This is a generic journaling layer for block devices that support both 32-bit and 64-bit block numbers. It is currently used by diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index d4cfd6d2779..6a10238d2c6 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -286,7 +286,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, if (err) { /* * Because AS_EIO is cleared by - * wait_on_page_writeback_range(), set it again so + * filemap_fdatawait_range(), set it again so * that user process can get -EIO from fsync(). */ set_bit(AS_EIO, @@ -636,6 +636,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) JBUFFER_TRACE(jh, "ph3: write metadata"); flags = jbd2_journal_write_metadata_buffer(commit_transaction, jh, &new_jh, blocknr); + if (flags < 0) { + jbd2_journal_abort(journal, flags); + continue; + } set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); wbuf[bufs++] = jh2bh(new_jh); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index fed85388ee8..b7ca3a92a4d 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno); EXPORT_SYMBOL(jbd2_journal_ack_err); EXPORT_SYMBOL(jbd2_journal_clear_err); EXPORT_SYMBOL(jbd2_log_wait_commit); +EXPORT_SYMBOL(jbd2_log_start_commit); EXPORT_SYMBOL(jbd2_journal_start_commit); EXPORT_SYMBOL(jbd2_journal_force_commit_nested); EXPORT_SYMBOL(jbd2_journal_wipe); @@ -358,6 +359,10 @@ repeat: jbd_unlock_bh_state(bh_in); tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); + if (!tmp) { + jbd2_journal_put_journal_head(new_jh); + return -ENOMEM; + } jbd_lock_bh_state(bh_in); if (jh_in->b_frozen_data) { jbd2_free(tmp, bh_in->b_size); @@ -1248,6 +1253,13 @@ int jbd2_journal_load(journal_t *journal) if (jbd2_journal_recover(journal)) goto recovery_error; + if (journal->j_failed_commit) { + printk(KERN_ERR "JBD2: journal transaction %u on %s " + "is corrupt.\n", journal->j_failed_commit, + journal->j_devname); + return -EIO; + } + /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory * and reset them on disk. */ diff --git a/fs/namei.c b/fs/namei.c index d11f404667e..87f97ba90ad 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1279,28 +1279,6 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) return __lookup_hash(&this, base, NULL); } -/** - * lookup_one_noperm - bad hack for sysfs - * @name: pathname component to lookup - * @base: base directory to lookup from - * - * This is a variant of lookup_one_len that doesn't perform any permission - * checks. It's a horrible hack to work around the braindead sysfs - * architecture and should not be used anywhere else. - * - * DON'T USE THIS FUNCTION EVER, thanks. - */ -struct dentry *lookup_one_noperm(const char *name, struct dentry *base) -{ - int err; - struct qstr this; - - err = __lookup_one_len(name, &this, base, strlen(name)); - if (err) - return ERR_PTR(err); - return __lookup_hash(&this, base, NULL); -} - int user_path_at(int dfd, const char __user *name, unsigned flags, struct path *path) { @@ -1678,6 +1656,15 @@ struct file *do_filp_open(int dfd, const char *pathname, int will_write; int flag = open_to_namei_flags(open_flag); + /* + * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only + * check for O_DSYNC if the need any syncing at all we enforce it's + * always set instead of having to deal with possibly weird behaviour + * for malicious applications setting only __O_SYNC. + */ + if (open_flag & __O_SYNC) + open_flag |= O_DSYNC; + if (!acc_mode) acc_mode = MAY_OPEN | ACC_MODE(flag); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 293fa0528a6..73ab220354d 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -78,11 +78,6 @@ nfs4_callback_svc(void *vrqstp) set_freezable(); - /* - * FIXME: do we really need to run this under the BKL? If so, please - * add a comment about what it's intended to protect. - */ - lock_kernel(); while (!kthread_should_stop()) { /* * Listen for a request on the socket @@ -104,7 +99,6 @@ nfs4_callback_svc(void *vrqstp) preverr = err; svc_process(rqstp); } - unlock_kernel(); return 0; } @@ -160,11 +154,6 @@ nfs41_callback_svc(void *vrqstp) set_freezable(); - /* - * FIXME: do we really need to run this under the BKL? If so, please - * add a comment about what it's intended to protect. - */ - lock_kernel(); while (!kthread_should_stop()) { prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); @@ -183,7 +172,6 @@ nfs41_callback_svc(void *vrqstp) } finish_wait(&serv->sv_cb_waitq, &wq); } - unlock_kernel(); return 0; } @@ -397,6 +385,7 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp) */ static struct svc_version *nfs4_callback_version[] = { [1] = &nfs4_callback_version1, + [4] = &nfs4_callback_version4, }; static struct svc_stat nfs4_callback_stats; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 07baa8254ca..d4036be0b58 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -106,6 +106,19 @@ struct cb_sequenceres { extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, struct cb_sequenceres *res); +extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, + const nfs4_stateid *stateid); + +#define RCA4_TYPE_MASK_RDATA_DLG 0 +#define RCA4_TYPE_MASK_WDATA_DLG 1 + +struct cb_recallanyargs { + struct sockaddr *craa_addr; + uint32_t craa_objs_to_keep; + uint32_t craa_type_mask; +}; + +extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy); #endif /* CONFIG_NFS_V4_1 */ extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); @@ -114,8 +127,9 @@ extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy); #ifdef CONFIG_NFS_V4 extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); extern void nfs_callback_down(int minorversion); +extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, + const nfs4_stateid *stateid); #endif /* CONFIG_NFS_V4 */ - /* * nfs41: Callbacks are expected to not cause substantial latency, * so we limit their concurrency to 1 by setting up the maximum number diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index b7da1f54da6..defa9b4c470 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -61,6 +61,16 @@ out: return res->status; } +static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *) +{ +#if defined(CONFIG_NFS_V4_1) + if (clp->cl_minorversion > 0) + return nfs41_validate_delegation_stateid; +#endif + return nfs4_validate_delegation_stateid; +} + + __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) { struct nfs_client *clp; @@ -81,7 +91,8 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) inode = nfs_delegation_find_inode(clp, &args->fh); if (inode != NULL) { /* Set up a helper thread to actually return the delegation */ - switch(nfs_async_inode_return_delegation(inode, &args->stateid)) { + switch (nfs_async_inode_return_delegation(inode, &args->stateid, + nfs_validate_delegation_stateid(clp))) { case 0: res = 0; break; @@ -102,8 +113,31 @@ out: return res; } +int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) +{ + if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, + sizeof(delegation->stateid.data)) != 0) + return 0; + return 1; +} + #if defined(CONFIG_NFS_V4_1) +int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) +{ + if (delegation == NULL) + return 0; + + /* seqid is 4-bytes long */ + if (((u32 *) &stateid->data)[0] != 0) + return 0; + if (memcmp(&delegation->stateid.data[4], &stateid->data[4], + sizeof(stateid->data)-4)) + return 0; + + return 1; +} + /* * Validate the sequenceID sent by the server. * Return success if the sequenceID is one more than what we last saw on @@ -227,4 +261,32 @@ out: return res->csr_status; } +unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) +{ + struct nfs_client *clp; + int status; + fmode_t flags = 0; + + status = htonl(NFS4ERR_OP_NOT_IN_SESSION); + clp = nfs_find_client(args->craa_addr, 4); + if (clp == NULL) + goto out; + + dprintk("NFS: RECALL_ANY callback request from %s\n", + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags = FMODE_READ; + if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags |= FMODE_WRITE; + + if (flags) + nfs_expire_all_delegation_types(clp, flags); + status = htonl(NFS4_OK); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 76b0aa0f73b..8e1a2511c8b 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -23,6 +23,7 @@ #if defined(CONFIG_NFS_V4_1) #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 4 + 1 + 3) +#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #endif /* CONFIG_NFS_V4_1 */ #define NFSDBG_FACILITY NFSDBG_CALLBACK @@ -326,6 +327,25 @@ out_free: goto out; } +static unsigned decode_recallany_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_recallanyargs *args) +{ + uint32_t *p; + + args->craa_addr = svc_addr(rqstp); + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + args->craa_objs_to_keep = ntohl(*p++); + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + args->craa_type_mask = ntohl(*p); + + return 0; +} + #endif /* CONFIG_NFS_V4_1 */ static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) @@ -533,6 +553,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_GETATTR: case OP_CB_RECALL: case OP_CB_SEQUENCE: + case OP_CB_RECALL_ANY: *op = &callback_ops[op_nr]; break; @@ -540,7 +561,6 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_NOTIFY_DEVICEID: case OP_CB_NOTIFY: case OP_CB_PUSH_DELEG: - case OP_CB_RECALL_ANY: case OP_CB_RECALLABLE_OBJ_AVAIL: case OP_CB_RECALL_SLOT: case OP_CB_WANTS_CANCELLED: @@ -688,6 +708,11 @@ static struct callback_op callback_ops[] = { .encode_res = (callback_encode_res_t)encode_cb_sequence_res, .res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ, }, + [OP_CB_RECALL_ANY] = { + .process_op = (callback_process_op_t)nfs4_callback_recallany, + .decode_args = (callback_decode_arg_t)decode_recallany_args, + .res_maxsize = CB_OP_RECALLANY_RES_MAXSZ, + }, #endif /* CONFIG_NFS_V4_1 */ }; @@ -718,3 +743,10 @@ struct svc_version nfs4_callback_version1 = { .vs_dispatch = NULL, }; +struct svc_version nfs4_callback_version4 = { + .vs_vers = 4, + .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1), + .vs_proc = nfs4_callback_procedures1, + .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, + .vs_dispatch = NULL, +}; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 99ea196f071..ee77713ce68 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1260,10 +1260,20 @@ error: static void nfs4_session_set_rwsize(struct nfs_server *server) { #ifdef CONFIG_NFS_V4_1 + struct nfs4_session *sess; + u32 server_resp_sz; + u32 server_rqst_sz; + if (!nfs4_has_session(server->nfs_client)) return; - server->rsize = server->nfs_client->cl_session->fc_attrs.max_resp_sz; - server->wsize = server->nfs_client->cl_session->fc_attrs.max_rqst_sz; + sess = server->nfs_client->cl_session; + server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; + server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; + + if (server->rsize > server_resp_sz) + server->rsize = server_resp_sz; + if (server->wsize > server_rqst_sz) + server->wsize = server_rqst_sz; #endif /* CONFIG_NFS_V4_1 */ } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 6dd48a4405b..2563bebc4c6 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -92,7 +92,7 @@ out: return status; } -static void nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid) +static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *ctx; @@ -116,10 +116,11 @@ again: err = nfs_delegation_claim_locks(ctx, state); put_nfs_open_context(ctx); if (err != 0) - return; + return err; goto again; } spin_unlock(&inode->i_lock); + return 0; } /* @@ -261,30 +262,34 @@ static void nfs_msync_inode(struct inode *inode) /* * Basic procedure for returning a delegation to the server */ -static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation) +static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) { struct nfs_inode *nfsi = NFS_I(inode); + int err; - nfs_msync_inode(inode); /* * Guard against new delegated open/lock/unlock calls and against * state recovery */ down_write(&nfsi->rwsem); - nfs_delegation_claim_opens(inode, &delegation->stateid); + err = nfs_delegation_claim_opens(inode, &delegation->stateid); up_write(&nfsi->rwsem); - nfs_msync_inode(inode); + if (err) + goto out; - return nfs_do_return_delegation(inode, delegation, 1); + err = nfs_do_return_delegation(inode, delegation, issync); +out: + return err; } /* * Return all delegations that have been marked for return */ -void nfs_client_return_marked_delegations(struct nfs_client *clp) +int nfs_client_return_marked_delegations(struct nfs_client *clp) { struct nfs_delegation *delegation; struct inode *inode; + int err = 0; restart: rcu_read_lock(); @@ -298,12 +303,18 @@ restart: delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); spin_unlock(&clp->cl_lock); rcu_read_unlock(); - if (delegation != NULL) - __nfs_inode_return_delegation(inode, delegation); + if (delegation != NULL) { + filemap_flush(inode->i_mapping); + err = __nfs_inode_return_delegation(inode, delegation, 0); + } iput(inode); - goto restart; + if (!err) + goto restart; + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); + return err; } rcu_read_unlock(); + return 0; } /* @@ -338,8 +349,10 @@ int nfs_inode_return_delegation(struct inode *inode) spin_lock(&clp->cl_lock); delegation = nfs_detach_delegation_locked(nfsi, NULL); spin_unlock(&clp->cl_lock); - if (delegation != NULL) - err = __nfs_inode_return_delegation(inode, delegation); + if (delegation != NULL) { + nfs_msync_inode(inode); + err = __nfs_inode_return_delegation(inode, delegation, 1); + } } return err; } @@ -368,33 +381,47 @@ void nfs_super_return_all_delegations(struct super_block *sb) spin_unlock(&delegation->lock); } rcu_read_unlock(); - nfs_client_return_marked_delegations(clp); + if (nfs_client_return_marked_delegations(clp) != 0) + nfs4_schedule_state_manager(clp); } -static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) +static +void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, fmode_t flags) { struct nfs_delegation *delegation; rcu_read_lock(); list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); - set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); + if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) + continue; + if (delegation->type & flags) + nfs_mark_return_delegation(clp, delegation); } rcu_read_unlock(); } +static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) +{ + nfs_client_mark_return_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); +} + static void nfs_delegation_run_state_manager(struct nfs_client *clp) { if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) nfs4_schedule_state_manager(clp); } -void nfs_expire_all_delegations(struct nfs_client *clp) +void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags) { - nfs_client_mark_return_all_delegations(clp); + nfs_client_mark_return_all_delegation_types(clp, flags); nfs_delegation_run_state_manager(clp); } +void nfs_expire_all_delegations(struct nfs_client *clp) +{ + nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); +} + /* * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. */ @@ -413,8 +440,7 @@ static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *c list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) continue; - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); - set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); + nfs_mark_return_delegation(clp, delegation); } rcu_read_unlock(); } @@ -428,18 +454,21 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp) /* * Asynchronous delegation recall! */ -int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) +int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, + int (*validate_stateid)(struct nfs_delegation *delegation, + const nfs4_stateid *stateid)) { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_delegation *delegation; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, - sizeof(delegation->stateid.data)) != 0) { + + if (!validate_stateid(delegation, stateid)) { rcu_read_unlock(); return -ENOENT; } + nfs_mark_return_delegation(clp, delegation); rcu_read_unlock(); nfs_delegation_run_state_manager(clp); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 09f38379517..944b627ec6e 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -34,15 +34,18 @@ enum { int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); int nfs_inode_return_delegation(struct inode *inode); -int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); +int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, + int (*validate_stateid)(struct nfs_delegation *delegation, + const nfs4_stateid *stateid)); void nfs_inode_return_delegation_noreclaim(struct inode *inode); struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); void nfs_super_return_all_delegations(struct super_block *sb); void nfs_expire_all_delegations(struct nfs_client *clp); +void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags); void nfs_expire_unreferenced_delegations(struct nfs_client *clp); void nfs_handle_cb_pathdown(struct nfs_client *clp); -void nfs_client_return_marked_delegations(struct nfs_client *clp); +int nfs_client_return_marked_delegations(struct nfs_client *clp); void nfs_delegation_mark_reclaim(struct nfs_client *clp); void nfs_delegation_reap_unclaimed(struct nfs_client *clp); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 7cb298525ee..2c5ace4f00a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1579,55 +1579,46 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *dentry = NULL, *rehash = NULL; int error = -EBUSY; - /* - * To prevent any new references to the target during the rename, - * we unhash the dentry and free the inode in advance. - */ - if (!d_unhashed(new_dentry)) { - d_drop(new_dentry); - rehash = new_dentry; - } - dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", old_dentry->d_parent->d_name.name, old_dentry->d_name.name, new_dentry->d_parent->d_name.name, new_dentry->d_name.name, atomic_read(&new_dentry->d_count)); /* - * First check whether the target is busy ... we can't - * safely do _any_ rename if the target is in use. - * - * For files, make a copy of the dentry and then do a - * silly-rename. If the silly-rename succeeds, the - * copied dentry is hashed and becomes the new target. + * For non-directories, check whether the target is busy and if so, + * make a copy of the dentry and then do a silly-rename. If the + * silly-rename succeeds, the copied dentry is hashed and becomes + * the new target. */ - if (!new_inode) - goto go_ahead; - if (S_ISDIR(new_inode->i_mode)) { - error = -EISDIR; - if (!S_ISDIR(old_inode->i_mode)) - goto out; - } else if (atomic_read(&new_dentry->d_count) > 2) { - int err; - /* copy the target dentry's name */ - dentry = d_alloc(new_dentry->d_parent, - &new_dentry->d_name); - if (!dentry) - goto out; + if (new_inode && !S_ISDIR(new_inode->i_mode)) { + /* + * To prevent any new references to the target during the + * rename, we unhash the dentry in advance. + */ + if (!d_unhashed(new_dentry)) { + d_drop(new_dentry); + rehash = new_dentry; + } + + if (atomic_read(&new_dentry->d_count) > 2) { + int err; + + /* copy the target dentry's name */ + dentry = d_alloc(new_dentry->d_parent, + &new_dentry->d_name); + if (!dentry) + goto out; - /* silly-rename the existing target ... */ - err = nfs_sillyrename(new_dir, new_dentry); - if (!err) { - new_dentry = rehash = dentry; + /* silly-rename the existing target ... */ + err = nfs_sillyrename(new_dir, new_dentry); + if (err) + goto out; + + new_dentry = dentry; new_inode = NULL; - /* instantiate the replacement target */ - d_instantiate(new_dentry, NULL); - } else if (atomic_read(&new_dentry->d_count) > 1) - /* dentry still busy? */ - goto out; + } } -go_ahead: /* * ... prune child dentries and writebacks if needed. */ diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index f4d54ba97cc..95e1ca765d4 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -146,7 +146,7 @@ static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd, return 0; } -struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, +static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, struct nfs_dns_ent *key) { struct cache_head *ch; @@ -159,7 +159,7 @@ struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, return container_of(ch, struct nfs_dns_ent, h); } -struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd, +static struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd, struct nfs_dns_ent *new, struct nfs_dns_ent *key) { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f5fdd39e037..6b891328f33 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -581,7 +581,7 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode) { struct nfs_open_context *ctx; - if (IS_SYNC(inode) || (filp->f_flags & O_SYNC)) + if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC)) return 1; ctx = nfs_file_open_context(filp); if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) @@ -622,7 +622,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count); result = generic_file_aio_write(iocb, iov, nr_segs, pos); - /* Return error values for O_SYNC and IS_SYNC() */ + /* Return error values for O_DSYNC and IS_SYNC() */ if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); if (err < 0) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index e21b1bb9972..29e464d23b3 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -30,6 +30,15 @@ static inline int nfs4_has_session(const struct nfs_client *clp) return 0; } +static inline int nfs4_has_persistent_session(const struct nfs_client *clp) +{ +#ifdef CONFIG_NFS_V4_1 + if (nfs4_has_session(clp)) + return (clp->cl_session->flags & SESSION4_PERSIST); +#endif /* CONFIG_NFS_V4_1 */ + return 0; +} + struct nfs_clone_mount { const struct super_block *sb; const struct dentry *dentry; @@ -156,6 +165,7 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr /* callback_xdr.c */ extern struct svc_version nfs4_callback_version1; +extern struct svc_version nfs4_callback_version4; /* pagelist.c */ extern int __init nfs_init_nfspagecache(void); @@ -177,24 +187,14 @@ extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int); extern struct rpc_procinfo nfs3_procedures[]; extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int); -/* nfs4proc.c */ -static inline void nfs4_restart_rpc(struct rpc_task *task, - const struct nfs_client *clp) -{ -#ifdef CONFIG_NFS_V4_1 - if (nfs4_has_session(clp) && - test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) { - rpc_restart_call_prepare(task); - return; - } -#endif /* CONFIG_NFS_V4_1 */ - rpc_restart_call(task); -} - /* nfs4xdr.c */ #ifdef CONFIG_NFS_V4 extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); #endif +#ifdef CONFIG_NFS_V4_1 +extern const u32 nfs41_maxread_overhead; +extern const u32 nfs41_maxwrite_overhead; +#endif /* nfs4proc.c */ #ifdef CONFIG_NFS_V4 @@ -273,20 +273,6 @@ extern int _nfs4_call_sync_session(struct nfs_server *server, struct nfs4_sequence_res *res, int cache_reply); -#ifdef CONFIG_NFS_V4_1 -extern void nfs41_sequence_free_slot(const struct nfs_client *, - struct nfs4_sequence_res *res); -#endif /* CONFIG_NFS_V4_1 */ - -static inline void nfs4_sequence_free_slot(const struct nfs_client *clp, - struct nfs4_sequence_res *res) -{ -#ifdef CONFIG_NFS_V4_1 - if (nfs4_has_session(clp)) - nfs41_sequence_free_slot(clp, res); -#endif /* CONFIG_NFS_V4_1 */ -} - /* * Determine the device name as a string */ @@ -380,3 +366,15 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) return ((unsigned long)len + (unsigned long)base + PAGE_SIZE - 1) >> PAGE_SHIFT; } + +/* + * Helper for restarting RPC calls in the possible presence of NFSv4.1 + * sessions. + */ +static inline void nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) +{ + if (nfs4_has_session(clp)) + rpc_restart_call_prepare(task); + else + rpc_restart_call(task); +} diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index ceda50aad73..46d779abafd 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -25,13 +25,7 @@ struct nfs_iostats { static inline void nfs_inc_server_stats(const struct nfs_server *server, enum nfs_stat_eventcounters stat) { - struct nfs_iostats *iostats; - int cpu; - - cpu = get_cpu(); - iostats = per_cpu_ptr(server->io_stats, cpu); - iostats->events[stat]++; - put_cpu(); + this_cpu_inc(server->io_stats->events[stat]); } static inline void nfs_inc_stats(const struct inode *inode, @@ -44,13 +38,7 @@ static inline void nfs_add_server_stats(const struct nfs_server *server, enum nfs_stat_bytecounters stat, unsigned long addend) { - struct nfs_iostats *iostats; - int cpu; - - cpu = get_cpu(); - iostats = per_cpu_ptr(server->io_stats, cpu); - iostats->bytes[stat] += addend; - put_cpu(); + this_cpu_add(server->io_stats->bytes[stat], addend); } static inline void nfs_add_stats(const struct inode *inode, @@ -65,13 +53,7 @@ static inline void nfs_add_fscache_stats(struct inode *inode, enum nfs_stat_fscachecounters stat, unsigned long addend) { - struct nfs_iostats *iostats; - int cpu; - - cpu = get_cpu(); - iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu); - iostats->fscache[stat] += addend; - put_cpu(); + this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); } #endif diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 6ea07a3c75d..7e57b04e401 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -44,7 +44,8 @@ enum nfs4_client_state { NFS4CLNT_RECLAIM_REBOOT, NFS4CLNT_RECLAIM_NOGRACE, NFS4CLNT_DELEGRETURN, - NFS4CLNT_SESSION_SETUP, + NFS4CLNT_SESSION_RESET, + NFS4CLNT_SESSION_DRAINING, }; /* @@ -180,6 +181,7 @@ struct nfs4_state_recovery_ops { int (*recover_lock)(struct nfs4_state *, struct file_lock *); int (*establish_clid)(struct nfs_client *, struct rpc_cred *); struct rpc_cred * (*get_clid_cred)(struct nfs_client *); + int (*reclaim_complete)(struct nfs_client *); }; struct nfs4_state_maintenance_ops { @@ -200,9 +202,11 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t); /* nfs4proc.c */ extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *); extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); +extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); +extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait); extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); @@ -218,9 +222,11 @@ extern int nfs4_setup_sequence(struct nfs_client *clp, int cache_reply, struct rpc_task *task); extern void nfs4_destroy_session(struct nfs4_session *session); extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); -extern int nfs4_proc_create_session(struct nfs_client *, int reset); +extern int nfs4_proc_create_session(struct nfs_client *); extern int nfs4_proc_destroy_session(struct nfs4_session *); extern int nfs4_init_session(struct nfs_server *server); +extern int nfs4_proc_get_lease_time(struct nfs_client *clp, + struct nfs_fsinfo *fsinfo); #else /* CONFIG_NFS_v4_1 */ static inline int nfs4_setup_sequence(struct nfs_client *clp, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -267,6 +273,7 @@ extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); extern void nfs4_schedule_state_recovery(struct nfs_client *); extern void nfs4_schedule_state_manager(struct nfs_client *); extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); +extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); @@ -287,6 +294,7 @@ struct nfs4_mount_data; /* callback_xdr.c */ extern struct svc_version nfs4_callback_version1; +extern struct svc_version nfs4_callback_version4; #else diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 741a562177f..9f5f11ecfd9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -270,11 +270,18 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, case -NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR: %d Reset session\n", __func__, errorcode); - set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); + nfs4_schedule_state_recovery(clp); exception->retry = 1; - /* FALLTHROUGH */ + break; #endif /* !defined(CONFIG_NFS_V4_1) */ case -NFS4ERR_FILE_OPEN: + if (exception->timeout > HZ) { + /* We have retried a decent amount, time to + * fail + */ + ret = -EBUSY; + break; + } case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: ret = nfs4_delay(server->client, &exception->timeout); @@ -311,48 +318,54 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp * so we need to scan down from highest_used_slotid to 0 looking for the now * highest slotid in use. * If none found, highest_used_slotid is set to -1. + * + * Must be called while holding tbl->slot_tbl_lock */ static void nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid) { int slotid = free_slotid; - spin_lock(&tbl->slot_tbl_lock); /* clear used bit in bitmap */ __clear_bit(slotid, tbl->used_slots); /* update highest_used_slotid when it is freed */ if (slotid == tbl->highest_used_slotid) { slotid = find_last_bit(tbl->used_slots, tbl->max_slots); - if (slotid >= 0 && slotid < tbl->max_slots) + if (slotid < tbl->max_slots) tbl->highest_used_slotid = slotid; else tbl->highest_used_slotid = -1; } - rpc_wake_up_next(&tbl->slot_tbl_waitq); - spin_unlock(&tbl->slot_tbl_lock); dprintk("%s: free_slotid %u highest_used_slotid %d\n", __func__, free_slotid, tbl->highest_used_slotid); } -void nfs41_sequence_free_slot(const struct nfs_client *clp, +static void nfs41_sequence_free_slot(const struct nfs_client *clp, struct nfs4_sequence_res *res) { struct nfs4_slot_table *tbl; - if (!nfs4_has_session(clp)) { - dprintk("%s: No session\n", __func__); - return; - } tbl = &clp->cl_session->fc_slot_table; if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { - dprintk("%s: No slot\n", __func__); /* just wake up the next guy waiting since * we may have not consumed a slot after all */ - rpc_wake_up_next(&tbl->slot_tbl_waitq); + dprintk("%s: No slot\n", __func__); return; } + + spin_lock(&tbl->slot_tbl_lock); nfs4_free_slot(tbl, res->sr_slotid); + + /* Signal state manager thread if session is drained */ + if (test_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) { + if (tbl->highest_used_slotid == -1) { + dprintk("%s COMPLETE: Session Drained\n", __func__); + complete(&clp->cl_session->complete); + } + } else + rpc_wake_up_next(&tbl->slot_tbl_waitq); + spin_unlock(&tbl->slot_tbl_lock); res->sr_slotid = NFS4_MAX_SLOT_TABLE; } @@ -377,10 +390,10 @@ static void nfs41_sequence_done(struct nfs_client *clp, if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) goto out; - tbl = &clp->cl_session->fc_slot_table; - slot = tbl->slots + res->sr_slotid; - + /* Check the SEQUENCE operation status */ if (res->sr_status == 0) { + tbl = &clp->cl_session->fc_slot_table; + slot = tbl->slots + res->sr_slotid; /* Update the slot's sequence and clientid lease timer */ ++slot->seq_nr; timestamp = res->sr_renewal_time; @@ -388,7 +401,8 @@ static void nfs41_sequence_done(struct nfs_client *clp, if (time_before(clp->cl_last_renewal, timestamp)) clp->cl_last_renewal = timestamp; spin_unlock(&clp->cl_lock); - return; + /* Check sequence flags */ + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); } out: /* The session may be reset by one of the error handlers. */ @@ -429,24 +443,6 @@ out: return ret_id; } -static int nfs4_recover_session(struct nfs4_session *session) -{ - struct nfs_client *clp = session->clp; - unsigned int loop; - int ret; - - for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { - ret = nfs4_wait_clnt_recover(clp); - if (ret != 0) - break; - if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) - break; - nfs4_schedule_state_manager(clp); - ret = -EIO; - } - return ret; -} - static int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -455,7 +451,6 @@ static int nfs41_setup_sequence(struct nfs4_session *session, { struct nfs4_slot *slot; struct nfs4_slot_table *tbl; - int status = 0; u8 slotid; dprintk("--> %s\n", __func__); @@ -468,21 +463,15 @@ static int nfs41_setup_sequence(struct nfs4_session *session, tbl = &session->fc_slot_table; spin_lock(&tbl->slot_tbl_lock); - if (test_bit(NFS4CLNT_SESSION_SETUP, &session->clp->cl_state)) { - if (tbl->highest_used_slotid != -1) { - rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); - spin_unlock(&tbl->slot_tbl_lock); - dprintk("<-- %s: Session reset: draining\n", __func__); - return -EAGAIN; - } - - /* The slot table is empty; start the reset thread */ - dprintk("%s Session Reset\n", __func__); + if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state)) { + /* + * The state manager will wait until the slot table is empty. + * Schedule the reset thread + */ + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); spin_unlock(&tbl->slot_tbl_lock); - status = nfs4_recover_session(session); - if (status) - return status; - spin_lock(&tbl->slot_tbl_lock); + dprintk("%s Schedule Session Reset\n", __func__); + return -EAGAIN; } slotid = nfs4_find_slot(tbl, task); @@ -527,7 +516,7 @@ int nfs4_setup_sequence(struct nfs_client *clp, goto out; ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply, task); - if (ret != -EAGAIN) { + if (ret && ret != -EAGAIN) { /* terminate rpc task */ task->tk_status = ret; task->tk_action = NULL; @@ -561,7 +550,6 @@ static void nfs41_call_sync_done(struct rpc_task *task, void *calldata) struct nfs41_call_sync_data *data = calldata; nfs41_sequence_done(data->clp, data->seq_res, task->tk_status); - nfs41_sequence_free_slot(data->clp, data->seq_res); } struct rpc_call_ops nfs41_call_sync_ops = { @@ -637,15 +625,6 @@ static void nfs4_sequence_done(const struct nfs_server *server, #endif /* CONFIG_NFS_V4_1 */ } -/* no restart, therefore free slot here */ -static void nfs4_sequence_done_free_slot(const struct nfs_server *server, - struct nfs4_sequence_res *res, - int rpc_status) -{ - nfs4_sequence_done(server, res, rpc_status); - nfs4_sequence_free_slot(server->nfs_client, res); -} - static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) { struct nfs_inode *nfsi = NFS_I(dir); @@ -720,9 +699,15 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, p->o_arg.bitmask = server->attr_bitmask; p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; if (flags & O_EXCL) { - u32 *s = (u32 *) p->o_arg.u.verifier.data; - s[0] = jiffies; - s[1] = current->pid; + if (nfs4_has_persistent_session(server->nfs_client)) { + /* GUARDED */ + p->o_arg.u.attrs = &p->attrs; + memcpy(&p->attrs, attrs, sizeof(p->attrs)); + } else { /* EXCLUSIVE4_1 */ + u32 *s = (u32 *) p->o_arg.u.verifier.data; + s[0] = jiffies; + s[1] = current->pid; + } } else if (flags & O_CREAT) { p->o_arg.u.attrs = &p->attrs; memcpy(&p->attrs, attrs, sizeof(p->attrs)); @@ -776,13 +761,16 @@ static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode goto out; switch (mode & (FMODE_READ|FMODE_WRITE)) { case FMODE_READ: - ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0; + ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 + && state->n_rdonly != 0; break; case FMODE_WRITE: - ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0; + ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 + && state->n_wronly != 0; break; case FMODE_READ|FMODE_WRITE: - ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0; + ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 + && state->n_rdwr != 0; } out: return ret; @@ -1183,6 +1171,14 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state case -ENOENT: case -ESTALE: goto out; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + nfs4_schedule_state_recovery( + server->nfs_client); + goto out; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: @@ -1336,8 +1332,8 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata) data->rpc_status = task->tk_status; - nfs4_sequence_done_free_slot(data->o_arg.server, &data->o_res.seq_res, - task->tk_status); + nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res, + task->tk_status); if (RPC_ASSASSINATED(task)) return; @@ -1488,7 +1484,7 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s return ret; } -static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state) +static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_exception exception = { }; @@ -1496,10 +1492,16 @@ static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4 do { err = _nfs4_open_expired(ctx, state); - if (err != -NFS4ERR_DELAY) - break; - nfs4_handle_exception(server, err, &exception); + switch (err) { + default: + goto out; + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + nfs4_handle_exception(server, err, &exception); + err = 0; + } } while (exception.retry); +out: return err; } @@ -1712,6 +1714,18 @@ static void nfs4_free_closedata(void *data) kfree(calldata); } +static void nfs4_close_clear_stateid_flags(struct nfs4_state *state, + fmode_t fmode) +{ + spin_lock(&state->owner->so_lock); + if (!(fmode & FMODE_READ)) + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + if (!(fmode & FMODE_WRITE)) + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDWR_STATE, &state->flags); + spin_unlock(&state->owner->so_lock); +} + static void nfs4_close_done(struct rpc_task *task, void *data) { struct nfs4_closedata *calldata = data; @@ -1728,6 +1742,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data) case 0: nfs_set_open_stateid(state, &calldata->res.stateid, 0); renew_lease(server, calldata->timestamp); + nfs4_close_clear_stateid_flags(state, + calldata->arg.fmode); break; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_OLD_STATEID: @@ -1737,11 +1753,10 @@ static void nfs4_close_done(struct rpc_task *task, void *data) break; default: if (nfs4_async_handle_error(task, server, state) == -EAGAIN) { - nfs4_restart_rpc(task, server->nfs_client); + nfs_restart_rpc(task, server->nfs_client); return; } } - nfs4_sequence_free_slot(server->nfs_client, &calldata->res.seq_res); nfs_refresh_inode(calldata->inode, calldata->res.fattr); } @@ -1749,38 +1764,39 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) { struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; - int clear_rd, clear_wr, clear_rdwr; + int call_close = 0; if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) return; - clear_rd = clear_wr = clear_rdwr = 0; + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; + calldata->arg.fmode = FMODE_READ|FMODE_WRITE; spin_lock(&state->owner->so_lock); /* Calculate the change in open mode */ if (state->n_rdwr == 0) { if (state->n_rdonly == 0) { - clear_rd |= test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags); - clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + calldata->arg.fmode &= ~FMODE_READ; } if (state->n_wronly == 0) { - clear_wr |= test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags); - clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags); + call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + calldata->arg.fmode &= ~FMODE_WRITE; } } spin_unlock(&state->owner->so_lock); - if (!clear_rd && !clear_wr && !clear_rdwr) { + + if (!call_close) { /* Note: exit _without_ calling nfs4_close_done */ task->tk_action = NULL; return; } + + if (calldata->arg.fmode == 0) + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; + nfs_fattr_init(calldata->res.fattr); - if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) { - task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; - calldata->arg.fmode = FMODE_READ; - } else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) { - task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; - calldata->arg.fmode = FMODE_WRITE; - } calldata->timestamp = jiffies; if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client, &calldata->arg.seq_args, &calldata->res.seq_res, @@ -1981,7 +1997,7 @@ out_drop: return 0; } -void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) +static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) { if (ctx->state == NULL) return; @@ -2532,7 +2548,6 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) nfs4_sequence_done(res->server, &res->seq_res, task->tk_status); if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) return 0; - nfs4_sequence_free_slot(res->server->nfs_client, &res->seq_res); update_changeattr(dir, &res->cinfo); nfs_post_op_update_inode(dir, &res->dir_attr); return 1; @@ -2971,11 +2986,10 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) dprintk("--> %s\n", __func__); - /* nfs4_sequence_free_slot called in the read rpc_call_done */ nfs4_sequence_done(server, &data->res.seq_res, task->tk_status); if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { - nfs4_restart_rpc(task, server->nfs_client); + nfs_restart_rpc(task, server->nfs_client); return -EAGAIN; } @@ -2995,12 +3009,11 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->inode; - /* slot is freed in nfs_writeback_done */ nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, task->tk_status); if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { - nfs4_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); return -EAGAIN; } if (task->tk_status >= 0) { @@ -3028,11 +3041,9 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, task->tk_status); if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { - nfs4_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); return -EAGAIN; } - nfs4_sequence_free_slot(NFS_SERVER(inode)->nfs_client, - &data->res.seq_res); nfs_refresh_inode(inode, data->res.fattr); return 0; } @@ -3350,7 +3361,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR %d, Reset session\n", __func__, task->tk_status); - set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); + nfs4_schedule_state_recovery(clp); task->tk_status = 0; return -EAGAIN; #endif /* CONFIG_NFS_V4_1 */ @@ -3483,12 +3494,23 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_delegreturndata *data = calldata; - nfs4_sequence_done_free_slot(data->res.server, &data->res.seq_res, - task->tk_status); + nfs4_sequence_done(data->res.server, &data->res.seq_res, + task->tk_status); - data->rpc_status = task->tk_status; - if (data->rpc_status == 0) + switch (task->tk_status) { + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + case 0: renew_lease(data->res.server, data->timestamp); + break; + default: + if (nfs4_async_handle_error(task, data->res.server, NULL) == + -EAGAIN) { + nfs_restart_rpc(task, data->res.server->nfs_client); + return; + } + } + data->rpc_status = task->tk_status; } static void nfs4_delegreturn_release(void *calldata) @@ -3741,11 +3763,9 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) break; default: if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) - nfs4_restart_rpc(task, - calldata->server->nfs_client); + nfs_restart_rpc(task, + calldata->server->nfs_client); } - nfs4_sequence_free_slot(calldata->server->nfs_client, - &calldata->res.seq_res); } static void nfs4_locku_prepare(struct rpc_task *task, void *data) @@ -3927,8 +3947,8 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) dprintk("%s: begin!\n", __func__); - nfs4_sequence_done_free_slot(data->server, &data->res.seq_res, - task->tk_status); + nfs4_sequence_done(data->server, &data->res.seq_res, + task->tk_status); data->rpc_status = task->tk_status; if (RPC_ASSASSINATED(task)) @@ -4049,10 +4069,16 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; err = _nfs4_do_setlk(state, F_SETLK, request, 0); - if (err != -NFS4ERR_DELAY) - break; - nfs4_handle_exception(server, err, &exception); + switch (err) { + default: + goto out; + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + nfs4_handle_exception(server, err, &exception); + err = 0; + } } while (exception.retry); +out: return err; } @@ -4172,6 +4198,11 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) case -NFS4ERR_EXPIRED: case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: nfs4_schedule_state_recovery(server->nfs_client); goto out; case -ERESTARTSYS: @@ -4296,7 +4327,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, * NFS4ERR_BADSESSION in the sequence operation, and will therefore * be in some phase of session reset. */ -static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) +int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) { nfs4_verifier verifier; struct nfs41_exchange_id_args args = { @@ -4318,6 +4349,9 @@ static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) dprintk("--> %s\n", __func__); BUG_ON(clp == NULL); + /* Remove server-only flags */ + args.flags &= ~EXCHGID4_FLAG_CONFIRMED_R; + p = (u32 *)verifier.data; *p++ = htonl((u32)clp->cl_boot_time.tv_sec); *p = htonl((u32)clp->cl_boot_time.tv_nsec); @@ -4389,10 +4423,9 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); rpc_delay(task, NFS4_POLL_RETRY_MIN); task->tk_status = 0; - nfs4_restart_rpc(task, data->clp); + nfs_restart_rpc(task, data->clp); return; } - nfs41_sequence_free_slot(data->clp, &data->res->lr_seq_res); dprintk("<-- %s\n", __func__); } @@ -4465,7 +4498,6 @@ static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, int max_slots, spin_lock(&tbl->slot_tbl_lock); for (i = 0; i < max_slots; ++i) tbl->slots[i].seq_nr = ivalue; - tbl->highest_used_slotid = -1; spin_unlock(&tbl->slot_tbl_lock); dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, tbl, tbl->slots, tbl->max_slots); @@ -4515,7 +4547,6 @@ static void nfs4_destroy_slot_tables(struct nfs4_session *session) static int nfs4_init_slot_table(struct nfs4_slot_table *tbl, int max_slots, int ivalue) { - int i; struct nfs4_slot *slot; int ret = -ENOMEM; @@ -4526,18 +4557,9 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl, slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL); if (!slot) goto out; - for (i = 0; i < max_slots; ++i) - slot[i].seq_nr = ivalue; ret = 0; spin_lock(&tbl->slot_tbl_lock); - if (tbl->slots != NULL) { - spin_unlock(&tbl->slot_tbl_lock); - dprintk("%s: slot table already initialized. tbl=%p slots=%p\n", - __func__, tbl, tbl->slots); - WARN_ON(1); - goto out_free; - } tbl->max_slots = max_slots; tbl->slots = slot; tbl->highest_used_slotid = -1; /* no slot is currently used */ @@ -4547,10 +4569,6 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl, out: dprintk("<-- %s: return %d\n", __func__, ret); return ret; - -out_free: - kfree(slot); - goto out; } /* @@ -4558,17 +4576,24 @@ out_free: */ static int nfs4_init_slot_tables(struct nfs4_session *session) { - int status; + struct nfs4_slot_table *tbl; + int status = 0; - status = nfs4_init_slot_table(&session->fc_slot_table, - session->fc_attrs.max_reqs, 1); - if (status) - return status; + tbl = &session->fc_slot_table; + if (tbl->slots == NULL) { + status = nfs4_init_slot_table(tbl, + session->fc_attrs.max_reqs, 1); + if (status) + return status; + } - status = nfs4_init_slot_table(&session->bc_slot_table, - session->bc_attrs.max_reqs, 0); - if (status) - nfs4_destroy_slot_tables(session); + tbl = &session->bc_slot_table; + if (tbl->slots == NULL) { + status = nfs4_init_slot_table(tbl, + session->bc_attrs.max_reqs, 0); + if (status) + nfs4_destroy_slot_tables(session); + } return status; } @@ -4582,7 +4607,6 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) if (!session) return NULL; - set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); /* * The create session reply races with the server back * channel probe. Mark the client NFS_CS_SESSION_INITING @@ -4590,12 +4614,15 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) * nfs_client struct */ clp->cl_cons_state = NFS_CS_SESSION_INITING; + init_completion(&session->complete); tbl = &session->fc_slot_table; + tbl->highest_used_slotid = -1; spin_lock_init(&tbl->slot_tbl_lock); rpc_init_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); tbl = &session->bc_slot_table; + tbl->highest_used_slotid = -1; spin_lock_init(&tbl->slot_tbl_lock); rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); @@ -4747,11 +4774,10 @@ static int _nfs4_proc_create_session(struct nfs_client *clp) * It is the responsibility of the caller to verify the session is * expired before calling this routine. */ -int nfs4_proc_create_session(struct nfs_client *clp, int reset) +int nfs4_proc_create_session(struct nfs_client *clp) { int status; unsigned *ptr; - struct nfs_fsinfo fsinfo; struct nfs4_session *session = clp->cl_session; dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); @@ -4760,35 +4786,19 @@ int nfs4_proc_create_session(struct nfs_client *clp, int reset) if (status) goto out; - /* Init or reset the fore channel */ - if (reset) - status = nfs4_reset_slot_tables(session); - else - status = nfs4_init_slot_tables(session); - dprintk("fore channel slot table initialization returned %d\n", status); + /* Init and reset the fore channel */ + status = nfs4_init_slot_tables(session); + dprintk("slot table initialization returned %d\n", status); + if (status) + goto out; + status = nfs4_reset_slot_tables(session); + dprintk("slot table reset returned %d\n", status); if (status) goto out; ptr = (unsigned *)&session->sess_id.data[0]; dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__, clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]); - - if (reset) - /* Lease time is aleady set */ - goto out; - - /* Get the lease time */ - status = nfs4_proc_get_lease_time(clp, &fsinfo); - if (status == 0) { - /* Update lease time and schedule renewal */ - spin_lock(&clp->cl_lock); - clp->cl_lease_time = fsinfo.lease_time * HZ; - clp->cl_last_renewal = jiffies; - clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - spin_unlock(&clp->cl_lock); - - nfs4_schedule_state_renewal(clp); - } out: dprintk("<-- %s\n", __func__); return status; @@ -4827,13 +4837,16 @@ int nfs4_proc_destroy_session(struct nfs4_session *session) int nfs4_init_session(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; + struct nfs4_session *session; int ret; if (!nfs4_has_session(clp)) return 0; - clp->cl_session->fc_attrs.max_rqst_sz = server->wsize; - clp->cl_session->fc_attrs.max_resp_sz = server->rsize; + session = clp->cl_session; + session->fc_attrs.max_rqst_sz = server->wsize + nfs41_maxwrite_overhead; + session->fc_attrs.max_resp_sz = server->rsize + nfs41_maxread_overhead; + ret = nfs4_recover_expired_lease(server); if (!ret) ret = nfs4_check_client_ready(clp); @@ -4872,11 +4885,10 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data) if (_nfs4_async_handle_error(task, NULL, clp, NULL) == -EAGAIN) { - nfs4_restart_rpc(task, clp); + nfs_restart_rpc(task, clp); return; } } - nfs41_sequence_free_slot(clp, task->tk_msg.rpc_resp); dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); kfree(task->tk_msg.rpc_argp); @@ -4931,6 +4943,109 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, &nfs41_sequence_ops, (void *)clp); } +struct nfs4_reclaim_complete_data { + struct nfs_client *clp; + struct nfs41_reclaim_complete_args arg; + struct nfs41_reclaim_complete_res res; +}; + +static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_reclaim_complete_data *calldata = data; + + if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args, + &calldata->res.seq_res, 0, task)) + return; + + rpc_call_start(task); +} + +static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) +{ + struct nfs4_reclaim_complete_data *calldata = data; + struct nfs_client *clp = calldata->clp; + struct nfs4_sequence_res *res = &calldata->res.seq_res; + + dprintk("--> %s\n", __func__); + nfs41_sequence_done(clp, res, task->tk_status); + switch (task->tk_status) { + case 0: + case -NFS4ERR_COMPLETE_ALREADY: + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + /* + * Handle the session error, but do not retry the operation, as + * we have no way of telling whether the clientid had to be + * reset before we got our reply. If reset, a new wave of + * reclaim operations will follow, containing their own reclaim + * complete. We don't want our retry to get on the way of + * recovery by incorrectly indicating to the server that we're + * done reclaiming state since the process had to be restarted. + */ + _nfs4_async_handle_error(task, NULL, clp, NULL); + break; + default: + if (_nfs4_async_handle_error( + task, NULL, clp, NULL) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + } + + dprintk("<-- %s\n", __func__); +} + +static void nfs4_free_reclaim_complete_data(void *data) +{ + struct nfs4_reclaim_complete_data *calldata = data; + + kfree(calldata); +} + +static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = { + .rpc_call_prepare = nfs4_reclaim_complete_prepare, + .rpc_call_done = nfs4_reclaim_complete_done, + .rpc_release = nfs4_free_reclaim_complete_data, +}; + +/* + * Issue a global reclaim complete. + */ +static int nfs41_proc_reclaim_complete(struct nfs_client *clp) +{ + struct nfs4_reclaim_complete_data *calldata; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs4_reclaim_complete_call_ops, + .flags = RPC_TASK_ASYNC, + }; + int status = -ENOMEM; + + dprintk("--> %s\n", __func__); + calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); + if (calldata == NULL) + goto out; + calldata->clp = clp; + calldata->arg.one_fs = 0; + calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; + + msg.rpc_argp = &calldata->arg; + msg.rpc_resp = &calldata->res; + task_setup_data.callback_data = calldata; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + status = PTR_ERR(task); + rpc_put_task(task); +out: + dprintk("<-- %s status=%d\n", __func__, status); + return status; +} #endif /* CONFIG_NFS_V4_1 */ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { @@ -4948,8 +5063,9 @@ struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = { .state_flag_bit = NFS_STATE_RECLAIM_REBOOT, .recover_open = nfs4_open_reclaim, .recover_lock = nfs4_lock_reclaim, - .establish_clid = nfs4_proc_exchange_id, + .establish_clid = nfs41_init_clientid, .get_clid_cred = nfs4_get_exchange_id_cred, + .reclaim_complete = nfs41_proc_reclaim_complete, }; #endif /* CONFIG_NFS_V4_1 */ @@ -4968,7 +5084,7 @@ struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = { .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, .recover_open = nfs4_open_expired, .recover_lock = nfs4_lock_expired, - .establish_clid = nfs4_proc_exchange_id, + .establish_clid = nfs41_init_clientid, .get_clid_cred = nfs4_get_exchange_id_cred, }; #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2ef4fecf398..e76427e6346 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -116,6 +116,68 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) #if defined(CONFIG_NFS_V4_1) +static int nfs41_setup_state_renewal(struct nfs_client *clp) +{ + int status; + struct nfs_fsinfo fsinfo; + + status = nfs4_proc_get_lease_time(clp, &fsinfo); + if (status == 0) { + /* Update lease time and schedule renewal */ + spin_lock(&clp->cl_lock); + clp->cl_lease_time = fsinfo.lease_time * HZ; + clp->cl_last_renewal = jiffies; + spin_unlock(&clp->cl_lock); + + nfs4_schedule_state_renewal(clp); + } + + return status; +} + +static void nfs41_end_drain_session(struct nfs_client *clp, + struct nfs4_session *ses) +{ + if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) + rpc_wake_up(&ses->fc_slot_table.slot_tbl_waitq); +} + +static int nfs41_begin_drain_session(struct nfs_client *clp, + struct nfs4_session *ses) +{ + struct nfs4_slot_table *tbl = &ses->fc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state); + if (tbl->highest_used_slotid != -1) { + INIT_COMPLETION(ses->complete); + spin_unlock(&tbl->slot_tbl_lock); + return wait_for_completion_interruptible(&ses->complete); + } + spin_unlock(&tbl->slot_tbl_lock); + return 0; +} + +int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) +{ + int status; + + status = nfs41_begin_drain_session(clp, clp->cl_session); + if (status != 0) + goto out; + status = nfs4_proc_exchange_id(clp, cred); + if (status != 0) + goto out; + status = nfs4_proc_create_session(clp); + if (status != 0) + goto out; + nfs41_end_drain_session(clp, clp->cl_session); + nfs41_setup_state_renewal(clp); + nfs_mark_client_ready(clp, NFS_CS_READY); +out: + return status; +} + struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp) { struct rpc_cred *cred; @@ -877,6 +939,10 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ case -NFS4ERR_EXPIRED: case -NFS4ERR_NO_GRACE: case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: goto out; default: printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", @@ -959,6 +1025,10 @@ restart: case -NFS4ERR_NO_GRACE: nfs4_state_mark_reclaim_nograce(sp->so_client, state); case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: goto out_err; } nfs4_put_open_state(state); @@ -1011,6 +1081,14 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot); } +static void nfs4_reclaim_complete(struct nfs_client *clp, + const struct nfs4_state_recovery_ops *ops) +{ + /* Notify the server we're done reclaiming our state */ + if (ops->reclaim_complete) + (void)ops->reclaim_complete(clp); +} + static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) { struct nfs4_state_owner *sp; @@ -1020,6 +1098,9 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) return; + nfs4_reclaim_complete(clp, + nfs4_reboot_recovery_ops[clp->cl_minorversion]); + for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); spin_lock(&sp->so_lock); @@ -1046,25 +1127,25 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); } -static void nfs4_state_end_reclaim_nograce(struct nfs_client *clp) -{ - clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); -} - -static void nfs4_recovery_handle_error(struct nfs_client *clp, int error) +static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) { switch (error) { case -NFS4ERR_CB_PATH_DOWN: nfs_handle_cb_pathdown(clp); - break; + return 0; + case -NFS4ERR_NO_GRACE: + nfs4_state_end_reclaim_reboot(clp); + return 0; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_LEASE_MOVED: set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_end_reclaim_reboot(clp); nfs4_state_start_reclaim_reboot(clp); break; case -NFS4ERR_EXPIRED: set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); nfs4_state_start_reclaim_nograce(clp); + break; case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: case -NFS4ERR_BAD_HIGH_SLOT: @@ -1072,8 +1153,11 @@ static void nfs4_recovery_handle_error(struct nfs_client *clp, int error) case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_SEQ_FALSE_RETRY: case -NFS4ERR_SEQ_MISORDERED: - set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + /* Zero session reset errors */ + return 0; } + return error; } static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) @@ -1093,8 +1177,7 @@ restart: if (status < 0) { set_bit(ops->owner_flag_bit, &sp->so_flags); nfs4_put_state_owner(sp); - nfs4_recovery_handle_error(clp, status); - return status; + return nfs4_recovery_handle_error(clp, status); } nfs4_put_state_owner(sp); goto restart; @@ -1124,8 +1207,7 @@ static int nfs4_check_lease(struct nfs_client *clp) status = ops->renew_lease(clp, cred); put_rpccred(cred); out: - nfs4_recovery_handle_error(clp, status); - return status; + return nfs4_recovery_handle_error(clp, status); } static int nfs4_reclaim_lease(struct nfs_client *clp) @@ -1151,55 +1233,65 @@ static int nfs4_reclaim_lease(struct nfs_client *clp) } #ifdef CONFIG_NFS_V4_1 -static void nfs4_session_recovery_handle_error(struct nfs_client *clp, int err) +void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) { - switch (err) { - case -NFS4ERR_STALE_CLIENTID: + if (!flags) + return; + else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) { set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); - } + nfs4_state_start_reclaim_reboot(clp); + nfs4_schedule_state_recovery(clp); + } else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | + SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | + SEQ4_STATUS_ADMIN_STATE_REVOKED | + SEQ4_STATUS_RECALLABLE_STATE_REVOKED | + SEQ4_STATUS_LEASE_MOVED)) { + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_nograce(clp); + nfs4_schedule_state_recovery(clp); + } else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | + SEQ4_STATUS_BACKCHANNEL_FAULT | + SEQ4_STATUS_CB_PATH_DOWN_SESSION)) + nfs_expire_all_delegations(clp); } static int nfs4_reset_session(struct nfs_client *clp) { + struct nfs4_session *ses = clp->cl_session; int status; + status = nfs41_begin_drain_session(clp, ses); + if (status != 0) + return status; + status = nfs4_proc_destroy_session(clp->cl_session); if (status && status != -NFS4ERR_BADSESSION && status != -NFS4ERR_DEADSESSION) { - nfs4_session_recovery_handle_error(clp, status); + status = nfs4_recovery_handle_error(clp, status); goto out; } memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN); - status = nfs4_proc_create_session(clp, 1); + status = nfs4_proc_create_session(clp); if (status) - nfs4_session_recovery_handle_error(clp, status); - /* fall through*/ -out: - /* Wake up the next rpc task even on error */ - rpc_wake_up_next(&clp->cl_session->fc_slot_table.slot_tbl_waitq); - return status; -} - -static int nfs4_initialize_session(struct nfs_client *clp) -{ - int status; + status = nfs4_recovery_handle_error(clp, status); - status = nfs4_proc_create_session(clp, 0); - if (!status) { - nfs_mark_client_ready(clp, NFS_CS_READY); - } else if (status == -NFS4ERR_STALE_CLIENTID) { - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); - } else { - nfs_mark_client_ready(clp, status); +out: + /* + * Let the state manager reestablish state + * without waking other tasks yet. + */ + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) { + /* Wake up the next rpc task */ + nfs41_end_drain_session(clp, ses); + if (status == 0) + nfs41_setup_state_renewal(clp); } return status; } + #else /* CONFIG_NFS_V4_1 */ static int nfs4_reset_session(struct nfs_client *clp) { return 0; } -static int nfs4_initialize_session(struct nfs_client *clp) { return 0; } #endif /* CONFIG_NFS_V4_1 */ /* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors @@ -1234,7 +1326,8 @@ static void nfs4_state_manager(struct nfs_client *clp) status = nfs4_reclaim_lease(clp); if (status) { nfs4_set_lease_expired(clp, status); - if (status == -EAGAIN) + if (test_bit(NFS4CLNT_LEASE_EXPIRED, + &clp->cl_state)) continue; if (clp->cl_cons_state == NFS_CS_SESSION_INITING) @@ -1242,55 +1335,51 @@ static void nfs4_state_manager(struct nfs_client *clp) goto out_error; } clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); } if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { status = nfs4_check_lease(clp); - if (status != 0) + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) continue; + if (status < 0 && status != -NFS4ERR_CB_PATH_DOWN) + goto out_error; } + /* Initialize or reset the session */ - if (test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state) + if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) && nfs4_has_session(clp)) { - if (clp->cl_cons_state == NFS_CS_SESSION_INITING) - status = nfs4_initialize_session(clp); - else - status = nfs4_reset_session(clp); - if (status) { - if (status == -NFS4ERR_STALE_CLIENTID) - continue; + status = nfs4_reset_session(clp); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + continue; + if (status < 0) goto out_error; - } } + /* First recover reboot state... */ - if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { + if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { status = nfs4_do_reclaim(clp, nfs4_reboot_recovery_ops[clp->cl_minorversion]); - if (status == -NFS4ERR_STALE_CLIENTID) - continue; - if (test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) continue; nfs4_state_end_reclaim_reboot(clp); - continue; + if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) + continue; + if (status < 0) + goto out_error; } /* Now recover expired state... */ if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { status = nfs4_do_reclaim(clp, nfs4_nograce_recovery_ops[clp->cl_minorversion]); - if (status < 0) { - set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); - if (status == -NFS4ERR_STALE_CLIENTID) - continue; - if (status == -NFS4ERR_EXPIRED) - continue; - if (test_bit(NFS4CLNT_SESSION_SETUP, - &clp->cl_state)) - continue; + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || + test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + continue; + if (status < 0) goto out_error; - } else - nfs4_state_end_reclaim_nograce(clp); - continue; } if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { @@ -1309,8 +1398,6 @@ static void nfs4_state_manager(struct nfs_client *clp) out_error: printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s" " with error %d\n", clp->cl_hostname, -status); - if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) - nfs4_state_end_reclaim_reboot(clp); nfs4_clear_state_manager_bit(clp); } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 20b4e30e6c8..e437fd6a819 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -46,11 +46,13 @@ #include <linux/proc_fs.h> #include <linux/kdev_t.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/msg_prot.h> #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> #include <linux/nfs_idmap.h> #include "nfs4_fs.h" +#include "internal.h" #define NFSDBG_FACILITY NFSDBG_XDR @@ -134,7 +136,7 @@ static int nfs4_stat_to_errno(int); #define decode_lookup_maxsz (op_decode_hdr_maxsz) #define encode_share_access_maxsz \ (2) -#define encode_createmode_maxsz (1 + encode_attrs_maxsz) +#define encode_createmode_maxsz (1 + encode_attrs_maxsz + encode_verifier_maxsz) #define encode_opentype_maxsz (1 + encode_createmode_maxsz) #define encode_claim_null_maxsz (1 + nfs4_name_maxsz) #define encode_open_maxsz (op_encode_hdr_maxsz + \ @@ -299,6 +301,8 @@ static int nfs4_stat_to_errno(int); XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4) #define decode_sequence_maxsz (op_decode_hdr_maxsz + \ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) +#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) +#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 @@ -676,6 +680,25 @@ static int nfs4_stat_to_errno(int); decode_sequence_maxsz + \ decode_putrootfh_maxsz + \ decode_fsinfo_maxsz) +#define NFS4_enc_reclaim_complete_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_reclaim_complete_maxsz) +#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_reclaim_complete_maxsz) + +const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_encode_hdr_maxsz + + encode_sequence_maxsz + + encode_putfh_maxsz + + encode_getattr_maxsz) * + XDR_UNIT); + +const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_decode_hdr_maxsz + + decode_sequence_maxsz + + decode_putfh_maxsz) * + XDR_UNIT); #endif /* CONFIG_NFS_V4_1 */ static const umode_t nfs_type2fmt[] = { @@ -1140,6 +1163,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) { __be32 *p; + struct nfs_client *clp; p = reserve_space(xdr, 4); switch(arg->open_flags & O_EXCL) { @@ -1148,8 +1172,23 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op encode_attrs(xdr, arg->u.attrs, arg->server); break; default: - *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); - encode_nfs4_verifier(xdr, &arg->u.verifier); + clp = arg->server->nfs_client; + if (clp->cl_minorversion > 0) { + if (nfs4_has_persistent_session(clp)) { + *p = cpu_to_be32(NFS4_CREATE_GUARDED); + encode_attrs(xdr, arg->u.attrs, arg->server); + } else { + struct iattr dummy; + + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); + encode_nfs4_verifier(xdr, &arg->u.verifier); + dummy.ia_valid = 0; + encode_attrs(xdr, &dummy, arg->server); + } + } else { + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); + encode_nfs4_verifier(xdr, &arg->u.verifier); + } } } @@ -1592,6 +1631,19 @@ static void encode_destroy_session(struct xdr_stream *xdr, hdr->nops++; hdr->replen += decode_destroy_session_maxsz; } + +static void encode_reclaim_complete(struct xdr_stream *xdr, + struct nfs41_reclaim_complete_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_RECLAIM_COMPLETE); + *p++ = cpu_to_be32(args->one_fs); + hdr->nops++; + hdr->replen += decode_reclaim_complete_maxsz; +} #endif /* CONFIG_NFS_V4_1 */ static void encode_sequence(struct xdr_stream *xdr, @@ -2096,7 +2148,7 @@ nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p, encode_compound_hdr(&xdr, req, &hdr); encode_sequence(&xdr, &args->seq_args, &hdr); encode_putfh(&xdr, args->fh, &hdr); - replen = hdr.replen + nfs4_fattr_bitmap_maxsz + 1; + replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1; encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr); xdr_inline_pages(&req->rq_rcv_buf, replen << 2, @@ -2420,6 +2472,26 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, encode_nops(&hdr); return 0; } + +/* + * a RECLAIM_COMPLETE request + */ +static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p, + struct nfs41_reclaim_complete_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args) + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_sequence(&xdr, &args->seq_args, &hdr); + encode_reclaim_complete(&xdr, args, &hdr); + encode_nops(&hdr); + return 0; +} + #endif /* CONFIG_NFS_V4_1 */ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -4528,6 +4600,11 @@ static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) { return decode_op_hdr(xdr, OP_DESTROY_SESSION); } + +static int decode_reclaim_complete(struct xdr_stream *xdr, void *dummy) +{ + return decode_op_hdr(xdr, OP_RECLAIM_COMPLETE); +} #endif /* CONFIG_NFS_V4_1 */ static int decode_sequence(struct xdr_stream *xdr, @@ -4583,8 +4660,8 @@ static int decode_sequence(struct xdr_stream *xdr, dummy = be32_to_cpup(p++); /* target highest slot id - currently not processed */ dummy = be32_to_cpup(p++); - /* result flags - currently not processed */ - dummy = be32_to_cpup(p); + /* result flags */ + res->sr_status_flags = be32_to_cpup(p); status = 0; out_err: res->sr_status = status; @@ -5309,7 +5386,7 @@ out: } /* - * FSINFO request + * Decode FSINFO response */ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_res *res) @@ -5330,7 +5407,7 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, } /* - * PATHCONF request + * Decode PATHCONF response */ static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs4_pathconf_res *res) @@ -5351,7 +5428,7 @@ static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, } /* - * STATFS request + * Decode STATFS response */ static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, struct nfs4_statfs_res *res) @@ -5372,7 +5449,7 @@ static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, } /* - * GETATTR_BITMAP request + * Decode GETATTR_BITMAP response */ static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res) { @@ -5411,7 +5488,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy) } /* - * a SETCLIENTID request + * Decode SETCLIENTID response */ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp) @@ -5428,7 +5505,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, } /* - * a SETCLIENTID_CONFIRM request + * Decode SETCLIENTID_CONFIRM response */ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo) { @@ -5448,7 +5525,7 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str } /* - * DELEGRETURN request + * Decode DELEGRETURN response */ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res) { @@ -5474,7 +5551,7 @@ out: } /* - * FS_LOCATIONS request + * Decode FS_LOCATIONS response */ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_res *res) @@ -5504,7 +5581,7 @@ out: #if defined(CONFIG_NFS_V4_1) /* - * EXCHANGE_ID request + * Decode EXCHANGE_ID response */ static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p, void *res) @@ -5521,7 +5598,7 @@ static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p, } /* - * a CREATE_SESSION request + * Decode CREATE_SESSION response */ static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p, struct nfs41_create_session_res *res) @@ -5538,7 +5615,7 @@ static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p, } /* - * a DESTROY_SESSION request + * Decode DESTROY_SESSION response */ static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p, void *dummy) @@ -5555,7 +5632,7 @@ static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p, } /* - * a SEQUENCE request + * Decode SEQUENCE response */ static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_sequence_res *res) @@ -5572,7 +5649,7 @@ static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p, } /* - * a GET_LEASE_TIME request + * Decode GET_LEASE_TIME response */ static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_get_lease_time_res *res) @@ -5591,6 +5668,25 @@ static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p, status = decode_fsinfo(&xdr, res->lr_fsinfo); return status; } + +/* + * Decode RECLAIM_COMPLETE response + */ +static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p, + struct nfs41_reclaim_complete_res *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (!status) + status = decode_sequence(&xdr, &res->seq_res, rqstp); + if (!status) + status = decode_reclaim_complete(&xdr, (void *)NULL); + return status; +} #endif /* CONFIG_NFS_V4_1 */ __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) @@ -5767,6 +5863,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), PROC(SEQUENCE, enc_sequence, dec_sequence), PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 12c9e66d3f1..db9b360ae19 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -356,25 +356,19 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data struct nfs_readres *resp = &data->res; if (resp->eof || resp->count == argp->count) - goto out; + return; /* This is a short read! */ nfs_inc_stats(data->inode, NFSIOS_SHORTREAD); /* Has the server at least made some progress? */ if (resp->count == 0) - goto out; + return; /* Yes, so retry the read at the end of the data */ argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; - nfs4_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); - return; -out: - nfs4_sequence_free_slot(NFS_SERVER(data->inode)->nfs_client, - &data->res.seq_res); - return; - + nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); } /* diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 90be551b80c..ce907efc550 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -175,14 +175,16 @@ static const match_table_t nfs_mount_option_tokens = { }; enum { - Opt_xprt_udp, Opt_xprt_tcp, Opt_xprt_rdma, + Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma, Opt_xprt_err }; static const match_table_t nfs_xprt_protocol_tokens = { { Opt_xprt_udp, "udp" }, + { Opt_xprt_udp6, "udp6" }, { Opt_xprt_tcp, "tcp" }, + { Opt_xprt_tcp6, "tcp6" }, { Opt_xprt_rdma, "rdma" }, { Opt_xprt_err, NULL } @@ -492,6 +494,45 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour) return sec_flavours[i].str; } +static void nfs_show_mountd_netid(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + struct sockaddr *sap = (struct sockaddr *) &nfss->mountd_address; + + seq_printf(m, ",mountproto="); + switch (sap->sa_family) { + case AF_INET: + switch (nfss->mountd_protocol) { + case IPPROTO_UDP: + seq_printf(m, RPCBIND_NETID_UDP); + break; + case IPPROTO_TCP: + seq_printf(m, RPCBIND_NETID_TCP); + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } + break; + case AF_INET6: + switch (nfss->mountd_protocol) { + case IPPROTO_UDP: + seq_printf(m, RPCBIND_NETID_UDP6); + break; + case IPPROTO_TCP: + seq_printf(m, RPCBIND_NETID_TCP6); + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } +} + static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults) { @@ -505,7 +546,7 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, } case AF_INET6: { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; - seq_printf(m, ",mountaddr=%pI6", &sin6->sin6_addr); + seq_printf(m, ",mountaddr=%pI6c", &sin6->sin6_addr); break; } default: @@ -518,17 +559,7 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, if (nfss->mountd_port || showdefaults) seq_printf(m, ",mountport=%u", nfss->mountd_port); - switch (nfss->mountd_protocol) { - case IPPROTO_UDP: - seq_printf(m, ",mountproto=udp"); - break; - case IPPROTO_TCP: - seq_printf(m, ",mountproto=tcp"); - break; - default: - if (showdefaults) - seq_printf(m, ",mountproto=auto"); - } + nfs_show_mountd_netid(m, nfss, showdefaults); } /* @@ -578,7 +609,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, seq_puts(m, nfs_infop->nostr); } seq_printf(m, ",proto=%s", - rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO)); + rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID)); if (version == 4) { if (nfss->port != NFS_PORT) seq_printf(m, ",port=%u", nfss->port); @@ -714,8 +745,6 @@ static void nfs_umount_begin(struct super_block *sb) struct nfs_server *server; struct rpc_clnt *rpc; - lock_kernel(); - server = NFS_SB(sb); /* -EIO all pending I/O */ rpc = server->client_acl; @@ -724,8 +753,6 @@ static void nfs_umount_begin(struct super_block *sb) rpc = server->client; if (!IS_ERR(rpc)) rpc_killall_tasks(rpc); - - unlock_kernel(); } static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version) @@ -734,8 +761,6 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve data = kzalloc(sizeof(*data), GFP_KERNEL); if (data) { - data->rsize = NFS_MAX_FILE_IO_SIZE; - data->wsize = NFS_MAX_FILE_IO_SIZE; data->acregmin = NFS_DEF_ACREGMIN; data->acregmax = NFS_DEF_ACREGMAX; data->acdirmin = NFS_DEF_ACDIRMIN; @@ -887,6 +912,8 @@ static int nfs_parse_mount_options(char *raw, { char *p, *string, *secdata; int rc, sloppy = 0, invalid_option = 0; + unsigned short protofamily = AF_UNSPEC; + unsigned short mountfamily = AF_UNSPEC; if (!raw) { dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); @@ -1232,12 +1259,17 @@ static int nfs_parse_mount_options(char *raw, token = match_token(string, nfs_xprt_protocol_tokens, args); + protofamily = AF_INET; switch (token) { + case Opt_xprt_udp6: + protofamily = AF_INET6; case Opt_xprt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; kfree(string); break; + case Opt_xprt_tcp6: + protofamily = AF_INET6; case Opt_xprt_tcp: mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; @@ -1265,10 +1297,15 @@ static int nfs_parse_mount_options(char *raw, nfs_xprt_protocol_tokens, args); kfree(string); + mountfamily = AF_INET; switch (token) { + case Opt_xprt_udp6: + mountfamily = AF_INET6; case Opt_xprt_udp: mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; break; + case Opt_xprt_tcp6: + mountfamily = AF_INET6; case Opt_xprt_tcp: mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; break; @@ -1367,8 +1404,33 @@ static int nfs_parse_mount_options(char *raw, if (!sloppy && invalid_option) return 0; + /* + * verify that any proto=/mountproto= options match the address + * familiies in the addr=/mountaddr= options. + */ + if (protofamily != AF_UNSPEC && + protofamily != mnt->nfs_server.address.ss_family) + goto out_proto_mismatch; + + if (mountfamily != AF_UNSPEC) { + if (mnt->mount_server.addrlen) { + if (mountfamily != mnt->mount_server.address.ss_family) + goto out_mountproto_mismatch; + } else { + if (mountfamily != mnt->nfs_server.address.ss_family) + goto out_mountproto_mismatch; + } + } + return 1; +out_mountproto_mismatch: + printk(KERN_INFO "NFS: mount server address does not match mountproto= " + "option\n"); + return 0; +out_proto_mismatch: + printk(KERN_INFO "NFS: server address does not match proto= option\n"); + return 0; out_invalid_address: printk(KERN_INFO "NFS: bad IP address specified: %s\n", p); return 0; @@ -1881,7 +1943,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) if (data == NULL) return -ENOMEM; - lock_kernel(); /* fill out struct with values from existing mount */ data->flags = nfss->flags; data->rsize = nfss->rsize; @@ -1907,7 +1968,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) error = nfs_compare_remount_data(nfss, data); out: kfree(data); - unlock_kernel(); return error; } diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 1064c91ae81..6da3d3ff6ed 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -83,7 +83,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) struct inode *dir = data->dir; if (!NFS_PROTO(dir)->unlink_done(task, dir)) - nfs4_restart_rpc(task, NFS_SERVER(dir)->nfs_client); + nfs_restart_rpc(task, NFS_SERVER(dir)->nfs_client); } /** diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c84b5cc1a94..d171696017f 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -774,7 +774,7 @@ int nfs_updatepage(struct file *file, struct page *page, */ if (nfs_write_pageuptodate(page, inode) && inode->i_flock == NULL && - !(file->f_flags & O_SYNC)) { + !(file->f_flags & O_DSYNC)) { count = max(count + offset, nfs_page_length(page)); offset = 0; } @@ -1216,7 +1216,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ argp->stable = NFS_FILE_SYNC; } - nfs4_restart_rpc(task, server->nfs_client); + nfs_restart_rpc(task, server->nfs_client); return -EAGAIN; } if (time_before(complain, jiffies)) { @@ -1228,7 +1228,6 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) /* Can't do anything about it except throw an error. */ task->tk_status = -EIO; } - nfs4_sequence_free_slot(server->nfs_client, &data->res.seq_res); return 0; } @@ -1612,15 +1611,16 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, if (ret) goto out_unlock; page_cache_get(newpage); + spin_lock(&mapping->host->i_lock); req->wb_page = newpage; SetPagePrivate(newpage); - set_page_private(newpage, page_private(page)); + set_page_private(newpage, (unsigned long)req); ClearPagePrivate(page); set_page_private(page, 0); + spin_unlock(&mapping->host->i_lock); page_cache_release(page); out_unlock: nfs_clear_page_tag_locked(req); - nfs_release_request(req); out: return ret; } diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 251da07b2a1..1225af7b216 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig @@ -2,6 +2,7 @@ config NILFS2_FS tristate "NILFS2 file system support (EXPERIMENTAL)" depends on EXPERIMENTAL select CRC32 + select FS_JOURNAL_INFO help NILFS2 is a log-structured file system (LFS) supporting continuous snapshotting. In addition to versioning capability of the entire diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index d69e6ae5925..3f959f1879d 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -142,29 +142,75 @@ static void nilfs_palloc_desc_block_init(struct inode *inode, } } +static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff, + int create, + void (*init_block)(struct inode *, + struct buffer_head *, + void *), + struct buffer_head **bhp, + struct nilfs_bh_assoc *prev, + spinlock_t *lock) +{ + int ret; + + spin_lock(lock); + if (prev->bh && blkoff == prev->blkoff) { + get_bh(prev->bh); + *bhp = prev->bh; + spin_unlock(lock); + return 0; + } + spin_unlock(lock); + + ret = nilfs_mdt_get_block(inode, blkoff, create, init_block, bhp); + if (!ret) { + spin_lock(lock); + /* + * The following code must be safe for change of the + * cache contents during the get block call. + */ + brelse(prev->bh); + get_bh(*bhp); + prev->bh = *bhp; + prev->blkoff = blkoff; + spin_unlock(lock); + } + return ret; +} + static int nilfs_palloc_get_desc_block(struct inode *inode, unsigned long group, int create, struct buffer_head **bhp) { - return nilfs_mdt_get_block(inode, - nilfs_palloc_desc_blkoff(inode, group), - create, nilfs_palloc_desc_block_init, bhp); + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_get_block(inode, + nilfs_palloc_desc_blkoff(inode, group), + create, nilfs_palloc_desc_block_init, + bhp, &cache->prev_desc, &cache->lock); } static int nilfs_palloc_get_bitmap_block(struct inode *inode, unsigned long group, int create, struct buffer_head **bhp) { - return nilfs_mdt_get_block(inode, - nilfs_palloc_bitmap_blkoff(inode, group), - create, NULL, bhp); + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_get_block(inode, + nilfs_palloc_bitmap_blkoff(inode, group), + create, NULL, bhp, + &cache->prev_bitmap, &cache->lock); } int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, int create, struct buffer_head **bhp) { - return nilfs_mdt_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr), - create, NULL, bhp); + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_get_block(inode, + nilfs_palloc_entry_blkoff(inode, nr), + create, NULL, bhp, + &cache->prev_entry, &cache->lock); } static struct nilfs_palloc_group_desc * @@ -176,13 +222,6 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode, group % nilfs_palloc_groups_per_desc_block(inode); } -static unsigned char * -nilfs_palloc_block_get_bitmap(const struct inode *inode, - const struct buffer_head *bh, void *kaddr) -{ - return (unsigned char *)(kaddr + bh_offset(bh)); -} - void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, const struct buffer_head *bh, void *kaddr) { @@ -289,8 +328,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, if (ret < 0) goto out_desc; bitmap_kaddr = kmap(bitmap_bh->b_page); - bitmap = nilfs_palloc_block_get_bitmap( - inode, bitmap_bh, bitmap_kaddr); + bitmap = bitmap_kaddr + bh_offset(bitmap_bh); pos = nilfs_palloc_find_available_slot( inode, group, group_offset, bitmap, entries_per_group); @@ -351,8 +389,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, desc = nilfs_palloc_block_get_group_desc(inode, group, req->pr_desc_bh, desc_kaddr); bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); - bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh, - bitmap_kaddr); + bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), group_offset, bitmap)) @@ -385,8 +422,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, desc = nilfs_palloc_block_get_group_desc(inode, group, req->pr_desc_bh, desc_kaddr); bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); - bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh, - bitmap_kaddr); + bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), group_offset, bitmap)) printk(KERN_WARNING "%s: entry numer %llu already freed\n", @@ -472,8 +508,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) desc = nilfs_palloc_block_get_group_desc( inode, group, desc_bh, desc_kaddr); bitmap_kaddr = kmap(bitmap_bh->b_page); - bitmap = nilfs_palloc_block_get_bitmap( - inode, bitmap_bh, bitmap_kaddr); + bitmap = bitmap_kaddr + bh_offset(bitmap_bh); for (j = i, n = 0; (j < nitems) && nilfs_palloc_group_is_in(inode, group, entry_nrs[j]); @@ -502,3 +537,30 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) } return 0; } + +void nilfs_palloc_setup_cache(struct inode *inode, + struct nilfs_palloc_cache *cache) +{ + NILFS_MDT(inode)->mi_palloc_cache = cache; + spin_lock_init(&cache->lock); +} + +void nilfs_palloc_clear_cache(struct inode *inode) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + spin_lock(&cache->lock); + brelse(cache->prev_desc.bh); + brelse(cache->prev_bitmap.bh); + brelse(cache->prev_entry.bh); + cache->prev_desc.bh = NULL; + cache->prev_bitmap.bh = NULL; + cache->prev_entry.bh = NULL; + spin_unlock(&cache->lock); +} + +void nilfs_palloc_destroy_cache(struct inode *inode) +{ + nilfs_palloc_clear_cache(inode); + NILFS_MDT(inode)->mi_palloc_cache = NULL; +} diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index 4ace5475c2c..f4543ac4f56 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -69,4 +69,25 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t); #define nilfs_clear_bit_atomic ext2_clear_bit_atomic #define nilfs_find_next_zero_bit ext2_find_next_zero_bit +/* + * persistent object allocator cache + */ + +struct nilfs_bh_assoc { + unsigned long blkoff; + struct buffer_head *bh; +}; + +struct nilfs_palloc_cache { + spinlock_t lock; + struct nilfs_bh_assoc prev_desc; + struct nilfs_bh_assoc prev_bitmap; + struct nilfs_bh_assoc prev_entry; +}; + +void nilfs_palloc_setup_cache(struct inode *inode, + struct nilfs_palloc_cache *cache); +void nilfs_palloc_clear_cache(struct inode *inode); +void nilfs_palloc_destroy_cache(struct inode *inode); + #endif /* _NILFS_ALLOC_H */ diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 08834df6ec6..f4a14ea2ed9 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -402,19 +402,11 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap) void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n) { inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n); - if (NILFS_MDT(bmap->b_inode)) - nilfs_mdt_mark_dirty(bmap->b_inode); - else - mark_inode_dirty(bmap->b_inode); } void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n) { inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n); - if (NILFS_MDT(bmap->b_inode)) - nilfs_mdt_mark_dirty(bmap->b_inode); - else - mark_inode_dirty(bmap->b_inode); } __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 84c25382f8e..471e269536a 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -68,9 +68,34 @@ void nilfs_btnode_cache_clear(struct address_space *btnc) truncate_inode_pages(btnc, 0); } +struct buffer_head * +nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) +{ + struct inode *inode = NILFS_BTNC_I(btnc); + struct buffer_head *bh; + + bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); + if (unlikely(!bh)) + return NULL; + + if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) || + buffer_dirty(bh))) { + brelse(bh); + BUG(); + } + memset(bh->b_data, 0, 1 << inode->i_blkbits); + bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; + bh->b_blocknr = blocknr; + set_buffer_mapped(bh); + set_buffer_uptodate(bh); + + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + return bh; +} + int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, - sector_t pblocknr, struct buffer_head **pbh, - int newblk) + sector_t pblocknr, struct buffer_head **pbh) { struct buffer_head *bh; struct inode *inode = NILFS_BTNC_I(btnc); @@ -81,19 +106,6 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, return -ENOMEM; err = -EEXIST; /* internal code */ - if (newblk) { - if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) || - buffer_dirty(bh))) { - brelse(bh); - BUG(); - } - memset(bh->b_data, 0, 1 << inode->i_blkbits); - bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; - bh->b_blocknr = blocknr; - set_buffer_mapped(bh); - set_buffer_uptodate(bh); - goto found; - } if (buffer_uptodate(bh) || buffer_dirty(bh)) goto found; @@ -135,27 +147,6 @@ out_locked: return err; } -int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr, - sector_t pblocknr, struct buffer_head **pbh, int newblk) -{ - struct buffer_head *bh; - int err; - - err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk); - if (err == -EEXIST) /* internal code (cache hit) */ - return 0; - if (unlikely(err)) - return err; - - bh = *pbh; - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - brelse(bh); - return -EIO; - } - return 0; -} - /** * nilfs_btnode_delete - delete B-tree node buffer * @bh: buffer to be deleted @@ -244,12 +235,13 @@ retry: unlock_page(obh->b_page); } - err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1); - if (likely(!err)) { - BUG_ON(nbh == obh); - ctxt->newbh = nbh; - } - return err; + nbh = nilfs_btnode_create_block(btnc, newkey); + if (!nbh) + return -ENOMEM; + + BUG_ON(nbh == obh); + ctxt->newbh = nbh; + return 0; failed_unlock: unlock_page(obh->b_page); diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 3e2275172ed..07da83f0771 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -40,10 +40,10 @@ struct nilfs_btnode_chkey_ctxt { void nilfs_btnode_cache_init_once(struct address_space *); void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *); void nilfs_btnode_cache_clear(struct address_space *); +struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, + __u64 blocknr); int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, - struct buffer_head **, int); -int nilfs_btnode_get(struct address_space *, __u64, sector_t, - struct buffer_head **, int); + struct buffer_head **); void nilfs_btnode_delete(struct buffer_head *); int nilfs_btnode_prepare_change_key(struct address_space *, struct nilfs_btnode_chkey_ctxt *); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index e25b507a474..7cdd98b8d51 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -114,7 +114,18 @@ static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr, { struct address_space *btnc = &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache; - return nilfs_btnode_get(btnc, ptr, 0, bhp, 0); + int err; + + err = nilfs_btnode_submit_block(btnc, ptr, 0, bhp); + if (err) + return err == -EEXIST ? 0 : err; + + wait_on_buffer(*bhp); + if (!buffer_uptodate(*bhp)) { + brelse(*bhp); + return -EIO; + } + return 0; } static int nilfs_btree_get_new_block(const struct nilfs_btree *btree, @@ -122,12 +133,15 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree, { struct address_space *btnc = &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache; - int ret; + struct buffer_head *bh; - ret = nilfs_btnode_get(btnc, ptr, 0, bhp, 1); - if (!ret) - set_buffer_nilfs_volatile(*bhp); - return ret; + bh = nilfs_btnode_create_block(btnc, ptr); + if (!bh) + return -ENOMEM; + + set_buffer_nilfs_volatile(bh); + *bhp = bh; + return 0; } static inline int @@ -444,6 +458,18 @@ nilfs_btree_get_node(const struct nilfs_btree *btree, nilfs_btree_get_nonroot_node(path, level); } +static inline int +nilfs_btree_bad_node(struct nilfs_btree_node *node, int level) +{ + if (unlikely(nilfs_btree_node_get_level(node) != level)) { + dump_stack(); + printk(KERN_CRIT "NILFS: btree level mismatch: %d != %d\n", + nilfs_btree_node_get_level(node), level); + return 1; + } + return 0; +} + static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, struct nilfs_btree_path *path, __u64 key, __u64 *ptrp, int minlevel) @@ -467,7 +493,8 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, if (ret < 0) return ret; node = nilfs_btree_get_nonroot_node(path, level); - BUG_ON(level != nilfs_btree_node_get_level(node)); + if (nilfs_btree_bad_node(node, level)) + return -EINVAL; if (!found) found = nilfs_btree_node_lookup(node, key, &index); else @@ -512,7 +539,8 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, if (ret < 0) return ret; node = nilfs_btree_get_nonroot_node(path, level); - BUG_ON(level != nilfs_btree_node_get_level(node)); + if (nilfs_btree_bad_node(node, level)) + return -EINVAL; index = nilfs_btree_node_get_nchildren(node) - 1; ptr = nilfs_btree_node_get_ptr(btree, node, index); path[level].bp_index = index; @@ -638,13 +666,11 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree, { if (level < nilfs_btree_height(btree) - 1) { do { - lock_buffer(path[level].bp_bh); nilfs_btree_node_set_key( nilfs_btree_get_nonroot_node(path, level), path[level].bp_index, key); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); - unlock_buffer(path[level].bp_bh); } while ((path[level].bp_index == 0) && (++level < nilfs_btree_height(btree) - 1)); } @@ -663,13 +689,11 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree, struct nilfs_btree_node *node; if (level < nilfs_btree_height(btree) - 1) { - lock_buffer(path[level].bp_bh); node = nilfs_btree_get_nonroot_node(path, level); nilfs_btree_node_insert(btree, node, *keyp, *ptrp, path[level].bp_index); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); - unlock_buffer(path[level].bp_bh); if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, @@ -689,9 +713,6 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, struct nilfs_btree_node *node, *left; int nchildren, lnchildren, n, move; - lock_buffer(path[level].bp_bh); - lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(path, level); left = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); @@ -712,9 +733,6 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); - unlock_buffer(path[level].bp_bh); - unlock_buffer(path[level].bp_sib_bh); - nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(node, 0)); @@ -740,9 +758,6 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, struct nilfs_btree_node *node, *right; int nchildren, rnchildren, n, move; - lock_buffer(path[level].bp_bh); - lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); @@ -763,9 +778,6 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); - unlock_buffer(path[level].bp_bh); - unlock_buffer(path[level].bp_sib_bh); - path[level + 1].bp_index++; nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(right, 0)); @@ -794,9 +806,6 @@ static void nilfs_btree_split(struct nilfs_btree *btree, __u64 newptr; int nchildren, n, move; - lock_buffer(path[level].bp_bh); - lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); @@ -815,9 +824,6 @@ static void nilfs_btree_split(struct nilfs_btree *btree, if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); - unlock_buffer(path[level].bp_bh); - unlock_buffer(path[level].bp_sib_bh); - newkey = nilfs_btree_node_get_key(right, 0); newptr = path[level].bp_newreq.bpr_ptr; @@ -852,8 +858,6 @@ static void nilfs_btree_grow(struct nilfs_btree *btree, struct nilfs_btree_node *root, *child; int n; - lock_buffer(path[level].bp_sib_bh); - root = nilfs_btree_get_root(btree); child = nilfs_btree_get_sib_node(path, level); @@ -865,8 +869,6 @@ static void nilfs_btree_grow(struct nilfs_btree *btree, if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); - unlock_buffer(path[level].bp_sib_bh); - path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; @@ -1023,11 +1025,9 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, stats->bs_nblocks++; - lock_buffer(bh); nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data, 0, level, 0, NULL, NULL); - unlock_buffer(bh); path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_split; } @@ -1052,10 +1052,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, if (ret < 0) goto err_out_curr_node; - lock_buffer(bh); nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data, 0, level, 0, NULL, NULL); - unlock_buffer(bh); path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_grow; @@ -1154,13 +1152,11 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree, struct nilfs_btree_node *node; if (level < nilfs_btree_height(btree) - 1) { - lock_buffer(path[level].bp_bh); node = nilfs_btree_get_nonroot_node(path, level); nilfs_btree_node_delete(btree, node, keyp, ptrp, path[level].bp_index); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); - unlock_buffer(path[level].bp_bh); if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(node, 0)); @@ -1180,9 +1176,6 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, nilfs_btree_do_delete(btree, path, level, keyp, ptrp); - lock_buffer(path[level].bp_bh); - lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(path, level); left = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); @@ -1197,9 +1190,6 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); - unlock_buffer(path[level].bp_bh); - unlock_buffer(path[level].bp_sib_bh); - nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(node, 0)); @@ -1217,9 +1207,6 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, nilfs_btree_do_delete(btree, path, level, keyp, ptrp); - lock_buffer(path[level].bp_bh); - lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); @@ -1234,9 +1221,6 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); - unlock_buffer(path[level].bp_bh); - unlock_buffer(path[level].bp_sib_bh); - path[level + 1].bp_index++; nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(right, 0)); @@ -1255,9 +1239,6 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree, nilfs_btree_do_delete(btree, path, level, keyp, ptrp); - lock_buffer(path[level].bp_bh); - lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(path, level); left = nilfs_btree_get_sib_node(path, level); @@ -1268,9 +1249,6 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree, if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); - unlock_buffer(path[level].bp_bh); - unlock_buffer(path[level].bp_sib_bh); - nilfs_btnode_delete(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; @@ -1286,9 +1264,6 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree, nilfs_btree_do_delete(btree, path, level, keyp, ptrp); - lock_buffer(path[level].bp_bh); - lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, level); @@ -1299,9 +1274,6 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree, if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); - unlock_buffer(path[level].bp_bh); - unlock_buffer(path[level].bp_sib_bh); - nilfs_btnode_delete(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; path[level + 1].bp_index++; @@ -1316,7 +1288,6 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree, nilfs_btree_do_delete(btree, path, level, keyp, ptrp); - lock_buffer(path[level].bp_bh); root = nilfs_btree_get_root(btree); child = nilfs_btree_get_nonroot_node(path, level); @@ -1324,7 +1295,6 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree, nilfs_btree_node_set_level(root, level); n = nilfs_btree_node_get_nchildren(child); nilfs_btree_node_move_left(btree, root, child, n); - unlock_buffer(path[level].bp_bh); nilfs_btnode_delete(path[level].bp_bh); path[level].bp_bh = NULL; @@ -1699,7 +1669,6 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat); /* create child node at level 1 */ - lock_buffer(bh); node = (struct nilfs_btree_node *)bh->b_data; nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs); nilfs_btree_node_insert(btree, node, @@ -1709,7 +1678,6 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, if (!nilfs_bmap_dirty(bmap)) nilfs_bmap_set_dirty(bmap); - unlock_buffer(bh); brelse(bh); /* create root node at level 2 */ @@ -2050,7 +2018,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap, for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < NILFS_BTREE_LEVEL_MAX; level++) - list_splice(&lists[level], listp->prev); + list_splice_tail(&lists[level], listp); } static int nilfs_btree_assign_p(struct nilfs_btree *btree, diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h index 0e72bbbc6b6..4b82d84ade7 100644 --- a/fs/nilfs2/btree.h +++ b/fs/nilfs2/btree.h @@ -34,28 +34,6 @@ struct nilfs_btree; struct nilfs_btree_path; /** - * struct nilfs_btree_node - B-tree node - * @bn_flags: flags - * @bn_level: level - * @bn_nchildren: number of children - * @bn_pad: padding - */ -struct nilfs_btree_node { - __u8 bn_flags; - __u8 bn_level; - __le16 bn_nchildren; - __le32 bn_pad; -}; - -/* flags */ -#define NILFS_BTREE_NODE_ROOT 0x01 - -/* level */ -#define NILFS_BTREE_LEVEL_DATA 0 -#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1) -#define NILFS_BTREE_LEVEL_MAX 14 - -/** * struct nilfs_btree - B-tree structure * @bt_bmap: bmap base structure */ diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 3f5d5d06f53..d5ad54e204a 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -926,3 +926,29 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) up_read(&NILFS_MDT(cpfile)->mi_sem); return ret; } + +/** + * nilfs_cpfile_read - read cpfile inode + * @cpfile: cpfile inode + * @raw_inode: on-disk cpfile inode + */ +int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode) +{ + return nilfs_read_inode_common(cpfile, raw_inode); +} + +/** + * nilfs_cpfile_new - create cpfile + * @nilfs: nilfs object + * @cpsize: size of a checkpoint entry + */ +struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize) +{ + struct inode *cpfile; + + cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO, 0); + if (cpfile) + nilfs_mdt_set_entry_size(cpfile, cpsize, + sizeof(struct nilfs_cpfile_header)); + return cpfile; +} diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index debea896e70..bc0809e0ab4 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -40,4 +40,7 @@ int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *); ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned, size_t); +int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode); +struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize); + #endif /* _NILFS_CPFILE_H */ diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 1ff8e15bd36..187dd07ba86 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -33,6 +33,16 @@ #define NILFS_CNO_MIN ((__u64)1) #define NILFS_CNO_MAX (~(__u64)0) +struct nilfs_dat_info { + struct nilfs_mdt_info mi; + struct nilfs_palloc_cache palloc_cache; +}; + +static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat) +{ + return (struct nilfs_dat_info *)NILFS_MDT(dat); +} + static int nilfs_dat_prepare_entry(struct inode *dat, struct nilfs_palloc_req *req, int create) { @@ -425,3 +435,40 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz, return nvi; } + +/** + * nilfs_dat_read - read dat inode + * @dat: dat inode + * @raw_inode: on-disk dat inode + */ +int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode) +{ + return nilfs_read_inode_common(dat, raw_inode); +} + +/** + * nilfs_dat_new - create dat file + * @nilfs: nilfs object + * @entry_size: size of a dat entry + */ +struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size) +{ + static struct lock_class_key dat_lock_key; + struct inode *dat; + struct nilfs_dat_info *di; + int err; + + dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO, sizeof(*di)); + if (dat) { + err = nilfs_palloc_init_blockgroup(dat, entry_size); + if (unlikely(err)) { + nilfs_mdt_destroy(dat); + return NULL; + } + + di = NILFS_DAT_I(dat); + lockdep_set_class(&di->mi.mi_sem, &dat_lock_key); + nilfs_palloc_setup_cache(dat, &di->palloc_cache); + } + return dat; +} diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h index 406070d3ff4..d31c3aab0ef 100644 --- a/fs/nilfs2/dat.h +++ b/fs/nilfs2/dat.h @@ -53,4 +53,7 @@ int nilfs_dat_freev(struct inode *, __u64 *, size_t); int nilfs_dat_move(struct inode *, __u64, sector_t); ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t); +int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode); +struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size); + #endif /* _NILFS_DAT_H */ diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index e097099bfc8..76d803e060a 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -99,9 +99,9 @@ static int nilfs_prepare_chunk(struct page *page, NULL, nilfs_get_block); } -static int nilfs_commit_chunk(struct page *page, - struct address_space *mapping, - unsigned from, unsigned to) +static void nilfs_commit_chunk(struct page *page, + struct address_space *mapping, + unsigned from, unsigned to) { struct inode *dir = mapping->host; struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb); @@ -112,15 +112,13 @@ static int nilfs_commit_chunk(struct page *page, nr_dirty = nilfs_page_count_clean_buffers(page, from, to); copied = block_write_end(NULL, mapping, pos, len, len, page, NULL); - if (pos + copied > dir->i_size) { + if (pos + copied > dir->i_size) i_size_write(dir, pos + copied); - mark_inode_dirty(dir); - } if (IS_DIRSYNC(dir)) nilfs_set_transaction_flag(NILFS_TI_SYNC); err = nilfs_set_file_dirty(sbi, dir, nr_dirty); + WARN_ON(err); /* do not happen */ unlock_page(page); - return err; } static void nilfs_check_page(struct page *page) @@ -455,11 +453,10 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, BUG_ON(err); de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); - err = nilfs_commit_chunk(page, mapping, from, to); + nilfs_commit_chunk(page, mapping, from, to); nilfs_put_page(page); dir->i_mtime = dir->i_ctime = CURRENT_TIME; /* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */ - mark_inode_dirty(dir); } /* @@ -548,10 +545,10 @@ got_it: memcpy(de->name, name, namelen); de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); - err = nilfs_commit_chunk(page, page->mapping, from, to); + nilfs_commit_chunk(page, page->mapping, from, to); dir->i_mtime = dir->i_ctime = CURRENT_TIME; /* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */ - mark_inode_dirty(dir); + nilfs_mark_inode_dirty(dir); /* OFFSET_CACHE */ out_put: nilfs_put_page(page); @@ -595,10 +592,9 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) if (pde) pde->rec_len = cpu_to_le16(to - from); dir->inode = 0; - err = nilfs_commit_chunk(page, mapping, from, to); + nilfs_commit_chunk(page, mapping, from, to); inode->i_ctime = inode->i_mtime = CURRENT_TIME; /* NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */ - mark_inode_dirty(inode); out: nilfs_put_page(page); return err; @@ -640,7 +636,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent) memcpy(de->name, "..\0", 4); nilfs_set_de_type(de, inode); kunmap_atomic(kaddr, KM_USER0); - err = nilfs_commit_chunk(page, mapping, 0, chunk_size); + nilfs_commit_chunk(page, mapping, 0, chunk_size); fail: page_cache_release(page); return err; diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c index 93383c5cee9..dd5f7e0a95f 100644 --- a/fs/nilfs2/gcdat.c +++ b/fs/nilfs2/gcdat.c @@ -61,6 +61,8 @@ void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs) nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap); + nilfs_palloc_clear_cache(dat); + nilfs_palloc_clear_cache(gcdat); nilfs_clear_dirty_pages(mapping); nilfs_copy_back_pages(mapping, gmapping); /* note: mdt dirty flags should be cleared by segctor. */ @@ -79,6 +81,7 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs) gcdat->i_state = I_CLEAR; gii->i_flags = 0; + nilfs_palloc_clear_cache(gcdat); truncate_inode_pages(gcdat->i_mapping, 0); truncate_inode_pages(&gii->i_btnode_cache, 0); } diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index e6de0a27ab5..e16a6664dfa 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -149,7 +149,7 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, __u64 vbn, struct buffer_head **out_bh) { int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, - vbn ? : pbn, pbn, out_bh, 0); + vbn ? : pbn, pbn, out_bh); if (ret == -EEXIST) /* internal code (cache hit) */ ret = 0; return ret; @@ -212,9 +212,10 @@ void nilfs_destroy_gccache(struct the_nilfs *nilfs) static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino, __u64 cno) { - struct inode *inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS); + struct inode *inode; struct nilfs_inode_info *ii; + inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS, 0); if (!inode) return NULL; @@ -265,7 +266,6 @@ struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno) */ void nilfs_clear_gcinode(struct inode *inode) { - nilfs_mdt_clear(inode); nilfs_mdt_destroy(inode); } diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index de86401f209..922d9dd42c8 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -29,6 +29,17 @@ #include "alloc.h" #include "ifile.h" + +struct nilfs_ifile_info { + struct nilfs_mdt_info mi; + struct nilfs_palloc_cache palloc_cache; +}; + +static inline struct nilfs_ifile_info *NILFS_IFILE_I(struct inode *ifile) +{ + return (struct nilfs_ifile_info *)NILFS_MDT(ifile); +} + /** * nilfs_ifile_create_inode - create a new disk inode * @ifile: ifile inode @@ -148,3 +159,27 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, } return err; } + +/** + * nilfs_ifile_new - create inode file + * @sbi: nilfs_sb_info struct + * @inode_size: size of an inode + */ +struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size) +{ + struct inode *ifile; + int err; + + ifile = nilfs_mdt_new(sbi->s_nilfs, sbi->s_super, NILFS_IFILE_INO, + sizeof(struct nilfs_ifile_info)); + if (ifile) { + err = nilfs_palloc_init_blockgroup(ifile, inode_size); + if (unlikely(err)) { + nilfs_mdt_destroy(ifile); + return NULL; + } + nilfs_palloc_setup_cache(ifile, + &NILFS_IFILE_I(ifile)->palloc_cache); + } + return ifile; +} diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index ecc3ba76db4..cbca32e498f 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -49,4 +49,6 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **); int nilfs_ifile_delete_inode(struct inode *, ino_t); int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); +struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size); + #endif /* _NILFS_IFILE_H */ diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 2a0a5a3ac13..7868cc122ac 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -97,6 +97,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, nilfs_transaction_abort(inode->i_sb); goto out; } + nilfs_mark_inode_dirty(inode); nilfs_transaction_commit(inode->i_sb); /* never fails */ /* Error handling should be detailed */ set_buffer_new(bh_result); @@ -322,7 +323,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode) nilfs_init_acl(), proper cancellation of above jobs should be considered */ - mark_inode_dirty(inode); return inode; failed_acl: @@ -525,7 +525,6 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh) raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh); - /* The buffer is guarded with lock_buffer() by the caller */ if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size); set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); @@ -599,6 +598,7 @@ void nilfs_truncate(struct inode *inode) if (IS_SYNC(inode)) nilfs_set_transaction_flag(NILFS_TI_SYNC); + nilfs_mark_inode_dirty(inode); nilfs_set_file_dirty(NILFS_SB(sb), inode, 0); nilfs_transaction_commit(sb); /* May construct a logical segment and may fail in sync mode. @@ -623,6 +623,7 @@ void nilfs_delete_inode(struct inode *inode) truncate_inode_pages(&inode->i_data, 0); nilfs_truncate_bmap(ii, 0); + nilfs_mark_inode_dirty(inode); nilfs_free_inode(inode); /* nilfs_free_inode() marks inode buffer dirty */ if (IS_SYNC(inode)) @@ -745,9 +746,7 @@ int nilfs_mark_inode_dirty(struct inode *inode) "failed to reget inode block.\n"); return err; } - lock_buffer(ibh); nilfs_update_inode(inode, ibh); - unlock_buffer(ibh); nilfs_mdt_mark_buffer_dirty(ibh); nilfs_mdt_mark_dirty(sbi->s_ifile); brelse(ibh); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index f6326112d64..06713ffcc7f 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -186,7 +186,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, } static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, - struct buffer_head **out_bh) + int readahead, struct buffer_head **out_bh) { struct buffer_head *first_bh, *bh; unsigned long blkoff; @@ -200,16 +200,18 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, if (unlikely(err)) goto failed; - blkoff = block + 1; - for (i = 0; i < nr_ra_blocks; i++, blkoff++) { - err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh); - if (likely(!err || err == -EEXIST)) - brelse(bh); - else if (err != -EBUSY) - break; /* abort readahead if bmap lookup failed */ - - if (!buffer_locked(first_bh)) - goto out_no_wait; + if (readahead) { + blkoff = block + 1; + for (i = 0; i < nr_ra_blocks; i++, blkoff++) { + err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh); + if (likely(!err || err == -EEXIST)) + brelse(bh); + else if (err != -EBUSY) + break; + /* abort readahead if bmap lookup failed */ + if (!buffer_locked(first_bh)) + goto out_no_wait; + } } wait_on_buffer(first_bh); @@ -263,7 +265,7 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, /* Should be rewritten with merging nilfs_mdt_read_block() */ retry: - ret = nilfs_mdt_read_block(inode, blkoff, out_bh); + ret = nilfs_mdt_read_block(inode, blkoff, !create, out_bh); if (!create || ret != -ENOENT) return ret; @@ -371,7 +373,7 @@ int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) struct buffer_head *bh; int err; - err = nilfs_mdt_read_block(inode, block, &bh); + err = nilfs_mdt_read_block(inode, block, 0, &bh); if (unlikely(err)) return err; nilfs_mark_buffer_dirty(bh); @@ -445,9 +447,17 @@ static const struct file_operations def_mdt_fops; * longer than those of the super block structs; they may continue for * several consecutive mounts/umounts. This would need discussions. */ +/** + * nilfs_mdt_new_common - allocate a pseudo inode for metadata file + * @nilfs: nilfs object + * @sb: super block instance the metadata file belongs to + * @ino: inode number + * @gfp_mask: gfp mask for data pages + * @objsz: size of the private object attached to inode->i_private + */ struct inode * nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, - ino_t ino, gfp_t gfp_mask) + ino_t ino, gfp_t gfp_mask, size_t objsz) { struct inode *inode = nilfs_alloc_inode_common(nilfs); @@ -455,8 +465,9 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, return NULL; else { struct address_space * const mapping = &inode->i_data; - struct nilfs_mdt_info *mi = kzalloc(sizeof(*mi), GFP_NOFS); + struct nilfs_mdt_info *mi; + mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS); if (!mi) { nilfs_destroy_inode(inode); return NULL; @@ -513,11 +524,11 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, } struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, - ino_t ino) + ino_t ino, size_t objsz) { - struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, - NILFS_MDT_GFP); + struct inode *inode; + inode = nilfs_mdt_new_common(nilfs, sb, ino, NILFS_MDT_GFP, objsz); if (!inode) return NULL; @@ -544,14 +555,15 @@ void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow) &NILFS_I(orig)->i_btnode_cache; } -void nilfs_mdt_clear(struct inode *inode) +static void nilfs_mdt_clear(struct inode *inode) { struct nilfs_inode_info *ii = NILFS_I(inode); invalidate_mapping_pages(inode->i_mapping, 0, -1); truncate_inode_pages(inode->i_mapping, 0); - nilfs_bmap_clear(ii->i_bmap); + if (test_bit(NILFS_I_BMAP, &ii->i_state)) + nilfs_bmap_clear(ii->i_bmap); nilfs_btnode_cache_clear(&ii->i_btnode_cache); } @@ -559,6 +571,10 @@ void nilfs_mdt_destroy(struct inode *inode) { struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + if (mdi->mi_palloc_cache) + nilfs_palloc_destroy_cache(inode); + nilfs_mdt_clear(inode); + kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ kfree(mdi); nilfs_destroy_inode(inode); diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index 431599733c9..6c4bbb0470f 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -36,6 +36,7 @@ * @mi_entry_size: size of an entry * @mi_first_entry_offset: offset to the first entry * @mi_entries_per_block: number of entries in a block + * @mi_palloc_cache: persistent object allocator cache * @mi_blocks_per_group: number of blocks in a group * @mi_blocks_per_desc_block: number of blocks per descriptor block */ @@ -46,6 +47,7 @@ struct nilfs_mdt_info { unsigned mi_entry_size; unsigned mi_first_entry_offset; unsigned long mi_entries_per_block; + struct nilfs_palloc_cache *mi_palloc_cache; unsigned long mi_blocks_per_group; unsigned long mi_blocks_per_desc_block; }; @@ -74,11 +76,11 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long); int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); int nilfs_mdt_fetch_dirty(struct inode *); -struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t); +struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, + size_t); struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *, - ino_t, gfp_t); + ino_t, gfp_t, size_t); void nilfs_mdt_destroy(struct inode *); -void nilfs_mdt_clear(struct inode *); void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned); void nilfs_mdt_set_shadow(struct inode *, struct inode *); @@ -104,21 +106,4 @@ static inline __u64 nilfs_mdt_cno(struct inode *inode) #define nilfs_mdt_bgl_lock(inode, bg) \ (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock) - -static inline int -nilfs_mdt_read_inode_direct(struct inode *inode, struct buffer_head *bh, - unsigned n) -{ - return nilfs_read_inode_common( - inode, (struct nilfs_inode *)(bh->b_data + n)); -} - -static inline void -nilfs_mdt_write_inode_direct(struct inode *inode, struct buffer_head *bh, - unsigned n) -{ - nilfs_write_inode_common( - inode, (struct nilfs_inode *)(bh->b_data + n), 1); -} - #endif /* _NILFS_MDT_H */ diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index ed02e886fa7..07ba838ef08 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -120,7 +120,7 @@ static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode, inode->i_op = &nilfs_file_inode_operations; inode->i_fop = &nilfs_file_operations; inode->i_mapping->a_ops = &nilfs_aops; - mark_inode_dirty(inode); + nilfs_mark_inode_dirty(inode); err = nilfs_add_nondir(dentry, inode); } if (!err) @@ -148,7 +148,7 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); - mark_inode_dirty(inode); + nilfs_mark_inode_dirty(inode); err = nilfs_add_nondir(dentry, inode); } if (!err) @@ -188,7 +188,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry, goto out_fail; /* mark_inode_dirty(inode); */ - /* nilfs_new_inode() and page_symlink() do this */ + /* page_symlink() do this */ err = nilfs_add_nondir(dentry, inode); out: @@ -200,7 +200,8 @@ out: return err; out_fail: - inode_dec_link_count(inode); + drop_nlink(inode); + nilfs_mark_inode_dirty(inode); iput(inode); goto out; } @@ -245,7 +246,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (err) return err; - inode_inc_link_count(dir); + inc_nlink(dir); inode = nilfs_new_inode(dir, S_IFDIR | mode); err = PTR_ERR(inode); @@ -256,7 +257,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) inode->i_fop = &nilfs_dir_operations; inode->i_mapping->a_ops = &nilfs_aops; - inode_inc_link_count(inode); + inc_nlink(inode); err = nilfs_make_empty(inode, dir); if (err) @@ -266,6 +267,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (err) goto out_fail; + nilfs_mark_inode_dirty(inode); d_instantiate(dentry, inode); out: if (!err) @@ -276,26 +278,23 @@ out: return err; out_fail: - inode_dec_link_count(inode); - inode_dec_link_count(inode); + drop_nlink(inode); + drop_nlink(inode); + nilfs_mark_inode_dirty(inode); iput(inode); out_dir: - inode_dec_link_count(dir); + drop_nlink(dir); + nilfs_mark_inode_dirty(dir); goto out; } -static int nilfs_unlink(struct inode *dir, struct dentry *dentry) +static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode; struct nilfs_dir_entry *de; struct page *page; - struct nilfs_transaction_info ti; int err; - err = nilfs_transaction_begin(dir->i_sb, &ti, 0); - if (err) - return err; - err = -ENOENT; de = nilfs_find_entry(dir, dentry, &page); if (!de) @@ -317,12 +316,28 @@ static int nilfs_unlink(struct inode *dir, struct dentry *dentry) goto out; inode->i_ctime = dir->i_ctime; - inode_dec_link_count(inode); + drop_nlink(inode); err = 0; out: - if (!err) + return err; +} + +static int nilfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 0); + if (err) + return err; + + err = nilfs_do_unlink(dir, dentry); + + if (!err) { + nilfs_mark_inode_dirty(dir); + nilfs_mark_inode_dirty(dentry->d_inode); err = nilfs_transaction_commit(dir->i_sb); - else + } else nilfs_transaction_abort(dir->i_sb); return err; @@ -340,11 +355,13 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) err = -ENOTEMPTY; if (nilfs_empty_dir(inode)) { - err = nilfs_unlink(dir, dentry); + err = nilfs_do_unlink(dir, dentry); if (!err) { inode->i_size = 0; - inode_dec_link_count(inode); - inode_dec_link_count(dir); + drop_nlink(inode); + nilfs_mark_inode_dirty(inode); + drop_nlink(dir); + nilfs_mark_inode_dirty(dir); } } if (!err) @@ -395,42 +412,48 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_de = nilfs_find_entry(new_dir, new_dentry, &new_page); if (!new_de) goto out_dir; - inode_inc_link_count(old_inode); + inc_nlink(old_inode); nilfs_set_link(new_dir, new_de, new_page, old_inode); + nilfs_mark_inode_dirty(new_dir); new_inode->i_ctime = CURRENT_TIME; if (dir_de) drop_nlink(new_inode); - inode_dec_link_count(new_inode); + drop_nlink(new_inode); + nilfs_mark_inode_dirty(new_inode); } else { if (dir_de) { err = -EMLINK; if (new_dir->i_nlink >= NILFS_LINK_MAX) goto out_dir; } - inode_inc_link_count(old_inode); + inc_nlink(old_inode); err = nilfs_add_link(new_dentry, old_inode); if (err) { - inode_dec_link_count(old_inode); + drop_nlink(old_inode); + nilfs_mark_inode_dirty(old_inode); goto out_dir; } - if (dir_de) - inode_inc_link_count(new_dir); + if (dir_de) { + inc_nlink(new_dir); + nilfs_mark_inode_dirty(new_dir); + } } /* * Like most other Unix systems, set the ctime for inodes on a * rename. - * inode_dec_link_count() will mark the inode dirty. */ old_inode->i_ctime = CURRENT_TIME; nilfs_delete_entry(old_de, old_page); - inode_dec_link_count(old_inode); + drop_nlink(old_inode); if (dir_de) { nilfs_set_link(old_inode, dir_de, dir_page, new_dir); - inode_dec_link_count(old_dir); + drop_nlink(old_dir); } + nilfs_mark_inode_dirty(old_dir); + nilfs_mark_inode_dirty(old_inode); err = nilfs_transaction_commit(old_dir->i_sb); return err; diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 6dc83591d11..c9c96c7825d 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -770,14 +770,8 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs, nilfs_finish_roll_forward(nilfs, sbi, ri); } - nilfs_detach_checkpoint(sbi); - return 0; - failed: nilfs_detach_checkpoint(sbi); - nilfs_mdt_clear(nilfs->ns_cpfile); - nilfs_mdt_clear(nilfs->ns_sufile); - nilfs_mdt_clear(nilfs->ns_dat); return err; } @@ -804,6 +798,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, struct nilfs_segsum_info ssi; sector_t pseg_start, pseg_end, sr_pseg_start = 0; sector_t seg_start, seg_end; /* range of full segment (block number) */ + sector_t b, end; u64 seg_seq; __u64 segnum, nextnum = 0; __u64 cno; @@ -819,6 +814,11 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, /* Calculate range of segment */ nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + /* Read ahead segment */ + b = seg_start; + while (b <= seg_end) + sb_breadahead(sbi->s_super, b++); + for (;;) { /* Load segment summary */ ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); @@ -841,14 +841,20 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, ri->ri_nextnum = nextnum; empty_seg = 0; + if (!NILFS_SEG_HAS_SR(&ssi) && !scan_newer) { + /* This will never happen because a superblock + (last_segment) always points to a pseg + having a super root. */ + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto failed; + } + + if (pseg_start == seg_start) { + nilfs_get_segment_range(nilfs, nextnum, &b, &end); + while (b <= end) + sb_breadahead(sbi->s_super, b++); + } if (!NILFS_SEG_HAS_SR(&ssi)) { - if (!scan_newer) { - /* This will never happen because a superblock - (last_segment) always points to a pseg - having a super root. */ - ret = NILFS_SEG_FAIL_CONSISTENCY; - goto failed; - } if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) { ri->ri_lsegs_start = pseg_start; ri->ri_lsegs_start_seq = seg_seq; @@ -919,7 +925,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, super_root_found: /* Updating pointers relating to the latest checkpoint */ - list_splice(&segments, ri->ri_used_segments.prev); + list_splice_tail(&segments, &ri->ri_used_segments); nilfs->ns_last_pseg = sr_pseg_start; nilfs->ns_last_seq = nilfs->ns_seg_seq; nilfs->ns_last_cno = ri->ri_cno; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index e6d9e37fa24..645c78656aa 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -24,10 +24,22 @@ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/crc32.h> +#include <linux/backing-dev.h> #include "page.h" #include "segbuf.h" +struct nilfs_write_info { + struct the_nilfs *nilfs; + struct bio *bio; + int start, end; /* The region to be submitted */ + int rest_blocks; + int max_pages; + int nr_vecs; + sector_t blocknr; +}; + + static struct kmem_cache *nilfs_segbuf_cachep; static void nilfs_segbuf_init_once(void *obj) @@ -63,6 +75,11 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb) INIT_LIST_HEAD(&segbuf->sb_list); INIT_LIST_HEAD(&segbuf->sb_segsum_buffers); INIT_LIST_HEAD(&segbuf->sb_payload_buffers); + + init_completion(&segbuf->sb_bio_event); + atomic_set(&segbuf->sb_err, 0); + segbuf->sb_nbio = 0; + return segbuf; } @@ -83,6 +100,22 @@ void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum, segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; } +/** + * nilfs_segbuf_map_cont - map a new log behind a given log + * @segbuf: new segment buffer + * @prev: segment buffer containing a log to be continued + */ +void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, + struct nilfs_segment_buffer *prev) +{ + segbuf->sb_segnum = prev->sb_segnum; + segbuf->sb_fseg_start = prev->sb_fseg_start; + segbuf->sb_fseg_end = prev->sb_fseg_end; + segbuf->sb_pseg_start = prev->sb_pseg_start + prev->sb_sum.nblocks; + segbuf->sb_rest_blocks = + segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; +} + void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf, __u64 nextnum, struct the_nilfs *nilfs) { @@ -132,8 +165,6 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary); segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0; segbuf->sb_sum.ctime = ctime; - - segbuf->sb_io_error = 0; return 0; } @@ -219,7 +250,7 @@ void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, raw_sum->ss_datasum = cpu_to_le32(crc); } -void nilfs_release_buffers(struct list_head *list) +static void nilfs_release_buffers(struct list_head *list) { struct buffer_head *bh, *n; @@ -241,13 +272,56 @@ void nilfs_release_buffers(struct list_head *list) } } +static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf) +{ + nilfs_release_buffers(&segbuf->sb_segsum_buffers); + nilfs_release_buffers(&segbuf->sb_payload_buffers); +} + +/* + * Iterators for segment buffers + */ +void nilfs_clear_logs(struct list_head *logs) +{ + struct nilfs_segment_buffer *segbuf; + + list_for_each_entry(segbuf, logs, sb_list) + nilfs_segbuf_clear(segbuf); +} + +void nilfs_truncate_logs(struct list_head *logs, + struct nilfs_segment_buffer *last) +{ + struct nilfs_segment_buffer *n, *segbuf; + + segbuf = list_prepare_entry(last, logs, sb_list); + list_for_each_entry_safe_continue(segbuf, n, logs, sb_list) { + list_del_init(&segbuf->sb_list); + nilfs_segbuf_clear(segbuf); + nilfs_segbuf_free(segbuf); + } +} + +int nilfs_wait_on_logs(struct list_head *logs) +{ + struct nilfs_segment_buffer *segbuf; + int err; + + list_for_each_entry(segbuf, logs, sb_list) { + err = nilfs_segbuf_wait(segbuf); + if (err) + return err; + } + return 0; +} + /* * BIO operations */ static void nilfs_end_bio_write(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct nilfs_write_info *wi = bio->bi_private; + struct nilfs_segment_buffer *segbuf = bio->bi_private; if (err == -EOPNOTSUPP) { set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); @@ -256,21 +330,22 @@ static void nilfs_end_bio_write(struct bio *bio, int err) } if (!uptodate) - atomic_inc(&wi->err); + atomic_inc(&segbuf->sb_err); bio_put(bio); - complete(&wi->bio_event); + complete(&segbuf->sb_bio_event); } -static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode) +static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi, int mode) { struct bio *bio = wi->bio; int err; - if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) { - wait_for_completion(&wi->bio_event); - wi->nbio--; - if (unlikely(atomic_read(&wi->err))) { + if (segbuf->sb_nbio > 0 && bdi_write_congested(wi->nilfs->ns_bdi)) { + wait_for_completion(&segbuf->sb_bio_event); + segbuf->sb_nbio--; + if (unlikely(atomic_read(&segbuf->sb_err))) { bio_put(bio); err = -EIO; goto failed; @@ -278,7 +353,7 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode) } bio->bi_end_io = nilfs_end_bio_write; - bio->bi_private = wi; + bio->bi_private = segbuf; bio_get(bio); submit_bio(mode, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) { @@ -286,7 +361,7 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode) err = -EOPNOTSUPP; goto failed; } - wi->nbio++; + segbuf->sb_nbio++; bio_put(bio); wi->bio = NULL; @@ -301,17 +376,15 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode) } /** - * nilfs_alloc_seg_bio - allocate a bio for writing segment. - * @sb: super block - * @start: beginning disk block number of this BIO. + * nilfs_alloc_seg_bio - allocate a new bio for writing log + * @nilfs: nilfs object + * @start: start block number of the bio * @nr_vecs: request size of page vector. * - * alloc_seg_bio() allocates a new BIO structure and initialize it. - * * Return Value: On success, pointer to the struct bio is returned. * On error, NULL is returned. */ -static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start, +static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start, int nr_vecs) { struct bio *bio; @@ -322,36 +395,33 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start, bio = bio_alloc(GFP_NOIO, nr_vecs); } if (likely(bio)) { - bio->bi_bdev = sb->s_bdev; - bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - 9); + bio->bi_bdev = nilfs->ns_bdev; + bio->bi_sector = start << (nilfs->ns_blocksize_bits - 9); } return bio; } -void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, - struct nilfs_write_info *wi) +static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi) { wi->bio = NULL; wi->rest_blocks = segbuf->sb_sum.nblocks; - wi->max_pages = bio_get_nr_vecs(wi->sb->s_bdev); + wi->max_pages = bio_get_nr_vecs(wi->nilfs->ns_bdev); wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); wi->start = wi->end = 0; - wi->nbio = 0; wi->blocknr = segbuf->sb_pseg_start; - - atomic_set(&wi->err, 0); - init_completion(&wi->bio_event); } -static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh, - int mode) +static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi, + struct buffer_head *bh, int mode) { int len, err; BUG_ON(wi->nr_vecs <= 0); repeat: if (!wi->bio) { - wi->bio = nilfs_alloc_seg_bio(wi->sb, wi->blocknr + wi->end, + wi->bio = nilfs_alloc_seg_bio(wi->nilfs, wi->blocknr + wi->end, wi->nr_vecs); if (unlikely(!wi->bio)) return -ENOMEM; @@ -363,76 +433,83 @@ static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh, return 0; } /* bio is FULL */ - err = nilfs_submit_seg_bio(wi, mode); + err = nilfs_segbuf_submit_bio(segbuf, wi, mode); /* never submit current bh */ if (likely(!err)) goto repeat; return err; } +/** + * nilfs_segbuf_write - submit write requests of a log + * @segbuf: buffer storing a log to be written + * @nilfs: nilfs object + * + * Return Value: On Success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error + * + * %-ENOMEM - Insufficient memory available. + */ int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, - struct nilfs_write_info *wi) + struct the_nilfs *nilfs) { + struct nilfs_write_info wi; struct buffer_head *bh; - int res, rw = WRITE; + int res = 0, rw = WRITE; + + wi.nilfs = nilfs; + nilfs_segbuf_prepare_write(segbuf, &wi); list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { - res = nilfs_submit_bh(wi, bh, rw); + res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw); if (unlikely(res)) goto failed_bio; } list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - res = nilfs_submit_bh(wi, bh, rw); + res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw); if (unlikely(res)) goto failed_bio; } - if (wi->bio) { + if (wi.bio) { /* * Last BIO is always sent through the following * submission. */ rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); - res = nilfs_submit_seg_bio(wi, rw); - if (unlikely(res)) - goto failed_bio; + res = nilfs_segbuf_submit_bio(segbuf, &wi, rw); } - res = 0; - out: - return res; - failed_bio: - atomic_inc(&wi->err); - goto out; + return res; } /** * nilfs_segbuf_wait - wait for completion of requested BIOs - * @wi: nilfs_write_info + * @segbuf: segment buffer * * Return Value: On Success, 0 is returned. On Error, one of the following * negative error code is returned. * * %-EIO - I/O error */ -int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf, - struct nilfs_write_info *wi) +int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf) { int err = 0; - if (!wi->nbio) + if (!segbuf->sb_nbio) return 0; do { - wait_for_completion(&wi->bio_event); - } while (--wi->nbio > 0); + wait_for_completion(&segbuf->sb_bio_event); + } while (--segbuf->sb_nbio > 0); - if (unlikely(atomic_read(&wi->err) > 0)) { + if (unlikely(atomic_read(&segbuf->sb_err) > 0)) { printk(KERN_ERR "NILFS: IO error writing segment\n"); err = -EIO; - segbuf->sb_io_error = 1; } return err; } diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h index 0c3076f4e59..6af1630fb40 100644 --- a/fs/nilfs2/segbuf.h +++ b/fs/nilfs2/segbuf.h @@ -27,7 +27,6 @@ #include <linux/buffer_head.h> #include <linux/bio.h> #include <linux/completion.h> -#include <linux/backing-dev.h> /** * struct nilfs_segsum_info - On-memory segment summary @@ -77,7 +76,9 @@ struct nilfs_segsum_info { * @sb_rest_blocks: Number of residual blocks in the current segment * @sb_segsum_buffers: List of buffers for segment summaries * @sb_payload_buffers: List of buffers for segment payload - * @sb_io_error: I/O error status + * @sb_nbio: Number of flying bio requests + * @sb_err: I/O error status + * @sb_bio_event: Completion event of log writing */ struct nilfs_segment_buffer { struct super_block *sb_super; @@ -96,7 +97,9 @@ struct nilfs_segment_buffer { struct list_head sb_payload_buffers; /* including super root */ /* io status */ - int sb_io_error; + int sb_nbio; + atomic_t sb_err; + struct completion sb_bio_event; }; #define NILFS_LIST_SEGBUF(head) \ @@ -125,6 +128,8 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *); void nilfs_segbuf_free(struct nilfs_segment_buffer *); void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long, struct the_nilfs *); +void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, + struct nilfs_segment_buffer *prev); void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, struct the_nilfs *); int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t); @@ -161,41 +166,18 @@ nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf, segbuf->sb_sum.nfileblk++; } -void nilfs_release_buffers(struct list_head *); +int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, + struct the_nilfs *nilfs); +int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf); -static inline void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf) +void nilfs_clear_logs(struct list_head *logs); +void nilfs_truncate_logs(struct list_head *logs, + struct nilfs_segment_buffer *last); +int nilfs_wait_on_logs(struct list_head *logs); + +static inline void nilfs_destroy_logs(struct list_head *logs) { - nilfs_release_buffers(&segbuf->sb_segsum_buffers); - nilfs_release_buffers(&segbuf->sb_payload_buffers); + nilfs_truncate_logs(logs, NULL); } -struct nilfs_write_info { - struct bio *bio; - int start, end; /* The region to be submitted */ - int rest_blocks; - int max_pages; - int nr_vecs; - sector_t blocknr; - - int nbio; - atomic_t err; - struct completion bio_event; - /* completion event of segment write */ - - /* - * The following fields must be set explicitly - */ - struct super_block *sb; - struct backing_dev_info *bdi; /* backing dev info */ - struct buffer_head *bh_sr; -}; - - -void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *, - struct nilfs_write_info *); -int nilfs_segbuf_write(struct nilfs_segment_buffer *, - struct nilfs_write_info *); -int nilfs_segbuf_wait(struct nilfs_segment_buffer *, - struct nilfs_write_info *); - #endif /* _NILFS_SEGBUF_H */ diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 6eff66a070d..17584c52448 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -974,12 +974,12 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, nilfs->ns_nongc_ctime : sci->sc_seg_ctime); raw_sr->sr_flags = 0; - nilfs_mdt_write_inode_direct( - nilfs_dat_inode(nilfs), bh_sr, NILFS_SR_DAT_OFFSET(isz)); - nilfs_mdt_write_inode_direct( - nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(isz)); - nilfs_mdt_write_inode_direct( - nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(isz)); + nilfs_write_inode_common(nilfs_dat_inode(nilfs), (void *)raw_sr + + NILFS_SR_DAT_OFFSET(isz), 1); + nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr + + NILFS_SR_CPFILE_OFFSET(isz), 1); + nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr + + NILFS_SR_SUFILE_OFFSET(isz), 1); } static void nilfs_redirty_inodes(struct list_head *head) @@ -1273,73 +1273,75 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) return err; } -static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum) -{ - struct buffer_head *bh_su; - struct nilfs_segment_usage *raw_su; - int err; - - err = nilfs_sufile_get_segment_usage(sufile, segnum, &raw_su, &bh_su); - if (unlikely(err)) - return err; - nilfs_mdt_mark_buffer_dirty(bh_su); - nilfs_mdt_mark_dirty(sufile); - nilfs_sufile_put_segment_usage(sufile, segnum, bh_su); - return 0; -} - +/** + * nilfs_segctor_begin_construction - setup segment buffer to make a new log + * @sci: nilfs_sc_info + * @nilfs: nilfs object + */ static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { - struct nilfs_segment_buffer *segbuf, *n; + struct nilfs_segment_buffer *segbuf, *prev; __u64 nextnum; - int err; + int err, alloc = 0; - if (list_empty(&sci->sc_segbufs)) { - segbuf = nilfs_segbuf_new(sci->sc_super); - if (unlikely(!segbuf)) - return -ENOMEM; - list_add(&segbuf->sb_list, &sci->sc_segbufs); - } else - segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + segbuf = nilfs_segbuf_new(sci->sc_super); + if (unlikely(!segbuf)) + return -ENOMEM; + + if (list_empty(&sci->sc_write_logs)) { + nilfs_segbuf_map(segbuf, nilfs->ns_segnum, + nilfs->ns_pseg_offset, nilfs); + if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { + nilfs_shift_to_next_segment(nilfs); + nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs); + } - nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset, - nilfs); + segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq; + nextnum = nilfs->ns_nextnum; - if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { - nilfs_shift_to_next_segment(nilfs); - nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs); + if (nilfs->ns_segnum == nilfs->ns_nextnum) + /* Start from the head of a new full segment */ + alloc++; + } else { + /* Continue logs */ + prev = NILFS_LAST_SEGBUF(&sci->sc_write_logs); + nilfs_segbuf_map_cont(segbuf, prev); + segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq; + nextnum = prev->sb_nextnum; + + if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { + nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs); + segbuf->sb_sum.seg_seq++; + alloc++; + } } - sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks; - err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum); - if (unlikely(err)) - return err; + err = nilfs_sufile_mark_dirty(nilfs->ns_sufile, segbuf->sb_segnum); + if (err) + goto failed; - if (nilfs->ns_segnum == nilfs->ns_nextnum) { - /* Start from the head of a new full segment */ + if (alloc) { err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum); - if (unlikely(err)) - return err; - } else - nextnum = nilfs->ns_nextnum; - - segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq; + if (err) + goto failed; + } nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs); - /* truncating segment buffers */ - list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs, - sb_list) { - list_del_init(&segbuf->sb_list); - nilfs_segbuf_free(segbuf); - } + BUG_ON(!list_empty(&sci->sc_segbufs)); + list_add_tail(&segbuf->sb_list, &sci->sc_segbufs); + sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks; return 0; + + failed: + nilfs_segbuf_free(segbuf); + return err; } static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci, struct the_nilfs *nilfs, int nadd) { - struct nilfs_segment_buffer *segbuf, *prev, *n; + struct nilfs_segment_buffer *segbuf, *prev; struct inode *sufile = nilfs->ns_sufile; __u64 nextnextnum; LIST_HEAD(list); @@ -1352,7 +1354,7 @@ static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci, * not be dirty. The following call ensures that the buffer is dirty * and will pin the buffer on memory until the sufile is written. */ - err = nilfs_touch_segusage(sufile, prev->sb_nextnum); + err = nilfs_sufile_mark_dirty(sufile, prev->sb_nextnum); if (unlikely(err)) return err; @@ -1378,33 +1380,33 @@ static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci, list_add_tail(&segbuf->sb_list, &list); prev = segbuf; } - list_splice(&list, sci->sc_segbufs.prev); + list_splice_tail(&list, &sci->sc_segbufs); return 0; failed_segbuf: nilfs_segbuf_free(segbuf); failed: - list_for_each_entry_safe(segbuf, n, &list, sb_list) { + list_for_each_entry(segbuf, &list, sb_list) { ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); WARN_ON(ret); /* never fails */ - list_del_init(&segbuf->sb_list); - nilfs_segbuf_free(segbuf); } + nilfs_destroy_logs(&list); return err; } -static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci, - struct the_nilfs *nilfs) +static void nilfs_free_incomplete_logs(struct list_head *logs, + struct the_nilfs *nilfs) { - struct nilfs_segment_buffer *segbuf; - int ret, done = 0; + struct nilfs_segment_buffer *segbuf, *prev; + struct inode *sufile = nilfs->ns_sufile; + int ret; - segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + segbuf = NILFS_FIRST_SEGBUF(logs); if (nilfs->ns_nextnum != segbuf->sb_nextnum) { - ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum); + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); WARN_ON(ret); /* never fails */ } - if (segbuf->sb_io_error) { + if (atomic_read(&segbuf->sb_err)) { /* Case 1: The first segment failed */ if (segbuf->sb_pseg_start != segbuf->sb_fseg_start) /* Case 1a: Partial segment appended into an existing @@ -1413,106 +1415,54 @@ static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci, segbuf->sb_fseg_end); else /* Case 1b: New full segment */ set_nilfs_discontinued(nilfs); - done++; } - list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { - ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum); - WARN_ON(ret); /* never fails */ - if (!done && segbuf->sb_io_error) { - if (segbuf->sb_segnum != nilfs->ns_nextnum) - /* Case 2: extended segment (!= next) failed */ - nilfs_sufile_set_error(nilfs->ns_sufile, - segbuf->sb_segnum); - done++; - } - } -} - -static void nilfs_segctor_clear_segment_buffers(struct nilfs_sc_info *sci) -{ - struct nilfs_segment_buffer *segbuf; - - list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) - nilfs_segbuf_clear(segbuf); - sci->sc_super_root = NULL; -} - -static void nilfs_segctor_destroy_segment_buffers(struct nilfs_sc_info *sci) -{ - struct nilfs_segment_buffer *segbuf; - - while (!list_empty(&sci->sc_segbufs)) { - segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); - list_del_init(&segbuf->sb_list); - nilfs_segbuf_free(segbuf); - } - /* sci->sc_curseg = NULL; */ -} - -static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci, - struct the_nilfs *nilfs, int err) -{ - if (unlikely(err)) { - nilfs_segctor_free_incomplete_segments(sci, nilfs); - if (sci->sc_stage.flags & NILFS_CF_SUFREED) { - int ret; - - ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile, - sci->sc_freesegs, - sci->sc_nfreesegs, - NULL); - WARN_ON(ret); /* do not happen */ + prev = segbuf; + list_for_each_entry_continue(segbuf, logs, sb_list) { + if (prev->sb_nextnum != segbuf->sb_nextnum) { + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ } + if (atomic_read(&segbuf->sb_err) && + segbuf->sb_segnum != nilfs->ns_nextnum) + /* Case 2: extended segment (!= next) failed */ + nilfs_sufile_set_error(sufile, segbuf->sb_segnum); + prev = segbuf; } - nilfs_segctor_clear_segment_buffers(sci); } static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci, struct inode *sufile) { struct nilfs_segment_buffer *segbuf; - struct buffer_head *bh_su; - struct nilfs_segment_usage *raw_su; unsigned long live_blocks; int ret; list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { - ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, - &raw_su, &bh_su); - WARN_ON(ret); /* always succeed because bh_su is dirty */ live_blocks = segbuf->sb_sum.nblocks + (segbuf->sb_pseg_start - segbuf->sb_fseg_start); - raw_su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime); - raw_su->su_nblocks = cpu_to_le32(live_blocks); - nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, - bh_su); + ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, + live_blocks, + sci->sc_seg_ctime); + WARN_ON(ret); /* always succeed because the segusage is dirty */ } } -static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci, - struct inode *sufile) +static void nilfs_cancel_segusage(struct list_head *logs, struct inode *sufile) { struct nilfs_segment_buffer *segbuf; - struct buffer_head *bh_su; - struct nilfs_segment_usage *raw_su; int ret; - segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); - ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, - &raw_su, &bh_su); - WARN_ON(ret); /* always succeed because bh_su is dirty */ - raw_su->su_nblocks = cpu_to_le32(segbuf->sb_pseg_start - - segbuf->sb_fseg_start); - nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, bh_su); + segbuf = NILFS_FIRST_SEGBUF(logs); + ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, + segbuf->sb_pseg_start - + segbuf->sb_fseg_start, 0); + WARN_ON(ret); /* always succeed because the segusage is dirty */ - list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { - ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, - &raw_su, &bh_su); + list_for_each_entry_continue(segbuf, logs, sb_list) { + ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, + 0, 0); WARN_ON(ret); /* always succeed */ - raw_su->su_nblocks = 0; - nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, - bh_su); } } @@ -1520,17 +1470,15 @@ static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci, struct nilfs_segment_buffer *last, struct inode *sufile) { - struct nilfs_segment_buffer *segbuf = last, *n; + struct nilfs_segment_buffer *segbuf = last; int ret; - list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs, - sb_list) { - list_del_init(&segbuf->sb_list); + list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks; ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); WARN_ON(ret); - nilfs_segbuf_free(segbuf); } + nilfs_truncate_logs(&sci->sc_segbufs, last); } @@ -1569,7 +1517,7 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci, NULL); WARN_ON(err); /* do not happen */ } - nilfs_segctor_clear_segment_buffers(sci); + nilfs_clear_logs(&sci->sc_segbufs); err = nilfs_segctor_extend_segments(sci, nilfs, nadd); if (unlikely(err)) @@ -1814,26 +1762,18 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci, } static int nilfs_segctor_write(struct nilfs_sc_info *sci, - struct backing_dev_info *bdi) + struct the_nilfs *nilfs) { struct nilfs_segment_buffer *segbuf; - struct nilfs_write_info wi; - int err, res; - - wi.sb = sci->sc_super; - wi.bh_sr = sci->sc_super_root; - wi.bdi = bdi; + int ret = 0; list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { - nilfs_segbuf_prepare_write(segbuf, &wi); - err = nilfs_segbuf_write(segbuf, &wi); - - res = nilfs_segbuf_wait(segbuf, &wi); - err = err ? : res; - if (err) - return err; + ret = nilfs_segbuf_write(segbuf, nilfs); + if (ret) + break; } - return 0; + list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs); + return ret; } static void __nilfs_end_page_io(struct page *page, int err) @@ -1911,15 +1851,17 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err) } } -static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci, - struct page *failed_page, int err) +static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page, + struct buffer_head *bh_sr, int err) { struct nilfs_segment_buffer *segbuf; struct page *bd_page = NULL, *fs_page = NULL; + struct buffer_head *bh; - list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { - struct buffer_head *bh; + if (list_empty(logs)) + return; + list_for_each_entry(segbuf, logs, sb_list) { list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { if (bh->b_page != bd_page) { @@ -1931,7 +1873,7 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci, list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - if (bh == sci->sc_super_root) { + if (bh == bh_sr) { if (bh->b_page != bd_page) { end_page_writeback(bd_page); bd_page = bh->b_page; @@ -1941,7 +1883,7 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci, if (bh->b_page != fs_page) { nilfs_end_page_io(fs_page, err); if (fs_page && fs_page == failed_page) - goto done; + return; fs_page = bh->b_page; } } @@ -1950,8 +1892,34 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci, end_page_writeback(bd_page); nilfs_end_page_io(fs_page, err); - done: +} + +static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int err) +{ + LIST_HEAD(logs); + int ret; + + list_splice_tail_init(&sci->sc_write_logs, &logs); + ret = nilfs_wait_on_logs(&logs); + if (ret) + nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret); + + list_splice_tail_init(&sci->sc_segbufs, &logs); + nilfs_cancel_segusage(&logs, nilfs->ns_sufile); + nilfs_free_incomplete_logs(&logs, nilfs); nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err); + + if (sci->sc_stage.flags & NILFS_CF_SUFREED) { + ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile, + sci->sc_freesegs, + sci->sc_nfreesegs, + NULL); + WARN_ON(ret); /* do not happen */ + } + + nilfs_destroy_logs(&logs); + sci->sc_super_root = NULL; } static void nilfs_set_next_segment(struct the_nilfs *nilfs, @@ -1973,7 +1941,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) struct the_nilfs *nilfs = sbi->s_nilfs; int update_sr = (sci->sc_super_root != NULL); - list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { struct buffer_head *bh; list_for_each_entry(bh, &segbuf->sb_segsum_buffers, @@ -2046,7 +2014,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) sci->sc_nblk_inc += sci->sc_nblk_this_inc; - segbuf = NILFS_LAST_SEGBUF(&sci->sc_segbufs); + segbuf = NILFS_LAST_SEGBUF(&sci->sc_write_logs); nilfs_set_next_segment(nilfs, segbuf); if (update_sr) { @@ -2057,10 +2025,23 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); + nilfs_segctor_clear_metadata_dirty(sci); } else clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); } +static int nilfs_segctor_wait(struct nilfs_sc_info *sci) +{ + int ret; + + ret = nilfs_wait_on_logs(&sci->sc_write_logs); + if (!ret) { + nilfs_segctor_complete_write(sci); + nilfs_destroy_logs(&sci->sc_write_logs); + } + return ret; +} + static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci, struct nilfs_sb_info *sbi) { @@ -2173,7 +2154,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) /* Avoid empty segment */ if (sci->sc_stage.scnt == NILFS_ST_DONE && NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) { - nilfs_segctor_end_construction(sci, nilfs, 1); + nilfs_segctor_abort_construction(sci, nilfs, 1); goto out; } @@ -2187,7 +2168,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) if (has_sr) { err = nilfs_segctor_fill_in_checkpoint(sci); if (unlikely(err)) - goto failed_to_make_up; + goto failed_to_write; nilfs_segctor_fill_in_super_root(sci, nilfs); } @@ -2195,42 +2176,46 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) /* Write partial segments */ err = nilfs_segctor_prepare_write(sci, &failed_page); - if (unlikely(err)) + if (err) { + nilfs_abort_logs(&sci->sc_segbufs, failed_page, + sci->sc_super_root, err); goto failed_to_write; - + } nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed); - err = nilfs_segctor_write(sci, nilfs->ns_bdi); + err = nilfs_segctor_write(sci, nilfs); if (unlikely(err)) goto failed_to_write; - nilfs_segctor_complete_write(sci); - - /* Commit segments */ - if (has_sr) - nilfs_segctor_clear_metadata_dirty(sci); - - nilfs_segctor_end_construction(sci, nilfs, 0); - + if (sci->sc_stage.scnt == NILFS_ST_DONE || + nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) { + /* + * At this point, we avoid double buffering + * for blocksize < pagesize because page dirty + * flag is turned off during write and dirty + * buffers are not properly collected for + * pages crossing over segments. + */ + err = nilfs_segctor_wait(sci); + if (err) + goto failed_to_write; + } } while (sci->sc_stage.scnt != NILFS_ST_DONE); + sci->sc_super_root = NULL; + out: - nilfs_segctor_destroy_segment_buffers(sci); nilfs_segctor_check_out_files(sci, sbi); return err; failed_to_write: - nilfs_segctor_abort_write(sci, failed_page, err); - nilfs_segctor_cancel_segusage(sci, nilfs->ns_sufile); - - failed_to_make_up: if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) nilfs_redirty_inodes(&sci->sc_dirty_files); failed: if (nilfs_doing_gc()) nilfs_redirty_inodes(&sci->sc_gc_inodes); - nilfs_segctor_end_construction(sci, nilfs, err); + nilfs_segctor_abort_construction(sci, nilfs, err); goto out; } @@ -2559,7 +2544,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, sci->sc_freesegs = kbufs[4]; sci->sc_nfreesegs = argv[4].v_nmembs; - list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev); + list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes); for (;;) { nilfs_segctor_accept(sci, &req); @@ -2788,6 +2773,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi) spin_lock_init(&sci->sc_state_lock); INIT_LIST_HEAD(&sci->sc_dirty_files); INIT_LIST_HEAD(&sci->sc_segbufs); + INIT_LIST_HEAD(&sci->sc_write_logs); INIT_LIST_HEAD(&sci->sc_gc_inodes); INIT_LIST_HEAD(&sci->sc_copied_buffers); @@ -2855,6 +2841,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) } WARN_ON(!list_empty(&sci->sc_segbufs)); + WARN_ON(!list_empty(&sci->sc_write_logs)); down_write(&sbi->s_nilfs->ns_segctor_sem); diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 0d2a475a741..3d3ab2f9864 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -97,6 +97,7 @@ struct nilfs_segsum_pointer { * @sc_dsync_start: start byte offset of data pages * @sc_dsync_end: end byte offset of data pages (inclusive) * @sc_segbufs: List of segment buffers + * @sc_write_logs: List of segment buffers to hold logs under writing * @sc_segbuf_nblocks: Number of available blocks in segment buffers. * @sc_curseg: Current segment buffer * @sc_super_root: Pointer to the super root buffer @@ -143,6 +144,7 @@ struct nilfs_sc_info { /* Segment buffers */ struct list_head sc_segbufs; + struct list_head sc_write_logs; unsigned long sc_segbuf_nblocks; struct nilfs_segment_buffer *sc_curseg; struct buffer_head *sc_super_root; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 37994d4a59c..b6c36d0cc33 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -31,6 +31,16 @@ #include "sufile.h" +struct nilfs_sufile_info { + struct nilfs_mdt_info mi; + unsigned long ncleansegs; +}; + +static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile) +{ + return (struct nilfs_sufile_info *)NILFS_MDT(sufile); +} + static inline unsigned long nilfs_sufile_segment_usages_per_block(const struct inode *sufile) { @@ -62,14 +72,6 @@ nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr, max - curr + 1); } -static inline struct nilfs_sufile_header * -nilfs_sufile_block_get_header(const struct inode *sufile, - struct buffer_head *bh, - void *kaddr) -{ - return kaddr + bh_offset(bh); -} - static struct nilfs_segment_usage * nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum, struct buffer_head *bh, void *kaddr) @@ -110,6 +112,15 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, } /** + * nilfs_sufile_get_ncleansegs - return the number of clean segments + * @sufile: inode of segment usage file + */ +unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile) +{ + return NILFS_SUI(sufile)->ncleansegs; +} + +/** * nilfs_sufile_updatev - modify multiple segment usages at a time * @sufile: inode of segment usage file * @segnumv: array of segment numbers @@ -270,7 +281,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) if (ret < 0) goto out_sem; kaddr = kmap_atomic(header_bh->b_page, KM_USER0); - header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); + header = kaddr + bh_offset(header_bh); ncleansegs = le64_to_cpu(header->sh_ncleansegs); last_alloc = le64_to_cpu(header->sh_last_alloc); kunmap_atomic(kaddr, KM_USER0); @@ -302,13 +313,13 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) kunmap_atomic(kaddr, KM_USER0); kaddr = kmap_atomic(header_bh->b_page, KM_USER0); - header = nilfs_sufile_block_get_header( - sufile, header_bh, kaddr); + header = kaddr + bh_offset(header_bh); le64_add_cpu(&header->sh_ncleansegs, -1); le64_add_cpu(&header->sh_ndirtysegs, 1); header->sh_last_alloc = cpu_to_le64(segnum); kunmap_atomic(kaddr, KM_USER0); + NILFS_SUI(sufile)->ncleansegs--; nilfs_mdt_mark_buffer_dirty(header_bh); nilfs_mdt_mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); @@ -351,6 +362,8 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, kunmap_atomic(kaddr, KM_USER0); nilfs_sufile_mod_counter(header_bh, -1, 1); + NILFS_SUI(sufile)->ncleansegs--; + nilfs_mdt_mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } @@ -380,6 +393,8 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, kunmap_atomic(kaddr, KM_USER0); nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); + NILFS_SUI(sufile)->ncleansegs -= clean; + nilfs_mdt_mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } @@ -409,79 +424,65 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, nilfs_mdt_mark_buffer_dirty(su_bh); nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); + NILFS_SUI(sufile)->ncleansegs++; + nilfs_mdt_mark_dirty(sufile); } /** - * nilfs_sufile_get_segment_usage - get a segment usage + * nilfs_sufile_mark_dirty - mark the buffer having a segment usage dirty * @sufile: inode of segment usage file * @segnum: segment number - * @sup: pointer to segment usage - * @bhp: pointer to buffer head - * - * Description: nilfs_sufile_get_segment_usage() acquires the segment usage - * specified by @segnum. - * - * Return Value: On success, 0 is returned, and the segment usage and the - * buffer head of the buffer on which the segment usage is located are stored - * in the place pointed by @sup and @bhp, respectively. On error, one of the - * following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EINVAL - Invalid segment usage number. */ -int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum, - struct nilfs_segment_usage **sup, - struct buffer_head **bhp) +int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) { struct buffer_head *bh; - struct nilfs_segment_usage *su; - void *kaddr; int ret; - /* segnum is 0 origin */ - if (segnum >= nilfs_sufile_get_nsegments(sufile)) - return -EINVAL; - down_write(&NILFS_MDT(sufile)->mi_sem); - ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh); - if (ret < 0) - goto out_sem; - kaddr = kmap(bh->b_page); - su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); - if (nilfs_segment_usage_error(su)) { - kunmap(bh->b_page); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); + if (!ret) { + nilfs_mdt_mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(sufile); brelse(bh); - ret = -EINVAL; - goto out_sem; } - - if (sup != NULL) - *sup = su; - *bhp = bh; - - out_sem: - up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** - * nilfs_sufile_put_segment_usage - put a segment usage + * nilfs_sufile_set_segment_usage - set usage of a segment * @sufile: inode of segment usage file * @segnum: segment number - * @bh: buffer head - * - * Description: nilfs_sufile_put_segment_usage() releases the segment usage - * specified by @segnum. @bh must be the buffer head which have been returned - * by a previous call to nilfs_sufile_get_segment_usage() with @segnum. + * @nblocks: number of live blocks in the segment + * @modtime: modification time (option) */ -void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum, - struct buffer_head *bh) +int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, + unsigned long nblocks, time_t modtime) { - kunmap(bh->b_page); + struct buffer_head *bh; + struct nilfs_segment_usage *su; + void *kaddr; + int ret; + + down_write(&NILFS_MDT(sufile)->mi_sem); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); + if (ret < 0) + goto out_sem; + + kaddr = kmap_atomic(bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + WARN_ON(nilfs_segment_usage_error(su)); + if (modtime) + su->su_lastmod = cpu_to_le64(modtime); + su->su_nblocks = cpu_to_le32(nblocks); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(sufile); brelse(bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; } /** @@ -515,7 +516,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) goto out_sem; kaddr = kmap_atomic(header_bh->b_page, KM_USER0); - header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); + header = kaddr + bh_offset(header_bh); sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs); @@ -532,33 +533,6 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) return ret; } -/** - * nilfs_sufile_get_ncleansegs - get the number of clean segments - * @sufile: inode of segment usage file - * @nsegsp: pointer to the number of clean segments - * - * Description: nilfs_sufile_get_ncleansegs() acquires the number of clean - * segments. - * - * Return Value: On success, 0 is returned and the number of clean segments is - * stored in the place pointed by @nsegsp. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - */ -int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp) -{ - struct nilfs_sustat sustat; - int ret; - - ret = nilfs_sufile_get_stat(sufile, &sustat); - if (ret == 0) - *nsegsp = sustat.ss_ncleansegs; - return ret; -} - void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, struct buffer_head *header_bh, struct buffer_head *su_bh) @@ -577,8 +551,10 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, nilfs_segment_usage_set_error(su); kunmap_atomic(kaddr, KM_USER0); - if (suclean) + if (suclean) { nilfs_sufile_mod_counter(header_bh, -1, 0); + NILFS_SUI(sufile)->ncleansegs--; + } nilfs_mdt_mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } @@ -657,3 +633,48 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, up_read(&NILFS_MDT(sufile)->mi_sem); return ret; } + +/** + * nilfs_sufile_read - read sufile inode + * @sufile: sufile inode + * @raw_inode: on-disk sufile inode + */ +int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode) +{ + struct nilfs_sufile_info *sui = NILFS_SUI(sufile); + struct buffer_head *header_bh; + struct nilfs_sufile_header *header; + void *kaddr; + int ret; + + ret = nilfs_read_inode_common(sufile, raw_inode); + if (ret < 0) + return ret; + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (!ret) { + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs); + kunmap_atomic(kaddr, KM_USER0); + brelse(header_bh); + } + return ret; +} + +/** + * nilfs_sufile_new - create sufile + * @nilfs: nilfs object + * @susize: size of a segment usage entry + */ +struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize) +{ + struct inode *sufile; + + sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO, + sizeof(struct nilfs_sufile_info)); + if (sufile) + nilfs_mdt_set_entry_size(sufile, susize, + sizeof(struct nilfs_sufile_header)); + return sufile; +} diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index 0e99e5c0bd0..15163b8aff7 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -34,14 +34,13 @@ static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments; } +unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile); + int nilfs_sufile_alloc(struct inode *, __u64 *); -int nilfs_sufile_get_segment_usage(struct inode *, __u64, - struct nilfs_segment_usage **, - struct buffer_head **); -void nilfs_sufile_put_segment_usage(struct inode *, __u64, - struct buffer_head *); +int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum); +int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, + unsigned long nblocks, time_t modtime); int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); -int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *); ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned, size_t); @@ -62,6 +61,9 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *, void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, struct buffer_head *); +int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode); +struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize); + /** * nilfs_sufile_scrap - make a segment garbage * @sufile: inode of segment usage file diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 644e66727dd..5403b3ef3a4 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -363,14 +363,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) list_add(&sbi->s_list, &nilfs->ns_supers); up_write(&nilfs->ns_super_sem); - sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO); + sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size); if (!sbi->s_ifile) return -ENOMEM; - err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size); - if (unlikely(err)) - goto failed; - down_read(&nilfs->ns_segctor_sem); err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, &bh_cp); @@ -411,7 +407,6 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) { struct the_nilfs *nilfs = sbi->s_nilfs; - nilfs_mdt_clear(sbi->s_ifile); nilfs_mdt_destroy(sbi->s_ifile); sbi->s_ifile = NULL; down_write(&nilfs->ns_super_sem); @@ -419,22 +414,6 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) up_write(&nilfs->ns_super_sem); } -static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi) -{ - struct the_nilfs *nilfs = sbi->s_nilfs; - int err = 0; - - down_write(&nilfs->ns_sem); - if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) { - nilfs->ns_mount_state |= NILFS_VALID_FS; - err = nilfs_commit_super(sbi, 1); - if (likely(!err)) - printk(KERN_INFO "NILFS: recovery complete.\n"); - } - up_write(&nilfs->ns_sem); - return err; -} - static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -490,7 +469,7 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) struct nilfs_sb_info *sbi = NILFS_SB(sb); if (!nilfs_test_opt(sbi, BARRIER)) - seq_printf(seq, ",barrier=off"); + seq_printf(seq, ",nobarrier"); if (nilfs_test_opt(sbi, SNAPSHOT)) seq_printf(seq, ",cp=%llu", (unsigned long long int)sbi->s_snapshot_cno); @@ -500,6 +479,8 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",errors=panic"); if (nilfs_test_opt(sbi, STRICT_ORDER)) seq_printf(seq, ",order=strict"); + if (nilfs_test_opt(sbi, NORECOVERY)) + seq_printf(seq, ",norecovery"); return 0; } @@ -568,7 +549,7 @@ static const struct export_operations nilfs_export_ops = { enum { Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_barrier, Opt_snapshot, Opt_order, + Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, Opt_err, }; @@ -576,25 +557,13 @@ static match_table_t tokens = { {Opt_err_cont, "errors=continue"}, {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, - {Opt_barrier, "barrier=%s"}, + {Opt_nobarrier, "nobarrier"}, {Opt_snapshot, "cp=%u"}, {Opt_order, "order=%s"}, + {Opt_norecovery, "norecovery"}, {Opt_err, NULL} }; -static int match_bool(substring_t *s, int *result) -{ - int len = s->to - s->from; - - if (strncmp(s->from, "on", len) == 0) - *result = 1; - else if (strncmp(s->from, "off", len) == 0) - *result = 0; - else - return 1; - return 0; -} - static int parse_options(char *options, struct super_block *sb) { struct nilfs_sb_info *sbi = NILFS_SB(sb); @@ -612,13 +581,8 @@ static int parse_options(char *options, struct super_block *sb) token = match_token(p, tokens, args); switch (token) { - case Opt_barrier: - if (match_bool(&args[0], &option)) - return 0; - if (option) - nilfs_set_opt(sbi, BARRIER); - else - nilfs_clear_opt(sbi, BARRIER); + case Opt_nobarrier: + nilfs_clear_opt(sbi, BARRIER); break; case Opt_order: if (strcmp(args[0].from, "relaxed") == 0) @@ -647,6 +611,9 @@ static int parse_options(char *options, struct super_block *sb) sbi->s_snapshot_cno = option; nilfs_set_opt(sbi, SNAPSHOT); break; + case Opt_norecovery: + nilfs_set_opt(sbi, NORECOVERY); + break; default: printk(KERN_ERR "NILFS: Unrecognized mount option \"%s\"\n", p); @@ -672,9 +639,7 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi) int mnt_count = le16_to_cpu(sbp->s_mnt_count); /* nilfs->sem must be locked by the caller. */ - if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) { - printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n"); - } else if (nilfs->ns_mount_state & NILFS_ERROR_FS) { + if (nilfs->ns_mount_state & NILFS_ERROR_FS) { printk(KERN_WARNING "NILFS warning: mounting fs with errors\n"); #if 0 @@ -782,11 +747,10 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, sb->s_root = NULL; sb->s_time_gran = 1; - if (!nilfs_loaded(nilfs)) { - err = load_nilfs(nilfs, sbi); - if (err) - goto failed_sbi; - } + err = load_nilfs(nilfs, sbi); + if (err) + goto failed_sbi; + cno = nilfs_last_cno(nilfs); if (sb->s_flags & MS_RDONLY) { @@ -854,12 +818,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, up_write(&nilfs->ns_sem); } - err = nilfs_mark_recovery_complete(sbi); - if (unlikely(err)) { - printk(KERN_ERR "NILFS: recovery failed.\n"); - goto failed_root; - } - down_write(&nilfs->ns_super_sem); if (!nilfs_test_opt(sbi, SNAPSHOT)) nilfs->ns_current = sbi; @@ -867,10 +825,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, return 0; - failed_root: - dput(sb->s_root); - sb->s_root = NULL; - failed_segctor: nilfs_detach_segment_constructor(sbi); @@ -915,6 +869,14 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if (!nilfs_valid_fs(nilfs)) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount because the filesystem is in an " + "incomplete recovery state.\n", sb->s_id); + err = -EINVAL; + goto restore_opts; + } + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) goto out; if (*flags & MS_RDONLY) { diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index ad391a8c3e7..6241e1722ef 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -146,13 +146,9 @@ void put_nilfs(struct the_nilfs *nilfs) might_sleep(); if (nilfs_loaded(nilfs)) { - nilfs_mdt_clear(nilfs->ns_sufile); nilfs_mdt_destroy(nilfs->ns_sufile); - nilfs_mdt_clear(nilfs->ns_cpfile); nilfs_mdt_destroy(nilfs->ns_cpfile); - nilfs_mdt_clear(nilfs->ns_dat); nilfs_mdt_destroy(nilfs->ns_dat); - /* XXX: how and when to clear nilfs->ns_gc_dat? */ nilfs_mdt_destroy(nilfs->ns_gc_dat); } if (nilfs_init(nilfs)) { @@ -166,7 +162,6 @@ void put_nilfs(struct the_nilfs *nilfs) static int nilfs_load_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, sector_t sr_block) { - static struct lock_class_key dat_lock_key; struct buffer_head *bh_sr; struct nilfs_super_root *raw_sr; struct nilfs_super_block **sbp = nilfs->ns_sbp; @@ -187,51 +182,36 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, inode_size = nilfs->ns_inode_size; err = -ENOMEM; - nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO); + nilfs->ns_dat = nilfs_dat_new(nilfs, dat_entry_size); if (unlikely(!nilfs->ns_dat)) goto failed; - nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO); + nilfs->ns_gc_dat = nilfs_dat_new(nilfs, dat_entry_size); if (unlikely(!nilfs->ns_gc_dat)) goto failed_dat; - nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO); + nilfs->ns_cpfile = nilfs_cpfile_new(nilfs, checkpoint_size); if (unlikely(!nilfs->ns_cpfile)) goto failed_gc_dat; - nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO); + nilfs->ns_sufile = nilfs_sufile_new(nilfs, segment_usage_size); if (unlikely(!nilfs->ns_sufile)) goto failed_cpfile; - err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size); - if (unlikely(err)) - goto failed_sufile; - - err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size); - if (unlikely(err)) - goto failed_sufile; - - lockdep_set_class(&NILFS_MDT(nilfs->ns_dat)->mi_sem, &dat_lock_key); - lockdep_set_class(&NILFS_MDT(nilfs->ns_gc_dat)->mi_sem, &dat_lock_key); - nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat); - nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size, - sizeof(struct nilfs_cpfile_header)); - nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size, - sizeof(struct nilfs_sufile_header)); - err = nilfs_mdt_read_inode_direct( - nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size)); + err = nilfs_dat_read(nilfs->ns_dat, (void *)bh_sr->b_data + + NILFS_SR_DAT_OFFSET(inode_size)); if (unlikely(err)) goto failed_sufile; - err = nilfs_mdt_read_inode_direct( - nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size)); + err = nilfs_cpfile_read(nilfs->ns_cpfile, (void *)bh_sr->b_data + + NILFS_SR_CPFILE_OFFSET(inode_size)); if (unlikely(err)) goto failed_sufile; - err = nilfs_mdt_read_inode_direct( - nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size)); + err = nilfs_sufile_read(nilfs->ns_sufile, (void *)bh_sr->b_data + + NILFS_SR_SUFILE_OFFSET(inode_size)); if (unlikely(err)) goto failed_sufile; @@ -281,29 +261,30 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) struct nilfs_recovery_info ri; unsigned int s_flags = sbi->s_super->s_flags; int really_read_only = bdev_read_only(nilfs->ns_bdev); - unsigned valid_fs; - int err = 0; - - nilfs_init_recovery_info(&ri); + int valid_fs = nilfs_valid_fs(nilfs); + int err; - down_write(&nilfs->ns_sem); - valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS); - up_write(&nilfs->ns_sem); + if (nilfs_loaded(nilfs)) { + if (valid_fs || + ((s_flags & MS_RDONLY) && nilfs_test_opt(sbi, NORECOVERY))) + return 0; + printk(KERN_ERR "NILFS: the filesystem is in an incomplete " + "recovery state.\n"); + return -EINVAL; + } - if (!valid_fs && (s_flags & MS_RDONLY)) { - printk(KERN_INFO "NILFS: INFO: recovery " - "required for readonly filesystem.\n"); - if (really_read_only) { - printk(KERN_ERR "NILFS: write access " - "unavailable, cannot proceed.\n"); - err = -EROFS; - goto failed; + if (!valid_fs) { + printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n"); + if (s_flags & MS_RDONLY) { + printk(KERN_INFO "NILFS: INFO: recovery " + "required for readonly filesystem.\n"); + printk(KERN_INFO "NILFS: write access will " + "be enabled during recovery.\n"); } - printk(KERN_INFO "NILFS: write access will " - "be enabled during recovery.\n"); - sbi->s_super->s_flags &= ~MS_RDONLY; } + nilfs_init_recovery_info(&ri); + err = nilfs_search_super_root(nilfs, sbi, &ri); if (unlikely(err)) { printk(KERN_ERR "NILFS: error searching super root.\n"); @@ -316,19 +297,56 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) goto failed; } - if (!valid_fs) { - err = nilfs_recover_logical_segments(nilfs, sbi, &ri); - if (unlikely(err)) { - nilfs_mdt_destroy(nilfs->ns_cpfile); - nilfs_mdt_destroy(nilfs->ns_sufile); - nilfs_mdt_destroy(nilfs->ns_dat); - goto failed; + if (valid_fs) + goto skip_recovery; + + if (s_flags & MS_RDONLY) { + if (nilfs_test_opt(sbi, NORECOVERY)) { + printk(KERN_INFO "NILFS: norecovery option specified. " + "skipping roll-forward recovery\n"); + goto skip_recovery; } - if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED) - sbi->s_super->s_dirt = 1; + if (really_read_only) { + printk(KERN_ERR "NILFS: write access " + "unavailable, cannot proceed.\n"); + err = -EROFS; + goto failed_unload; + } + sbi->s_super->s_flags &= ~MS_RDONLY; + } else if (nilfs_test_opt(sbi, NORECOVERY)) { + printk(KERN_ERR "NILFS: recovery cancelled because norecovery " + "option was specified for a read/write mount\n"); + err = -EINVAL; + goto failed_unload; } + err = nilfs_recover_logical_segments(nilfs, sbi, &ri); + if (err) + goto failed_unload; + + down_write(&nilfs->ns_sem); + nilfs->ns_mount_state |= NILFS_VALID_FS; + nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); + err = nilfs_commit_super(sbi, 1); + up_write(&nilfs->ns_sem); + + if (err) { + printk(KERN_ERR "NILFS: failed to update super block. " + "recovery unfinished.\n"); + goto failed_unload; + } + printk(KERN_INFO "NILFS: recovery complete.\n"); + + skip_recovery: set_nilfs_loaded(nilfs); + nilfs_clear_recovery_info(&ri); + sbi->s_super->s_flags = s_flags; + return 0; + + failed_unload: + nilfs_mdt_destroy(nilfs->ns_cpfile); + nilfs_mdt_destroy(nilfs->ns_sufile); + nilfs_mdt_destroy(nilfs->ns_dat); failed: nilfs_clear_recovery_info(&ri); @@ -632,30 +650,23 @@ int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) { struct inode *dat = nilfs_dat_inode(nilfs); unsigned long ncleansegs; - int err; down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ - err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs); + ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ - if (likely(!err)) - *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; - return err; + *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; + return 0; } int nilfs_near_disk_full(struct the_nilfs *nilfs) { - struct inode *sufile = nilfs->ns_sufile; unsigned long ncleansegs, nincsegs; - int ret; - ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs); - if (likely(!ret)) { - nincsegs = atomic_read(&nilfs->ns_ndirtyblks) / - nilfs->ns_blocks_per_segment + 1; - if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs) - ret++; - } - return ret; + ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); + nincsegs = atomic_read(&nilfs->ns_ndirtyblks) / + nilfs->ns_blocks_per_segment + 1; + + return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs; } /** diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 20abd55881e..589786e3346 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -258,6 +258,16 @@ static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi) kfree(sbi); } +static inline int nilfs_valid_fs(struct the_nilfs *nilfs) +{ + unsigned valid_fs; + + down_read(&nilfs->ns_sem); + valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS); + up_read(&nilfs->ns_sem); + return valid_fs; +} + static inline void nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, sector_t *seg_start, sector_t *seg_end) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index de059f49058..3d30a1c974a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2006,7 +2006,7 @@ out_dio: /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); - if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { + if ((file->f_flags & O_DSYNC && !direct_io) || IS_SYNC(inode)) { ret = filemap_fdatawrite_range(file->f_mapping, pos, pos + count - 1); if (ret < 0) diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index e5df9d170b0..123bc520a2c 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -17,10 +17,6 @@ #include "ocfs2.h" -/* Common stuff */ -/* id number of quota format */ -#define QFMT_OCFS2 3 - /* * In-memory structures */ diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 1a2c50a759f..21f9e71223c 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -1325,7 +1325,7 @@ out: return status; } -static struct quota_format_ops ocfs2_format_ops = { +static const struct quota_format_ops ocfs2_format_ops = { .check_quota_file = ocfs2_local_check_quota_file, .read_file_info = ocfs2_local_read_info, .write_file_info = ocfs2_global_write_info, diff --git a/fs/proc/base.c b/fs/proc/base.c index af643b5aefe..4df4a464a91 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1265,6 +1265,72 @@ static const struct file_operations proc_pid_sched_operations = { #endif +static ssize_t comm_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + char buffer[TASK_COMM_LEN]; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (same_thread_group(current, p)) + set_task_comm(p, buffer); + else + count = -EINVAL; + + put_task_struct(p); + + return count; +} + +static int comm_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + task_lock(p); + seq_printf(m, "%s\n", p->comm); + task_unlock(p); + + put_task_struct(p); + + return 0; +} + +static int comm_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = single_open(filp, comm_show, NULL); + if (!ret) { + struct seq_file *m = filp->private_data; + + m->private = inode; + } + return ret; +} + +static const struct file_operations proc_pid_set_comm_operations = { + .open = comm_open, + .read = seq_read, + .write = comm_write, + .llseek = seq_lseek, + .release = single_release, +}; + /* * We added or removed a vma mapping the executable. The vmas are only mapped * during exec and are not mapped with the mmap system call. @@ -2504,6 +2570,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif + REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK INF("syscall", S_IRUSR, proc_pid_syscall), #endif @@ -2838,6 +2905,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif + REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK INF("syscall", S_IRUSR, proc_pid_syscall), #endif diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 7ba79a54948..123257bb356 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -7,6 +7,7 @@ #include <linux/init.h> #include <linux/time.h> #include <linux/proc_fs.h> +#include <linux/seq_file.h> #include <linux/stat.h> #include <linux/string.h> #include <asm/prom.h> @@ -25,26 +26,27 @@ static struct proc_dir_entry *proc_device_tree; /* * Supply data on a read from /proc/device-tree/node/property. */ -static int property_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int property_proc_show(struct seq_file *m, void *v) { - struct property *pp = data; - int n; + struct property *pp = m->private; - if (off >= pp->length) { - *eof = 1; - return 0; - } - n = pp->length - off; - if (n > count) - n = count; - else - *eof = 1; - memcpy(page, (char *)pp->value + off, n); - *start = page; - return n; + seq_write(m, pp->value, pp->length); + return 0; +} + +static int property_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, property_proc_show, PDE(inode)->data); } +static const struct file_operations property_proc_fops = { + .owner = THIS_MODULE, + .open = property_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /* * For a node with a name like "gc@10", we make symlinks called "gc" * and "@10" to it. @@ -63,10 +65,9 @@ __proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp, * Unfortunately proc_register puts each new entry * at the beginning of the list. So we rearrange them. */ - ent = create_proc_read_entry(name, - strncmp(name, "security-", 9) - ? S_IRUGO : S_IRUSR, de, - property_read_proc, pp); + ent = proc_create_data(name, + strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR, + de, &property_proc_fops, pp); if (ent == NULL) return NULL; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2a1bef9203c..47c03f4336b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -650,6 +650,50 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return err; } +static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) +{ + u64 pme = 0; + if (pte_present(pte)) + pme = PM_PFRAME(pte_pfn(pte) + offset) + | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; + return pme; +} + +static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma; + struct pagemapread *pm = walk->private; + struct hstate *hs = NULL; + int err = 0; + + vma = find_vma(walk->mm, addr); + if (vma) + hs = hstate_vma(vma); + for (; addr != end; addr += PAGE_SIZE) { + u64 pfn = PM_NOT_PRESENT; + + if (vma && (addr >= vma->vm_end)) { + vma = find_vma(walk->mm, addr); + if (vma) + hs = hstate_vma(vma); + } + + if (vma && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) { + /* calculate pfn of the "raw" page in the hugepage. */ + int offset = (addr & ~huge_page_mask(hs)) >> PAGE_SHIFT; + pfn = huge_pte_to_pagemap_entry(*pte, offset); + } + err = add_to_pagemap(addr, pfn, pm); + if (err) + return err; + } + + cond_resched(); + + return err; +} + /* * /proc/pid/pagemap - an array mapping virtual pages to pfns * @@ -742,6 +786,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, pagemap_walk.pmd_entry = pagemap_pte_range; pagemap_walk.pte_hole = pagemap_pte_hole; + pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; pagemap_walk.mm = mm; pagemap_walk.private = ± diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 8f5c05d3dbd..5d9fd64ef81 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -110,9 +110,13 @@ int task_statm(struct mm_struct *mm, int *shared, int *text, } } - size += (*text = mm->end_code - mm->start_code); - size += (*data = mm->start_stack - mm->start_data); + *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) + >> PAGE_SHIFT; + *data = (PAGE_ALIGN(mm->start_stack) - (mm->start_data & PAGE_MASK)) + >> PAGE_SHIFT; up_read(&mm->mmap_sem); + size >>= PAGE_SHIFT; + size += *text + *data; *resident = size; return size; } diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index 353e78a9ebe..efc02ebb8c7 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -46,12 +46,14 @@ config QFMT_V1 format say Y here. config QFMT_V2 - tristate "Quota format v2 support" + tristate "Quota format vfsv0 and vfsv1 support" depends on QUOTA select QUOTA_TREE help - This quota format allows using quotas with 32-bit UIDs/GIDs. If you - need this functionality say Y here. + This config option enables kernel support for vfsv0 and vfsv1 quota + formats. Both these formats support 32-bit UIDs/GIDs and vfsv1 format + also supports 64-bit inode and block quota limits. If you need this + functionality say Y here. config QUOTACTL bool diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index eb5a755718f..cd6bb9a33c1 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2164,7 +2164,9 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name, struct dentry *dentry; int error; + mutex_lock(&sb->s_root->d_inode->i_mutex); dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); + mutex_unlock(&sb->s_root->d_inode->i_mutex); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c index 0edcf42b177..2ae757e9c00 100644 --- a/fs/quota/quota_v1.c +++ b/fs/quota/quota_v1.c @@ -204,7 +204,7 @@ out: return ret; } -static struct quota_format_ops v1_format_ops = { +static const struct quota_format_ops v1_format_ops = { .check_quota_file = v1_check_quota_file, .read_file_info = v1_read_file_info, .write_file_info = v1_write_file_info, diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index a5475fb1ae4..3dfc23e0213 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -23,14 +23,23 @@ MODULE_LICENSE("GPL"); #define __QUOTA_V2_PARANOIA -static void v2_mem2diskdqb(void *dp, struct dquot *dquot); -static void v2_disk2memdqb(struct dquot *dquot, void *dp); -static int v2_is_id(void *dp, struct dquot *dquot); - -static struct qtree_fmt_operations v2_qtree_ops = { - .mem2disk_dqblk = v2_mem2diskdqb, - .disk2mem_dqblk = v2_disk2memdqb, - .is_id = v2_is_id, +static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot); +static void v2r0_disk2memdqb(struct dquot *dquot, void *dp); +static int v2r0_is_id(void *dp, struct dquot *dquot); +static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot); +static void v2r1_disk2memdqb(struct dquot *dquot, void *dp); +static int v2r1_is_id(void *dp, struct dquot *dquot); + +static struct qtree_fmt_operations v2r0_qtree_ops = { + .mem2disk_dqblk = v2r0_mem2diskdqb, + .disk2mem_dqblk = v2r0_disk2memdqb, + .is_id = v2r0_is_id, +}; + +static struct qtree_fmt_operations v2r1_qtree_ops = { + .mem2disk_dqblk = v2r1_mem2diskdqb, + .disk2mem_dqblk = v2r1_disk2memdqb, + .is_id = v2r1_is_id, }; #define QUOTABLOCK_BITS 10 @@ -46,23 +55,33 @@ static inline qsize_t v2_qbtos(qsize_t blocks) return blocks << QUOTABLOCK_BITS; } +static int v2_read_header(struct super_block *sb, int type, + struct v2_disk_dqheader *dqhead) +{ + ssize_t size; + + size = sb->s_op->quota_read(sb, type, (char *)dqhead, + sizeof(struct v2_disk_dqheader), 0); + if (size != sizeof(struct v2_disk_dqheader)) { + printk(KERN_WARNING "quota_v2: Failed header read:" + " expected=%zd got=%zd\n", + sizeof(struct v2_disk_dqheader), size); + return 0; + } + return 1; +} + /* Check whether given file is really vfsv0 quotafile */ static int v2_check_quota_file(struct super_block *sb, int type) { struct v2_disk_dqheader dqhead; - ssize_t size; static const uint quota_magics[] = V2_INITQMAGICS; static const uint quota_versions[] = V2_INITQVERSIONS; - size = sb->s_op->quota_read(sb, type, (char *)&dqhead, - sizeof(struct v2_disk_dqheader), 0); - if (size != sizeof(struct v2_disk_dqheader)) { - printk("quota_v2: failed read expected=%zd got=%zd\n", - sizeof(struct v2_disk_dqheader), size); + if (!v2_read_header(sb, type, &dqhead)) return 0; - } if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || - le32_to_cpu(dqhead.dqh_version) != quota_versions[type]) + le32_to_cpu(dqhead.dqh_version) > quota_versions[type]) return 0; return 1; } @@ -71,14 +90,20 @@ static int v2_check_quota_file(struct super_block *sb, int type) static int v2_read_file_info(struct super_block *sb, int type) { struct v2_disk_dqinfo dinfo; + struct v2_disk_dqheader dqhead; struct mem_dqinfo *info = sb_dqinfo(sb, type); struct qtree_mem_dqinfo *qinfo; ssize_t size; + unsigned int version; + + if (!v2_read_header(sb, type, &dqhead)) + return 0; + version = le32_to_cpu(dqhead.dqh_version); size = sb->s_op->quota_read(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); if (size != sizeof(struct v2_disk_dqinfo)) { - printk(KERN_WARNING "Can't read info structure on device %s.\n", + printk(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n", sb->s_id); return -1; } @@ -89,9 +114,15 @@ static int v2_read_file_info(struct super_block *sb, int type) return -1; } qinfo = info->dqi_priv; - /* limits are stored as unsigned 32-bit data */ - info->dqi_maxblimit = 0xffffffff; - info->dqi_maxilimit = 0xffffffff; + if (version == 0) { + /* limits are stored as unsigned 32-bit data */ + info->dqi_maxblimit = 0xffffffff; + info->dqi_maxilimit = 0xffffffff; + } else { + /* used space is stored as unsigned 64-bit value */ + info->dqi_maxblimit = 0xffffffffffffffff; /* 2^64-1 */ + info->dqi_maxilimit = 0xffffffffffffffff; + } info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); info->dqi_flags = le32_to_cpu(dinfo.dqi_flags); @@ -103,8 +134,13 @@ static int v2_read_file_info(struct super_block *sb, int type) qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS; qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS; qinfo->dqi_qtree_depth = qtree_depth(qinfo); - qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk); - qinfo->dqi_ops = &v2_qtree_ops; + if (version == 0) { + qinfo->dqi_entry_size = sizeof(struct v2r0_disk_dqblk); + qinfo->dqi_ops = &v2r0_qtree_ops; + } else { + qinfo->dqi_entry_size = sizeof(struct v2r1_disk_dqblk); + qinfo->dqi_ops = &v2r1_qtree_ops; + } return 0; } @@ -135,9 +171,9 @@ static int v2_write_file_info(struct super_block *sb, int type) return 0; } -static void v2_disk2memdqb(struct dquot *dquot, void *dp) +static void v2r0_disk2memdqb(struct dquot *dquot, void *dp) { - struct v2_disk_dqblk *d = dp, empty; + struct v2r0_disk_dqblk *d = dp, empty; struct mem_dqblk *m = &dquot->dq_dqb; m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); @@ -149,15 +185,15 @@ static void v2_disk2memdqb(struct dquot *dquot, void *dp) m->dqb_curspace = le64_to_cpu(d->dqb_curspace); m->dqb_btime = le64_to_cpu(d->dqb_btime); /* We need to escape back all-zero structure */ - memset(&empty, 0, sizeof(struct v2_disk_dqblk)); + memset(&empty, 0, sizeof(struct v2r0_disk_dqblk)); empty.dqb_itime = cpu_to_le64(1); - if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk))) + if (!memcmp(&empty, dp, sizeof(struct v2r0_disk_dqblk))) m->dqb_itime = 0; } -static void v2_mem2diskdqb(void *dp, struct dquot *dquot) +static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot) { - struct v2_disk_dqblk *d = dp; + struct v2r0_disk_dqblk *d = dp; struct mem_dqblk *m = &dquot->dq_dqb; struct qtree_mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; @@ -175,9 +211,60 @@ static void v2_mem2diskdqb(void *dp, struct dquot *dquot) d->dqb_itime = cpu_to_le64(1); } -static int v2_is_id(void *dp, struct dquot *dquot) +static int v2r0_is_id(void *dp, struct dquot *dquot) +{ + struct v2r0_disk_dqblk *d = dp; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + + if (qtree_entry_unused(info, dp)) + return 0; + return le32_to_cpu(d->dqb_id) == dquot->dq_id; +} + +static void v2r1_disk2memdqb(struct dquot *dquot, void *dp) +{ + struct v2r1_disk_dqblk *d = dp, empty; + struct mem_dqblk *m = &dquot->dq_dqb; + + m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit); + m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit); + m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes); + m->dqb_itime = le64_to_cpu(d->dqb_itime); + m->dqb_bhardlimit = v2_qbtos(le64_to_cpu(d->dqb_bhardlimit)); + m->dqb_bsoftlimit = v2_qbtos(le64_to_cpu(d->dqb_bsoftlimit)); + m->dqb_curspace = le64_to_cpu(d->dqb_curspace); + m->dqb_btime = le64_to_cpu(d->dqb_btime); + /* We need to escape back all-zero structure */ + memset(&empty, 0, sizeof(struct v2r1_disk_dqblk)); + empty.dqb_itime = cpu_to_le64(1); + if (!memcmp(&empty, dp, sizeof(struct v2r1_disk_dqblk))) + m->dqb_itime = 0; +} + +static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot) +{ + struct v2r1_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + + d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); + d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); + d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes); + d->dqb_itime = cpu_to_le64(m->dqb_itime); + d->dqb_bhardlimit = cpu_to_le64(v2_stoqb(m->dqb_bhardlimit)); + d->dqb_bsoftlimit = cpu_to_le64(v2_stoqb(m->dqb_bsoftlimit)); + d->dqb_curspace = cpu_to_le64(m->dqb_curspace); + d->dqb_btime = cpu_to_le64(m->dqb_btime); + d->dqb_id = cpu_to_le32(dquot->dq_id); + if (qtree_entry_unused(info, dp)) + d->dqb_itime = cpu_to_le64(1); +} + +static int v2r1_is_id(void *dp, struct dquot *dquot) { - struct v2_disk_dqblk *d = dp; + struct v2r1_disk_dqblk *d = dp; struct qtree_mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; @@ -207,7 +294,7 @@ static int v2_free_file_info(struct super_block *sb, int type) return 0; } -static struct quota_format_ops v2_format_ops = { +static const struct quota_format_ops v2_format_ops = { .check_quota_file = v2_check_quota_file, .read_file_info = v2_read_file_info, .write_file_info = v2_write_file_info, @@ -217,20 +304,32 @@ static struct quota_format_ops v2_format_ops = { .release_dqblk = v2_release_dquot, }; -static struct quota_format_type v2_quota_format = { +static struct quota_format_type v2r0_quota_format = { .qf_fmt_id = QFMT_VFS_V0, .qf_ops = &v2_format_ops, .qf_owner = THIS_MODULE }; +static struct quota_format_type v2r1_quota_format = { + .qf_fmt_id = QFMT_VFS_V1, + .qf_ops = &v2_format_ops, + .qf_owner = THIS_MODULE +}; + static int __init init_v2_quota_format(void) { - return register_quota_format(&v2_quota_format); + int ret; + + ret = register_quota_format(&v2r0_quota_format); + if (ret) + return ret; + return register_quota_format(&v2r1_quota_format); } static void __exit exit_v2_quota_format(void) { - unregister_quota_format(&v2_quota_format); + unregister_quota_format(&v2r0_quota_format); + unregister_quota_format(&v2r1_quota_format); } module_init(init_v2_quota_format); diff --git a/fs/quota/quotaio_v2.h b/fs/quota/quotaio_v2.h index 530fe580685..f1966b42c2f 100644 --- a/fs/quota/quotaio_v2.h +++ b/fs/quota/quotaio_v2.h @@ -17,8 +17,8 @@ } #define V2_INITQVERSIONS {\ - 0, /* USRQUOTA */\ - 0 /* GRPQUOTA */\ + 1, /* USRQUOTA */\ + 1 /* GRPQUOTA */\ } /* First generic header */ @@ -32,7 +32,7 @@ struct v2_disk_dqheader { * (as it appears on disk) - the file is a radix tree whose leaves point * to blocks of these structures. */ -struct v2_disk_dqblk { +struct v2r0_disk_dqblk { __le32 dqb_id; /* id this quota applies to */ __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */ __le32 dqb_isoftlimit; /* preferred inode limit */ @@ -44,6 +44,19 @@ struct v2_disk_dqblk { __le64 dqb_itime; /* time limit for excessive inode use */ }; +struct v2r1_disk_dqblk { + __le32 dqb_id; /* id this quota applies to */ + __le32 dqb_pad; + __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __le64 dqb_isoftlimit; /* preferred inode limit */ + __le64 dqb_curinodes; /* current # allocated inodes */ + __le64 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */ + __le64 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */ + __le64 dqb_curspace; /* current space occupied (in bytes) */ + __le64 dqb_btime; /* time limit for excessive disk use */ + __le64 dqb_itime; /* time limit for excessive inode use */ +}; + /* Header with type and version specific information */ struct v2_disk_dqinfo { __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */ diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig index 513f431038f..ac7cd75c86f 100644 --- a/fs/reiserfs/Kconfig +++ b/fs/reiserfs/Kconfig @@ -1,6 +1,7 @@ config REISERFS_FS tristate "Reiserfs support" select CRC32 + select FS_JOURNAL_INFO help Stores not just filenames but the files themselves in a balanced tree. Uses journalling. diff --git a/fs/sync.c b/fs/sync.c index d104591b066..36752a68348 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -295,10 +295,11 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) */ int generic_write_sync(struct file *file, loff_t pos, loff_t count) { - if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host)) + if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) return 0; return vfs_fsync_range(file, file->f_path.dentry, pos, - pos + count - 1, 1); + pos + count - 1, + (file->f_flags & __O_SYNC) ? 0 : 1); } EXPORT_SYMBOL(generic_write_sync); @@ -452,9 +453,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset, ret = 0; if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { - ret = wait_on_page_writeback_range(mapping, - offset >> PAGE_CACHE_SHIFT, - endbyte >> PAGE_CACHE_SHIFT); + ret = filemap_fdatawait_range(mapping, offset, endbyte); if (ret < 0) goto out; } @@ -467,9 +466,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset, } if (flags & SYNC_FILE_RANGE_WAIT_AFTER) { - ret = wait_on_page_writeback_range(mapping, - offset >> PAGE_CACHE_SHIFT, - endbyte >> PAGE_CACHE_SHIFT); + ret = filemap_fdatawait_range(mapping, offset, endbyte); } out: return ret; diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index e0201837d24..f05f2303a8b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -25,7 +25,6 @@ #include "sysfs.h" DEFINE_MUTEX(sysfs_mutex); -DEFINE_MUTEX(sysfs_rename_mutex); DEFINE_SPINLOCK(sysfs_assoc_lock); static DEFINE_SPINLOCK(sysfs_ino_lock); @@ -85,46 +84,6 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) } /** - * sysfs_get_dentry - get dentry for the given sysfs_dirent - * @sd: sysfs_dirent of interest - * - * Get dentry for @sd. Dentry is looked up if currently not - * present. This function descends from the root looking up - * dentry for each step. - * - * LOCKING: - * mutex_lock(sysfs_rename_mutex) - * - * RETURNS: - * Pointer to found dentry on success, ERR_PTR() value on error. - */ -struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd) -{ - struct dentry *dentry = dget(sysfs_sb->s_root); - - while (dentry->d_fsdata != sd) { - struct sysfs_dirent *cur; - struct dentry *parent; - - /* find the first ancestor which hasn't been looked up */ - cur = sd; - while (cur->s_parent != dentry->d_fsdata) - cur = cur->s_parent; - - /* look it up */ - parent = dentry; - mutex_lock(&parent->d_inode->i_mutex); - dentry = lookup_one_noperm(cur->s_name, parent); - mutex_unlock(&parent->d_inode->i_mutex); - dput(parent); - - if (IS_ERR(dentry)) - break; - } - return dentry; -} - -/** * sysfs_get_active - get an active reference to sysfs_dirent * @sd: sysfs_dirent to get an active reference to * @@ -298,7 +257,61 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) goto repeat; } -static void sysfs_d_iput(struct dentry * dentry, struct inode * inode) +static int sysfs_dentry_delete(struct dentry *dentry) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + return !!(sd->s_flags & SYSFS_FLAG_REMOVED); +} + +static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + int is_dir; + + mutex_lock(&sysfs_mutex); + + /* The sysfs dirent has been deleted */ + if (sd->s_flags & SYSFS_FLAG_REMOVED) + goto out_bad; + + /* The sysfs dirent has been moved? */ + if (dentry->d_parent->d_fsdata != sd->s_parent) + goto out_bad; + + /* The sysfs dirent has been renamed */ + if (strcmp(dentry->d_name.name, sd->s_name) != 0) + goto out_bad; + + mutex_unlock(&sysfs_mutex); +out_valid: + return 1; +out_bad: + /* Remove the dentry from the dcache hashes. + * If this is a deleted dentry we use d_drop instead of d_delete + * so sysfs doesn't need to cope with negative dentries. + * + * If this is a dentry that has simply been renamed we + * use d_drop to remove it from the dcache lookup on its + * old parent. If this dentry persists later when a lookup + * is performed at its new name the dentry will be readded + * to the dcache hashes. + */ + is_dir = (sysfs_type(sd) == SYSFS_DIR); + mutex_unlock(&sysfs_mutex); + if (is_dir) { + /* If we have submounts we must allow the vfs caches + * to lie about the state of the filesystem to prevent + * leaks and other nasty things. + */ + if (have_submounts(dentry)) + goto out_valid; + shrink_dcache_parent(dentry); + } + d_drop(dentry); + return 0; +} + +static void sysfs_dentry_iput(struct dentry *dentry, struct inode *inode) { struct sysfs_dirent * sd = dentry->d_fsdata; @@ -307,7 +320,9 @@ static void sysfs_d_iput(struct dentry * dentry, struct inode * inode) } static const struct dentry_operations sysfs_dentry_ops = { - .d_iput = sysfs_d_iput, + .d_revalidate = sysfs_dentry_revalidate, + .d_delete = sysfs_dentry_delete, + .d_iput = sysfs_dentry_iput, }; struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) @@ -344,12 +359,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) return NULL; } -static int sysfs_ilookup_test(struct inode *inode, void *arg) -{ - struct sysfs_dirent *sd = arg; - return inode->i_ino == sd->s_ino; -} - /** * sysfs_addrm_start - prepare for sysfs_dirent add/remove * @acxt: pointer to sysfs_addrm_cxt to be used @@ -357,47 +366,20 @@ static int sysfs_ilookup_test(struct inode *inode, void *arg) * * This function is called when the caller is about to add or * remove sysfs_dirent under @parent_sd. This function acquires - * sysfs_mutex, grabs inode for @parent_sd if available and lock - * i_mutex of it. @acxt is used to keep and pass context to + * sysfs_mutex. @acxt is used to keep and pass context to * other addrm functions. * * LOCKING: * Kernel thread context (may sleep). sysfs_mutex is locked on - * return. i_mutex of parent inode is locked on return if - * available. + * return. */ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *parent_sd) { - struct inode *inode; - memset(acxt, 0, sizeof(*acxt)); acxt->parent_sd = parent_sd; - /* Lookup parent inode. inode initialization is protected by - * sysfs_mutex, so inode existence can be determined by - * looking up inode while holding sysfs_mutex. - */ mutex_lock(&sysfs_mutex); - - inode = ilookup5(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test, - parent_sd); - if (inode) { - WARN_ON(inode->i_state & I_NEW); - - /* parent inode available */ - acxt->parent_inode = inode; - - /* sysfs_mutex is below i_mutex in lock hierarchy. - * First, trylock i_mutex. If fails, unlock - * sysfs_mutex and lock them in order. - */ - if (!mutex_trylock(&inode->i_mutex)) { - mutex_unlock(&sysfs_mutex); - mutex_lock(&inode->i_mutex); - mutex_lock(&sysfs_mutex); - } - } } /** @@ -422,18 +404,22 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, */ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) { + struct sysfs_inode_attrs *ps_iattr; + if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) return -EEXIST; sd->s_parent = sysfs_get(acxt->parent_sd); - if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode) - inc_nlink(acxt->parent_inode); - - acxt->cnt++; - sysfs_link_sibling(sd); + /* Update timestamps on the parent */ + ps_iattr = acxt->parent_sd->s_iattr; + if (ps_iattr) { + struct iattr *ps_iattrs = &ps_iattr->ia_iattr; + ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; + } + return 0; } @@ -512,70 +498,22 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) */ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) { + struct sysfs_inode_attrs *ps_iattr; + BUG_ON(sd->s_flags & SYSFS_FLAG_REMOVED); sysfs_unlink_sibling(sd); + /* Update timestamps on the parent */ + ps_iattr = acxt->parent_sd->s_iattr; + if (ps_iattr) { + struct iattr *ps_iattrs = &ps_iattr->ia_iattr; + ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; + } + sd->s_flags |= SYSFS_FLAG_REMOVED; sd->s_sibling = acxt->removed; acxt->removed = sd; - - if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode) - drop_nlink(acxt->parent_inode); - - acxt->cnt++; -} - -/** - * sysfs_drop_dentry - drop dentry for the specified sysfs_dirent - * @sd: target sysfs_dirent - * - * Drop dentry for @sd. @sd must have been unlinked from its - * parent on entry to this function such that it can't be looked - * up anymore. - */ -static void sysfs_drop_dentry(struct sysfs_dirent *sd) -{ - struct inode *inode; - struct dentry *dentry; - - inode = ilookup(sysfs_sb, sd->s_ino); - if (!inode) - return; - - /* Drop any existing dentries associated with sd. - * - * For the dentry to be properly freed we need to grab a - * reference to the dentry under the dcache lock, unhash it, - * and then put it. The playing with the dentry count allows - * dput to immediately free the dentry if it is not in use. - */ -repeat: - spin_lock(&dcache_lock); - list_for_each_entry(dentry, &inode->i_dentry, d_alias) { - if (d_unhashed(dentry)) - continue; - dget_locked(dentry); - spin_lock(&dentry->d_lock); - __d_drop(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); - dput(dentry); - goto repeat; - } - spin_unlock(&dcache_lock); - - /* adjust nlink and update timestamp */ - mutex_lock(&inode->i_mutex); - - inode->i_ctime = CURRENT_TIME; - drop_nlink(inode); - if (sysfs_type(sd) == SYSFS_DIR) - drop_nlink(inode); - - mutex_unlock(&inode->i_mutex); - - iput(inode); } /** @@ -584,25 +522,15 @@ repeat: * * Finish up sysfs_dirent add/remove. Resources acquired by * sysfs_addrm_start() are released and removed sysfs_dirents are - * cleaned up. Timestamps on the parent inode are updated. + * cleaned up. * * LOCKING: - * All mutexes acquired by sysfs_addrm_start() are released. + * sysfs_mutex is released. */ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) { /* release resources acquired by sysfs_addrm_start() */ mutex_unlock(&sysfs_mutex); - if (acxt->parent_inode) { - struct inode *inode = acxt->parent_inode; - - /* if added/removed, update timestamps on the parent */ - if (acxt->cnt) - inode->i_ctime = inode->i_mtime = CURRENT_TIME; - - mutex_unlock(&inode->i_mutex); - iput(inode); - } /* kill removed sysfs_dirents */ while (acxt->removed) { @@ -611,7 +539,6 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) acxt->removed = sd->s_sibling; sd->s_sibling = NULL; - sysfs_drop_dentry(sd); sysfs_deactivate(sd); unmap_bin_file(sd); sysfs_put(sd); @@ -751,10 +678,15 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, } /* instantiate and hash dentry */ - dentry->d_op = &sysfs_dentry_ops; - dentry->d_fsdata = sysfs_get(sd); - d_instantiate(dentry, inode); - d_rehash(dentry); + ret = d_find_alias(inode); + if (!ret) { + dentry->d_op = &sysfs_dentry_ops; + dentry->d_fsdata = sysfs_get(sd); + d_add(dentry, inode); + } else { + d_move(ret, dentry); + iput(inode); + } out_unlock: mutex_unlock(&sysfs_mutex); @@ -763,7 +695,9 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, const struct inode_operations sysfs_dir_inode_operations = { .lookup = sysfs_lookup, + .permission = sysfs_permission, .setattr = sysfs_setattr, + .getattr = sysfs_getattr, .setxattr = sysfs_setxattr, }; @@ -826,141 +760,65 @@ void sysfs_remove_dir(struct kobject * kobj) __sysfs_remove_dir(sd); } -int sysfs_rename_dir(struct kobject * kobj, const char *new_name) +int sysfs_rename(struct sysfs_dirent *sd, + struct sysfs_dirent *new_parent_sd, const char *new_name) { - struct sysfs_dirent *sd = kobj->sd; - struct dentry *parent = NULL; - struct dentry *old_dentry = NULL, *new_dentry = NULL; const char *dup_name = NULL; int error; - mutex_lock(&sysfs_rename_mutex); + mutex_lock(&sysfs_mutex); error = 0; - if (strcmp(sd->s_name, new_name) == 0) + if ((sd->s_parent == new_parent_sd) && + (strcmp(sd->s_name, new_name) == 0)) goto out; /* nothing to rename */ - /* get the original dentry */ - old_dentry = sysfs_get_dentry(sd); - if (IS_ERR(old_dentry)) { - error = PTR_ERR(old_dentry); - old_dentry = NULL; - goto out; - } - - parent = old_dentry->d_parent; - - /* lock parent and get dentry for new name */ - mutex_lock(&parent->d_inode->i_mutex); - mutex_lock(&sysfs_mutex); - error = -EEXIST; - if (sysfs_find_dirent(sd->s_parent, new_name)) - goto out_unlock; - - error = -ENOMEM; - new_dentry = d_alloc_name(parent, new_name); - if (!new_dentry) - goto out_unlock; + if (sysfs_find_dirent(new_parent_sd, new_name)) + goto out; /* rename sysfs_dirent */ - error = -ENOMEM; - new_name = dup_name = kstrdup(new_name, GFP_KERNEL); - if (!new_name) - goto out_unlock; - - dup_name = sd->s_name; - sd->s_name = new_name; + if (strcmp(sd->s_name, new_name) != 0) { + error = -ENOMEM; + new_name = dup_name = kstrdup(new_name, GFP_KERNEL); + if (!new_name) + goto out; + + dup_name = sd->s_name; + sd->s_name = new_name; + } - /* rename */ - d_add(new_dentry, NULL); - d_move(old_dentry, new_dentry); + /* Remove from old parent's list and insert into new parent's list. */ + if (sd->s_parent != new_parent_sd) { + sysfs_unlink_sibling(sd); + sysfs_get(new_parent_sd); + sysfs_put(sd->s_parent); + sd->s_parent = new_parent_sd; + sysfs_link_sibling(sd); + } error = 0; - out_unlock: + out: mutex_unlock(&sysfs_mutex); - mutex_unlock(&parent->d_inode->i_mutex); kfree(dup_name); - dput(old_dentry); - dput(new_dentry); - out: - mutex_unlock(&sysfs_rename_mutex); return error; } +int sysfs_rename_dir(struct kobject *kobj, const char *new_name) +{ + return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name); +} + int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) { struct sysfs_dirent *sd = kobj->sd; struct sysfs_dirent *new_parent_sd; - struct dentry *old_parent, *new_parent = NULL; - struct dentry *old_dentry = NULL, *new_dentry = NULL; - int error; - mutex_lock(&sysfs_rename_mutex); BUG_ON(!sd->s_parent); - new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ? + new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root; - error = 0; - if (sd->s_parent == new_parent_sd) - goto out; /* nothing to move */ - - /* get dentries */ - old_dentry = sysfs_get_dentry(sd); - if (IS_ERR(old_dentry)) { - error = PTR_ERR(old_dentry); - old_dentry = NULL; - goto out; - } - old_parent = old_dentry->d_parent; - - new_parent = sysfs_get_dentry(new_parent_sd); - if (IS_ERR(new_parent)) { - error = PTR_ERR(new_parent); - new_parent = NULL; - goto out; - } - -again: - mutex_lock(&old_parent->d_inode->i_mutex); - if (!mutex_trylock(&new_parent->d_inode->i_mutex)) { - mutex_unlock(&old_parent->d_inode->i_mutex); - goto again; - } - mutex_lock(&sysfs_mutex); - - error = -EEXIST; - if (sysfs_find_dirent(new_parent_sd, sd->s_name)) - goto out_unlock; - - error = -ENOMEM; - new_dentry = d_alloc_name(new_parent, sd->s_name); - if (!new_dentry) - goto out_unlock; - - error = 0; - d_add(new_dentry, NULL); - d_move(old_dentry, new_dentry); - - /* Remove from old parent's list and insert into new parent's list. */ - sysfs_unlink_sibling(sd); - sysfs_get(new_parent_sd); - drop_nlink(old_parent->d_inode); - sysfs_put(sd->s_parent); - sd->s_parent = new_parent_sd; - inc_nlink(new_parent->d_inode); - sysfs_link_sibling(sd); - - out_unlock: - mutex_unlock(&sysfs_mutex); - mutex_unlock(&new_parent->d_inode->i_mutex); - mutex_unlock(&old_parent->d_inode->i_mutex); - out: - dput(new_parent); - dput(old_dentry); - dput(new_dentry); - mutex_unlock(&sysfs_rename_mutex); - return error; + return sysfs_rename(sd, new_parent_sd, sd->s_name); } /* Relationship between s_mode and the DT_xxx types */ diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index f5ea4680f15..dc30d9e3168 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -579,46 +579,23 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); */ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) { - struct sysfs_dirent *victim_sd = NULL; - struct dentry *victim = NULL; - struct inode * inode; + struct sysfs_dirent *sd; struct iattr newattrs; int rc; - rc = -ENOENT; - victim_sd = sysfs_get_dirent(kobj->sd, attr->name); - if (!victim_sd) - goto out; + mutex_lock(&sysfs_mutex); - mutex_lock(&sysfs_rename_mutex); - victim = sysfs_get_dentry(victim_sd); - mutex_unlock(&sysfs_rename_mutex); - if (IS_ERR(victim)) { - rc = PTR_ERR(victim); - victim = NULL; + rc = -ENOENT; + sd = sysfs_find_dirent(kobj->sd, attr->name); + if (!sd) goto out; - } - - inode = victim->d_inode; - - mutex_lock(&inode->i_mutex); - newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); - newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - newattrs.ia_ctime = current_fs_time(inode->i_sb); - rc = sysfs_setattr(victim, &newattrs); + newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE; + rc = sysfs_sd_setattr(sd, &newattrs); - if (rc == 0) { - fsnotify_change(victim, newattrs.ia_valid); - mutex_lock(&sysfs_mutex); - victim_sd->s_mode = newattrs.ia_mode; - mutex_unlock(&sysfs_mutex); - } - - mutex_unlock(&inode->i_mutex); out: - dput(victim); - sysfs_put(victim_sd); + mutex_unlock(&sysfs_mutex); return rc; } EXPORT_SYMBOL_GPL(sysfs_chmod_file); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index e28cecf179f..220b758523a 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -37,7 +37,9 @@ static struct backing_dev_info sysfs_backing_dev_info = { }; static const struct inode_operations sysfs_inode_operations ={ + .permission = sysfs_permission, .setattr = sysfs_setattr, + .getattr = sysfs_getattr, .setxattr = sysfs_setxattr, }; @@ -46,7 +48,7 @@ int __init sysfs_inode_init(void) return bdi_init(&sysfs_backing_dev_info); } -struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) +static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) { struct sysfs_inode_attrs *attrs; struct iattr *iattrs; @@ -64,30 +66,15 @@ struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) return attrs; } -int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) + +int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr * iattr) { - struct inode * inode = dentry->d_inode; - struct sysfs_dirent * sd = dentry->d_fsdata; struct sysfs_inode_attrs *sd_attrs; struct iattr *iattrs; unsigned int ia_valid = iattr->ia_valid; - int error; - - if (!sd) - return -EINVAL; sd_attrs = sd->s_iattr; - error = inode_change_ok(inode, iattr); - if (error) - return error; - - iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ - - error = inode_setattr(inode, iattr); - if (error) - return error; - if (!sd_attrs) { /* setting attributes for the first time, allocate now */ sd_attrs = sysfs_init_inode_attrs(sd); @@ -103,42 +90,78 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) if (ia_valid & ATTR_GID) iattrs->ia_gid = iattr->ia_gid; if (ia_valid & ATTR_ATIME) - iattrs->ia_atime = timespec_trunc(iattr->ia_atime, - inode->i_sb->s_time_gran); + iattrs->ia_atime = iattr->ia_atime; if (ia_valid & ATTR_MTIME) - iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime, - inode->i_sb->s_time_gran); + iattrs->ia_mtime = iattr->ia_mtime; if (ia_valid & ATTR_CTIME) - iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime, - inode->i_sb->s_time_gran); + iattrs->ia_ctime = iattr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = iattr->ia_mode; - - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - mode &= ~S_ISGID; iattrs->ia_mode = sd->s_mode = mode; } } + return 0; +} + +int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + struct sysfs_dirent *sd = dentry->d_fsdata; + int error; + + if (!sd) + return -EINVAL; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ + + error = inode_setattr(inode, iattr); + if (error) + return error; + + mutex_lock(&sysfs_mutex); + error = sysfs_sd_setattr(sd, iattr); + mutex_unlock(&sysfs_mutex); + return error; } +static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, u32 *secdata_len) +{ + struct sysfs_inode_attrs *iattrs; + void *old_secdata; + size_t old_secdata_len; + + iattrs = sd->s_iattr; + if (!iattrs) + iattrs = sysfs_init_inode_attrs(sd); + if (!iattrs) + return -ENOMEM; + + old_secdata = iattrs->ia_secdata; + old_secdata_len = iattrs->ia_secdata_len; + + iattrs->ia_secdata = *secdata; + iattrs->ia_secdata_len = *secdata_len; + + *secdata = old_secdata; + *secdata_len = old_secdata_len; + return 0; +} + int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct sysfs_dirent *sd = dentry->d_fsdata; - struct sysfs_inode_attrs *iattrs; void *secdata; int error; u32 secdata_len = 0; if (!sd) return -EINVAL; - if (!sd->s_iattr) - sd->s_iattr = sysfs_init_inode_attrs(sd); - if (!sd->s_iattr) - return -ENOMEM; - - iattrs = sd->s_iattr; if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; @@ -150,12 +173,13 @@ int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, &secdata, &secdata_len); if (error) goto out; - if (iattrs->ia_secdata) - security_release_secctx(iattrs->ia_secdata, - iattrs->ia_secdata_len); - iattrs->ia_secdata = secdata; - iattrs->ia_secdata_len = secdata_len; + mutex_lock(&sysfs_mutex); + error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len); + mutex_unlock(&sysfs_mutex); + + if (secdata) + security_release_secctx(secdata, secdata_len); } else return -EINVAL; out: @@ -170,7 +194,6 @@ static inline void set_default_inode_attr(struct inode * inode, mode_t mode) static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) { - inode->i_mode = iattr->ia_mode; inode->i_uid = iattr->ia_uid; inode->i_gid = iattr->ia_gid; inode->i_atime = iattr->ia_atime; @@ -178,17 +201,6 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) inode->i_ctime = iattr->ia_ctime; } - -/* - * sysfs has a different i_mutex lock order behavior for i_mutex than other - * filesystems; sysfs i_mutex is called in many places with subsystem locks - * held. At the same time, many of the VFS locking rules do not apply to - * sysfs at all (cross directory rename for example). To untangle this mess - * (which gives false positives in lockdep), we're giving sysfs inodes their - * own class for i_mutex. - */ -static struct lock_class_key sysfs_inode_imutex_key; - static int sysfs_count_nlink(struct sysfs_dirent *sd) { struct sysfs_dirent *child; @@ -201,38 +213,55 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd) return nr + 2; } +static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) +{ + struct sysfs_inode_attrs *iattrs = sd->s_iattr; + + inode->i_mode = sd->s_mode; + if (iattrs) { + /* sysfs_dirent has non-default attributes + * get them from persistent copy in sysfs_dirent + */ + set_inode_attr(inode, &iattrs->ia_iattr); + security_inode_notifysecctx(inode, + iattrs->ia_secdata, + iattrs->ia_secdata_len); + } + + if (sysfs_type(sd) == SYSFS_DIR) + inode->i_nlink = sysfs_count_nlink(sd); +} + +int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + struct inode *inode = dentry->d_inode; + + mutex_lock(&sysfs_mutex); + sysfs_refresh_inode(sd, inode); + mutex_unlock(&sysfs_mutex); + + generic_fillattr(inode, stat); + return 0; +} + static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) { struct bin_attribute *bin_attr; - struct sysfs_inode_attrs *iattrs; inode->i_private = sysfs_get(sd); inode->i_mapping->a_ops = &sysfs_aops; inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; inode->i_op = &sysfs_inode_operations; - inode->i_ino = sd->s_ino; - lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key); - iattrs = sd->s_iattr; - if (iattrs) { - /* sysfs_dirent has non-default attributes - * get them for the new inode from persistent copy - * in sysfs_dirent - */ - set_inode_attr(inode, &iattrs->ia_iattr); - if (iattrs->ia_secdata) - security_inode_notifysecctx(inode, - iattrs->ia_secdata, - iattrs->ia_secdata_len); - } else - set_default_inode_attr(inode, sd->s_mode); + set_default_inode_attr(inode, sd->s_mode); + sysfs_refresh_inode(sd, inode); /* initialize inode according to type */ switch (sysfs_type(sd)) { case SYSFS_DIR: inode->i_op = &sysfs_dir_inode_operations; inode->i_fop = &sysfs_dir_operations; - inode->i_nlink = sysfs_count_nlink(sd); break; case SYSFS_KOBJ_ATTR: inode->i_size = PAGE_SIZE; @@ -315,3 +344,14 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) else return -ENOENT; } + +int sysfs_permission(struct inode *inode, int mask) +{ + struct sysfs_dirent *sd = inode->i_private; + + mutex_lock(&sysfs_mutex); + sysfs_refresh_inode(sd, inode); + mutex_unlock(&sysfs_mutex); + + return generic_permission(inode, mask, NULL); +} diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index c5081ad7702..c5eff49fa41 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -210,10 +210,13 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co } const struct inode_operations sysfs_symlink_inode_operations = { - .setxattr = sysfs_setxattr, - .readlink = generic_readlink, - .follow_link = sysfs_follow_link, - .put_link = sysfs_put_link, + .setxattr = sysfs_setxattr, + .readlink = generic_readlink, + .follow_link = sysfs_follow_link, + .put_link = sysfs_put_link, + .setattr = sysfs_setattr, + .getattr = sysfs_getattr, + .permission = sysfs_permission, }; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index af4c4e7482a..ca52e7b9d8f 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -89,9 +89,7 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd) */ struct sysfs_addrm_cxt { struct sysfs_dirent *parent_sd; - struct inode *parent_inode; struct sysfs_dirent *removed; - int cnt; }; /* @@ -105,7 +103,6 @@ extern struct kmem_cache *sysfs_dir_cachep; * dir.c */ extern struct mutex sysfs_mutex; -extern struct mutex sysfs_rename_mutex; extern spinlock_t sysfs_assoc_lock; extern const struct file_operations sysfs_dir_operations; @@ -133,6 +130,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name, struct sysfs_dirent **p_sd); void sysfs_remove_subdir(struct sysfs_dirent *sd); +int sysfs_rename(struct sysfs_dirent *sd, + struct sysfs_dirent *new_parent_sd, const char *new_name); + static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) { if (sd) { @@ -155,7 +155,10 @@ static inline void __sysfs_put(struct sysfs_dirent *sd) */ struct inode *sysfs_get_inode(struct sysfs_dirent *sd); void sysfs_delete_inode(struct inode *inode); +int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); +int sysfs_permission(struct inode *inode, int mask); int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); +int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index dbc093afd94..90492327b38 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -350,13 +350,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) le32_to_cpu(sup->fmt_version)); printk(KERN_DEBUG "\ttime_gran %u\n", le32_to_cpu(sup->time_gran)); - printk(KERN_DEBUG "\tUUID %02X%02X%02X%02X-%02X%02X" - "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n", - sup->uuid[0], sup->uuid[1], sup->uuid[2], sup->uuid[3], - sup->uuid[4], sup->uuid[5], sup->uuid[6], sup->uuid[7], - sup->uuid[8], sup->uuid[9], sup->uuid[10], sup->uuid[11], - sup->uuid[12], sup->uuid[13], sup->uuid[14], - sup->uuid[15]); + printk(KERN_DEBUG "\tUUID %pUB\n", + sup->uuid); break; } case UBIFS_MST_NODE: @@ -2014,7 +2009,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, inum = key_inum_flash(c, &dent->key); fscki1 = read_add_inode(c, priv, inum); if (IS_ERR(fscki1)) { - err = PTR_ERR(fscki); + err = PTR_ERR(fscki1); ubifs_err("error %d while processing entry node and " "trying to find parent inode node %lu", err, (unsigned long)inum); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 1009adc8d60..39849f887e7 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1389,7 +1389,6 @@ static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { int err; - ssize_t ret; struct inode *inode = iocb->ki_filp->f_mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -1397,17 +1396,7 @@ static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err) return err; - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - if (ret < 0) - return ret; - - if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) { - err = ubifs_sync_wbufs_by_inode(c, inode); - if (err) - return err; - } - - return ret; + return generic_file_aio_write(iocb, iov, nr_segs, pos); } static int ubifs_set_page_dirty(struct page *page) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 333e181ee98..43f9d19a6f3 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1393,12 +1393,7 @@ static int mount_ubifs(struct ubifs_info *c) c->leb_size, c->leb_size >> 10); dbg_msg("data journal heads: %d", c->jhead_cnt - NONDATA_JHEADS_CNT); - dbg_msg("UUID: %02X%02X%02X%02X-%02X%02X" - "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X", - c->uuid[0], c->uuid[1], c->uuid[2], c->uuid[3], - c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7], - c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11], - c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]); + dbg_msg("UUID: %pUB", c->uuid); dbg_msg("big_lpt %d", c->big_lpt); dbg_msg("log LEBs: %d (%d - %d)", c->log_lebs, UBIFS_LOG_LNUM, c->log_last); @@ -1842,22 +1837,32 @@ const struct super_operations ubifs_super_operations = { * @name: UBI volume name * @mode: UBI volume open mode * - * There are several ways to specify UBI volumes when mounting UBIFS: - * o ubiX_Y - UBI device number X, volume Y; - * o ubiY - UBI device number 0, volume Y; + * The primary method of mounting UBIFS is by specifying the UBI volume + * character device node path. However, UBIFS may also be mounted withoug any + * character device node using one of the following methods: + * + * o ubiX_Y - mount UBI device number X, volume Y; + * o ubiY - mount UBI device number 0, volume Y; * o ubiX:NAME - mount UBI device X, volume with name NAME; * o ubi:NAME - mount UBI device 0, volume with name NAME. * * Alternative '!' separator may be used instead of ':' (because some shells * like busybox may interpret ':' as an NFS host name separator). This function - * returns ubi volume object in case of success and a negative error code in - * case of failure. + * returns UBI volume description object in case of success and a negative + * error code in case of failure. */ static struct ubi_volume_desc *open_ubi(const char *name, int mode) { + struct ubi_volume_desc *ubi; int dev, vol; char *endptr; + /* First, try to open using the device node path method */ + ubi = ubi_open_volume_path(name, mode); + if (!IS_ERR(ubi)) + return ubi; + + /* Try the "nodev" method */ if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i') return ERR_PTR(-EINVAL); diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 1e068535b58..82372e332f0 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -440,7 +440,7 @@ static void udf_table_free_blocks(struct super_block *sb, (bloc->logicalBlockNum + count) > partmap->s_partition_len) { udf_debug("%d < %d || %d + %d > %d\n", - bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, + bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, partmap->s_partition_len); goto error_return; } diff --git a/fs/udf/file.c b/fs/udf/file.c index b80cbd78833..f311d509b6a 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -196,6 +196,7 @@ static int udf_release_file(struct inode *inode, struct file *filp) mutex_lock(&inode->i_mutex); lock_kernel(); udf_discard_prealloc(inode); + udf_truncate_tail_extent(inode); unlock_kernel(); mutex_unlock(&inode->i_mutex); } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 6d24c2c63f9..f90231eb291 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -97,15 +97,17 @@ no_delete: */ void udf_clear_inode(struct inode *inode) { - struct udf_inode_info *iinfo; - if (!(inode->i_sb->s_flags & MS_RDONLY)) { - lock_kernel(); - udf_truncate_tail_extent(inode); - unlock_kernel(); - write_inode_now(inode, 0); - invalidate_inode_buffers(inode); + struct udf_inode_info *iinfo = UDF_I(inode); + + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && + inode->i_size != iinfo->i_lenExtents) { + printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has " + "inode size %llu different from extent lenght %llu. " + "Filesystem need not be standards compliant.\n", + inode->i_sb->s_id, inode->i_ino, inode->i_mode, + (unsigned long long)inode->i_size, + (unsigned long long)iinfo->i_lenExtents); } - iinfo = UDF_I(inode); kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; } @@ -198,7 +200,6 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, int newblock; struct buffer_head *dbh = NULL; struct kernel_lb_addr eloc; - uint32_t elen; uint8_t alloctype; struct extent_position epos; @@ -273,12 +274,11 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, eloc.logicalBlockNum = *block; eloc.partitionReferenceNum = iinfo->i_location.partitionReferenceNum; - elen = inode->i_sb->s_blocksize; - iinfo->i_lenExtents = elen; + iinfo->i_lenExtents = inode->i_size; epos.bh = NULL; epos.block = iinfo->i_location; epos.offset = udf_file_entry_alloc_offset(inode); - udf_add_aext(inode, &epos, &eloc, elen, 0); + udf_add_aext(inode, &epos, &eloc, inode->i_size, 0); /* UniqueID stuff */ brelse(epos.bh); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 21dad8c608f..cd2115060fd 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -408,15 +408,6 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, } add: - /* Is there any extent whose size we need to round up? */ - if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) { - elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - udf_write_aext(dir, &epos, &eloc, elen, 1); - } f_pos += nfidlen; if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && @@ -439,6 +430,7 @@ add: udf_current_aext(dir, &epos, &eloc, &elen, 1); } + /* Entry fits into current block? */ if (sb->s_blocksize - fibh->eoffset >= nfidlen) { fibh->soffset = fibh->eoffset; fibh->eoffset += nfidlen; @@ -462,6 +454,16 @@ add: (fibh->sbh->b_data + fibh->soffset); } } else { + /* Round up last extent in the file */ + elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + udf_write_aext(dir, &epos, &eloc, elen, 1); + dinfo->i_lenExtents = (dinfo->i_lenExtents + sb->s_blocksize + - 1) & ~(sb->s_blocksize - 1); + fibh->soffset = fibh->eoffset - sb->s_blocksize; fibh->eoffset += nfidlen - sb->s_blocksize; if (fibh->sbh != fibh->ebh) { @@ -508,6 +510,20 @@ add: dir->i_size += nfidlen; if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) dinfo->i_lenAlloc += nfidlen; + else { + /* Find the last extent and truncate it to proper size */ + while (udf_next_aext(dir, &epos, &eloc, &elen, 1) == + (EXT_RECORDED_ALLOCATED >> 30)) + ; + elen -= dinfo->i_lenExtents - dir->i_size; + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + udf_write_aext(dir, &epos, &eloc, elen, 1); + dinfo->i_lenExtents = dir->i_size; + } + mark_inode_dirty(dir); goto out_ok; } else { @@ -922,7 +938,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, block = udf_get_pblock(inode->i_sb, block, iinfo->i_location.partitionReferenceNum, 0); - epos.bh = udf_tread(inode->i_sb, block); + epos.bh = udf_tgetblk(inode->i_sb, block); lock_buffer(epos.bh); memset(epos.bh->b_data, 0x00, inode->i_sb->s_blocksize); set_buffer_uptodate(epos.bh); @@ -999,6 +1015,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, inode->i_size = elen; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) iinfo->i_lenAlloc = inode->i_size; + else + udf_truncate_tail_extent(inode); mark_inode_dirty(inode); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); diff --git a/fs/udf/super.c b/fs/udf/super.c index 9d1b8c2e6c4..1e4543cbcd2 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1078,21 +1078,39 @@ static int udf_fill_partdesc_info(struct super_block *sb, return 0; } -static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) +static void udf_find_vat_block(struct super_block *sb, int p_index, + int type1_index, sector_t start_block) { struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map = &sbi->s_partmaps[p_index]; + sector_t vat_block; struct kernel_lb_addr ino; + + /* + * VAT file entry is in the last recorded block. Some broken disks have + * it a few blocks before so try a bit harder... + */ + ino.partitionReferenceNum = type1_index; + for (vat_block = start_block; + vat_block >= map->s_partition_root && + vat_block >= start_block - 3 && + !sbi->s_vat_inode; vat_block--) { + ino.logicalBlockNum = vat_block - map->s_partition_root; + sbi->s_vat_inode = udf_iget(sb, &ino); + } +} + +static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map = &sbi->s_partmaps[p_index]; struct buffer_head *bh = NULL; struct udf_inode_info *vati; uint32_t pos; struct virtualAllocationTable20 *vat20; sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; - /* VAT file entry is in the last recorded block */ - ino.partitionReferenceNum = type1_index; - ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; - sbi->s_vat_inode = udf_iget(sb, &ino); + udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block); if (!sbi->s_vat_inode && sbi->s_last_block != blocks - 1) { printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the" @@ -1100,9 +1118,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) "block of the device (%lu).\n", (unsigned long)sbi->s_last_block, (unsigned long)blocks - 1); - ino.partitionReferenceNum = type1_index; - ino.logicalBlockNum = blocks - 1 - map->s_partition_root; - sbi->s_vat_inode = udf_iget(sb, &ino); + udf_find_vat_block(sb, p_index, type1_index, blocks - 1); } if (!sbi->s_vat_inode) return 1; diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 70f989895d1..87813e405ce 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -235,71 +235,36 @@ xfs_setfilesize( } /* - * Buffered IO write completion for delayed allocate extents. + * IO write completion. */ STATIC void -xfs_end_bio_delalloc( - struct work_struct *work) -{ - xfs_ioend_t *ioend = - container_of(work, xfs_ioend_t, io_work); - - xfs_setfilesize(ioend); - xfs_destroy_ioend(ioend); -} - -/* - * Buffered IO write completion for regular, written extents. - */ -STATIC void -xfs_end_bio_written( - struct work_struct *work) -{ - xfs_ioend_t *ioend = - container_of(work, xfs_ioend_t, io_work); - - xfs_setfilesize(ioend); - xfs_destroy_ioend(ioend); -} - -/* - * IO write completion for unwritten extents. - * - * Issue transactions to convert a buffer range from unwritten - * to written extents. - */ -STATIC void -xfs_end_bio_unwritten( +xfs_end_io( struct work_struct *work) { xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); struct xfs_inode *ip = XFS_I(ioend->io_inode); - xfs_off_t offset = ioend->io_offset; - size_t size = ioend->io_size; - - if (likely(!ioend->io_error)) { - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - int error; - error = xfs_iomap_write_unwritten(ip, offset, size); - if (error) - ioend->io_error = error; - } - xfs_setfilesize(ioend); - } - xfs_destroy_ioend(ioend); -} -/* - * IO read completion for regular, written extents. - */ -STATIC void -xfs_end_bio_read( - struct work_struct *work) -{ - xfs_ioend_t *ioend = - container_of(work, xfs_ioend_t, io_work); + /* + * For unwritten extents we need to issue transactions to convert a + * range to normal written extens after the data I/O has finished. + */ + if (ioend->io_type == IOMAP_UNWRITTEN && + likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { + int error; + + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, + ioend->io_size); + if (error) + ioend->io_error = error; + } + /* + * We might have to update the on-disk file size after extending + * writes. + */ + if (ioend->io_type != IOMAP_READ) + xfs_setfilesize(ioend); xfs_destroy_ioend(ioend); } @@ -314,10 +279,10 @@ xfs_finish_ioend( int wait) { if (atomic_dec_and_test(&ioend->io_remaining)) { - struct workqueue_struct *wq = xfsdatad_workqueue; - if (ioend->io_work.func == xfs_end_bio_unwritten) - wq = xfsconvertd_workqueue; + struct workqueue_struct *wq; + wq = (ioend->io_type == IOMAP_UNWRITTEN) ? + xfsconvertd_workqueue : xfsdatad_workqueue; queue_work(wq, &ioend->io_work); if (wait) flush_workqueue(wq); @@ -355,15 +320,7 @@ xfs_alloc_ioend( ioend->io_offset = 0; ioend->io_size = 0; - if (type == IOMAP_UNWRITTEN) - INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten); - else if (type == IOMAP_DELAY) - INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc); - else if (type == IOMAP_READ) - INIT_WORK(&ioend->io_work, xfs_end_bio_read); - else - INIT_WORK(&ioend->io_work, xfs_end_bio_written); - + INIT_WORK(&ioend->io_work, xfs_end_io); return ioend; } @@ -380,7 +337,7 @@ xfs_map_blocks( return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); } -STATIC_INLINE int +STATIC int xfs_iomap_valid( xfs_iomap_t *iomapp, loff_t offset) @@ -412,8 +369,9 @@ xfs_end_bio( STATIC void xfs_submit_ioend_bio( - xfs_ioend_t *ioend, - struct bio *bio) + struct writeback_control *wbc, + xfs_ioend_t *ioend, + struct bio *bio) { atomic_inc(&ioend->io_remaining); bio->bi_private = ioend; @@ -426,7 +384,8 @@ xfs_submit_ioend_bio( if (xfs_ioend_new_eof(ioend)) xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); - submit_bio(WRITE, bio); + submit_bio(wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC_PLUG : WRITE, bio); ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); bio_put(bio); } @@ -505,6 +464,7 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) */ STATIC void xfs_submit_ioend( + struct writeback_control *wbc, xfs_ioend_t *ioend) { xfs_ioend_t *head = ioend; @@ -533,19 +493,19 @@ xfs_submit_ioend( retry: bio = xfs_alloc_ioend_bio(bh); } else if (bh->b_blocknr != lastblock + 1) { - xfs_submit_ioend_bio(ioend, bio); + xfs_submit_ioend_bio(wbc, ioend, bio); goto retry; } if (bio_add_buffer(bio, bh) != bh->b_size) { - xfs_submit_ioend_bio(ioend, bio); + xfs_submit_ioend_bio(wbc, ioend, bio); goto retry; } lastblock = bh->b_blocknr; } if (bio) - xfs_submit_ioend_bio(ioend, bio); + xfs_submit_ioend_bio(wbc, ioend, bio); xfs_finish_ioend(ioend, 0); } while ((ioend = next) != NULL); } @@ -1191,7 +1151,7 @@ xfs_page_state_convert( } if (iohead) - xfs_submit_ioend(iohead); + xfs_submit_ioend(wbc, iohead); return page_dirty; @@ -1528,7 +1488,7 @@ xfs_end_io_direct( * didn't map an unwritten extent so switch it's completion * handler. */ - INIT_WORK(&ioend->io_work, xfs_end_bio_written); + ioend->io_type = IOMAP_NEW; xfs_finish_ioend(ioend, 0); } diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 965df1227d6..4ddc973aea7 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -149,7 +149,7 @@ page_region_mask( return mask; } -STATIC_INLINE void +STATIC void set_page_region( struct page *page, size_t offset, @@ -161,7 +161,7 @@ set_page_region( SetPageUptodate(page); } -STATIC_INLINE int +STATIC int test_page_region( struct page *page, size_t offset, @@ -582,7 +582,7 @@ found: * although backing storage may not be. */ xfs_buf_t * -xfs_buf_get_flags( +xfs_buf_get( xfs_buftarg_t *target,/* target for buffer */ xfs_off_t ioff, /* starting offset of range */ size_t isize, /* length of range */ @@ -661,7 +661,7 @@ _xfs_buf_read( } xfs_buf_t * -xfs_buf_read_flags( +xfs_buf_read( xfs_buftarg_t *target, xfs_off_t ioff, size_t isize, @@ -671,7 +671,7 @@ xfs_buf_read_flags( flags |= XBF_READ; - bp = xfs_buf_get_flags(target, ioff, isize, flags); + bp = xfs_buf_get(target, ioff, isize, flags); if (bp) { if (!XFS_BUF_ISDONE(bp)) { XB_TRACE(bp, "read", (unsigned long)flags); @@ -718,7 +718,7 @@ xfs_buf_readahead( return; flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); - xfs_buf_read_flags(target, ioff, isize, flags); + xfs_buf_read(target, ioff, isize, flags); } xfs_buf_t * @@ -1113,7 +1113,7 @@ xfs_bdwrite( xfs_buf_delwri_queue(bp, 1); } -STATIC_INLINE void +STATIC void _xfs_buf_ioend( xfs_buf_t *bp, int schedule) diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 9b4d666ad31..5f07dd91c5f 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -186,15 +186,10 @@ extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t, #define xfs_incore(buftarg,blkno,len,lockit) \ _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) -extern xfs_buf_t *xfs_buf_get_flags(xfs_buftarg_t *, xfs_off_t, size_t, +extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t, xfs_buf_flags_t); -#define xfs_buf_get(target, blkno, len, flags) \ - xfs_buf_get_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED) - -extern xfs_buf_t *xfs_buf_read_flags(xfs_buftarg_t *, xfs_off_t, size_t, +extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, xfs_buf_flags_t); -#define xfs_buf_read(target, blkno, len, flags) \ - xfs_buf_read_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED) extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index eff61e2732a..e4caeb28ce2 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -52,7 +52,7 @@ xfs_file_aio_read( loff_t pos) { struct file *file = iocb->ki_filp; - int ioflags = IO_ISAIO; + int ioflags = 0; BUG_ON(iocb->ki_pos != pos); if (unlikely(file->f_flags & O_DIRECT)) @@ -71,7 +71,7 @@ xfs_file_aio_write( loff_t pos) { struct file *file = iocb->ki_filp; - int ioflags = IO_ISAIO; + int ioflags = 0; BUG_ON(iocb->ki_pos != pos); if (unlikely(file->f_flags & O_DIRECT)) diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index cd42ef78f6b..1f3b4b8f7dd 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -573,8 +573,8 @@ xfs_vn_fallocate( bf.l_len = len; xfs_ilock(ip, XFS_IOLOCK_EXCL); - error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, - 0, XFS_ATTR_NOLOCK); + error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, + 0, XFS_ATTR_NOLOCK); if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && offset + len > i_size_read(inode)) new_size = offset + len; @@ -585,7 +585,7 @@ xfs_vn_fallocate( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = new_size; - error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); + error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); } xfs_iunlock(ip, XFS_IOLOCK_EXCL); diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 072050f8d34..1bf47f219c9 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -255,8 +255,6 @@ xfs_read( iocb->ki_pos = *offset; ret = generic_file_aio_read(iocb, iovp, segs, *offset); - if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) - ret = wait_on_sync_kiocb(iocb); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -774,9 +772,6 @@ write_retry: current->backing_dev_info = NULL; - if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) - ret = wait_on_sync_kiocb(iocb); - isize = i_size_read(inode); if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) *offset = isize; @@ -811,7 +806,7 @@ write_retry: XFS_STATS_ADD(xs_write_bytes, ret); /* Handle various SYNC-type writes */ - if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { + if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { loff_t end = pos + ret - 1; int error2; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 18a4b8e11df..1bfb0e98019 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -930,13 +930,39 @@ xfs_fs_alloc_inode( */ STATIC void xfs_fs_destroy_inode( - struct inode *inode) + struct inode *inode) { - xfs_inode_t *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(inode); + + xfs_itrace_entry(ip); XFS_STATS_INC(vn_reclaim); - if (xfs_reclaim(ip)) - panic("%s: cannot reclaim 0x%p\n", __func__, inode); + + /* bad inode, get out here ASAP */ + if (is_bad_inode(inode)) + goto out_reclaim; + + xfs_ioend_wait(ip); + + ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); + + /* + * We should never get here with one of the reclaim flags already set. + */ + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); + + /* + * If we have nothing to flush with this inode then complete the + * teardown now, otherwise delay the flush operation. + */ + if (!xfs_inode_clean(ip)) { + xfs_inode_set_reclaim_tag(ip); + return; + } + +out_reclaim: + xfs_ireclaim(ip); } /* @@ -973,7 +999,6 @@ xfs_fs_inode_init_once( mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, "xfsino", ip->i_ino); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); } /* @@ -1075,6 +1100,20 @@ xfs_fs_clear_inode( XFS_STATS_INC(vn_remove); XFS_STATS_DEC(vn_active); + /* + * The iolock is used by the file system to coordinate reads, + * writes, and block truncates. Up to this point the lock + * protected concurrent accesses by users of the inode. But + * from here forward we're doing some final processing of the + * inode because we're done with it, and although we reuse the + * iolock for protection it is really a distinct lock class + * (in the lockdep sense) from before. To keep lockdep happy + * (and basically indicate what we are doing), we explicitly + * re-init the iolock here. + */ + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + xfs_inactive(ip); } @@ -1092,8 +1131,6 @@ xfs_fs_put_super( struct super_block *sb) { struct xfs_mount *mp = XFS_M(sb); - struct xfs_inode *rip = mp->m_rootip; - int unmount_event_flags = 0; xfs_syncd_stop(mp); @@ -1109,20 +1146,7 @@ xfs_fs_put_super( xfs_sync_attr(mp, 0); } -#ifdef HAVE_DMAPI - if (mp->m_flags & XFS_MOUNT_DMAPI) { - unmount_event_flags = - (mp->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ? - 0 : DM_FLAGS_UNWANTED; - /* - * Ignore error from dmapi here, first unmount is not allowed - * to fail anyway, and second we wouldn't want to fail a - * unmount because of dmapi. - */ - XFS_SEND_PREUNMOUNT(mp, rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL, - NULL, NULL, 0, 0, unmount_event_flags); - } -#endif + XFS_SEND_PREUNMOUNT(mp); /* * Blow away any referenced inode in the filestreams cache. @@ -1133,10 +1157,7 @@ xfs_fs_put_super( XFS_bflush(mp->m_ddev_targp); - if (mp->m_flags & XFS_MOUNT_DMAPI) { - XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0, - unmount_event_flags); - } + XFS_SEND_UNMOUNT(mp); xfs_unmountfs(mp); xfs_freesb(mp); diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 961df0a22c7..d895a3a960f 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -663,10 +663,9 @@ xfs_syncd_stop( kthread_stop(mp->m_sync_task); } -int +STATIC int xfs_reclaim_inode( xfs_inode_t *ip, - int locked, int sync_mode) { xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); @@ -682,10 +681,6 @@ xfs_reclaim_inode( !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { spin_unlock(&ip->i_flags_lock); write_unlock(&pag->pag_ici_lock); - if (locked) { - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } return -EAGAIN; } __xfs_iflags_set(ip, XFS_IRECLAIM); @@ -704,10 +699,8 @@ xfs_reclaim_inode( * We get the flush lock regardless, though, just to make sure * we don't free it while it is being flushed. */ - if (!locked) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_iflock(ip); - } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_iflock(ip); /* * In the case of a forced shutdown we rely on xfs_iflush() to @@ -778,7 +771,7 @@ xfs_reclaim_inode_now( } read_unlock(&pag->pag_ici_lock); - return xfs_reclaim_inode(ip, 0, flags); + return xfs_reclaim_inode(ip, flags); } int diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index 27920eb7a82..a500b4d9183 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -44,7 +44,6 @@ void xfs_quiesce_attr(struct xfs_mount *mp); void xfs_flush_inodes(struct xfs_inode *ip); -int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index ad7fbead4c9..00cabf5354d 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -36,7 +36,6 @@ struct attrlist_cursor_kern; /* * Flags for read/write calls - same values as IRIX */ -#define IO_ISAIO 0x00001 /* don't wait for completion */ #define IO_ISDIRECT 0x00004 /* bypass page cache */ #define IO_INVIS 0x00020 /* don't update inode timestamps */ diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h index 6f4fd37c67a..d2d20462fd4 100644 --- a/fs/xfs/support/debug.h +++ b/fs/xfs/support/debug.h @@ -41,10 +41,6 @@ extern void assfail(char *expr, char *f, int l); # define STATIC static noinline #endif -#ifndef STATIC_INLINE -# define STATIC_INLINE static inline -#endif - #else /* DEBUG */ #define ASSERT(expr) \ @@ -54,19 +50,5 @@ extern void assfail(char *expr, char *f, int l); # define STATIC noinline #endif -/* - * We stop inlining of inline functions in debug mode. - * Unfortunately, this means static inline in header files - * get multiple definitions, so they need to remain static. - * This then gives tonnes of warnings about unused but defined - * functions, so we need to add the unused attribute to prevent - * these spurious warnings. - */ -#ifndef STATIC_INLINE -# define STATIC_INLINE static __attribute__ ((unused)) noinline -#endif - #endif /* DEBUG */ - - #endif /* __XFS_SUPPORT_DEBUG_H__ */ diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 4ece1906bd4..8fe6f6b78a4 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -123,9 +123,13 @@ xfs_inode_hasattr( * Overall external interface routines. *========================================================================*/ -int -xfs_attr_fetch(xfs_inode_t *ip, struct xfs_name *name, - char *value, int *valuelenp, int flags) +STATIC int +xfs_attr_get_int( + struct xfs_inode *ip, + struct xfs_name *name, + char *value, + int *valuelenp, + int flags) { xfs_da_args_t args; int error; @@ -188,7 +192,7 @@ xfs_attr_get( return error; xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_attr_fetch(ip, &xname, value, valuelenp, flags); + error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags); xfs_iunlock(ip, XFS_ILOCK_SHARED); return(error); } @@ -2143,8 +2147,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, blkcnt, - XFS_BUF_LOCK | XBF_DONT_BLOCK); + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, + XFS_BUF_LOCK | XBF_DONT_BLOCK); ASSERT(bp); ASSERT(!XFS_BUF_GETERROR(bp)); diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index fb3b2a68b9b..12f0be3a73d 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -131,7 +131,6 @@ typedef struct xfs_attr_list_context { */ int xfs_attr_calc_size(struct xfs_inode *, int, int, int *); int xfs_attr_inactive(struct xfs_inode *dp); -int xfs_attr_fetch(struct xfs_inode *, struct xfs_name *, char *, int *, int); int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_list_int(struct xfs_attr_list_context *); diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index afdc8911637..0b687351293 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -98,7 +98,7 @@ STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); * If namespace bits don't match return 0. * If all match then return 1. */ -STATIC_INLINE int +STATIC int xfs_attr_namesp_match(int arg_flags, int ondisk_flags) { return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index eb7b702d069..6f5ccede63f 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -98,8 +98,7 @@ xfs_bmdr_to_bmbt( * This code must be in sync with the routines xfs_bmbt_get_startoff, * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state. */ - -STATIC_INLINE void +STATIC void __xfs_bmbt_get_all( __uint64_t l0, __uint64_t l1, diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h index f655f7dc334..4aba67c5f64 100644 --- a/fs/xfs/xfs_filestream.h +++ b/fs/xfs/xfs_filestream.h @@ -79,7 +79,7 @@ extern ktrace_t *xfs_filestreams_trace_buf; * the cache that reference per-ag array elements that have since been * reallocated. */ -STATIC_INLINE int +static inline int xfs_filestream_peek_ag( xfs_mount_t *mp, xfs_agnumber_t agno) @@ -87,7 +87,7 @@ xfs_filestream_peek_ag( return atomic_read(&mp->m_perag[agno].pagf_fstrms); } -STATIC_INLINE int +static inline int xfs_filestream_get_ag( xfs_mount_t *mp, xfs_agnumber_t agno) @@ -95,7 +95,7 @@ xfs_filestream_get_ag( return atomic_inc_return(&mp->m_perag[agno].pagf_fstrms); } -STATIC_INLINE int +static inline int xfs_filestream_put_ag( xfs_mount_t *mp, xfs_agnumber_t agno) @@ -122,7 +122,7 @@ int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp); /* filestreams for the inode? */ -STATIC_INLINE int +static inline int xfs_inode_is_filestream( struct xfs_inode *ip) { diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 2d0b3e1da9e..36079aa9134 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -201,8 +201,8 @@ xfs_growfs_data_private( * AG freelist header block */ bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); agf = XFS_BUF_TO_AGF(bp); memset(agf, 0, mp->m_sb.sb_sectsize); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); @@ -233,8 +233,8 @@ xfs_growfs_data_private( * AG inode header block */ bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); agi = XFS_BUF_TO_AGI(bp); memset(agi, 0, mp->m_sb.sb_sectsize); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); @@ -257,8 +257,9 @@ xfs_growfs_data_private( * BNO btree root block */ bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), + XBF_LOCK | XBF_MAPPED); block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); @@ -278,8 +279,9 @@ xfs_growfs_data_private( * CNT btree root block */ bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), + XBF_LOCK | XBF_MAPPED); block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); @@ -300,8 +302,9 @@ xfs_growfs_data_private( * INO btree root block */ bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), + XBF_LOCK | XBF_MAPPED); block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); @@ -611,7 +614,7 @@ xfs_fs_log_dummy( xfs_inode_t *ip; int error; - tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); + tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); if (error) { xfs_trans_cancel(tp, 0); diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 0785797db82..cb907ba69c4 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -425,7 +425,7 @@ xfs_ialloc_ag_alloc( return 0; } -STATIC_INLINE xfs_agnumber_t +STATIC xfs_agnumber_t xfs_ialloc_next_ag( xfs_mount_t *mp) { diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 80e526489be..073bb4a26b1 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -73,6 +73,9 @@ xfs_inode_alloc( ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); /* initialise the xfs inode */ ip->i_ino = ino; @@ -290,7 +293,7 @@ xfs_iget_cache_miss( struct xfs_inode **ipp, xfs_daddr_t bno, int flags, - int lock_flags) __releases(pag->pag_ici_lock) + int lock_flags) { struct xfs_inode *ip; int error; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 67ae5555a30..7294abce6ef 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -860,8 +860,15 @@ xfs_iomap_write_unwritten( * set up a transaction to convert the range of extents * from unwritten to real. Do allocations in a loop until * we have covered the range passed in. + * + * Note that we open code the transaction allocation here + * to pass KM_NOFS--we can't risk to recursing back into + * the filesystem here as we might be asked to write out + * the same inode that we complete here and might deadlock + * on the iolock. */ - tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); + xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); + tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); tp->t_flags |= XFS_TRANS_RESERVE; error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp), 0, diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index fb17f8226b0..9d4fdcaf897 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -225,16 +225,10 @@ xlog_header_check_dump( xfs_mount_t *mp, xlog_rec_header_t *head) { - int b; - - cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__); - for (b = 0; b < 16; b++) - cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&mp->m_sb.sb_uuid)[b]); - cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT); - cmn_err(CE_DEBUG, " log : uuid = "); - for (b = 0; b < 16; b++) - cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&head->h_fs_uuid)[b]); - cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt)); + cmn_err(CE_DEBUG, "%s: SB : uuid = %pU, fmt = %d\n", + __func__, &mp->m_sb.sb_uuid, XLOG_FMT); + cmn_err(CE_DEBUG, " log : uuid = %pU, fmt = %d\n", + &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); } #else #define xlog_header_check_dump(mp, head) @@ -2206,6 +2200,7 @@ xlog_recover_do_buffer_trans( xfs_daddr_t blkno; int len; ushort flags; + uint buf_flags; buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr; @@ -2246,12 +2241,11 @@ xlog_recover_do_buffer_trans( } mp = log->l_mp; - if (flags & XFS_BLI_INODE_BUF) { - bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len, - XFS_BUF_LOCK); - } else { - bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0); - } + buf_flags = XFS_BUF_LOCK; + if (!(flags & XFS_BLI_INODE_BUF)) + buf_flags |= XFS_BUF_MAPPED; + + bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); if (XFS_BUF_ISERROR(bp)) { xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, bp, blkno); @@ -2350,8 +2344,8 @@ xlog_recover_do_inode_trans( goto error; } - bp = xfs_buf_read_flags(mp->m_ddev_targp, in_f->ilf_blkno, - in_f->ilf_len, XFS_BUF_LOCK); + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, + XFS_BUF_LOCK); if (XFS_BUF_ISERROR(bp)) { xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, bp, in_f->ilf_blkno); @@ -3517,7 +3511,7 @@ xlog_do_recovery_pass( { xlog_rec_header_t *rhead; xfs_daddr_t blk_no; - xfs_caddr_t bufaddr, offset; + xfs_caddr_t offset; xfs_buf_t *hbp, *dbp; int error = 0, h_size; int bblks, split_bblks; @@ -3610,7 +3604,7 @@ xlog_do_recovery_pass( /* * Check for header wrapping around physical end-of-log */ - offset = NULL; + offset = XFS_BUF_PTR(hbp); split_hblks = 0; wrapped_hblks = 0; if (blk_no + hblks <= log->l_logBBsize) { @@ -3646,9 +3640,8 @@ xlog_do_recovery_pass( * - order is important. */ wrapped_hblks = hblks - split_hblks; - bufaddr = XFS_BUF_PTR(hbp); error = XFS_BUF_SET_PTR(hbp, - bufaddr + BBTOB(split_hblks), + offset + BBTOB(split_hblks), BBTOB(hblks - split_hblks)); if (error) goto bread_err2; @@ -3658,14 +3651,10 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - error = XFS_BUF_SET_PTR(hbp, bufaddr, + error = XFS_BUF_SET_PTR(hbp, offset, BBTOB(hblks)); if (error) goto bread_err2; - - if (!offset) - offset = xlog_align(log, 0, - wrapped_hblks, hbp); } rhead = (xlog_rec_header_t *)offset; error = xlog_valid_rec_header(log, rhead, @@ -3685,7 +3674,7 @@ xlog_do_recovery_pass( } else { /* This log record is split across the * physical end of log */ - offset = NULL; + offset = XFS_BUF_PTR(dbp); split_bblks = 0; if (blk_no != log->l_logBBsize) { /* some data is before the physical @@ -3714,9 +3703,8 @@ xlog_do_recovery_pass( * _first_, then the log start (LR header end) * - order is important. */ - bufaddr = XFS_BUF_PTR(dbp); error = XFS_BUF_SET_PTR(dbp, - bufaddr + BBTOB(split_bblks), + offset + BBTOB(split_bblks), BBTOB(bblks - split_bblks)); if (error) goto bread_err2; @@ -3727,13 +3715,9 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size); + error = XFS_BUF_SET_PTR(dbp, offset, h_size); if (error) goto bread_err2; - - if (!offset) - offset = xlog_align(log, wrapped_hblks, - bblks - split_bblks, dbp); } xlog_unpack_data(rhead, offset, log); if ((error = xlog_recover_process_data(log, rhash, diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 8b6c9e807ef..bfffd6334ab 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -583,8 +583,8 @@ xfs_readsb(xfs_mount_t *mp, int flags) sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED; - bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), extra_flags); + bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), + extra_flags); if (!bp || XFS_BUF_ISERROR(bp)) { xfs_fs_mount_cmn_err(flags, "SB read failed"); error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; @@ -624,8 +624,8 @@ xfs_readsb(xfs_mount_t *mp, int flags) XFS_BUF_UNMANAGE(bp); xfs_buf_relse(bp); sector_size = mp->m_sb.sb_sectsize; - bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), extra_flags); + bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, + BTOBB(sector_size), extra_flags); if (!bp || XFS_BUF_ISERROR(bp)) { xfs_fs_mount_cmn_err(flags, "SB re-read failed"); error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; @@ -1471,7 +1471,7 @@ xfs_log_sbcount( if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) return 0; - tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT); + tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, XFS_DEFAULT_LOG_COUNT); if (error) { @@ -2123,7 +2123,7 @@ xfs_icsb_destroy_counters( mutex_destroy(&mp->m_icsb_mutex); } -STATIC_INLINE void +STATIC void xfs_icsb_lock_cntr( xfs_icsb_cnts_t *icsbp) { @@ -2132,7 +2132,7 @@ xfs_icsb_lock_cntr( } } -STATIC_INLINE void +STATIC void xfs_icsb_unlock_cntr( xfs_icsb_cnts_t *icsbp) { @@ -2140,7 +2140,7 @@ xfs_icsb_unlock_cntr( } -STATIC_INLINE void +STATIC void xfs_icsb_lock_all_counters( xfs_mount_t *mp) { @@ -2153,7 +2153,7 @@ xfs_icsb_lock_all_counters( } } -STATIC_INLINE void +STATIC void xfs_icsb_unlock_all_counters( xfs_mount_t *mp) { @@ -2389,12 +2389,12 @@ xfs_icsb_modify_counters( { xfs_icsb_cnts_t *icsbp; long long lcounter; /* long counter for 64 bit fields */ - int cpu, ret = 0; + int ret = 0; might_sleep(); again: - cpu = get_cpu(); - icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu); + preempt_disable(); + icsbp = this_cpu_ptr(mp->m_sb_cnts); /* * if the counter is disabled, go to slow path @@ -2438,11 +2438,11 @@ again: break; } xfs_icsb_unlock_cntr(icsbp); - put_cpu(); + preempt_enable(); return 0; slow_path: - put_cpu(); + preempt_enable(); /* * serialise with a mutex so we don't burn lots of cpu on @@ -2490,7 +2490,7 @@ slow_path: balance_counter: xfs_icsb_unlock_cntr(icsbp); - put_cpu(); + preempt_enable(); /* * We may have multiple threads here if multiple per-cpu diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a6c023bc0fb..1df7e450296 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -93,6 +93,9 @@ typedef struct xfs_dmops { xfs_send_unmount_t xfs_send_unmount; } xfs_dmops_t; +#define XFS_DMAPI_UNMOUNT_FLAGS(mp) \ + (((mp)->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ? 0 : DM_FLAGS_UNWANTED) + #define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \ (*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock) #define XFS_SEND_MMAP(mp, vma,fl) \ @@ -101,12 +104,24 @@ typedef struct xfs_dmops { (*(mp)->m_dm_ops->xfs_send_destroy)(ip,right) #define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ (*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl) -#define XFS_SEND_PREUNMOUNT(mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ - (*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT,mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) #define XFS_SEND_MOUNT(mp,right,path,name) \ (*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name) -#define XFS_SEND_UNMOUNT(mp, ip,right,mode,rval,fl) \ - (*(mp)->m_dm_ops->xfs_send_unmount)(mp,ip,right,mode,rval,fl) +#define XFS_SEND_PREUNMOUNT(mp) \ +do { \ + if (mp->m_flags & XFS_MOUNT_DMAPI) { \ + (*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT, mp, \ + (mp)->m_rootip, DM_RIGHT_NULL, \ + (mp)->m_rootip, DM_RIGHT_NULL, \ + NULL, NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \ + } \ +} while (0) +#define XFS_SEND_UNMOUNT(mp) \ +do { \ + if (mp->m_flags & XFS_MOUNT_DMAPI) { \ + (*(mp)->m_dm_ops->xfs_send_unmount)(mp, (mp)->m_rootip, \ + DM_RIGHT_NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \ + } \ +} while (0) #ifdef HAVE_PERCPU_SB @@ -387,13 +402,13 @@ xfs_put_perag(struct xfs_mount *mp, xfs_perag_t *pag) * Per-cpu superblock locking functions */ #ifdef HAVE_PERCPU_SB -STATIC_INLINE void +static inline void xfs_icsb_lock(xfs_mount_t *mp) { mutex_lock(&mp->m_icsb_mutex); } -STATIC_INLINE void +static inline void xfs_icsb_unlock(xfs_mount_t *mp) { mutex_unlock(&mp->m_icsb_mutex); diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index 3f816ad7ff1..4c199d18f85 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -277,10 +277,10 @@ xfs_read_buf( xfs_buf_t *bp; int error; - if (flags) - bp = xfs_buf_read_flags(target, blkno, len, flags); - else - bp = xfs_buf_read(target, blkno, len, flags); + if (!flags) + flags = XBF_LOCK | XBF_MAPPED; + + bp = xfs_buf_read(target, blkno, len, flags); if (!bp) return XFS_ERROR(EIO); error = XFS_BUF_GETERROR(bp); @@ -336,3 +336,25 @@ xfs_bwrite( } return (error); } + +/* + * helper function to extract extent size hint from inode + */ +xfs_extlen_t +xfs_get_extsz_hint( + struct xfs_inode *ip) +{ + xfs_extlen_t extsz; + + if (unlikely(XFS_IS_REALTIME_INODE(ip))) { + extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) + ? ip->i_d.di_extsize + : ip->i_mount->m_sb.sb_rextsize; + ASSERT(extsz); + } else { + extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) + ? ip->i_d.di_extsize : 0; + } + + return extsz; +} diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index f5e4874c37d..571f2174435 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -37,34 +37,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) } /* - * Flags for xfs_free_eofblocks - */ -#define XFS_FREE_EOF_LOCK (1<<0) -#define XFS_FREE_EOF_NOLOCK (1<<1) - - -/* - * helper function to extract extent size hint from inode - */ -STATIC_INLINE xfs_extlen_t -xfs_get_extsz_hint( - xfs_inode_t *ip) -{ - xfs_extlen_t extsz; - - if (unlikely(XFS_IS_REALTIME_INODE(ip))) { - extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) - ? ip->i_d.di_extsize - : ip->i_mount->m_sb.sb_rextsize; - ASSERT(extsz); - } else { - extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) - ? ip->i_d.di_extsize : 0; - } - return extsz; -} - -/* * Prototypes for functions in xfs_rw.c. */ extern int xfs_write_clear_setuid(struct xfs_inode *ip); @@ -76,5 +48,6 @@ extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, struct xfs_buf **bpp); extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, xfs_buf_t *bp, xfs_daddr_t blkno); +extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); #endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 66b849358e6..237badcbac3 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -236,19 +236,20 @@ xfs_trans_alloc( uint type) { xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); - return _xfs_trans_alloc(mp, type); + return _xfs_trans_alloc(mp, type, KM_SLEEP); } xfs_trans_t * _xfs_trans_alloc( xfs_mount_t *mp, - uint type) + uint type, + uint memflags) { xfs_trans_t *tp; atomic_inc(&mp->m_active_trans); - tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); + tp = kmem_zone_zalloc(xfs_trans_zone, memflags); tp->t_magic = XFS_TRANS_MAGIC; tp->t_type = type; tp->t_mountp = mp; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index ed47fc77759..a0574f593f5 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -924,7 +924,7 @@ typedef struct xfs_trans { * XFS transaction mechanism exported interfaces. */ xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); -xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint); +xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, uint); xfs_trans_t *xfs_trans_dup(xfs_trans_t *); int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, uint, uint); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 218829e6a15..03a1f701fea 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -79,11 +79,8 @@ xfs_trans_get_buf(xfs_trans_t *tp, /* * Default to a normal get_buf() call if the tp is NULL. */ - if (tp == NULL) { - bp = xfs_buf_get_flags(target_dev, blkno, len, - flags | BUF_BUSY); - return(bp); - } + if (tp == NULL) + return xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY); /* * If we find the buffer in the cache with this transaction @@ -129,7 +126,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, * easily deadlock with our current transaction as well as cause * us to run out of stack space. */ - bp = xfs_buf_get_flags(target_dev, blkno, len, flags | BUF_BUSY); + bp = xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY); if (bp == NULL) { return NULL; } @@ -302,7 +299,7 @@ xfs_trans_read_buf( * Default to a normal get_buf() call if the tp is NULL. */ if (tp == NULL) { - bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY); + bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY); if (!bp) return (flags & XFS_BUF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); @@ -398,7 +395,7 @@ xfs_trans_read_buf( * easily deadlock with our current transaction as well as cause * us to run out of stack space. */ - bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY); + bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY); if (bp == NULL) { *bpp = NULL; return 0; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index b572f7e840e..578f3f59b78 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -538,9 +538,8 @@ xfs_readlink_bmap( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt), - XBF_LOCK | XBF_MAPPED | - XBF_DONT_BLOCK); + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), + XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK); error = XFS_BUF_GETERROR(bp); if (error) { xfs_ioerror_alert("xfs_readlink", @@ -709,6 +708,11 @@ xfs_fsync( } /* + * Flags for xfs_free_eofblocks + */ +#define XFS_FREE_EOF_TRYLOCK (1<<0) + +/* * This is called by xfs_inactive to free any blocks beyond eof * when the link count isn't zero and by xfs_dm_punch_hole() when * punching a hole to EOF. @@ -726,7 +730,6 @@ xfs_free_eofblocks( xfs_filblks_t map_len; int nimaps; xfs_bmbt_irec_t imap; - int use_iolock = (flags & XFS_FREE_EOF_LOCK); /* * Figure out if there are any blocks beyond the end @@ -768,14 +771,19 @@ xfs_free_eofblocks( * cache and we can't * do that within a transaction. */ - if (use_iolock) + if (flags & XFS_FREE_EOF_TRYLOCK) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + xfs_trans_cancel(tp, 0); + return 0; + } + } else { xfs_ilock(ip, XFS_IOLOCK_EXCL); + } error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, ip->i_size); if (error) { xfs_trans_cancel(tp, 0); - if (use_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; } @@ -812,8 +820,7 @@ xfs_free_eofblocks( error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); } - xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL) - : XFS_ILOCK_EXCL)); + xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL); } return error; } @@ -1113,7 +1120,17 @@ xfs_release( (ip->i_df.if_flags & XFS_IFEXTENTS)) && (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { - error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK); + + /* + * If we can't get the iolock just skip truncating + * the blocks past EOF because we could deadlock + * with the mmap_sem otherwise. We'll get another + * chance to drop them once the last reference to + * the inode is dropped, so we'll never leak blocks + * permanently. + */ + error = xfs_free_eofblocks(mp, ip, + XFS_FREE_EOF_TRYLOCK); if (error) return error; } @@ -1184,7 +1201,7 @@ xfs_inactive( (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || (ip->i_delayed_blks != 0)))) { - error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK); + error = xfs_free_eofblocks(mp, ip, 0); if (error) return VN_INACTIVE_CACHE; } @@ -2456,46 +2473,6 @@ xfs_set_dmattrs( return error; } -int -xfs_reclaim( - xfs_inode_t *ip) -{ - - xfs_itrace_entry(ip); - - ASSERT(!VN_MAPPED(VFS_I(ip))); - - /* bad inode, get out here ASAP */ - if (is_bad_inode(VFS_I(ip))) { - xfs_ireclaim(ip); - return 0; - } - - xfs_ioend_wait(ip); - - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); - - /* - * If we have nothing to flush with this inode then complete the - * teardown now, otherwise break the link between the xfs inode and the - * linux inode and clean up the xfs inode later. This avoids flushing - * the inode to disk during the delete operation itself. - * - * When breaking the link, we need to set the XFS_IRECLAIMABLE flag - * first to ensure that xfs_iunpin() will never see an xfs inode - * that has a linux inode being reclaimed. Synchronisation is provided - * by the i_flags_lock. - */ - if (!ip->i_update_core && (ip->i_itemp == NULL)) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_iflock(ip); - xfs_iflags_set(ip, XFS_IRECLAIMABLE); - return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC); - } - xfs_inode_set_reclaim_tag(ip); - return 0; -} - /* * xfs_alloc_file_space() * This routine allocates disk space for the given file. diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index a9e102de71a..167a467403a 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -38,7 +38,6 @@ int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, mode_t mode, struct xfs_inode **ipp, cred_t *credp); int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); -int xfs_reclaim(struct xfs_inode *ip); int xfs_change_file_space(struct xfs_inode *ip, int cmd, xfs_flock64_t *bf, xfs_off_t offset, int attr_flags); int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, |