summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/Locking4
-rw-r--r--Documentation/sysctl/fs.txt42
-rw-r--r--arch/powerpc/platforms/cell/spufs/inode.c77
-rw-r--r--arch/powerpc/platforms/cell/spufs/syscalls.c2
-rw-r--r--drivers/base/devtmpfs.c9
-rw-r--r--drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c2
-rw-r--r--drivers/staging/bcm/Misc.c31
-rw-r--r--drivers/staging/gdm72xx/sdio_boot.c7
-rw-r--r--drivers/staging/gdm72xx/usb_boot.c22
-rw-r--r--drivers/target/target_core_file.c32
-rw-r--r--drivers/usb/gadget/storage_common.c12
-rw-r--r--drivers/usb/gadget/u_uac1.c6
-rw-r--r--drivers/video/fb_defio.c2
-rw-r--r--fs/9p/vfs_file.c3
-rw-r--r--fs/btrfs/disk-io.c3
-rw-r--r--fs/btrfs/file.c3
-rw-r--r--fs/btrfs/inode.c6
-rw-r--r--fs/btrfs/ioctl.c15
-rw-r--r--fs/btrfs/transaction.c7
-rw-r--r--fs/buffer.c28
-rw-r--r--fs/ceph/addr.c3
-rw-r--r--fs/ecryptfs/inode.c30
-rw-r--r--fs/exec.c19
-rw-r--r--fs/ext2/inode.c5
-rw-r--r--fs/ext2/super.c33
-rw-r--r--fs/ext4/inode.c15
-rw-r--r--fs/ext4/mmp.c6
-rw-r--r--fs/ext4/super.c31
-rw-r--r--fs/fat/file.c15
-rw-r--r--fs/file_table.c4
-rw-r--r--fs/fuse/file.c4
-rw-r--r--fs/gfs2/file.c18
-rw-r--r--fs/gfs2/trans.c4
-rw-r--r--fs/inode.c12
-rw-r--r--fs/internal.h4
-rw-r--r--fs/lockd/clntproc.c14
-rw-r--r--fs/lockd/svc4proc.c1
-rw-r--r--fs/lockd/svclock.c1
-rw-r--r--fs/lockd/svcproc.c1
-rw-r--r--fs/namei.c313
-rw-r--r--fs/namespace.c97
-rw-r--r--fs/nfsd/nfs4recover.c9
-rw-r--r--fs/nfsd/nfsfh.c1
-rw-r--r--fs/nfsd/nfsproc.c9
-rw-r--r--fs/nfsd/vfs.c79
-rw-r--r--fs/nfsd/vfs.h11
-rw-r--r--fs/nilfs2/file.c18
-rw-r--r--fs/nilfs2/ioctl.c2
-rw-r--r--fs/nilfs2/segment.c5
-rw-r--r--fs/ntfs/file.c3
-rw-r--r--fs/ocfs2/file.c11
-rw-r--r--fs/ocfs2/ioctl.c14
-rw-r--r--fs/ocfs2/journal.c7
-rw-r--r--fs/ocfs2/mmap.c2
-rw-r--r--fs/ocfs2/refcounttree.c11
-rw-r--r--fs/open.c15
-rw-r--r--fs/pipe.c75
-rw-r--r--fs/splice.c3
-rw-r--r--fs/super.c252
-rw-r--r--fs/sysfs/bin.c2
-rw-r--r--fs/xfs/xfs_aops.c18
-rw-r--r--fs/xfs/xfs_file.c10
-rw-r--r--fs/xfs/xfs_ioctl.c55
-rw-r--r--fs/xfs/xfs_ioctl32.c12
-rw-r--r--fs/xfs/xfs_iomap.c4
-rw-r--r--fs/xfs/xfs_mount.c2
-rw-r--r--fs/xfs/xfs_mount.h3
-rw-r--r--fs/xfs/xfs_sync.c2
-rw-r--r--fs/xfs/xfs_trans.c17
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--include/linux/audit.h4
-rw-r--r--include/linux/fs.h154
-rw-r--r--include/linux/mm.h1
-rw-r--r--include/linux/namei.h1
-rw-r--r--include/linux/nfsd/nfsfh.h1
-rw-r--r--include/linux/pipe_fs_i.h2
-rw-r--r--kernel/audit.c21
-rw-r--r--kernel/sysctl.c18
-rw-r--r--lib/percpu_counter.c14
-rw-r--r--mm/filemap.c31
-rw-r--r--mm/filemap_xip.c6
-rw-r--r--mm/memory.c14
-rw-r--r--net/unix/af_unix.c93
-rw-r--r--sound/sound_firmware.c8
84 files changed, 1326 insertions, 639 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 7f647e17830c..0f103e39b4f6 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -138,8 +138,8 @@ evict_inode:
put_super: write
write_super: read
sync_fs: read
-freeze_fs: read
-unfreeze_fs: read
+freeze_fs: write
+unfreeze_fs: write
statfs: maybe(read) (see below)
remount_fs: write
umount_begin: no
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index 8c235b6e4246..88152f214f48 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/fs:
- nr_open
- overflowuid
- overflowgid
+- protected_hardlinks
+- protected_symlinks
- suid_dumpable
- super-max
- super-nr
@@ -157,6 +159,46 @@ The default is 65534.
==============================================================
+protected_hardlinks:
+
+A long-standing class of security issues is the hardlink-based
+time-of-check-time-of-use race, most commonly seen in world-writable
+directories like /tmp. The common method of exploitation of this flaw
+is to cross privilege boundaries when following a given hardlink (i.e. a
+root process follows a hardlink created by another user). Additionally,
+on systems without separated partitions, this stops unauthorized users
+from "pinning" vulnerable setuid/setgid files against being upgraded by
+the administrator, or linking to special files.
+
+When set to "0", hardlink creation behavior is unrestricted.
+
+When set to "1" hardlinks cannot be created by users if they do not
+already own the source file, or do not have read/write access to it.
+
+This protection is based on the restrictions in Openwall and grsecurity.
+
+==============================================================
+
+protected_symlinks:
+
+A long-standing class of security issues is the symlink-based
+time-of-check-time-of-use race, most commonly seen in world-writable
+directories like /tmp. The common method of exploitation of this flaw
+is to cross privilege boundaries when following a given symlink (i.e. a
+root process follows a symlink belonging to another user). For a likely
+incomplete list of hundreds of examples across the years, please see:
+http://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=/tmp
+
+When set to "0", symlink following behavior is unrestricted.
+
+When set to "1" symlinks are permitted to be followed only when outside
+a sticky world-writable directory, or when the uid of the symlink and
+follower match, or when the directory owner matches the symlink's owner.
+
+This protection is based on the restrictions in Openwall and grsecurity.
+
+==============================================================
+
suid_dumpable:
This value can be used to query and set the core dump mode for setuid
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index d544d7816df3..dba1ce235da5 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -186,10 +186,13 @@ static void spufs_prune_dir(struct dentry *dir)
static int spufs_rmdir(struct inode *parent, struct dentry *dir)
{
/* remove all entries */
+ int res;
spufs_prune_dir(dir);
d_drop(dir);
-
- return simple_rmdir(parent, dir);
+ res = simple_rmdir(parent, dir);
+ /* We have to give up the mm_struct */
+ spu_forget(SPUFS_I(dir->d_inode)->i_ctx);
+ return res;
}
static int spufs_fill_dir(struct dentry *dir,
@@ -245,9 +248,6 @@ static int spufs_dir_close(struct inode *inode, struct file *file)
mutex_unlock(&parent->i_mutex);
WARN_ON(ret);
- /* We have to give up the mm_struct */
- spu_forget(ctx);
-
return dcache_dir_close(inode, file);
}
@@ -450,28 +450,24 @@ spufs_create_context(struct inode *inode, struct dentry *dentry,
struct spu_context *neighbor;
struct path path = {.mnt = mnt, .dentry = dentry};
- ret = -EPERM;
if ((flags & SPU_CREATE_NOSCHED) &&
!capable(CAP_SYS_NICE))
- goto out_unlock;
+ return -EPERM;
- ret = -EINVAL;
if ((flags & (SPU_CREATE_NOSCHED | SPU_CREATE_ISOLATE))
== SPU_CREATE_ISOLATE)
- goto out_unlock;
+ return -EINVAL;
- ret = -ENODEV;
if ((flags & SPU_CREATE_ISOLATE) && !isolated_loader)
- goto out_unlock;
+ return -ENODEV;
gang = NULL;
neighbor = NULL;
affinity = flags & (SPU_CREATE_AFFINITY_MEM | SPU_CREATE_AFFINITY_SPU);
if (affinity) {
gang = SPUFS_I(inode)->i_gang;
- ret = -EINVAL;
if (!gang)
- goto out_unlock;
+ return -EINVAL;
mutex_lock(&gang->aff_mutex);
neighbor = spufs_assert_affinity(flags, gang, aff_filp);
if (IS_ERR(neighbor)) {
@@ -492,22 +488,12 @@ spufs_create_context(struct inode *inode, struct dentry *dentry,
}
ret = spufs_context_open(&path);
- if (ret < 0) {
+ if (ret < 0)
WARN_ON(spufs_rmdir(inode, dentry));
- if (affinity)
- mutex_unlock(&gang->aff_mutex);
- mutex_unlock(&inode->i_mutex);
- spu_forget(SPUFS_I(dentry->d_inode)->i_ctx);
- goto out;
- }
out_aff_unlock:
if (affinity)
mutex_unlock(&gang->aff_mutex);
-out_unlock:
- mutex_unlock(&inode->i_mutex);
-out:
- dput(dentry);
return ret;
}
@@ -580,18 +566,13 @@ static int spufs_create_gang(struct inode *inode,
int ret;
ret = spufs_mkgang(inode, dentry, mode & S_IRWXUGO);
- if (ret)
- goto out;
-
- ret = spufs_gang_open(&path);
- if (ret < 0) {
- int err = simple_rmdir(inode, dentry);
- WARN_ON(err);
+ if (!ret) {
+ ret = spufs_gang_open(&path);
+ if (ret < 0) {
+ int err = simple_rmdir(inode, dentry);
+ WARN_ON(err);
+ }
}
-
-out:
- mutex_unlock(&inode->i_mutex);
- dput(dentry);
return ret;
}
@@ -601,40 +582,32 @@ static struct file_system_type spufs_type;
long spufs_create(struct path *path, struct dentry *dentry,
unsigned int flags, umode_t mode, struct file *filp)
{
+ struct inode *dir = path->dentry->d_inode;
int ret;
- ret = -EINVAL;
/* check if we are on spufs */
if (path->dentry->d_sb->s_type != &spufs_type)
- goto out;
+ return -EINVAL;
/* don't accept undefined flags */
if (flags & (~SPU_CREATE_FLAG_ALL))
- goto out;
+ return -EINVAL;
/* only threads can be underneath a gang */
- if (path->dentry != path->dentry->d_sb->s_root) {
- if ((flags & SPU_CREATE_GANG) ||
- !SPUFS_I(path->dentry->d_inode)->i_gang)
- goto out;
- }
+ if (path->dentry != path->dentry->d_sb->s_root)
+ if ((flags & SPU_CREATE_GANG) || !SPUFS_I(dir)->i_gang)
+ return -EINVAL;
mode &= ~current_umask();
if (flags & SPU_CREATE_GANG)
- ret = spufs_create_gang(path->dentry->d_inode,
- dentry, path->mnt, mode);
+ ret = spufs_create_gang(dir, dentry, path->mnt, mode);
else
- ret = spufs_create_context(path->dentry->d_inode,
- dentry, path->mnt, flags, mode,
+ ret = spufs_create_context(dir, dentry, path->mnt, flags, mode,
filp);
if (ret >= 0)
- fsnotify_mkdir(path->dentry->d_inode, dentry);
- return ret;
+ fsnotify_mkdir(dir, dentry);
-out:
- mutex_unlock(&path->dentry->d_inode->i_mutex);
- dput(dentry);
return ret;
}
diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c
index 5665dcc382c7..5b7d8ffbf890 100644
--- a/arch/powerpc/platforms/cell/spufs/syscalls.c
+++ b/arch/powerpc/platforms/cell/spufs/syscalls.c
@@ -70,7 +70,7 @@ static long do_spu_create(const char __user *pathname, unsigned int flags,
ret = PTR_ERR(dentry);
if (!IS_ERR(dentry)) {
ret = spufs_create(&path, dentry, flags, mode, neighbor);
- path_put(&path);
+ done_path_create(&path, dentry);
}
return ret;
diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index d91a3a0b2325..deb4a456cf83 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -156,9 +156,7 @@ static int dev_mkdir(const char *name, umode_t mode)
if (!err)
/* mark as kernel-created inode */
dentry->d_inode->i_private = &thread;
- dput(dentry);
- mutex_unlock(&path.dentry->d_inode->i_mutex);
- path_put(&path);
+ done_path_create(&path, dentry);
return err;
}
@@ -218,10 +216,7 @@ static int handle_create(const char *nodename, umode_t mode, struct device *dev)
/* mark as kernel-created inode */
dentry->d_inode->i_private = &thread;
}
- dput(dentry);
-
- mutex_unlock(&path.dentry->d_inode->i_mutex);
- path_put(&path);
+ done_path_create(&path, dentry);
return err;
}
diff --git a/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c b/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c
index 57bf1d7ee80f..9ab24528f9b9 100644
--- a/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c
+++ b/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c
@@ -1188,7 +1188,7 @@ exit:
kfree(buf);
/* close file before return */
if (fp)
- filp_close(fp, current->files);
+ filp_close(fp, NULL);
/* restore previous address limit */
set_fs(old_fs);
diff --git a/drivers/staging/bcm/Misc.c b/drivers/staging/bcm/Misc.c
index 9a60d4cd2184..f545716c666d 100644
--- a/drivers/staging/bcm/Misc.c
+++ b/drivers/staging/bcm/Misc.c
@@ -157,12 +157,7 @@ static int create_worker_threads(struct bcm_mini_adapter *psAdapter)
static struct file *open_firmware_file(struct bcm_mini_adapter *Adapter, const char *path)
{
- struct file *flp = NULL;
- mm_segment_t oldfs;
- oldfs = get_fs();
- set_fs(get_ds());
- flp = filp_open(path, O_RDONLY, S_IRWXU);
- set_fs(oldfs);
+ struct file *flp = filp_open(path, O_RDONLY, S_IRWXU);
if (IS_ERR(flp)) {
pr_err(DRV_NAME "Unable To Open File %s, err %ld", path, PTR_ERR(flp));
flp = NULL;
@@ -183,14 +178,12 @@ static int BcmFileDownload(struct bcm_mini_adapter *Adapter, const char *path, u
{
int errorno = 0;
struct file *flp = NULL;
- mm_segment_t oldfs;
struct timeval tv = {0};
flp = open_firmware_file(Adapter, path);
if (!flp) {
- errorno = -ENOENT;
BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Unable to Open %s\n", path);
- goto exit_download;
+ return -ENOENT;
}
BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Opened file is = %s and length =0x%lx to be downloaded at =0x%x", path, (unsigned long)flp->f_dentry->d_inode->i_size, loc);
do_gettimeofday(&tv);
@@ -201,10 +194,7 @@ static int BcmFileDownload(struct bcm_mini_adapter *Adapter, const char *path, u
errorno = -EIO;
goto exit_download;
}
- oldfs = get_fs();
- set_fs(get_ds());
vfs_llseek(flp, 0, 0);
- set_fs(oldfs);
if (Adapter->bcm_file_readback_from_chip(Adapter->pvInterfaceAdapter, flp, loc)) {
BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Failed to read back firmware!");
errorno = -EIO;
@@ -212,12 +202,7 @@ static int BcmFileDownload(struct bcm_mini_adapter *Adapter, const char *path, u
}
exit_download:
- oldfs = get_fs();
- set_fs(get_ds());
- if (flp && !(IS_ERR(flp)))
- filp_close(flp, current->files);
- set_fs(oldfs);
-
+ filp_close(flp, NULL);
return errorno;
}
@@ -1056,10 +1041,8 @@ OUT:
static int bcm_parse_target_params(struct bcm_mini_adapter *Adapter)
{
struct file *flp = NULL;
- mm_segment_t oldfs = {0};
char *buff;
int len = 0;
- loff_t pos = 0;
buff = kmalloc(BUFFER_1K, GFP_KERNEL);
if (!buff)
@@ -1079,20 +1062,16 @@ static int bcm_parse_target_params(struct bcm_mini_adapter *Adapter)
Adapter->pstargetparams = NULL;
return -ENOENT;
}
- oldfs = get_fs();
- set_fs(get_ds());
- len = vfs_read(flp, (void __user __force *)buff, BUFFER_1K, &pos);
- set_fs(oldfs);
+ len = kernel_read(flp, 0, buff, BUFFER_1K);
+ filp_close(flp, NULL);
if (len != sizeof(STARGETPARAMS)) {
BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Mismatch in Target Param Structure!\n");
kfree(buff);
kfree(Adapter->pstargetparams);
Adapter->pstargetparams = NULL;
- filp_close(flp, current->files);
return -ENOENT;
}
- filp_close(flp, current->files);
/* Check for autolink in config params */
/*
diff --git a/drivers/staging/gdm72xx/sdio_boot.c b/drivers/staging/gdm72xx/sdio_boot.c
index 760efee23d4a..65624bca8b3a 100644
--- a/drivers/staging/gdm72xx/sdio_boot.c
+++ b/drivers/staging/gdm72xx/sdio_boot.c
@@ -66,9 +66,8 @@ static int download_image(struct sdio_func *func, char *img_name)
return -ENOENT;
}
- if (filp->f_dentry)
- inode = filp->f_dentry->d_inode;
- if (!inode || !S_ISREG(inode->i_mode)) {
+ inode = filp->f_dentry->d_inode;
+ if (!S_ISREG(inode->i_mode)) {
printk(KERN_ERR "Invalid file type: %s\n", img_name);
ret = -EINVAL;
goto out;
@@ -123,7 +122,7 @@ static int download_image(struct sdio_func *func, char *img_name)
pno++;
}
out:
- filp_close(filp, current->files);
+ filp_close(filp, NULL);
return ret;
}
diff --git a/drivers/staging/gdm72xx/usb_boot.c b/drivers/staging/gdm72xx/usb_boot.c
index fef290c38db6..e3dbd5a552ca 100644
--- a/drivers/staging/gdm72xx/usb_boot.c
+++ b/drivers/staging/gdm72xx/usb_boot.c
@@ -173,14 +173,12 @@ int usb_boot(struct usb_device *usbdev, u16 pid)
filp = filp_open(img_name, O_RDONLY | O_LARGEFILE, 0);
if (IS_ERR(filp)) {
printk(KERN_ERR "Can't find %s.\n", img_name);
- set_fs(fs);
ret = PTR_ERR(filp);
goto restore_fs;
}
- if (filp->f_dentry)
- inode = filp->f_dentry->d_inode;
- if (!inode || !S_ISREG(inode->i_mode)) {
+ inode = filp->f_dentry->d_inode;
+ if (!S_ISREG(inode->i_mode)) {
printk(KERN_ERR "Invalid file type: %s\n", img_name);
ret = -EINVAL;
goto out;
@@ -262,7 +260,7 @@ int usb_boot(struct usb_device *usbdev, u16 pid)
ret = -EINVAL;
}
out:
- filp_close(filp, current->files);
+ filp_close(filp, NULL);
restore_fs:
set_fs(fs);
@@ -322,13 +320,11 @@ static int em_download_image(struct usb_device *usbdev, char *path,
goto restore_fs;
}
- if (filp->f_dentry) {
- inode = filp->f_dentry->d_inode;
- if (!inode || !S_ISREG(inode->i_mode)) {
- printk(KERN_ERR "Invalid file type: %s\n", path);
- ret = -EINVAL;
- goto out;
- }
+ inode = filp->f_dentry->d_inode;
+ if (!S_ISREG(inode->i_mode)) {
+ printk(KERN_ERR "Invalid file type: %s\n", path);
+ ret = -EINVAL;
+ goto out;
}
buf = kmalloc(DOWNLOAD_CHUCK + pad_size, GFP_KERNEL);
@@ -364,7 +360,7 @@ static int em_download_image(struct usb_device *usbdev, char *path,
goto out;
out:
- filp_close(filp, current->files);
+ filp_close(filp, NULL);
restore_fs:
set_fs(fs);
diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c
index 9e2100551c78..cbb5aaf3e567 100644
--- a/drivers/target/target_core_file.c
+++ b/drivers/target/target_core_file.c
@@ -109,46 +109,29 @@ static struct se_device *fd_create_virtdevice(
struct se_subsystem_dev *se_dev,
void *p)
{
- char *dev_p = NULL;
struct se_device *dev;
struct se_dev_limits dev_limits;
struct queue_limits *limits;
struct fd_dev *fd_dev = p;
struct fd_host *fd_host = hba->hba_ptr;
- mm_segment_t old_fs;
struct file *file;
struct inode *inode = NULL;
int dev_flags = 0, flags, ret = -EINVAL;
memset(&dev_limits, 0, sizeof(struct se_dev_limits));
- old_fs = get_fs();
- set_fs(get_ds());
- dev_p = getname(fd_dev->fd_dev_name);
- set_fs(old_fs);
-
- if (IS_ERR(dev_p)) {
- pr_err("getname(%s) failed: %lu\n",
- fd_dev->fd_dev_name, IS_ERR(dev_p));
- ret = PTR_ERR(dev_p);
- goto fail;
- }
/*
* Use O_DSYNC by default instead of O_SYNC to forgo syncing
* of pure timestamp updates.
*/
flags = O_RDWR | O_CREAT | O_LARGEFILE | O_DSYNC;
- file = filp_open(dev_p, flags, 0600);
+ file = filp_open(fd_dev->fd_dev_name, flags, 0600);
if (IS_ERR(file)) {
- pr_err("filp_open(%s) failed\n", dev_p);
+ pr_err("filp_open(%s) failed\n", fd_dev->fd_dev_name);
ret = PTR_ERR(file);
goto fail;
}
- if (!file || !file->f_dentry) {
- pr_err("filp_open(%s) failed\n", dev_p);
- goto fail;
- }
fd_dev->fd_file = file;
/*
* If using a block backend with this struct file, we extract
@@ -212,14 +195,12 @@ static struct se_device *fd_create_virtdevice(
" %llu total bytes\n", fd_host->fd_host_id, fd_dev->fd_dev_id,
fd_dev->fd_dev_name, fd_dev->fd_dev_size);
- putname(dev_p);
return dev;
fail:
if (fd_dev->fd_file) {
filp_close(fd_dev->fd_file, NULL);
fd_dev->fd_file = NULL;
}
- putname(dev_p);
return ERR_PTR(ret);
}
@@ -452,14 +433,11 @@ static ssize_t fd_set_configfs_dev_params(
token = match_token(ptr, tokens, args);
switch (token) {
case Opt_fd_dev_name:
- arg_p = match_strdup(&args[0]);
- if (!arg_p) {
- ret = -ENOMEM;
+ if (match_strlcpy(fd_dev->fd_dev_name, &args[0],
+ FD_MAX_DEV_NAME) == 0) {
+ ret = -EINVAL;
break;
}
- snprintf(fd_dev->fd_dev_name, FD_MAX_DEV_NAME,
- "%s", arg_p);
- kfree(arg_p);
pr_debug("FILEIO: Referencing Path: %s\n",
fd_dev->fd_dev_name);
fd_dev->fbd_flags |= FBDF_HAS_PATH;
diff --git a/drivers/usb/gadget/storage_common.c b/drivers/usb/gadget/storage_common.c
index ae8b18869b8c..8d9bcd8207c8 100644
--- a/drivers/usb/gadget/storage_common.c
+++ b/drivers/usb/gadget/storage_common.c
@@ -656,9 +656,8 @@ static int fsg_lun_open(struct fsg_lun *curlun, const char *filename)
if (!(filp->f_mode & FMODE_WRITE))
ro = 1;
- if (filp->f_path.dentry)
- inode = filp->f_path.dentry->d_inode;
- if (!inode || (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))) {
+ inode = filp->f_path.dentry->d_inode;
+ if ((!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))) {
LINFO(curlun, "invalid file type: %s\n", filename);
goto out;
}
@@ -667,7 +666,7 @@ static int fsg_lun_open(struct fsg_lun *curlun, const char *filename)
* If we can't read the file, it's no good.
* If we can't write the file, use it read-only.
*/
- if (!filp->f_op || !(filp->f_op->read || filp->f_op->aio_read)) {
+ if (!(filp->f_op->read || filp->f_op->aio_read)) {
LINFO(curlun, "file not readable: %s\n", filename);
goto out;
}
@@ -712,7 +711,6 @@ static int fsg_lun_open(struct fsg_lun *curlun, const char *filename)
if (fsg_lun_is_open(curlun))
fsg_lun_close(curlun);
- get_file(filp);
curlun->blksize = blksize;
curlun->blkbits = blkbits;
curlun->ro = ro;
@@ -720,10 +718,10 @@ static int fsg_lun_open(struct fsg_lun *curlun, const char *filename)
curlun->file_length = size;
curlun->num_sectors = num_sectors;
LDBG(curlun, "open backing file: %s\n", filename);
- rc = 0;
+ return 0;
out:
- filp_close(filp, current->files);
+ fput(filp);
return rc;
}
diff --git a/drivers/usb/gadget/u_uac1.c b/drivers/usb/gadget/u_uac1.c
index af9898982059..e0c5e88e03ed 100644
--- a/drivers/usb/gadget/u_uac1.c
+++ b/drivers/usb/gadget/u_uac1.c
@@ -275,17 +275,17 @@ static int gaudio_close_snd_dev(struct gaudio *gau)
/* Close control device */
snd = &gau->control;
if (snd->filp)
- filp_close(snd->filp, current->files);
+ filp_close(snd->filp, NULL);
/* Close PCM playback device and setup substream */
snd = &gau->playback;
if (snd->filp)
- filp_close(snd->filp, current->files);
+ filp_close(snd->filp, NULL);
/* Close PCM capture device and setup substream */
snd = &gau->capture;
if (snd->filp)
- filp_close(snd->filp, current->files);
+ filp_close(snd->filp, NULL);
return 0;
}
diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c
index 1ddeb11659d4..64cda560c488 100644
--- a/drivers/video/fb_defio.c
+++ b/drivers/video/fb_defio.c
@@ -104,6 +104,8 @@ static int fb_deferred_io_mkwrite(struct vm_area_struct *vma,
deferred framebuffer IO. then if userspace touches a page
again, we repeat the same scheme */
+ file_update_time(vma->vm_file);
+
/* protect against the workqueue changing the page list */
mutex_lock(&fbdefio->lock);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index fc06fd27065e..dd6f7ee1e312 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -610,6 +610,9 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n",
page, (unsigned long)filp->private_data);
+ /* Update file times before taking page lock */
+ file_update_time(filp);
+
v9inode = V9FS_I(inode);
/* make sure the cache has finished storing the page */
v9fs_fscache_wait_on_page_write(inode, page);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fadeba6a5db9..62e0cafd6e25 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1614,8 +1614,6 @@ static int cleaner_kthread(void *arg)
struct btrfs_root *root = arg;
do {
- vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
-
if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
mutex_trylock(&root->fs_info->cleaner_mutex)) {
btrfs_run_delayed_iputs(root);
@@ -1647,7 +1645,6 @@ static int transaction_kthread(void *arg)
do {
cannot_commit = false;
delay = HZ * 30;
- vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
mutex_lock(&root->fs_info->transaction_kthread_mutex);
spin_lock(&root->fs_info->trans_lock);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9aa01ec2138d..5caf285c6e4d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1379,7 +1379,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
ssize_t err = 0;
size_t count, ocount;
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+ sb_start_write(inode->i_sb);
mutex_lock(&inode->i_mutex);
@@ -1469,6 +1469,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
num_written = err;
}
out:
+ sb_end_write(inode->i_sb);
current->backing_dev_info = NULL;
return num_written ? num_written : err;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 48bdfd2591c2..83baec24946d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6629,6 +6629,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
u64 page_start;
u64 page_end;
+ sb_start_pagefault(inode->i_sb);
ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
if (!ret) {
ret = file_update_time(vma->vm_file);
@@ -6718,12 +6719,15 @@ again:
unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
out_unlock:
- if (!ret)
+ if (!ret) {
+ sb_end_pagefault(inode->i_sb);
return VM_FAULT_LOCKED;
+ }
unlock_page(page);
out:
btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
out_noreserve:
+ sb_end_pagefault(inode->i_sb);
return ret;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 43f0012016e3..bc2f6ffff3cf 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -195,6 +195,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
if (!inode_owner_or_capable(inode))
return -EACCES;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
mutex_lock(&inode->i_mutex);
ip_oldflags = ip->flags;
@@ -209,10 +213,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
}
}
- ret = mnt_want_write_file(file);
- if (ret)
- goto out_unlock;
-
if (flags & FS_SYNC_FL)
ip->flags |= BTRFS_INODE_SYNC;
else
@@ -275,9 +275,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
inode->i_flags = i_oldflags;
}
- mnt_drop_write_file(file);
out_unlock:
mutex_unlock(&inode->i_mutex);
+ mnt_drop_write_file(file);
return ret;
}
@@ -664,6 +664,10 @@ static noinline int btrfs_mksubvol(struct path *parent,
struct dentry *dentry;
int error;
+ error = mnt_want_write(parent->mnt);
+ if (error)
+ return error;
+
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
dentry = lookup_one_len(name, parent->dentry, namelen);
@@ -699,6 +703,7 @@ out_dput:
dput(dentry);
out_unlock:
mutex_unlock(&dir->i_mutex);
+ mnt_drop_write(parent->mnt);
return error;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 7ac7cdcc294e..17be3dedacba 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -335,6 +335,8 @@ again:
if (!h)
return ERR_PTR(-ENOMEM);
+ sb_start_intwrite(root->fs_info->sb);
+
if (may_wait_transaction(root, type))
wait_current_trans(root);
@@ -345,6 +347,7 @@ again:
} while (ret == -EBUSY);
if (ret < 0) {
+ sb_end_intwrite(root->fs_info->sb);
kmem_cache_free(btrfs_trans_handle_cachep, h);
return ERR_PTR(ret);
}
@@ -548,6 +551,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
+ sb_end_intwrite(root->fs_info->sb);
+
if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
should_end_transaction(trans, root)) {
trans->transaction->blocked = 1;
@@ -1578,6 +1583,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
put_transaction(cur_trans);
put_transaction(cur_trans);
+ sb_end_intwrite(root->fs_info->sb);
+
trace_btrfs_transaction_commit(root);
btrfs_scrub_continue(root);
diff --git a/fs/buffer.c b/fs/buffer.c
index c7062c896d7c..9f6d2e41281d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2306,8 +2306,8 @@ EXPORT_SYMBOL(block_commit_write);
* beyond EOF, then the page is guaranteed safe against truncation until we
* unlock the page.
*
- * Direct callers of this function should call vfs_check_frozen() so that page
- * fault does not busyloop until the fs is thawed.
+ * Direct callers of this function should protect against filesystem freezing
+ * using sb_start_write() - sb_end_write() functions.
*/
int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block)
@@ -2318,6 +2318,12 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
loff_t size;
int ret;
+ /*
+ * Update file times before taking page lock. We may end up failing the
+ * fault so this update may be superfluous but who really cares...
+ */
+ file_update_time(vma->vm_file);
+
lock_page(page);
size = i_size_read(inode);
if ((page->mapping != inode->i_mapping) ||
@@ -2339,18 +2345,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
if (unlikely(ret < 0))
goto out_unlock;
- /*
- * Freezing in progress? We check after the page is marked dirty and
- * with page lock held so if the test here fails, we are sure freezing
- * code will wait during syncing until the page fault is done - at that
- * point page will be dirty and unlocked so freezing code will write it
- * and writeprotect it again.
- */
set_page_dirty(page);
- if (inode->i_sb->s_frozen != SB_UNFROZEN) {
- ret = -EAGAIN;
- goto out_unlock;
- }
wait_on_page_writeback(page);
return 0;
out_unlock:
@@ -2365,12 +2360,9 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
int ret;
struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
- /*
- * This check is racy but catches the common case. The check in
- * __block_page_mkwrite() is reliable.
- */
- vfs_check_frozen(sb, SB_FREEZE_WRITE);
+ sb_start_pagefault(sb);
ret = __block_page_mkwrite(vma, vmf, get_block);
+ sb_end_pagefault(sb);
return block_page_mkwrite_return(ret);
}
EXPORT_SYMBOL(block_page_mkwrite);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 8b67304e4b80..452e71a1b753 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1184,6 +1184,9 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
loff_t size, len;
int ret;
+ /* Update time before taking page lock */
+ file_update_time(vma->vm_file);
+
size = i_size_read(inode);
if (off + PAGE_CACHE_SIZE <= size)
len = PAGE_CACHE_SIZE;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index ffa2be57804d..c3ca12c33ca2 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -318,21 +318,20 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
struct vfsmount *lower_mnt;
int rc = 0;
- lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
- fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
- BUG_ON(!lower_dentry->d_count);
-
dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
- ecryptfs_set_dentry_private(dentry, dentry_info);
if (!dentry_info) {
printk(KERN_ERR "%s: Out of memory whilst attempting "
"to allocate ecryptfs_dentry_info struct\n",
__func__);
dput(lower_dentry);
- mntput(lower_mnt);
- d_drop(dentry);
return -ENOMEM;
}
+
+ lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
+ fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
+ BUG_ON(!lower_dentry->d_count);
+
+ ecryptfs_set_dentry_private(dentry, dentry_info);
ecryptfs_set_dentry_lower(dentry, lower_dentry);
ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt);
@@ -381,12 +380,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
struct dentry *lower_dir_dentry, *lower_dentry;
int rc = 0;
- if ((ecryptfs_dentry->d_name.len == 1
- && !strcmp(ecryptfs_dentry->d_name.name, "."))
- || (ecryptfs_dentry->d_name.len == 2
- && !strcmp(ecryptfs_dentry->d_name.name, ".."))) {
- goto out_d_drop;
- }
lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
@@ -397,8 +390,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
rc = PTR_ERR(lower_dentry);
ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
"[%d] on lower_dentry = [%s]\n", __func__, rc,
- encrypted_and_encoded_name);
- goto out_d_drop;
+ ecryptfs_dentry->d_name.name);
+ goto out;
}
if (lower_dentry->d_inode)
goto interpose;
@@ -415,7 +408,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
if (rc) {
printk(KERN_ERR "%s: Error attempting to encrypt and encode "
"filename; rc = [%d]\n", __func__, rc);
- goto out_d_drop;
+ goto out;
}
mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
lower_dentry = lookup_one_len(encrypted_and_encoded_name,
@@ -427,14 +420,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
"[%d] on lower_dentry = [%s]\n", __func__, rc,
encrypted_and_encoded_name);
- goto out_d_drop;
+ goto out;
}
interpose:
rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry,
ecryptfs_dir_inode);
- goto out;
-out_d_drop:
- d_drop(ecryptfs_dentry);
out:
kfree(encrypted_and_encoded_name);
return ERR_PTR(rc);
diff --git a/fs/exec.c b/fs/exec.c
index 3684353ebd5f..574cf4de4ec3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -2069,25 +2069,18 @@ static void wait_for_dump_helpers(struct file *file)
*/
static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
{
- struct file *rp, *wp;
+ struct file *files[2];
struct fdtable *fdt;
struct coredump_params *cp = (struct coredump_params *)info->data;
struct files_struct *cf = current->files;
+ int err = create_pipe_files(files, 0);
+ if (err)
+ return err;
- wp = create_write_pipe(0);
- if (IS_ERR(wp))
- return PTR_ERR(wp);
-
- rp = create_read_pipe(wp, 0);
- if (IS_ERR(rp)) {
- free_write_pipe(wp);
- return PTR_ERR(rp);
- }
-
- cp->file = wp;
+ cp->file = files[1];
sys_close(0);
- fd_install(0, rp);
+ fd_install(0, files[0]);
spin_lock(&cf->file_lock);
fdt = files_fdtable(cf);
__set_open_fd(0, fdt);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 264d315f6c47..6363ac66fafa 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -79,6 +79,7 @@ void ext2_evict_inode(struct inode * inode)
truncate_inode_pages(&inode->i_data, 0);
if (want_delete) {
+ sb_start_intwrite(inode->i_sb);
/* set dtime */
EXT2_I(inode)->i_dtime = get_seconds();
mark_inode_dirty(inode);
@@ -98,8 +99,10 @@ void ext2_evict_inode(struct inode * inode)
if (unlikely(rsv))
kfree(rsv);
- if (want_delete)
+ if (want_delete) {
ext2_free_inode(inode);
+ sb_end_intwrite(inode->i_sb);
+ }
}
typedef struct {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 9f311d27b16f..af74d9e27b71 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -42,6 +42,8 @@ static void ext2_sync_super(struct super_block *sb,
static int ext2_remount (struct super_block * sb, int * flags, char * data);
static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
static int ext2_sync_fs(struct super_block *sb, int wait);
+static int ext2_freeze(struct super_block *sb);
+static int ext2_unfreeze(struct super_block *sb);
void ext2_error(struct super_block *sb, const char *function,
const char *fmt, ...)
@@ -305,6 +307,8 @@ static const struct super_operations ext2_sops = {
.evict_inode = ext2_evict_inode,
.put_super = ext2_put_super,
.sync_fs = ext2_sync_fs,
+ .freeze_fs = ext2_freeze,
+ .unfreeze_fs = ext2_unfreeze,
.statfs = ext2_statfs,
.remount_fs = ext2_remount,
.show_options = ext2_show_options,
@@ -1200,6 +1204,35 @@ static int ext2_sync_fs(struct super_block *sb, int wait)
return 0;
}
+static int ext2_freeze(struct super_block *sb)
+{
+ struct ext2_sb_info *sbi = EXT2_SB(sb);
+
+ /*
+ * Open but unlinked files present? Keep EXT2_VALID_FS flag cleared
+ * because we have unattached inodes and thus filesystem is not fully
+ * consistent.
+ */
+ if (atomic_long_read(&sb->s_remove_count)) {
+ ext2_sync_fs(sb, 1);
+ return 0;
+ }
+ /* Set EXT2_FS_VALID flag */
+ spin_lock(&sbi->s_lock);
+ sbi->s_es->s_state = cpu_to_le16(sbi->s_mount_state);
+ spin_unlock(&sbi->s_lock);
+ ext2_sync_super(sb, sbi->s_es, 1);
+
+ return 0;
+}
+
+static int ext2_unfreeze(struct super_block *sb)
+{
+ /* Just write sb to clear EXT2_VALID_FS flag */
+ ext2_write_super(sb);
+
+ return 0;
+}
void ext2_write_super(struct super_block *sb)
{
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 89b59cb7f9b8..6324f74e0342 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -233,6 +233,11 @@ void ext4_evict_inode(struct inode *inode)
if (is_bad_inode(inode))
goto no_delete;
+ /*
+ * Protect us against freezing - iput() caller didn't have to have any
+ * protection against it
+ */
+ sb_start_intwrite(inode->i_sb);
handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
if (IS_ERR(handle)) {
ext4_std_error(inode->i_sb, PTR_ERR(handle));
@@ -242,6 +247,7 @@ void ext4_evict_inode(struct inode *inode)
* cleaned up.
*/
ext4_orphan_del(NULL, inode);
+ sb_end_intwrite(inode->i_sb);
goto no_delete;
}
@@ -273,6 +279,7 @@ void ext4_evict_inode(struct inode *inode)
stop_handle:
ext4_journal_stop(handle);
ext4_orphan_del(NULL, inode);
+ sb_end_intwrite(inode->i_sb);
goto no_delete;
}
}
@@ -301,6 +308,7 @@ void ext4_evict_inode(struct inode *inode)
else
ext4_free_inode(handle, inode);
ext4_journal_stop(handle);
+ sb_end_intwrite(inode->i_sb);
return;
no_delete:
ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
@@ -4779,11 +4787,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
get_block_t *get_block;
int retries = 0;
- /*
- * This check is racy but catches the common case. We rely on
- * __block_page_mkwrite() to do a reliable check.
- */
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+ sb_start_pagefault(inode->i_sb);
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
!ext4_should_journal_data(inode) &&
@@ -4851,5 +4855,6 @@ retry_alloc:
out_ret:
ret = block_page_mkwrite_return(ret);
out:
+ sb_end_pagefault(inode->i_sb);
return ret;
}
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index f99a1311e847..fe7c63f4717e 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -44,6 +44,11 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
{
struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data);
+ /*
+ * We protect against freezing so that we don't create dirty buffers
+ * on frozen filesystem.
+ */
+ sb_start_write(sb);
ext4_mmp_csum_set(sb, mmp);
mark_buffer_dirty(bh);
lock_buffer(bh);
@@ -51,6 +56,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
get_bh(bh);
submit_bh(WRITE_SYNC, bh);
wait_on_buffer(bh);
+ sb_end_write(sb);
if (unlikely(!buffer_uptodate(bh)))
return 1;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2d51cd9af225..d76ec8277d3f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -331,33 +331,17 @@ static void ext4_put_nojournal(handle_t *handle)
* journal_end calls result in the superblock being marked dirty, so
* that sync() will call the filesystem's write_super callback if
* appropriate.
- *
- * To avoid j_barrier hold in userspace when a user calls freeze(),
- * ext4 prevents a new handle from being started by s_frozen, which
- * is in an upper layer.
*/
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
{
journal_t *journal;
- handle_t *handle;
trace_ext4_journal_start(sb, nblocks, _RET_IP_);
if (sb->s_flags & MS_RDONLY)
return ERR_PTR(-EROFS);
+ WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
journal = EXT4_SB(sb)->s_journal;
- handle = ext4_journal_current_handle();
-
- /*
- * If a handle has been started, it should be allowed to
- * finish, otherwise deadlock could happen between freeze
- * and others(e.g. truncate) due to the restart of the
- * journal handle if the filesystem is forzen and active
- * handles are not stopped.
- */
- if (!handle)
- vfs_check_frozen(sb, SB_FREEZE_TRANS);
-
if (!journal)
return ext4_get_nojournal();
/*
@@ -2747,6 +2731,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
sb = elr->lr_super;
ngroups = EXT4_SB(sb)->s_groups_count;
+ sb_start_write(sb);
for (group = elr->lr_next_group; group < ngroups; group++) {
gdp = ext4_get_group_desc(sb, group, NULL);
if (!gdp) {
@@ -2773,6 +2758,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
elr->lr_next_sched = jiffies + elr->lr_timeout;
elr->lr_next_group = group + 1;
}
+ sb_end_write(sb);
return ret;
}
@@ -4460,10 +4446,8 @@ int ext4_force_commit(struct super_block *sb)
return 0;
journal = EXT4_SB(sb)->s_journal;
- if (journal) {
- vfs_check_frozen(sb, SB_FREEZE_TRANS);
+ if (journal)
ret = ext4_journal_force_commit(journal);
- }
return ret;
}
@@ -4493,9 +4477,8 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
* gives us a chance to flush the journal completely and mark the fs clean.
*
* Note that only this function cannot bring a filesystem to be in a clean
- * state independently, because ext4 prevents a new handle from being started
- * by @sb->s_frozen, which stays in an upper layer. It thus needs help from
- * the upper layer.
+ * state independently. It relies on upper layer to stop all data & metadata
+ * modifications.
*/
static int ext4_freeze(struct super_block *sb)
{
@@ -4522,7 +4505,7 @@ static int ext4_freeze(struct super_block *sb)
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
error = ext4_commit_super(sb, 1);
out:
- /* we rely on s_frozen to stop further updates */
+ /* we rely on upper layer to stop further updates */
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
return error;
}
diff --git a/fs/fat/file.c b/fs/fat/file.c
index a71fe3715ee8..e007b8bd8e5e 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -43,10 +43,10 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
if (err)
goto out;
- mutex_lock(&inode->i_mutex);
err = mnt_want_write_file(file);
if (err)
- goto out_unlock_inode;
+ goto out;
+ mutex_lock(&inode->i_mutex);
/*
* ATTR_VOLUME and ATTR_DIR cannot be changed; this also
@@ -73,14 +73,14 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
/* The root directory has no attributes */
if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) {
err = -EINVAL;
- goto out_drop_write;
+ goto out_unlock_inode;
}
if (sbi->options.sys_immutable &&
((attr | oldattr) & ATTR_SYS) &&
!capable(CAP_LINUX_IMMUTABLE)) {
err = -EPERM;
- goto out_drop_write;
+ goto out_unlock_inode;
}
/*
@@ -90,12 +90,12 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
*/
err = security_inode_setattr(file->f_path.dentry, &ia);
if (err)
- goto out_drop_write;
+ goto out_unlock_inode;
/* This MUST be done before doing anything irreversible... */
err = fat_setattr(file->f_path.dentry, &ia);
if (err)
- goto out_drop_write;
+ goto out_unlock_inode;
fsnotify_change(file->f_path.dentry, ia.ia_valid);
if (sbi->options.sys_immutable) {
@@ -107,10 +107,9 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
fat_save_attrs(inode, attr);
mark_inode_dirty(inode);
-out_drop_write:
- mnt_drop_write_file(file);
out_unlock_inode:
mutex_unlock(&inode->i_mutex);
+ mnt_drop_write_file(file);
out:
return err;
}
diff --git a/fs/file_table.c b/fs/file_table.c
index b3fc4d67a26b..701985e4ccda 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -43,7 +43,7 @@ static struct kmem_cache *filp_cachep __read_mostly;
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
-static inline void file_free_rcu(struct rcu_head *head)
+static void file_free_rcu(struct rcu_head *head)
{
struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
@@ -217,7 +217,7 @@ static void drop_file_write_access(struct file *file)
return;
if (file_check_writeable(file) != 0)
return;
- mnt_drop_write(mnt);
+ __mnt_drop_write(mnt);
file_release_write(file);
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index b321a688cde7..93d8d6c9494d 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -944,9 +944,8 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
return err;
count = ocount;
-
+ sb_start_write(inode->i_sb);
mutex_lock(&inode->i_mutex);
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
@@ -1004,6 +1003,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
out:
current->backing_dev_info = NULL;
mutex_unlock(&inode->i_mutex);
+ sb_end_write(inode->i_sb);
return written ? written : err;
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 9aa6af13823c..d1d791ef38de 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -373,11 +373,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
loff_t size;
int ret;
- /* Wait if fs is frozen. This is racy so we check again later on
- * and retry if the fs has been frozen after the page lock has
- * been acquired
- */
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+ sb_start_pagefault(inode->i_sb);
+
+ /* Update file times before taking page lock */
+ file_update_time(vma->vm_file);
ret = gfs2_rs_alloc(ip);
if (ret)
@@ -462,14 +461,9 @@ out:
gfs2_holder_uninit(&gh);
if (ret == 0) {
set_page_dirty(page);
- /* This check must be post dropping of transaction lock */
- if (inode->i_sb->s_frozen == SB_UNFROZEN) {
- wait_on_page_writeback(page);
- } else {
- ret = -EAGAIN;
- unlock_page(page);
- }
+ wait_on_page_writeback(page);
}
+ sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(ret);
}
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index ad3e2fb763d7..adbd27875ef9 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -50,6 +50,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
if (revokes)
tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
sizeof(u64));
+ sb_start_intwrite(sdp->sd_vfs);
gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
error = gfs2_glock_nq(&tr->tr_t_gh);
@@ -68,6 +69,7 @@ fail_gunlock:
gfs2_glock_dq(&tr->tr_t_gh);
fail_holder_uninit:
+ sb_end_intwrite(sdp->sd_vfs);
gfs2_holder_uninit(&tr->tr_t_gh);
kfree(tr);
@@ -116,6 +118,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
gfs2_holder_uninit(&tr->tr_t_gh);
kfree(tr);
}
+ sb_end_intwrite(sdp->sd_vfs);
return;
}
@@ -136,6 +139,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
gfs2_log_flush(sdp, NULL);
+ sb_end_intwrite(sdp->sd_vfs);
}
/**
diff --git a/fs/inode.c b/fs/inode.c
index 3cc504320467..ac8d904b3f16 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1542,9 +1542,11 @@ void touch_atime(struct path *path)
if (timespec_equal(&inode->i_atime, &now))
return;
- if (mnt_want_write(mnt))
+ if (!sb_start_write_trylock(inode->i_sb))
return;
+ if (__mnt_want_write(mnt))
+ goto skip_update;
/*
* File systems can error out when updating inodes if they need to
* allocate new space to modify an inode (such is the case for
@@ -1555,7 +1557,9 @@ void touch_atime(struct path *path)
* of the fs read only, e.g. subvolumes in Btrfs.
*/
update_time(inode, &now, S_ATIME);
- mnt_drop_write(mnt);
+ __mnt_drop_write(mnt);
+skip_update:
+ sb_end_write(inode->i_sb);
}
EXPORT_SYMBOL(touch_atime);
@@ -1662,11 +1666,11 @@ int file_update_time(struct file *file)
return 0;
/* Finally allowed to write? Takes lock. */
- if (mnt_want_write_file(file))
+ if (__mnt_want_write_file(file))
return 0;
ret = update_time(inode, &now, sync_it);
- mnt_drop_write_file(file);
+ __mnt_drop_write_file(file);
return ret;
}
diff --git a/fs/internal.h b/fs/internal.h
index a6fd56c68b11..371bcc4b1697 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -61,6 +61,10 @@ extern void __init mnt_init(void);
extern struct lglock vfsmount_lock;
+extern int __mnt_want_write(struct vfsmount *);
+extern int __mnt_want_write_file(struct file *);
+extern void __mnt_drop_write(struct vfsmount *);
+extern void __mnt_drop_write_file(struct file *);
/*
* fs_struct.c
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 8392cb85bd54..05d29124c6ab 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -156,12 +156,16 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
struct nlm_rqst *call;
int status;
- nlm_get_host(host);
call = nlm_alloc_call(host);
if (call == NULL)
return -ENOMEM;
nlmclnt_locks_init_private(fl, host);
+ if (!fl->fl_u.nfs_fl.owner) {
+ /* lockowner allocation has failed */
+ nlmclnt_release_call(call);
+ return -ENOMEM;
+ }
/* Set up the argument struct */
nlmclnt_setlockargs(call, fl);
@@ -185,9 +189,6 @@ EXPORT_SYMBOL_GPL(nlmclnt_proc);
/*
* Allocate an NLM RPC call struct
- *
- * Note: the caller must hold a reference to host. In case of failure,
- * this reference will be released.
*/
struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
{
@@ -199,7 +200,7 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
atomic_set(&call->a_count, 1);
locks_init_lock(&call->a_args.lock.fl);
locks_init_lock(&call->a_res.lock.fl);
- call->a_host = host;
+ call->a_host = nlm_get_host(host);
return call;
}
if (signalled())
@@ -207,7 +208,6 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
printk("nlm_alloc_call: failed, waiting for memory\n");
schedule_timeout_interruptible(5*HZ);
}
- nlmclnt_release_host(host);
return NULL;
}
@@ -750,7 +750,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
dprintk("lockd: blocking lock attempt was interrupted by a signal.\n"
" Attempting to cancel lock.\n");
- req = nlm_alloc_call(nlm_get_host(host));
+ req = nlm_alloc_call(host);
if (!req)
return -ENOMEM;
req->a_flags = RPC_TASK_ASYNC;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4a43d253c045..b147d1ae71fd 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -257,6 +257,7 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
return rpc_system_err;
call = nlm_alloc_call(host);
+ nlmsvc_release_host(host);
if (call == NULL)
return rpc_system_err;
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index afe4488c33d8..fb1a2bedbe97 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -219,7 +219,6 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
struct nlm_block *block;
struct nlm_rqst *call = NULL;
- nlm_get_host(host);
call = nlm_alloc_call(host);
if (call == NULL)
return NULL;
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index de8f2caa2235..3009a365e082 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -297,6 +297,7 @@ static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
return rpc_system_err;
call = nlm_alloc_call(host);
+ nlmsvc_release_host(host);
if (call == NULL)
return rpc_system_err;
diff --git a/fs/namei.c b/fs/namei.c
index 2ccc35c4dc24..1b464390dde8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -650,6 +650,121 @@ static inline void put_link(struct nameidata *nd, struct path *link, void *cooki
path_put(link);
}
+int sysctl_protected_symlinks __read_mostly = 1;
+int sysctl_protected_hardlinks __read_mostly = 1;
+
+/**
+ * may_follow_link - Check symlink following for unsafe situations
+ * @link: The path of the symlink
+ *
+ * In the case of the sysctl_protected_symlinks sysctl being enabled,
+ * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
+ * in a sticky world-writable directory. This is to protect privileged
+ * processes from failing races against path names that may change out
+ * from under them by way of other users creating malicious symlinks.
+ * It will permit symlinks to be followed only when outside a sticky
+ * world-writable directory, or when the uid of the symlink and follower
+ * match, or when the directory owner matches the symlink's owner.
+ *
+ * Returns 0 if following the symlink is allowed, -ve on error.
+ */
+static inline int may_follow_link(struct path *link, struct nameidata *nd)
+{
+ const struct inode *inode;
+ const struct inode *parent;
+
+ if (!sysctl_protected_symlinks)
+ return 0;
+
+ /* Allowed if owner and follower match. */
+ inode = link->dentry->d_inode;
+ if (current_cred()->fsuid == inode->i_uid)
+ return 0;
+
+ /* Allowed if parent directory not sticky and world-writable. */
+ parent = nd->path.dentry->d_inode;
+ if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
+ return 0;
+
+ /* Allowed if parent directory and link owner match. */
+ if (parent->i_uid == inode->i_uid)
+ return 0;
+
+ path_put_conditional(link, nd);
+ path_put(&nd->path);
+ audit_log_link_denied("follow_link", link);
+ return -EACCES;
+}
+
+/**
+ * safe_hardlink_source - Check for safe hardlink conditions
+ * @inode: the source inode to hardlink from
+ *
+ * Return false if at least one of the following conditions:
+ * - inode is not a regular file
+ * - inode is setuid
+ * - inode is setgid and group-exec
+ * - access failure for read and write
+ *
+ * Otherwise returns true.
+ */
+static bool safe_hardlink_source(struct inode *inode)
+{
+ umode_t mode = inode->i_mode;
+
+ /* Special files should not get pinned to the filesystem. */
+ if (!S_ISREG(mode))
+ return false;
+
+ /* Setuid files should not get pinned to the filesystem. */
+ if (mode & S_ISUID)
+ return false;
+
+ /* Executable setgid files should not get pinned to the filesystem. */
+ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
+ return false;
+
+ /* Hardlinking to unreadable or unwritable sources is dangerous. */
+ if (inode_permission(inode, MAY_READ | MAY_WRITE))
+ return false;
+
+ return true;
+}
+
+/**
+ * may_linkat - Check permissions for creating a hardlink
+ * @link: the source to hardlink from
+ *
+ * Block hardlink when all of:
+ * - sysctl_protected_hardlinks enabled
+ * - fsuid does not match inode
+ * - hardlink source is unsafe (see safe_hardlink_source() above)
+ * - not CAP_FOWNER
+ *
+ * Returns 0 if successful, -ve on error.
+ */
+static int may_linkat(struct path *link)
+{
+ const struct cred *cred;
+ struct inode *inode;
+
+ if (!sysctl_protected_hardlinks)
+ return 0;
+
+ cred = current_cred();
+ inode = link->dentry->d_inode;
+
+ /* Source inode owner (or CAP_FOWNER) can hardlink all they like,
+ * otherwise, it must be a safe source.
+ */
+ if (cred->fsuid == inode->i_uid || safe_hardlink_source(inode) ||
+ capable(CAP_FOWNER))
+ return 0;
+
+ audit_log_link_denied("linkat", link);
+ return -EPERM;
+}
+
static __always_inline int
follow_link(struct path *link, struct nameidata *nd, void **p)
{
@@ -1818,6 +1933,9 @@ static int path_lookupat(int dfd, const char *name,
while (err > 0) {
void *cookie;
struct path link = path;
+ err = may_follow_link(&link, nd);
+ if (unlikely(err))
+ break;
nd->flags |= LOOKUP_PARENT;
err = follow_link(&link, nd, &cookie);
if (err)
@@ -2277,7 +2395,7 @@ static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
static int atomic_open(struct nameidata *nd, struct dentry *dentry,
struct path *path, struct file *file,
const struct open_flags *op,
- bool *want_write, bool need_lookup,
+ bool got_write, bool need_lookup,
int *opened)
{
struct inode *dir = nd->path.dentry->d_inode;
@@ -2300,7 +2418,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
mode &= ~current_umask();
- if (open_flag & O_EXCL) {
+ if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) {
open_flag &= ~O_TRUNC;
*opened |= FILE_CREATED;
}
@@ -2314,12 +2432,9 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
* Another problem is returing the "right" error value (e.g. for an
* O_EXCL open we want to return EEXIST not EROFS).
*/
- if ((open_flag & (O_CREAT | O_TRUNC)) ||
- (open_flag & O_ACCMODE) != O_RDONLY) {
- error = mnt_want_write(nd->path.mnt);
- if (!error) {
- *want_write = true;
- } else if (!(open_flag & O_CREAT)) {
+ if (((open_flag & (O_CREAT | O_TRUNC)) ||
+ (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) {
+ if (!(open_flag & O_CREAT)) {
/*
* No O_CREATE -> atomicity not a requirement -> fall
* back to lookup + open
@@ -2327,11 +2442,11 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
goto no_open;
} else if (open_flag & (O_EXCL | O_TRUNC)) {
/* Fall back and fail with the right error */
- create_error = error;
+ create_error = -EROFS;
goto no_open;
} else {
/* No side effects, safe to clear O_CREAT */
- create_error = error;
+ create_error = -EROFS;
open_flag &= ~O_CREAT;
}
}
@@ -2438,7 +2553,7 @@ looked_up:
static int lookup_open(struct nameidata *nd, struct path *path,
struct file *file,
const struct open_flags *op,
- bool *want_write, int *opened)
+ bool got_write, int *opened)
{
struct dentry *dir = nd->path.dentry;
struct inode *dir_inode = dir->d_inode;
@@ -2456,7 +2571,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
goto out_no_open;
if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
- return atomic_open(nd, dentry, path, file, op, want_write,
+ return atomic_open(nd, dentry, path, file, op, got_write,
need_lookup, opened);
}
@@ -2480,10 +2595,10 @@ static int lookup_open(struct nameidata *nd, struct path *path,
* a permanent write count is taken through
* the 'struct file' in finish_open().
*/
- error = mnt_want_write(nd->path.mnt);
- if (error)
+ if (!got_write) {
+ error = -EROFS;
goto out_dput;
- *want_write = true;
+ }
*opened |= FILE_CREATED;
error = security_path_mknod(&nd->path, dentry, mode, 0);
if (error)
@@ -2513,7 +2628,7 @@ static int do_last(struct nameidata *nd, struct path *path,
struct dentry *dir = nd->path.dentry;
int open_flag = op->open_flag;
bool will_truncate = (open_flag & O_TRUNC) != 0;
- bool want_write = false;
+ bool got_write = false;
int acc_mode = op->acc_mode;
struct inode *inode;
bool symlink_ok = false;
@@ -2582,8 +2697,18 @@ static int do_last(struct nameidata *nd, struct path *path,
}
retry_lookup:
+ if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
+ error = mnt_want_write(nd->path.mnt);
+ if (!error)
+ got_write = true;
+ /*
+ * do _not_ fail yet - we might not need that or fail with
+ * a different error; let lookup_open() decide; we'll be
+ * dropping this one anyway.
+ */
+ }
mutex_lock(&dir->d_inode->i_mutex);
- error = lookup_open(nd, path, file, op, &want_write, opened);
+ error = lookup_open(nd, path, file, op, got_write, opened);
mutex_unlock(&dir->d_inode->i_mutex);
if (error <= 0) {
@@ -2608,22 +2733,23 @@ retry_lookup:
}
/*
- * It already exists.
+ * create/update audit record if it already exists.
*/
- audit_inode(pathname, path->dentry);
+ if (path->dentry->d_inode)
+ audit_inode(pathname, path->dentry);
/*
* If atomic_open() acquired write access it is dropped now due to
* possible mount and symlink following (this might be optimized away if
* necessary...)
*/
- if (want_write) {
+ if (got_write) {
mnt_drop_write(nd->path.mnt);
- want_write = false;
+ got_write = false;
}
error = -EEXIST;
- if (open_flag & O_EXCL)
+ if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))
goto exit_dput;
error = follow_managed(path, nd->flags);
@@ -2684,7 +2810,7 @@ finish_open:
error = mnt_want_write(nd->path.mnt);
if (error)
goto out;
- want_write = true;
+ got_write = true;
}
finish_open_created:
error = may_open(&nd->path, acc_mode, open_flag);
@@ -2711,7 +2837,7 @@ opened:
goto exit_fput;
}
out:
- if (want_write)
+ if (got_write)
mnt_drop_write(nd->path.mnt);
path_put(&save_parent);
terminate_walk(nd);
@@ -2735,9 +2861,9 @@ stale_open:
nd->inode = dir->d_inode;
save_parent.mnt = NULL;
save_parent.dentry = NULL;
- if (want_write) {
+ if (got_write) {
mnt_drop_write(nd->path.mnt);
- want_write = false;
+ got_write = false;
}
retried = true;
goto retry_lookup;
@@ -2777,6 +2903,9 @@ static struct file *path_openat(int dfd, const char *pathname,
error = -ELOOP;
break;
}
+ error = may_follow_link(&link, nd);
+ if (unlikely(error))
+ break;
nd->flags |= LOOKUP_PARENT;
nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
error = follow_link(&link, nd, &cookie);
@@ -2846,6 +2975,7 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path
{
struct dentry *dentry = ERR_PTR(-EEXIST);
struct nameidata nd;
+ int err2;
int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
if (error)
return ERR_PTR(error);
@@ -2859,16 +2989,19 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path
nd.flags &= ~LOOKUP_PARENT;
nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL;
+ /* don't fail immediately if it's r/o, at least try to report other errors */
+ err2 = mnt_want_write(nd.path.mnt);
/*
* Do the final lookup.
*/
mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
dentry = lookup_hash(&nd);
if (IS_ERR(dentry))
- goto fail;
+ goto unlock;
+ error = -EEXIST;
if (dentry->d_inode)
- goto eexist;
+ goto fail;
/*
* Special case - lookup gave negative, but... we had foo/bar/
* From the vfs_mknod() POV we just have a negative dentry -
@@ -2876,23 +3009,37 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path
* been asking for (non-existent) directory. -ENOENT for you.
*/
if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
- dput(dentry);
- dentry = ERR_PTR(-ENOENT);
+ error = -ENOENT;
+ goto fail;
+ }
+ if (unlikely(err2)) {
+ error = err2;
goto fail;
}
*path = nd.path;
return dentry;
-eexist:
- dput(dentry);
- dentry = ERR_PTR(-EEXIST);
fail:
+ dput(dentry);
+ dentry = ERR_PTR(error);
+unlock:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+ if (!err2)
+ mnt_drop_write(nd.path.mnt);
out:
path_put(&nd.path);
return dentry;
}
EXPORT_SYMBOL(kern_path_create);
+void done_path_create(struct path *path, struct dentry *dentry)
+{
+ dput(dentry);
+ mutex_unlock(&path->dentry->d_inode->i_mutex);
+ mnt_drop_write(path->mnt);
+ path_put(path);
+}
+EXPORT_SYMBOL(done_path_create);
+
struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)
{
char *tmp = getname(pathname);
@@ -2956,8 +3103,9 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
struct path path;
int error;
- if (S_ISDIR(mode))
- return -EPERM;
+ error = may_mknod(mode);
+ if (error)
+ return error;
dentry = user_path_create(dfd, filename, &path, 0);
if (IS_ERR(dentry))
@@ -2965,15 +3113,9 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
if (!IS_POSIXACL(path.dentry->d_inode))
mode &= ~current_umask();
- error = may_mknod(mode);
- if (error)
- goto out_dput;
- error = mnt_want_write(path.mnt);
- if (error)
- goto out_dput;
error = security_path_mknod(&path, dentry, mode, dev);
if (error)
- goto out_drop_write;
+ goto out;
switch (mode & S_IFMT) {
case 0: case S_IFREG:
error = vfs_create(path.dentry->d_inode,dentry,mode,true);
@@ -2986,13 +3128,8 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
break;
}
-out_drop_write:
- mnt_drop_write(path.mnt);
-out_dput:
- dput(dentry);
- mutex_unlock(&path.dentry->d_inode->i_mutex);
- path_put(&path);
-
+out:
+ done_path_create(&path, dentry);
return error;
}
@@ -3038,19 +3175,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
if (!IS_POSIXACL(path.dentry->d_inode))
mode &= ~current_umask();
- error = mnt_want_write(path.mnt);
- if (error)
- goto out_dput;
error = security_path_mkdir(&path, dentry, mode);
- if (error)
- goto out_drop_write;
- error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
-out_drop_write:
- mnt_drop_write(path.mnt);
-out_dput:
- dput(dentry);
- mutex_unlock(&path.dentry->d_inode->i_mutex);
- path_put(&path);
+ if (!error)
+ error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
+ done_path_create(&path, dentry);
return error;
}
@@ -3144,6 +3272,9 @@ static long do_rmdir(int dfd, const char __user *pathname)
}
nd.flags &= ~LOOKUP_PARENT;
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto exit1;
mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
dentry = lookup_hash(&nd);
@@ -3154,19 +3285,15 @@ static long do_rmdir(int dfd, const char __user *pathname)
error = -ENOENT;
goto exit3;
}
- error = mnt_want_write(nd.path.mnt);
- if (error)
- goto exit3;
error = security_path_rmdir(&nd.path, dentry);
if (error)
- goto exit4;
+ goto exit3;
error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
-exit4:
- mnt_drop_write(nd.path.mnt);
exit3:
dput(dentry);
exit2:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+ mnt_drop_write(nd.path.mnt);
exit1:
path_put(&nd.path);
putname(name);
@@ -3233,6 +3360,9 @@ static long do_unlinkat(int dfd, const char __user *pathname)
goto exit1;
nd.flags &= ~LOOKUP_PARENT;
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto exit1;
mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
dentry = lookup_hash(&nd);
@@ -3245,21 +3375,17 @@ static long do_unlinkat(int dfd, const char __user *pathname)
if (!inode)
goto slashes;
ihold(inode);
- error = mnt_want_write(nd.path.mnt);
- if (error)
- goto exit2;
error = security_path_unlink(&nd.path, dentry);
if (error)
- goto exit3;
+ goto exit2;
error = vfs_unlink(nd.path.dentry->d_inode, dentry);
-exit3:
- mnt_drop_write(nd.path.mnt);
- exit2:
+exit2:
dput(dentry);
}
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
if (inode)
iput(inode); /* truncate the inode here */
+ mnt_drop_write(nd.path.mnt);
exit1:
path_put(&nd.path);
putname(name);
@@ -3324,19 +3450,10 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
if (IS_ERR(dentry))
goto out_putname;
- error = mnt_want_write(path.mnt);
- if (error)
- goto out_dput;
error = security_path_symlink(&path, dentry, from);
- if (error)
- goto out_drop_write;
- error = vfs_symlink(path.dentry->d_inode, dentry, from);
-out_drop_write:
- mnt_drop_write(path.mnt);
-out_dput:
- dput(dentry);
- mutex_unlock(&path.dentry->d_inode->i_mutex);
- path_put(&path);
+ if (!error)
+ error = vfs_symlink(path.dentry->d_inode, dentry, from);
+ done_path_create(&path, dentry);
out_putname:
putname(from);
return error;
@@ -3436,19 +3553,15 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
error = -EXDEV;
if (old_path.mnt != new_path.mnt)
goto out_dput;
- error = mnt_want_write(new_path.mnt);
- if (error)
+ error = may_linkat(&old_path);
+ if (unlikely(error))
goto out_dput;
error = security_path_link(old_path.dentry, &new_path, new_dentry);
if (error)
- goto out_drop_write;
+ goto out_dput;
error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry);
-out_drop_write:
- mnt_drop_write(new_path.mnt);
out_dput:
- dput(new_dentry);
- mutex_unlock(&new_path.dentry->d_inode->i_mutex);
- path_put(&new_path);
+ done_path_create(&new_path, new_dentry);
out:
path_put(&old_path);
@@ -3644,6 +3757,10 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
if (newnd.last_type != LAST_NORM)
goto exit2;
+ error = mnt_want_write(oldnd.path.mnt);
+ if (error)
+ goto exit2;
+
oldnd.flags &= ~LOOKUP_PARENT;
newnd.flags &= ~LOOKUP_PARENT;
newnd.flags |= LOOKUP_RENAME_TARGET;
@@ -3679,23 +3796,19 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
if (new_dentry == trap)
goto exit5;
- error = mnt_want_write(oldnd.path.mnt);
- if (error)
- goto exit5;
error = security_path_rename(&oldnd.path, old_dentry,
&newnd.path, new_dentry);
if (error)
- goto exit6;
+ goto exit5;
error = vfs_rename(old_dir->d_inode, old_dentry,
new_dir->d_inode, new_dentry);
-exit6:
- mnt_drop_write(oldnd.path.mnt);
exit5:
dput(new_dentry);
exit4:
dput(old_dentry);
exit3:
unlock_rename(new_dir, old_dir);
+ mnt_drop_write(oldnd.path.mnt);
exit2:
path_put(&newnd.path);
putname(to);
diff --git a/fs/namespace.c b/fs/namespace.c
index c53d3381b0d0..4d31f73e2561 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -283,24 +283,22 @@ static int mnt_is_readonly(struct vfsmount *mnt)
}
/*
- * Most r/o checks on a fs are for operations that take
- * discrete amounts of time, like a write() or unlink().
- * We must keep track of when those operations start
- * (for permission checks) and when they end, so that
- * we can determine when writes are able to occur to
- * a filesystem.
+ * Most r/o & frozen checks on a fs are for operations that take discrete
+ * amounts of time, like a write() or unlink(). We must keep track of when
+ * those operations start (for permission checks) and when they end, so that we
+ * can determine when writes are able to occur to a filesystem.
*/
/**
- * mnt_want_write - get write access to a mount
+ * __mnt_want_write - get write access to a mount without freeze protection
* @m: the mount on which to take a write
*
- * This tells the low-level filesystem that a write is
- * about to be performed to it, and makes sure that
- * writes are allowed before returning success. When
- * the write operation is finished, mnt_drop_write()
- * must be called. This is effectively a refcount.
+ * This tells the low-level filesystem that a write is about to be performed to
+ * it, and makes sure that writes are allowed (mnt it read-write) before
+ * returning success. This operation does not protect against filesystem being
+ * frozen. When the write operation is finished, __mnt_drop_write() must be
+ * called. This is effectively a refcount.
*/
-int mnt_want_write(struct vfsmount *m)
+int __mnt_want_write(struct vfsmount *m)
{
struct mount *mnt = real_mount(m);
int ret = 0;
@@ -326,6 +324,27 @@ int mnt_want_write(struct vfsmount *m)
ret = -EROFS;
}
preempt_enable();
+
+ return ret;
+}
+
+/**
+ * mnt_want_write - get write access to a mount
+ * @m: the mount on which to take a write
+ *
+ * This tells the low-level filesystem that a write is about to be performed to
+ * it, and makes sure that writes are allowed (mount is read-write, filesystem
+ * is not frozen) before returning success. When the write operation is
+ * finished, mnt_drop_write() must be called. This is effectively a refcount.
+ */
+int mnt_want_write(struct vfsmount *m)
+{
+ int ret;
+
+ sb_start_write(m->mnt_sb);
+ ret = __mnt_want_write(m);
+ if (ret)
+ sb_end_write(m->mnt_sb);
return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);
@@ -355,38 +374,76 @@ int mnt_clone_write(struct vfsmount *mnt)
EXPORT_SYMBOL_GPL(mnt_clone_write);
/**
- * mnt_want_write_file - get write access to a file's mount
+ * __mnt_want_write_file - get write access to a file's mount
* @file: the file who's mount on which to take a write
*
- * This is like mnt_want_write, but it takes a file and can
+ * This is like __mnt_want_write, but it takes a file and can
* do some optimisations if the file is open for write already
*/
-int mnt_want_write_file(struct file *file)
+int __mnt_want_write_file(struct file *file)
{
struct inode *inode = file->f_dentry->d_inode;
+
if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
- return mnt_want_write(file->f_path.mnt);
+ return __mnt_want_write(file->f_path.mnt);
else
return mnt_clone_write(file->f_path.mnt);
}
+
+/**
+ * mnt_want_write_file - get write access to a file's mount
+ * @file: the file who's mount on which to take a write
+ *
+ * This is like mnt_want_write, but it takes a file and can
+ * do some optimisations if the file is open for write already
+ */
+int mnt_want_write_file(struct file *file)
+{
+ int ret;
+
+ sb_start_write(file->f_path.mnt->mnt_sb);
+ ret = __mnt_want_write_file(file);
+ if (ret)
+ sb_end_write(file->f_path.mnt->mnt_sb);
+ return ret;
+}
EXPORT_SYMBOL_GPL(mnt_want_write_file);
/**
- * mnt_drop_write - give up write access to a mount
+ * __mnt_drop_write - give up write access to a mount
* @mnt: the mount on which to give up write access
*
* Tells the low-level filesystem that we are done
* performing writes to it. Must be matched with
- * mnt_want_write() call above.
+ * __mnt_want_write() call above.
*/
-void mnt_drop_write(struct vfsmount *mnt)
+void __mnt_drop_write(struct vfsmount *mnt)
{
preempt_disable();
mnt_dec_writers(real_mount(mnt));
preempt_enable();
}
+
+/**
+ * mnt_drop_write - give up write access to a mount
+ * @mnt: the mount on which to give up write access
+ *
+ * Tells the low-level filesystem that we are done performing writes to it and
+ * also allows filesystem to be frozen again. Must be matched with
+ * mnt_want_write() call above.
+ */
+void mnt_drop_write(struct vfsmount *mnt)
+{
+ __mnt_drop_write(mnt);
+ sb_end_write(mnt->mnt_sb);
+}
EXPORT_SYMBOL_GPL(mnt_drop_write);
+void __mnt_drop_write_file(struct file *file)
+{
+ __mnt_drop_write(file->f_path.mnt);
+}
+
void mnt_drop_write_file(struct file *file)
{
mnt_drop_write(file->f_path.mnt);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 5ff0b7b9fc08..43295d45cc2b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -154,6 +154,10 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
if (status < 0)
return;
+ status = mnt_want_write_file(rec_file);
+ if (status)
+ return;
+
dir = rec_file->f_path.dentry;
/* lock the parent */
mutex_lock(&dir->d_inode->i_mutex);
@@ -173,11 +177,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
* as well be forgiving and just succeed silently.
*/
goto out_put;
- status = mnt_want_write_file(rec_file);
- if (status)
- goto out_put;
status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU);
- mnt_drop_write_file(rec_file);
out_put:
dput(dentry);
out_unlock:
@@ -189,6 +189,7 @@ out_unlock:
" (err %d); please check that %s exists"
" and is writeable", status,
user_recovery_dirname);
+ mnt_drop_write_file(rec_file);
nfs4_reset_creds(original_cred);
}
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index cc793005a87c..032af381b3aa 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -635,6 +635,7 @@ fh_put(struct svc_fh *fhp)
fhp->fh_post_saved = 0;
#endif
}
+ fh_drop_write(fhp);
if (exp) {
exp_put(exp);
fhp->fh_export = NULL;
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index e15dc45fc5ec..aad6d457b9e8 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -196,6 +196,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
struct dentry *dchild;
int type, mode;
__be32 nfserr;
+ int hosterr;
dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size);
dprintk("nfsd: CREATE %s %.*s\n",
@@ -214,6 +215,12 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
nfserr = nfserr_exist;
if (isdotent(argp->name, argp->len))
goto done;
+ hosterr = fh_want_write(dirfhp);
+ if (hosterr) {
+ nfserr = nfserrno(hosterr);
+ goto done;
+ }
+
fh_lock_nested(dirfhp, I_MUTEX_PARENT);
dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len);
if (IS_ERR(dchild)) {
@@ -330,7 +337,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
out_unlock:
/* We don't really need to unlock, as fh_put does it. */
fh_unlock(dirfhp);
-
+ fh_drop_write(dirfhp);
done:
fh_put(dirfhp);
return nfsd_return_dirop(nfserr, resp);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 702f64e820c3..a9269f142cc4 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1284,6 +1284,10 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
* If it has, the parent directory should already be locked.
*/
if (!resfhp->fh_dentry) {
+ host_err = fh_want_write(fhp);
+ if (host_err)
+ goto out_nfserr;
+
/* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */
fh_lock_nested(fhp, I_MUTEX_PARENT);
dchild = lookup_one_len(fname, dentry, flen);
@@ -1327,14 +1331,11 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out;
}
- host_err = fh_want_write(fhp);
- if (host_err)
- goto out_nfserr;
-
/*
* Get the dir op function pointer.
*/
err = 0;
+ host_err = 0;
switch (type) {
case S_IFREG:
host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
@@ -1351,10 +1352,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
break;
}
- if (host_err < 0) {
- fh_drop_write(fhp);
+ if (host_err < 0)
goto out_nfserr;
- }
err = nfsd_create_setattr(rqstp, resfhp, iap);
@@ -1366,7 +1365,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
err2 = nfserrno(commit_metadata(fhp));
if (err2)
err = err2;
- fh_drop_write(fhp);
/*
* Update the file handle to get the new inode info.
*/
@@ -1425,6 +1423,11 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = nfserr_notdir;
if (!dirp->i_op->lookup)
goto out;
+
+ host_err = fh_want_write(fhp);
+ if (host_err)
+ goto out_nfserr;
+
fh_lock_nested(fhp, I_MUTEX_PARENT);
/*
@@ -1457,9 +1460,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
v_atime = verifier[1]&0x7fffffff;
}
- host_err = fh_want_write(fhp);
- if (host_err)
- goto out_nfserr;
if (dchild->d_inode) {
err = 0;
@@ -1530,7 +1530,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (!err)
err = nfserrno(commit_metadata(fhp));
- fh_drop_write(fhp);
/*
* Update the filehandle to get the new inode info.
*/
@@ -1541,6 +1540,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
fh_unlock(fhp);
if (dchild && !IS_ERR(dchild))
dput(dchild);
+ fh_drop_write(fhp);
return err;
out_nfserr:
@@ -1621,6 +1621,11 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
if (err)
goto out;
+
+ host_err = fh_want_write(fhp);
+ if (host_err)
+ goto out_nfserr;
+
fh_lock(fhp);
dentry = fhp->fh_dentry;
dnew = lookup_one_len(fname, dentry, flen);
@@ -1628,10 +1633,6 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (IS_ERR(dnew))
goto out_nfserr;
- host_err = fh_want_write(fhp);
- if (host_err)
- goto out_nfserr;
-
if (unlikely(path[plen] != 0)) {
char *path_alloced = kmalloc(plen+1, GFP_KERNEL);
if (path_alloced == NULL)
@@ -1691,6 +1692,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
if (isdotent(name, len))
goto out;
+ host_err = fh_want_write(tfhp);
+ if (host_err) {
+ err = nfserrno(host_err);
+ goto out;
+ }
+
fh_lock_nested(ffhp, I_MUTEX_PARENT);
ddir = ffhp->fh_dentry;
dirp = ddir->d_inode;
@@ -1702,18 +1709,13 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
dold = tfhp->fh_dentry;
- host_err = fh_want_write(tfhp);
- if (host_err) {
- err = nfserrno(host_err);
- goto out_dput;
- }
err = nfserr_noent;
if (!dold->d_inode)
- goto out_drop_write;
+ goto out_dput;
host_err = nfsd_break_lease(dold->d_inode);
if (host_err) {
err = nfserrno(host_err);
- goto out_drop_write;
+ goto out_dput;
}
host_err = vfs_link(dold, dirp, dnew);
if (!host_err) {
@@ -1726,12 +1728,11 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
else
err = nfserrno(host_err);
}
-out_drop_write:
- fh_drop_write(tfhp);
out_dput:
dput(dnew);
out_unlock:
fh_unlock(ffhp);
+ fh_drop_write(tfhp);
out:
return err;
@@ -1774,6 +1775,12 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
goto out;
+ host_err = fh_want_write(ffhp);
+ if (host_err) {
+ err = nfserrno(host_err);
+ goto out;
+ }
+
/* cannot use fh_lock as we need deadlock protective ordering
* so do it by hand */
trap = lock_rename(tdentry, fdentry);
@@ -1804,17 +1811,14 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
host_err = -EXDEV;
if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
goto out_dput_new;
- host_err = fh_want_write(ffhp);
- if (host_err)
- goto out_dput_new;
host_err = nfsd_break_lease(odentry->d_inode);
if (host_err)
- goto out_drop_write;
+ goto out_dput_new;
if (ndentry->d_inode) {
host_err = nfsd_break_lease(ndentry->d_inode);
if (host_err)
- goto out_drop_write;
+ goto out_dput_new;
}
host_err = vfs_rename(fdir, odentry, tdir, ndentry);
if (!host_err) {
@@ -1822,8 +1826,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
if (!host_err)
host_err = commit_metadata(ffhp);
}
-out_drop_write:
- fh_drop_write(ffhp);
out_dput_new:
dput(ndentry);
out_dput_old:
@@ -1839,6 +1841,7 @@ out_drop_write:
fill_post_wcc(tfhp);
unlock_rename(tdentry, fdentry);
ffhp->fh_locked = tfhp->fh_locked = 0;
+ fh_drop_write(ffhp);
out:
return err;
@@ -1864,6 +1867,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
if (err)
goto out;
+ host_err = fh_want_write(fhp);
+ if (host_err)
+ goto out_nfserr;
+
fh_lock_nested(fhp, I_MUTEX_PARENT);
dentry = fhp->fh_dentry;
dirp = dentry->d_inode;
@@ -1882,21 +1889,15 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
if (!type)
type = rdentry->d_inode->i_mode & S_IFMT;
- host_err = fh_want_write(fhp);
- if (host_err)
- goto out_put;
-
host_err = nfsd_break_lease(rdentry->d_inode);
if (host_err)
- goto out_drop_write;
+ goto out_put;
if (type != S_IFDIR)
host_err = vfs_unlink(dirp, rdentry);
else
host_err = vfs_rmdir(dirp, rdentry);
if (!host_err)
host_err = commit_metadata(fhp);
-out_drop_write:
- fh_drop_write(fhp);
out_put:
dput(rdentry);
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index ec0611b2b738..359594c393d2 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -110,12 +110,19 @@ int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *);
static inline int fh_want_write(struct svc_fh *fh)
{
- return mnt_want_write(fh->fh_export->ex_path.mnt);
+ int ret = mnt_want_write(fh->fh_export->ex_path.mnt);
+
+ if (!ret)
+ fh->fh_want_write = 1;
+ return ret;
}
static inline void fh_drop_write(struct svc_fh *fh)
{
- mnt_drop_write(fh->fh_export->ex_path.mnt);
+ if (fh->fh_want_write) {
+ fh->fh_want_write = 0;
+ mnt_drop_write(fh->fh_export->ex_path.mnt);
+ }
}
#endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 62cebc8e1a1f..a4d56ac02e6c 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -69,16 +69,18 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
struct page *page = vmf->page;
struct inode *inode = vma->vm_file->f_dentry->d_inode;
struct nilfs_transaction_info ti;
- int ret;
+ int ret = 0;
if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info)))
return VM_FAULT_SIGBUS; /* -ENOSPC */
+ sb_start_pagefault(inode->i_sb);
lock_page(page);
if (page->mapping != inode->i_mapping ||
page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) {
unlock_page(page);
- return VM_FAULT_NOPAGE; /* make the VM retry the fault */
+ ret = -EFAULT; /* make the VM retry the fault */
+ goto out;
}
/*
@@ -112,19 +114,21 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);
/* never returns -ENOMEM, but may return -ENOSPC */
if (unlikely(ret))
- return VM_FAULT_SIGBUS;
+ goto out;
- ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
- if (ret != VM_FAULT_LOCKED) {
+ ret = __block_page_mkwrite(vma, vmf, nilfs_get_block);
+ if (ret) {
nilfs_transaction_abort(inode->i_sb);
- return ret;
+ goto out;
}
nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits));
nilfs_transaction_commit(inode->i_sb);
mapped:
wait_on_page_writeback(page);
- return VM_FAULT_LOCKED;
+ out:
+ sb_end_pagefault(inode->i_sb);
+ return block_page_mkwrite_return(ret);
}
static const struct vm_operations_struct nilfs_file_vm_ops = {
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 0b6387c67e6c..fdb180769485 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -660,8 +660,6 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
goto out_free;
}
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
-
ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
if (ret < 0)
printk(KERN_ERR "NILFS: GC failed during preparation: "
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 88e11fb346b6..a5752a589932 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -189,7 +189,7 @@ int nilfs_transaction_begin(struct super_block *sb,
if (ret > 0)
return 0;
- vfs_check_frozen(sb, SB_FREEZE_WRITE);
+ sb_start_intwrite(sb);
nilfs = sb->s_fs_info;
down_read(&nilfs->ns_segctor_sem);
@@ -205,6 +205,7 @@ int nilfs_transaction_begin(struct super_block *sb,
current->journal_info = ti->ti_save;
if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
kmem_cache_free(nilfs_transaction_cachep, ti);
+ sb_end_intwrite(sb);
return ret;
}
@@ -246,6 +247,7 @@ int nilfs_transaction_commit(struct super_block *sb)
err = nilfs_construct_segment(sb);
if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
kmem_cache_free(nilfs_transaction_cachep, ti);
+ sb_end_intwrite(sb);
return err;
}
@@ -264,6 +266,7 @@ void nilfs_transaction_abort(struct super_block *sb)
current->journal_info = ti->ti_save;
if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
kmem_cache_free(nilfs_transaction_cachep, ti);
+ sb_end_intwrite(sb);
}
void nilfs_relax_pressure_in_lock(struct super_block *sb)
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 7389d2d5e51d..1ecf46448f85 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2084,7 +2084,6 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
if (err)
return err;
pos = *ppos;
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
/* We can write back this queue in page reclaim. */
current->backing_dev_info = mapping->backing_dev_info;
written = 0;
@@ -2119,6 +2118,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
BUG_ON(iocb->ki_pos != pos);
+ sb_start_write(inode->i_sb);
mutex_lock(&inode->i_mutex);
ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
@@ -2127,6 +2127,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (err < 0)
ret = err;
}
+ sb_end_write(inode->i_sb);
return ret;
}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7602783d7f41..46a1f6d75104 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1971,6 +1971,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
{
struct inode *inode = file->f_path.dentry->d_inode;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int ret;
if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
!ocfs2_writes_unwritten_extents(osb))
@@ -1985,7 +1986,12 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
- return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+ ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
+ mnt_drop_write_file(file);
+ return ret;
}
static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
@@ -2261,7 +2267,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
if (iocb->ki_left == 0)
return 0;
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+ sb_start_write(inode->i_sb);
appending = file->f_flags & O_APPEND ? 1 : 0;
direct_io = file->f_flags & O_DIRECT ? 1 : 0;
@@ -2436,6 +2442,7 @@ out_sems:
ocfs2_iocb_clear_sem_locked(iocb);
mutex_unlock(&inode->i_mutex);
+ sb_end_write(inode->i_sb);
if (written)
ret = written;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index d96f7f81d8dd..f20edcbfe700 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -928,7 +928,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (get_user(new_clusters, (int __user *)arg))
return -EFAULT;
- return ocfs2_group_extend(inode, new_clusters);
+ status = mnt_want_write_file(filp);
+ if (status)
+ return status;
+ status = ocfs2_group_extend(inode, new_clusters);
+ mnt_drop_write_file(filp);
+ return status;
case OCFS2_IOC_GROUP_ADD:
case OCFS2_IOC_GROUP_ADD64:
if (!capable(CAP_SYS_RESOURCE))
@@ -937,7 +942,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
return -EFAULT;
- return ocfs2_group_add(inode, &input);
+ status = mnt_want_write_file(filp);
+ if (status)
+ return status;
+ status = ocfs2_group_add(inode, &input);
+ mnt_drop_write_file(filp);
+ return status;
case OCFS2_IOC_REFLINK:
if (copy_from_user(&args, argp, sizeof(args)))
return -EFAULT;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 0a42ae96dca7..2dd36af79e26 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -355,11 +355,14 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
if (journal_current_handle())
return jbd2_journal_start(journal, max_buffs);
+ sb_start_intwrite(osb->sb);
+
down_read(&osb->journal->j_trans_barrier);
handle = jbd2_journal_start(journal, max_buffs);
if (IS_ERR(handle)) {
up_read(&osb->journal->j_trans_barrier);
+ sb_end_intwrite(osb->sb);
mlog_errno(PTR_ERR(handle));
@@ -388,8 +391,10 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
if (ret < 0)
mlog_errno(ret);
- if (!nested)
+ if (!nested) {
up_read(&journal->j_trans_barrier);
+ sb_end_intwrite(osb->sb);
+ }
return ret;
}
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 9cd41083e991..d150372fd81d 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -136,6 +136,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
sigset_t oldset;
int ret;
+ sb_start_pagefault(inode->i_sb);
ocfs2_block_signals(&oldset);
/*
@@ -165,6 +166,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
out:
ocfs2_unblock_signals(&oldset);
+ sb_end_pagefault(inode->i_sb);
return ret;
}
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 9f32d7cbb7a3..30a055049e16 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4466,20 +4466,11 @@ int ocfs2_reflink_ioctl(struct inode *inode,
goto out_dput;
}
- error = mnt_want_write(new_path.mnt);
- if (error) {
- mlog_errno(error);
- goto out_dput;
- }
-
error = ocfs2_vfs_reflink(old_path.dentry,
new_path.dentry->d_inode,
new_dentry, preserve);
- mnt_drop_write(new_path.mnt);
out_dput:
- dput(new_dentry);
- mutex_unlock(&new_path.dentry->d_inode->i_mutex);
- path_put(&new_path);
+ done_path_create(&new_path, new_dentry);
out:
path_put(&old_path);
diff --git a/fs/open.c b/fs/open.c
index 1e914b397e12..f3d96e7e7b19 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -164,11 +164,13 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
if (IS_APPEND(inode))
goto out_putf;
+ sb_start_write(inode->i_sb);
error = locks_verify_truncate(inode, file, length);
if (!error)
error = security_path_truncate(&file->f_path);
if (!error)
error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
+ sb_end_write(inode->i_sb);
out_putf:
fput(file);
out:
@@ -266,7 +268,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (!file->f_op->fallocate)
return -EOPNOTSUPP;
- return file->f_op->fallocate(file, mode, offset, len);
+ sb_start_write(inode->i_sb);
+ ret = file->f_op->fallocate(file, mode, offset, len);
+ sb_end_write(inode->i_sb);
+ return ret;
}
SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
@@ -620,7 +625,7 @@ static inline int __get_file_write_access(struct inode *inode,
/*
* Balanced in __fput()
*/
- error = mnt_want_write(mnt);
+ error = __mnt_want_write(mnt);
if (error)
put_write_access(inode);
}
@@ -654,6 +659,7 @@ static int do_dentry_open(struct file *f,
if (unlikely(f->f_flags & O_PATH))
f->f_mode = FMODE_PATH;
+ path_get(&f->f_path);
inode = f->f_path.dentry->d_inode;
if (f->f_mode & FMODE_WRITE) {
error = __get_file_write_access(inode, f->f_path.mnt);
@@ -739,9 +745,7 @@ int finish_open(struct file *file, struct dentry *dentry,
int error;
BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
- mntget(file->f_path.mnt);
- file->f_path.dentry = dget(dentry);
-
+ file->f_path.dentry = dentry;
error = do_dentry_open(file, open, current_cred());
if (!error)
*opened |= FILE_OPENED;
@@ -784,7 +788,6 @@ struct file *dentry_open(const struct path *path, int flags,
f->f_flags = flags;
f->f_path = *path;
- path_get(&f->f_path);
error = do_dentry_open(f, NULL, cred);
if (!error) {
error = open_check_o_direct(f);
diff --git a/fs/pipe.c b/fs/pipe.c
index 95cbd6b227e6..8d85d7068c1e 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1016,18 +1016,16 @@ fail_inode:
return NULL;
}
-struct file *create_write_pipe(int flags)
+int create_pipe_files(struct file **res, int flags)
{
int err;
- struct inode *inode;
+ struct inode *inode = get_pipe_inode();
struct file *f;
struct path path;
- struct qstr name = { .name = "" };
+ static struct qstr name = { .name = "" };
- err = -ENFILE;
- inode = get_pipe_inode();
if (!inode)
- goto err;
+ return -ENFILE;
err = -ENOMEM;
path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
@@ -1041,62 +1039,43 @@ struct file *create_write_pipe(int flags)
f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
if (!f)
goto err_dentry;
- f->f_mapping = inode->i_mapping;
f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
- f->f_version = 0;
- return f;
+ res[0] = alloc_file(&path, FMODE_READ, &read_pipefifo_fops);
+ if (!res[0])
+ goto err_file;
+
+ path_get(&path);
+ res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK);
+ res[1] = f;
+ return 0;
- err_dentry:
+err_file:
+ put_filp(f);
+err_dentry:
free_pipe_info(inode);
path_put(&path);
- return ERR_PTR(err);
+ return err;
- err_inode:
+err_inode:
free_pipe_info(inode);
iput(inode);
- err:
- return ERR_PTR(err);
-}
-
-void free_write_pipe(struct file *f)
-{
- free_pipe_info(f->f_dentry->d_inode);
- path_put(&f->f_path);
- put_filp(f);
-}
-
-struct file *create_read_pipe(struct file *wrf, int flags)
-{
- /* Grab pipe from the writer */
- struct file *f = alloc_file(&wrf->f_path, FMODE_READ,
- &read_pipefifo_fops);
- if (!f)
- return ERR_PTR(-ENFILE);
-
- path_get(&wrf->f_path);
- f->f_flags = O_RDONLY | (flags & O_NONBLOCK);
-
- return f;
+ return err;
}
int do_pipe_flags(int *fd, int flags)
{
- struct file *fw, *fr;
+ struct file *files[2];
int error;
int fdw, fdr;
if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
return -EINVAL;
- fw = create_write_pipe(flags);
- if (IS_ERR(fw))
- return PTR_ERR(fw);
- fr = create_read_pipe(fw, flags);
- error = PTR_ERR(fr);
- if (IS_ERR(fr))
- goto err_write_pipe;
+ error = create_pipe_files(files, flags);
+ if (error)
+ return error;
error = get_unused_fd_flags(flags);
if (error < 0)
@@ -1109,8 +1088,8 @@ int do_pipe_flags(int *fd, int flags)
fdw = error;
audit_fd_pair(fdr, fdw);
- fd_install(fdr, fr);
- fd_install(fdw, fw);
+ fd_install(fdr, files[0]);
+ fd_install(fdw, files[1]);
fd[0] = fdr;
fd[1] = fdw;
@@ -1119,10 +1098,8 @@ int do_pipe_flags(int *fd, int flags)
err_fdr:
put_unused_fd(fdr);
err_read_pipe:
- path_put(&fr->f_path);
- put_filp(fr);
- err_write_pipe:
- free_write_pipe(fw);
+ fput(files[0]);
+ fput(files[1]);
return error;
}
diff --git a/fs/splice.c b/fs/splice.c
index 7bf08fa22ec9..41514dd89462 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -996,6 +996,8 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
};
ssize_t ret;
+ sb_start_write(inode->i_sb);
+
pipe_lock(pipe);
splice_from_pipe_begin(&sd);
@@ -1034,6 +1036,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
*ppos += ret;
balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
}
+ sb_end_write(inode->i_sb);
return ret;
}
diff --git a/fs/super.c b/fs/super.c
index 4bf714459a4b..b05cf47463d0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -33,12 +33,19 @@
#include <linux/rculist_bl.h>
#include <linux/cleancache.h>
#include <linux/fsnotify.h>
+#include <linux/lockdep.h>
#include "internal.h"
LIST_HEAD(super_blocks);
DEFINE_SPINLOCK(sb_lock);
+static char *sb_writers_name[SB_FREEZE_LEVELS] = {
+ "sb_writers",
+ "sb_pagefaults",
+ "sb_internal",
+};
+
/*
* One thing we have to be careful of with a per-sb shrinker is that we don't
* drop the last active reference to the superblock from within the shrinker.
@@ -102,6 +109,35 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
return total_objects;
}
+static int init_sb_writers(struct super_block *s, struct file_system_type *type)
+{
+ int err;
+ int i;
+
+ for (i = 0; i < SB_FREEZE_LEVELS; i++) {
+ err = percpu_counter_init(&s->s_writers.counter[i], 0);
+ if (err < 0)
+ goto err_out;
+ lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
+ &type->s_writers_key[i], 0);
+ }
+ init_waitqueue_head(&s->s_writers.wait);
+ init_waitqueue_head(&s->s_writers.wait_unfrozen);
+ return 0;
+err_out:
+ while (--i >= 0)
+ percpu_counter_destroy(&s->s_writers.counter[i]);
+ return err;
+}
+
+static void destroy_sb_writers(struct super_block *s)
+{
+ int i;
+
+ for (i = 0; i < SB_FREEZE_LEVELS; i++)
+ percpu_counter_destroy(&s->s_writers.counter[i]);
+}
+
/**
* alloc_super - create new superblock
* @type: filesystem type superblock should belong to
@@ -117,18 +153,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
if (s) {
if (security_sb_alloc(s)) {
+ /*
+ * We cannot call security_sb_free() without
+ * security_sb_alloc() succeeding. So bail out manually
+ */
kfree(s);
s = NULL;
goto out;
}
#ifdef CONFIG_SMP
s->s_files = alloc_percpu(struct list_head);
- if (!s->s_files) {
- security_sb_free(s);
- kfree(s);
- s = NULL;
- goto out;
- } else {
+ if (!s->s_files)
+ goto err_out;
+ else {
int i;
for_each_possible_cpu(i)
@@ -137,6 +174,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
#else
INIT_LIST_HEAD(&s->s_files);
#endif
+ if (init_sb_writers(s, type))
+ goto err_out;
s->s_flags = flags;
s->s_bdi = &default_backing_dev_info;
INIT_HLIST_NODE(&s->s_instances);
@@ -178,7 +217,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
mutex_init(&s->s_dquot.dqio_mutex);
mutex_init(&s->s_dquot.dqonoff_mutex);
init_rwsem(&s->s_dquot.dqptr_sem);
- init_waitqueue_head(&s->s_wait_unfrozen);
s->s_maxbytes = MAX_NON_LFS;
s->s_op = &default_op;
s->s_time_gran = 1000000000;
@@ -190,6 +228,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
}
out:
return s;
+err_out:
+ security_sb_free(s);
+#ifdef CONFIG_SMP
+ if (s->s_files)
+ free_percpu(s->s_files);
+#endif
+ destroy_sb_writers(s);
+ kfree(s);
+ s = NULL;
+ goto out;
}
/**
@@ -203,6 +251,7 @@ static inline void destroy_super(struct super_block *s)
#ifdef CONFIG_SMP
free_percpu(s->s_files);
#endif
+ destroy_sb_writers(s);
security_sb_free(s);
WARN_ON(!list_empty(&s->s_mounts));
kfree(s->s_subtype);
@@ -651,10 +700,11 @@ struct super_block *get_super_thawed(struct block_device *bdev)
{
while (1) {
struct super_block *s = get_super(bdev);
- if (!s || s->s_frozen == SB_UNFROZEN)
+ if (!s || s->s_writers.frozen == SB_UNFROZEN)
return s;
up_read(&s->s_umount);
- vfs_check_frozen(s, SB_FREEZE_WRITE);
+ wait_event(s->s_writers.wait_unfrozen,
+ s->s_writers.frozen == SB_UNFROZEN);
put_super(s);
}
}
@@ -732,7 +782,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
int retval;
int remount_ro;
- if (sb->s_frozen != SB_UNFROZEN)
+ if (sb->s_writers.frozen != SB_UNFROZEN)
return -EBUSY;
#ifdef CONFIG_BLOCK
@@ -1163,6 +1213,120 @@ out:
return ERR_PTR(error);
}
+/*
+ * This is an internal function, please use sb_end_{write,pagefault,intwrite}
+ * instead.
+ */
+void __sb_end_write(struct super_block *sb, int level)
+{
+ percpu_counter_dec(&sb->s_writers.counter[level-1]);
+ /*
+ * Make sure s_writers are updated before we wake up waiters in
+ * freeze_super().
+ */
+ smp_mb();
+ if (waitqueue_active(&sb->s_writers.wait))
+ wake_up(&sb->s_writers.wait);
+ rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
+}
+EXPORT_SYMBOL(__sb_end_write);
+
+#ifdef CONFIG_LOCKDEP
+/*
+ * We want lockdep to tell us about possible deadlocks with freezing but
+ * it's it bit tricky to properly instrument it. Getting a freeze protection
+ * works as getting a read lock but there are subtle problems. XFS for example
+ * gets freeze protection on internal level twice in some cases, which is OK
+ * only because we already hold a freeze protection also on higher level. Due
+ * to these cases we have to tell lockdep we are doing trylock when we
+ * already hold a freeze protection for a higher freeze level.
+ */
+static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
+ unsigned long ip)
+{
+ int i;
+
+ if (!trylock) {
+ for (i = 0; i < level - 1; i++)
+ if (lock_is_held(&sb->s_writers.lock_map[i])) {
+ trylock = true;
+ break;
+ }
+ }
+ rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
+}
+#endif
+
+/*
+ * This is an internal function, please use sb_start_{write,pagefault,intwrite}
+ * instead.
+ */
+int __sb_start_write(struct super_block *sb, int level, bool wait)
+{
+retry:
+ if (unlikely(sb->s_writers.frozen >= level)) {
+ if (!wait)
+ return 0;
+ wait_event(sb->s_writers.wait_unfrozen,
+ sb->s_writers.frozen < level);
+ }
+
+#ifdef CONFIG_LOCKDEP
+ acquire_freeze_lock(sb, level, !wait, _RET_IP_);
+#endif
+ percpu_counter_inc(&sb->s_writers.counter[level-1]);
+ /*
+ * Make sure counter is updated before we check for frozen.
+ * freeze_super() first sets frozen and then checks the counter.
+ */
+ smp_mb();
+ if (unlikely(sb->s_writers.frozen >= level)) {
+ __sb_end_write(sb, level);
+ goto retry;
+ }
+ return 1;
+}
+EXPORT_SYMBOL(__sb_start_write);
+
+/**
+ * sb_wait_write - wait until all writers to given file system finish
+ * @sb: the super for which we wait
+ * @level: type of writers we wait for (normal vs page fault)
+ *
+ * This function waits until there are no writers of given type to given file
+ * system. Caller of this function should make sure there can be no new writers
+ * of type @level before calling this function. Otherwise this function can
+ * livelock.
+ */
+static void sb_wait_write(struct super_block *sb, int level)
+{
+ s64 writers;
+
+ /*
+ * We just cycle-through lockdep here so that it does not complain
+ * about returning with lock to userspace
+ */
+ rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
+ rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
+
+ do {
+ DEFINE_WAIT(wait);
+
+ /*
+ * We use a barrier in prepare_to_wait() to separate setting
+ * of frozen and checking of the counter
+ */
+ prepare_to_wait(&sb->s_writers.wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
+ if (writers)
+ schedule();
+
+ finish_wait(&sb->s_writers.wait, &wait);
+ } while (writers);
+}
+
/**
* freeze_super - lock the filesystem and force it into a consistent state
* @sb: the super to lock
@@ -1170,6 +1334,31 @@ out:
* Syncs the super to make sure the filesystem is consistent and calls the fs's
* freeze_fs. Subsequent calls to this without first thawing the fs will return
* -EBUSY.
+ *
+ * During this function, sb->s_writers.frozen goes through these values:
+ *
+ * SB_UNFROZEN: File system is normal, all writes progress as usual.
+ *
+ * SB_FREEZE_WRITE: The file system is in the process of being frozen. New
+ * writes should be blocked, though page faults are still allowed. We wait for
+ * all writes to complete and then proceed to the next stage.
+ *
+ * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
+ * but internal fs threads can still modify the filesystem (although they
+ * should not dirty new pages or inodes), writeback can run etc. After waiting
+ * for all running page faults we sync the filesystem which will clean all
+ * dirty pages and inodes (no new dirty pages or inodes can be created when
+ * sync is running).
+ *
+ * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
+ * modification are blocked (e.g. XFS preallocation truncation on inode
+ * reclaim). This is usually implemented by blocking new transactions for
+ * filesystems that have them and need this additional guard. After all
+ * internal writers are finished we call ->freeze_fs() to finish filesystem
+ * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
+ * mostly auxiliary for filesystems to verify they do not modify frozen fs.
+ *
+ * sb->s_writers.frozen is protected by sb->s_umount.
*/
int freeze_super(struct super_block *sb)
{
@@ -1177,7 +1366,7 @@ int freeze_super(struct super_block *sb)
atomic_inc(&sb->s_active);
down_write(&sb->s_umount);
- if (sb->s_frozen) {
+ if (sb->s_writers.frozen != SB_UNFROZEN) {
deactivate_locked_super(sb);
return -EBUSY;
}
@@ -1188,33 +1377,53 @@ int freeze_super(struct super_block *sb)
}
if (sb->s_flags & MS_RDONLY) {
- sb->s_frozen = SB_FREEZE_TRANS;
- smp_wmb();
+ /* Nothing to do really... */
+ sb->s_writers.frozen = SB_FREEZE_COMPLETE;
up_write(&sb->s_umount);
return 0;
}
- sb->s_frozen = SB_FREEZE_WRITE;
+ /* From now on, no new normal writers can start */
+ sb->s_writers.frozen = SB_FREEZE_WRITE;
+ smp_wmb();
+
+ /* Release s_umount to preserve sb_start_write -> s_umount ordering */
+ up_write(&sb->s_umount);
+
+ sb_wait_write(sb, SB_FREEZE_WRITE);
+
+ /* Now we go and block page faults... */
+ down_write(&sb->s_umount);
+ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
smp_wmb();
+ sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
+
+ /* All writers are done so after syncing there won't be dirty data */
sync_filesystem(sb);
- sb->s_frozen = SB_FREEZE_TRANS;
+ /* Now wait for internal filesystem counter */
+ sb->s_writers.frozen = SB_FREEZE_FS;
smp_wmb();
+ sb_wait_write(sb, SB_FREEZE_FS);
- sync_blockdev(sb->s_bdev);
if (sb->s_op->freeze_fs) {
ret = sb->s_op->freeze_fs(sb);
if (ret) {
printk(KERN_ERR
"VFS:Filesystem freeze failed\n");
- sb->s_frozen = SB_UNFROZEN;
+ sb->s_writers.frozen = SB_UNFROZEN;
smp_wmb();
- wake_up(&sb->s_wait_unfrozen);
+ wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
return ret;
}
}
+ /*
+ * This is just for debugging purposes so that fs can warn if it
+ * sees write activity when frozen is set to SB_FREEZE_COMPLETE.
+ */
+ sb->s_writers.frozen = SB_FREEZE_COMPLETE;
up_write(&sb->s_umount);
return 0;
}
@@ -1231,7 +1440,7 @@ int thaw_super(struct super_block *sb)
int error;
down_write(&sb->s_umount);
- if (sb->s_frozen == SB_UNFROZEN) {
+ if (sb->s_writers.frozen == SB_UNFROZEN) {
up_write(&sb->s_umount);
return -EINVAL;
}
@@ -1244,16 +1453,15 @@ int thaw_super(struct super_block *sb)
if (error) {
printk(KERN_ERR
"VFS:Filesystem thaw failed\n");
- sb->s_frozen = SB_FREEZE_TRANS;
up_write(&sb->s_umount);
return error;
}
}
out:
- sb->s_frozen = SB_UNFROZEN;
+ sb->s_writers.frozen = SB_UNFROZEN;
smp_wmb();
- wake_up(&sb->s_wait_unfrozen);
+ wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
return 0;
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index a4759833d62d..614b2b544880 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -228,6 +228,8 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = 0;
if (bb->vm_ops->page_mkwrite)
ret = bb->vm_ops->page_mkwrite(vma, vmf);
+ else
+ file_update_time(file);
sysfs_put_active(attr_sd);
return ret;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 15052ff916ec..e562dd43f41f 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -124,6 +124,12 @@ xfs_setfilesize_trans_alloc(
ioend->io_append_trans = tp;
/*
+ * We will pass freeze protection with a transaction. So tell lockdep
+ * we released it.
+ */
+ rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 1, _THIS_IP_);
+ /*
* We hand off the transaction to the completion thread now, so
* clear the flag here.
*/
@@ -199,6 +205,15 @@ xfs_end_io(
struct xfs_inode *ip = XFS_I(ioend->io_inode);
int error = 0;
+ if (ioend->io_append_trans) {
+ /*
+ * We've got freeze protection passed with the transaction.
+ * Tell lockdep about it.
+ */
+ rwsem_acquire_read(
+ &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 0, 1, _THIS_IP_);
+ }
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
ioend->io_error = -EIO;
goto done;
@@ -1425,6 +1440,9 @@ out_trans_cancel:
if (ioend->io_append_trans) {
current_set_flags_nested(&ioend->io_append_trans->t_pflags,
PF_FSTRANS);
+ rwsem_acquire_read(
+ &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 0, 1, _THIS_IP_);
xfs_trans_cancel(ioend->io_append_trans, 0);
}
out_destroy_ioend:
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c4559c6e6f2c..56afcdb2377d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -770,10 +770,12 @@ xfs_file_aio_write(
if (ocount == 0)
return 0;
- xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
+ sb_start_write(inode->i_sb);
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return -EIO;
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ ret = -EIO;
+ goto out;
+ }
if (unlikely(file->f_flags & O_DIRECT))
ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
@@ -792,6 +794,8 @@ xfs_file_aio_write(
ret = err;
}
+out:
+ sb_end_write(inode->i_sb);
return ret;
}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 1f1535d25a9b..0e0232c3b6d9 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -364,9 +364,15 @@ xfs_fssetdm_by_handle(
if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(parfilp);
+ if (error)
+ return error;
+
dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq);
- if (IS_ERR(dentry))
+ if (IS_ERR(dentry)) {
+ mnt_drop_write_file(parfilp);
return PTR_ERR(dentry);
+ }
if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
error = -XFS_ERROR(EPERM);
@@ -382,6 +388,7 @@ xfs_fssetdm_by_handle(
fsd.fsd_dmstate);
out:
+ mnt_drop_write_file(parfilp);
dput(dentry);
return error;
}
@@ -634,7 +641,11 @@ xfs_ioc_space(
if (ioflags & IO_INVIS)
attr_flags |= XFS_ATTR_DMI;
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags);
+ mnt_drop_write_file(filp);
return -error;
}
@@ -1163,6 +1174,7 @@ xfs_ioc_fssetxattr(
{
struct fsxattr fa;
unsigned int mask;
+ int error;
if (copy_from_user(&fa, arg, sizeof(fa)))
return -EFAULT;
@@ -1171,7 +1183,12 @@ xfs_ioc_fssetxattr(
if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
mask |= FSX_NONBLOCK;
- return -xfs_ioctl_setattr(ip, &fa, mask);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+ error = xfs_ioctl_setattr(ip, &fa, mask);
+ mnt_drop_write_file(filp);
+ return -error;
}
STATIC int
@@ -1196,6 +1213,7 @@ xfs_ioc_setxflags(
struct fsxattr fa;
unsigned int flags;
unsigned int mask;
+ int error;
if (copy_from_user(&flags, arg, sizeof(flags)))
return -EFAULT;
@@ -1210,7 +1228,12 @@ xfs_ioc_setxflags(
mask |= FSX_NONBLOCK;
fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
- return -xfs_ioctl_setattr(ip, &fa, mask);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+ error = xfs_ioctl_setattr(ip, &fa, mask);
+ mnt_drop_write_file(filp);
+ return -error;
}
STATIC int
@@ -1385,8 +1408,13 @@ xfs_file_ioctl(
if (copy_from_user(&dmi, arg, sizeof(dmi)))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+
error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
dmi.fsd_dmstate);
+ mnt_drop_write_file(filp);
return -error;
}
@@ -1434,7 +1462,11 @@ xfs_file_ioctl(
if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
error = xfs_swapext(&sxp);
+ mnt_drop_write_file(filp);
return -error;
}
@@ -1463,9 +1495,14 @@ xfs_file_ioctl(
if (copy_from_user(&inout, arg, sizeof(inout)))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+
/* input parameter is passed in resblks field of structure */
in = inout.resblks;
error = xfs_reserve_blocks(mp, &in, &inout);
+ mnt_drop_write_file(filp);
if (error)
return -error;
@@ -1496,7 +1533,11 @@ xfs_file_ioctl(
if (copy_from_user(&in, arg, sizeof(in)))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
error = xfs_growfs_data(mp, &in);
+ mnt_drop_write_file(filp);
return -error;
}
@@ -1506,7 +1547,11 @@ xfs_file_ioctl(
if (copy_from_user(&in, arg, sizeof(in)))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
error = xfs_growfs_log(mp, &in);
+ mnt_drop_write_file(filp);
return -error;
}
@@ -1516,7 +1561,11 @@ xfs_file_ioctl(
if (copy_from_user(&in, arg, sizeof(in)))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
error = xfs_growfs_rt(mp, &in);
+ mnt_drop_write_file(filp);
return -error;
}
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index c4f2da0d2bf5..1244274a5674 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -600,7 +600,11 @@ xfs_file_compat_ioctl(
if (xfs_compat_growfs_data_copyin(&in, arg))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
error = xfs_growfs_data(mp, &in);
+ mnt_drop_write_file(filp);
return -error;
}
case XFS_IOC_FSGROWFSRT_32: {
@@ -608,7 +612,11 @@ xfs_file_compat_ioctl(
if (xfs_compat_growfs_rt_copyin(&in, arg))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
error = xfs_growfs_rt(mp, &in);
+ mnt_drop_write_file(filp);
return -error;
}
#endif
@@ -627,7 +635,11 @@ xfs_file_compat_ioctl(
offsetof(struct xfs_swapext, sx_stat)) ||
xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
return -XFS_ERROR(EFAULT);
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
error = xfs_swapext(&sxp);
+ mnt_drop_write_file(filp);
return -error;
}
case XFS_IOC_FSBULKSTAT_32:
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 915edf6639f0..973dff6ad935 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -680,9 +680,9 @@ xfs_iomap_write_unwritten(
* the same inode that we complete here and might deadlock
* on the iolock.
*/
- xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
+ sb_start_intwrite(mp->m_super);
tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
- tp->t_flags |= XFS_TRANS_RESERVE;
+ tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;
error = xfs_trans_reserve(tp, resblks,
XFS_WRITE_LOG_RES(mp), 0,
XFS_TRANS_PERM_LOG_RES,
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 711ca51ca3d7..29c2f83d4147 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1551,7 +1551,7 @@ xfs_unmountfs(
int
xfs_fs_writable(xfs_mount_t *mp)
{
- return !(xfs_test_for_freeze(mp) || XFS_FORCED_SHUTDOWN(mp) ||
+ return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) ||
(mp->m_flags & XFS_MOUNT_RDONLY));
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 8724336a9a08..05a05a7b6119 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -311,9 +311,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */
#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */
-#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen)
-#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l))
-
/*
* Flags for xfs_mountfs
*/
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 97304f10e78a..96548176db80 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -403,7 +403,7 @@ xfs_sync_worker(
if (!(mp->m_super->s_flags & MS_ACTIVE) &&
!(mp->m_flags & XFS_MOUNT_RDONLY)) {
/* dgc: errors ignored here */
- if (mp->m_super->s_frozen == SB_UNFROZEN &&
+ if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
xfs_log_need_covered(mp))
error = xfs_fs_log_dummy(mp);
else
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fdf324508c5e..06ed520a767f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -576,8 +576,12 @@ xfs_trans_alloc(
xfs_mount_t *mp,
uint type)
{
- xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
- return _xfs_trans_alloc(mp, type, KM_SLEEP);
+ xfs_trans_t *tp;
+
+ sb_start_intwrite(mp->m_super);
+ tp = _xfs_trans_alloc(mp, type, KM_SLEEP);
+ tp->t_flags |= XFS_TRANS_FREEZE_PROT;
+ return tp;
}
xfs_trans_t *
@@ -588,6 +592,7 @@ _xfs_trans_alloc(
{
xfs_trans_t *tp;
+ WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
atomic_inc(&mp->m_active_trans);
tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
@@ -611,6 +616,8 @@ xfs_trans_free(
xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
atomic_dec(&tp->t_mountp->m_active_trans);
+ if (tp->t_flags & XFS_TRANS_FREEZE_PROT)
+ sb_end_intwrite(tp->t_mountp->m_super);
xfs_trans_free_dqinfo(tp);
kmem_zone_free(xfs_trans_zone, tp);
}
@@ -643,7 +650,11 @@ xfs_trans_dup(
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(tp->t_ticket != NULL);
- ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE);
+ ntp->t_flags = XFS_TRANS_PERM_LOG_RES |
+ (tp->t_flags & XFS_TRANS_RESERVE) |
+ (tp->t_flags & XFS_TRANS_FREEZE_PROT);
+ /* We gave our writer reference to the new transaction */
+ tp->t_flags &= ~XFS_TRANS_FREEZE_PROT;
ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
tp->t_blk_res = tp->t_blk_res_used;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index bc2afd52a0b7..db056544cbb5 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -179,6 +179,8 @@ struct xfs_log_item_desc {
#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
+#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer
+ count in superblock */
/*
* Values for call flags parameter.
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 22f292a917a3..36abf2aa7e68 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -130,6 +130,7 @@
#define AUDIT_LAST_KERN_ANOM_MSG 1799
#define AUDIT_ANOM_PROMISCUOUS 1700 /* Device changed promiscuous mode */
#define AUDIT_ANOM_ABEND 1701 /* Process ended abnormally */
+#define AUDIT_ANOM_LINK 1702 /* Suspicious use of file links */
#define AUDIT_INTEGRITY_DATA 1800 /* Data integrity verification */
#define AUDIT_INTEGRITY_METADATA 1801 /* Metadata integrity verification */
#define AUDIT_INTEGRITY_STATUS 1802 /* Integrity enable status */
@@ -687,6 +688,8 @@ extern void audit_log_d_path(struct audit_buffer *ab,
const struct path *path);
extern void audit_log_key(struct audit_buffer *ab,
char *key);
+extern void audit_log_link_denied(const char *operation,
+ struct path *link);
extern void audit_log_lost(const char *message);
#ifdef CONFIG_SECURITY
extern void audit_log_secctx(struct audit_buffer *ab, u32 secid);
@@ -716,6 +719,7 @@ extern int audit_enabled;
#define audit_log_untrustedstring(a,s) do { ; } while (0)
#define audit_log_d_path(b, p, d) do { ; } while (0)
#define audit_log_key(b, k) do { ; } while (0)
+#define audit_log_link_denied(o, l) do { ; } while (0)
#define audit_log_secctx(b,s) do { ; } while (0)
#define audit_enabled 0
#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4ba5c8715523..38dba16c4176 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -414,6 +414,7 @@ struct inodes_stat_t {
#include <linux/shrinker.h>
#include <linux/migrate_mode.h>
#include <linux/uidgid.h>
+#include <linux/lockdep.h>
#include <asm/byteorder.h>
@@ -440,6 +441,8 @@ extern unsigned long get_max_files(void);
extern int sysctl_nr_open;
extern struct inodes_stat_t inodes_stat;
extern int leases_enable, lease_break_time;
+extern int sysctl_protected_symlinks;
+extern int sysctl_protected_hardlinks;
struct buffer_head;
typedef int (get_block_t)(struct inode *inode, sector_t iblock,
@@ -1445,6 +1448,8 @@ extern void f_delown(struct file *filp);
extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct fown_struct *fown);
+struct mm_struct;
+
/*
* Umount options
*/
@@ -1458,6 +1463,31 @@ extern int send_sigurg(struct fown_struct *fown);
extern struct list_head super_blocks;
extern spinlock_t sb_lock;
+/* Possible states of 'frozen' field */
+enum {
+ SB_UNFROZEN = 0, /* FS is unfrozen */
+ SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */
+ SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */
+ SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop
+ * internal threads if needed) */
+ SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */
+};
+
+#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
+
+struct sb_writers {
+ /* Counters for counting writers at each level */
+ struct percpu_counter counter[SB_FREEZE_LEVELS];
+ wait_queue_head_t wait; /* queue for waiting for
+ writers / faults to finish */
+ int frozen; /* Is sb frozen? */
+ wait_queue_head_t wait_unfrozen; /* queue for waiting for
+ sb to be thawed */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map lock_map[SB_FREEZE_LEVELS];
+#endif
+};
+
struct super_block {
struct list_head s_list; /* Keep this first */
dev_t s_dev; /* search index; _not_ kdev_t */
@@ -1505,8 +1535,7 @@ struct super_block {
struct hlist_node s_instances;
struct quota_info s_dquot; /* Diskquota specific options */
- int s_frozen;
- wait_queue_head_t s_wait_unfrozen;
+ struct sb_writers s_writers;
char s_id[32]; /* Informational name */
u8 s_uuid[16]; /* UUID */
@@ -1561,14 +1590,117 @@ extern struct timespec current_fs_time(struct super_block *sb);
/*
* Snapshotting support.
*/
-enum {
- SB_UNFROZEN = 0,
- SB_FREEZE_WRITE = 1,
- SB_FREEZE_TRANS = 2,
-};
-#define vfs_check_frozen(sb, level) \
- wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
+void __sb_end_write(struct super_block *sb, int level);
+int __sb_start_write(struct super_block *sb, int level, bool wait);
+
+/**
+ * sb_end_write - drop write access to a superblock
+ * @sb: the super we wrote to
+ *
+ * Decrement number of writers to the filesystem. Wake up possible waiters
+ * wanting to freeze the filesystem.
+ */
+static inline void sb_end_write(struct super_block *sb)
+{
+ __sb_end_write(sb, SB_FREEZE_WRITE);
+}
+
+/**
+ * sb_end_pagefault - drop write access to a superblock from a page fault
+ * @sb: the super we wrote to
+ *
+ * Decrement number of processes handling write page fault to the filesystem.
+ * Wake up possible waiters wanting to freeze the filesystem.
+ */
+static inline void sb_end_pagefault(struct super_block *sb)
+{
+ __sb_end_write(sb, SB_FREEZE_PAGEFAULT);
+}
+
+/**
+ * sb_end_intwrite - drop write access to a superblock for internal fs purposes
+ * @sb: the super we wrote to
+ *
+ * Decrement fs-internal number of writers to the filesystem. Wake up possible
+ * waiters wanting to freeze the filesystem.
+ */
+static inline void sb_end_intwrite(struct super_block *sb)
+{
+ __sb_end_write(sb, SB_FREEZE_FS);
+}
+
+/**
+ * sb_start_write - get write access to a superblock
+ * @sb: the super we write to
+ *
+ * When a process wants to write data or metadata to a file system (i.e. dirty
+ * a page or an inode), it should embed the operation in a sb_start_write() -
+ * sb_end_write() pair to get exclusion against file system freezing. This
+ * function increments number of writers preventing freezing. If the file
+ * system is already frozen, the function waits until the file system is
+ * thawed.
+ *
+ * Since freeze protection behaves as a lock, users have to preserve
+ * ordering of freeze protection and other filesystem locks. Generally,
+ * freeze protection should be the outermost lock. In particular, we have:
+ *
+ * sb_start_write
+ * -> i_mutex (write path, truncate, directory ops, ...)
+ * -> s_umount (freeze_super, thaw_super)
+ */
+static inline void sb_start_write(struct super_block *sb)
+{
+ __sb_start_write(sb, SB_FREEZE_WRITE, true);
+}
+
+static inline int sb_start_write_trylock(struct super_block *sb)
+{
+ return __sb_start_write(sb, SB_FREEZE_WRITE, false);
+}
+
+/**
+ * sb_start_pagefault - get write access to a superblock from a page fault
+ * @sb: the super we write to
+ *
+ * When a process starts handling write page fault, it should embed the
+ * operation into sb_start_pagefault() - sb_end_pagefault() pair to get
+ * exclusion against file system freezing. This is needed since the page fault
+ * is going to dirty a page. This function increments number of running page
+ * faults preventing freezing. If the file system is already frozen, the
+ * function waits until the file system is thawed.
+ *
+ * Since page fault freeze protection behaves as a lock, users have to preserve
+ * ordering of freeze protection and other filesystem locks. It is advised to
+ * put sb_start_pagefault() close to mmap_sem in lock ordering. Page fault
+ * handling code implies lock dependency:
+ *
+ * mmap_sem
+ * -> sb_start_pagefault
+ */
+static inline void sb_start_pagefault(struct super_block *sb)
+{
+ __sb_start_write(sb, SB_FREEZE_PAGEFAULT, true);
+}
+
+/*
+ * sb_start_intwrite - get write access to a superblock for internal fs purposes
+ * @sb: the super we write to
+ *
+ * This is the third level of protection against filesystem freezing. It is
+ * free for use by a filesystem. The only requirement is that it must rank
+ * below sb_start_pagefault.
+ *
+ * For example filesystem can call sb_start_intwrite() when starting a
+ * transaction which somewhat eases handling of freezing for internal sources
+ * of filesystem changes (internal fs threads, discarding preallocation on file
+ * close, etc.).
+ */
+static inline void sb_start_intwrite(struct super_block *sb)
+{
+ __sb_start_write(sb, SB_FREEZE_FS, true);
+}
+
extern bool inode_owner_or_capable(const struct inode *inode);
@@ -1892,6 +2024,7 @@ struct file_system_type {
struct lock_class_key s_lock_key;
struct lock_class_key s_umount_key;
struct lock_class_key s_vfs_rename_key;
+ struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];
struct lock_class_key i_lock_key;
struct lock_class_key i_mutex_key;
@@ -2334,9 +2467,6 @@ static inline void i_readcount_inc(struct inode *inode)
}
#endif
extern int do_pipe_flags(int *, int);
-extern struct file *create_read_pipe(struct file *f, int flags);
-extern struct file *create_write_pipe(int flags);
-extern void free_write_pipe(struct file *);
extern int kernel_read(struct file *, loff_t, char *, unsigned long);
extern struct file * open_exec(const char *);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bd079a1b0fdc..311be906b57d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1441,6 +1441,7 @@ extern void truncate_inode_pages_range(struct address_space *,
/* generic vm_area_ops exported for stackable file systems */
extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
+extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
/* mm/page-writeback.c */
int write_one_page(struct page *page, int wait);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index d2ef8b34b967..4bf19d8174ed 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -67,6 +67,7 @@ extern int kern_path(const char *, unsigned, struct path *);
extern struct dentry *kern_path_create(int, const char *, struct path *, int);
extern struct dentry *user_path_create(int, const char __user *, struct path *, int);
+extern void done_path_create(struct path *, struct dentry *);
extern struct dentry *kern_path_locked(const char *, struct path *);
extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
const char *, unsigned int, struct path *);
diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h
index ce4743a26015..fa63048fecff 100644
--- a/include/linux/nfsd/nfsfh.h
+++ b/include/linux/nfsd/nfsfh.h
@@ -143,6 +143,7 @@ typedef struct svc_fh {
int fh_maxsize; /* max size for fh_handle */
unsigned char fh_locked; /* inode locked by us */
+ unsigned char fh_want_write; /* remount protection taken */
#ifdef CONFIG_NFSD_V3
unsigned char fh_post_saved; /* post-op attrs saved */
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index e11d1c0fc60f..ad1a427b5267 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -160,4 +160,6 @@ void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
long pipe_fcntl(struct file *, unsigned int, unsigned long arg);
struct pipe_inode_info *get_pipe_info(struct file *file);
+int create_pipe_files(struct file **, int);
+
#endif
diff --git a/kernel/audit.c b/kernel/audit.c
index 4a3f28d2ca65..ea3b7b6191c7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1456,6 +1456,27 @@ void audit_log_key(struct audit_buffer *ab, char *key)
}
/**
+ * audit_log_link_denied - report a link restriction denial
+ * @operation: specific link opreation
+ * @link: the path that triggered the restriction
+ */
+void audit_log_link_denied(const char *operation, struct path *link)
+{
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(current->audit_context, GFP_KERNEL,
+ AUDIT_ANOM_LINK);
+ audit_log_format(ab, "op=%s action=denied", operation);
+ audit_log_format(ab, " pid=%d comm=", current->pid);
+ audit_log_untrustedstring(ab, current->comm);
+ audit_log_d_path(ab, " path=", link);
+ audit_log_format(ab, " dev=");
+ audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id);
+ audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino);
+ audit_log_end(ab);
+}
+
+/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6502d35a25ba..87174ef59161 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1498,6 +1498,24 @@ static struct ctl_table fs_table[] = {
#endif
#endif
{
+ .procname = "protected_symlinks",
+ .data = &sysctl_protected_symlinks,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "protected_hardlinks",
+ .data = &sysctl_protected_hardlinks,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
.procname = "suid_dumpable",
.data = &suid_dumpable,
.maxlen = sizeof(int),
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index f8a3f1a829b8..ba6085d9c741 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -12,7 +12,7 @@
#ifdef CONFIG_HOTPLUG_CPU
static LIST_HEAD(percpu_counters);
-static DEFINE_MUTEX(percpu_counters_lock);
+static DEFINE_SPINLOCK(percpu_counters_lock);
#endif
#ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER
@@ -123,9 +123,9 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount,
#ifdef CONFIG_HOTPLUG_CPU
INIT_LIST_HEAD(&fbc->list);
- mutex_lock(&percpu_counters_lock);
+ spin_lock(&percpu_counters_lock);
list_add(&fbc->list, &percpu_counters);
- mutex_unlock(&percpu_counters_lock);
+ spin_unlock(&percpu_counters_lock);
#endif
return 0;
}
@@ -139,9 +139,9 @@ void percpu_counter_destroy(struct percpu_counter *fbc)
debug_percpu_counter_deactivate(fbc);
#ifdef CONFIG_HOTPLUG_CPU
- mutex_lock(&percpu_counters_lock);
+ spin_lock(&percpu_counters_lock);
list_del(&fbc->list);
- mutex_unlock(&percpu_counters_lock);
+ spin_unlock(&percpu_counters_lock);
#endif
free_percpu(fbc->counters);
fbc->counters = NULL;
@@ -170,7 +170,7 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb,
return NOTIFY_OK;
cpu = (unsigned long)hcpu;
- mutex_lock(&percpu_counters_lock);
+ spin_lock(&percpu_counters_lock);
list_for_each_entry(fbc, &percpu_counters, list) {
s32 *pcount;
unsigned long flags;
@@ -181,7 +181,7 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb,
*pcount = 0;
raw_spin_unlock_irqrestore(&fbc->lock, flags);
}
- mutex_unlock(&percpu_counters_lock);
+ spin_unlock(&percpu_counters_lock);
#endif
return NOTIFY_OK;
}
diff --git a/mm/filemap.c b/mm/filemap.c
index a4a5260b0279..fa5ca304148e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1712,8 +1712,35 @@ page_not_uptodate:
}
EXPORT_SYMBOL(filemap_fault);
+int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ int ret = VM_FAULT_LOCKED;
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ lock_page(page);
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
+ /*
+ * We mark the page dirty already here so that when freeze is in
+ * progress, we are guaranteed that writeback during freezing will
+ * see the dirty page and writeprotect it again.
+ */
+ set_page_dirty(page);
+out:
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+EXPORT_SYMBOL(filemap_page_mkwrite);
+
const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
+ .page_mkwrite = filemap_page_mkwrite,
};
/* This is used for a general mmap of a disk file */
@@ -2407,8 +2434,6 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
count = ocount;
pos = *ppos;
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
-
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
written = 0;
@@ -2507,6 +2532,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
BUG_ON(iocb->ki_pos != pos);
+ sb_start_write(inode->i_sb);
mutex_lock(&inode->i_mutex);
blk_start_plug(&plug);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
@@ -2520,6 +2546,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
ret = err;
}
blk_finish_plug(&plug);
+ sb_end_write(inode->i_sb);
return ret;
}
EXPORT_SYMBOL(generic_file_aio_write);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 213ca1f53409..13e013b1270c 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -304,6 +304,7 @@ out:
static const struct vm_operations_struct xip_file_vm_ops = {
.fault = xip_file_fault,
+ .page_mkwrite = filemap_page_mkwrite,
};
int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -401,6 +402,8 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
loff_t pos;
ssize_t ret;
+ sb_start_write(inode->i_sb);
+
mutex_lock(&inode->i_mutex);
if (!access_ok(VERIFY_READ, buf, len)) {
@@ -411,8 +414,6 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
pos = *ppos;
count = len;
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
-
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
@@ -436,6 +437,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
current->backing_dev_info = NULL;
out_up:
mutex_unlock(&inode->i_mutex);
+ sb_end_write(inode->i_sb);
return ret;
}
EXPORT_SYMBOL_GPL(xip_file_write);
diff --git a/mm/memory.c b/mm/memory.c
index 482f089765ff..57361708d1a5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2650,6 +2650,9 @@ reuse:
if (!page_mkwrite) {
wait_on_page_locked(dirty_page);
set_page_dirty_balance(dirty_page, page_mkwrite);
+ /* file_update_time outside page_lock */
+ if (vma->vm_file)
+ file_update_time(vma->vm_file);
}
put_page(dirty_page);
if (page_mkwrite) {
@@ -2667,10 +2670,6 @@ reuse:
}
}
- /* file_update_time outside page_lock */
- if (vma->vm_file)
- file_update_time(vma->vm_file);
-
return ret;
}
@@ -3339,12 +3338,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (dirty_page) {
struct address_space *mapping = page->mapping;
+ int dirtied = 0;
if (set_page_dirty(dirty_page))
- page_mkwrite = 1;
+ dirtied = 1;
unlock_page(dirty_page);
put_page(dirty_page);
- if (page_mkwrite && mapping) {
+ if ((dirtied || page_mkwrite) && mapping) {
/*
* Some device drivers do not set page.mapping but still
* dirty their pages
@@ -3353,7 +3353,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
/* file_update_time outside page_lock */
- if (vma->vm_file)
+ if (vma->vm_file && !page_mkwrite)
file_update_time(vma->vm_file);
} else {
unlock_page(vmf.page);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 79981d97bc9c..e4768c180da2 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -823,6 +823,34 @@ fail:
return NULL;
}
+static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
+{
+ struct dentry *dentry;
+ struct path path;
+ int err = 0;
+ /*
+ * Get the parent directory, calculate the hash for last
+ * component.
+ */
+ dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
+ err = PTR_ERR(dentry);
+ if (IS_ERR(dentry))
+ return err;
+
+ /*
+ * All right, let's create it.
+ */
+ err = security_path_mknod(&path, dentry, mode, 0);
+ if (!err) {
+ err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
+ if (!err) {
+ res->mnt = mntget(path.mnt);
+ res->dentry = dget(dentry);
+ }
+ }
+ done_path_create(&path, dentry);
+ return err;
+}
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
@@ -831,8 +859,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct unix_sock *u = unix_sk(sk);
struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
char *sun_path = sunaddr->sun_path;
- struct dentry *dentry = NULL;
- struct path path;
int err;
unsigned int hash;
struct unix_address *addr;
@@ -869,43 +895,23 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
atomic_set(&addr->refcnt, 1);
if (sun_path[0]) {
- umode_t mode;
- err = 0;
- /*
- * Get the parent directory, calculate the hash for last
- * component.
- */
- dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
- err = PTR_ERR(dentry);
- if (IS_ERR(dentry))
- goto out_mknod_parent;
-
- /*
- * All right, let's create it.
- */
- mode = S_IFSOCK |
+ struct path path;
+ umode_t mode = S_IFSOCK |
(SOCK_INODE(sock)->i_mode & ~current_umask());
- err = mnt_want_write(path.mnt);
- if (err)
- goto out_mknod_dput;
- err = security_path_mknod(&path, dentry, mode, 0);
- if (err)
- goto out_mknod_drop_write;
- err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
-out_mknod_drop_write:
- mnt_drop_write(path.mnt);
- if (err)
- goto out_mknod_dput;
- mutex_unlock(&path.dentry->d_inode->i_mutex);
- dput(path.dentry);
- path.dentry = dentry;
-
+ err = unix_mknod(sun_path, mode, &path);
+ if (err) {
+ if (err == -EEXIST)
+ err = -EADDRINUSE;
+ unix_release_addr(addr);
+ goto out_up;
+ }
addr->hash = UNIX_HASH_SIZE;
- }
-
- spin_lock(&unix_table_lock);
-
- if (!sun_path[0]) {
+ hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
+ spin_lock(&unix_table_lock);
+ u->path = path;
+ list = &unix_socket_table[hash];
+ } else {
+ spin_lock(&unix_table_lock);
err = -EADDRINUSE;
if (__unix_find_socket_byname(net, sunaddr, addr_len,
sk->sk_type, hash)) {
@@ -914,9 +920,6 @@ out_mknod_drop_write:
}
list = &unix_socket_table[addr->hash];
- } else {
- list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
- u->path = path;
}
err = 0;
@@ -930,16 +933,6 @@ out_up:
mutex_unlock(&u->readlock);
out:
return err;
-
-out_mknod_dput:
- dput(dentry);
- mutex_unlock(&path.dentry->d_inode->i_mutex);
- path_put(&path);
-out_mknod_parent:
- if (err == -EEXIST)
- err = -EADDRINUSE;
- unix_release_addr(addr);
- goto out_up;
}
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
diff --git a/sound/sound_firmware.c b/sound/sound_firmware.c
index 7e96249536b4..37711a5d0d6b 100644
--- a/sound/sound_firmware.c
+++ b/sound/sound_firmware.c
@@ -23,14 +23,14 @@ static int do_mod_firmware_load(const char *fn, char **fp)
if (l <= 0 || l > 131072)
{
printk(KERN_INFO "Invalid firmware '%s'\n", fn);
- filp_close(filp, current->files);
+ filp_close(filp, NULL);
return 0;
}
dp = vmalloc(l);
if (dp == NULL)
{
printk(KERN_INFO "Out of memory loading '%s'.\n", fn);
- filp_close(filp, current->files);
+ filp_close(filp, NULL);
return 0;
}
pos = 0;
@@ -38,10 +38,10 @@ static int do_mod_firmware_load(const char *fn, char **fp)
{
printk(KERN_INFO "Failed to read '%s'.\n", fn);
vfree(dp);
- filp_close(filp, current->files);
+ filp_close(filp, NULL);
return 0;
}
- filp_close(filp, current->files);
+ filp_close(filp, NULL);
*fp = dp;
return (int) l;
}