summaryrefslogtreecommitdiff
path: root/fs/namespace.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/namespace.c')
-rw-r--r--fs/namespace.c200
1 files changed, 133 insertions, 67 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index 2e10cb19c5b..a72eaabfe8f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -11,6 +11,8 @@
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -38,12 +40,10 @@
#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
#define HASH_SIZE (1UL << HASH_SHIFT)
-/* spinlock for vfsmount related operations, inplace of dcache_lock */
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
-
static int event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
+static DEFINE_SPINLOCK(mnt_id_lock);
static int mnt_id_start = 0;
static int mnt_group_start = 1;
@@ -55,6 +55,16 @@ static struct rw_semaphore namespace_sem;
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);
+/*
+ * vfsmount lock may be taken for read to prevent changes to the
+ * vfsmount hash, ie. during mountpoint lookups or walking back
+ * up the tree.
+ *
+ * It should be taken for write in all cases where the vfsmount
+ * tree or hash is modified or when a vfsmount structure is modified.
+ */
+DEFINE_BRLOCK(vfsmount_lock);
+
static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
@@ -65,18 +75,21 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
-/* allocation is serialized by namespace_sem */
+/*
+ * allocation is serialized by namespace_sem, but we need the spinlock to
+ * serialize with freeing.
+ */
static int mnt_alloc_id(struct vfsmount *mnt)
{
int res;
retry:
ida_pre_get(&mnt_id_ida, GFP_KERNEL);
- spin_lock(&vfsmount_lock);
+ spin_lock(&mnt_id_lock);
res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
if (!res)
mnt_id_start = mnt->mnt_id + 1;
- spin_unlock(&vfsmount_lock);
+ spin_unlock(&mnt_id_lock);
if (res == -EAGAIN)
goto retry;
@@ -86,11 +99,11 @@ retry:
static void mnt_free_id(struct vfsmount *mnt)
{
int id = mnt->mnt_id;
- spin_lock(&vfsmount_lock);
+ spin_lock(&mnt_id_lock);
ida_remove(&mnt_id_ida, id);
if (mnt_id_start > id)
mnt_id_start = id;
- spin_unlock(&vfsmount_lock);
+ spin_unlock(&mnt_id_lock);
}
/*
@@ -348,7 +361,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
{
int ret = 0;
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
mnt->mnt_flags |= MNT_WRITE_HOLD;
/*
* After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -382,15 +395,15 @@ static int mnt_make_readonly(struct vfsmount *mnt)
*/
smp_wmb();
mnt->mnt_flags &= ~MNT_WRITE_HOLD;
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
return ret;
}
static void __mnt_unmake_readonly(struct vfsmount *mnt)
{
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
mnt->mnt_flags &= ~MNT_READONLY;
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
}
void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
@@ -414,6 +427,7 @@ void free_vfsmnt(struct vfsmount *mnt)
/*
* find the first or last mount at @dentry on vfsmount @mnt depending on
* @dir. If @dir is set return the first mount else return the last mount.
+ * vfsmount_lock must be held for read or write.
*/
struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
int dir)
@@ -443,10 +457,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
struct vfsmount *lookup_mnt(struct path *path)
{
struct vfsmount *child_mnt;
- spin_lock(&vfsmount_lock);
+
+ br_read_lock(vfsmount_lock);
if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
mntget(child_mnt);
- spin_unlock(&vfsmount_lock);
+ br_read_unlock(vfsmount_lock);
return child_mnt;
}
@@ -455,6 +470,9 @@ static inline int check_mnt(struct vfsmount *mnt)
return mnt->mnt_ns == current->nsproxy->mnt_ns;
}
+/*
+ * vfsmount lock must be held for write
+ */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
if (ns) {
@@ -463,6 +481,9 @@ static void touch_mnt_namespace(struct mnt_namespace *ns)
}
}
+/*
+ * vfsmount lock must be held for write
+ */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
if (ns && ns->event != event) {
@@ -471,6 +492,9 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
}
}
+/*
+ * vfsmount lock must be held for write
+ */
static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
{
old_path->dentry = mnt->mnt_mountpoint;
@@ -482,6 +506,9 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
old_path->dentry->d_mounted--;
}
+/*
+ * vfsmount lock must be held for write
+ */
void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
struct vfsmount *child_mnt)
{
@@ -490,6 +517,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
dentry->d_mounted++;
}
+/*
+ * vfsmount lock must be held for write
+ */
static void attach_mnt(struct vfsmount *mnt, struct path *path)
{
mnt_set_mountpoint(path->mnt, path->dentry, mnt);
@@ -499,7 +529,7 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
}
/*
- * the caller must hold vfsmount_lock
+ * vfsmount lock must be held for write
*/
static void commit_tree(struct vfsmount *mnt)
{
@@ -623,39 +653,43 @@ static inline void __mntput(struct vfsmount *mnt)
void mntput_no_expire(struct vfsmount *mnt)
{
repeat:
- if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) {
- if (likely(!mnt->mnt_pinned)) {
- spin_unlock(&vfsmount_lock);
- __mntput(mnt);
- return;
- }
- atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
- mnt->mnt_pinned = 0;
- spin_unlock(&vfsmount_lock);
- acct_auto_close_mnt(mnt);
- goto repeat;
+ if (atomic_add_unless(&mnt->mnt_count, -1, 1))
+ return;
+ br_write_lock(vfsmount_lock);
+ if (!atomic_dec_and_test(&mnt->mnt_count)) {
+ br_write_unlock(vfsmount_lock);
+ return;
}
+ if (likely(!mnt->mnt_pinned)) {
+ br_write_unlock(vfsmount_lock);
+ __mntput(mnt);
+ return;
+ }
+ atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
+ mnt->mnt_pinned = 0;
+ br_write_unlock(vfsmount_lock);
+ acct_auto_close_mnt(mnt);
+ goto repeat;
}
-
EXPORT_SYMBOL(mntput_no_expire);
void mnt_pin(struct vfsmount *mnt)
{
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
mnt->mnt_pinned++;
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
}
EXPORT_SYMBOL(mnt_pin);
void mnt_unpin(struct vfsmount *mnt)
{
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
if (mnt->mnt_pinned) {
atomic_inc(&mnt->mnt_count);
mnt->mnt_pinned--;
}
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
}
EXPORT_SYMBOL(mnt_unpin);
@@ -746,12 +780,12 @@ int mnt_had_events(struct proc_mounts *p)
struct mnt_namespace *ns = p->ns;
int res = 0;
- spin_lock(&vfsmount_lock);
+ br_read_lock(vfsmount_lock);
if (p->event != ns->event) {
p->event = ns->event;
res = 1;
}
- spin_unlock(&vfsmount_lock);
+ br_read_unlock(vfsmount_lock);
return res;
}
@@ -952,12 +986,12 @@ int may_umount_tree(struct vfsmount *mnt)
int minimum_refs = 0;
struct vfsmount *p;
- spin_lock(&vfsmount_lock);
+ br_read_lock(vfsmount_lock);
for (p = mnt; p; p = next_mnt(p, mnt)) {
actual_refs += atomic_read(&p->mnt_count);
minimum_refs += 2;
}
- spin_unlock(&vfsmount_lock);
+ br_read_unlock(vfsmount_lock);
if (actual_refs > minimum_refs)
return 0;
@@ -984,10 +1018,10 @@ int may_umount(struct vfsmount *mnt)
{
int ret = 1;
down_read(&namespace_sem);
- spin_lock(&vfsmount_lock);
+ br_read_lock(vfsmount_lock);
if (propagate_mount_busy(mnt, 2))
ret = 0;
- spin_unlock(&vfsmount_lock);
+ br_read_unlock(vfsmount_lock);
up_read(&namespace_sem);
return ret;
}
@@ -1003,13 +1037,14 @@ void release_mounts(struct list_head *head)
if (mnt->mnt_parent != mnt) {
struct dentry *dentry;
struct vfsmount *m;
- spin_lock(&vfsmount_lock);
+
+ br_write_lock(vfsmount_lock);
dentry = mnt->mnt_mountpoint;
m = mnt->mnt_parent;
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
m->mnt_ghosts--;
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
dput(dentry);
mntput(m);
}
@@ -1017,6 +1052,10 @@ void release_mounts(struct list_head *head)
}
}
+/*
+ * vfsmount lock must be held for write
+ * namespace_sem must be held for write
+ */
void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
{
struct vfsmount *p;
@@ -1107,7 +1146,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
}
down_write(&namespace_sem);
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
event++;
if (!(flags & MNT_DETACH))
@@ -1119,7 +1158,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
umount_tree(mnt, 1, &umount_list);
retval = 0;
}
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
up_write(&namespace_sem);
release_mounts(&umount_list);
return retval;
@@ -1231,19 +1270,19 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
q = clone_mnt(p, p->mnt_root, flag);
if (!q)
goto Enomem;
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
list_add_tail(&q->mnt_list, &res->mnt_list);
attach_mnt(q, &path);
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
}
}
return res;
Enomem:
if (res) {
LIST_HEAD(umount_list);
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
umount_tree(res, 0, &umount_list);
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
release_mounts(&umount_list);
}
return NULL;
@@ -1262,9 +1301,9 @@ void drop_collected_mounts(struct vfsmount *mnt)
{
LIST_HEAD(umount_list);
down_write(&namespace_sem);
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
umount_tree(mnt, 0, &umount_list);
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
up_write(&namespace_sem);
release_mounts(&umount_list);
}
@@ -1392,7 +1431,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
if (err)
goto out_cleanup_ids;
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
if (IS_MNT_SHARED(dest_mnt)) {
for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1411,7 +1450,8 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
list_del_init(&child->mnt_hash);
commit_tree(child);
}
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
+
return 0;
out_cleanup_ids:
@@ -1444,13 +1484,30 @@ out_unlock:
}
/*
+ * Sanity check the flags to change_mnt_propagation.
+ */
+
+static int flags_to_propagation_type(int flags)
+{
+ int type = flags & ~MS_REC;
+
+ /* Fail if any non-propagation flags are set */
+ if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
+ return 0;
+ /* Only one propagation flag should be set */
+ if (!is_power_of_2(type))
+ return 0;
+ return type;
+}
+
+/*
* recursively change the type of the mountpoint.
*/
static int do_change_type(struct path *path, int flag)
{
struct vfsmount *m, *mnt = path->mnt;
int recurse = flag & MS_REC;
- int type = flag & ~MS_REC;
+ int type;
int err = 0;
if (!capable(CAP_SYS_ADMIN))
@@ -1459,6 +1516,10 @@ static int do_change_type(struct path *path, int flag)
if (path->dentry != path->mnt->mnt_root)
return -EINVAL;
+ type = flags_to_propagation_type(flag);
+ if (!type)
+ return -EINVAL;
+
down_write(&namespace_sem);
if (type == MS_SHARED) {
err = invent_group_ids(mnt, recurse);
@@ -1466,10 +1527,10 @@ static int do_change_type(struct path *path, int flag)
goto out_unlock;
}
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
change_mnt_propagation(m, type);
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
out_unlock:
up_write(&namespace_sem);
@@ -1513,9 +1574,10 @@ static int do_loopback(struct path *path, char *old_name,
err = graft_tree(mnt, path);
if (err) {
LIST_HEAD(umount_list);
- spin_lock(&vfsmount_lock);
+
+ br_write_lock(vfsmount_lock);
umount_tree(mnt, 0, &umount_list);
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
release_mounts(&umount_list);
}
@@ -1568,16 +1630,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
else
err = do_remount_sb(sb, flags, data, 0);
if (!err) {
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
path->mnt->mnt_flags = mnt_flags;
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
}
up_write(&sb->s_umount);
if (!err) {
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
touch_mnt_namespace(path->mnt->mnt_ns);
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
}
return err;
}
@@ -1754,7 +1816,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
return;
down_write(&namespace_sem);
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
/* extract from the expiration list every vfsmount that matches the
* following criteria:
@@ -1773,7 +1835,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
touch_mnt_namespace(mnt->mnt_ns);
umount_tree(mnt, 1, &umounts);
}
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
up_write(&namespace_sem);
release_mounts(&umounts);
@@ -1830,6 +1892,8 @@ resume:
/*
* process a list of expirable mountpoints with the intent of discarding any
* submounts of a specific parent mountpoint
+ *
+ * vfsmount_lock must be held for write
*/
static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
{
@@ -2048,9 +2112,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
kfree(new_ns);
return ERR_PTR(-ENOMEM);
}
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
/*
* Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2244,7 +2308,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
goto out2; /* not attached */
/* make sure we can reach put_old from new_root */
tmp = old.mnt;
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
if (tmp != new.mnt) {
for (;;) {
if (tmp->mnt_parent == tmp)
@@ -2264,7 +2328,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
/* mount new_root on / */
attach_mnt(new.mnt, &root_parent);
touch_mnt_namespace(current->nsproxy->mnt_ns);
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
chroot_fs_refs(&root, &new);
error = 0;
path_put(&root_parent);
@@ -2279,7 +2343,7 @@ out1:
out0:
return error;
out3:
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
goto out2;
}
@@ -2326,6 +2390,8 @@ void __init mnt_init(void)
for (u = 0; u < HASH_SIZE; u++)
INIT_LIST_HEAD(&mount_hashtable[u]);
+ br_lock_init(vfsmount_lock);
+
err = sysfs_init();
if (err)
printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2344,9 +2410,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
if (!atomic_dec_and_test(&ns->count))
return;
down_write(&namespace_sem);
- spin_lock(&vfsmount_lock);
+ br_write_lock(vfsmount_lock);
umount_tree(ns->root, 0, &umount_list);
- spin_unlock(&vfsmount_lock);
+ br_write_unlock(vfsmount_lock);
up_write(&namespace_sem);
release_mounts(&umount_list);
kfree(ns);