summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTiger Yang <tiger.yang@oracle.com>2007-03-20 16:01:38 -0700
committerMark Fasheh <mark.fasheh@oracle.com>2007-04-26 14:39:48 -0700
commit500086300e6dc5308a7328990bd50d17e075162b (patch)
tree4083cda09445c260c2cc2ac1d0f68c05ad2b958e
parenta9f5f70739363ccca2e771c274c4f015c5fb7a88 (diff)
downloadlinux-3.10-500086300e6dc5308a7328990bd50d17e075162b.tar.gz
linux-3.10-500086300e6dc5308a7328990bd50d17e075162b.tar.bz2
linux-3.10-500086300e6dc5308a7328990bd50d17e075162b.zip
ocfs2: Remove delete inode vote
Ocfs2 currently does cluster-wide node messaging to check the open state of an inode during delete. This patch removes that mechanism in favor of an inode cluster lock which is taken at shared read when an inode is first read and dropped in clear_inode(). This allows a deleting node to test the liveness of an inode by attempting to take an exclusive lock. Signed-off-by: Tiger Yang <tiger.yang@oracle.com> Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h5
-rw-r--r--fs/ocfs2/dlmglue.c119
-rw-r--r--fs/ocfs2/dlmglue.h3
-rw-r--r--fs/ocfs2/inode.c93
-rw-r--r--fs/ocfs2/inode.h5
-rw-r--r--fs/ocfs2/journal.c3
-rw-r--r--fs/ocfs2/namei.c5
-rw-r--r--fs/ocfs2/ocfs2_fs.h4
-rw-r--r--fs/ocfs2/ocfs2_lockid.h5
-rw-r--r--fs/ocfs2/super.c1
10 files changed, 205 insertions, 38 deletions
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 4dae5df5e46..9606111fe89 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,9 @@
* locking semantics of the file system using the protocol. It should
* be somewhere else, I'm sure, but right now it isn't.
*
+ * New in version 8:
+ * - Replace delete inode votes with a cluster lock
+ *
* New in version 7:
* - DLM join domain includes the live nodemap
*
@@ -57,7 +60,7 @@
* - full 64 bit i_size in the metadata lock lvbs
* - introduction of "rw" lock and pushing meta/data locking down
*/
-#define O2NET_PROTOCOL_VERSION 7ULL
+#define O2NET_PROTOCOL_VERSION 8ULL
struct o2net_handshake {
__be64 protocol_version;
__be64 connector_id;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 31d519a6dbd..ca4f0e0e758 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -225,11 +225,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
.flags = 0,
};
+static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
+ .get_osb = ocfs2_get_inode_osb,
+ .flags = 0,
+};
+
static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
{
return lockres->l_type == OCFS2_LOCK_TYPE_META ||
lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
- lockres->l_type == OCFS2_LOCK_TYPE_RW;
+ lockres->l_type == OCFS2_LOCK_TYPE_RW ||
+ lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
}
static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
@@ -373,6 +379,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
case OCFS2_LOCK_TYPE_DATA:
ops = &ocfs2_inode_data_lops;
break;
+ case OCFS2_LOCK_TYPE_OPEN:
+ ops = &ocfs2_inode_open_lops;
+ break;
default:
mlog_bug_on_msg(1, "type: %d\n", type);
ops = NULL; /* thanks, gcc */
@@ -1129,6 +1138,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
goto bail;
}
+ ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
bail:
mlog_exit(ret);
return ret;
@@ -1182,6 +1197,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
mlog_exit_void();
}
+/*
+ * ocfs2_open_lock always get PR mode lock.
+ */
+int ocfs2_open_lock(struct inode *inode)
+{
+ int status = 0;
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ BUG_ON(!inode);
+
+ mlog_entry_void();
+
+ mlog(0, "inode %llu take PRMODE open lock\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+ if (ocfs2_mount_local(osb))
+ goto out;
+
+ lockres = &OCFS2_I(inode)->ip_open_lockres;
+
+ status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
+ LKM_PRMODE, 0, 0);
+ if (status < 0)
+ mlog_errno(status);
+
+out:
+ mlog_exit(status);
+ return status;
+}
+
+int ocfs2_try_open_lock(struct inode *inode, int write)
+{
+ int status = 0, level;
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ BUG_ON(!inode);
+
+ mlog_entry_void();
+
+ mlog(0, "inode %llu try to take %s open lock\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ write ? "EXMODE" : "PRMODE");
+
+ if (ocfs2_mount_local(osb))
+ goto out;
+
+ lockres = &OCFS2_I(inode)->ip_open_lockres;
+
+ level = write ? LKM_EXMODE : LKM_PRMODE;
+
+ /*
+ * The file system may already holding a PRMODE/EXMODE open lock.
+ * Since we pass LKM_NOQUEUE, the request won't block waiting on
+ * other nodes and the -EAGAIN will indicate to the caller that
+ * this inode is still in use.
+ */
+ status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
+ level, LKM_NOQUEUE, 0);
+
+out:
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * ocfs2_open_unlock unlock PR and EX mode open locks.
+ */
+void ocfs2_open_unlock(struct inode *inode)
+{
+ struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ mlog_entry_void();
+
+ mlog(0, "inode %llu drop open lock\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+ if (ocfs2_mount_local(osb))
+ goto out;
+
+ if(lockres->l_ro_holders)
+ ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
+ LKM_PRMODE);
+ if(lockres->l_ex_holders)
+ ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
+ LKM_EXMODE);
+
+out:
+ mlog_exit_void();
+}
+
int ocfs2_data_lock_full(struct inode *inode,
int write,
int arg_flags)
@@ -2455,13 +2563,20 @@ int ocfs2_drop_inode_locks(struct inode *inode)
* ocfs2_clear_inode has done it for us. */
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
- &OCFS2_I(inode)->ip_data_lockres);
+ &OCFS2_I(inode)->ip_open_lockres);
if (err < 0)
mlog_errno(err);
status = err;
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+ &OCFS2_I(inode)->ip_data_lockres);
+ if (err < 0)
+ mlog_errno(err);
+ if (err < 0 && !status)
+ status = err;
+
+ err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
&OCFS2_I(inode)->ip_meta_lockres);
if (err < 0)
mlog_errno(err);
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index c343fca68cf..59cb566e798 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode,
int write);
int ocfs2_rw_lock(struct inode *inode, int write);
void ocfs2_rw_unlock(struct inode *inode, int write);
+int ocfs2_open_lock(struct inode *inode);
+int ocfs2_try_open_lock(struct inode *inode, int write);
+void ocfs2_open_unlock(struct inode *inode);
int ocfs2_meta_lock_atime(struct inode *inode,
struct vfsmount *vfsmnt,
int *level);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 28ab56f2b98..10d16a9e4fd 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -289,7 +289,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
(unsigned long long)fe->i_blkno);
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
- OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
inode->i_nlink = le16_to_cpu(fe->i_links_count);
@@ -347,6 +346,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
OCFS2_LOCK_TYPE_META, 0, inode);
+
+ ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+ OCFS2_LOCK_TYPE_OPEN, 0, inode);
}
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
@@ -421,7 +423,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
* cluster lock before trusting anything anyway.
*/
can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
- && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK)
+ && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
&& !ocfs2_mount_local(osb);
/*
@@ -438,7 +440,17 @@ static int ocfs2_read_locked_inode(struct inode *inode,
OCFS2_LOCK_TYPE_META,
generation, inode);
+ ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+ OCFS2_LOCK_TYPE_OPEN,
+ 0, inode);
+
if (can_lock) {
+ status = ocfs2_open_lock(inode);
+ if (status) {
+ make_bad_inode(inode);
+ mlog_errno(status);
+ return status;
+ }
status = ocfs2_meta_lock(inode, NULL, 0);
if (status) {
make_bad_inode(inode);
@@ -447,6 +459,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
}
}
+ if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
+ status = ocfs2_try_open_lock(inode, 0);
+ if (status) {
+ make_bad_inode(inode);
+ return status;
+ }
+ }
+
status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
can_lock ? inode : NULL);
if (status < 0) {
@@ -678,10 +698,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
struct inode *orphan_dir_inode = NULL;
struct buffer_head *orphan_dir_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_dinode *di;
- /* We've already voted on this so it should be readonly - no
- * spinlock needed. */
- orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+ di = (struct ocfs2_dinode *) di_bh->b_data;
+ orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
if (status)
@@ -787,6 +807,35 @@ bail:
return ret;
}
+static int ocfs2_request_delete(struct inode *inode)
+{
+ int status = 0;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (ocfs2_inode_is_new(inode))
+ return 0;
+
+ if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
+ osb->node_num))
+ return 0;
+ /*
+ * This is how ocfs2 determines whether an inode is still live
+ * within the cluster. Every node takes a shared read lock on
+ * the inode open lock in ocfs2_read_locked_inode(). When we
+ * get to ->delete_inode(), each node tries to convert it's
+ * lock to an exclusive. Trylocks are serialized by the inode
+ * meta data lock. If the upconvert suceeds, we know the inode
+ * is no longer live and can be deleted.
+ *
+ * Though we call this with the meta data lock held, the
+ * trylock keeps us from ABBA deadlock.
+ */
+ status = ocfs2_try_open_lock(inode, 1);
+ if (status < 0 && status != -EAGAIN)
+ mlog_errno(status);
+ return status;
+}
+
/* Query the cluster to determine whether we should wipe an inode from
* disk or not.
*
@@ -839,11 +888,11 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
goto bail;
}
- status = ocfs2_request_delete_vote(inode);
- /* -EBUSY means that other nodes are still using the
+ status = ocfs2_request_delete(inode);
+ /* -EAGAIN means that other nodes are still using the
* inode. We're done here though, so avoid doing anything on
* disk and let them worry about deleting it. */
- if (status == -EBUSY) {
+ if (status == -EAGAIN) {
status = 0;
mlog(0, "Skipping delete of %llu because it is in use on"
"other nodes\n", (unsigned long long)oi->ip_blkno);
@@ -854,21 +903,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
goto bail;
}
- spin_lock(&oi->ip_lock);
- if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
- /* Nobody knew which slot this inode was orphaned
- * into. This may happen during node death and
- * recovery knows how to clean it up so we can safely
- * ignore this inode for now on. */
- mlog(0, "Nobody knew where inode %llu was orphaned!\n",
- (unsigned long long)oi->ip_blkno);
- } else {
- *wipe = 1;
-
- mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n",
- (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot);
- }
- spin_unlock(&oi->ip_lock);
+ *wipe = 1;
+ mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
+ (unsigned long long)oi->ip_blkno,
+ le16_to_cpu(di->i_orphaned_slot));
bail:
return status;
@@ -1001,11 +1039,16 @@ void ocfs2_clear_inode(struct inode *inode)
mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
"Inode=%lu\n", inode->i_ino);
+ /* For remove delete_inode vote, we hold open lock before,
+ * now it is time to unlock PR and EX open locks. */
+ ocfs2_open_unlock(inode);
+
/* Do these before all the other work so that we don't bounce
* the vote thread while waiting to destroy the locks. */
ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
+ ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
/* We very well may get a clear_inode before all an inodes
* metadata has hit disk. Of course, we can't drop any cluster
@@ -1030,6 +1073,7 @@ void ocfs2_clear_inode(struct inode *inode)
ocfs2_lock_res_free(&oi->ip_rw_lockres);
ocfs2_lock_res_free(&oi->ip_meta_lockres);
ocfs2_lock_res_free(&oi->ip_data_lockres);
+ ocfs2_lock_res_free(&oi->ip_open_lockres);
ocfs2_metadata_cache_purge(inode);
@@ -1086,9 +1130,6 @@ void ocfs2_drop_inode(struct inode *inode)
mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
(unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
- /* Testing ip_orphaned_slot here wouldn't work because we may
- * not have gotten a delete_inode vote from any other nodes
- * yet. */
if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
generic_delete_inode(inode);
else
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 1a7dd2945b3..92d4feb34d7 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -34,6 +34,7 @@ struct ocfs2_inode_info
struct ocfs2_lock_res ip_rw_lockres;
struct ocfs2_lock_res ip_meta_lockres;
struct ocfs2_lock_res ip_data_lockres;
+ struct ocfs2_lock_res ip_open_lockres;
/* protects allocation changes on this inode. */
struct rw_semaphore ip_alloc_sem;
@@ -119,8 +120,8 @@ void ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_NOWAIT 0x1
#define OCFS2_FI_FLAG_DELETE 0x2
-#define OCFS2_FI_FLAG_SYSFILE 0x4
-#define OCFS2_FI_FLAG_NOLOCK 0x8
+#define OCFS2_FI_FLAG_SYSFILE 0x4
+#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
u64 blkno,
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 825cb0ae1b4..12445a31f73 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1306,7 +1306,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
continue;
iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
- OCFS2_FI_FLAG_NOLOCK);
+ OCFS2_FI_FLAG_ORPHAN_RECOVERY);
if (IS_ERR(iter))
continue;
@@ -1418,7 +1418,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
/* Set the proper information to get us going into
* ocfs2_delete_inode. */
oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
- oi->ip_orphaned_slot = slot;
spin_unlock(&oi->ip_lock);
iput(inode);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 1fff0c02d98..a93c15fdcef 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -187,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
* unlink. */
spin_lock(&oi->ip_lock);
oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
- oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
spin_unlock(&oi->ip_lock);
bail_add:
@@ -2220,9 +2219,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
/* Record which orphan dir our inode now resides
* in. delete_inode will use this to determine which orphan
* dir to lock. */
- spin_lock(&OCFS2_I(inode)->ip_lock);
- OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
- spin_unlock(&OCFS2_I(inode)->ip_lock);
+ fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
mlog(0, "Inode %llu orphaned in slot %d\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index e61e218f5e0..a476b63e2e6 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -446,7 +446,9 @@ struct ocfs2_dinode {
__le32 i_ctime_nsec;
__le32 i_mtime_nsec;
__le32 i_attr;
- __le32 i_reserved1;
+ __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
+ was set in i_flags */
+ __le16 i_reserved1;
/*70*/ __le64 i_reserved2[8];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4d5d5655c18..4ca02b1c38a 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -44,6 +44,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_RENAME,
OCFS2_LOCK_TYPE_RW,
OCFS2_LOCK_TYPE_DENTRY,
+ OCFS2_LOCK_TYPE_OPEN,
OCFS2_NUM_LOCK_TYPES
};
@@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_DENTRY:
c = 'N';
break;
+ case OCFS2_LOCK_TYPE_OPEN:
+ c = 'O';
+ break;
default:
c = '\0';
}
@@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = {
* important job it does, anyway. */
[OCFS2_LOCK_TYPE_RW] = "Write/Read",
[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
+ [OCFS2_LOCK_TYPE_OPEN] = "Open",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 6534f92424d..16564ea6c14 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -963,6 +963,7 @@ static void ocfs2_inode_init_once(void *data,
ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
ocfs2_lock_res_init_once(&oi->ip_data_lockres);
+ ocfs2_lock_res_init_once(&oi->ip_open_lockres);
ocfs2_metadata_cache_init(&oi->vfs_inode);