diff options
author | Mark Fasheh <mark.fasheh@oracle.com> | 2005-12-15 14:31:24 -0800 |
---|---|---|
committer | Joel Becker <joel.becker@oracle.com> | 2006-01-03 11:45:47 -0800 |
commit | ccd979bdbce9fba8412beb3f1de68a9d0171b12c (patch) | |
tree | c50ed941849ce06ccadd4ce27599b3ef9fdbe2ae | |
parent | 8df08c89c668e1bd922a053fdb5ba1fadbecbb38 (diff) | |
download | linux-3.10-ccd979bdbce9fba8412beb3f1de68a9d0171b12c.tar.gz linux-3.10-ccd979bdbce9fba8412beb3f1de68a9d0171b12c.tar.bz2 linux-3.10-ccd979bdbce9fba8412beb3f1de68a9d0171b12c.zip |
[PATCH] OCFS2: The Second Oracle Cluster Filesystem
The OCFS2 file system module.
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
55 files changed, 24504 insertions, 0 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index d9b0a069186..2580ada100a 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -36,6 +36,8 @@ ntfs.txt - info and mount options for the NTFS filesystem (Windows NT). proc.txt - info on Linux's /proc filesystem. +ocfs2.txt + - info and mount options for the OCFS2 clustered filesystem. romfs.txt - Description of the ROMFS filesystem. smbfs.txt diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt new file mode 100644 index 00000000000..f2595caf052 --- /dev/null +++ b/Documentation/filesystems/ocfs2.txt @@ -0,0 +1,55 @@ +OCFS2 filesystem +================== +OCFS2 is a general purpose extent based shared disk cluster file +system with many similarities to ext3. It supports 64 bit inode +numbers, and has automatically extending metadata groups which may +also make it attractive for non-clustered use. + +You'll want to install the ocfs2-tools package in order to at least +get "mount.ocfs2" and "ocfs2_hb_ctl". + +Project web page: http://oss.oracle.com/projects/ocfs2 +Tools web page: http://oss.oracle.com/projects/ocfs2-tools +OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ + +All code copyright 2005 Oracle except when otherwise noted. + +CREDITS: +Lots of code taken from ext3 and other projects. + +Authors in alphabetical order: +Joel Becker <joel.becker@oracle.com> +Zach Brown <zach.brown@oracle.com> +Mark Fasheh <mark.fasheh@oracle.com> +Kurt Hackel <kurt.hackel@oracle.com> +Sunil Mushran <sunil.mushran@oracle.com> +Manish Singh <manish.singh@oracle.com> + +Caveats +======= +Features which OCFS2 does not support yet: + - sparse files + - extended attributes + - shared writeable mmap + - loopback is supported, but data written will not + be cluster coherent. 
+ - quotas + - cluster aware flock + - Directory change notification (F_NOTIFY) + - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease) + - POSIX ACLs + - readpages / writepages (not user visible) + +Mount options +============= + +OCFS2 supports the following mount options: +(*) == default + +barrier=1 This enables/disables barriers. barrier=0 disables it, + barrier=1 enables it. +errors=remount-ro(*) Remount the filesystem read-only on an error. +errors=panic Panic and halt the machine if an error occurs. +intr (*) Allow signals to interrupt cluster operations. +nointr Do not allow signals to interrupt cluster + operations. diff --git a/MAINTAINERS b/MAINTAINERS index 86ee06f4379..15888302025 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1905,6 +1905,15 @@ M: ajoshi@shell.unixbox.com L: linux-nvidia@lists.surfsouth.com S: Maintained +ORACLE CLUSTER FILESYSTEM 2 (OCFS2) +P: Mark Fasheh +M: mark.fasheh@oracle.com +P: Kurt Hackel +M: kurt.hackel@oracle.com +L: ocfs2-devel@oss.oracle.com +W: http://oss.oracle.com/projects/ocfs2/ +S: Supported + OLYMPIC NETWORK DRIVER P: Peter De Shrijver M: p2@ace.ulyssis.student.kuleuven.ac.be diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile new file mode 100644 index 00000000000..7d3be845a61 --- /dev/null +++ b/fs/ocfs2/Makefile @@ -0,0 +1,33 @@ +EXTRA_CFLAGS += -Ifs/ocfs2 + +EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES + +obj-$(CONFIG_OCFS2_FS) += ocfs2.o + +ocfs2-objs := \ + alloc.o \ + aops.o \ + buffer_head_io.o \ + dcache.o \ + dir.o \ + dlmglue.o \ + export.o \ + extent_map.o \ + file.o \ + heartbeat.o \ + inode.o \ + journal.o \ + localalloc.o \ + mmap.o \ + namei.o \ + slot_map.o \ + suballoc.o \ + super.o \ + symlink.o \ + sysfile.o \ + uptodate.o \ + ver.o \ + vote.o + +obj-$(CONFIG_OCFS2_FS) += cluster/ +obj-$(CONFIG_OCFS2_FS) += dlm/ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c new file mode 100644 index 00000000000..465f797451e --- /dev/null +++ b/fs/ocfs2/alloc.c @@ -0,0 +1,2040 @@ +/* -*- mode: c; c-basic-offset: 
8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * alloc.c + * + * Extent allocs and frees + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> + +#define MLOG_MASK_PREFIX ML_DISK_ALLOC +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "sysfile.h" +#include "file.h" +#include "super.h" +#include "uptodate.h" + +#include "buffer_head_io.h" + +static int ocfs2_extent_contig(struct inode *inode, + struct ocfs2_extent_rec *ext, + u64 blkno); + +static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + int wanted, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head *bhs[]); + +static int ocfs2_add_branch(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head *eb_bh, + struct buffer_head *last_eb_bh, + struct ocfs2_alloc_context *meta_ac); + +static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, + struct 
ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **ret_new_eb_bh); + +static int ocfs2_do_insert_extent(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 blkno, + u32 new_clusters); + +static int ocfs2_find_branch_target(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head **target_bh); + +static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, + struct inode *inode, + struct ocfs2_dinode *fe, + unsigned int new_i_clusters, + struct buffer_head *old_last_eb, + struct buffer_head **new_last_eb); + +static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); + +static int ocfs2_extent_contig(struct inode *inode, + struct ocfs2_extent_rec *ext, + u64 blkno) +{ + return blkno == (le64_to_cpu(ext->e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, + le32_to_cpu(ext->e_clusters))); +} + +/* + * How many free extents have we got before we need more meta data? 
+ */ +int ocfs2_num_free_extents(struct ocfs2_super *osb, + struct inode *inode, + struct ocfs2_dinode *fe) +{ + int retval; + struct ocfs2_extent_list *el; + struct ocfs2_extent_block *eb; + struct buffer_head *eb_bh = NULL; + + mlog_entry_void(); + + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); + retval = -EIO; + goto bail; + } + + if (fe->i_last_eb_blk) { + retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), + &eb_bh, OCFS2_BH_CACHED, inode); + if (retval < 0) { + mlog_errno(retval); + goto bail; + } + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + } else + el = &fe->id2.i_list; + + BUG_ON(el->l_tree_depth != 0); + + retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); +bail: + if (eb_bh) + brelse(eb_bh); + + mlog_exit(retval); + return retval; +} + +/* expects array to already be allocated + * + * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and + * l_count for you + */ +static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + int wanted, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head *bhs[]) +{ + int count, status, i; + u16 suballoc_bit_start; + u32 num_got; + u64 first_blkno; + struct ocfs2_extent_block *eb; + + mlog_entry_void(); + + count = 0; + while (count < wanted) { + status = ocfs2_claim_metadata(osb, + handle, + meta_ac, + wanted - count, + &suballoc_bit_start, + &num_got, + &first_blkno); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + for(i = count; i < (num_got + count); i++) { + bhs[i] = sb_getblk(osb->sb, first_blkno); + if (bhs[i] == NULL) { + status = -EIO; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(inode, bhs[i]); + + status = ocfs2_journal_access(handle, inode, bhs[i], + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bhs[i]->b_data, 0, osb->sb->s_blocksize); + eb = 
(struct ocfs2_extent_block *) bhs[i]->b_data; + /* Ok, setup the minimal stuff here. */ + strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); + eb->h_blkno = cpu_to_le64(first_blkno); + eb->h_fs_generation = cpu_to_le32(osb->fs_generation); + +#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS + /* we always use slot zero's suballocator */ + eb->h_suballoc_slot = 0; +#else + eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); +#endif + eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); + eb->h_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); + + suballoc_bit_start++; + first_blkno++; + + /* We'll also be dirtied by the caller, so + * this isn't absolutely necessary. */ + status = ocfs2_journal_dirty(handle, bhs[i]); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + count += num_got; + } + + status = 0; +bail: + if (status < 0) { + for(i = 0; i < wanted; i++) { + if (bhs[i]) + brelse(bhs[i]); + bhs[i] = NULL; + } + } + mlog_exit(status); + return status; +} + +/* + * Add an entire tree branch to our inode. eb_bh is the extent block + * to start at, if we don't want to start the branch at the dinode + * structure. + * + * last_eb_bh is required as we have to update it's next_leaf pointer + * for the new last extent block. + * + * the new branch will be 'empty' in the sense that every block will + * contain a single record with e_clusters == 0. 
+ */ +static int ocfs2_add_branch(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head *eb_bh, + struct buffer_head *last_eb_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int status, new_blocks, i; + u64 next_blkno, new_last_eb_blk; + struct buffer_head *bh; + struct buffer_head **new_eb_bhs = NULL; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *eb_el; + struct ocfs2_extent_list *el; + + mlog_entry_void(); + + BUG_ON(!last_eb_bh); + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + if (eb_bh) { + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + } else + el = &fe->id2.i_list; + + /* we never add a branch to a leaf. */ + BUG_ON(!el->l_tree_depth); + + new_blocks = le16_to_cpu(el->l_tree_depth); + + /* allocate the number of new eb blocks we need */ + new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), + GFP_KERNEL); + if (!new_eb_bhs) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks, + meta_ac, new_eb_bhs); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be + * linked with the rest of the tree. + * conversly, new_eb_bhs[0] is the new bottommost leaf. + * + * when we leave the loop, new_last_eb_blk will point to the + * newest leaf, and next_blkno will point to the topmost extent + * block. 
*/ + next_blkno = new_last_eb_blk = 0; + for(i = 0; i < new_blocks; i++) { + bh = new_eb_bhs[i]; + eb = (struct ocfs2_extent_block *) bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + eb_el = &eb->h_list; + + status = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb->h_next_leaf_blk = 0; + eb_el->l_tree_depth = cpu_to_le16(i); + eb_el->l_next_free_rec = cpu_to_le16(1); + eb_el->l_recs[0].e_cpos = fe->i_clusters; + eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); + eb_el->l_recs[0].e_clusters = cpu_to_le32(0); + if (!eb_el->l_tree_depth) + new_last_eb_blk = le64_to_cpu(eb->h_blkno); + + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + next_blkno = le64_to_cpu(eb->h_blkno); + } + + /* This is a bit hairy. We want to update up to three blocks + * here without leaving any of them in an inconsistent state + * in case of error. We don't have to worry about + * journal_dirty erroring as it won't unless we've aborted the + * handle (in which case we would never be here) so reserving + * the write with journal_access is all we need to do. */ + status = ocfs2_journal_access(handle, inode, last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (eb_bh) { + status = ocfs2_journal_access(handle, inode, eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* Link the new branch into the rest of the tree (el will + * either be on the fe, or the extent block passed in. 
*/ + i = le16_to_cpu(el->l_next_free_rec); + el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); + el->l_recs[i].e_cpos = fe->i_clusters; + el->l_recs[i].e_clusters = 0; + le16_add_cpu(&el->l_next_free_rec, 1); + + /* fe needs a new last extent block pointer, as does the + * next_leaf on the previously last-extent-block. */ + fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); + + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); + + status = ocfs2_journal_dirty(handle, last_eb_bh); + if (status < 0) + mlog_errno(status); + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) + mlog_errno(status); + if (eb_bh) { + status = ocfs2_journal_dirty(handle, eb_bh); + if (status < 0) + mlog_errno(status); + } + + status = 0; +bail: + if (new_eb_bhs) { + for (i = 0; i < new_blocks; i++) + if (new_eb_bhs[i]) + brelse(new_eb_bhs[i]); + kfree(new_eb_bhs); + } + + mlog_exit(status); + return status; +} + +/* + * adds another level to the allocation tree. + * returns back the new extent block so you can add a branch to it + * after this call. 
+ */ +static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **ret_new_eb_bh) +{ + int status, i; + struct buffer_head *new_eb_bh = NULL; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *fe_el; + struct ocfs2_extent_list *eb_el; + + mlog_entry_void(); + + status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, + &new_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + + eb_el = &eb->h_list; + fe = (struct ocfs2_dinode *) fe_bh->b_data; + fe_el = &fe->id2.i_list; + + status = ocfs2_journal_access(handle, inode, new_eb_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* copy the fe data into the new extent block */ + eb_el->l_tree_depth = fe_el->l_tree_depth; + eb_el->l_next_free_rec = fe_el->l_next_free_rec; + for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { + eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos; + eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters; + eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno; + } + + status = ocfs2_journal_dirty(handle, new_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* update fe now */ + le16_add_cpu(&fe_el->l_tree_depth, 1); + fe_el->l_recs[0].e_cpos = 0; + fe_el->l_recs[0].e_blkno = eb->h_blkno; + fe_el->l_recs[0].e_clusters = fe->i_clusters; + for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { + fe_el->l_recs[i].e_cpos = 0; + fe_el->l_recs[i].e_clusters = 0; + 
fe_el->l_recs[i].e_blkno = 0; + } + fe_el->l_next_free_rec = cpu_to_le16(1); + + /* If this is our 1st tree depth shift, then last_eb_blk + * becomes the allocated extent block */ + if (fe_el->l_tree_depth == cpu_to_le16(1)) + fe->i_last_eb_blk = eb->h_blkno; + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *ret_new_eb_bh = new_eb_bh; + new_eb_bh = NULL; + status = 0; +bail: + if (new_eb_bh) + brelse(new_eb_bh); + + mlog_exit(status); + return status; +} + +/* + * Expects the tree to already have room in the rightmost leaf for the + * extent. Updates all the extent blocks (and the dinode) on the way + * down. + */ +static int ocfs2_do_insert_extent(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 start_blk, + u32 new_clusters) +{ + int status, i, num_bhs = 0; + u64 next_blkno; + u16 next_free; + struct buffer_head **eb_bhs = NULL; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + mlog_entry_void(); + + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + el = &fe->id2.i_list; + if (el->l_tree_depth) { + /* This is another operation where we want to be + * careful about our tree updates. An error here means + * none of the previous changes we made should roll + * forward. As a result, we have to record the buffers + * for this part of the tree in an array and reserve a + * journal write to them before making any changes. 
*/ + num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth); + eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *), + GFP_KERNEL); + if (!eb_bhs) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + i = 0; + while(el->l_tree_depth) { + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0) { + ocfs2_error(inode->i_sb, + "Dinode %"MLFu64" has a bad " + "extent list", + OCFS2_I(inode)->ip_blkno); + status = -EIO; + goto bail; + } + next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno); + + BUG_ON(i >= num_bhs); + status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i], + OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, + eb); + status = -EIO; + goto bail; + } + + status = ocfs2_journal_access(handle, inode, eb_bhs[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + el = &eb->h_list; + i++; + /* When we leave this loop, eb_bhs[num_bhs - 1] will + * hold the bottom-most leaf extent block. */ + } + BUG_ON(el->l_tree_depth); + + el = &fe->id2.i_list; + /* If we have tree depth, then the fe update is + * trivial, and we want to switch el out for the + * bottom-most leaf in order to update it with the + * actual extent data below. */ + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0) { + ocfs2_error(inode->i_sb, + "Dinode %"MLFu64" has a bad " + "extent list", + OCFS2_I(inode)->ip_blkno); + status = -EIO; + goto bail; + } + le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, + new_clusters); + /* (num_bhs - 1) to avoid the leaf */ + for(i = 0; i < (num_bhs - 1); i++) { + eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; + el = &eb->h_list; + + /* finally, make our actual change to the + * intermediate extent blocks. 
*/ + next_free = le16_to_cpu(el->l_next_free_rec); + le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, + new_clusters); + + status = ocfs2_journal_dirty(handle, eb_bhs[i]); + if (status < 0) + mlog_errno(status); + } + BUG_ON(i != (num_bhs - 1)); + /* note that the leaf block wasn't touched in + * the loop above */ + eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data; + el = &eb->h_list; + BUG_ON(el->l_tree_depth); + } + + /* yay, we can finally add the actual extent now! */ + i = le16_to_cpu(el->l_next_free_rec) - 1; + if (le16_to_cpu(el->l_next_free_rec) && + ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) { + le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters); + } else if (le16_to_cpu(el->l_next_free_rec) && + (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) { + /* having an empty extent at eof is legal. */ + if (el->l_recs[i].e_cpos != fe->i_clusters) { + ocfs2_error(inode->i_sb, + "Dinode %"MLFu64" trailing extent is bad: " + "cpos (%u) != number of clusters (%u)", + le32_to_cpu(el->l_recs[i].e_cpos), + le32_to_cpu(fe->i_clusters)); + status = -EIO; + goto bail; + } + el->l_recs[i].e_blkno = cpu_to_le64(start_blk); + el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); + } else { + /* No contiguous record, or no empty record at eof, so + * we add a new one. */ + + BUG_ON(le16_to_cpu(el->l_next_free_rec) >= + le16_to_cpu(el->l_count)); + i = le16_to_cpu(el->l_next_free_rec); + + el->l_recs[i].e_blkno = cpu_to_le64(start_blk); + el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); + el->l_recs[i].e_cpos = fe->i_clusters; + le16_add_cpu(&el->l_next_free_rec, 1); + } + + /* + * extent_map errors are not fatal, so they are ignored outside + * of flushing the thing. 
+ */ + status = ocfs2_extent_map_append(inode, &el->l_recs[i], + new_clusters); + if (status) { + mlog_errno(status); + ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters)); + } + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) + mlog_errno(status); + if (fe->id2.i_list.l_tree_depth) { + status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]); + if (status < 0) + mlog_errno(status); + } + + status = 0; +bail: + if (eb_bhs) { + for (i = 0; i < num_bhs; i++) + if (eb_bhs[i]) + brelse(eb_bhs[i]); + kfree(eb_bhs); + } + + mlog_exit(status); + return status; +} + +/* + * Should only be called when there is no space left in any of the + * leaf nodes. What we want to do is find the lowest tree depth + * non-leaf extent block with room for new records. There are three + * valid results of this search: + * + * 1) a lowest extent block is found, then we pass it back in + * *lowest_eb_bh and return '0' + * + * 2) the search fails to find anything, but the dinode has room. We + * pass NULL back in *lowest_eb_bh, but still return '0' + * + * 3) the search fails to find anything AND the dinode is full, in + * which case we return > 0 + * + * return status < 0 indicates an error. 
+ */ +static int ocfs2_find_branch_target(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head **target_bh) +{ + int status = 0, i; + u64 blkno; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *bh = NULL; + struct buffer_head *lowest_bh = NULL; + + mlog_entry_void(); + + *target_bh = NULL; + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + el = &fe->id2.i_list; + + while(le16_to_cpu(el->l_tree_depth) > 1) { + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has empty " + "extent list (next_free_rec == 0)", + OCFS2_I(inode)->ip_blkno); + status = -EIO; + goto bail; + } + i = le16_to_cpu(el->l_next_free_rec) - 1; + blkno = le64_to_cpu(el->l_recs[i].e_blkno); + if (!blkno) { + ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has extent " + "list where extent # %d has no physical " + "block start", + OCFS2_I(inode)->ip_blkno, i); + status = -EIO; + goto bail; + } + + if (bh) { + brelse(bh); + bh = NULL; + } + + status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, + inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb = (struct ocfs2_extent_block *) bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + el = &eb->h_list; + + if (le16_to_cpu(el->l_next_free_rec) < + le16_to_cpu(el->l_count)) { + if (lowest_bh) + brelse(lowest_bh); + lowest_bh = bh; + get_bh(lowest_bh); + } + } + + /* If we didn't find one and the fe doesn't have any room, + * then return '1' */ + if (!lowest_bh + && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count)) + status = 1; + + *target_bh = lowest_bh; +bail: + if (bh) + brelse(bh); + + mlog_exit(status); + return status; +} + +/* the caller needs to update fe->i_clusters */ +int ocfs2_insert_extent(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, 
+ struct buffer_head *fe_bh, + u64 start_blk, + u32 new_clusters, + struct ocfs2_alloc_context *meta_ac) +{ + int status, i, shift; + struct buffer_head *last_eb_bh = NULL; + struct buffer_head *bh = NULL; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + mlog_entry_void(); + + mlog(0, "add %u clusters starting at block %"MLFu64" to " + "inode %"MLFu64"\n", + new_clusters, start_blk, OCFS2_I(inode)->ip_blkno); + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + el = &fe->id2.i_list; + + if (el->l_tree_depth) { + /* jump to end of tree */ + status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), + &last_eb_bh, OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_exit(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + el = &eb->h_list; + } + + /* Can we allocate without adding/shifting tree bits? */ + i = le16_to_cpu(el->l_next_free_rec) - 1; + if (le16_to_cpu(el->l_next_free_rec) == 0 + || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count)) + || le32_to_cpu(el->l_recs[i].e_clusters) == 0 + || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) + goto out_add; + + mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing " + "tree now.\n"); + + shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); + if (shift < 0) { + status = shift; + mlog_errno(status); + goto bail; + } + + /* We traveled all the way to the bottom of the allocation tree + * and didn't find room for any more extents - we need to add + * another tree level */ + if (shift) { + /* if we hit a leaf, we'd better be empty :) */ + BUG_ON(le16_to_cpu(el->l_next_free_rec) != + le16_to_cpu(el->l_count)); + BUG_ON(bh); + mlog(0, "ocfs2_allocate_extent: need to shift tree depth " + "(current = %u)\n", + le16_to_cpu(fe->id2.i_list.l_tree_depth)); + + /* ocfs2_shift_tree_depth will return us a buffer with + * the new extent block (so we can pass that to + * ocfs2_add_branch). 
*/ + status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh, + meta_ac, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + /* Special case: we have room now if we shifted from + * tree_depth 0 */ + if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1)) + goto out_add; + } + + /* call ocfs2_add_branch to add the final part of the tree with + * the new data. */ + mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh); + status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, + meta_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +out_add: + /* Finally, we can add clusters. */ + status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh, + start_blk, new_clusters); + if (status < 0) + mlog_errno(status); + +bail: + if (bh) + brelse(bh); + + if (last_eb_bh) + brelse(last_eb_bh); + + mlog_exit(status); + return status; +} + +static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) +{ + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + + mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count), + "slot %d, invalid truncate log parameters: used = " + "%u, count = %u\n", osb->slot_num, + le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count)); + return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count); +} + +static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl, + unsigned int new_start) +{ + unsigned int tail_index; + unsigned int current_tail; + + /* No records, nothing to coalesce */ + if (!le16_to_cpu(tl->tl_used)) + return 0; + + tail_index = le16_to_cpu(tl->tl_used) - 1; + current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start); + current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters); + + return current_tail == new_start; +} + +static int ocfs2_truncate_log_append(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, 
+ u64 start_blk, + unsigned int num_clusters) +{ + int status, index; + unsigned int start_cluster, tl_count; + struct inode *tl_inode = osb->osb_tl_inode; + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + mlog_entry("start_blk = %"MLFu64", num_clusters = %u\n", start_blk, + num_clusters); + + BUG_ON(!down_trylock(&tl_inode->i_sem)); + + start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + if (!OCFS2_IS_VALID_DINODE(di)) { + OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); + status = -EIO; + goto bail; + } + + tl_count = le16_to_cpu(tl->tl_count); + mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || + tl_count == 0, + "Truncate record count on #%"MLFu64" invalid (" + "wanted %u, actual %u\n", OCFS2_I(tl_inode)->ip_blkno, + ocfs2_truncate_recs_per_inode(osb->sb), + le16_to_cpu(tl->tl_count)); + + /* Caller should have known to flush before calling us. */ + index = le16_to_cpu(tl->tl_used); + if (index >= tl_count) { + status = -ENOSPC; + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_access(handle, tl_inode, tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + mlog(0, "Log truncate of %u clusters starting at cluster %u to " + "%"MLFu64" (index = %d)\n", num_clusters, start_cluster, + OCFS2_I(tl_inode)->ip_blkno, index); + + if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { + /* + * Move index back to the record we are coalescing with. 
+ * ocfs2_truncate_log_can_coalesce() guarantees nonzero + */ + index--; + + num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); + mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n", + index, le32_to_cpu(tl->tl_recs[index].t_start), + num_clusters); + } else { + tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); + tl->tl_used = cpu_to_le16(index + 1); + } + tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); + + status = ocfs2_journal_dirty(handle, tl_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + mlog_exit(status); + return status; +} + +/* + * Walk the local truncate log from its last used record (tl_used - 1) + * down to record 0. For each record: journal the log buffer, lower + * tl_used so the record is no longer counted, extend the transaction by + * OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC credits and hand the record's clusters + * to ocfs2_free_clusters() against data_alloc_inode / data_alloc_bh. + * Records whose start converts to block 0 are skipped. + * + * Returns 0 on success or the first negative status encountered. + */ +static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *data_alloc_inode, + struct buffer_head *data_alloc_bh) +{ + int status = 0; + int i; + unsigned int num_clusters; + u64 start_blk; + struct ocfs2_truncate_rec rec; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + struct inode *tl_inode = osb->osb_tl_inode; + struct buffer_head *tl_bh = osb->osb_tl_bh; + + mlog_entry_void(); + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + i = le16_to_cpu(tl->tl_used) - 1; + while (i >= 0) { + /* Caller has given us at least enough credits to + * update the truncate log dinode */ + status = ocfs2_journal_access(handle, tl_inode, tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* Drop record i from the used count before its clusters are freed below. */ + tl->tl_used = cpu_to_le16(i); + + status = ocfs2_journal_dirty(handle, tl_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* TODO: Perhaps we can calculate the bulk of the + * credits up front rather than extending like + * this. 
*/ + status = ocfs2_extend_trans(handle, + OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* Work on a by-value copy of the record. */ + rec = tl->tl_recs[i]; + start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, + le32_to_cpu(rec.t_start)); + num_clusters = le32_to_cpu(rec.t_clusters); + + /* if start_blk is not set, we ignore the record as + * invalid. */ + if (start_blk) { + mlog(0, "free record %d, start = %u, clusters = %u\n", + i, le32_to_cpu(rec.t_start), num_clusters); + + status = ocfs2_free_clusters(handle, data_alloc_inode, + data_alloc_bh, start_blk, + num_clusters); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + i--; + } + +bail: + mlog_exit(status); + return status; +} + +/* + * Flush every used record of the local truncate log: validate the log + * dinode, look up the global bitmap system inode, call ocfs2_meta_lock() + * on it, start a transaction and replay the records through + * ocfs2_replay_truncate_records(). Returns 0 without doing any of that + * when tl_used is zero. + * + * Expects you to already be holding tl_inode->i_sem + */ +static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) +{ + int status; + unsigned int num_to_flush; + struct ocfs2_journal_handle *handle = NULL; + struct inode *tl_inode = osb->osb_tl_inode; + struct inode *data_alloc_inode = NULL; + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct buffer_head *data_alloc_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + mlog_entry_void(); + + /* down_trylock() returns 0 when it acquires the semaphore, so this + * fires if the caller did not already hold i_sem. */ + BUG_ON(!down_trylock(&tl_inode->i_sem)); + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + if (!OCFS2_IS_VALID_DINODE(di)) { + OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); + status = -EIO; + goto bail; + } + + num_to_flush = le16_to_cpu(tl->tl_used); + mlog(0, "Flush %u records from truncate log #%"MLFu64"\n", + num_to_flush, OCFS2_I(tl_inode)->ip_blkno); + if (!num_to_flush) { + status = 0; + goto bail; + } + + handle = ocfs2_alloc_handle(osb); + if (!handle) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + data_alloc_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!data_alloc_inode) { + status = -EINVAL; + mlog(ML_ERROR, "Could not get bitmap inode!\n"); + goto bail; + } + + 
ocfs2_handle_add_inode(handle, data_alloc_inode); + status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, + data_alloc_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + /* Common exit: commit whatever handle we still own, then drop the + * bitmap inode reference and its buffer. */ + if (handle) + ocfs2_commit_trans(handle); + + if (data_alloc_inode) + iput(data_alloc_inode); + + if (data_alloc_bh) + brelse(data_alloc_bh); + + mlog_exit(status); + return status; +} + +/* + * Locked entry point: takes tl_inode->i_sem around + * __ocfs2_flush_truncate_log() and returns its status. + */ +int ocfs2_flush_truncate_log(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = osb->osb_tl_inode; + + down(&tl_inode->i_sem); + status = __ocfs2_flush_truncate_log(osb); + up(&tl_inode->i_sem); + + return status; +} + +/* + * Work function: 'data' is the ocfs2_super. Flushes the truncate log and + * only logs a failure, since there is nobody to return it to. + */ +static void ocfs2_truncate_log_worker(void *data) +{ + int status; + struct ocfs2_super *osb = data; + + mlog_entry_void(); + + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + + mlog_exit(status); +} + +#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) +/* + * Queue the delayed flush work on ocfs2_wq to run after + * OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL. When 'cancel' is nonzero any + * pending delayed work is cancelled first. Does nothing while + * osb->osb_tl_inode is unset. + */ +void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, + int cancel) +{ + if (osb->osb_tl_inode) { + /* We want to push off log flushes while truncates are + * still running. 
*/ + if (cancel) + cancel_delayed_work(&osb->osb_truncate_log_wq); + + queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, + OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); + } +} + +/* + * Look up the truncate log system inode for slot_num and read its dinode + * block. On success *tl_inode holds the inode reference and *tl_bh the + * buffer, both owned by the caller. On failure neither output is + * written and any inode reference taken here is dropped. + * + * Returns 0 on success, -EINVAL if the inode cannot be found, or the + * negative status from ocfs2_read_block(). + */ +static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, + int slot_num, + struct inode **tl_inode, + struct buffer_head **tl_bh) +{ + int status; + struct inode *inode = NULL; + struct buffer_head *bh = NULL; + + inode = ocfs2_get_system_file_inode(osb, + TRUNCATE_LOG_SYSTEM_INODE, + slot_num); + if (!inode) { + status = -EINVAL; + mlog(ML_ERROR, "Could not get load truncate log inode!\n"); + goto bail; + } + + status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, + OCFS2_BH_CACHED, inode); + if (status < 0) { + iput(inode); + mlog_errno(status); + goto bail; + } + + *tl_inode = inode; + *tl_bh = bh; +bail: + mlog_exit(status); + return status; +} + +/* called during the 1st stage of node recovery. we stamp a clean + * truncate log and pass back a copy for processing later. if the + * truncate log does not require processing, a *tl_copy is set to + * NULL. 
*/ +int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **tl_copy) +{ + int status; + struct inode *tl_inode = NULL; + struct buffer_head *tl_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + /* Default result: nothing to recover. */ + *tl_copy = NULL; + + mlog(0, "recover truncate log from slot %d\n", slot_num); + + status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + if (!OCFS2_IS_VALID_DINODE(di)) { + OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di); + status = -EIO; + goto bail; + } + + if (le16_to_cpu(tl->tl_used)) { + mlog(0, "We'll have %u logs to recover\n", + le16_to_cpu(tl->tl_used)); + + /* The copy is a whole block (b_size bytes), not just the + * log records; it is freed again at bail if we fail. */ + *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); + if (!(*tl_copy)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* Assuming the write-out below goes well, this copy + * will be passed back to recovery for processing. */ + memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size); + + /* All we need to do to clear the truncate log is set + * tl_used. 
*/ + tl->tl_used = 0; + + status = ocfs2_write_block(osb, tl_bh, tl_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + +bail: + if (tl_inode) + iput(tl_inode); + if (tl_bh) + brelse(tl_bh); + + if (status < 0 && (*tl_copy)) { + kfree(*tl_copy); + *tl_copy = NULL; + } + + mlog_exit(status); + return status; +} + +/* + * Second stage of truncate log recovery: re-append every used record of + * tl_copy (a copy of another slot's truncate log dinode) to our own + * truncate log, one transaction per record, flushing our log first + * whenever ocfs2_truncate_log_needs_flush() says so. + * + * Takes tl_inode->i_sem for the duration of the loop. tl_copy is not + * freed here. + * + * Returns 0 on success, -EINVAL if tl_copy is our own truncate log, or + * the first negative status from the flush, transaction start or append. + */ +int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *tl_copy) +{ + int status = 0; + int i; + unsigned int clusters, num_recs, start_cluster; + u64 start_blk; + struct ocfs2_journal_handle *handle; + struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_truncate_log *tl; + + mlog_entry_void(); + + if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { + mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); + /* Leave through the common exit so mlog_entry/mlog_exit stay paired. */ + status = -EINVAL; + goto bail; + } + + tl = &tl_copy->id2.i_dealloc; + num_recs = le16_to_cpu(tl->tl_used); + /* i_blkno is stored little-endian on disk; convert before printing. */ + mlog(0, "cleanup %u records from %"MLFu64"\n", num_recs, + le64_to_cpu(tl_copy->i_blkno)); + + down(&tl_inode->i_sem); + for(i = 0; i < num_recs; i++) { + if (ocfs2_truncate_log_needs_flush(osb)) { + status = __ocfs2_flush_truncate_log(osb); + if (status < 0) { + mlog_errno(status); + goto bail_up; + } + } + + handle = ocfs2_start_trans(osb, NULL, + OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_up; + } + + clusters = le32_to_cpu(tl->tl_recs[i].t_clusters); + start_cluster = le32_to_cpu(tl->tl_recs[i].t_start); + start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster); + + status = ocfs2_truncate_log_append(osb, handle, + start_blk, clusters); + /* Commit whether or not the append worked; status is checked after. */ + ocfs2_commit_trans(handle); + if (status < 0) { + mlog_errno(status); + goto bail_up; + } + } + +bail_up: + up(&tl_inode->i_sem); + +bail: + mlog_exit(status); + return status; +} + +void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = osb->osb_tl_inode; + + mlog_entry_void(); + + if (tl_inode) { + 
cancel_delayed_work(&osb->osb_truncate_log_wq); + flush_workqueue(ocfs2_wq); + + /* One last synchronous flush before the log inode and buffer + * references are dropped; a failure is only logged. */ + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + + brelse(osb->osb_tl_bh); + iput(osb->osb_tl_inode); + } + + mlog_exit_void(); +} + +/* + * Load this node's truncate log (slot osb->slot_num) and publish its + * inode and buffer in the osb, and initialise the delayed flush work. + * + * There is no early exit on failure: ocfs2_get_truncate_log_info() + * leaves tl_inode and tl_bh untouched (NULL) when it fails, so the osb + * fields are then set to NULL and the negative status is returned. + */ +int ocfs2_truncate_log_init(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = NULL; + struct buffer_head *tl_bh = NULL; + + mlog_entry_void(); + + status = ocfs2_get_truncate_log_info(osb, + osb->slot_num, + &tl_inode, + &tl_bh); + if (status < 0) + mlog_errno(status); + + /* ocfs2_truncate_log_shutdown keys on the existence of + * osb->osb_tl_inode so we don't set any of the osb variables + * until we're sure all is well. */ + INIT_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker, osb); + osb->osb_tl_bh = tl_bh; + osb->osb_tl_inode = tl_inode; + + mlog_exit(status); + return status; +} + +/* This function will figure out whether the currently last extent + * block will be deleted, and if it will, what the new last extent + * block will be so we can update his h_next_leaf_blk field, as well + * as the dinodes i_last_eb_blk */ +/* On return *new_last_eb is NULL when the last extent block does not + * change; otherwise it is the new last block with an extra reference + * (get_bh) that the caller must release. */ +static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, + struct inode *inode, + struct ocfs2_dinode *fe, + u32 new_i_clusters, + struct buffer_head *old_last_eb, + struct buffer_head **new_last_eb) +{ + int i, status = 0; + u64 block = 0; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *bh = NULL; + + *new_last_eb = NULL; + + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); + status = -EIO; + goto bail; + } + + /* we have no tree, so of course, no last_eb. */ + if (!fe->id2.i_list.l_tree_depth) + goto bail; + + /* trunc to zero special case - this makes tree_depth = 0 + * regardless of what it is. 
*/ + if (!new_i_clusters) + goto bail; + + eb = (struct ocfs2_extent_block *) old_last_eb->b_data; + el = &(eb->h_list); + BUG_ON(!el->l_next_free_rec); + + /* Make sure that this guy will actually be empty after we + * clear away the data. */ + if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters) + goto bail; + + /* Ok, at this point, we know that last_eb will definitely + * change, so lets traverse the tree and find the second to + * last extent block. */ + el = &(fe->id2.i_list); + /* go down the tree, */ + do { + for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) { + if (le32_to_cpu(el->l_recs[i].e_cpos) < + new_i_clusters) { + block = le64_to_cpu(el->l_recs[i].e_blkno); + break; + } + } + BUG_ON(i < 0); + + if (bh) { + brelse(bh); + bh = NULL; + } + + status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED, + inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + } while (el->l_tree_depth); + + *new_last_eb = bh; + get_bh(*new_last_eb); + mlog(0, "returning block %"MLFu64"\n", le64_to_cpu(eb->h_blkno)); +bail: + if (bh) + brelse(bh); + + return status; +} + +/* + * Remove clusters_to_del clusters from the tail of the inode's extent + * tree inside the running transaction 'handle'. + * + * The rightmost record of the dinode's extent list is shrunk first, then + * the rightmost record of every extent block on the path down to the + * leaf. Extent blocks that end up with no records are freed through + * tc->tc_ext_alloc_inode / tc->tc_ext_alloc_bh. The freed data clusters + * themselves are not released here; they are recorded in the truncate + * log with ocfs2_truncate_log_append(). If the last extent block + * changes, fe->i_last_eb_blk and the new block's h_next_leaf_blk are + * updated as well. + * + * On success the in-memory extent map is trimmed to the new cluster + * count; on failure it is dropped entirely. + * + * Returns 0 on success or a negative status. + */ +static int ocfs2_do_truncate(struct ocfs2_super *osb, + unsigned int clusters_to_del, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head *old_last_eb_bh, + struct ocfs2_journal_handle *handle, + struct ocfs2_truncate_context *tc) +{ + int status, i, depth; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_block *last_eb = NULL; + struct ocfs2_extent_list *el; + struct buffer_head *eb_bh = NULL; + struct buffer_head *last_eb_bh = NULL; + u64 next_eb = 0; + u64 delete_blk = 0; + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + /* On success last_eb_bh, if set, carries a reference that we own and + * release at bail. */ + status = ocfs2_find_new_last_ext_blk(osb, + inode, + fe, + le32_to_cpu(fe->i_clusters) - + 
clusters_to_del, + old_last_eb_bh, + &last_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (last_eb_bh) + last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + el = &(fe->id2.i_list); + + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - + clusters_to_del; + spin_unlock(&OCFS2_I(inode)->ip_lock); + le32_add_cpu(&fe->i_clusters, -clusters_to_del); + fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec); + fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec); + + i = le16_to_cpu(el->l_next_free_rec) - 1; + + BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); + le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); + /* tree depth zero, we can just delete the clusters, otherwise + * we need to record the offset of the next level extent block + * as we may overwrite it. */ + if (!el->l_tree_depth) + delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) + + ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(el->l_recs[i].e_clusters)); + else + next_eb = le64_to_cpu(el->l_recs[i].e_blkno); + + if (!el->l_recs[i].e_clusters) { + /* if we deleted the whole extent record, then clear + * out the other fields and update the extent + * list. For depth > 0 trees, we've already recorded + * the extent block in 'next_eb' */ + el->l_recs[i].e_cpos = 0; + el->l_recs[i].e_blkno = 0; + BUG_ON(!el->l_next_free_rec); + le16_add_cpu(&el->l_next_free_rec, -1); + } + + depth = le16_to_cpu(el->l_tree_depth); + if (!fe->i_clusters) { + /* trunc to zero is a special case. 
*/ + el->l_tree_depth = 0; + fe->i_last_eb_blk = 0; + } else if (last_eb) + fe->i_last_eb_blk = last_eb->h_blkno; + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (last_eb) { + /* If there will be a new last extent block, then by + * definition, there cannot be any leaves to the right of + * him. */ + status = ocfs2_journal_access(handle, inode, last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + last_eb->h_next_leaf_blk = 0; + status = ocfs2_journal_dirty(handle, last_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* if our tree depth > 0, update all the tree blocks below us. */ + while (depth) { + mlog(0, "traveling tree (depth = %d, next_eb = %"MLFu64")\n", + depth, next_eb); + status = ocfs2_read_block(osb, next_eb, &eb_bh, + OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *)eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + el = &(eb->h_list); + + status = ocfs2_journal_access(handle, inode, eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); + BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1)); + + i = le16_to_cpu(el->l_next_free_rec) - 1; + + mlog(0, "extent block %"MLFu64", before: record %d: " + "(%u, %u, %"MLFu64"), next = %u\n", + le64_to_cpu(eb->h_blkno), i, + le32_to_cpu(el->l_recs[i].e_cpos), + le32_to_cpu(el->l_recs[i].e_clusters), + le64_to_cpu(el->l_recs[i].e_blkno), + le16_to_cpu(el->l_next_free_rec)); + + BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); + le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); + + next_eb = le64_to_cpu(el->l_recs[i].e_blkno); + /* bottom-most block requires us to delete data.*/ + if 
(!el->l_tree_depth) + delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) + + ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(el->l_recs[i].e_clusters)); + if (!el->l_recs[i].e_clusters) { + el->l_recs[i].e_cpos = 0; + el->l_recs[i].e_blkno = 0; + BUG_ON(!el->l_next_free_rec); + le16_add_cpu(&el->l_next_free_rec, -1); + } + mlog(0, "extent block %"MLFu64", after: record %d: " + "(%u, %u, %"MLFu64"), next = %u\n", + le64_to_cpu(eb->h_blkno), i, + le32_to_cpu(el->l_recs[i].e_cpos), + le32_to_cpu(el->l_recs[i].e_clusters), + le64_to_cpu(el->l_recs[i].e_blkno), + le16_to_cpu(el->l_next_free_rec)); + + status = ocfs2_journal_dirty(handle, eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (!el->l_next_free_rec) { + mlog(0, "deleting this extent block.\n"); + + ocfs2_remove_from_cache(inode, eb_bh); + + BUG_ON(eb->h_suballoc_slot); + BUG_ON(el->l_recs[0].e_clusters); + BUG_ON(el->l_recs[0].e_cpos); + BUG_ON(el->l_recs[0].e_blkno); + status = ocfs2_free_extent_block(handle, + tc->tc_ext_alloc_inode, + tc->tc_ext_alloc_bh, + eb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + /* Reset to NULL so the cleanup at bail never releases it twice. */ + brelse(eb_bh); + eb_bh = NULL; + depth--; + } + + BUG_ON(!delete_blk); + status = ocfs2_truncate_log_append(osb, handle, delete_blk, + clusters_to_del); + if (status < 0) { + mlog_errno(status); + goto bail; + } + status = 0; +bail: + if (!status) + ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters)); + else + ocfs2_extent_map_drop(inode, 0); + + /* eb_bh is only still set here when the tree walk bailed out + * mid-iteration. */ + if (eb_bh) + brelse(eb_bh); + + /* Drop the reference ocfs2_find_new_last_ext_blk() handed us. */ + if (last_eb_bh) + brelse(last_eb_bh); + + mlog_exit(status); + return status; +} + +/* + * It is expected, that by the time you call this function, + * inode->i_size and fe->i_size have been adjusted. 
+ * + * WARNING: This will kfree the truncate context + */ +/* + * Shrinks the allocation down to the cluster count implied by + * i_size_read(inode). Each pass of the 'start' loop removes at most the + * rightmost extent record's worth of clusters in its own transaction, + * holding tl_inode->i_sem across the truncate log work, and loops until + * fe->i_clusters reaches the target. ip_alloc_sem is held for writing + * over the whole operation. Every exit path goes through 'bail', which + * frees 'tc'. + */ +int ocfs2_commit_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_truncate_context *tc) +{ + int status, i, credits, tl_sem = 0; + u32 clusters_to_del, target_i_clusters; + u64 last_eb = 0; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *last_eb_bh; + struct ocfs2_journal_handle *handle = NULL; + struct inode *tl_inode = osb->osb_tl_inode; + + mlog_entry_void(); + + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + target_i_clusters = ocfs2_clusters_for_bytes(osb->sb, + i_size_read(inode)); + + /* Take over the context's last extent block buffer; it is released + * here (at bail) rather than by ocfs2_free_truncate_context(). */ + last_eb_bh = tc->tc_last_eb_bh; + tc->tc_last_eb_bh = NULL; + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + if (fe->id2.i_list.l_tree_depth) { + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + el = &eb->h_list; + } else + el = &fe->id2.i_list; + last_eb = le64_to_cpu(fe->i_last_eb_blk); +start: + mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, " + "last_eb = %"MLFu64", fe->i_last_eb_blk = %"MLFu64", " + "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n", + le32_to_cpu(fe->i_clusters), last_eb, + le64_to_cpu(fe->i_last_eb_blk), + le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh); + + if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) { + mlog(0, "last_eb changed!\n"); + BUG_ON(!fe->id2.i_list.l_tree_depth); + last_eb = le64_to_cpu(fe->i_last_eb_blk); + /* i_last_eb_blk may have changed, read it if + * necessary. We don't have to worry about the + * truncate to zero case here (where there becomes no + * last_eb) because we never loop back after our work + * is done. 
*/ + if (last_eb_bh) { + brelse(last_eb_bh); + last_eb_bh = NULL; + } + + status = ocfs2_read_block(osb, last_eb, + &last_eb_bh, OCFS2_BH_CACHED, + inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + el = &(eb->h_list); + } + + /* by now, el will point to the extent list on the bottom most + * portion of this tree. */ + /* Remove the whole rightmost record if it starts at or past the + * target, otherwise only the part that extends beyond it. */ + i = le16_to_cpu(el->l_next_free_rec) - 1; + if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters) + clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters); + else + clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) + + le32_to_cpu(el->l_recs[i].e_cpos)) - + target_i_clusters; + + mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del); + + /* tl_sem records that we hold i_sem so bail can release it. */ + down(&tl_inode->i_sem); + tl_sem = 1; + /* ocfs2_truncate_log_needs_flush guarantees us at least one + * record is free for use. If there isn't any, we flush to get + * an empty truncate log. 
*/ + if (ocfs2_truncate_log_needs_flush(osb)) { + status = __ocfs2_flush_truncate_log(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, + fe, el); + handle = ocfs2_start_trans(osb, NULL, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) + mlog_errno(status); + /* NOTE(review): a failure above is only logged; status is + * overwritten by the ocfs2_do_truncate() call below. */ + + status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, + last_eb_bh, handle, tc); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + up(&tl_inode->i_sem); + tl_sem = 0; + + ocfs2_commit_trans(handle); + handle = NULL; + + BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters); + if (le32_to_cpu(fe->i_clusters) > target_i_clusters) + goto start; +bail: + up_write(&OCFS2_I(inode)->ip_alloc_sem); + + ocfs2_schedule_truncate_log_flush(osb, 1); + + if (tl_sem) + up(&tl_inode->i_sem); + + if (handle) + ocfs2_commit_trans(handle); + + if (last_eb_bh) + brelse(last_eb_bh); + + /* This will drop the ext_alloc cluster lock for us */ + ocfs2_free_truncate_context(tc); + + mlog_exit(status); + return status; +} + + +/* + * Expects the inode to already be locked. This will figure out which + * inodes need to be locked and will put them on the returned truncate + * context. 
+ */ +/* + * On success *tc is a freshly allocated context holding the rightmost + * extent block buffer (when the inode has an extent tree) and, if that + * block will become empty, the extent allocator inode with its i_sem + * and cluster lock taken. On failure *tc is NULL and everything + * acquired here has been released. + * + * Returns 0 on success, -EIO if the on-disk cluster count is not larger + * than the count implied by i_size or an extent block is invalid, + * -ENOMEM on allocation or allocator-inode lookup failure, or the + * negative status from ocfs2_read_block() / ocfs2_meta_lock(). + */ +int ocfs2_prepare_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_truncate_context **tc) +{ + int status, metadata_delete; + unsigned int new_i_clusters; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *last_eb_bh = NULL; + struct inode *ext_alloc_inode = NULL; + struct buffer_head *ext_alloc_bh = NULL; + + mlog_entry_void(); + + *tc = NULL; + + new_i_clusters = ocfs2_clusters_for_bytes(osb->sb, + i_size_read(inode)); + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + /* Dinode fields are little-endian on disk; convert before printing. */ + mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size =" + "%"MLFu64"\n", le32_to_cpu(fe->i_clusters), new_i_clusters, + le64_to_cpu(fe->i_size)); + + if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) { + ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has cluster count " + "%u and size %"MLFu64" whereas struct inode has " + "cluster count %u and size %llu which caused an " + "invalid truncate to %u clusters.", + le64_to_cpu(fe->i_blkno), + le32_to_cpu(fe->i_clusters), + le64_to_cpu(fe->i_size), + OCFS2_I(inode)->ip_clusters, + (unsigned long long)i_size_read(inode), + new_i_clusters); + mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres); + status = -EIO; + goto bail; + } + + *tc = kcalloc(1, sizeof(struct ocfs2_truncate_context), GFP_KERNEL); + if (!(*tc)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + metadata_delete = 0; + if (fe->id2.i_list.l_tree_depth) { + /* If we have a tree, then the truncate may result in + * metadata deletes. 
Figure this out from the + * rightmost leaf block.*/ + status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), + &last_eb_bh, OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + + /* Not yet stored in *tc, so release it by hand. */ + brelse(last_eb_bh); + status = -EIO; + goto bail; + } + el = &(eb->h_list); + if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters) + metadata_delete = 1; + } + + /* From here on the context owns last_eb_bh. */ + (*tc)->tc_last_eb_bh = last_eb_bh; + + if (metadata_delete) { + mlog(0, "Will have to delete metadata for this trunc. " + "locking allocator.\n"); + ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0); + if (!ext_alloc_inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + down(&ext_alloc_inode->i_sem); + (*tc)->tc_ext_alloc_inode = ext_alloc_inode; + + status = ocfs2_meta_lock(ext_alloc_inode, + NULL, + &ext_alloc_bh, + 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + (*tc)->tc_ext_alloc_bh = ext_alloc_bh; + (*tc)->tc_ext_alloc_locked = 1; + } + + status = 0; +bail: + if (status < 0) { + if (*tc) + ocfs2_free_truncate_context(*tc); + *tc = NULL; + } + mlog_exit(status); + return status; +} + +/* + * Release everything a truncate context holds and free it: the cluster + * lock on the extent allocator (only if tc_ext_alloc_locked), its i_sem + * and inode reference, then the allocator and last-extent-block buffers. + */ +static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) +{ + if (tc->tc_ext_alloc_inode) { + if (tc->tc_ext_alloc_locked) + ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1); + + up(&tc->tc_ext_alloc_inode->i_sem); + iput(tc->tc_ext_alloc_inode); + } + + if (tc->tc_ext_alloc_bh) + brelse(tc->tc_ext_alloc_bh); + + if (tc->tc_last_eb_bh) + brelse(tc->tc_last_eb_bh); + + kfree(tc); +} diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h new file mode 100644 index 00000000000..12ba897743f --- /dev/null +++ b/fs/ocfs2/alloc.h @@ -0,0 +1,82 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * alloc.h + * + * Function prototypes + * + * Copyright 
(C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_ALLOC_H +#define OCFS2_ALLOC_H + +/* Opaque here; only pointers to it are passed through this header. */ +struct ocfs2_alloc_context; +int ocfs2_insert_extent(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 blkno, + u32 new_clusters, + struct ocfs2_alloc_context *meta_ac); +int ocfs2_num_free_extents(struct ocfs2_super *osb, + struct inode *inode, + struct ocfs2_dinode *fe); +/* how many new metadata chunks would an allocation need at maximum? */ +static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe) +{ + /* + * Rather than do all the work of determining how much we need + * (involves a ton of reads and locks), just ask for the + * maximal limit. That's a tree depth shift. So, one block for + * level of the tree (current l_tree_depth), one block for the + * new tree_depth==0 extent_block, and one block at the new + * top-of-the tree. 
+ */ + return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2; +} + +/* + * Local truncate log interface. ocfs2_truncate_log_init() loads this + * node's log inode and buffer into the osb; + * ocfs2_truncate_log_shutdown() cancels the delayed flush, flushes once + * more and releases them. The begin/complete recovery pair first takes + * a copy of another slot's log (clearing the original), then re-appends + * the copied records to our own log. + */ +int ocfs2_truncate_log_init(struct ocfs2_super *osb); +void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb); +void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, + int cancel); +int ocfs2_flush_truncate_log(struct ocfs2_super *osb); +int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **tl_copy); +int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *tl_copy); + +/* State handed from ocfs2_prepare_truncate() to ocfs2_commit_truncate(). */ +struct ocfs2_truncate_context { + struct inode *tc_ext_alloc_inode; + struct buffer_head *tc_ext_alloc_bh; + int tc_ext_alloc_locked; /* is it cluster locked? */ + /* these get destroyed once it's passed to ocfs2_commit_truncate. */ + struct buffer_head *tc_last_eb_bh; +}; + +/* + * ocfs2_prepare_truncate() allocates the context and returns it in *tc; + * ocfs2_commit_truncate() always frees it, on success and on failure. + */ +int ocfs2_prepare_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_truncate_context **tc); +int ocfs2_commit_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_truncate_context *tc); + +#endif /* OCFS2_ALLOC_H */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c new file mode 100644 index 00000000000..8f4467a930a --- /dev/null +++ b/fs/ocfs2/aops.c @@ -0,0 +1,643 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <asm/byteorder.h> + +#define MLOG_MASK_PREFIX ML_FILE_IO +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "aops.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "super.h" +#include "symlink.h" + +#include "buffer_head_io.h" + +/* + * get_block callback for symlinks that are not fast symlinks. Maps + * logical block 'iblock' to the first extent record's e_blkno + iblock. + * For a new inode whose result buffer is not yet uptodate, the data is + * copied into the page from the buffer cache when that buffer is under + * journal control. + * + * Returns 0 on success and -EIO on any failure (offset beyond + * PATH_MAX + 1, unreadable or invalid dinode, block outside the + * allocation, or a failed sb_getblk/kmap). + */ +static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int err = -EIO; + int status; + struct ocfs2_dinode *fe = NULL; + struct buffer_head *bh = NULL; + struct buffer_head *buffer_cache_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + void *kaddr; + + mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, + (unsigned long long)iblock, bh_result, create); + + BUG_ON(ocfs2_inode_is_fast_symlink(inode)); + + if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) { + mlog(ML_ERROR, "block offset > PATH_MAX: %llu\n", + (unsigned long long)iblock); + goto bail; + } + + status = ocfs2_read_block(OCFS2_SB(inode->i_sb), + OCFS2_I(inode)->ip_blkno, + &bh, OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + fe = (struct ocfs2_dinode *) bh->b_data; + + if (!OCFS2_IS_VALID_DINODE(fe)) { + /* i_blkno is little-endian on disk; convert before printing. */ + mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n", + le64_to_cpu(fe->i_blkno), 7, fe->i_signature); + goto bail; + } + + if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, + le32_to_cpu(fe->i_clusters))) { + mlog(ML_ERROR, "block offset is outside the allocated size: " + "%llu\n", (unsigned long long)iblock); + goto bail; + } + + /* We don't use the page cache to create symlink data, so if + * need be, copy it 
over from the buffer cache. */ + if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) { + u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + + iblock; + buffer_cache_bh = sb_getblk(osb->sb, blkno); + if (!buffer_cache_bh) { + mlog(ML_ERROR, "couldn't getblock for symlink!\n"); + goto bail; + } + + /* we haven't locked out transactions, so a commit + * could've happened. Since we've got a reference on + * the bh, even if it commits while we're doing the + * copy, the data is still good. */ + if (buffer_jbd(buffer_cache_bh) + && ocfs2_inode_is_new(inode)) { + kaddr = kmap_atomic(bh_result->b_page, KM_USER0); + if (!kaddr) { + mlog(ML_ERROR, "couldn't kmap!\n"); + /* Drop the sb_getblk() reference before bailing. */ + brelse(buffer_cache_bh); + goto bail; + } + memcpy(kaddr + (bh_result->b_size * iblock), + buffer_cache_bh->b_data, + bh_result->b_size); + kunmap_atomic(kaddr, KM_USER0); + set_buffer_uptodate(bh_result); + } + brelse(buffer_cache_bh); + } + + map_bh(bh_result, inode->i_sb, + le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock); + + err = 0; + +bail: + if (bh) + brelse(bh); + + mlog_exit(err); + return err; +} + +static int ocfs2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int err = 0; + u64 p_blkno, past_eof; + + mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, + (unsigned long long)iblock, bh_result, create); + + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) + mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", + inode, inode->i_ino); + + if (S_ISLNK(inode->i_mode)) { + /* this always does I/O for some reason. */ + err = ocfs2_symlink_get_block(inode, iblock, bh_result, create); + goto bail; + } + + /* this can happen if another node truncs after our extend! 
*/ + spin_lock(&OCFS2_I(inode)->ip_lock); + if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb, + OCFS2_I(inode)->ip_clusters)) + err = -EIO; + spin_unlock(&OCFS2_I(inode)->ip_lock); + if (err) + goto bail; + + err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, + NULL); + if (err) { + mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " + "%"MLFu64", NULL)\n", err, inode, + (unsigned long long)iblock, p_blkno); + goto bail; + } + + map_bh(bh_result, inode->i_sb, p_blkno); + + /* A mapping to physical block 0 is reported as an error. */ + if (bh_result->b_blocknr == 0) { + err = -EIO; + mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" " + "blkno=(%"MLFu64")\n", (unsigned long long)iblock, + p_blkno, OCFS2_I(inode)->ip_blkno); + } + + past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof); + + if (create && (iblock >= past_eof)) + set_buffer_new(bh_result); + +bail: + /* Every negative error is reported to the caller as -EIO. */ + if (err < 0) + err = -EIO; + + mlog_exit(err); + return err; +} + +/* + * ->readpage: read one page under the inode's meta lock, ip_alloc_sem + * (read) and data lock. A page that starts at or beyond i_size is + * zero-filled and marked uptodate instead of being read. + * + * 'unlock' tracks whether this function must unlock the page itself: it + * is cleared when a lock call returned AOP_TRUNCATED_PAGE and once + * block_read_full_page() has been called. + */ +static int ocfs2_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; + int ret, unlock = 1; + + mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); + + ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page); + if (ret != 0) { + if (ret == AOP_TRUNCATED_PAGE) + unlock = 0; + mlog_errno(ret); + goto out; + } + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + + /* + * i_size might have just been updated as we grabed the meta lock. We + * might now be discovering a truncate that hit on another node. + * block_read_full_page->get_block freaks out if it is asked to read + * beyond the end of a file, so we check here. Callers + * (generic_file_read, fault->nopage) are clever enough to check i_size + * and notice that the page they just read isn't needed. + * + * XXX sys_readahead() seems to get that wrong? 
+ */ + if (start >= i_size_read(inode)) { + char *addr = kmap(page); + memset(addr, 0, PAGE_SIZE); + flush_dcache_page(page); + kunmap(page); + SetPageUptodate(page); + ret = 0; + goto out_alloc; + } + + ret = ocfs2_data_lock_with_page(inode, 0, page); + if (ret != 0) { + if (ret == AOP_TRUNCATED_PAGE) + unlock = 0; + mlog_errno(ret); + goto out_alloc; + } + + ret = block_read_full_page(page, ocfs2_get_block); + unlock = 0; + + ocfs2_data_unlock(inode, 0); +out_alloc: + up_read(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_meta_unlock(inode, 0); +out: + if (unlock) + unlock_page(page); + mlog_exit(ret); + return ret; +} + +/* Note: Because we don't support holes, our allocation has + * already happened (allocation writes zeros to the file data) + * so we don't have to worry about ordered writes in + * ocfs2_writepage. + * + * ->writepage is called during the process of invalidating the page cache + * during blocked lock processing. It can't block on any cluster locks + * to during block mapping. It's relying on the fact that the block + * mapping can't have disappeared under the dirty pages that it is + * being asked to write back. + */ +static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) +{ + int ret; + + mlog_entry("(0x%p)\n", page); + + /* No cluster locks are taken here; see the note above. */ + ret = block_write_full_page(page, ocfs2_get_block, wbc); + + mlog_exit(ret); + + return ret; +} + +/* + * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called + * from loopback. It must be able to perform its own locking around + * ocfs2_get_block(). 
+ */
+int ocfs2_prepare_write(struct file *file, struct page *page,
+			unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	int ret;
+
+	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+
+	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ret = block_prepare_write(page, from, to, ocfs2_get_block);
+
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_meta_unlock(inode, 0);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+/* Taken from ext3. We don't necessarily need the full blown
+ * functionality yet, but IMHO it's better to cut and paste the whole
+ * thing so we can avoid introducing our own bugs (and easily pick up
+ * their fixes when they happen) --Mark */
+static int walk_page_buffers(	handle_t *handle,
+				struct buffer_head *head,
+				unsigned from,
+				unsigned to,
+				int *partial,
+				int (*fn)(	handle_t *handle,
+						struct buffer_head *bh))
+{
+	struct buffer_head *bh;
+	unsigned block_start, block_end;
+	unsigned blocksize = head->b_size;
+	int err, ret = 0;
+	struct buffer_head *next;
+
+	for (	bh = head, block_start = 0;
+		ret == 0 && (bh != head || !block_start);
+		block_start = block_end, bh = next)
+	{
+		next = bh->b_this_page;
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (partial && !buffer_uptodate(bh))
+				*partial = 1;
+			continue;
+		}
+		err = (*fn)(handle, bh);
+		if (!ret)
+			ret = err;
+	}
+	return ret;
+}
+
+/*
+ * Start a transaction with OCFS2_INODE_UPDATE_CREDITS and, when
+ * ocfs2_should_order_data() says so, run ocfs2_journal_dirty_data() over
+ * the buffers of @page that intersect [from, to).
+ *
+ * Returns the started handle on success. On any failure the transaction
+ * (if one was started) is committed again and an ERR_PTR() is returned;
+ * the function never returns NULL.
+ */
+struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
+							 struct page *page,
+							 unsigned from,
+							 unsigned to)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_journal_handle *handle = NULL;
+	int ret = 0;
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (!handle || IS_ERR(handle)) {
+		/* Other callers in this patch treat the result of
+		 * ocfs2_start_trans() as an ERR_PTR() on failure, so a
+		 * NULL test alone lets an error pointer through to the
+		 * handle->k_handle dereference below. Clear handle so
+		 * the exit path doesn't try to commit it. */
+		ret = handle ? PTR_ERR(handle) : -ENOMEM;
+		handle = NULL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (ocfs2_should_order_data(inode)) {
+		ret = walk_page_buffers(handle->k_handle,
+					page_buffers(page),
+					from, to, NULL,
+					ocfs2_journal_dirty_data);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+out:
+	if (ret) {
+		if (handle)
+			ocfs2_commit_trans(handle);
+		handle = ERR_PTR(ret);
+	}
+	return handle;
+}
+
+static int ocfs2_commit_write(struct file *file, struct page *page,
+			      unsigned from, unsigned to)
+{
+	int ret, extending = 0, locklevel = 0;
+	loff_t new_i_size;
+	struct buffer_head *di_bh = NULL;
+	struct inode *inode = page->mapping->host;
+	struct ocfs2_journal_handle *handle = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+
+	/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
+	 * us to sample inode->i_size here without the metadata lock:
+	 *
+	 * 1) We're currently holding the inode alloc lock, so no
+	 * nodes can change it underneath us.
+	 *
+	 * 2) We've had to take the metadata lock at least once
+	 * already to check for extending writes, hence insuring
+	 * that our current copy is also up to date.
+	 */
+	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	if (new_i_size > i_size_read(inode)) {
+		extending = 1;
+		locklevel = 1;
+	}
+
+	ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_data_lock_with_page(inode, 1, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out_unlock_meta;
+	}
+
+	if (extending) {
+		handle = ocfs2_start_walk_page_trans(inode, page, from, to);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			handle = NULL;
+			goto out_unlock_data;
+		}
+
+		/* Mark our buffer early. We'd rather catch this error up here
+		 * as opposed to after a successful commit_write which would
+		 * require us to set back inode->i_size. 
*/ + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + } + + /* might update i_size */ + ret = generic_commit_write(file, page, from, to); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + if (extending) { + loff_t size = (u64) i_size_read(inode); + struct ocfs2_dinode *di = + (struct ocfs2_dinode *)di_bh->b_data; + + /* ocfs2_mark_inode_dirty is too heavy to use here. */ + inode->i_blocks = ocfs2_align_bytes_to_sectors(size); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + di->i_size = cpu_to_le64(size); + di->i_ctime = di->i_mtime = + cpu_to_le64(inode->i_mtime.tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = + cpu_to_le32(inode->i_mtime.tv_nsec); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + } + + BUG_ON(extending && (i_size_read(inode) != new_i_size)); + +out_commit: + if (handle) + ocfs2_commit_trans(handle); +out_unlock_data: + ocfs2_data_unlock(inode, 1); +out_unlock_meta: + ocfs2_meta_unlock(inode, locklevel); +out: + if (di_bh) + brelse(di_bh); + + mlog_exit(ret); + return ret; +} + +static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) +{ + sector_t status; + u64 p_blkno = 0; + int err = 0; + struct inode *inode = mapping->host; + + mlog_entry("(block = %llu)\n", (unsigned long long)block); + + /* We don't need to lock journal system files, since they aren't + * accessed concurrently from multiple nodes. 
+ */ + if (!INODE_JOURNAL(inode)) { + err = ocfs2_meta_lock(inode, NULL, NULL, 0); + if (err) { + if (err != -ENOENT) + mlog_errno(err); + goto bail; + } + down_read(&OCFS2_I(inode)->ip_alloc_sem); + } + + err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, + NULL); + + if (!INODE_JOURNAL(inode)) { + up_read(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_meta_unlock(inode, 0); + } + + if (err) { + mlog(ML_ERROR, "get_blocks() failed, block = %llu\n", + (unsigned long long)block); + mlog_errno(err); + goto bail; + } + + +bail: + status = err ? 0 : p_blkno; + + mlog_exit((int)status); + + return status; +} + +/* + * TODO: Make this into a generic get_blocks function. + * + * From do_direct_io in direct-io.c: + * "So what we do is to permit the ->get_blocks function to populate + * bh.b_size with the size of IO which is permitted at this offset and + * this i_blkbits." + * + * This function is called directly from get_more_blocks in direct-io.c. + * + * called like this: dio->get_blocks(dio->inode, fs_startblk, + * fs_count, map_bh, dio->rw == WRITE); + */ +static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, + unsigned long max_blocks, + struct buffer_head *bh_result, int create) +{ + int ret; + u64 vbo_max; /* file offset, max_blocks from iblock */ + u64 p_blkno; + int contig_blocks; + unsigned char blocksize_bits; + + if (!inode || !bh_result) { + mlog(ML_ERROR, "inode or bh_result is null\n"); + return -EIO; + } + + blocksize_bits = inode->i_sb->s_blocksize_bits; + + /* This function won't even be called if the request isn't all + * nicely aligned and of the right size, so there's no need + * for us to check any of that. 
*/ + + vbo_max = ((u64)iblock + max_blocks) << blocksize_bits; + + spin_lock(&OCFS2_I(inode)->ip_lock); + if ((iblock + max_blocks) > + ocfs2_clusters_to_blocks(inode->i_sb, + OCFS2_I(inode)->ip_clusters)) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + ret = -EIO; + goto bail; + } + spin_unlock(&OCFS2_I(inode)->ip_lock); + + /* This figures out the size of the next contiguous block, and + * our logical offset */ + ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, + &contig_blocks); + if (ret) { + mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", + (unsigned long long)iblock); + ret = -EIO; + goto bail; + } + + map_bh(bh_result, inode->i_sb, p_blkno); + + /* make sure we don't map more than max_blocks blocks here as + that's all the kernel will handle at this point. */ + if (max_blocks < contig_blocks) + contig_blocks = max_blocks; + bh_result->b_size = contig_blocks << blocksize_bits; +bail: + return ret; +} + +/* + * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're + * particularly interested in the aio/dio case. Like the core uses + * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from + * truncation on another. 
+ */
+static void ocfs2_dio_end_io(struct kiocb *iocb,
+			     loff_t offset,
+			     ssize_t bytes,
+			     void *private)
+{
+	struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
+
+	/* this io's submitter should not have unlocked this before we could */
+	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+	ocfs2_iocb_clear_rw_locked(iocb);
+	up_read(&inode->i_alloc_sem);
+	ocfs2_rw_unlock(inode, 0);
+}
+
+/*
+ * ->direct_IO: hand the request to blockdev_direct_IO_no_locking(), using
+ * ocfs2_direct_IO_get_blocks() for block mapping and ocfs2_dio_end_io() as
+ * the completion callback. Returns whatever the dio core returns.
+ */
+static ssize_t ocfs2_direct_IO(int rw,
+			       struct kiocb *iocb,
+			       const struct iovec *iov,
+			       loff_t offset,
+			       unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
+	int ret;
+
+	mlog_entry_void();
+	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
+					    inode->i_sb->s_bdev, iov, offset,
+					    nr_segs,
+					    ocfs2_direct_IO_get_blocks,
+					    ocfs2_dio_end_io);
+	mlog_exit(ret);
+	return ret;
+}
+
+struct address_space_operations ocfs2_aops = {
+	.readpage	= ocfs2_readpage,
+	.writepage	= ocfs2_writepage,
+	.prepare_write	= ocfs2_prepare_write,
+	.commit_write	= ocfs2_commit_write,
+	.bmap		= ocfs2_bmap,
+	.sync_page	= block_sync_page,
+	.direct_IO	= ocfs2_direct_IO
+};
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
new file mode 100644
index 00000000000..d40456d509a
--- /dev/null
+++ b/fs/ocfs2/aops.h
@@ -0,0 +1,41 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_AOPS_H
+#define OCFS2_AOPS_H
+
+int ocfs2_prepare_write(struct file *file, struct page *page,
+			unsigned from, unsigned to);
+
+struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
+							 struct page *page,
+							 unsigned from,
+							 unsigned to);
+
+/* bit 0 of iocb->private: rw lock held; ocfs2_dio_end_io() clears it */
+#define ocfs2_iocb_is_rw_locked(iocb) \
+	test_bit(0, (unsigned long *)&(iocb)->private)
+#define ocfs2_iocb_set_rw_locked(iocb) \
+	set_bit(0, (unsigned long *)&(iocb)->private)
+#define ocfs2_iocb_clear_rw_locked(iocb) \
+	clear_bit(0, (unsigned long *)&(iocb)->private)
+
+#endif /* OCFS2_AOPS_H */
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
new file mode 100644
index 00000000000..d424041b38e
--- /dev/null
+++ b/fs/ocfs2/buffer_head_io.c
@@ -0,0 +1,232 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * io.c
+ *
+ * Buffer cache handling
+ *
+ * Copyright (C) 2002, 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA. 
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "inode.h"
+#include "journal.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+/*
+ * Synchronously write one non-journalled buffer (the function BUGs on a
+ * jbd-managed bh) and wait for the I/O to finish, serialised against other
+ * I/O on @inode by ip_io_sem.
+ *
+ * Returns 0 on success, -EROFS if the file system is hard read-only, or
+ * -EIO if the buffer is not uptodate after the write completed. The
+ * caller's reference on @bh is left alone on every path.
+ */
+int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
+		      struct inode *inode)
+{
+	int ret = 0;
+
+	mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n",
+		   (unsigned long long)bh->b_blocknr, inode);
+
+	BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
+	BUG_ON(buffer_jbd(bh));
+
+	/* No need to check for a soft readonly file system here. non
+	 * journalled writes are only ever done on system files which
+	 * can get modified during recovery even if read-only. */
+	if (ocfs2_is_hard_readonly(osb)) {
+		ret = -EROFS;
+		goto out;
+	}
+
+	down(&OCFS2_I(inode)->ip_io_sem);
+
+	lock_buffer(bh);
+	set_buffer_uptodate(bh);
+
+	/* remove from dirty list before I/O. */
+	clear_buffer_dirty(bh);
+
+	get_bh(bh); /* for end_buffer_write_sync() */
+	bh->b_end_io = end_buffer_write_sync;
+	submit_bh(WRITE, bh);
+
+	wait_on_buffer(bh);
+
+	if (buffer_uptodate(bh)) {
+		ocfs2_set_buffer_uptodate(inode, bh);
+	} else {
+		/* We don't need to remove the clustered uptodate
+		 * information for this bh as it's not marked locally
+		 * uptodate.
+		 *
+		 * Don't drop a reference here either: the only one we
+		 * took (get_bh() above) is for end_buffer_write_sync(),
+		 * so a brelse() would release the caller's reference
+		 * behind its back. */
+		ret = -EIO;
+	}
+
+	up(&OCFS2_I(inode)->ip_io_sem);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
+		      struct buffer_head *bhs[], int flags,
+		      struct inode *inode)
+{
+	int status = 0;
+	struct super_block *sb;
+	int i, ignore_cache = 0;
+	struct buffer_head *bh;
+
+	mlog_entry("(block=(%"MLFu64"), nr=(%d), flags=%d, inode=%p)\n",
+		   block, nr, flags, inode);
+
+	if (osb == NULL || osb->sb == NULL || bhs == NULL) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (nr < 0) {
+		mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (nr == 0) {
+		mlog(ML_BH_IO, "No buffers will be read!\n");
+		status = 0;
+		goto bail;
+	}
+
+	sb = osb->sb;
+
+	if (flags & OCFS2_BH_CACHED && !inode)
+		flags &= ~OCFS2_BH_CACHED;
+
+	if (inode)
+		down(&OCFS2_I(inode)->ip_io_sem);
+	for (i = 0 ; i < nr ; i++) {
+		if (bhs[i] == NULL) {
+			bhs[i] = sb_getblk(sb, block++);
+			if (bhs[i] == NULL) {
+				if (inode)
+					up(&OCFS2_I(inode)->ip_io_sem);
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		bh = bhs[i];
+		ignore_cache = 0;
+
+		if (flags & OCFS2_BH_CACHED &&
+		    !ocfs2_buffer_uptodate(inode, bh)) {
+			mlog(ML_UPTODATE,
+			     "bh (%llu), inode %"MLFu64" not uptodate\n",
+			     (unsigned long long)bh->b_blocknr,
+			     OCFS2_I(inode)->ip_blkno);
+			ignore_cache = 1;
+		}
+
+		/* XXX: Can we ever get this and *not* have the cached
+		 * flag set? */
+		if (buffer_jbd(bh)) {
+			if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
+				mlog(ML_BH_IO, "trying to sync read a jbd "
+					       "managed bh (blocknr = %llu)\n",
+				     (unsigned long long)bh->b_blocknr);
+			continue;
+		}
+
+		if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
+			if (buffer_dirty(bh)) {
+				/* This should probably be a BUG, or
+				 * at least return an error. */
+				mlog(ML_BH_IO, "asking me to sync read a dirty "
+					       "buffer! 
(blocknr = %llu)\n", + (unsigned long long)bh->b_blocknr); + continue; + } + + lock_buffer(bh); + if (buffer_jbd(bh)) { +#ifdef CATCH_BH_JBD_RACES + mlog(ML_ERROR, "block %llu had the JBD bit set " + "while I was in lock_buffer!", + (unsigned long long)bh->b_blocknr); + BUG(); +#else + unlock_buffer(bh); + continue; +#endif + } + clear_buffer_uptodate(bh); + get_bh(bh); /* for end_buffer_read_sync() */ + bh->b_end_io = end_buffer_read_sync; + if (flags & OCFS2_BH_READAHEAD) + submit_bh(READA, bh); + else + submit_bh(READ, bh); + continue; + } + } + + status = 0; + + for (i = (nr - 1); i >= 0; i--) { + bh = bhs[i]; + + /* We know this can't have changed as we hold the + * inode sem. Avoid doing any work on the bh if the + * journal has it. */ + if (!buffer_jbd(bh)) + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + /* Status won't be cleared from here on out, + * so we can safely record this and loop back + * to cleanup the other buffers. Don't need to + * remove the clustered uptodate information + * for this bh as it's not marked locally + * uptodate. */ + status = -EIO; + brelse(bh); + bhs[i] = NULL; + continue; + } + + if (inode) + ocfs2_set_buffer_uptodate(inode, bh); + } + if (inode) + up(&OCFS2_I(inode)->ip_io_sem); + + mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr, + (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes"); + +bail: + + mlog_exit(status); + return status; +} diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h new file mode 100644 index 00000000000..6ecb90937b6 --- /dev/null +++ b/fs/ocfs2/buffer_head_io.h @@ -0,0 +1,73 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_buffer_head.h + * + * Buffer cache handling functions defined + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_BUFFER_HEAD_IO_H +#define OCFS2_BUFFER_HEAD_IO_H + +#include <linux/buffer_head.h> + +void ocfs2_end_buffer_io_sync(struct buffer_head *bh, + int uptodate); + +static inline int ocfs2_read_block(struct ocfs2_super *osb, + u64 off, + struct buffer_head **bh, + int flags, + struct inode *inode); + +int ocfs2_write_block(struct ocfs2_super *osb, + struct buffer_head *bh, + struct inode *inode); +int ocfs2_read_blocks(struct ocfs2_super *osb, + u64 block, + int nr, + struct buffer_head *bhs[], + int flags, + struct inode *inode); + + +#define OCFS2_BH_CACHED 1 +#define OCFS2_BH_READAHEAD 8 /* use this to pass READA down to submit_bh */ + +static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, + struct buffer_head **bh, int flags, + struct inode *inode) +{ + int status = 0; + + if (bh == NULL) { + printk("ocfs2: bh == NULL\n"); + status = -EINVAL; + goto bail; + } + + status = ocfs2_read_blocks(osb, off, 1, bh, + flags, inode); + +bail: + return status; +} + +#endif /* OCFS2_BUFFER_HEAD_IO_H */ diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c new file mode 100644 index 00000000000..bd85182e97b --- /dev/null +++ b/fs/ocfs2/dcache.c @@ -0,0 +1,91 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab 
sw=8 ts=8 sts=0: + * + * dcache.c + * + * dentry cache handling code + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/namei.h> + +#define MLOG_MASK_PREFIX ML_DCACHE +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dcache.h" +#include "file.h" +#include "inode.h" + +static int ocfs2_dentry_revalidate(struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + int ret = 0; /* if all else fails, just return false */ + struct ocfs2_super *osb; + + mlog_entry("(0x%p, '%.*s')\n", dentry, + dentry->d_name.len, dentry->d_name.name); + + /* Never trust a negative dentry - force a new lookup. */ + if (inode == NULL) { + mlog(0, "negative dentry: %.*s\n", dentry->d_name.len, + dentry->d_name.name); + goto bail; + } + + osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!osb); + + if (inode != osb->root_inode) { + spin_lock(&OCFS2_I(inode)->ip_lock); + /* did we or someone else delete this inode? 
*/ + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + mlog(0, "inode (%"MLFu64") deleted, returning false\n", + OCFS2_I(inode)->ip_blkno); + goto bail; + } + spin_unlock(&OCFS2_I(inode)->ip_lock); + + if (!inode->i_nlink) { + mlog(0, "Inode %"MLFu64" orphaned, returning false " + "dir = %d\n", OCFS2_I(inode)->ip_blkno, + S_ISDIR(inode->i_mode)); + goto bail; + } + } + + ret = 1; + +bail: + mlog_exit(ret); + + return ret; +} + +struct dentry_operations ocfs2_dentry_ops = { + .d_revalidate = ocfs2_dentry_revalidate, +}; diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h new file mode 100644 index 00000000000..90072771114 --- /dev/null +++ b/fs/ocfs2/dcache.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dcache.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_DCACHE_H +#define OCFS2_DCACHE_H + +extern struct dentry_operations ocfs2_dentry_ops; + +#endif /* OCFS2_DCACHE_H */ diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c new file mode 100644 index 00000000000..856e20ae826 --- /dev/null +++ b/fs/ocfs2/dir.c @@ -0,0 +1,618 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dir.c + * + * Creates, reads, walks and deletes directory-nodes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * Portions of this code from linux/fs/ext3/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> + +#define MLOG_MASK_PREFIX ML_NAMEI +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "namei.h" +#include "suballoc.h" +#include "uptodate.h" + +#include "buffer_head_io.h" + +static unsigned char ocfs2_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int ocfs2_extend_dir(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct buffer_head **new_de_bh); +/* + * ocfs2_readdir() + * + */ +int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + int error = 0; + unsigned long offset, blk; + int i, num, stored; + struct buffer_head * bh, * tmp; + struct ocfs2_dir_entry * de; + int err; + struct inode *inode = filp->f_dentry->d_inode; + struct super_block * sb = inode->i_sb; + int have_disk_lock = 0; + + mlog_entry("dirino=%"MLFu64"\n", OCFS2_I(inode)->ip_blkno); + + stored = 0; + bh = NULL; + + error = ocfs2_meta_lock(inode, NULL, NULL, 0); + if (error < 0) { + if (error != -ENOENT) + mlog_errno(error); + /* we haven't got any yet, so propagate the error. 
*/ + stored = error; + goto bail; + } + have_disk_lock = 1; + + offset = filp->f_pos & (sb->s_blocksize - 1); + + while (!error && !stored && filp->f_pos < i_size_read(inode)) { + blk = (filp->f_pos) >> sb->s_blocksize_bits; + bh = ocfs2_bread(inode, blk, &err, 0); + if (!bh) { + mlog(ML_ERROR, "directory #%"MLFu64" contains a hole " + "at offset %lld\n", + OCFS2_I(inode)->ip_blkno, + filp->f_pos); + filp->f_pos += sb->s_blocksize - offset; + continue; + } + + /* + * Do the readahead (8k) + */ + if (!offset) { + for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0; + i > 0; i--) { + tmp = ocfs2_bread(inode, ++blk, &err, 1); + if (tmp) + brelse(tmp); + } + } + +revalidate: + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (filp->f_version != inode->i_version) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ocfs2_dir_entry *) (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (le16_to_cpu(de->rec_len) < + OCFS2_DIR_REC_LEN(1)) + break; + i += le16_to_cpu(de->rec_len); + } + offset = i; + filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + | offset; + filp->f_version = inode->i_version; + } + + while (!error && filp->f_pos < i_size_read(inode) + && offset < sb->s_blocksize) { + de = (struct ocfs2_dir_entry *) (bh->b_data + offset); + if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { + /* On error, skip the f_pos to the + next block. */ + filp->f_pos = (filp->f_pos | + (sb->s_blocksize - 1)) + 1; + brelse(bh); + goto bail; + } + offset += le16_to_cpu(de->rec_len); + if (le64_to_cpu(de->inode)) { + /* We might block in the next section + * if the data destination is + * currently swapped out. 
So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. + */ + unsigned long version = filp->f_version; + unsigned char d_type = DT_UNKNOWN; + + if (de->file_type < OCFS2_FT_MAX) + d_type = ocfs2_filetype_table[de->file_type]; + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + ino_from_blkno(sb, le64_to_cpu(de->inode)), + d_type); + if (error) + break; + if (version != filp->f_version) + goto revalidate; + stored ++; + } + filp->f_pos += le16_to_cpu(de->rec_len); + } + offset = 0; + brelse(bh); + } + + stored = 0; +bail: + if (have_disk_lock) + ocfs2_meta_unlock(inode, 0); + + mlog_exit(stored); + + return stored; +} + +/* + * NOTE: this should always be called with parent dir i_sem taken. + */ +int ocfs2_find_files_on_disk(const char *name, + int namelen, + u64 *blkno, + struct inode *inode, + struct buffer_head **dirent_bh, + struct ocfs2_dir_entry **dirent) +{ + int status = -ENOENT; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog_entry("(osb=%p, parent=%"MLFu64", name='%.*s', blkno=%p, " + "inode=%p)\n", + osb, OCFS2_I(inode)->ip_blkno, namelen, name, blkno, inode); + + *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent); + if (!*dirent_bh || !*dirent) { + status = -ENOENT; + goto leave; + } + + *blkno = le64_to_cpu((*dirent)->inode); + + status = 0; +leave: + if (status < 0) { + *dirent = NULL; + if (*dirent_bh) { + brelse(*dirent_bh); + *dirent_bh = NULL; + } + } + + mlog_exit(status); + return status; +} + +/* Check for a name within a directory. 
+ * + * Return 0 if the name does not exist + * Return -EEXIST if the directory contains the name + * + * Callers should have i_sem + a cluster lock on dir + */ +int ocfs2_check_dir_for_entry(struct inode *dir, + const char *name, + int namelen) +{ + int ret; + struct buffer_head *dirent_bh = NULL; + struct ocfs2_dir_entry *dirent = NULL; + + mlog_entry("dir %"MLFu64", name '%.*s'\n", OCFS2_I(dir)->ip_blkno, + namelen, name); + + ret = -EEXIST; + dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent); + if (dirent_bh) + goto bail; + + ret = 0; +bail: + if (dirent_bh) + brelse(dirent_bh); + + mlog_exit(ret); + return ret; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int ocfs2_empty_dir(struct inode *inode) +{ + unsigned long offset; + struct buffer_head * bh; + struct ocfs2_dir_entry * de, * de1; + struct super_block * sb; + int err; + + sb = inode->i_sb; + if ((i_size_read(inode) < + (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) || + !(bh = ocfs2_bread(inode, 0, &err, 0))) { + mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - " + "no data block\n", + OCFS2_I(inode)->ip_blkno); + return 1; + } + + de = (struct ocfs2_dir_entry *) bh->b_data; + de1 = (struct ocfs2_dir_entry *) + ((char *)de + le16_to_cpu(de->rec_len)); + if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) || + !le64_to_cpu(de1->inode) || + strcmp(".", de->name) || + strcmp("..", de1->name)) { + mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - " + "no `.' 
or `..'\n", + OCFS2_I(inode)->ip_blkno); + brelse(bh); + return 1; + } + offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); + de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len)); + while (offset < i_size_read(inode) ) { + if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) { + brelse(bh); + bh = ocfs2_bread(inode, + offset >> sb->s_blocksize_bits, &err, 0); + if (!bh) { + mlog(ML_ERROR, "directory #%"MLFu64" contains " + "a hole at offset %lu\n", + OCFS2_I(inode)->ip_blkno, offset); + offset += sb->s_blocksize; + continue; + } + de = (struct ocfs2_dir_entry *) bh->b_data; + } + if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { + brelse(bh); + return 1; + } + if (le64_to_cpu(de->inode)) { + brelse(bh); + return 0; + } + offset += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *) + ((char *)de + le16_to_cpu(de->rec_len)); + } + brelse(bh); + return 1; +} + +/* returns a bh of the 1st new block in the allocation. */ +int ocfs2_do_extend_dir(struct super_block *sb, + struct ocfs2_journal_handle *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **new_bh) +{ + int status; + int extend; + u64 p_blkno; + + spin_lock(&OCFS2_I(dir)->ip_lock); + extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); + spin_unlock(&OCFS2_I(dir)->ip_lock); + + if (extend) { + status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1, + parent_fe_bh, handle, + data_ac, meta_ac, NULL); + BUG_ON(status == -EAGAIN); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >> + (sb->s_blocksize_bits - 9)), + 1, &p_blkno, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *new_bh = sb_getblk(sb, p_blkno); + if (!*new_bh) { + status = -EIO; + mlog_errno(status); + goto bail; + } + status = 0; +bail: + 
mlog_exit(status); + return status; +} + +/* assumes you already have a cluster lock on the directory. */ +static int ocfs2_extend_dir(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct buffer_head **new_de_bh) +{ + int status = 0; + int credits, num_free_extents; + loff_t dir_i_size; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_journal_handle *handle = NULL; + struct buffer_head *new_bh = NULL; + struct ocfs2_dir_entry * de; + struct super_block *sb = osb->sb; + + mlog_entry_void(); + + dir_i_size = i_size_read(dir); + mlog(0, "extending dir %"MLFu64" (i_size = %lld)\n", + OCFS2_I(dir)->ip_blkno, dir_i_size); + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* dir->i_size is always block aligned. */ + spin_lock(&OCFS2_I(dir)->ip_lock); + if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { + spin_unlock(&OCFS2_I(dir)->ip_lock); + num_free_extents = ocfs2_num_free_extents(osb, dir, fe); + if (num_free_extents < 0) { + status = num_free_extents; + mlog_errno(status); + goto bail; + } + + if (!num_free_extents) { + status = ocfs2_reserve_new_metadata(osb, handle, + fe, &meta_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + } + + status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + credits = ocfs2_calc_extend_credits(sb, fe, 1); + } else { + spin_unlock(&OCFS2_I(dir)->ip_lock); + credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; + } + + handle = ocfs2_start_trans(osb, handle, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh, + data_ac, 
meta_ac, &new_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_set_new_buffer_uptodate(dir, new_bh); + + status = ocfs2_journal_access(handle, dir, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + memset(new_bh->b_data, 0, sb->s_blocksize); + de = (struct ocfs2_dir_entry *) new_bh->b_data; + de->inode = 0; + de->rec_len = cpu_to_le16(sb->s_blocksize); + status = ocfs2_journal_dirty(handle, new_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + dir_i_size += dir->i_sb->s_blocksize; + i_size_write(dir, dir_i_size); + dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size); + status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *new_de_bh = new_bh; + get_bh(*new_de_bh); +bail: + if (handle) + ocfs2_commit_trans(handle); + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + if (new_bh) + brelse(new_bh); + + mlog_exit(status); + return status; +} + +/* + * Search the dir for a good spot, extending it if necessary. The + * block containing an appropriate record is returned in ret_de_bh. 
+ */ +int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + const char *name, + int namelen, + struct buffer_head **ret_de_bh) +{ + unsigned long offset; + struct buffer_head * bh = NULL; + unsigned short rec_len; + struct ocfs2_dinode *fe; + struct ocfs2_dir_entry *de; + struct super_block *sb; + int status; + + mlog_entry_void(); + + mlog(0, "getting ready to insert namelen %d into dir %"MLFu64"\n", + namelen, OCFS2_I(dir)->ip_blkno); + + BUG_ON(!S_ISDIR(dir->i_mode)); + fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir)); + + sb = dir->i_sb; + + if (!namelen) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + bh = ocfs2_bread(dir, 0, &status, 0); + if (!bh) { + mlog_errno(status); + goto bail; + } + + rec_len = OCFS2_DIR_REC_LEN(namelen); + offset = 0; + de = (struct ocfs2_dir_entry *) bh->b_data; + while (1) { + if ((char *)de >= sb->s_blocksize + bh->b_data) { + brelse(bh); + bh = NULL; + + if (i_size_read(dir) <= offset) { + status = ocfs2_extend_dir(osb, + dir, + parent_fe_bh, + &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + BUG_ON(!bh); + *ret_de_bh = bh; + get_bh(*ret_de_bh); + goto bail; + } + bh = ocfs2_bread(dir, + offset >> sb->s_blocksize_bits, + &status, + 0); + if (!bh) { + mlog_errno(status); + goto bail; + } + /* move to next block */ + de = (struct ocfs2_dir_entry *) bh->b_data; + } + if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { + status = -ENOENT; + goto bail; + } + if (ocfs2_match(namelen, name, de)) { + status = -EEXIST; + goto bail; + } + if (((le64_to_cpu(de->inode) == 0) && + (le16_to_cpu(de->rec_len) >= rec_len)) || + (le16_to_cpu(de->rec_len) >= + (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) { + /* Ok, we found a spot. Return this bh and let + * the caller actually fill it in. 
*/ + *ret_de_bh = bh; + get_bh(*ret_de_bh); + status = 0; + goto bail; + } + offset += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); + } + + status = 0; +bail: + if (bh) + brelse(bh); + + mlog_exit(status); + return status; +} diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h new file mode 100644 index 00000000000..5f614ec9649 --- /dev/null +++ b/fs/ocfs2/dir.h @@ -0,0 +1,54 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dir.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_DIR_H +#define OCFS2_DIR_H + +int ocfs2_check_dir_for_entry(struct inode *dir, + const char *name, + int namelen); +int ocfs2_empty_dir(struct inode *inode); /* FIXME: to namei.c */ +int ocfs2_find_files_on_disk(const char *name, + int namelen, + u64 *blkno, + struct inode *inode, + struct buffer_head **dirent_bh, + struct ocfs2_dir_entry **dirent); +int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); +int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + const char *name, + int namelen, + struct buffer_head **ret_de_bh); +struct ocfs2_alloc_context; +int ocfs2_do_extend_dir(struct super_block *sb, + struct ocfs2_journal_handle *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **new_bh); +#endif /* OCFS2_DIR_H */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c new file mode 100644 index 00000000000..e971ec2f840 --- /dev/null +++ b/fs/ocfs2/dlmglue.c @@ -0,0 +1,2904 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmglue.c + * + * Code which implements an OCFS2 specific interface to our DLM. + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/mm.h> +#include <linux/smp_lock.h> +#include <linux/crc32.h> +#include <linux/kthread.h> +#include <linux/pagemap.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include <cluster/heartbeat.h> +#include <cluster/nodemanager.h> +#include <cluster/tcp.h> + +#include <dlm/dlmapi.h> + +#define MLOG_MASK_PREFIX ML_DLM_GLUE +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "slot_map.h" +#include "super.h" +#include "uptodate.h" +#include "vote.h" + +#include "buffer_head_io.h" + +struct ocfs2_mask_waiter { + struct list_head mw_item; + int mw_status; + struct completion mw_complete; + unsigned long mw_mask; + unsigned long mw_goal; +}; + +static void ocfs2_inode_ast_func(void *opaque); +static void ocfs2_inode_bast_func(void *opaque, + int level); +static void ocfs2_super_ast_func(void *opaque); +static void ocfs2_super_bast_func(void *opaque, + int level); +static void ocfs2_rename_ast_func(void *opaque); +static void ocfs2_rename_bast_func(void *opaque, + int level); + +/* so far, all locks have gotten along with the same unlock ast */ +static void ocfs2_unlock_ast_func(void *opaque, + enum dlm_status status); +static int ocfs2_do_unblock_meta(struct inode *inode, + int *requeue); +static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres, + int *requeue); +static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres, + int *requeue); +static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres, + int *requeue); +static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres, + 
int *requeue); +typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int); +static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int *requeue, + ocfs2_convert_worker_t *worker); + +struct ocfs2_lock_res_ops { + void (*ast)(void *); + void (*bast)(void *, int); + void (*unlock_ast)(void *, enum dlm_status); + int (*unblock)(struct ocfs2_lock_res *, int *); +}; + +static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = { + .ast = ocfs2_inode_ast_func, + .bast = ocfs2_inode_bast_func, + .unlock_ast = ocfs2_unlock_ast_func, + .unblock = ocfs2_unblock_inode_lock, +}; + +static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = { + .ast = ocfs2_inode_ast_func, + .bast = ocfs2_inode_bast_func, + .unlock_ast = ocfs2_unlock_ast_func, + .unblock = ocfs2_unblock_meta, +}; + +static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, + int blocking); + +static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = { + .ast = ocfs2_inode_ast_func, + .bast = ocfs2_inode_bast_func, + .unlock_ast = ocfs2_unlock_ast_func, + .unblock = ocfs2_unblock_data, +}; + +static struct ocfs2_lock_res_ops ocfs2_super_lops = { + .ast = ocfs2_super_ast_func, + .bast = ocfs2_super_bast_func, + .unlock_ast = ocfs2_unlock_ast_func, + .unblock = ocfs2_unblock_osb_lock, +}; + +static struct ocfs2_lock_res_ops ocfs2_rename_lops = { + .ast = ocfs2_rename_ast_func, + .bast = ocfs2_rename_bast_func, + .unlock_ast = ocfs2_unlock_ast_func, + .unblock = ocfs2_unblock_osb_lock, +}; + +static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) +{ + return lockres->l_type == OCFS2_LOCK_TYPE_META || + lockres->l_type == OCFS2_LOCK_TYPE_DATA || + lockres->l_type == OCFS2_LOCK_TYPE_RW; +} + +static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres) +{ + return lockres->l_type == OCFS2_LOCK_TYPE_SUPER; +} + +static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres) +{ + return lockres->l_type == OCFS2_LOCK_TYPE_RENAME; 
+} + +static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres) +{ + BUG_ON(!ocfs2_is_super_lock(lockres) + && !ocfs2_is_rename_lock(lockres)); + + return (struct ocfs2_super *) lockres->l_priv; +} + +static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) +{ + BUG_ON(!ocfs2_is_inode_lock(lockres)); + + return (struct inode *) lockres->l_priv; +} + +static int ocfs2_lock_create(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + int dlm_flags); +static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, + int wanted); +static void ocfs2_cluster_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level); +static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres); +static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres); +static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres); +static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level); +static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); +static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, + int convert); +#define ocfs2_log_dlm_error(_func, _stat, _lockres) do { \ + mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ + "resource %s: %s\n", dlm_errname(_stat), _func, \ + _lockres->l_name, dlm_errmsg(_stat)); \ +} while (0) +static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); +static int ocfs2_meta_lock_update(struct inode *inode, + struct buffer_head **bh); +static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); +static inline int ocfs2_highest_compat_lock_level(int level); +static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode, + struct ocfs2_lock_res *lockres, + int new_level); + +static char *ocfs2_lock_type_strings[] = { + 
[OCFS2_LOCK_TYPE_META] = "Meta", + [OCFS2_LOCK_TYPE_DATA] = "Data", + [OCFS2_LOCK_TYPE_SUPER] = "Super", + [OCFS2_LOCK_TYPE_RENAME] = "Rename", + /* Need to differntiate from [R]ename.. serializing writes is the + * important job it does, anyway. */ + [OCFS2_LOCK_TYPE_RW] = "Write/Read", +}; + +static char *ocfs2_lock_type_string(enum ocfs2_lock_type type) +{ + mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type); + return ocfs2_lock_type_strings[type]; +} + +static void ocfs2_build_lock_name(enum ocfs2_lock_type type, + u64 blkno, + u32 generation, + char *name) +{ + int len; + + mlog_entry_void(); + + BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); + + len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016"MLFx64"%08x", + ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD, blkno, + generation); + + BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1)); + + mlog(0, "built lock resource with name: %s\n", name); + + mlog_exit_void(); +} + +static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED; + +static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res, + struct ocfs2_dlm_debug *dlm_debug) +{ + mlog(0, "Add tracking for lockres %s\n", res->l_name); + + spin_lock(&ocfs2_dlm_tracking_lock); + list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking); + spin_unlock(&ocfs2_dlm_tracking_lock); +} + +static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) +{ + spin_lock(&ocfs2_dlm_tracking_lock); + if (!list_empty(&res->l_debug_list)) + list_del_init(&res->l_debug_list); + spin_unlock(&ocfs2_dlm_tracking_lock); +} + +static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, + struct ocfs2_lock_res *res, + enum ocfs2_lock_type type, + u64 blkno, + u32 generation, + struct ocfs2_lock_res_ops *ops, + void *priv) +{ + ocfs2_build_lock_name(type, blkno, generation, res->l_name); + + res->l_type = type; + res->l_ops = ops; + res->l_priv = priv; + + res->l_level = LKM_IVMODE; + res->l_requested = LKM_IVMODE; + res->l_blocking = LKM_IVMODE; + res->l_action = 
OCFS2_AST_INVALID; + res->l_unlock_action = OCFS2_UNLOCK_INVALID; + + res->l_flags = OCFS2_LOCK_INITIALIZED; + + ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug); +} + +void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) +{ + /* This also clears out the lock status block */ + memset(res, 0, sizeof(struct ocfs2_lock_res)); + spin_lock_init(&res->l_lock); + init_waitqueue_head(&res->l_event); + INIT_LIST_HEAD(&res->l_blocked_list); + INIT_LIST_HEAD(&res->l_mask_waiters); +} + +void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, + enum ocfs2_lock_type type, + struct inode *inode) +{ + struct ocfs2_lock_res_ops *ops; + + switch(type) { + case OCFS2_LOCK_TYPE_RW: + ops = &ocfs2_inode_rw_lops; + break; + case OCFS2_LOCK_TYPE_META: + ops = &ocfs2_inode_meta_lops; + break; + case OCFS2_LOCK_TYPE_DATA: + ops = &ocfs2_inode_data_lops; + break; + default: + mlog_bug_on_msg(1, "type: %d\n", type); + ops = NULL; /* thanks, gcc */ + break; + }; + + ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, + OCFS2_I(inode)->ip_blkno, + inode->i_generation, ops, inode); +} + +static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + /* Superblock lockres doesn't come from a slab so we call init + * once on it manually. */ + ocfs2_lock_res_init_once(res); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER, + OCFS2_SUPER_BLOCK_BLKNO, 0, + &ocfs2_super_lops, osb); +} + +static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + /* Rename lockres doesn't come from a slab so we call init + * once on it manually. 
*/ + ocfs2_lock_res_init_once(res); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0, + &ocfs2_rename_lops, osb); +} + +void ocfs2_lock_res_free(struct ocfs2_lock_res *res) +{ + mlog_entry_void(); + + if (!(res->l_flags & OCFS2_LOCK_INITIALIZED)) + return; + + ocfs2_remove_lockres_tracking(res); + + mlog_bug_on_msg(!list_empty(&res->l_blocked_list), + "Lockres %s is on the blocked list\n", + res->l_name); + mlog_bug_on_msg(!list_empty(&res->l_mask_waiters), + "Lockres %s has mask waiters pending\n", + res->l_name); + mlog_bug_on_msg(spin_is_locked(&res->l_lock), + "Lockres %s is locked\n", + res->l_name); + mlog_bug_on_msg(res->l_ro_holders, + "Lockres %s has %u ro holders\n", + res->l_name, res->l_ro_holders); + mlog_bug_on_msg(res->l_ex_holders, + "Lockres %s has %u ex holders\n", + res->l_name, res->l_ex_holders); + + /* Need to clear out the lock status block for the dlm */ + memset(&res->l_lksb, 0, sizeof(res->l_lksb)); + + res->l_flags = 0UL; + mlog_exit_void(); +} + +static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, + int level) +{ + mlog_entry_void(); + + BUG_ON(!lockres); + + switch(level) { + case LKM_EXMODE: + lockres->l_ex_holders++; + break; + case LKM_PRMODE: + lockres->l_ro_holders++; + break; + default: + BUG(); + } + + mlog_exit_void(); +} + +static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, + int level) +{ + mlog_entry_void(); + + BUG_ON(!lockres); + + switch(level) { + case LKM_EXMODE: + BUG_ON(!lockres->l_ex_holders); + lockres->l_ex_holders--; + break; + case LKM_PRMODE: + BUG_ON(!lockres->l_ro_holders); + lockres->l_ro_holders--; + break; + default: + BUG(); + } + mlog_exit_void(); +} + +/* WARNING: This function lives in a world where the only three lock + * levels are EX, PR, and NL. It *will* have to be adjusted when more + * lock types are added. 
*/ +static inline int ocfs2_highest_compat_lock_level(int level) +{ + int new_level = LKM_EXMODE; + + if (level == LKM_EXMODE) + new_level = LKM_NLMODE; + else if (level == LKM_PRMODE) + new_level = LKM_PRMODE; + return new_level; +} + +static void lockres_set_flags(struct ocfs2_lock_res *lockres, + unsigned long newflags) +{ + struct list_head *pos, *tmp; + struct ocfs2_mask_waiter *mw; + + assert_spin_locked(&lockres->l_lock); + + lockres->l_flags = newflags; + + list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) { + mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item); + if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) + continue; + + list_del_init(&mw->mw_item); + mw->mw_status = 0; + complete(&mw->mw_complete); + } +} +static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or) +{ + lockres_set_flags(lockres, lockres->l_flags | or); +} +static void lockres_clear_flags(struct ocfs2_lock_res *lockres, + unsigned long clear) +{ + lockres_set_flags(lockres, lockres->l_flags & ~clear); +} + +static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres) +{ + mlog_entry_void(); + + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); + BUG_ON(lockres->l_blocking <= LKM_NLMODE); + + lockres->l_level = lockres->l_requested; + if (lockres->l_level <= + ocfs2_highest_compat_lock_level(lockres->l_blocking)) { + lockres->l_blocking = LKM_NLMODE; + lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); + } + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + + mlog_exit_void(); +} + +static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres) +{ + mlog_entry_void(); + + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); + + /* Convert from RO to EX doesn't really need anything as our + * information is already up to data. 
Convert from NL to + * *anything* however should mark ourselves as needing an + * update */ + if (lockres->l_level == LKM_NLMODE) + lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + + lockres->l_level = lockres->l_requested; + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + + mlog_exit_void(); +} + +static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres) +{ + mlog_entry_void(); + + BUG_ON((!lockres->l_flags & OCFS2_LOCK_BUSY)); + BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); + + if (lockres->l_requested > LKM_NLMODE && + !(lockres->l_flags & OCFS2_LOCK_LOCAL)) + lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + + lockres->l_level = lockres->l_requested; + lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED); + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + + mlog_exit_void(); +} + +static void ocfs2_inode_ast_func(void *opaque) +{ + struct ocfs2_lock_res *lockres = opaque; + struct inode *inode; + struct dlm_lockstatus *lksb; + unsigned long flags; + + mlog_entry_void(); + + inode = ocfs2_lock_res_inode(lockres); + + mlog(0, "AST fired for inode %"MLFu64", l_action = %u, type = %s\n", + OCFS2_I(inode)->ip_blkno, lockres->l_action, + ocfs2_lock_type_string(lockres->l_type)); + + BUG_ON(!ocfs2_is_inode_lock(lockres)); + + spin_lock_irqsave(&lockres->l_lock, flags); + + lksb = &(lockres->l_lksb); + if (lksb->status != DLM_NORMAL) { + mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u " + "on inode %"MLFu64"\n", lksb->status, + OCFS2_I(inode)->ip_blkno); + spin_unlock_irqrestore(&lockres->l_lock, flags); + mlog_exit_void(); + return; + } + + switch(lockres->l_action) { + case OCFS2_AST_ATTACH: + ocfs2_generic_handle_attach_action(lockres); + lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL); + break; + case OCFS2_AST_CONVERT: + ocfs2_generic_handle_convert_action(lockres); + break; + case OCFS2_AST_DOWNCONVERT: + ocfs2_generic_handle_downconvert_action(lockres); + break; + default: + mlog(ML_ERROR, "lockres %s: ast fired 
with invalid action: %u " + "lockres flags = 0x%lx, unlock action: %u\n", + lockres->l_name, lockres->l_action, lockres->l_flags, + lockres->l_unlock_action); + + BUG(); + } + + /* data and rw locking ignores refresh flag for now. */ + if (lockres->l_type != OCFS2_LOCK_TYPE_META) + lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + + /* set it to something invalid so if we get called again we + * can catch it. */ + lockres->l_action = OCFS2_AST_INVALID; + spin_unlock_irqrestore(&lockres->l_lock, flags); + wake_up(&lockres->l_event); + + mlog_exit_void(); +} + +static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, + int level) +{ + int needs_downconvert = 0; + mlog_entry_void(); + + assert_spin_locked(&lockres->l_lock); + + lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); + + if (level > lockres->l_blocking) { + /* only schedule a downconvert if we haven't already scheduled + * one that goes low enough to satisfy the level we're + * blocking. this also catches the case where we get + * duplicate BASTs */ + if (ocfs2_highest_compat_lock_level(level) < + ocfs2_highest_compat_lock_level(lockres->l_blocking)) + needs_downconvert = 1; + + lockres->l_blocking = level; + } + + mlog_exit(needs_downconvert); + return needs_downconvert; +} + +static void ocfs2_generic_bast_func(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level) +{ + int needs_downconvert; + unsigned long flags; + + mlog_entry_void(); + + BUG_ON(level <= LKM_NLMODE); + + spin_lock_irqsave(&lockres->l_lock, flags); + needs_downconvert = ocfs2_generic_handle_bast(lockres, level); + if (needs_downconvert) + ocfs2_schedule_blocked_lock(osb, lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ocfs2_kick_vote_thread(osb); + + wake_up(&lockres->l_event); + mlog_exit_void(); +} + +static void ocfs2_inode_bast_func(void *opaque, int level) +{ + struct ocfs2_lock_res *lockres = opaque; + struct inode *inode; + struct ocfs2_super *osb; + + mlog_entry_void(); + + 
BUG_ON(!ocfs2_is_inode_lock(lockres)); + + inode = ocfs2_lock_res_inode(lockres); + osb = OCFS2_SB(inode->i_sb); + + mlog(0, "BAST fired for inode %"MLFu64", blocking = %d, level = %d " + "type = %s\n", OCFS2_I(inode)->ip_blkno, level, + lockres->l_level, + ocfs2_lock_type_string(lockres->l_type)); + + ocfs2_generic_bast_func(osb, lockres, level); + + mlog_exit_void(); +} + +static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres, + int ignore_refresh) +{ + struct dlm_lockstatus *lksb = &lockres->l_lksb; + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + + if (lksb->status != DLM_NORMAL) { + mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n", + lockres->l_name, lksb->status); + spin_unlock_irqrestore(&lockres->l_lock, flags); + return; + } + + switch(lockres->l_action) { + case OCFS2_AST_ATTACH: + ocfs2_generic_handle_attach_action(lockres); + break; + case OCFS2_AST_CONVERT: + ocfs2_generic_handle_convert_action(lockres); + break; + case OCFS2_AST_DOWNCONVERT: + ocfs2_generic_handle_downconvert_action(lockres); + break; + default: + BUG(); + } + + if (ignore_refresh) + lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + + /* set it to something invalid so if we get called again we + * can catch it. 
*/ + lockres->l_action = OCFS2_AST_INVALID; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); +} + +static void ocfs2_super_ast_func(void *opaque) +{ + struct ocfs2_lock_res *lockres = opaque; + + mlog_entry_void(); + mlog(0, "Superblock AST fired\n"); + + BUG_ON(!ocfs2_is_super_lock(lockres)); + ocfs2_generic_ast_func(lockres, 0); + + mlog_exit_void(); +} + +static void ocfs2_super_bast_func(void *opaque, + int level) +{ + struct ocfs2_lock_res *lockres = opaque; + struct ocfs2_super *osb; + + mlog_entry_void(); + mlog(0, "Superblock BAST fired\n"); + + BUG_ON(!ocfs2_is_super_lock(lockres)); + osb = ocfs2_lock_res_super(lockres); + ocfs2_generic_bast_func(osb, lockres, level); + + mlog_exit_void(); +} + +static void ocfs2_rename_ast_func(void *opaque) +{ + struct ocfs2_lock_res *lockres = opaque; + + mlog_entry_void(); + + mlog(0, "Rename AST fired\n"); + + BUG_ON(!ocfs2_is_rename_lock(lockres)); + + ocfs2_generic_ast_func(lockres, 1); + + mlog_exit_void(); +} + +static void ocfs2_rename_bast_func(void *opaque, + int level) +{ + struct ocfs2_lock_res *lockres = opaque; + struct ocfs2_super *osb; + + mlog_entry_void(); + + mlog(0, "Rename BAST fired\n"); + + BUG_ON(!ocfs2_is_rename_lock(lockres)); + + osb = ocfs2_lock_res_super(lockres); + ocfs2_generic_bast_func(osb, lockres, level); + + mlog_exit_void(); +} + +static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, + int convert) +{ + unsigned long flags; + + mlog_entry_void(); + spin_lock_irqsave(&lockres->l_lock, flags); + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + if (convert) + lockres->l_action = OCFS2_AST_INVALID; + else + lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); + mlog_exit_void(); +} + +/* Note: If we detect another process working on the lock (i.e., + * OCFS2_LOCK_BUSY), we'll bail out returning 0. 
It's up to the caller + * to do the right thing in that case. + */ +static int ocfs2_lock_create(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + int dlm_flags) +{ + int ret = 0; + enum dlm_status status; + unsigned long flags; + + mlog_entry_void(); + + mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level, + dlm_flags); + + spin_lock_irqsave(&lockres->l_lock, flags); + if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) || + (lockres->l_flags & OCFS2_LOCK_BUSY)) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto bail; + } + + lockres->l_action = OCFS2_AST_ATTACH; + lockres->l_requested = level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + status = dlmlock(osb->dlm, + level, + &lockres->l_lksb, + dlm_flags, + lockres->l_name, + lockres->l_ops->ast, + lockres, + lockres->l_ops->bast); + if (status != DLM_NORMAL) { + ocfs2_log_dlm_error("dlmlock", status, lockres); + ret = -EINVAL; + ocfs2_recover_from_dlm_error(lockres, 1); + } + + mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name); + +bail: + mlog_exit(ret); + return ret; +} + +static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres, + int flag) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&lockres->l_lock, flags); + ret = lockres->l_flags & flag; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + return ret; +} + +static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY)); +} + +static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING)); +} + +/* predict what lock level we'll be dropping down to on behalf + * of another node, and return true if the currently wanted + * level will be compatible with it. 
*/ +static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, + int wanted) +{ + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); + + return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking); +} + +static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw) +{ + INIT_LIST_HEAD(&mw->mw_item); + init_completion(&mw->mw_complete); +} + +static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) +{ + wait_for_completion(&mw->mw_complete); + /* Re-arm the completion in case we want to wait on it again */ + INIT_COMPLETION(mw->mw_complete); + return mw->mw_status; +} + +static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres, + struct ocfs2_mask_waiter *mw, + unsigned long mask, + unsigned long goal) +{ + BUG_ON(!list_empty(&mw->mw_item)); + + assert_spin_locked(&lockres->l_lock); + + list_add_tail(&mw->mw_item, &lockres->l_mask_waiters); + mw->mw_mask = mask; + mw->mw_goal = goal; +} + +/* returns 0 if the mw that was removed was already satisfied, -EBUSY + * if the mask still hadn't reached its goal */ +static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, + struct ocfs2_mask_waiter *mw) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&lockres->l_lock, flags); + if (!list_empty(&mw->mw_item)) { + if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) + ret = -EBUSY; + + list_del_init(&mw->mw_item); + init_completion(&mw->mw_complete); + } + spin_unlock_irqrestore(&lockres->l_lock, flags); + + return ret; + +} + +static int ocfs2_cluster_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + int lkm_flags, + int arg_flags) +{ + struct ocfs2_mask_waiter mw; + enum dlm_status status; + int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); + int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ + unsigned long flags; + + mlog_entry_void(); + + ocfs2_init_mask_waiter(&mw); + +again: + wait = 0; + + if (catch_signals && 
signal_pending(current)) { + ret = -ERESTARTSYS; + goto out; + } + + spin_lock_irqsave(&lockres->l_lock, flags); + + mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING, + "Cluster lock called on freeing lockres %s! flags " + "0x%lx\n", lockres->l_name, lockres->l_flags); + + /* We only compare against the currently granted level + * here. If the lock is blocked waiting on a downconvert, + * we'll get caught below. */ + if (lockres->l_flags & OCFS2_LOCK_BUSY && + level > lockres->l_level) { + /* is someone sitting in dlm_lock? If so, wait on + * them. */ + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + wait = 1; + goto unlock; + } + + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { + /* lock has not been created yet. */ + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + goto again; + } + + if (lockres->l_flags & OCFS2_LOCK_BLOCKED && + !ocfs2_may_continue_on_blocked_lock(lockres, level)) { + /* is the lock is currently blocked on behalf of + * another node */ + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0); + wait = 1; + goto unlock; + } + + if (level > lockres->l_level) { + if (lockres->l_action != OCFS2_AST_INVALID) + mlog(ML_ERROR, "lockres %s has action %u pending\n", + lockres->l_name, lockres->l_action); + + lockres->l_action = OCFS2_AST_CONVERT; + lockres->l_requested = level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + BUG_ON(level == LKM_IVMODE); + BUG_ON(level == LKM_NLMODE); + + mlog(0, "lock %s, convert from %d to level = %d\n", + lockres->l_name, lockres->l_level, level); + + /* call dlm_lock to upgrade lock now */ + status = dlmlock(osb->dlm, + level, + &lockres->l_lksb, + lkm_flags|LKM_CONVERT|LKM_VALBLK, + lockres->l_name, + lockres->l_ops->ast, + lockres, + lockres->l_ops->bast); + if (status != DLM_NORMAL) { + if ((lkm_flags & LKM_NOQUEUE) && + 
(status == DLM_NOTQUEUED)) + ret = -EAGAIN; + else { + ocfs2_log_dlm_error("dlmlock", status, + lockres); + ret = -EINVAL; + } + ocfs2_recover_from_dlm_error(lockres, 1); + goto out; + } + + mlog(0, "lock %s, successfull return from dlmlock\n", + lockres->l_name); + + /* At this point we've gone inside the dlm and need to + * complete our work regardless. */ + catch_signals = 0; + + /* wait for busy to clear and carry on */ + goto again; + } + + /* Ok, if we get here then we're good to go. */ + ocfs2_inc_holders(lockres, level); + + ret = 0; +unlock: + spin_unlock_irqrestore(&lockres->l_lock, flags); +out: + /* + * This is helping work around a lock inversion between the page lock + * and dlm locks. One path holds the page lock while calling aops + * which block acquiring dlm locks. The voting thread holds dlm + * locks while acquiring page locks while down converting data locks. + * This block is helping an aop path notice the inversion and back + * off to unlock its page lock before trying the dlm lock again. 
+ */ + if (wait && arg_flags & OCFS2_LOCK_NONBLOCK && + mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) { + wait = 0; + if (lockres_remove_mask_waiter(lockres, &mw)) + ret = -EAGAIN; + else + goto again; + } + if (wait) { + ret = ocfs2_wait_for_mask(&mw); + if (ret == 0) + goto again; + mlog_errno(ret); + } + + mlog_exit(ret); + return ret; +} + +static void ocfs2_cluster_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level) +{ + unsigned long flags; + + mlog_entry_void(); + spin_lock_irqsave(&lockres->l_lock, flags); + ocfs2_dec_holders(lockres, level); + ocfs2_vote_on_unlock(osb, lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + mlog_exit_void(); +} + +static int ocfs2_create_new_inode_lock(struct inode *inode, + struct ocfs2_lock_res *lockres) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); + lockres_or_flags(lockres, OCFS2_LOCK_LOCAL); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL); +} + +/* Grants us an EX lock on the data and metadata resources, skipping + * the normal cluster directory lookup. Use this ONLY on newly created + * inodes which other nodes can't possibly see, and which haven't been + * hashed in the inode hash yet. This can give us a good performance + * increase as it'll skip the network broadcast normally associated + * with creating a new lock resource. */ +int ocfs2_create_new_inode_locks(struct inode *inode) +{ + int ret; + + BUG_ON(!inode); + BUG_ON(!ocfs2_inode_is_new(inode)); + + mlog_entry_void(); + + mlog(0, "Inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno); + + /* NOTE: That we don't increment any of the holder counts, nor + * do we add anything to a journal handle. Since this is + * supposed to be a new inode which the cluster doesn't know + * about yet, there is no need to. 
As far as the LVB handling + * is concerned, this is basically like acquiring an EX lock + * on a resource which has an invalid one -- we'll set it + * valid when we release the EX. */ + + ret = ocfs2_create_new_inode_lock(inode, + &OCFS2_I(inode)->ip_rw_lockres); + if (ret) { + mlog_errno(ret); + goto bail; + } + + ret = ocfs2_create_new_inode_lock(inode, + &OCFS2_I(inode)->ip_meta_lockres); + if (ret) { + mlog_errno(ret); + goto bail; + } + + ret = ocfs2_create_new_inode_lock(inode, + &OCFS2_I(inode)->ip_data_lockres); + if (ret) { + mlog_errno(ret); + goto bail; + } + +bail: + mlog_exit(ret); + return ret; +} + +int ocfs2_rw_lock(struct inode *inode, int write) +{ + int status, level; + struct ocfs2_lock_res *lockres; + + BUG_ON(!inode); + + mlog_entry_void(); + + mlog(0, "inode %"MLFu64" take %s RW lock\n", + OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + lockres = &OCFS2_I(inode)->ip_rw_lockres; + + level = write ? LKM_EXMODE : LKM_PRMODE; + + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, + 0); + if (status < 0) + mlog_errno(status); + + mlog_exit(status); + return status; +} + +void ocfs2_rw_unlock(struct inode *inode, int write) +{ + int level = write ? LKM_EXMODE : LKM_PRMODE; + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; + + mlog_entry_void(); + + mlog(0, "inode %"MLFu64" drop %s RW lock\n", + OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + + mlog_exit_void(); +} + +int ocfs2_data_lock_full(struct inode *inode, + int write, + int arg_flags) +{ + int status = 0, level; + struct ocfs2_lock_res *lockres; + + BUG_ON(!inode); + + mlog_entry_void(); + + mlog(0, "inode %"MLFu64" take %s DATA lock\n", + OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + /* We'll allow faking a readonly data lock for + * rodevices. 
*/ + if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) { + if (write) { + status = -EROFS; + mlog_errno(status); + } + goto out; + } + + lockres = &OCFS2_I(inode)->ip_data_lockres; + + level = write ? LKM_EXMODE : LKM_PRMODE; + + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, + 0, arg_flags); + if (status < 0 && status != -EAGAIN) + mlog_errno(status); + +out: + mlog_exit(status); + return status; +} + +/* see ocfs2_meta_lock_with_page() */ +int ocfs2_data_lock_with_page(struct inode *inode, + int write, + struct page *page) +{ + int ret; + + ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK); + if (ret == -EAGAIN) { + unlock_page(page); + if (ocfs2_data_lock(inode, write) == 0) + ocfs2_data_unlock(inode, write); + ret = AOP_TRUNCATED_PAGE; + } + + return ret; +} + +static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int kick = 0; + + mlog_entry_void(); + + /* If we know that another node is waiting on our lock, kick + * the vote thread * pre-emptively when we reach a release + * condition. */ + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { + switch(lockres->l_blocking) { + case LKM_EXMODE: + if (!lockres->l_ex_holders && !lockres->l_ro_holders) + kick = 1; + break; + case LKM_PRMODE: + if (!lockres->l_ex_holders) + kick = 1; + break; + default: + BUG(); + } + } + + if (kick) + ocfs2_kick_vote_thread(osb); + + mlog_exit_void(); +} + +void ocfs2_data_unlock(struct inode *inode, + int write) +{ + int level = write ? LKM_EXMODE : LKM_PRMODE; + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres; + + mlog_entry_void(); + + mlog(0, "inode %"MLFu64" drop %s DATA lock\n", + OCFS2_I(inode)->ip_blkno, + write ? 
"EXMODE" : "PRMODE"); + + if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + + mlog_exit_void(); +} + +#define OCFS2_SEC_BITS 34 +#define OCFS2_SEC_SHIFT (64 - 34) +#define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1) + +/* LVB only has room for 64 bits of time here so we pack it for + * now. */ +static u64 ocfs2_pack_timespec(struct timespec *spec) +{ + u64 res; + u64 sec = spec->tv_sec; + u32 nsec = spec->tv_nsec; + + res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK); + + return res; +} + +/* Call this with the lockres locked. I am reasonably sure we don't + * need ip_lock in this function as anyone who would be changing those + * values is supposed to be blocked in ocfs2_meta_lock right now. */ +static void __ocfs2_stuff_meta_lvb(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; + struct ocfs2_meta_lvb *lvb; + + mlog_entry_void(); + + lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; + + lvb->lvb_version = cpu_to_be32(OCFS2_LVB_VERSION); + lvb->lvb_isize = cpu_to_be64(i_size_read(inode)); + lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters); + lvb->lvb_iuid = cpu_to_be32(inode->i_uid); + lvb->lvb_igid = cpu_to_be32(inode->i_gid); + lvb->lvb_imode = cpu_to_be16(inode->i_mode); + lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); + lvb->lvb_iatime_packed = + cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); + lvb->lvb_ictime_packed = + cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime)); + lvb->lvb_imtime_packed = + cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); + + mlog_meta_lvb(0, lockres); + + mlog_exit_void(); +} + +static void ocfs2_unpack_timespec(struct timespec *spec, + u64 packed_time) +{ + spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT; + spec->tv_nsec = packed_time & OCFS2_NSEC_MASK; +} + +static void ocfs2_refresh_inode_from_lvb(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); 
+ struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; + struct ocfs2_meta_lvb *lvb; + + mlog_entry_void(); + + mlog_meta_lvb(0, lockres); + + lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; + + /* We're safe here without the lockres lock... */ + spin_lock(&oi->ip_lock); + oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters); + i_size_write(inode, be64_to_cpu(lvb->lvb_isize)); + + /* fast-symlinks are a special case */ + if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) + inode->i_blocks = 0; + else + inode->i_blocks = + ocfs2_align_bytes_to_sectors(i_size_read(inode)); + + inode->i_uid = be32_to_cpu(lvb->lvb_iuid); + inode->i_gid = be32_to_cpu(lvb->lvb_igid); + inode->i_mode = be16_to_cpu(lvb->lvb_imode); + inode->i_nlink = be16_to_cpu(lvb->lvb_inlink); + ocfs2_unpack_timespec(&inode->i_atime, + be64_to_cpu(lvb->lvb_iatime_packed)); + ocfs2_unpack_timespec(&inode->i_mtime, + be64_to_cpu(lvb->lvb_imtime_packed)); + ocfs2_unpack_timespec(&inode->i_ctime, + be64_to_cpu(lvb->lvb_ictime_packed)); + spin_unlock(&oi->ip_lock); + + mlog_exit_void(); +} + +static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; + + if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION) + return 1; + return 0; +} + +/* Determine whether a lock resource needs to be refreshed, and + * arbitrate who gets to refresh it. + * + * 0 means no refresh needed. + * + * > 0 means you need to refresh this and you MUST call + * ocfs2_complete_lock_res_refresh afterwards. 
*/ +static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres) +{ + unsigned long flags; + int status = 0; + + mlog_entry_void(); + +refresh_check: + spin_lock_irqsave(&lockres->l_lock, flags); + if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto bail; + } + + if (lockres->l_flags & OCFS2_LOCK_REFRESHING) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ocfs2_wait_on_refreshing_lock(lockres); + goto refresh_check; + } + + /* Ok, I'll be the one to refresh this lock. */ + lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + status = 1; +bail: + mlog_exit(status); + return status; +} + +/* If status is non zero, I'll mark it as not being in refresh + * anymroe, but i won't clear the needs refresh flag. */ +static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres, + int status) +{ + unsigned long flags; + mlog_entry_void(); + + spin_lock_irqsave(&lockres->l_lock, flags); + lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING); + if (!status) + lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); + + mlog_exit_void(); +} + +/* may or may not return a bh if it went to disk. */ +static int ocfs2_meta_lock_update(struct inode *inode, + struct buffer_head **bh) +{ + int status = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_lock_res *lockres; + struct ocfs2_dinode *fe; + + mlog_entry_void(); + + spin_lock(&oi->ip_lock); + if (oi->ip_flags & OCFS2_INODE_DELETED) { + mlog(0, "Orphaned inode %"MLFu64" was deleted while we " + "were waiting on a lock. 
ip_flags = 0x%x\n", + oi->ip_blkno, oi->ip_flags); + spin_unlock(&oi->ip_lock); + status = -ENOENT; + goto bail; + } + spin_unlock(&oi->ip_lock); + + lockres = &oi->ip_meta_lockres; + + if (!ocfs2_should_refresh_lock_res(lockres)) + goto bail; + + /* This will discard any caching information we might have had + * for the inode metadata. */ + ocfs2_metadata_cache_purge(inode); + + /* will do nothing for inode types that don't use the extent + * map (directories, bitmap files, etc) */ + ocfs2_extent_map_trunc(inode, 0); + + if (ocfs2_meta_lvb_is_trustable(lockres)) { + mlog(0, "Trusting LVB on inode %"MLFu64"\n", + oi->ip_blkno); + ocfs2_refresh_inode_from_lvb(inode); + } else { + /* Boo, we have to go to disk. */ + /* read bh, cast, ocfs2_refresh_inode */ + status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno, + bh, OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto bail_refresh; + } + fe = (struct ocfs2_dinode *) (*bh)->b_data; + + /* This is a good chance to make sure we're not + * locking an invalid object. + * + * We bug on a stale inode here because we checked + * above whether it was wiped from disk. The wiping + * node provides a guarantee that we receive that + * message and can mark the inode before dropping any + * locks associated with it. 
*/ + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); + status = -EIO; + goto bail_refresh; + } + mlog_bug_on_msg(inode->i_generation != + le32_to_cpu(fe->i_generation), + "Invalid dinode %"MLFu64" disk generation: %u " + "inode->i_generation: %u\n", + oi->ip_blkno, le32_to_cpu(fe->i_generation), + inode->i_generation); + mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) || + !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)), + "Stale dinode %"MLFu64" dtime: %"MLFu64" " + "flags: 0x%x\n", oi->ip_blkno, + le64_to_cpu(fe->i_dtime), + le32_to_cpu(fe->i_flags)); + + ocfs2_refresh_inode(inode, fe); + } + + status = 0; +bail_refresh: + ocfs2_complete_lock_res_refresh(lockres, status); +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_assign_bh(struct inode *inode, + struct buffer_head **ret_bh, + struct buffer_head *passed_bh) +{ + int status; + + if (passed_bh) { + /* Ok, the update went to disk for us, use the + * returned bh. */ + *ret_bh = passed_bh; + get_bh(*ret_bh); + + return 0; + } + + status = ocfs2_read_block(OCFS2_SB(inode->i_sb), + OCFS2_I(inode)->ip_blkno, + ret_bh, + OCFS2_BH_CACHED, + inode); + if (status < 0) + mlog_errno(status); + + return status; +} + +/* + * returns < 0 error if the callback will never be called, otherwise + * the result of the lock will be communicated via the callback. + */ +int ocfs2_meta_lock_full(struct inode *inode, + struct ocfs2_journal_handle *handle, + struct buffer_head **ret_bh, + int ex, + int arg_flags) +{ + int status, level, dlm_flags, acquired; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *local_bh = NULL; + + BUG_ON(!inode); + + mlog_entry_void(); + + mlog(0, "inode %"MLFu64", take %s META lock\n", + OCFS2_I(inode)->ip_blkno, + ex ? "EXMODE" : "PRMODE"); + + status = 0; + acquired = 0; + /* We'll allow faking a readonly metadata lock for + * rodevices. 
*/ + if (ocfs2_is_hard_readonly(osb)) { + if (ex) + status = -EROFS; + goto bail; + } + + if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) + wait_event(osb->recovery_event, + ocfs2_node_map_is_empty(osb, &osb->recovery_map)); + + acquired = 0; + lockres = &OCFS2_I(inode)->ip_meta_lockres; + level = ex ? LKM_EXMODE : LKM_PRMODE; + dlm_flags = 0; + if (arg_flags & OCFS2_META_LOCK_NOQUEUE) + dlm_flags |= LKM_NOQUEUE; + + status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags); + if (status < 0) { + if (status != -EAGAIN && status != -EIOCBRETRY) + mlog_errno(status); + goto bail; + } + + /* Notify the error cleanup path to drop the cluster lock. */ + acquired = 1; + + /* We wait twice because a node may have died while we were in + * the lower dlm layers. The second time though, we've + * committed to owning this lock so we don't allow signals to + * abort the operation. */ + if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) + wait_event(osb->recovery_event, + ocfs2_node_map_is_empty(osb, &osb->recovery_map)); + + /* This is fun. The caller may want a bh back, or it may + * not. ocfs2_meta_lock_update definitely wants one in, but + * may or may not read one, depending on what's in the + * LVB. The result of all of this is that we've *only* gone to + * disk if we have to, so the complexity is worthwhile. 
*/ + status = ocfs2_meta_lock_update(inode, &local_bh); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + + if (ret_bh) { + status = ocfs2_assign_bh(inode, ret_bh, local_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + if (handle) { + status = ocfs2_handle_add_lock(handle, inode); + if (status < 0) + mlog_errno(status); + } + +bail: + if (status < 0) { + if (ret_bh && (*ret_bh)) { + brelse(*ret_bh); + *ret_bh = NULL; + } + if (acquired) + ocfs2_meta_unlock(inode, ex); + } + + if (local_bh) + brelse(local_bh); + + mlog_exit(status); + return status; +} + +/* + * This is working around a lock inversion between tasks acquiring DLM locks + * while holding a page lock and the vote thread which blocks dlm lock acquiry + * while acquiring page locks. + * + * ** These _with_page variantes are only intended to be called from aop + * methods that hold page locks and return a very specific *positive* error + * code that aop methods pass up to the VFS -- test for errors with != 0. ** + * + * The DLM is called such that it returns -EAGAIN if it would have blocked + * waiting for the vote thread. In that case we unlock our page so the vote + * thread can make progress. Once we've done this we have to return + * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up + * into the VFS who will then immediately retry the aop call. + * + * We do a blocking lock and immediate unlock before returning, though, so that + * the lock has a great chance of being cached on this node by the time the VFS + * calls back to retry the aop. This has a potential to livelock as nodes + * ping locks back and forth, but that's a risk we're willing to take to avoid + * the lock inversion simply. 
+ */ +int ocfs2_meta_lock_with_page(struct inode *inode, + struct ocfs2_journal_handle *handle, + struct buffer_head **ret_bh, + int ex, + struct page *page) +{ + int ret; + + ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex, + OCFS2_LOCK_NONBLOCK); + if (ret == -EAGAIN) { + unlock_page(page); + if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0) + ocfs2_meta_unlock(inode, ex); + ret = AOP_TRUNCATED_PAGE; + } + + return ret; +} + +void ocfs2_meta_unlock(struct inode *inode, + int ex) +{ + int level = ex ? LKM_EXMODE : LKM_PRMODE; + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; + + mlog_entry_void(); + + mlog(0, "inode %"MLFu64" drop %s META lock\n", + OCFS2_I(inode)->ip_blkno, + ex ? "EXMODE" : "PRMODE"); + + if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + + mlog_exit_void(); +} + +int ocfs2_super_lock(struct ocfs2_super *osb, + int ex) +{ + int status; + int level = ex ? LKM_EXMODE : LKM_PRMODE; + struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; + struct buffer_head *bh; + struct ocfs2_slot_info *si = osb->slot_info; + + mlog_entry_void(); + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* The super block lock path is really in the best position to + * know when resources covered by the lock need to be + * refreshed, so we do it here. 
Of course, making sense of + * everything is up to the caller :) */ + status = ocfs2_should_refresh_lock_res(lockres); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (status) { + bh = si->si_bh; + status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, + si->si_inode); + if (status == 0) + ocfs2_update_slot_info(si); + + ocfs2_complete_lock_res_refresh(lockres, status); + + if (status < 0) + mlog_errno(status); + } +bail: + mlog_exit(status); + return status; +} + +void ocfs2_super_unlock(struct ocfs2_super *osb, + int ex) +{ + int level = ex ? LKM_EXMODE : LKM_PRMODE; + struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; + + ocfs2_cluster_unlock(osb, lockres, level); +} + +int ocfs2_rename_lock(struct ocfs2_super *osb) +{ + int status; + struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0); + if (status < 0) + mlog_errno(status); + + return status; +} + +void ocfs2_rename_unlock(struct ocfs2_super *osb) +{ + struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; + + ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE); +} + +/* Reference counting of the dlm debug structure. We want this because + * open references on the debug inodes can live on after a mount, so + * we can't rely on the ocfs2_super to always exist. 
*/ +static void ocfs2_dlm_debug_free(struct kref *kref) +{ + struct ocfs2_dlm_debug *dlm_debug; + + dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt); + + kfree(dlm_debug); +} + +void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug) +{ + if (dlm_debug) + kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free); +} + +static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug) +{ + kref_get(&debug->d_refcnt); +} + +struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void) +{ + struct ocfs2_dlm_debug *dlm_debug; + + dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL); + if (!dlm_debug) { + mlog_errno(-ENOMEM); + goto out; + } + + kref_init(&dlm_debug->d_refcnt); + INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); + dlm_debug->d_locking_state = NULL; +out: + return dlm_debug; +} + +/* Access to this is arbitrated for us via seq_file->sem. */ +struct ocfs2_dlm_seq_priv { + struct ocfs2_dlm_debug *p_dlm_debug; + struct ocfs2_lock_res p_iter_res; + struct ocfs2_lock_res p_tmp_res; +}; + +static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start, + struct ocfs2_dlm_seq_priv *priv) +{ + struct ocfs2_lock_res *iter, *ret = NULL; + struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug; + + assert_spin_locked(&ocfs2_dlm_tracking_lock); + + list_for_each_entry(iter, &start->l_debug_list, l_debug_list) { + /* discover the head of the list */ + if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) { + mlog(0, "End of list found, %p\n", ret); + break; + } + + /* We track our "dummy" iteration lockres' by a NULL + * l_ops field. 
*/ + if (iter->l_ops != NULL) { + ret = iter; + break; + } + } + + return ret; +} + +static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos) +{ + struct ocfs2_dlm_seq_priv *priv = m->private; + struct ocfs2_lock_res *iter; + + spin_lock(&ocfs2_dlm_tracking_lock); + iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv); + if (iter) { + /* Since lockres' have the lifetime of their container + * (which can be inodes, ocfs2_supers, etc) we want to + * copy this out to a temporary lockres while still + * under the spinlock. Obviously after this we can't + * trust any pointers on the copy returned, but that's + * ok as the information we want isn't typically held + * in them. */ + priv->p_tmp_res = *iter; + iter = &priv->p_tmp_res; + } + spin_unlock(&ocfs2_dlm_tracking_lock); + + return iter; +} + +static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v) +{ +} + +static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ocfs2_dlm_seq_priv *priv = m->private; + struct ocfs2_lock_res *iter = v; + struct ocfs2_lock_res *dummy = &priv->p_iter_res; + + spin_lock(&ocfs2_dlm_tracking_lock); + iter = ocfs2_dlm_next_res(iter, priv); + list_del_init(&dummy->l_debug_list); + if (iter) { + list_add(&dummy->l_debug_list, &iter->l_debug_list); + priv->p_tmp_res = *iter; + iter = &priv->p_tmp_res; + } + spin_unlock(&ocfs2_dlm_tracking_lock); + + return iter; +} + +/* So that debugfs.ocfs2 can determine which format is being used */ +#define OCFS2_DLM_DEBUG_STR_VERSION 1 +static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) +{ + int i; + char *lvb; + struct ocfs2_lock_res *lockres = v; + + if (!lockres) + return -EINVAL; + + seq_printf(m, "0x%x\t" + "%.*s\t" + "%d\t" + "0x%lx\t" + "0x%x\t" + "0x%x\t" + "%u\t" + "%u\t" + "%d\t" + "%d\t", + OCFS2_DLM_DEBUG_STR_VERSION, + OCFS2_LOCK_ID_MAX_LEN, lockres->l_name, + lockres->l_level, + lockres->l_flags, + lockres->l_action, + lockres->l_unlock_action, + lockres->l_ro_holders, + 
lockres->l_ex_holders, + lockres->l_requested, + lockres->l_blocking); + + /* Dump the raw LVB */ + lvb = lockres->l_lksb.lvb; + for(i = 0; i < DLM_LVB_LEN; i++) + seq_printf(m, "0x%x\t", lvb[i]); + + /* End the line */ + seq_printf(m, "\n"); + return 0; +} + +static struct seq_operations ocfs2_dlm_seq_ops = { + .start = ocfs2_dlm_seq_start, + .stop = ocfs2_dlm_seq_stop, + .next = ocfs2_dlm_seq_next, + .show = ocfs2_dlm_seq_show, +}; + +static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = (struct seq_file *) file->private_data; + struct ocfs2_dlm_seq_priv *priv = seq->private; + struct ocfs2_lock_res *res = &priv->p_iter_res; + + ocfs2_remove_lockres_tracking(res); + ocfs2_put_dlm_debug(priv->p_dlm_debug); + return seq_release_private(inode, file); +} + +static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) +{ + int ret; + struct ocfs2_dlm_seq_priv *priv; + struct seq_file *seq; + struct ocfs2_super *osb; + + priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL); + if (!priv) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + osb = (struct ocfs2_super *) inode->u.generic_ip; + ocfs2_get_dlm_debug(osb->osb_dlm_debug); + priv->p_dlm_debug = osb->osb_dlm_debug; + INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); + + ret = seq_open(file, &ocfs2_dlm_seq_ops); + if (ret) { + kfree(priv); + mlog_errno(ret); + goto out; + } + + seq = (struct seq_file *) file->private_data; + seq->private = priv; + + ocfs2_add_lockres_tracking(&priv->p_iter_res, + priv->p_dlm_debug); + +out: + return ret; +} + +static struct file_operations ocfs2_dlm_debug_fops = { + .open = ocfs2_dlm_debug_open, + .release = ocfs2_dlm_debug_release, + .read = seq_read, + .llseek = seq_lseek, +}; + +static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) +{ + int ret = 0; + struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; + + dlm_debug->d_locking_state = debugfs_create_file("locking_state", + S_IFREG|S_IRUSR, + 
osb->osb_debug_root,
+							 osb,
+							 &ocfs2_dlm_debug_fops);
+	if (!dlm_debug->d_locking_state) {
+		ret = -EINVAL;
+		mlog(ML_ERROR,
+		     "Unable to create locking state debugfs file.\n");
+		goto out;
+	}
+
+	ocfs2_get_dlm_debug(dlm_debug);
+out:
+	return ret;
+}
+
+/*
+ * Remove the locking-state debugfs file and drop a reference on
+ * osb->osb_dlm_debug. Does nothing when no debug state is attached.
+ */
+static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
+{
+	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
+
+	if (dlm_debug) {
+		debugfs_remove(dlm_debug->d_locking_state);
+		ocfs2_put_dlm_debug(dlm_debug);
+	}
+}
+
+/*
+ * Set up cluster locking for a mount: create the debugfs state, start
+ * the vote thread, register with the DLM domain named by osb->uuid_str,
+ * initialise the superblock and rename lock resources and register the
+ * eviction callback.
+ *
+ * Returns 0 with osb->dlm set on success. On failure the debug state
+ * and the vote thread are torn down again and a negative errno is
+ * returned.
+ */
+int ocfs2_dlm_init(struct ocfs2_super *osb)
+{
+	int status;
+	u32 dlm_key;
+	struct dlm_ctxt *dlm;
+
+	mlog_entry_void();
+
+	status = ocfs2_dlm_init_debug(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* launch vote thread */
+	osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d",
+				     osb->osb_id);
+	if (IS_ERR(osb->vote_task)) {
+		status = PTR_ERR(osb->vote_task);
+		osb->vote_task = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* used by the dlm code to make message headers unique, each
+	 * node in this domain must agree on this. */
+	dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
+
+	/* for now, uuid == domain */
+	dlm = dlm_register_domain(osb->uuid_str, dlm_key);
+	if (IS_ERR(dlm)) {
+		status = PTR_ERR(dlm);
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
+	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
+
+	dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
+
+	osb->dlm = dlm;
+
+	status = 0;
+bail:
+	if (status < 0) {
+		ocfs2_dlm_shutdown_debug(osb);
+		if (osb->vote_task) {
+			kthread_stop(osb->vote_task);
+			/* Don't leave a stale task pointer behind:
+			 * ocfs2_dlm_shutdown() stops whatever
+			 * osb->vote_task still points at. */
+			osb->vote_task = NULL;
+		}
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Undo ocfs2_dlm_init(): unregister the eviction callback, drop the
+ * osb-wide locks, stop the vote thread, free the super and rename lock
+ * resources, leave the DLM domain and remove the debug state.
+ */
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
+{
+	mlog_entry_void();
+
+	dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
+
+	ocfs2_drop_osb_locks(osb);
+
+	if (osb->vote_task) {
+		kthread_stop(osb->vote_task);
+		osb->vote_task = NULL;
+	}
+
+	ocfs2_lock_res_free(&osb->osb_super_lockres);
+	ocfs2_lock_res_free(&osb->osb_rename_lockres);
+
+	dlm_unregister_domain(osb->dlm);
+	osb->dlm = NULL;
+
+	ocfs2_dlm_shutdown_debug(osb);
+
+	mlog_exit_void();
+}
+
+/*
+ * Unlock AST handler; @opaque is the lockres, @status the DLM result.
+ * The pending operation is identified by lockres->l_unlock_action
+ * (convert cancel or lock drop). On DLM_NORMAL the action is retired,
+ * OCFS2_LOCK_BUSY is cleared and waiters on l_event are woken. Any
+ * other status, apart from the cancel-already-granted case below, is
+ * only logged and the lockres state is left as it was.
+ */
+static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
+	     lockres->l_unlock_action);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	/* We tried to cancel a convert request, but it was already
+	 * granted. All we want to do here is clear our unlock
+	 * state. The wake_up call done at the bottom is redundant
+	 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
+	 * hurt anything anyway */
+	if (status == DLM_CANCELGRANT &&
+	    lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
+		mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
+
+		/* We don't clear the busy flag in this case as it
+		 * should have been cleared by the ast which the dlm
+		 * has called. */
+		goto complete_unlock;
+	}
+
+	if (status != DLM_NORMAL) {
+		mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
+		     "unlock_action %d\n", status, lockres->l_name,
+		     lockres->l_unlock_action);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		return;
+	}
+
+	switch(lockres->l_unlock_action) {
+	case OCFS2_UNLOCK_CANCEL_CONVERT:
+		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
+		lockres->l_action = OCFS2_AST_INVALID;
+		break;
+	case OCFS2_UNLOCK_DROP_LOCK:
+		lockres->l_level = LKM_IVMODE;
+		break;
+	default:
+		BUG();
+	}
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+complete_unlock:
+	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	wake_up(&lockres->l_event);
+
+	mlog_exit_void();
+}
+
+/* Optional hook run by ocfs2_drop_lock(), with l_lock held, before the
+ * lock is handed back to the DLM. */
+typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);
+
+struct drop_lock_cb {
+	ocfs2_pre_drop_cb_t	*drop_func;
+	void			*drop_data;
+};
+
+/*
+ * Release @lockres back to the DLM. The lockres must already be marked
+ * OCFS2_LOCK_FREEING. Waits out any in-flight (BUSY) operation, runs
+ * the optional @dcb hook under l_lock and, if the lock is attached,
+ * issues dlmunlock(LKM_VALBLK) and waits for the lock to go idle again.
+ * A dlmunlock() failure is fatal (BUG). Always returns 0.
+ */
+static int ocfs2_drop_lock(struct ocfs2_super *osb,
+			   struct ocfs2_lock_res *lockres,
+			   struct drop_lock_cb *dcb)
+{
+	enum dlm_status status;
+	unsigned long flags;
+
+	/* We didn't get anywhere near actually using this lockres. */
+	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
+		goto out;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
+			"lockres %s, flags 0x%lx\n",
+			lockres->l_name, lockres->l_flags);
+
+	while (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
+		     "%u, unlock_action = %u\n",
+		     lockres->l_name, lockres->l_flags, lockres->l_action,
+		     lockres->l_unlock_action);
+
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		/* XXX: Today we just wait on any busy
+		 * locks... Perhaps we need to cancel converts in the
+		 * future? */
+		ocfs2_wait_on_busy_lock(lockres);
+
+		spin_lock_irqsave(&lockres->l_lock, flags);
+	}
+
+	if (dcb)
+		dcb->drop_func(lockres, dcb->drop_data);
+
+	if (lockres->l_flags & OCFS2_LOCK_BUSY)
+		mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
+		     lockres->l_name);
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+		mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
+
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto out;
+	}
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
+
+	/* make sure we never get here while waiting for an ast to
+	 * fire. */
+	BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
+
+	/* is this necessary? */
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	mlog(0, "lock %s\n", lockres->l_name);
+
+	status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK,
+			   lockres->l_ops->unlock_ast, lockres);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmunlock", status, lockres);
+		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
+		dlm_print_one_lock(lockres->l_lksb.lockid);
+		BUG();
+	}
+	mlog(0, "lock %s, successfull return from dlmunlock\n",
+	     lockres->l_name);
+
+	ocfs2_wait_on_busy_lock(lockres);
+out:
+	mlog_exit(0);
+	return 0;
+}
+
+/* Mark the lockres as being dropped. It will no longer be
+ * queued if blocking, but we still may have to wait on it
+ * being dequeued from the vote thread before we can consider
+ * it safe to drop.
+ *
+ * You can *not* attempt to call cluster_lock on this lockres anymore. 
*/
+void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
+{
+	int status;
+	struct ocfs2_mask_waiter mw;
+	unsigned long flags;
+
+	ocfs2_init_mask_waiter(&mw);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	lockres->l_flags |= OCFS2_LOCK_FREEING;
+	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		mlog(0, "Waiting on lockres %s\n", lockres->l_name);
+
+		status = ocfs2_wait_for_mask(&mw);
+		if (status)
+			mlog_errno(status);
+
+		spin_lock_irqsave(&lockres->l_lock, flags);
+	}
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+}
+
+/*
+ * Mark the superblock and rename lock resources as freeing and drop
+ * each of them. Failures are only logged.
+ */
+static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
+{
+	int status;
+
+	mlog_entry_void();
+
+	ocfs2_mark_lockres_freeing(&osb->osb_super_lockres);
+
+	status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres);
+
+	status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+}
+
+/*
+ * Pre-drop hook for the metadata lock (see ocfs2_drop_inode_locks()).
+ * @data is the inode owning @lockres.
+ */
+static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data)
+{
+	struct inode *inode = data;
+
+	/* the metadata lock requires a bit more work as we have an
+	 * LVB to worry about. */
+	if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
+	    lockres->l_level == LKM_EXMODE &&
+	    !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
+		__ocfs2_stuff_meta_lvb(inode);
+}
+
+/*
+ * Drop the data, meta and rw lock resources of @inode. All three drops
+ * are always attempted. Returns the first negative error reported by
+ * ocfs2_drop_lock(), otherwise the result of the data lock drop.
+ */
+int ocfs2_drop_inode_locks(struct inode *inode)
+{
+	int status, err;
+	struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, };
+
+	mlog_entry_void();
+
+	/* No need to call ocfs2_mark_lockres_freeing here -
+	 * ocfs2_clear_inode has done it for us. */
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_data_lockres,
+			      NULL);
+	if (err < 0)
+		mlog_errno(err);
+
+	status = err;
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_meta_lockres,
+			      &meta_dcb);
+	if (err < 0)
+		mlog_errno(err);
+	if (err < 0 && !status)
+		status = err;
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_rw_lockres,
+			      NULL);
+	if (err < 0)
+		mlog_errno(err);
+	if (err < 0 && !status)
+		status = err;
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Record that a downconvert to @new_level is about to be issued: sets
+ * l_action, l_requested and OCFS2_LOCK_BUSY. Caller must hold l_lock.
+ * BUGs unless something above NL is being blocked and @new_level is
+ * strictly below the current level.
+ */
+static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+				      int new_level)
+{
+	assert_spin_locked(&lockres->l_lock);
+
+	BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+
+	if (lockres->l_level <= new_level) {
+		mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
+		     lockres->l_level, new_level);
+		BUG();
+	}
+
+	mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
+	     lockres->l_name, new_level, lockres->l_blocking);
+
+	lockres->l_action = OCFS2_AST_DOWNCONVERT;
+	lockres->l_requested = new_level;
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+}
+
+/*
+ * Issue the dlmlock() convert of @lockres down to @new_level. A nonzero
+ * @lvb adds LKM_VALBLK to the request. Returns 0, or -EINVAL after
+ * logging the DLM error and calling ocfs2_recover_from_dlm_error().
+ */
+static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
+				  struct ocfs2_lock_res *lockres,
+				  int new_level,
+				  int lvb)
+{
+	int ret, dlm_flags = LKM_CONVERT;
+	enum dlm_status status;
+
+	mlog_entry_void();
+
+	if (lvb)
+		dlm_flags |= LKM_VALBLK;
+
+	status = dlmlock(osb->dlm,
+			 new_level,
+			 &lockres->l_lksb,
+			 dlm_flags,
+			 lockres->l_name,
+			 lockres->l_ops->ast,
+			 lockres,
+			 lockres->l_ops->bast);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmlock", status, lockres);
+		ret = -EINVAL;
+		ocfs2_recover_from_dlm_error(lockres, 1);
+		goto bail;
+	}
+
+	ret = 0;
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+/* returns 1 when the caller should unlock and call dlmunlock */
+static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
+				        struct ocfs2_lock_res *lockres)
+{
+	assert_spin_locked(&lockres->l_lock);
+
+	mlog_entry_void();
+	mlog(0, "lock %s\n", lockres->l_name);
+
+	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
+		/* If we're already trying to cancel a lock conversion
+		 * then just drop the spinlock and allow the caller to
+		 * requeue this lock. */
+
+		mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
+		return 0;
+	}
+
+	/* were we in a convert when we got the bast fire? */
+	BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
+	       lockres->l_action != OCFS2_AST_DOWNCONVERT);
+	/* set things up for the unlockast to know to just
+	 * clear out the ast_action and unset busy, etc. */
+	lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;
+
+	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
+			"lock %s, invalid flags: 0x%lx\n",
+			lockres->l_name, lockres->l_flags);
+
+	return 1;
+}
+
+/*
+ * Ask the DLM to cancel the in-flight convert on @lockres with
+ * dlmunlock(LKM_CANCEL). Returns 0, or -EINVAL after logging the DLM
+ * error and calling ocfs2_recover_from_dlm_error().
+ */
+static int ocfs2_cancel_convert(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres)
+{
+	int ret;
+	enum dlm_status status;
+
+	mlog_entry_void();
+	mlog(0, "lock %s\n", lockres->l_name);
+
+	ret = 0;
+	status = dlmunlock(osb->dlm,
+			   &lockres->l_lksb,
+			   LKM_CANCEL,
+			   lockres->l_ops->unlock_ast,
+			   lockres);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmunlock", status, lockres);
+		ret = -EINVAL;
+		ocfs2_recover_from_dlm_error(lockres, 0);
+	}
+
+	mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * Returns nonzero when the metadata lock of @inode may be downconverted
+ * to @new_level (PR or NL only): never while the lockres is being
+ * refreshed, and only once there are no conflicting holders and the
+ * inode is fully checkpointed.
+ */
+static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
+						  struct ocfs2_lock_res *lockres,
+						  int new_level)
+{
+	int ret;
+
+	mlog_entry_void();
+
+	BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
+
+	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
+		ret = 0;
+		mlog(0, "lockres %s currently being refreshed -- backing "
+		     "off!\n", lockres->l_name);
+	} else if (new_level == LKM_PRMODE)
+		ret = !lockres->l_ex_holders &&
+			ocfs2_inode_fully_checkpointed(inode);
+	else /* Must be NLMODE we're converting to. */
+		ret = !lockres->l_ro_holders && !lockres->l_ex_holders &&
+			ocfs2_inode_fully_checkpointed(inode);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * Unblock handler body for the metadata lock of @inode. Cancels an
+ * in-flight convert if the lock is busy, downconverts when
+ * ocfs2_can_downconvert_meta_lock() allows it (stuffing the LVB first
+ * when going down from EX with fresh inode values), and otherwise
+ * starts a checkpoint if one is needed. *requeue is set to 1 on the
+ * paths that need another pass and is not written on the downconvert
+ * path.
+ */
+static int ocfs2_do_unblock_meta(struct inode *inode,
+				 int *requeue)
+{
+	int new_level;
+	int set_lvb = 0;
+	int ret = 0;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
+	unsigned long flags;
+
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry_void();
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+
+	mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level,
+	     lockres->l_blocking);
+
+	BUG_ON(lockres->l_level != LKM_EXMODE &&
+	       lockres->l_level != LKM_PRMODE);
+
+	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		*requeue = 1;
+		ret = ocfs2_prepare_cancel_convert(osb, lockres);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		if (ret) {
+			ret = ocfs2_cancel_convert(osb, lockres);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
+		goto leave;
+	}
+
+	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
+
+	mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n",
+	     lockres->l_level, lockres->l_blocking, new_level);
+
+	if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
+		if (lockres->l_level == LKM_EXMODE)
+			set_lvb = 1;
+
+		/* If the lock hasn't been refreshed yet (rare), then
+		 * our memory inode values are old and we skip
+		 * stuffing the lvb. There's no need to actually clear
+		 * out the lvb here as its value is still valid. */
+		if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
+			if (set_lvb)
+				__ocfs2_stuff_meta_lvb(inode);
+		} else
+			mlog(0, "lockres %s: downconverting stale lock!\n",
+			     lockres->l_name);
+
+		mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
+		     "l_blocking=%d, new_level=%d\n",
+		     lockres->l_level, lockres->l_blocking, new_level);
+
+		ocfs2_prepare_downconvert(lockres, new_level);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
+		goto leave;
+	}
+	if (!ocfs2_inode_fully_checkpointed(inode))
+		ocfs2_start_checkpoint(osb);
+
+	*requeue = 1;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	ret = 0;
+leave:
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * Common unblock logic for lock resources without an LVB. Cancels an
+ * in-flight convert if the lock is busy, requeues while incompatible
+ * holders remain, and otherwise runs the optional @worker (with l_lock
+ * dropped) and downconverts to the highest level compatible with what
+ * is being blocked. *requeue tells the caller whether the lockres needs
+ * another pass.
+ */
+static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
+				      struct ocfs2_lock_res *lockres,
+				      int *requeue,
+				      ocfs2_convert_worker_t *worker)
+{
+	unsigned long flags;
+	int blocking;
+	int new_level;
+	int ret = 0;
+
+	mlog_entry_void();
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+
+recheck:
+	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		*requeue = 1;
+		ret = ocfs2_prepare_cancel_convert(osb, lockres);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		if (ret) {
+			ret = ocfs2_cancel_convert(osb, lockres);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
+		goto leave;
+	}
+
+	/* if we're blocking an exclusive and we have *any* holders,
+	 * then requeue. */
+	if ((lockres->l_blocking == LKM_EXMODE)
+	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		*requeue = 1;
+		ret = 0;
+		goto leave;
+	}
+
+	/* If it's a PR we're blocking, then only
+	 * requeue if we've got any EX holders */
+	if (lockres->l_blocking == LKM_PRMODE &&
+	    lockres->l_ex_holders) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		*requeue = 1;
+		ret = 0;
+		goto leave;
+	}
+
+	/* If we get here, then we know that there are no more
+	 * incompatible holders (and anyone asking for an incompatible
+	 * lock is blocked). We can now downconvert the lock */
+	if (!worker)
+		goto downconvert;
+
+	/* Some lockres types want to do a bit of work before
+	 * downconverting a lock. Allow that here. The worker function
+	 * may sleep, so we save off a copy of what we're blocking as
+	 * it may change while we're not holding the spin lock. */
+	blocking = lockres->l_blocking;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	worker(lockres, blocking);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (blocking != lockres->l_blocking) {
+		/* If this changed underneath us, then we can't drop
+		 * it just yet. */
+		goto recheck;
+	}
+
+downconvert:
+	*requeue = 0;
+	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
+
+	ocfs2_prepare_downconvert(lockres, new_level);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0);
+leave:
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * Convert worker for the data lock: start writeback of the inode's
+ * mapping and sync its buffers. When an EX request is being blocked the
+ * cached pages are also truncated and unmapped; otherwise we only wait
+ * for the I/O to finish.
+ */
+static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
+				      int blocking)
+{
+	struct inode *inode;
+	struct address_space *mapping;
+
+	mlog_entry_void();
+
+	inode = ocfs2_lock_res_inode(lockres);
+	mapping = inode->i_mapping;
+
+	if (filemap_fdatawrite(mapping)) {
+		/* newline-terminated like every other message here */
+		mlog(ML_ERROR, "Could not sync inode %"MLFu64" for downconvert!\n",
+		     OCFS2_I(inode)->ip_blkno);
+	}
+	sync_mapping_buffers(mapping);
+	if (blocking == LKM_EXMODE) {
+		truncate_inode_pages(mapping, 0);
+		unmap_mapping_range(mapping, 0, 0, 0);
+	} else {
+		/* We only need to wait on the I/O if we're not also
+		 * truncating pages because truncate_inode_pages waits
+		 * for us above. We don't truncate pages if we're
+		 * blocking anything < EXMODE because we want to keep
+		 * them around in that case. */
+		filemap_fdatawait(mapping);
+	}
+
+	mlog_exit_void();
+}
+
+/*
+ * Unblock handler for an inode data lock: the generic unblock logic
+ * with ocfs2_data_convert_worker() as the pre-downconvert worker.
+ */
+int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
+		       int *requeue)
+{
+	int status;
+	struct inode *inode;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	inode = ocfs2_lock_res_inode(lockres);
+	osb = OCFS2_SB(inode->i_sb);
+
+	mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	status = ocfs2_generic_unblock_lock(osb,
+					    lockres,
+					    requeue,
+					    ocfs2_data_convert_worker);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog(0, "inode %"MLFu64", requeue = %d\n",
+	     OCFS2_I(inode)->ip_blkno, *requeue);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Unblock handler for an inode-backed lockres that needs no work before
+ * the downconvert (generic unblock logic, no worker).
+ */
+static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
+				    int *requeue)
+{
+	int status;
+	struct inode *inode;
+
+	mlog_entry_void();
+
+	mlog(0, "Unblock lockres %s\n", lockres->l_name);
+
+	inode = ocfs2_lock_res_inode(lockres);
+
+	status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb),
+					    lockres,
+					    requeue,
+					    NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+	return status;
+}
+
+
+/*
+ * Unblock handler for an inode metadata lock; a logging wrapper around
+ * ocfs2_do_unblock_meta().
+ */
+int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
+		       int *requeue)
+{
+	int status;
+	struct inode *inode;
+
+	mlog_entry_void();
+
+	inode = ocfs2_lock_res_inode(lockres);
+
+	mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	status = ocfs2_do_unblock_meta(inode, requeue);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog(0, "inode %"MLFu64", requeue = %d\n",
+	     OCFS2_I(inode)->ip_blkno, *requeue);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Generic unblock function for any lockres whose private data is an
+ * ocfs2_super pointer. 
*/
+static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
+				  int *requeue)
+{
+	int status;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	mlog(0, "Unblock lockres %s\n", lockres->l_name);
+
+	osb = ocfs2_lock_res_super(lockres);
+
+	status = ocfs2_generic_unblock_lock(osb,
+					    lockres,
+					    requeue,
+					    NULL); /* no convert worker */
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+	return status;
+}
+/*
+ * Service one blocked lockres: run its l_ops->unblock() handler, then
+ * either requeue it through ocfs2_schedule_blocked_lock() or clear
+ * OCFS2_LOCK_QUEUED. A lockres marked OCFS2_LOCK_FREEING is never
+ * requeued. An error from the unblock handler is only logged.
+ */
+void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres)
+{
+	int status;
+	int requeue = 0;
+	unsigned long flags;
+
+	/* Our reference to the lockres in this function can be
+	 * considered valid until we remove the OCFS2_LOCK_QUEUED
+	 * flag. */
+
+	mlog_entry_void();
+
+	BUG_ON(!lockres);
+	BUG_ON(!lockres->l_ops);
+	BUG_ON(!lockres->l_ops->unblock);
+
+	mlog(0, "lockres %s blocked.\n", lockres->l_name);
+
+	/* Detect whether a lock has been marked as going away while
+	 * the vote thread was processing other things. A lock can
+	 * still be marked with OCFS2_LOCK_FREEING after this check,
+	 * but short circuiting here will still save us some
+	 * performance. */
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (lockres->l_flags & OCFS2_LOCK_FREEING)
+		goto unqueue; /* jumps with l_lock still held */
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	status = lockres->l_ops->unblock(lockres, &requeue);
+	if (status < 0)
+		mlog_errno(status);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+unqueue:
+	if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) {
+		lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
+	} else
+		ocfs2_schedule_blocked_lock(osb, lockres);
+
+	mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
+	     requeue ? "yes" : "no");
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	mlog_exit_void();
+}
+/*
+ * Queue @lockres on osb->blocked_lock_list. The caller must hold
+ * lockres->l_lock (asserted). Sets OCFS2_LOCK_QUEUED and, under
+ * osb->vote_task_lock, appends the lockres to the list when its
+ * l_blocked_list entry is empty, bumping blocked_lock_count. A lockres
+ * marked OCFS2_LOCK_FREEING is left alone.
+ */
+static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
+					struct ocfs2_lock_res *lockres)
+{
+	mlog_entry_void();
+
+	assert_spin_locked(&lockres->l_lock);
+
+	if (lockres->l_flags & OCFS2_LOCK_FREEING) {
+		/* Do not schedule a lock for downconvert when it's on
+		 * the way to destruction - any nodes wanting access
+		 * to the resource will get it soon. */
+		mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
+		     lockres->l_name, lockres->l_flags);
+		return;
+	}
+
+	lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
+
+	spin_lock(&osb->vote_task_lock);
+	if (list_empty(&lockres->l_blocked_list)) {
+		list_add_tail(&lockres->l_blocked_list,
+			      &osb->blocked_lock_list);
+		osb->blocked_lock_count++;
+	}
+	spin_unlock(&osb->vote_task_lock);
+
+	mlog_exit_void();
+}
+
+/* This aids in debugging situations where a bad LVB might be involved.
+ * Logs, at mask @level, every field of the metadata LVB held in
+ * lockres->l_lksb.lvb, converted from big-endian. @function and @line
+ * identify the call site in the log output. */
+void ocfs2_dump_meta_lvb_info(u64 level,
+			      const char *function,
+			      unsigned int line,
+			      struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+	mlog(level, "LVB information for %s (called from %s:%u):\n",
+	     lockres->l_name, function, line);
+	mlog(level, "version: %u, clusters: %u\n",
+	     be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters));
+	mlog(level, "size: %"MLFu64", uid %u, gid %u, mode 0x%x\n",
+	     be64_to_cpu(lvb->lvb_isize), be32_to_cpu(lvb->lvb_iuid),
+	     be32_to_cpu(lvb->lvb_igid), be16_to_cpu(lvb->lvb_imode));
+	mlog(level, "nlink %u, atime_packed 0x%"MLFx64", "
+	     "ctime_packed 0x%"MLFx64", mtime_packed 0x%"MLFx64"\n",
+	     be16_to_cpu(lvb->lvb_inlink), be64_to_cpu(lvb->lvb_iatime_packed),
+	     be64_to_cpu(lvb->lvb_ictime_packed),
+	     be64_to_cpu(lvb->lvb_imtime_packed));
+}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
new file mode 100644
index 00000000000..8f2d1db2d9e
--- /dev/null
+++ b/fs/ocfs2/dlmglue.h
@@ -0,0 +1,111 @@
+/* -*- mode: c; c-basic-offset: 
8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmglue.h
+ *
+ * Cluster locking interface of OCFS2: layout of the metadata lock
+ * value block, lock request flags, and the prototypes for taking and
+ * dropping the data, rw, meta, super and rename locks.
+ *
+ * Copyright (C) 2002, 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+
+#ifndef DLMGLUE_H
+#define DLMGLUE_H
+
+#define OCFS2_LVB_VERSION 2
+/*
+ * Contents of the lock value block carried by the inode metadata lock.
+ * Every field is stored big-endian; ocfs2_dump_meta_lvb_info() prints
+ * them as version, clusters, uid, gid, packed atime/ctime/mtime, size,
+ * mode and nlink.
+ */
+struct ocfs2_meta_lvb {
+	__be32       lvb_version;
+	__be32       lvb_iclusters;
+	__be32       lvb_iuid;
+	__be32       lvb_igid;
+	__be64       lvb_iatime_packed;
+	__be64       lvb_ictime_packed;
+	__be64       lvb_imtime_packed;
+	__be64       lvb_isize;
+	__be16       lvb_imode;
+	__be16       lvb_inlink;
+	__be32       lvb_reserved[3];
+};
+
+/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
+/* don't wait on recovery. */
+#define OCFS2_META_LOCK_RECOVERY	(0x01)
+/* Instruct the dlm not to queue ourselves on the other node. */
+#define OCFS2_META_LOCK_NOQUEUE		(0x02)
+/* don't block waiting for the vote thread, instead return -EAGAIN */
+#define OCFS2_LOCK_NONBLOCK		(0x04)
+
+int ocfs2_dlm_init(struct ocfs2_super *osb);
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb);
+void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
+void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
+			       enum ocfs2_lock_type type,
+			       struct inode *inode);
+void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
+int ocfs2_create_new_inode_locks(struct inode *inode);
+int ocfs2_drop_inode_locks(struct inode *inode);
+int ocfs2_data_lock_full(struct inode *inode,
+			 int write,
+			 int arg_flags);
+#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
+int ocfs2_data_lock_with_page(struct inode *inode,
+			      int write,
+			      struct page *page);
+void ocfs2_data_unlock(struct inode *inode,
+		       int write);
+int ocfs2_rw_lock(struct inode *inode, int write);
+void ocfs2_rw_unlock(struct inode *inode, int write);
+int ocfs2_meta_lock_full(struct inode *inode,
+			 struct ocfs2_journal_handle *handle,
+			 struct buffer_head **ret_bh,
+			 int ex,
+			 int arg_flags);
+int ocfs2_meta_lock_with_page(struct inode *inode,
+			      struct ocfs2_journal_handle *handle,
+			      struct buffer_head **ret_bh,
+			      int ex,
+			      struct page *page);
+/* 99% of the time we don't want to supply any additional flags --
+ * those are for very specific cases only. */
+#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0)
+void ocfs2_meta_unlock(struct inode *inode,
+		       int ex);
+int ocfs2_super_lock(struct ocfs2_super *osb,
+		     int ex);
+void ocfs2_super_unlock(struct ocfs2_super *osb,
+			int ex);
+int ocfs2_rename_lock(struct ocfs2_super *osb);
+void ocfs2_rename_unlock(struct ocfs2_super *osb);
+void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
+
+/* for the vote thread */
+void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres);
+
+struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
+void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
+
+/* aids in debugging and tracking lvbs; mlog_meta_lvb() fills in the
+ * calling function and line automatically */
+void ocfs2_dump_meta_lvb_info(u64 level,
+			      const char *function,
+			      unsigned int line,
+			      struct ocfs2_lock_res *lockres);
+#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
+
+#endif	/* DLMGLUE_H */
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
new file mode 100644
index 00000000000..f226b220762
--- /dev/null
+++ b/fs/ocfs2/endian.h
@@ -0,0 +1,45 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA. 
+ */
+
+#ifndef OCFS2_ENDIAN_H
+#define OCFS2_ENDIAN_H
+
+/* Add @val to the little-endian 16-bit value at @var, in place. */
+static inline void le16_add_cpu(__le16 *var, u16 val)
+{
+	u16 sum = le16_to_cpu(*var) + val;
+
+	*var = cpu_to_le16(sum);
+}
+
+/* Add @val to the little-endian 32-bit value at @var, in place. */
+static inline void le32_add_cpu(__le32 *var, u32 val)
+{
+	u32 sum = le32_to_cpu(*var) + val;
+
+	*var = cpu_to_le32(sum);
+}
+
+/* AND the little-endian 32-bit value at @var with @val, in place. */
+static inline void le32_and_cpu(__le32 *var, u32 val)
+{
+	u32 masked = le32_to_cpu(*var) & val;
+
+	*var = cpu_to_le32(masked);
+}
+
+/* Add @val to the big-endian 32-bit value at @var, in place. */
+static inline void be32_add_cpu(__be32 *var, u32 val)
+{
+	u32 sum = be32_to_cpu(*var) + val;
+
+	*var = cpu_to_be32(sum);
+}
+
+#endif /* OCFS2_ENDIAN_H */
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
new file mode 100644
index 00000000000..5810160d92a
--- /dev/null
+++ b/fs/ocfs2/export.c
@@ -0,0 +1,248 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * export.c
+ *
+ * Functions to facilitate NFS exporting
+ *
+ * Copyright (C) 2002, 2005 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA. 
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+
+#define MLOG_MASK_PREFIX ML_EXPORT
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dir.h"
+#include "dlmglue.h"
+#include "export.h"
+#include "inode.h"
+
+#include "buffer_head_io.h"
+
+/* Identity of an inode as carried in a file handle. */
+struct ocfs2_inode_handle
+{
+	u64 ih_blkno;
+	u32 ih_generation;
+};
+
+/*
+ * Turn the ocfs2_inode_handle at @vobjp into a dentry.
+ *
+ * Returns ERR_PTR(-ESTALE) for a zero block number or a generation
+ * mismatch, the error from ocfs2_iget() if the inode cannot be read,
+ * and ERR_PTR(-ENOMEM) if no dentry can be allocated.
+ */
+static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
+{
+	struct ocfs2_inode_handle *handle = vobjp;
+	struct inode *inode;
+	struct dentry *result;
+
+	mlog_entry("(0x%p, 0x%p)\n", sb, handle);
+
+	if (handle->ih_blkno == 0) {
+		mlog_errno(-ESTALE);
+		return ERR_PTR(-ESTALE);
+	}
+
+	inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno);
+
+	if (IS_ERR(inode)) {
+		mlog_errno(PTR_ERR(inode));
+		return (void *)inode;
+	}
+
+	if (handle->ih_generation != inode->i_generation) {
+		iput(inode);
+		mlog_errno(-ESTALE);
+		return ERR_PTR(-ESTALE);
+	}
+
+	result = d_alloc_anon(inode);
+
+	if (!result) {
+		iput(inode);
+		mlog_errno(-ENOMEM);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	mlog_exit_ptr(result);
+	return result;
+}
+
+/*
+ * Find the parent of directory @child by looking up ".." on disk while
+ * holding the directory's meta lock. Returns a dentry for the parent or
+ * an ERR_PTR (the lock error, -ENOENT, -EACCES or -ENOMEM).
+ */
+static struct dentry *ocfs2_get_parent(struct dentry *child)
+{
+	int status;
+	u64 blkno;
+	struct dentry *parent;
+	struct inode *inode;
+	struct inode *dir = child->d_inode;
+	struct buffer_head *dirent_bh = NULL;
+	struct ocfs2_dir_entry *dirent;
+
+	mlog_entry("(0x%p, '%.*s')\n", child,
+		   child->d_name.len, child->d_name.name);
+
+	mlog(0, "find parent of directory %"MLFu64"\n",
+	     OCFS2_I(dir)->ip_blkno);
+
+	status = ocfs2_meta_lock(dir, NULL, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		parent = ERR_PTR(status);
+		goto bail;
+	}
+
+	status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh,
+					  &dirent);
+	if (status < 0) {
+		parent = ERR_PTR(-ENOENT);
+		goto bail_unlock;
+	}
+
+	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
+	if (IS_ERR(inode)) {
+		mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
+		parent = ERR_PTR(-EACCES);
+		goto bail_unlock;
+	}
+
+	parent = d_alloc_anon(inode);
+	if (!parent) {
+		iput(inode);
+		parent = ERR_PTR(-ENOMEM);
+	}
+
+bail_unlock:
+	ocfs2_meta_unlock(dir, 0);
+
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+bail:
+	mlog_exit_ptr(parent);
+
+	return parent;
+}
+
+/*
+ * Encode @dentry into the file handle @fh (*max_len 32-bit words
+ * available on entry, words used on return).
+ *
+ * Layout: block number high word, block number low word, generation;
+ * for a connectable non-directory the same three words for the parent
+ * follow. Returns the handle type: 1 (inode only), 2 (inode + parent)
+ * or 255 when the buffer is too small.
+ *
+ * NOTE(review): @fh is typed __be32 but the words are written with
+ * cpu_to_le32(); the layout is deliberately left untouched here.
+ */
+static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
+			   int connectable)
+{
+	struct inode *inode = dentry->d_inode;
+	int len = *max_len;
+	int type = 1;
+	u64 blkno;
+	u32 generation;
+
+	mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
+		   dentry->d_name.len, dentry->d_name.name,
+		   fh, len, connectable);
+
+	if (len < 3 || (connectable && len < 6)) {
+		mlog(ML_ERROR, "fh buffer is too small for encoding\n");
+		type = 255;
+		goto bail;
+	}
+
+	blkno = OCFS2_I(inode)->ip_blkno;
+	generation = inode->i_generation;
+
+	mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
+	     blkno, generation);
+
+	len = 3;
+	fh[0] = cpu_to_le32((u32)(blkno >> 32));
+	fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
+	fh[2] = cpu_to_le32(generation);
+
+	if (connectable && !S_ISDIR(inode->i_mode)) {
+		struct inode *parent;
+
+		spin_lock(&dentry->d_lock);
+
+		parent = dentry->d_parent->d_inode;
+		blkno = OCFS2_I(parent)->ip_blkno;
+		generation = parent->i_generation;
+
+		fh[3] = cpu_to_le32((u32)(blkno >> 32));
+		fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
+		fh[5] = cpu_to_le32(generation);
+
+		spin_unlock(&dentry->d_lock);
+
+		len = 6;
+		type = 2;
+
+		mlog(0, "Encoding parent: blkno: %"MLFu64", generation: %u\n",
+		     blkno, generation);
+	}
+
+	*max_len = len;
+
+bail:
+	mlog_exit(type);
+	return type;
+}
+
+/*
+ * Decode a file handle produced by ocfs2_encode_fh() and hand the
+ * result to find_exported_dentry(). Returns NULL when the handle is too
+ * short or of an unknown type, otherwise whatever
+ * find_exported_dentry() returns.
+ */
+static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
+				      int fh_len, int fileid_type,
+				      int (*acceptable)(void *context,
+							struct dentry *de),
+				      void *context)
+{
+	struct ocfs2_inode_handle handle;
+	/*
+	 * Zeroed so that a handle without parent words (fileid_type != 2)
+	 * never passes stack garbage on as the parent: a zero ih_blkno
+	 * makes ocfs2_get_dentry() fail cleanly with -ESTALE.
+	 */
+	struct ocfs2_inode_handle parent = { 0, 0 };
+	struct dentry *ret = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n",
+		   sb, fh, fh_len, fileid_type, acceptable, context);
+
+	if (fh_len < 3 || fileid_type > 2)
+		goto bail;
+
+	if (fileid_type == 2) {
+		if (fh_len < 6)
+			goto bail;
+
+		parent.ih_blkno = (u64)le32_to_cpu(fh[3]) << 32;
+		parent.ih_blkno |= (u64)le32_to_cpu(fh[4]);
+		parent.ih_generation = le32_to_cpu(fh[5]);
+
+		mlog(0, "Decoding parent: blkno: %"MLFu64", generation: %u\n",
+		     parent.ih_blkno, parent.ih_generation);
+	}
+
+	handle.ih_blkno = (u64)le32_to_cpu(fh[0]) << 32;
+	handle.ih_blkno |= (u64)le32_to_cpu(fh[1]);
+	handle.ih_generation = le32_to_cpu(fh[2]);
+
+	mlog(0, "Decoding fh: blkno: %"MLFu64", generation: %u\n",
+	     handle.ih_blkno, handle.ih_generation);
+
+	ret = ocfs2_export_ops.find_exported_dentry(sb, &handle, &parent,
+						    acceptable, context);
+
+bail:
+	mlog_exit_ptr(ret);
+	return ret;
+}
+
+/* Export operations table for OCFS2. */
+struct export_operations ocfs2_export_ops = {
+	.decode_fh	= ocfs2_decode_fh,
+	.encode_fh	= ocfs2_encode_fh,
+
+	.get_parent	= ocfs2_get_parent,
+	.get_dentry	= ocfs2_get_dentry,
+};
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h
new file mode 100644
index 00000000000..5b77ee7866e
--- /dev/null
+++ b/fs/ocfs2/export.h
@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * export.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2005 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_EXPORT_H +#define OCFS2_EXPORT_H + +extern struct export_operations ocfs2_export_ops; + +#endif /* OCFS2_EXPORT_H */ diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c new file mode 100644 index 00000000000..f2fb40cd296 --- /dev/null +++ b/fs/ocfs2/extent_map.c @@ -0,0 +1,994 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * extent_map.c + * + * In-memory extent map for OCFS2. Man, this code was prettier in + * the library. + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/rbtree.h> + +#define MLOG_MASK_PREFIX ML_EXTENT_MAP +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "extent_map.h" +#include "inode.h" +#include "super.h" + +#include "buffer_head_io.h" + + +/* + * SUCK SUCK SUCK + * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h + */ + +struct ocfs2_extent_map_entry { + struct rb_node e_node; + int e_tree_depth; + struct ocfs2_extent_rec e_rec; +}; + +struct ocfs2_em_insert_context { + int need_left; + int need_right; + struct ocfs2_extent_map_entry *new_ent; + struct ocfs2_extent_map_entry *old_ent; + struct ocfs2_extent_map_entry *left_ent; + struct ocfs2_extent_map_entry *right_ent; +}; + +static kmem_cache_t *ocfs2_em_ent_cachep = NULL; + + +static struct ocfs2_extent_map_entry * +ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, + u32 cpos, u32 clusters, + struct rb_node ***ret_p, + struct rb_node **ret_parent); +static int ocfs2_extent_map_insert(struct inode *inode, + struct ocfs2_extent_rec *rec, + int tree_depth); +static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, + struct ocfs2_extent_map_entry *ent); +static int ocfs2_extent_map_find_leaf(struct inode *inode, + u32 cpos, u32 clusters, + struct ocfs2_extent_list *el); +static int ocfs2_extent_map_lookup_read(struct inode *inode, + u32 cpos, u32 clusters, + struct ocfs2_extent_map_entry **ret_ent); +static int ocfs2_extent_map_try_insert(struct inode *inode, + struct ocfs2_extent_rec *rec, + int tree_depth, + struct ocfs2_em_insert_context *ctxt); + +/* returns 1 only if the rec contains all the given clusters -- that is that + * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos + + * clusters) is >= the argument's endpoint */ +static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec, + u32 cpos, u32 clusters) +{ + if (le32_to_cpu(rec->e_cpos) > cpos) + return 
0; + if (cpos + clusters > le32_to_cpu(rec->e_cpos) + + le32_to_cpu(rec->e_clusters)) + return 0; + return 1; +} + + +/* + * Find an entry in the tree that intersects the region passed in. + * Note that this will find straddled intervals, it is up to the + * callers to enforce any boundary conditions. + * + * Callers must hold ip_lock. This lookup is not guaranteed to return + * a tree_depth 0 match, and as such can race inserts if the lock + * were not held. + * + * The rb_node garbage lets insertion share the search. Trivial + * callers pass NULL. + */ +static struct ocfs2_extent_map_entry * +ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, + u32 cpos, u32 clusters, + struct rb_node ***ret_p, + struct rb_node **ret_parent) +{ + struct rb_node **p = &em->em_extents.rb_node; + struct rb_node *parent = NULL; + struct ocfs2_extent_map_entry *ent = NULL; + + while (*p) + { + parent = *p; + ent = rb_entry(parent, struct ocfs2_extent_map_entry, + e_node); + if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) { + p = &(*p)->rb_left; + ent = NULL; + } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) + + le32_to_cpu(ent->e_rec.e_clusters))) { + p = &(*p)->rb_right; + ent = NULL; + } else + break; + } + + if (ret_p != NULL) + *ret_p = p; + if (ret_parent != NULL) + *ret_parent = parent; + return ent; +} + +/* + * Find the leaf containing the interval we want. While we're on our + * way down the tree, fill in every record we see at any depth, because + * we might want it later. + * + * Note that this code is run without ip_lock. That's because it + * sleeps while reading. If someone is also filling the extent list at + * the same time we are, we might have to restart. 
+ */ +static int ocfs2_extent_map_find_leaf(struct inode *inode, + u32 cpos, u32 clusters, + struct ocfs2_extent_list *el) +{ + int i, ret; + struct buffer_head *eb_bh = NULL; + u64 blkno; + u32 rec_end; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec; + + /* + * The bh data containing the el cannot change here, because + * we hold alloc_sem. So we can do this without other + * locks. + */ + while (el->l_tree_depth) + { + blkno = 0; + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + rec_end = (le32_to_cpu(rec->e_cpos) + + le32_to_cpu(rec->e_clusters)); + + ret = -EBADR; + if (rec_end > OCFS2_I(inode)->ip_clusters) { + mlog_errno(ret); + goto out_free; + } + + if (rec_end <= cpos) { + ret = ocfs2_extent_map_insert(inode, rec, + le16_to_cpu(el->l_tree_depth)); + if (ret && (ret != -EEXIST)) { + mlog_errno(ret); + goto out_free; + } + continue; + } + if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) { + ret = ocfs2_extent_map_insert(inode, rec, + le16_to_cpu(el->l_tree_depth)); + if (ret && (ret != -EEXIST)) { + mlog_errno(ret); + goto out_free; + } + continue; + } + + /* + * We've found a record that matches our + * interval. We don't insert it because we're + * about to traverse it. + */ + + /* Check to see if we're stradling */ + ret = -ESRCH; + if (!ocfs2_extent_rec_contains_clusters(rec, + cpos, + clusters)) { + mlog_errno(ret); + goto out_free; + } + + /* + * If we've already found a record, the el has + * two records covering the same interval. + * EEEK! 
+ */ + ret = -EBADR; + if (blkno) { + mlog_errno(ret); + goto out_free; + } + + blkno = le64_to_cpu(rec->e_blkno); + } + + /* + * We don't support holes, and we're still up + * in the branches, so we'd better have found someone + */ + ret = -EBADR; + if (!blkno) { + mlog_errno(ret); + goto out_free; + } + + if (eb_bh) { + brelse(eb_bh); + eb_bh = NULL; + } + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + blkno, &eb_bh, OCFS2_BH_CACHED, + inode); + if (ret) { + mlog_errno(ret); + goto out_free; + } + eb = (struct ocfs2_extent_block *)eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + ret = -EIO; + goto out_free; + } + el = &eb->h_list; + } + + if (el->l_tree_depth) + BUG(); + + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + ret = ocfs2_extent_map_insert(inode, rec, + le16_to_cpu(el->l_tree_depth)); + if (ret) { + mlog_errno(ret); + goto out_free; + } + } + + ret = 0; + +out_free: + if (eb_bh) + brelse(eb_bh); + + return ret; +} + +/* + * This lookup actually will read from disk. It has one invariant: + * It will never re-traverse blocks. This means that all inserts should + * be new regions or more granular regions (both allowed by insert). 
+ */ +static int ocfs2_extent_map_lookup_read(struct inode *inode, + u32 cpos, + u32 clusters, + struct ocfs2_extent_map_entry **ret_ent) +{ + int ret; + u64 blkno; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *ent; + struct buffer_head *bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_dinode *di; + struct ocfs2_extent_list *el; + + spin_lock(&OCFS2_I(inode)->ip_lock); + ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); + if (ent) { + if (!ent->e_tree_depth) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + *ret_ent = ent; + return 0; + } + blkno = le64_to_cpu(ent->e_rec.e_blkno); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh, + OCFS2_BH_CACHED, inode); + if (ret) { + mlog_errno(ret); + if (bh) + brelse(bh); + return ret; + } + eb = (struct ocfs2_extent_block *)bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + brelse(bh); + return -EIO; + } + el = &eb->h_list; + } else { + spin_unlock(&OCFS2_I(inode)->ip_lock); + + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + OCFS2_I(inode)->ip_blkno, &bh, + OCFS2_BH_CACHED, inode); + if (ret) { + mlog_errno(ret); + if (bh) + brelse(bh); + return ret; + } + di = (struct ocfs2_dinode *)bh->b_data; + if (!OCFS2_IS_VALID_DINODE(di)) { + brelse(bh); + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di); + return -EIO; + } + el = &di->id2.i_list; + } + + ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el); + brelse(bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); + if (!ent) { + ret = -ESRCH; + mlog_errno(ret); + return ret; + } + + if (ent->e_tree_depth) + BUG(); /* FIXME: Make sure this isn't a corruption */ + + *ret_ent = ent; + + return 0; +} + +/* + * Callers must hold ip_lock. This can insert pieces of the tree, + * thus racing lookup if the lock weren't held. 
+ */ +static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, + struct ocfs2_extent_map_entry *ent) +{ + struct rb_node **p, *parent; + struct ocfs2_extent_map_entry *old_ent; + + old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), + le32_to_cpu(ent->e_rec.e_clusters), + &p, &parent); + if (old_ent) + return -EEXIST; + + rb_link_node(&ent->e_node, parent, p); + rb_insert_color(&ent->e_node, &em->em_extents); + + return 0; +} + + +/* + * Simple rule: on any return code other than -EAGAIN, anything left + * in the insert_context will be freed. + */ +static int ocfs2_extent_map_try_insert(struct inode *inode, + struct ocfs2_extent_rec *rec, + int tree_depth, + struct ocfs2_em_insert_context *ctxt) +{ + int ret; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *old_ent; + + ctxt->need_left = 0; + ctxt->need_right = 0; + ctxt->old_ent = NULL; + + spin_lock(&OCFS2_I(inode)->ip_lock); + ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); + if (!ret) { + ctxt->new_ent = NULL; + goto out_unlock; + } + + old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), + le32_to_cpu(rec->e_clusters), NULL, + NULL); + + if (!old_ent) + BUG(); + + ret = -EEXIST; + if (old_ent->e_tree_depth < tree_depth) + goto out_unlock; + + if (old_ent->e_tree_depth == tree_depth) { + if (!memcmp(rec, &old_ent->e_rec, + sizeof(struct ocfs2_extent_rec))) + ret = 0; + + /* FIXME: Should this be ESRCH/EBADR??? */ + goto out_unlock; + } + + /* + * We do it in this order specifically so that no actual tree + * changes occur until we have all the pieces we need. We + * don't want malloc failures to leave an inconsistent tree. + * Whenever we drop the lock, another process could be + * inserting. Also note that, if another process just beat us + * to an insert, we might not need the same pieces we needed + * the first go round. In the end, the pieces we need will + * be used, and the pieces we don't will be freed. 
+ */ + ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > + le32_to_cpu(old_ent->e_rec.e_cpos)); + ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + + le32_to_cpu(old_ent->e_rec.e_clusters)) > + (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); + ret = -EAGAIN; + if (ctxt->need_left) { + if (!ctxt->left_ent) + goto out_unlock; + *(ctxt->left_ent) = *old_ent; + ctxt->left_ent->e_rec.e_clusters = + cpu_to_le32(le32_to_cpu(rec->e_cpos) - + le32_to_cpu(ctxt->left_ent->e_rec.e_cpos)); + } + if (ctxt->need_right) { + if (!ctxt->right_ent) + goto out_unlock; + *(ctxt->right_ent) = *old_ent; + ctxt->right_ent->e_rec.e_cpos = + cpu_to_le32(le32_to_cpu(rec->e_cpos) + + le32_to_cpu(rec->e_clusters)); + ctxt->right_ent->e_rec.e_clusters = + cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) + + le32_to_cpu(old_ent->e_rec.e_clusters)) - + le32_to_cpu(ctxt->right_ent->e_rec.e_cpos)); + } + + rb_erase(&old_ent->e_node, &em->em_extents); + /* Now that he's erased, set him up for deletion */ + ctxt->old_ent = old_ent; + + if (ctxt->need_left) { + ret = ocfs2_extent_map_insert_entry(em, + ctxt->left_ent); + if (ret) + goto out_unlock; + ctxt->left_ent = NULL; + } + + if (ctxt->need_right) { + ret = ocfs2_extent_map_insert_entry(em, + ctxt->right_ent); + if (ret) + goto out_unlock; + ctxt->right_ent = NULL; + } + + ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); + + if (!ret) + ctxt->new_ent = NULL; + +out_unlock: + spin_unlock(&OCFS2_I(inode)->ip_lock); + + return ret; +} + + +static int ocfs2_extent_map_insert(struct inode *inode, + struct ocfs2_extent_rec *rec, + int tree_depth) +{ + int ret; + struct ocfs2_em_insert_context ctxt = {0, }; + + if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > + OCFS2_I(inode)->ip_map.em_clusters) { + ret = -EBADR; + mlog_errno(ret); + return ret; + } + + /* Zero e_clusters means a truncated tail record. 
It better be EOF */ + if (!rec->e_clusters) { + if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != + OCFS2_I(inode)->ip_map.em_clusters) { + ret = -EBADR; + mlog_errno(ret); + return ret; + } + + /* Ignore the truncated tail */ + return 0; + } + + ret = -ENOMEM; + ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, + GFP_KERNEL); + if (!ctxt.new_ent) { + mlog_errno(ret); + return ret; + } + + ctxt.new_ent->e_rec = *rec; + ctxt.new_ent->e_tree_depth = tree_depth; + + do { + ret = -ENOMEM; + if (ctxt.need_left && !ctxt.left_ent) { + ctxt.left_ent = + kmem_cache_alloc(ocfs2_em_ent_cachep, + GFP_KERNEL); + if (!ctxt.left_ent) + break; + } + if (ctxt.need_right && !ctxt.right_ent) { + ctxt.right_ent = + kmem_cache_alloc(ocfs2_em_ent_cachep, + GFP_KERNEL); + if (!ctxt.right_ent) + break; + } + + ret = ocfs2_extent_map_try_insert(inode, rec, + tree_depth, &ctxt); + } while (ret == -EAGAIN); + + if (ret < 0) + mlog_errno(ret); + + if (ctxt.left_ent) + kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent); + if (ctxt.right_ent) + kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent); + if (ctxt.old_ent) + kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent); + if (ctxt.new_ent) + kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent); + + return ret; +} + +/* + * Append this record to the tail of the extent map. It must be + * tree_depth 0. The record might be an extension of an existing + * record, and as such that needs to be handled. eg: + * + * Existing record in the extent map: + * + * cpos = 10, len = 10 + * |---------| + * + * New Record: + * + * cpos = 10, len = 20 + * |------------------| + * + * The passed record is the new on-disk record. The new_clusters value + * is how many clusters were added to the file. If the append is a + * contiguous append, the new_clusters has been added to + * rec->e_clusters. If the append is an entirely new extent, then + * rec->e_clusters is == new_clusters. 
+ */ +int ocfs2_extent_map_append(struct inode *inode, + struct ocfs2_extent_rec *rec, + u32 new_clusters) +{ + int ret; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *ent; + struct ocfs2_extent_rec *old; + + BUG_ON(!new_clusters); + BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters); + + if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { + /* + * Size changed underneath us on disk. Drop any + * straddling records and update our idea of + * i_clusters + */ + ocfs2_extent_map_drop(inode, em->em_clusters - 1); + em->em_clusters = OCFS2_I(inode)->ip_clusters; + } + + mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + + le32_to_cpu(rec->e_clusters)) != + (em->em_clusters + new_clusters), + "Inode %"MLFu64":\n" + "rec->e_cpos = %u + rec->e_clusters = %u = %u\n" + "em->em_clusters = %u + new_clusters = %u = %u\n", + OCFS2_I(inode)->ip_blkno, + le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters), + le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters), + em->em_clusters, new_clusters, + em->em_clusters + new_clusters); + + em->em_clusters += new_clusters; + + ret = -ENOENT; + if (le32_to_cpu(rec->e_clusters) > new_clusters) { + /* This is a contiguous append */ + ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1, + NULL, NULL); + if (ent) { + old = &ent->e_rec; + BUG_ON((le32_to_cpu(rec->e_cpos) + + le32_to_cpu(rec->e_clusters)) != + (le32_to_cpu(old->e_cpos) + + le32_to_cpu(old->e_clusters) + + new_clusters)); + if (ent->e_tree_depth == 0) { + BUG_ON(le32_to_cpu(old->e_cpos) != + le32_to_cpu(rec->e_cpos)); + BUG_ON(le64_to_cpu(old->e_blkno) != + le64_to_cpu(rec->e_blkno)); + ret = 0; + } + /* + * Let non-leafs fall through as -ENOENT to + * force insertion of the new leaf. 
+ */ + le32_add_cpu(&old->e_clusters, new_clusters); + } + } + + if (ret == -ENOENT) + ret = ocfs2_extent_map_insert(inode, rec, 0); + if (ret < 0) + mlog_errno(ret); + return ret; +} + +#if 0 +/* Code here is included but defined out as it completes the extent + * map api and may be used in the future. */ + +/* + * Look up the record containing this cluster offset. This record is + * part of the extent map. Do not free it. Any changes you make to + * it will reflect in the extent map. So, if your last extent + * is (cpos = 10, clusters = 10) and you truncate the file by 5 + * clusters, you can do: + * + * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); + * rec->e_clusters -= 5; + * + * The lookup does not read from disk. If the map isn't filled in for + * an entry, you won't find it. + * + * Also note that the returned record is valid until alloc_sem is + * dropped. After that, truncate and extend can happen. Caveat Emptor. + */ +int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, + struct ocfs2_extent_rec **rec, + int *tree_depth) +{ + int ret = -ENOENT; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *ent; + + *rec = NULL; + + if (cpos >= OCFS2_I(inode)->ip_clusters) + return -EINVAL; + + if (cpos >= em->em_clusters) { + /* + * Size changed underneath us on disk. 
Drop any + * straddling records and update our idea of + * i_clusters + */ + ocfs2_extent_map_drop(inode, em->em_clusters - 1); + em->em_clusters = OCFS2_I(inode)->ip_clusters ; + } + + ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1, + NULL, NULL); + + if (ent) { + *rec = &ent->e_rec; + if (tree_depth) + *tree_depth = ent->e_tree_depth; + ret = 0; + } + + return ret; +} + +int ocfs2_extent_map_get_clusters(struct inode *inode, + u32 v_cpos, int count, + u32 *p_cpos, int *ret_count) +{ + int ret; + u32 coff, ccount; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *ent = NULL; + + *p_cpos = ccount = 0; + + if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) + return -EINVAL; + + if ((v_cpos + count) > em->em_clusters) { + /* + * Size changed underneath us on disk. Drop any + * straddling records and update our idea of + * i_clusters + */ + ocfs2_extent_map_drop(inode, em->em_clusters - 1); + em->em_clusters = OCFS2_I(inode)->ip_clusters; + } + + + ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); + if (ret) + return ret; + + if (ent) { + /* We should never find ourselves straddling an interval */ + if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec, + v_cpos, + count)) + return -ESRCH; + + coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); + *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(ent->e_rec.e_blkno)) + + coff; + + if (ret_count) + *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff; + + return 0; + } + + + return -ENOENT; +} + +#endif /* 0 */ + +int ocfs2_extent_map_get_blocks(struct inode *inode, + u64 v_blkno, int count, + u64 *p_blkno, int *ret_count) +{ + int ret; + u64 boff; + u32 cpos, clusters; + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + struct ocfs2_extent_map_entry *ent = NULL; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_rec *rec; + + *p_blkno = 0; + + cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); + clusters = 
ocfs2_blocks_to_clusters(inode->i_sb, + (u64)count + bpc - 1); + if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) { + ret = -EINVAL; + mlog_errno(ret); + return ret; + } + + if ((cpos + clusters) > em->em_clusters) { + /* + * Size changed underneath us on disk. Drop any + * straddling records and update our idea of + * i_clusters + */ + ocfs2_extent_map_drop(inode, em->em_clusters - 1); + em->em_clusters = OCFS2_I(inode)->ip_clusters; + } + + ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); + if (ret) { + mlog_errno(ret); + return ret; + } + + if (ent) + { + rec = &ent->e_rec; + + /* We should never find ourselves straddling an interval */ + if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { + ret = -ESRCH; + mlog_errno(ret); + return ret; + } + + boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - + le32_to_cpu(rec->e_cpos)); + boff += (v_blkno & (u64)(bpc - 1)); + *p_blkno = le64_to_cpu(rec->e_blkno) + boff; + + if (ret_count) { + *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, + le32_to_cpu(rec->e_clusters)) - boff; + } + + return 0; + } + + return -ENOENT; +} + +int ocfs2_extent_map_init(struct inode *inode) +{ + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + + em->em_extents = RB_ROOT; + em->em_clusters = 0; + + return 0; +} + +/* Needs the lock */ +static void __ocfs2_extent_map_drop(struct inode *inode, + u32 new_clusters, + struct rb_node **free_head, + struct ocfs2_extent_map_entry **tail_ent) +{ + struct rb_node *node, *next; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *ent; + + *free_head = NULL; + + ent = NULL; + node = rb_last(&em->em_extents); + while (node) + { + next = rb_prev(node); + + ent = rb_entry(node, struct ocfs2_extent_map_entry, + e_node); + if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters) + break; + + rb_erase(&ent->e_node, &em->em_extents); + + node->rb_right = *free_head; + *free_head = node; + + ent = NULL; + node = next; + } + + /* Do we have 
an entry straddling new_clusters? */ + if (tail_ent) { + if (ent && + ((le32_to_cpu(ent->e_rec.e_cpos) + + le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters)) + *tail_ent = ent; + else + *tail_ent = NULL; + } +} + +static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head) +{ + struct rb_node *node; + struct ocfs2_extent_map_entry *ent; + + while (free_head) { + node = free_head; + free_head = node->rb_right; + + ent = rb_entry(node, struct ocfs2_extent_map_entry, + e_node); + kmem_cache_free(ocfs2_em_ent_cachep, ent); + } +} + +/* + * Remove all entries past new_clusters, inclusive of an entry that + * contains new_clusters. This is effectively a cache forget. + * + * If you want to also clip the last extent by some number of clusters, + * you need to call ocfs2_extent_map_trunc(). + * This code does not check or modify ip_clusters. + */ +int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) +{ + struct rb_node *free_head = NULL; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *ent; + + spin_lock(&OCFS2_I(inode)->ip_lock); + + __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); + + if (ent) { + rb_erase(&ent->e_node, &em->em_extents); + ent->e_node.rb_right = free_head; + free_head = &ent->e_node; + } + + spin_unlock(&OCFS2_I(inode)->ip_lock); + + if (free_head) + __ocfs2_extent_map_drop_cleanup(free_head); + + return 0; +} + +/* + * Remove all entries past new_clusters and also clip any extent + * straddling new_clusters, if there is one. 
This does not check + * or modify ip_clusters + */ +int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters) +{ + struct rb_node *free_head = NULL; + struct ocfs2_extent_map_entry *ent = NULL; + + spin_lock(&OCFS2_I(inode)->ip_lock); + + __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); + + if (ent) + ent->e_rec.e_clusters = cpu_to_le32(new_clusters - + le32_to_cpu(ent->e_rec.e_cpos)); + + OCFS2_I(inode)->ip_map.em_clusters = new_clusters; + + spin_unlock(&OCFS2_I(inode)->ip_lock); + + if (free_head) + __ocfs2_extent_map_drop_cleanup(free_head); + + return 0; +} + +int __init init_ocfs2_extent_maps(void) +{ + ocfs2_em_ent_cachep = + kmem_cache_create("ocfs2_em_ent", + sizeof(struct ocfs2_extent_map_entry), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!ocfs2_em_ent_cachep) + return -ENOMEM; + + return 0; +} + +void __exit exit_ocfs2_extent_maps(void) +{ + kmem_cache_destroy(ocfs2_em_ent_cachep); +} diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h new file mode 100644 index 00000000000..fa3745efa88 --- /dev/null +++ b/fs/ocfs2/extent_map.h @@ -0,0 +1,46 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * extent_map.h + * + * In-memory file extent mappings for OCFS2. + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef _EXTENT_MAP_H +#define _EXTENT_MAP_H + +int init_ocfs2_extent_maps(void); +void exit_ocfs2_extent_maps(void); + +/* + * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem + * to be held. The allocation cannot change at all while the map is + * in the process of being updated. + */ +int ocfs2_extent_map_init(struct inode *inode); +int ocfs2_extent_map_append(struct inode *inode, + struct ocfs2_extent_rec *rec, + u32 new_clusters); +int ocfs2_extent_map_get_blocks(struct inode *inode, + u64 v_blkno, int count, + u64 *p_blkno, int *ret_count); +int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters); +int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters); + +#endif /* _EXTENT_MAP_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c new file mode 100644 index 00000000000..72ae9e3306f --- /dev/null +++ b/fs/ocfs2/file.c @@ -0,0 +1,1237 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * file.c + * + * File open, close, extend, truncate + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/uio.h> + +#define MLOG_MASK_PREFIX ML_INODE +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "aops.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "sysfile.h" +#include "inode.h" +#include "journal.h" +#include "mmap.h" +#include "suballoc.h" +#include "super.h" + +#include "buffer_head_io.h" + +static int ocfs2_sync_inode(struct inode *inode) +{ + filemap_fdatawrite(inode->i_mapping); + return sync_mapping_buffers(inode->i_mapping); +} + +static int ocfs2_file_open(struct inode *inode, struct file *file) +{ + int status; + int mode = file->f_flags; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, + file->f_dentry->d_name.len, file->f_dentry->d_name.name); + + spin_lock(&oi->ip_lock); + + /* Check that the inode hasn't been wiped from disk by another + * node. If it hasn't then we're safe as long as we hold the + * spin lock until our increment of open count. 
*/ + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + spin_unlock(&oi->ip_lock); + + status = -ENOENT; + goto leave; + } + + if (mode & O_DIRECT) + oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; + + oi->ip_open_count++; + spin_unlock(&oi->ip_lock); + status = 0; +leave: + mlog_exit(status); + return status; +} + +static int ocfs2_file_release(struct inode *inode, struct file *file) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, + file->f_dentry->d_name.len, + file->f_dentry->d_name.name); + + spin_lock(&oi->ip_lock); + if (!--oi->ip_open_count) + oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; + spin_unlock(&oi->ip_lock); + + mlog_exit(0); + + return 0; +} + +static int ocfs2_sync_file(struct file *file, + struct dentry *dentry, + int datasync) +{ + int err = 0; + journal_t *journal; + struct inode *inode = dentry->d_inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, + dentry->d_name.len, dentry->d_name.name); + + err = ocfs2_sync_inode(dentry->d_inode); + if (err) + goto bail; + + journal = osb->journal->j_journal; + err = journal_force_commit(journal); + +bail: + mlog_exit(err); + + return (err < 0) ? 
-EIO : 0; +} + +int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size) +{ + int status; + + mlog_entry_void(); + i_size_write(inode, new_i_size); + inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_simple_size_update(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_journal_handle *handle = NULL; + + handle = ocfs2_start_trans(osb, NULL, + OCFS2_INODE_UPDATE_CREDITS); + if (handle == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_set_inode_size(handle, inode, di_bh, + new_i_size); + if (ret < 0) + mlog_errno(ret); + + ocfs2_commit_trans(handle); +out: + return ret; +} + +static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size) +{ + int status; + struct ocfs2_journal_handle *handle; + + mlog_entry_void(); + + /* TODO: This needs to actually orphan the inode in this + * transaction. 
*/ + + handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + + status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); + if (status < 0) + mlog_errno(status); + + ocfs2_commit_trans(handle); +out: + mlog_exit(status); + return status; +} + +static int ocfs2_truncate_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) +{ + int status = 0; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_truncate_context *tc = NULL; + + mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64"\n", + OCFS2_I(inode)->ip_blkno, new_i_size); + + truncate_inode_pages(inode->i_mapping, new_i_size); + + fe = (struct ocfs2_dinode *) di_bh->b_data; + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); + status = -EIO; + goto bail; + } + + mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), + "Inode %"MLFu64", inode i_size = %lld != di " + "i_size = %"MLFu64", i_flags = 0x%x\n", + OCFS2_I(inode)->ip_blkno, + i_size_read(inode), + le64_to_cpu(fe->i_size), le32_to_cpu(fe->i_flags)); + + if (new_i_size > le64_to_cpu(fe->i_size)) { + mlog(0, "asked to truncate file with size (%"MLFu64") " + "to size (%"MLFu64")!\n", + le64_to_cpu(fe->i_size), new_i_size); + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + mlog(0, "inode %"MLFu64", i_size = %"MLFu64", new_i_size = %"MLFu64"\n", + le64_to_cpu(fe->i_blkno), le64_to_cpu(fe->i_size), new_i_size); + + /* lets handle the simple truncate cases before doing any more + * cluster locking. */ + if (new_i_size == le64_to_cpu(fe->i_size)) + goto bail; + + if (le32_to_cpu(fe->i_clusters) == + ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { + mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", + fe->i_clusters); + /* No allocation change is required, so lets fast path + * this truncate. 
*/ + status = ocfs2_simple_size_update(inode, di_bh, new_i_size); + if (status < 0) + mlog_errno(status); + goto bail; + } + + /* This forces other nodes to sync and drop their pages */ + status = ocfs2_data_lock(inode, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + ocfs2_data_unlock(inode, 1); + + /* alright, we're going to need to do a full blown alloc size + * change. Orphan the inode so that recovery can complete the + * truncate if necessary. This does the task of marking + * i_size. */ + status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_commit_truncate(osb, inode, di_bh, tc); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* TODO: orphan dir cleanup here. */ +bail: + + mlog_exit(status); + return status; +} + +/* + * extend allocation only here. + * we'll update all the disk stuff, and oip->alloc_size + * + * expect stuff to be locked, a transaction started and enough data / + * metadata reservations in the contexts. + * + * Will return -EAGAIN, and a reason if a restart is needed. + * If passed in, *reason will always be set, even in error. 
+ */ +int ocfs2_do_extend_allocation(struct ocfs2_super *osb, + struct inode *inode, + u32 clusters_to_add, + struct buffer_head *fe_bh, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) +{ + int status = 0; + int free_extents; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; + enum ocfs2_alloc_restarted reason = RESTART_NONE; + u32 bit_off, num_bits; + u64 block; + + BUG_ON(!clusters_to_add); + + free_extents = ocfs2_num_free_extents(osb, inode, fe); + if (free_extents < 0) { + status = free_extents; + mlog_errno(status); + goto leave; + } + + /* there are two cases which could cause us to EAGAIN in the + * we-need-more-metadata case: + * 1) we haven't reserved *any* + * 2) we are so fragmented, we've needed to add metadata too + * many times. */ + if (!free_extents && !meta_ac) { + mlog(0, "we haven't reserved any metadata!\n"); + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } else if ((!free_extents) + && (ocfs2_alloc_context_bits_left(meta_ac) + < ocfs2_extend_meta_needed(fe))) { + mlog(0, "filesystem is really fragmented...\n"); + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } + + status = ocfs2_claim_clusters(osb, handle, data_ac, 1, + &bit_off, &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + BUG_ON(num_bits > clusters_to_add); + + /* reserve our write early -- insert_extent may update the inode */ + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + block = ocfs2_clusters_to_blocks(osb->sb, bit_off); + mlog(0, "Allocating %u clusters at block %u for inode %"MLFu64"\n", + num_bits, bit_off, OCFS2_I(inode)->ip_blkno); + status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, + num_bits, meta_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } 
+ + le32_add_cpu(&fe->i_clusters, num_bits); + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + clusters_to_add -= num_bits; + + if (clusters_to_add) { + mlog(0, "need to alloc once more, clusters = %u, wanted = " + "%u\n", fe->i_clusters, clusters_to_add); + status = -EAGAIN; + reason = RESTART_TRANS; + } + +leave: + mlog_exit(status); + if (reason_ret) + *reason_ret = reason; + return status; +} + +static int ocfs2_extend_allocation(struct inode *inode, + u32 clusters_to_add) +{ + int status = 0; + int restart_func = 0; + int drop_alloc_sem = 0; + int credits, num_free_extents; + u32 prev_clusters; + struct buffer_head *bh = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_journal_handle *handle = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + enum ocfs2_alloc_restarted why; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); + + status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, + OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + fe = (struct ocfs2_dinode *) bh->b_data; + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); + status = -EIO; + goto leave; + } + +restart_all: + BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); + + mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, " + "clusters_to_add = %u\n", + OCFS2_I(inode)->ip_blkno, i_size_read(inode), + fe->i_clusters, clusters_to_add); + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + num_free_extents = ocfs2_num_free_extents(osb, + inode, + fe); + if (num_free_extents < 0) { + status = num_free_extents; 
+ mlog_errno(status); + goto leave; + } + + if (!num_free_extents) { + status = ocfs2_reserve_new_metadata(osb, + handle, + fe, + &meta_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + } + + status = ocfs2_reserve_clusters(osb, + handle, + clusters_to_add, + &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + /* blocks peope in read/write from reading our allocation + * until we're done changing it. We depend on i_sem to block + * other extend/truncate calls while we're here. Ordering wrt + * start_trans is important here -- always do it before! */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + drop_alloc_sem = 1; + + credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); + handle = ocfs2_start_trans(osb, handle, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + +restarted_transaction: + /* reserve a write to the file entry early on - that we if we + * run out of credits in the allocation path, we can still + * update i_size. 
*/ + status = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + prev_clusters = OCFS2_I(inode)->ip_clusters; + + status = ocfs2_do_extend_allocation(osb, + inode, + clusters_to_add, + bh, + handle, + data_ac, + meta_ac, + &why); + if ((status < 0) && (status != -EAGAIN)) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + spin_lock(&OCFS2_I(inode)->ip_lock); + clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + if (why != RESTART_NONE && clusters_to_add) { + if (why == RESTART_META) { + mlog(0, "restarting function.\n"); + restart_func = 1; + } else { + BUG_ON(why != RESTART_TRANS); + + mlog(0, "restarting transaction.\n"); + /* TODO: This can be more intelligent. */ + credits = ocfs2_calc_extend_credits(osb->sb, + fe, + clusters_to_add); + status = ocfs2_extend_trans(handle, credits); + if (status < 0) { + /* handle still has to be committed at + * this point. 
*/ + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + goto restarted_transaction; + } + } + + mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n", + fe->i_clusters, fe->i_size); + mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", + OCFS2_I(inode)->ip_clusters, i_size_read(inode)); + +leave: + if (drop_alloc_sem) { + up_write(&OCFS2_I(inode)->ip_alloc_sem); + drop_alloc_sem = 0; + } + if (handle) { + ocfs2_commit_trans(handle); + handle = NULL; + } + if (data_ac) { + ocfs2_free_alloc_context(data_ac); + data_ac = NULL; + } + if (meta_ac) { + ocfs2_free_alloc_context(meta_ac); + meta_ac = NULL; + } + if ((!status) && restart_func) { + restart_func = 0; + goto restart_all; + } + if (bh) { + brelse(bh); + bh = NULL; + } + + mlog_exit(status); + return status; +} + +/* Some parts of this taken from generic_cont_expand, which turned out + * to be too fragile to do exactly what we need without us having to + * worry about recursive locking in ->commit_write(). */ +static int ocfs2_write_zero_page(struct inode *inode, + u64 size) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long index; + unsigned int offset; + struct ocfs2_journal_handle *handle = NULL; + int ret; + + offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ + /* ugh. in prepare/commit_write, if from==to==start of block, we + ** skip the prepare. 
make sure we never send an offset for the start + ** of a block + */ + if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { + offset++; + } + index = size >> PAGE_CACHE_SHIFT; + + page = grab_cache_page(mapping, index); + if (!page) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_prepare_write(NULL, page, offset, offset); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + + if (ocfs2_should_order_data(inode)) { + handle = ocfs2_start_walk_page_trans(inode, page, offset, + offset); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out_unlock; + } + } + + /* must not update i_size! */ + ret = block_commit_write(page, offset, offset); + if (ret < 0) + mlog_errno(ret); + else + ret = 0; + + if (handle) + ocfs2_commit_trans(handle); +out_unlock: + unlock_page(page); + page_cache_release(page); +out: + return ret; +} + +static int ocfs2_zero_extend(struct inode *inode, + u64 zero_to_size) +{ + int ret = 0; + u64 start_off; + struct super_block *sb = inode->i_sb; + + start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); + while (start_off < zero_to_size) { + ret = ocfs2_write_zero_page(inode, start_off); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + start_off += sb->s_blocksize; + } + +out: + return ret; +} + +static int ocfs2_extend_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) +{ + int ret = 0; + u32 clusters_to_add; + + /* setattr sometimes calls us like this. 
*/ + if (new_i_size == 0) + goto out; + + if (i_size_read(inode) == new_i_size) + goto out; + BUG_ON(new_i_size < i_size_read(inode)); + + clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - + OCFS2_I(inode)->ip_clusters; + + if (clusters_to_add) { + ret = ocfs2_extend_allocation(inode, clusters_to_add); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_zero_extend(inode, new_i_size); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + /* No allocation required, we just use this helper to + * do a trivial update of i_size. */ + ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + +out: + return ret; +} + +int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) +{ + int status = 0, size_change; + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct ocfs2_super *osb = OCFS2_SB(sb); + struct buffer_head *bh = NULL; + struct ocfs2_journal_handle *handle = NULL; + + mlog_entry("(0x%p, '%.*s')\n", dentry, + dentry->d_name.len, dentry->d_name.name); + + if (attr->ia_valid & ATTR_MODE) + mlog(0, "mode change: %d\n", attr->ia_mode); + if (attr->ia_valid & ATTR_UID) + mlog(0, "uid change: %d\n", attr->ia_uid); + if (attr->ia_valid & ATTR_GID) + mlog(0, "gid change: %d\n", attr->ia_gid); + if (attr->ia_valid & ATTR_SIZE) + mlog(0, "size change...\n"); + if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) + mlog(0, "time change...\n"); + +#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ + | ATTR_GID | ATTR_UID | ATTR_MODE) + if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { + mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); + return 0; + } + + status = inode_change_ok(inode, attr); + if (status) + return status; + + size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; + if (size_change) { + status = ocfs2_rw_lock(inode, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + 
} + + status = ocfs2_meta_lock(inode, NULL, &bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail_unlock_rw; + } + + if (size_change && attr->ia_size != i_size_read(inode)) { + if (i_size_read(inode) > attr->ia_size) + status = ocfs2_truncate_file(inode, bh, attr->ia_size); + else + status = ocfs2_extend_file(inode, bh, attr->ia_size); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + status = -ENOSPC; + goto bail_unlock; + } + } + + handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + + status = inode_setattr(inode, attr); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + status = ocfs2_mark_inode_dirty(handle, inode, bh); + if (status < 0) + mlog_errno(status); + +bail_commit: + ocfs2_commit_trans(handle); +bail_unlock: + ocfs2_meta_unlock(inode, 1); +bail_unlock_rw: + if (size_change) + ocfs2_rw_unlock(inode, 1); +bail: + if (bh) + brelse(bh); + + mlog_exit(status); + return status; +} + +int ocfs2_getattr(struct vfsmount *mnt, + struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = dentry->d_inode->i_sb; + struct ocfs2_super *osb = sb->s_fs_info; + int err; + + mlog_entry_void(); + + err = ocfs2_inode_revalidate(dentry); + if (err) { + if (err != -ENOENT) + mlog_errno(err); + goto bail; + } + + generic_fillattr(inode, stat); + + /* We set the blksize from the cluster size for performance */ + stat->blksize = osb->s_clustersize; + +bail: + mlog_exit(err); + + return err; +} + +static int ocfs2_write_remove_suid(struct inode *inode) +{ + int ret; + struct buffer_head *bh = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_journal_handle *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di; + + mlog_entry("(Inode %"MLFu64", mode 0%o)\n", oi->ip_blkno, + inode->i_mode); 
+ + handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); + if (handle == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); + if (ret < 0) { + mlog_errno(ret); + goto out_trans; + } + + ret = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_bh; + } + + inode->i_mode &= ~S_ISUID; + if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) + inode->i_mode &= ~S_ISGID; + + di = (struct ocfs2_dinode *) bh->b_data; + di->i_mode = cpu_to_le16(inode->i_mode); + + ret = ocfs2_journal_dirty(handle, bh); + if (ret < 0) + mlog_errno(ret); +out_bh: + brelse(bh); +out_trans: + ocfs2_commit_trans(handle); +out: + mlog_exit(ret); + return ret; +} + +static inline int ocfs2_write_should_remove_suid(struct inode *inode) +{ + mode_t mode = inode->i_mode; + + if (!capable(CAP_FSETID)) { + if (unlikely(mode & S_ISUID)) + return 1; + + if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) + return 1; + } + return 0; +} + +static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, + const char __user *buf, + size_t count, + loff_t pos) +{ + struct iovec local_iov = { .iov_base = (void __user *)buf, + .iov_len = count }; + int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0; + u32 clusters; + struct file *filp = iocb->ki_filp; + struct inode *inode = filp->f_dentry->d_inode; + loff_t newsize, saved_pos; +#ifdef OCFS2_ORACORE_WORKAROUNDS + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); +#endif + + mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, + (unsigned int)count, + filp->f_dentry->d_name.len, + filp->f_dentry->d_name.name); + + /* happy write of zero bytes */ + if (count == 0) + return 0; + + if (!inode) { + mlog(0, "bad inode\n"); + return -EIO; + } + +#ifdef OCFS2_ORACORE_WORKAROUNDS + /* ugh, work around some applications which open everything O_DIRECT + + * O_APPEND and really don't mean to use O_DIRECT. 
*/ + if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS && + (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT)) + filp->f_flags &= ~O_DIRECT; +#endif + + down(&inode->i_sem); + /* to match setattr's i_sem -> i_alloc_sem -> rw_lock ordering */ + if (filp->f_flags & O_DIRECT) { + have_alloc_sem = 1; + down_read(&inode->i_alloc_sem); + } + + /* concurrent O_DIRECT writes are allowed */ + rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; + ret = ocfs2_rw_lock(inode, rw_level); + if (ret < 0) { + rw_level = -1; + mlog_errno(ret); + goto out; + } + + /* + * We sample i_size under a read level meta lock to see if our write + * is extending the file, if it is we back off and get a write level + * meta lock. + */ + meta_level = (filp->f_flags & O_APPEND) ? 1 : 0; + for(;;) { + ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level); + if (ret < 0) { + meta_level = -1; + mlog_errno(ret); + goto out; + } + + /* Clear suid / sgid if necessary. We do this here + * instead of later in the write path because + * remove_suid() calls ->setattr without any hint that + * we may have already done our cluster locking. Since + * ocfs2_setattr() *must* take cluster locks to + * proceeed, this will lead us to recursively lock the + * inode. There's also the dinode i_size state which + * can be lost via setattr during extending writes (we + * set inode->i_size at the end of a write. */ + if (ocfs2_write_should_remove_suid(inode)) { + if (meta_level == 0) { + ocfs2_meta_unlock(inode, meta_level); + meta_level = 1; + continue; + } + + ret = ocfs2_write_remove_suid(inode); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + /* work on a copy of ppos until we're sure that we won't have + * to recalculate it due to relocking. 
*/ + if (filp->f_flags & O_APPEND) { + saved_pos = i_size_read(inode); + mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); + } else { + saved_pos = iocb->ki_pos; + } + newsize = count + saved_pos; + + mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n", + saved_pos, newsize, i_size_read(inode)); + + /* No need for a higher level metadata lock if we're + * never going past i_size. */ + if (newsize <= i_size_read(inode)) + break; + + if (meta_level == 0) { + ocfs2_meta_unlock(inode, meta_level); + meta_level = 1; + continue; + } + + spin_lock(&OCFS2_I(inode)->ip_lock); + clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - + OCFS2_I(inode)->ip_clusters; + spin_unlock(&OCFS2_I(inode)->ip_lock); + + mlog(0, "Writing at EOF, may need more allocation: " + "i_size = %lld, newsize = %"MLFu64", need %u clusters\n", + i_size_read(inode), newsize, clusters); + + /* We only want to continue the rest of this loop if + * our extend will actually require more + * allocation. */ + if (!clusters) + break; + + ret = ocfs2_extend_allocation(inode, clusters); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + /* Fill any holes which would've been created by this + * write. If we're O_APPEND, this will wind up + * (correctly) being a noop. 
*/ + ret = ocfs2_zero_extend(inode, (u64) newsize - count); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + break; + } + + /* ok, we're done with i_size and alloc work */ + iocb->ki_pos = saved_pos; + ocfs2_meta_unlock(inode, meta_level); + meta_level = -1; + + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_rw_locked(iocb); + +#ifdef OCFS2_ORACORE_WORKAROUNDS + if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS && + filp->f_flags & O_DIRECT) { + unsigned int saved_flags = filp->f_flags; + int sector_size = 1 << osb->s_sectsize_bits; + + if ((saved_pos & (sector_size - 1)) || + (count & (sector_size - 1)) || + ((unsigned long)buf & (sector_size - 1))) { + filp->f_flags |= O_SYNC; + filp->f_flags &= ~O_DIRECT; + } + + ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, + &iocb->ki_pos); + + filp->f_flags = saved_flags; + } else +#endif + ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, + &iocb->ki_pos); + + /* buffered aio wouldn't have proper lock coverage today */ + BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); + + /* + * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io + * function pointer which is called when o_direct io completes so that + * it can unlock our rw lock. (it's the clustered equivalent of + * i_alloc_sem; protects truncate from racing with pending ios). + * Unfortunately there are error cases which call end_io and others + * that don't. so we don't have to unlock the rw_lock if either an + * async dio is going to do it in the future or an end_io after an + * error has already done it. 
/*
 * ->aio_read for regular files.
 *
 * Buffered reads are passed straight to generic_file_aio_read().
 * O_DIRECT reads first take i_alloc_sem (read) and the cluster rw lock
 * at level 0 and mark the iocb as rw-locked.
 *
 * Returns whatever generic_file_aio_read() returns, -EINVAL if the
 * file has no inode, or the negative status from ocfs2_rw_lock().
 */
static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
				   char __user *buf,
				   size_t count,
				   loff_t pos)
{
	/* rw_level == -1 and have_alloc_sem == 0 mean "nothing for this
	 * function to release at bail". */
	int ret = 0, rw_level = -1, have_alloc_sem = 0;
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_dentry->d_inode;
#ifdef OCFS2_ORACORE_WORKAROUNDS
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
#endif

	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
		   (unsigned int)count,
		   filp->f_dentry->d_name.len,
		   filp->f_dentry->d_name.name);

	if (!inode) {
		ret = -EINVAL;
		mlog_errno(ret);
		goto bail;
	}

#ifdef OCFS2_ORACORE_WORKAROUNDS
	/* With the COMPAT_OCFS mount option, an O_DIRECT read whose
	 * position, length, buffer address or the file size is not
	 * sector aligned has O_DIRECT cleared on the file. */
	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
		if (filp->f_flags & O_DIRECT) {
			int sector_size = 1 << osb->s_sectsize_bits;

			if ((pos & (sector_size - 1)) ||
			    (count & (sector_size - 1)) ||
			    ((unsigned long)buf & (sector_size - 1)) ||
			    (i_size_read(inode) & (sector_size -1))) {
				filp->f_flags &= ~O_DIRECT;
			}
		}
	}
#endif

	/*
	 * buffered reads protect themselves in ->readpage().  O_DIRECT reads
	 * need locks to protect pending reads from racing with truncate.
	 */
	if (filp->f_flags & O_DIRECT) {
		down_read(&inode->i_alloc_sem);
		have_alloc_sem = 1;

		ret = ocfs2_rw_lock(inode, 0);
		if (ret < 0) {
			mlog_errno(ret);
			goto bail;
		}
		rw_level = 0;
		/* communicate with ocfs2_dio_end_io */
		ocfs2_iocb_set_rw_locked(iocb);
	}

	ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
	if (ret == -EINVAL)
		mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");

	/* buffered aio wouldn't have proper lock coverage today */
	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));

	/* see ocfs2_file_aio_write */
	/* If the io was queued, or the iocb is no longer marked
	 * rw-locked, the locks are not ours to drop below. */
	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
		rw_level = -1;
		have_alloc_sem = 0;
	}

bail:
	if (have_alloc_sem)
		up_read(&inode->i_alloc_sem);
	if (rw_level != -1)
		ocfs2_rw_unlock(inode, rw_level);
	mlog_exit(ret);

	return ret;
}
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_FILE_H +#define OCFS2_FILE_H + +extern struct file_operations ocfs2_fops; +extern struct file_operations ocfs2_dops; +extern struct inode_operations ocfs2_file_iops; +extern struct inode_operations ocfs2_special_file_iops; +struct ocfs2_alloc_context; + +enum ocfs2_alloc_restarted { + RESTART_NONE = 0, + RESTART_TRANS, + RESTART_META +}; +int ocfs2_do_extend_allocation(struct ocfs2_super *osb, + struct inode *inode, + u32 clusters_to_add, + struct buffer_head *fe_bh, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason); +int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); +int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); + +int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size); + +#endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c new file mode 100644 index 00000000000..0bbd22f46c8 --- /dev/null +++ b/fs/ocfs2/heartbeat.c @@ -0,0 +1,378 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * heartbeat.c + * + * Register ourselves with 
the heartbaet service, keep our node maps + * up to date, and fire off recovery when needed. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/kmod.h> + +#include <cluster/heartbeat.h> +#include <cluster/nodemanager.h> + +#include <dlm/dlmapi.h> + +#define MLOG_MASK_PREFIX ML_SUPER +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "vote.h" + +#include "buffer_head_io.h" + +#define OCFS2_HB_NODE_DOWN_PRI (0x0000002) +#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI + +static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, + int bit); +static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, + int bit); +static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map); +static void __ocfs2_node_map_dup(struct ocfs2_node_map *target, + struct ocfs2_node_map *from); +static void __ocfs2_node_map_set(struct ocfs2_node_map *target, + struct ocfs2_node_map *from); + +void ocfs2_init_node_maps(struct ocfs2_super *osb) +{ + spin_lock_init(&osb->node_map_lock); + 
ocfs2_node_map_init(&osb->mounted_map); + ocfs2_node_map_init(&osb->recovery_map); + ocfs2_node_map_init(&osb->umount_map); +} + +static void ocfs2_do_node_down(int node_num, + struct ocfs2_super *osb) +{ + BUG_ON(osb->node_num == node_num); + + mlog(0, "ocfs2: node down event for %d\n", node_num); + + if (!osb->dlm) { + /* + * No DLM means we're not even ready to participate yet. + * We check the slots after the DLM comes up, so we will + * notice the node death then. We can safely ignore it + * here. + */ + return; + } + + if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) { + /* If a node is in the umount map, then we've been + * expecting him to go down and we know ahead of time + * that recovery is not necessary. */ + ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num); + return; + } + + ocfs2_recovery_thread(osb, node_num); + + ocfs2_remove_node_from_vote_queues(osb, node_num); +} + +static void ocfs2_hb_node_down_cb(struct o2nm_node *node, + int node_num, + void *data) +{ + ocfs2_do_node_down(node_num, (struct ocfs2_super *) data); +} + +/* Called from the dlm when it's about to evict a node. We may also + * get a heartbeat callback later. 
*/ +static void ocfs2_dlm_eviction_cb(int node_num, + void *data) +{ + struct ocfs2_super *osb = (struct ocfs2_super *) data; + struct super_block *sb = osb->sb; + + mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n", + MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num); + + ocfs2_do_node_down(node_num, osb); +} + +static void ocfs2_hb_node_up_cb(struct o2nm_node *node, + int node_num, + void *data) +{ + struct ocfs2_super *osb = data; + + BUG_ON(osb->node_num == node_num); + + mlog(0, "node up event for %d\n", node_num); + ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num); +} + +void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb) +{ + o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB, + ocfs2_hb_node_down_cb, osb, + OCFS2_HB_NODE_DOWN_PRI); + + o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB, + ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI); + + /* Not exactly a heartbeat callback, but leads to essentially + * the same path so we set it up here. */ + dlm_setup_eviction_cb(&osb->osb_eviction_cb, + ocfs2_dlm_eviction_cb, + osb); +} + +/* Most functions here are just stubs for now... */ +int ocfs2_register_hb_callbacks(struct ocfs2_super *osb) +{ + int status; + + status = o2hb_register_callback(&osb->osb_hb_down); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = o2hb_register_callback(&osb->osb_hb_up); + if (status < 0) + mlog_errno(status); + +bail: + return status; +} + +void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb) +{ + int status; + + status = o2hb_unregister_callback(&osb->osb_hb_down); + if (status < 0) + mlog_errno(status); + + status = o2hb_unregister_callback(&osb->osb_hb_up); + if (status < 0) + mlog_errno(status); +} + +void ocfs2_stop_heartbeat(struct ocfs2_super *osb) +{ + int ret; + char *argv[5], *envp[3]; + + if (!osb->uuid_str) { + /* This can happen if we don't get far enough in mount... 
*/ + mlog(0, "No UUID with which to stop heartbeat!\n\n"); + return; + } + + argv[0] = (char *)o2nm_get_hb_ctl_path(); + argv[1] = "-K"; + argv[2] = "-u"; + argv[3] = osb->uuid_str; + argv[4] = NULL; + + mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]); + + /* minimal command environment taken from cpu_run_sbin_hotplug */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + ret = call_usermodehelper(argv[0], argv, envp, 1); + if (ret < 0) + mlog_errno(ret); +} + +/* special case -1 for now + * TODO: should *really* make sure the calling func never passes -1!! */ +void ocfs2_node_map_init(struct ocfs2_node_map *map) +{ + map->num_nodes = OCFS2_NODE_MAP_MAX_NODES; + memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) * + sizeof(unsigned long)); +} + +static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, + int bit) +{ + set_bit(bit, map->map); +} + +void ocfs2_node_map_set_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit) +{ + if (bit==-1) + return; + BUG_ON(bit >= map->num_nodes); + spin_lock(&osb->node_map_lock); + __ocfs2_node_map_set_bit(map, bit); + spin_unlock(&osb->node_map_lock); +} + +static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, + int bit) +{ + clear_bit(bit, map->map); +} + +void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit) +{ + if (bit==-1) + return; + BUG_ON(bit >= map->num_nodes); + spin_lock(&osb->node_map_lock); + __ocfs2_node_map_clear_bit(map, bit); + spin_unlock(&osb->node_map_lock); +} + +int ocfs2_node_map_test_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit) +{ + int ret; + if (bit >= map->num_nodes) { + mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes); + BUG(); + } + spin_lock(&osb->node_map_lock); + ret = test_bit(bit, map->map); + spin_unlock(&osb->node_map_lock); + return ret; +} + +static inline int 
__ocfs2_node_map_is_empty(struct ocfs2_node_map *map) +{ + int bit; + bit = find_next_bit(map->map, map->num_nodes, 0); + if (bit < map->num_nodes) + return 0; + return 1; +} + +int ocfs2_node_map_is_empty(struct ocfs2_super *osb, + struct ocfs2_node_map *map) +{ + int ret; + BUG_ON(map->num_nodes == 0); + spin_lock(&osb->node_map_lock); + ret = __ocfs2_node_map_is_empty(map); + spin_unlock(&osb->node_map_lock); + return ret; +} + +static void __ocfs2_node_map_dup(struct ocfs2_node_map *target, + struct ocfs2_node_map *from) +{ + BUG_ON(from->num_nodes == 0); + ocfs2_node_map_init(target); + __ocfs2_node_map_set(target, from); +} + +/* returns 1 if bit is the only bit set in target, 0 otherwise */ +int ocfs2_node_map_is_only(struct ocfs2_super *osb, + struct ocfs2_node_map *target, + int bit) +{ + struct ocfs2_node_map temp; + int ret; + + spin_lock(&osb->node_map_lock); + __ocfs2_node_map_dup(&temp, target); + __ocfs2_node_map_clear_bit(&temp, bit); + ret = __ocfs2_node_map_is_empty(&temp); + spin_unlock(&osb->node_map_lock); + + return ret; +} + +static void __ocfs2_node_map_set(struct ocfs2_node_map *target, + struct ocfs2_node_map *from) +{ + int num_longs, i; + + BUG_ON(target->num_nodes != from->num_nodes); + BUG_ON(target->num_nodes == 0); + + num_longs = BITS_TO_LONGS(target->num_nodes); + for (i = 0; i < num_longs; i++) + target->map[i] = from->map[i]; +} + +/* Returns whether the recovery bit was actually set - it may not be + * if a node is still marked as needing recovery */ +int ocfs2_recovery_map_set(struct ocfs2_super *osb, + int num) +{ + int set = 0; + + spin_lock(&osb->node_map_lock); + + __ocfs2_node_map_clear_bit(&osb->mounted_map, num); + + if (!test_bit(num, osb->recovery_map.map)) { + __ocfs2_node_map_set_bit(&osb->recovery_map, num); + set = 1; + } + + spin_unlock(&osb->node_map_lock); + + return set; +} + +void ocfs2_recovery_map_clear(struct ocfs2_super *osb, + int num) +{ + ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num); +} + +int 
ocfs2_node_map_iterate(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int idx) +{ + int i = idx; + + idx = O2NM_INVALID_NODE_NUM; + spin_lock(&osb->node_map_lock); + if ((i != O2NM_INVALID_NODE_NUM) && + (i >= 0) && + (i < map->num_nodes)) { + while(i < map->num_nodes) { + if (test_bit(i, map->map)) { + idx = i; + break; + } + i++; + } + } + spin_unlock(&osb->node_map_lock); + return idx; +} diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h new file mode 100644 index 00000000000..e8fb079122e --- /dev/null +++ b/fs/ocfs2/heartbeat.h @@ -0,0 +1,67 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * heartbeat.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_HEARTBEAT_H +#define OCFS2_HEARTBEAT_H + +void ocfs2_init_node_maps(struct ocfs2_super *osb); + +void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb); +int ocfs2_register_hb_callbacks(struct ocfs2_super *osb); +void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb); +void ocfs2_stop_heartbeat(struct ocfs2_super *osb); + +/* node map functions - used to keep track of mounted and in-recovery + * nodes. 
*/ +void ocfs2_node_map_init(struct ocfs2_node_map *map); +int ocfs2_node_map_is_empty(struct ocfs2_super *osb, + struct ocfs2_node_map *map); +void ocfs2_node_map_set_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit); +void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit); +int ocfs2_node_map_test_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit); +int ocfs2_node_map_iterate(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int idx); +static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map) +{ + return ocfs2_node_map_iterate(osb, map, 0); +} +int ocfs2_recovery_map_set(struct ocfs2_super *osb, + int num); +void ocfs2_recovery_map_clear(struct ocfs2_super *osb, + int num); +/* returns 1 if bit is the only bit set in target, 0 otherwise */ +int ocfs2_node_map_is_only(struct ocfs2_super *osb, + struct ocfs2_node_map *target, + int bit); + +#endif /* OCFS2_HEARTBEAT_H */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c new file mode 100644 index 00000000000..a91ba4dec93 --- /dev/null +++ b/fs/ocfs2/inode.c @@ -0,0 +1,1140 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * inode.c + * + * vfs' aops, fops, dops and iops + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/smp_lock.h> + +#include <asm/byteorder.h> + +#define MLOG_MASK_PREFIX ML_INODE +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "namei.h" +#include "suballoc.h" +#include "super.h" +#include "symlink.h" +#include "sysfile.h" +#include "uptodate.h" +#include "vote.h" + +#include "buffer_head_io.h" + +#define OCFS2_FI_FLAG_NOWAIT 0x1 +#define OCFS2_FI_FLAG_DELETE 0x2 +struct ocfs2_find_inode_args +{ + u64 fi_blkno; + unsigned long fi_ino; + unsigned int fi_flags; +}; + +static int ocfs2_read_locked_inode(struct inode *inode, + struct ocfs2_find_inode_args *args); +static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); +static int ocfs2_find_actor(struct inode *inode, void *opaque); +static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh); + +struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, + u64 blkno, + int delete_vote) +{ + struct ocfs2_find_inode_args args; + + /* ocfs2_ilookup_for_vote should *only* be called from the + * vote thread */ + BUG_ON(current != osb->vote_task); + + args.fi_blkno = blkno; + args.fi_flags = OCFS2_FI_FLAG_NOWAIT; + if (delete_vote) + args.fi_flags |= OCFS2_FI_FLAG_DELETE; + args.fi_ino = ino_from_blkno(osb->sb, blkno); + return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); +} + +struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno) +{ + struct inode *inode = NULL; + struct super_block *sb = osb->sb; + struct 
ocfs2_find_inode_args args; + + mlog_entry("(blkno = %"MLFu64")\n", blkno); + + /* Ok. By now we've either got the offsets passed to us by the + * caller, or we just pulled them off the bh. Lets do some + * sanity checks to make sure they're OK. */ + if (blkno == 0) { + inode = ERR_PTR(-EINVAL); + mlog_errno(PTR_ERR(inode)); + goto bail; + } + + args.fi_blkno = blkno; + args.fi_flags = 0; + args.fi_ino = ino_from_blkno(sb, blkno); + + inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, + ocfs2_init_locked_inode, &args); + /* inode was *not* in the inode cache. 2.6.x requires + * us to do our own read_inode call and unlock it + * afterwards. */ + if (inode && inode->i_state & I_NEW) { + mlog(0, "Inode was not in inode cache, reading it.\n"); + ocfs2_read_locked_inode(inode, &args); + unlock_new_inode(inode); + } + if (inode == NULL) { + inode = ERR_PTR(-ENOMEM); + mlog_errno(PTR_ERR(inode)); + goto bail; + } + if (is_bad_inode(inode)) { + iput(inode); + inode = ERR_PTR(-ESTALE); + mlog_errno(PTR_ERR(inode)); + goto bail; + } + +bail: + if (!IS_ERR(inode)) { + mlog(0, "returning inode with number %"MLFu64"\n", + OCFS2_I(inode)->ip_blkno); + mlog_exit_ptr(inode); + } else + mlog_errno(PTR_ERR(inode)); + + return inode; +} + + +/* + * here's how inodes get read from disk: + * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR + * found? : return the in-memory inode + * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE + */ + +static int ocfs2_find_actor(struct inode *inode, void *opaque) +{ + struct ocfs2_find_inode_args *args = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + int ret = 0; + + mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque); + + args = opaque; + + mlog_bug_on_msg(!inode, "No inode in find actor!\n"); + + if (oi->ip_blkno != args->fi_blkno) + goto bail; + + /* OCFS2_FI_FLAG_NOWAIT is *only* set from + * ocfs2_ilookup_for_vote which won't create an inode for one + * that isn't found. 
The vote thread which doesn't want to get + * an inode which is in the process of going away - otherwise + * the call to __wait_on_freeing_inode in find_inode_fast will + * cause it to deadlock on an inode which may be waiting on a + * vote (or lock release) in delete_inode */ + if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) && + (inode->i_state & (I_FREEING|I_CLEAR))) { + /* As stated above, we're not going to return an + * inode. In the case of a delete vote, the voting + * code is going to signal the other node to go + * ahead. Mark that state here, so this freeing inode + * has the state when it gets to delete_inode. */ + if (args->fi_flags & OCFS2_FI_FLAG_DELETE) { + spin_lock(&oi->ip_lock); + ocfs2_mark_inode_remotely_deleted(inode); + spin_unlock(&oi->ip_lock); + } + goto bail; + } + + ret = 1; +bail: + mlog_exit(ret); + return ret; +} + +/* + * initialize the new inode, but don't do anything that would cause + * us to sleep. + * return 0 on success, 1 on failure + */ +static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) +{ + struct ocfs2_find_inode_args *args = opaque; + + mlog_entry("inode = %p, opaque = %p\n", inode, opaque); + + inode->i_ino = args->fi_ino; + OCFS2_I(inode)->ip_blkno = args->fi_blkno; + + mlog_exit(0); + return 0; +} + +int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, + int create_ino) +{ + struct super_block *sb; + struct ocfs2_super *osb; + int status = -EINVAL; + + mlog_entry("(0x%p, size:%"MLFu64")\n", inode, fe->i_size); + + sb = inode->i_sb; + osb = OCFS2_SB(sb); + + /* this means that read_inode cannot create a superblock inode + * today. change if needed. 
*/ + if (!OCFS2_IS_VALID_DINODE(fe) || + !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + mlog(ML_ERROR, "Invalid dinode: i_ino=%lu, i_blkno=%"MLFu64", " + "signature = %.*s, flags = 0x%x\n", + inode->i_ino, le64_to_cpu(fe->i_blkno), 7, + fe->i_signature, le32_to_cpu(fe->i_flags)); + goto bail; + } + + if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) { + mlog(ML_ERROR, "file entry generation does not match " + "superblock! osb->fs_generation=%x, " + "fe->i_fs_generation=%x\n", + osb->fs_generation, le32_to_cpu(fe->i_fs_generation)); + goto bail; + } + + inode->i_version = 1; + inode->i_generation = le32_to_cpu(fe->i_generation); + inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); + inode->i_mode = le16_to_cpu(fe->i_mode); + inode->i_uid = le32_to_cpu(fe->i_uid); + inode->i_gid = le32_to_cpu(fe->i_gid); + inode->i_blksize = (u32)osb->s_clustersize; + + /* Fast symlinks will have i_size but no allocated clusters. */ + if (S_ISLNK(inode->i_mode) && !fe->i_clusters) + inode->i_blocks = 0; + else + inode->i_blocks = + ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size)); + inode->i_mapping->a_ops = &ocfs2_aops; + inode->i_flags |= S_NOATIME; + inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); + inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); + inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); + inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); + inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); + inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + + if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno)) + mlog(ML_ERROR, + "ip_blkno %"MLFu64" != i_blkno %"MLFu64"!\n", + OCFS2_I(inode)->ip_blkno, fe->i_blkno); + + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT; + + if (create_ino) + inode->i_ino = ino_from_blkno(inode->i_sb, + le64_to_cpu(fe->i_blkno)); + + mlog(0, "blkno = %"MLFu64", ino = %lu, create_ino = %s\n", + fe->i_blkno, inode->i_ino, 
create_ino ? "true" : "false"); + + inode->i_nlink = le16_to_cpu(fe->i_links_count); + + if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; + mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); + } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; + } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { + mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino); + /* we can't actually hit this as read_inode can't + * handle superblocks today ;-) */ + BUG(); + } + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_fop = &ocfs2_fops; + inode->i_op = &ocfs2_file_iops; + i_size_write(inode, le64_to_cpu(fe->i_size)); + break; + case S_IFDIR: + inode->i_op = &ocfs2_dir_iops; + inode->i_fop = &ocfs2_dops; + i_size_write(inode, le64_to_cpu(fe->i_size)); + break; + case S_IFLNK: + if (ocfs2_inode_is_fast_symlink(inode)) + inode->i_op = &ocfs2_fast_symlink_inode_operations; + else + inode->i_op = &ocfs2_symlink_inode_operations; + i_size_write(inode, le64_to_cpu(fe->i_size)); + break; + default: + inode->i_op = &ocfs2_special_file_iops; + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + break; + } + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, + OCFS2_LOCK_TYPE_RW, inode); + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, + OCFS2_LOCK_TYPE_META, inode); + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres, + OCFS2_LOCK_TYPE_DATA, inode); + + status = 0; +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_read_locked_inode(struct inode *inode, + struct ocfs2_find_inode_args *args) +{ + struct super_block *sb; + struct ocfs2_super *osb; + struct ocfs2_dinode *fe; + struct buffer_head *bh = NULL; + int status; + int sysfile = 0; + + mlog_entry("(0x%p, 0x%p)\n", inode, args); + + status = -EINVAL; + if (inode == NULL || inode->i_sb == NULL) { + mlog(ML_ERROR, "bad inode\n"); + goto bail; 
+ } + sb = inode->i_sb; + osb = OCFS2_SB(sb); + + if (!args) { + mlog(ML_ERROR, "bad inode args\n"); + make_bad_inode(inode); + goto bail; + } + + /* Read the FE off disk. This is safe because the kernel only + * does one read_inode2 for a new inode, and if it doesn't + * exist yet then nobody can be working on it! */ + status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL); + if (status < 0) { + mlog_errno(status); + make_bad_inode(inode); + goto bail; + } + + fe = (struct ocfs2_dinode *) bh->b_data; + if (!OCFS2_IS_VALID_DINODE(fe)) { + mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n", + fe->i_blkno, 7, fe->i_signature); + make_bad_inode(inode); + goto bail; + } + + if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) + sysfile = 1; + + if (S_ISCHR(le16_to_cpu(fe->i_mode)) || + S_ISBLK(le16_to_cpu(fe->i_mode))) + inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); + + status = -EINVAL; + if (ocfs2_populate_inode(inode, fe, 0) < 0) { + mlog(ML_ERROR, "populate inode failed! 
i_blkno=%"MLFu64", " + "i_ino=%lu\n", fe->i_blkno, inode->i_ino); + make_bad_inode(inode); + goto bail; + } + + BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); + + if (sysfile) + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; + + status = 0; + +bail: + if (args && bh) + brelse(bh); + + mlog_exit(status); + return status; +} + +void ocfs2_sync_blockdev(struct super_block *sb) +{ + sync_blockdev(sb->s_bdev); +} + +static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh) +{ + int status = 0; + struct ocfs2_journal_handle *handle = NULL; + struct ocfs2_truncate_context *tc = NULL; + struct ocfs2_dinode *fe; + + mlog_entry_void(); + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + /* zero allocation, zero truncate :) */ + if (!fe->i_clusters) + goto bail; + + handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_commit_trans(handle); + handle = NULL; + + status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); + if (status < 0) { + mlog_errno(status); + goto bail; + } +bail: + if (handle) + ocfs2_commit_trans(handle); + + mlog_exit(status); + return status; +} + +static int ocfs2_remove_inode(struct inode *inode, + struct buffer_head *di_bh, + struct inode *orphan_dir_inode, + struct buffer_head *orphan_dir_bh) +{ + int status; + struct inode *inode_alloc_inode = NULL; + struct buffer_head *inode_alloc_bh = NULL; + struct ocfs2_journal_handle *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + + inode_alloc_inode = + ocfs2_get_system_file_inode(osb, 
INODE_ALLOC_SYSTEM_INODE, + le16_to_cpu(di->i_suballoc_slot)); + if (!inode_alloc_inode) { + status = -EEXIST; + mlog_errno(status); + goto bail; + } + + down(&inode_alloc_inode->i_sem); + status = ocfs2_meta_lock(inode_alloc_inode, NULL, &inode_alloc_bh, 1); + if (status < 0) { + up(&inode_alloc_inode->i_sem); + + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, NULL, OCFS2_DELETE_INODE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, + orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + /* set the inodes dtime */ + status = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); + le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); + + status = ocfs2_journal_dirty(handle, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + ocfs2_remove_from_cache(inode, di_bh); + + status = ocfs2_free_dinode(handle, inode_alloc_inode, + inode_alloc_bh, di); + if (status < 0) + mlog_errno(status); + +bail_commit: + ocfs2_commit_trans(handle); +bail_unlock: + ocfs2_meta_unlock(inode_alloc_inode, 1); + up(&inode_alloc_inode->i_sem); + brelse(inode_alloc_bh); +bail: + iput(inode_alloc_inode); + + return status; +} + +static int ocfs2_wipe_inode(struct inode *inode, + struct buffer_head *di_bh) +{ + int status, orphaned_slot; + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + /* We've already voted on this so it should be readonly - no + * spinlock needed. 
*/ + orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + orphaned_slot); + if (!orphan_dir_inode) { + status = -EEXIST; + mlog_errno(status); + goto bail; + } + + /* Lock the orphan dir. The lock will be held for the entire + * delete_inode operation. We do this now to avoid races with + * recovery completion on other nodes. */ + down(&orphan_dir_inode->i_sem); + status = ocfs2_meta_lock(orphan_dir_inode, NULL, &orphan_dir_bh, 1); + if (status < 0) { + up(&orphan_dir_inode->i_sem); + + mlog_errno(status); + goto bail; + } + + /* we do this while holding the orphan dir lock because we + * don't want recovery being run from another node to vote for + * an inode delete on us -- this will result in two nodes + * truncating the same file! */ + status = ocfs2_truncate_for_delete(osb, inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_dir; + } + + status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, + orphan_dir_bh); + if (status < 0) + mlog_errno(status); + +bail_unlock_dir: + ocfs2_meta_unlock(orphan_dir_inode, 1); + up(&orphan_dir_inode->i_sem); + brelse(orphan_dir_bh); +bail: + iput(orphan_dir_inode); + + return status; +} + +/* There is a series of simple checks that should be done before a + * vote is even considered. Encapsulate those in this function. */ +static int ocfs2_inode_is_valid_to_delete(struct inode *inode) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + /* We shouldn't be getting here for the root directory + * inode.. */ + if (inode == osb->root_inode) { + mlog(ML_ERROR, "Skipping delete of root inode.\n"); + goto bail; + } + + /* If we're coming from process_vote we can't go into our own + * voting [hello, deadlock city!], so unforuntately we just + * have to skip deleting this guy. 
That's OK though because + * the node who's doing the actual deleting should handle it + * anyway. */ + if (current == osb->vote_task) { + mlog(0, "Skipping delete of %lu because we're currently " + "in process_vote\n", inode->i_ino); + goto bail; + } + + spin_lock(&oi->ip_lock); + /* OCFS2 *never* deletes system files. This should technically + * never get here as system file inodes should always have a + * positive link count. */ + if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) { + mlog(ML_ERROR, "Skipping delete of system file %"MLFu64".\n", + oi->ip_blkno); + goto bail_unlock; + } + + /* If we have voted "yes" on the wipe of this inode for + * another node, it will be marked here so we can safely skip + * it. Recovery will cleanup any inodes we might inadvertantly + * skip here. */ + if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) { + mlog(0, "Skipping delete of %lu because another node " + "has done this for us.\n", inode->i_ino); + goto bail_unlock; + } + + ret = 1; +bail_unlock: + spin_unlock(&oi->ip_lock); +bail: + return ret; +} + +/* Query the cluster to determine whether we should wipe an inode from + * disk or not. + * + * Requires the inode to have the cluster lock. */ +static int ocfs2_query_inode_wipe(struct inode *inode, + struct buffer_head *di_bh, + int *wipe) +{ + int status = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di; + + *wipe = 0; + + /* While we were waiting for the cluster lock in + * ocfs2_delete_inode, another node might have asked to delete + * the inode. Recheck our flags to catch this. */ + if (!ocfs2_inode_is_valid_to_delete(inode)) { + mlog(0, "Skipping delete of %"MLFu64" because flags changed\n", + oi->ip_blkno); + goto bail; + } + + /* Now that we have an up to date inode, we can double check + * the link count. */ + if (inode->i_nlink) { + mlog(0, "Skipping delete of %"MLFu64" because nlink = %u\n", + oi->ip_blkno, inode->i_nlink); + goto bail; + } + + /* Do some basic inode verification... 
*/ + di = (struct ocfs2_dinode *) di_bh->b_data; + if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) { + /* for lack of a better error? */ + status = -EEXIST; + mlog(ML_ERROR, + "Inode %"MLFu64" (on-disk %"MLFu64") not orphaned! " + "Disk flags 0x%x, inode flags 0x%x\n", + oi->ip_blkno, di->i_blkno, di->i_flags, oi->ip_flags); + goto bail; + } + + /* has someone already deleted us?! baaad... */ + if (di->i_dtime) { + status = -EEXIST; + mlog_errno(status); + goto bail; + } + + status = ocfs2_request_delete_vote(inode); + /* -EBUSY means that other nodes are still using the + * inode. We're done here though, so avoid doing anything on + * disk and let them worry about deleting it. */ + if (status == -EBUSY) { + status = 0; + mlog(0, "Skipping delete of %"MLFu64" because it is in use on" + "other nodes\n", oi->ip_blkno); + goto bail; + } + if (status < 0) { + mlog_errno(status); + goto bail; + } + + spin_lock(&oi->ip_lock); + if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) { + /* Nobody knew which slot this inode was orphaned + * into. This may happen during node death and + * recovery knows how to clean it up so we can safely + * ignore this inode for now on. */ + mlog(0, "Nobody knew where inode %"MLFu64" was orphaned!\n", + oi->ip_blkno); + } else { + *wipe = 1; + + mlog(0, "Inode %"MLFu64" is ok to wipe from orphan dir %d\n", + oi->ip_blkno, oi->ip_orphaned_slot); + } + spin_unlock(&oi->ip_lock); + +bail: + return status; +} + +/* Support function for ocfs2_delete_inode. Will help us keep the + * inode data in a consistent state for clear_inode. Always truncates + * pages, optionally sync's them first. 
*/ +static void ocfs2_cleanup_delete_inode(struct inode *inode, + int sync_data) +{ + mlog(0, "Cleanup inode %"MLFu64", sync = %d\n", + OCFS2_I(inode)->ip_blkno, sync_data); + if (sync_data) + write_inode_now(inode, 1); + truncate_inode_pages(&inode->i_data, 0); +} + +void ocfs2_delete_inode(struct inode *inode) +{ + int wipe, status; + sigset_t blocked, oldset; + struct buffer_head *di_bh = NULL; + + mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); + + if (is_bad_inode(inode)) { + mlog(0, "Skipping delete of bad inode\n"); + goto bail; + } + + if (!ocfs2_inode_is_valid_to_delete(inode)) { + /* It's probably not necessary to truncate_inode_pages + * here but we do it for safety anyway (it will most + * likely be a no-op anyway) */ + ocfs2_cleanup_delete_inode(inode, 0); + goto bail; + } + + /* We want to block signals in delete_inode as the lock and + * messaging paths may return us -ERESTARTSYS. Which would + * cause us to exit early, resulting in inodes being orphaned + * forever. */ + sigfillset(&blocked); + status = sigprocmask(SIG_BLOCK, &blocked, &oldset); + if (status < 0) { + mlog_errno(status); + ocfs2_cleanup_delete_inode(inode, 1); + goto bail; + } + + /* Lock down the inode. This gives us an up to date view of + * it's metadata (for verification), and allows us to + * serialize delete_inode votes. + * + * Even though we might be doing a truncate, we don't take the + * allocation lock here as it won't be needed - nobody will + * have the file open. + */ + status = ocfs2_meta_lock(inode, NULL, &di_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + ocfs2_cleanup_delete_inode(inode, 0); + goto bail_unblock; + } + + /* Query the cluster. This will be the final decision made + * before we go ahead and wipe the inode. */ + status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); + if (!wipe || status < 0) { + /* Error and inode busy vote both mean we won't be + * removing the inode, so they take almost the same + * path. 
*/ + if (status < 0) + mlog_errno(status); + + /* Someone in the cluster has voted to not wipe this + * inode, or it was never completely orphaned. Write + * out the pages and exit now. */ + ocfs2_cleanup_delete_inode(inode, 1); + goto bail_unlock_inode; + } + + ocfs2_cleanup_delete_inode(inode, 0); + + status = ocfs2_wipe_inode(inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_inode; + } + + /* Mark the inode as successfully deleted. This is important + * for ocfs2_clear_inode as it will check this flag and skip + * any checkpointing work */ + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; + +bail_unlock_inode: + ocfs2_meta_unlock(inode, 1); + brelse(di_bh); +bail_unblock: + status = sigprocmask(SIG_SETMASK, &oldset, NULL); + if (status < 0) + mlog_errno(status); +bail: + clear_inode(inode); + mlog_exit_void(); +} + +void ocfs2_clear_inode(struct inode *inode) +{ + int status; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + mlog_entry_void(); + + if (!inode) + goto bail; + + mlog(0, "Clearing inode: %"MLFu64", nlink = %u\n", + OCFS2_I(inode)->ip_blkno, inode->i_nlink); + + mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, + "Inode=%lu\n", inode->i_ino); + + /* Do these before all the other work so that we don't bounce + * the vote thread while waiting to destroy the locks. */ + ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); + ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); + ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); + + /* We very well may get a clear_inode before all an inodes + * metadata has hit disk. Of course, we can't drop any cluster + * locks until the journal has finished with it. The only + * exception here are successfully wiped inodes - their + * metadata can now be considered to be part of the system + * inodes from which it came. 
*/ + if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED)) + ocfs2_checkpoint_inode(inode); + + mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), + "Clear inode of %"MLFu64", inode has io markers\n", + oi->ip_blkno); + + ocfs2_extent_map_drop(inode, 0); + ocfs2_extent_map_init(inode); + + status = ocfs2_drop_inode_locks(inode); + if (status < 0) + mlog_errno(status); + + ocfs2_lock_res_free(&oi->ip_rw_lockres); + ocfs2_lock_res_free(&oi->ip_meta_lockres); + ocfs2_lock_res_free(&oi->ip_data_lockres); + + ocfs2_metadata_cache_purge(inode); + + mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached, + "Clear inode of %"MLFu64", inode has %u cache items\n", + oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached); + + mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), + "Clear inode of %"MLFu64", inode has a bad flag\n", + oi->ip_blkno); + + mlog_bug_on_msg(spin_is_locked(&oi->ip_lock), + "Clear inode of %"MLFu64", inode is locked\n", + oi->ip_blkno); + + mlog_bug_on_msg(down_trylock(&oi->ip_io_sem), + "Clear inode of %"MLFu64", io_sem is locked\n", + oi->ip_blkno); + up(&oi->ip_io_sem); + + /* + * down_trylock() returns 0, down_write_trylock() returns 1 + * kernel 1, world 0 + */ + mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem), + "Clear inode of %"MLFu64", alloc_sem is locked\n", + oi->ip_blkno); + up_write(&oi->ip_alloc_sem); + + mlog_bug_on_msg(oi->ip_open_count, + "Clear inode of %"MLFu64" has open count %d\n", + oi->ip_blkno, oi->ip_open_count); + mlog_bug_on_msg(!list_empty(&oi->ip_handle_list), + "Clear inode of %"MLFu64" has non empty handle list\n", + oi->ip_blkno); + mlog_bug_on_msg(oi->ip_handle, + "Clear inode of %"MLFu64" has non empty handle pointer\n", + oi->ip_blkno); + + /* Clear all other flags. 
*/ + oi->ip_flags = OCFS2_INODE_CACHE_INLINE; + oi->ip_created_trans = 0; + oi->ip_last_trans = 0; + oi->ip_dir_start_lookup = 0; + oi->ip_blkno = 0ULL; + +bail: + mlog_exit_void(); +} + +/* Called under inode_lock, with no more references on the + * struct inode, so it's safe here to check the flags field + * and to manipulate i_nlink without any other locks. */ +void ocfs2_drop_inode(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + mlog_entry_void(); + + mlog(0, "Drop inode %"MLFu64", nlink = %u, ip_flags = 0x%x\n", + oi->ip_blkno, inode->i_nlink, oi->ip_flags); + + /* Testing ip_orphaned_slot here wouldn't work because we may + * not have gotten a delete_inode vote from any other nodes + * yet. */ + if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) { + mlog(0, "Inode was orphaned on another node, clearing nlink.\n"); + inode->i_nlink = 0; + } + + generic_drop_inode(inode); + + mlog_exit_void(); +} + +/* + * TODO: this should probably be merged into ocfs2_get_block + * + * However, you now need to pay attention to the cont_prepare_write() + * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much + * expects never to extend). 
+ */
+
+/*
+ * Read virtual block 'block' of 'inode' and hand back its buffer_head.
+ *
+ * The virtual block is mapped with ocfs2_extent_map_get_blocks() and
+ * then read with ocfs2_read_block().
+ *
+ * Returns the buffer with *err set to 0 on success. If the mapping or
+ * the read fails, NULL is returned and *err is set to -EIO. A block at
+ * or beyond i_size is only tolerated for readahead requests
+ * (reada != 0); NULL is returned for those with *err set to 0.
+ */
+struct buffer_head *ocfs2_bread(struct inode *inode,
+				int block, int *err, int reada)
+{
+	struct buffer_head *bh = NULL;
+	int tmperr;
+	u64 p_blkno;
+	int readflags = OCFS2_BH_CACHED;
+
+#if 0
+	/* only turn this on if we know we can deal with read_block
+	 * returning nothing */
+	if (reada)
+		readflags |= OCFS2_BH_READAHEAD;
+#endif
+
+	if (((u64)block << inode->i_sb->s_blocksize_bits) >=
+	    i_size_read(inode)) {
+		BUG_ON(!reada);
+		/* Nothing to read ahead; don't hand back an undefined
+		 * *err to the caller. */
+		*err = 0;
+		return NULL;
+	}
+
+	tmperr = ocfs2_extent_map_get_blocks(inode, block, 1,
+					     &p_blkno, NULL);
+	if (tmperr < 0) {
+		mlog_errno(tmperr);
+		goto fail;
+	}
+
+	tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
+				  readflags, inode);
+	if (tmperr < 0) {
+		mlog_errno(tmperr);
+		goto fail;
+	}
+
+	*err = 0;
+	return bh;
+
+fail:
+	/* brelse() is a no-op on a NULL buffer_head. */
+	brelse(bh);
+	/* Callers only ever see -EIO, whatever the underlying error. */
+	*err = -EIO;
+	return NULL;
+}
+
+/*
+ * This is called from our getattr.
+ *
+ * Returns -ENOENT if the dentry has no inode or the inode is flagged
+ * OCFS2_INODE_DELETED; otherwise returns the result of taking the
+ * meta lock, which is dropped again before returning.
+ */
+int ocfs2_inode_revalidate(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	int status = 0;
+
+	mlog_entry("(inode = 0x%p, ino = %"MLFu64")\n", inode,
+		   inode ? OCFS2_I(inode)->ip_blkno : 0ULL);
+
+	if (!inode) {
+		mlog(0, "eep, no inode!\n");
+		status = -ENOENT;
+		goto bail;
+	}
+
+	/* ip_flags is only sampled under ip_lock. */
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		mlog(0, "inode deleted!\n");
+		status = -ENOENT;
+		goto bail;
+	}
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	/* Let ocfs2_meta_lock do the work of updating our struct
+	 * inode for us. */
+	status = ocfs2_meta_lock(inode, NULL, NULL, 0);
+	if (status < 0) {
+		/* -ENOENT is not worth an error log here. */
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_meta_unlock(inode, 0);
+bail:
+	mlog_exit(status);
+
+	return status;
+}
+
+/*
+ * Updates a disk inode from a
+ * struct inode.
+ * Only takes ip_lock.
+ */
+/*
+ * 'bh' must hold the on-disk inode (struct ocfs2_dinode) for 'inode'.
+ * The buffer is given journal write access on 'handle', filled from
+ * the in-memory inode and then marked dirty in the journal.
+ *
+ * Returns 0 on success or the negative error reported by
+ * ocfs2_journal_access() / ocfs2_journal_dirty().
+ */
+int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
+			   struct inode *inode,
+			   struct buffer_head *bh)
+{
+	int status;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
+
+	mlog_entry("(inode %"MLFu64")\n", OCFS2_I(inode)->ip_blkno);
+
+	status = ocfs2_journal_access(handle, inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* ip_clusters is the only field read under ip_lock. */
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	/* On-disk fields are little endian. */
+	fe->i_size = cpu_to_le64(i_size_read(inode));
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	fe->i_uid = cpu_to_le32(inode->i_uid);
+	fe->i_gid = cpu_to_le32(inode->i_gid);
+	fe->i_mode = cpu_to_le16(inode->i_mode);
+	fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
+	fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
+	fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+
+	/* Hand a journal_dirty failure back to the caller instead of
+	 * reporting success for an update that was never journalled. */
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0)
+		mlog_errno(status);
+
+leave:
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ *
+ * Updates a struct inode from a disk inode.
+ * does no i/o, only takes ip_lock.
+ */
+void ocfs2_refresh_inode(struct inode *inode,
+			 struct ocfs2_dinode *fe)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	u32 clusters = le32_to_cpu(fe->i_clusters);
+
+	/* Everything below is copied while holding ip_lock. */
+	spin_lock(&oi->ip_lock);
+
+	/* size, link count, ownership and mode */
+	oi->ip_clusters = clusters;
+	i_size_write(inode, le64_to_cpu(fe->i_size));
+	inode->i_nlink = le16_to_cpu(fe->i_links_count);
+	inode->i_uid = le32_to_cpu(fe->i_uid);
+	inode->i_gid = le32_to_cpu(fe->i_gid);
+	inode->i_mode = le16_to_cpu(fe->i_mode);
+
+	/* block accounting: a symlink with no clusters allocated
+	 * reports zero blocks, everything else is derived from i_size */
+	inode->i_blksize = (u32) OCFS2_SB(inode->i_sb)->s_clustersize;
+	if (!S_ISLNK(inode->i_mode) || clusters != 0)
+		inode->i_blocks =
+			ocfs2_align_bytes_to_sectors(i_size_read(inode));
+	else
+		inode->i_blocks = 0;
+
+	/* timestamps */
+	inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
+	inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
+	inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
+	inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+	inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
+	inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
+
+	spin_unlock(&oi->ip_lock);
+}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
new file mode 100644
index 00000000000..9b017743365
--- /dev/null
+++ b/fs/ocfs2/inode.h
@@ -0,0 +1,145 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * inode.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_INODE_H +#define OCFS2_INODE_H + +/* OCFS2 Inode Private Data */ +struct ocfs2_inode_info +{ + u64 ip_blkno; + + struct ocfs2_lock_res ip_rw_lockres; + struct ocfs2_lock_res ip_meta_lockres; + struct ocfs2_lock_res ip_data_lockres; + + /* protects allocation changes on this inode. */ + struct rw_semaphore ip_alloc_sem; + + /* These fields are protected by ip_lock */ + spinlock_t ip_lock; + u32 ip_open_count; + u32 ip_clusters; + struct ocfs2_extent_map ip_map; + struct list_head ip_io_markers; + int ip_orphaned_slot; + + struct semaphore ip_io_sem; + + /* Used by the journalling code to attach an inode to a + * handle. These are protected by ip_io_sem in order to lock + * out other I/O to the inode until we either commit or + * abort. */ + struct list_head ip_handle_list; + struct ocfs2_journal_handle *ip_handle; + + u32 ip_flags; /* see below */ + + /* protected by recovery_lock. */ + struct inode *ip_next_orphan; + + u32 ip_dir_start_lookup; + + /* next two are protected by trans_inc_lock */ + /* which transaction were we created on? Zero if none. */ + unsigned long ip_created_trans; + /* last transaction we were a part of. */ + unsigned long ip_last_trans; + + struct ocfs2_caching_info ip_metadata_cache; + + struct inode vfs_inode; +}; + +/* + * Flags for the ip_flags field + */ +/* System file inodes */ +#define OCFS2_INODE_SYSTEM_FILE 0x00000001 +#define OCFS2_INODE_JOURNAL 0x00000002 +#define OCFS2_INODE_BITMAP 0x00000004 +/* This inode has been wiped from disk */ +#define OCFS2_INODE_DELETED 0x00000008 +/* Another node is deleting, so our delete is a nop */ +#define OCFS2_INODE_SKIP_DELETE 0x00000010 +/* Has the inode been orphaned on another node? 
+ * + * This hints to ocfs2_drop_inode that it should clear i_nlink before + * continuing. + * + * We *only* set this on unlink vote from another node. If the inode + * was locally orphaned, then we're sure of the state and don't need + * to twiddle i_nlink later - it's either zero or not depending on + * whether our unlink succeeded. Otherwise we got this from a node + * whose intention was to orphan the inode, however he may have + * crashed, failed etc, so we let ocfs2_drop_inode zero the value and + * rely on ocfs2_delete_inode to sort things out under the proper + * cluster locks. + */ +#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 +/* Does someone have the file open O_DIRECT */ +#define OCFS2_INODE_OPEN_DIRECT 0x00000040 +/* Indicates that the metadata cache should be used as an array. */ +#define OCFS2_INODE_CACHE_INLINE 0x00000080 + +static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) +{ + return container_of(inode, struct ocfs2_inode_info, vfs_inode); +} + +#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL) +#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL) + +extern kmem_cache_t *ocfs2_inode_cache; + +extern struct address_space_operations ocfs2_aops; + +struct buffer_head *ocfs2_bread(struct inode *inode, int block, + int *err, int reada); +void ocfs2_clear_inode(struct inode *inode); +void ocfs2_delete_inode(struct inode *inode); +void ocfs2_drop_inode(struct inode *inode); +struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff); +struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, + u64 blkno, + int delete_vote); +int ocfs2_inode_init_private(struct inode *inode); +int ocfs2_inode_revalidate(struct dentry *dentry); +int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, + int create_ino); +void ocfs2_read_inode(struct inode *inode); +void ocfs2_read_inode2(struct inode *inode, void *opaque); +ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, + size_t 
size, loff_t *offp); +void ocfs2_sync_blockdev(struct super_block *sb); +void ocfs2_refresh_inode(struct inode *inode, + struct ocfs2_dinode *fe); +int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *bh); +int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); +int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); + +#endif /* OCFS2_INODE_H */ diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c new file mode 100644 index 00000000000..04428042e5e --- /dev/null +++ b/fs/ocfs2/journal.c @@ -0,0 +1,1652 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * journal.c + * + * Defines functions of journalling api + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+
+#define MLOG_MASK_PREFIX ML_JOURNAL
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "namei.h"
+#include "slot_map.h"
+#include "super.h"
+#include "vote.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;
+
+static int ocfs2_force_read_journal(struct inode *inode);
+static int ocfs2_recover_node(struct ocfs2_super *osb,
+			      int node_num);
+static int __ocfs2_recovery_thread(void *arg);
+static int ocfs2_commit_cache(struct ocfs2_super *osb);
+static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
+static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
+				       struct ocfs2_journal_handle *handle);
+static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle);
+static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
+				      int dirty);
+static int ocfs2_trylock_journal(struct ocfs2_super *osb,
+				 int slot_num);
+static int ocfs2_recover_orphans(struct ocfs2_super *osb,
+				 int slot);
+static int ocfs2_commit_thread(void *arg);
+
+/*
+ * Flush the journal and advance the transaction id.
+ *
+ * j_trans_barrier is held for write across the flush. If j_num_trans
+ * is zero there is nothing to do and 0 is returned. Otherwise the
+ * journal is flushed with updates locked out; on success the
+ * transaction id is bumped, j_num_trans is reset to zero, the vote
+ * thread is kicked and waiters on j_checkpointed are woken.
+ *
+ * Returns 0 on success or the negative error from journal_flush().
+ */
+static int ocfs2_commit_cache(struct ocfs2_super *osb)
+{
+	int status = 0;
+	unsigned int flushed;
+	struct ocfs2_journal *journal = osb->journal;
+
+	mlog_entry_void();
+
+	/* Flush all pending commits and checkpoint the journal. */
+	down_write(&journal->j_trans_barrier);
+
+	if (atomic_read(&journal->j_num_trans) == 0) {
+		up_write(&journal->j_trans_barrier);
+		mlog(0, "No transactions for me to flush!\n");
+		goto finally;
+	}
+
+	journal_lock_updates(journal->j_journal);
+	status = journal_flush(journal->j_journal);
+	journal_unlock_updates(journal->j_journal);
+	if (status < 0) {
+		up_write(&journal->j_trans_barrier);
+		mlog_errno(status);
+		goto finally;
+	}
+
+	/* Only the increment matters here; the value handed back by
+	 * ocfs2_inc_trans_id() was never used. */
+	ocfs2_inc_trans_id(journal);
+
+	flushed = atomic_read(&journal->j_num_trans);
+	atomic_set(&journal->j_num_trans, 0);
+	up_write(&journal->j_trans_barrier);
+
+	mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
+	     journal->j_trans_id, flushed);
+
+	ocfs2_kick_vote_thread(osb);
+	wake_up(&journal->j_checkpointed);
+finally:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Allocate a zeroed, not-yet-started journal handle tied to
+ * osb->journal. Returns NULL if the allocation fails.
+ */
+struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb)
+{
+	struct ocfs2_journal_handle *retval = NULL;
+
+	/* kcalloc() zeroes the handle, so max_buffs, num_locks and
+	 * k_handle need no explicit reset. */
+	retval = kcalloc(1, sizeof(*retval), GFP_KERNEL);
+	if (!retval) {
+		mlog(ML_ERROR, "Failed to allocate memory for journal "
+		     "handle!\n");
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&retval->locks);
+	INIT_LIST_HEAD(&retval->inode_list);
+	retval->journal = osb->journal;
+
+	return retval;
+}
+
+/* pass it NULL and it will allocate a new handle object for you. If
+ * you pass it a handle however, it may still return error, in which
+ * case it has free'd the passed handle for you.
*/
+/*
+ * On success the returned handle is started (OCFS2_HANDLE_STARTED),
+ * j_trans_barrier is held for read and j_num_trans has been bumped;
+ * ocfs2_commit_trans() drops the barrier again. On failure an
+ * ERR_PTR() is returned: -EROFS for a hard read-only mount or an
+ * aborted journal, -ENOMEM if no handle could be allocated, otherwise
+ * the error from journal_start().
+ */
+struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
+					       struct ocfs2_journal_handle *handle,
+					       int max_buffs)
+{
+	int ret;
+	journal_t *journal;
+
+	mlog_entry("(max_buffs = %d)\n", max_buffs);
+
+	/* Check osb before anything dereferences it. */
+	if (!osb || !osb->journal->j_journal)
+		BUG();
+
+	journal = osb->journal->j_journal;
+
+	if (ocfs2_is_hard_readonly(osb)) {
+		ret = -EROFS;
+		goto done_free;
+	}
+
+	BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
+	BUG_ON(max_buffs <= 0);
+
+	/* JBD might support this, but our journalling code doesn't yet. */
+	if (journal_current_handle()) {
+		mlog(ML_ERROR, "Recursive transaction attempted!\n");
+		BUG();
+	}
+
+	if (!handle)
+		handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		ret = -ENOMEM;
+		mlog(ML_ERROR, "Failed to allocate memory for journal "
+		     "handle!\n");
+		goto done_free;
+	}
+
+	handle->max_buffs = max_buffs;
+
+	down_read(&osb->journal->j_trans_barrier);
+
+	/* actually start the transaction now */
+	handle->k_handle = journal_start(journal, max_buffs);
+	if (IS_ERR(handle->k_handle)) {
+		up_read(&osb->journal->j_trans_barrier);
+
+		ret = PTR_ERR(handle->k_handle);
+		handle->k_handle = NULL;
+		mlog_errno(ret);
+
+		if (is_journal_aborted(journal)) {
+			ocfs2_abort(osb->sb, "Detected aborted journal");
+			ret = -EROFS;
+		}
+		goto done_free;
+	}
+
+	atomic_inc(&(osb->journal->j_num_trans));
+	handle->flags |= OCFS2_HANDLE_STARTED;
+
+	mlog_exit_ptr(handle);
+	return handle;
+
+done_free:
+	if (handle)
+		ocfs2_commit_unstarted_handle(handle); /* will kfree handle */
+
+	mlog_exit(ret);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Attach 'inode' to 'handle'. Takes a reference on the inode and its
+ * i_sem; both are released by ocfs2_handle_unlock_inodes(). The inode
+ * must not already be attached to a handle.
+ */
+void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
+			    struct inode *inode)
+{
+	BUG_ON(!handle);
+	BUG_ON(!inode);
+
+	atomic_inc(&inode->i_count);
+
+	/* we're obviously changing it... */
+	down(&inode->i_sem);
+
+	/* sanity check */
+	BUG_ON(OCFS2_I(inode)->ip_handle);
+	BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list));
+
+	OCFS2_I(inode)->ip_handle = handle;
+	list_del(&(OCFS2_I(inode)->ip_handle_list));
+	list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
+}
+
+/*
+ * Detach every inode on handle->inode_list, undoing
+ * ocfs2_handle_add_inode(): clear ip_handle, unlink the inode from
+ * the list, release i_sem and drop the inode reference.
+ */
+static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle)
+{
+	struct list_head *p, *n;
+	struct inode *inode;
+	struct ocfs2_inode_info *oi;
+
+	list_for_each_safe(p, n, &handle->inode_list) {
+		oi = list_entry(p, struct ocfs2_inode_info,
+				ip_handle_list);
+		inode = &oi->vfs_inode;
+
+		oi->ip_handle = NULL;
+		list_del_init(&oi->ip_handle_list);
+
+		up(&inode->i_sem);
+		iput(inode);
+	}
+}
+
+/* This is trivial so we do it out of the main commit
+ * paths. Beware, it can be called from start_trans too! */
+static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle)
+{
+	mlog_entry_void();
+
+	BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
+
+	ocfs2_handle_unlock_inodes(handle);
+	/* You are allowed to add journal locks before the transaction
+	 * has started. */
+	ocfs2_handle_cleanup_locks(handle->journal, handle);
+
+	kfree(handle);
+
+	mlog_exit_void();
+}
+
+/*
+ * Finish 'handle' and free it. A handle that was never started is
+ * simply torn down. Otherwise the attached inodes are released, the
+ * JBD handle (if still valid) is stopped, the journal locks are
+ * cleaned up and j_trans_barrier is dropped. A journal_stop() failure
+ * is fatal (BUG).
+ */
+void ocfs2_commit_trans(struct ocfs2_journal_handle *handle)
+{
+	handle_t *jbd_handle;
+	int retval;
+	struct ocfs2_journal *journal;
+
+	mlog_entry_void();
+
+	/* Check the handle before reading anything through it. */
+	BUG_ON(!handle);
+
+	journal = handle->journal;
+
+	if (!(handle->flags & OCFS2_HANDLE_STARTED)) {
+		ocfs2_commit_unstarted_handle(handle);
+		mlog_exit_void();
+		return;
+	}
+
+	/* release inode semaphores we took during this transaction */
+	ocfs2_handle_unlock_inodes(handle);
+
+	/* ocfs2_extend_trans may have had to call journal_restart
+	 * which will always commit the transaction, but may return
+	 * error for any number of reasons. If this is the case, we
+	 * clear k_handle as it's not valid any more. */
+	if (handle->k_handle) {
+		jbd_handle = handle->k_handle;
+
+		if (handle->flags & OCFS2_HANDLE_SYNC)
+			jbd_handle->h_sync = 1;
+		else
+			jbd_handle->h_sync = 0;
+
+		/* actually stop the transaction. if we've set h_sync,
+		 * it'll have been committed when we return */
+		retval = journal_stop(jbd_handle);
+		if (retval < 0) {
+			mlog_errno(retval);
+			mlog(ML_ERROR, "Could not commit transaction\n");
+			BUG();
+		}
+
+		handle->k_handle = NULL; /* it's been free'd in journal_stop */
+	}
+
+	ocfs2_handle_cleanup_locks(journal, handle);
+
+	/* Pairs with the down_read() in ocfs2_start_trans(). */
+	up_read(&journal->j_trans_barrier);
+
+	kfree(handle);
+	mlog_exit_void();
+}
+
+/*
+ * 'nblocks' is what you want to add to the current
+ * transaction. extend_trans will either extend the current handle by
+ * nblocks, or commit it and start a new one with nblocks credits.
+ *
+ * WARNING: This will not release any semaphores or disk locks taken
+ * during the transaction, so make sure they were taken *before*
+ * start_trans or we'll have ordering deadlocks.
+ *
+ * WARNING2: Note that we do *not* drop j_trans_barrier here. This is
+ * good because transaction ids haven't yet been recorded on the
+ * cluster locks associated with this handle.
+ */ +int ocfs2_extend_trans(struct ocfs2_journal_handle *handle, + int nblocks) +{ + int status; + + BUG_ON(!handle); + BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED)); + BUG_ON(!nblocks); + + mlog_entry_void(); + + mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); + + status = journal_extend(handle->k_handle, nblocks); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (status > 0) { + mlog(0, "journal_extend failed, trying journal_restart\n"); + status = journal_restart(handle->k_handle, nblocks); + if (status < 0) { + handle->k_handle = NULL; + mlog_errno(status); + goto bail; + } + handle->max_buffs = nblocks; + } else + handle->max_buffs += nblocks; + + status = 0; +bail: + + mlog_exit(status); + return status; +} + +int ocfs2_journal_access(struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *bh, + int type) +{ + int status; + + BUG_ON(!inode); + BUG_ON(!handle); + BUG_ON(!bh); + BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED)); + + mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %hu\n", + (unsigned long long)bh->b_blocknr, type, + (type == OCFS2_JOURNAL_ACCESS_CREATE) ? + "OCFS2_JOURNAL_ACCESS_CREATE" : + "OCFS2_JOURNAL_ACCESS_WRITE", + bh->b_size); + + /* we can safely remove this assertion after testing. */ + if (!buffer_uptodate(bh)) { + mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); + mlog(ML_ERROR, "b_blocknr=%llu\n", + (unsigned long long)bh->b_blocknr); + BUG(); + } + + /* Set the current transaction information on the inode so + * that the locking code knows whether it can drop it's locks + * on this inode or not. We're protected from the commit + * thread updating the current transaction id until + * ocfs2_commit_trans() because ocfs2_start_trans() took + * j_trans_barrier for us. 
*/ + ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode); + + down(&OCFS2_I(inode)->ip_io_sem); + switch (type) { + case OCFS2_JOURNAL_ACCESS_CREATE: + case OCFS2_JOURNAL_ACCESS_WRITE: + status = journal_get_write_access(handle->k_handle, bh); + break; + + case OCFS2_JOURNAL_ACCESS_UNDO: + status = journal_get_undo_access(handle->k_handle, bh); + break; + + default: + status = -EINVAL; + mlog(ML_ERROR, "Uknown access type!\n"); + } + up(&OCFS2_I(inode)->ip_io_sem); + + if (status < 0) + mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", + status, type); + + mlog_exit(status); + return status; +} + +int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle, + struct buffer_head *bh) +{ + int status; + + BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED)); + + mlog_entry("(bh->b_blocknr=%llu)\n", + (unsigned long long)bh->b_blocknr); + + status = journal_dirty_metadata(handle->k_handle, bh); + if (status < 0) + mlog(ML_ERROR, "Could not dirty metadata buffer. " + "(bh->b_blocknr=%llu)\n", + (unsigned long long)bh->b_blocknr); + + mlog_exit(status); + return status; +} + +int ocfs2_journal_dirty_data(handle_t *handle, + struct buffer_head *bh) +{ + int err = journal_dirty_data(handle, bh); + if (err) + mlog_errno(err); + /* TODO: When we can handle it, abort the handle and go RO on + * error here. 
*/ + + return err; +} + +/* We always assume you're adding a metadata lock at level 'ex' */ +int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle, + struct inode *inode) +{ + int status; + struct ocfs2_journal_lock *lock; + + BUG_ON(!inode); + + lock = kmem_cache_alloc(ocfs2_lock_cache, GFP_NOFS); + if (!lock) { + status = -ENOMEM; + mlog_errno(-ENOMEM); + goto bail; + } + + if (!igrab(inode)) + BUG(); + lock->jl_inode = inode; + + list_add_tail(&(lock->jl_lock_list), &(handle->locks)); + handle->num_locks++; + + status = 0; +bail: + mlog_exit(status); + return status; +} + +static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal, + struct ocfs2_journal_handle *handle) +{ + struct list_head *p, *n; + struct ocfs2_journal_lock *lock; + struct inode *inode; + + list_for_each_safe(p, n, &(handle->locks)) { + lock = list_entry(p, struct ocfs2_journal_lock, + jl_lock_list); + list_del(&lock->jl_lock_list); + handle->num_locks--; + + inode = lock->jl_inode; + ocfs2_meta_unlock(inode, 1); + if (atomic_read(&inode->i_count) == 1) + mlog(ML_ERROR, + "Inode %"MLFu64", I'm doing a last iput for!", + OCFS2_I(inode)->ip_blkno); + iput(inode); + kmem_cache_free(ocfs2_lock_cache, lock); + } +} + +#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5) + +void ocfs2_set_journal_params(struct ocfs2_super *osb) +{ + journal_t *journal = osb->journal->j_journal; + + spin_lock(&journal->j_state_lock); + journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; + if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) + journal->j_flags |= JFS_BARRIER; + else + journal->j_flags &= ~JFS_BARRIER; + spin_unlock(&journal->j_state_lock); +} + +int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) +{ + int status = -1; + struct inode *inode = NULL; /* the journal inode */ + journal_t *j_journal = NULL; + struct ocfs2_dinode *di = NULL; + struct buffer_head *bh = NULL; + struct ocfs2_super *osb; + int meta_lock = 0; + + mlog_entry_void(); + + BUG_ON(!journal); + + osb = 
journal->j_osb; + + /* already have the inode for our journal */ + inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, + osb->slot_num); + if (inode == NULL) { + status = -EACCES; + mlog_errno(status); + goto done; + } + if (is_bad_inode(inode)) { + mlog(ML_ERROR, "access error (bad inode)\n"); + iput(inode); + inode = NULL; + status = -EACCES; + goto done; + } + + SET_INODE_JOURNAL(inode); + OCFS2_I(inode)->ip_open_count++; + + status = ocfs2_meta_lock(inode, NULL, &bh, 1); + if (status < 0) { + if (status != -ERESTARTSYS) + mlog(ML_ERROR, "Could not get lock on journal!\n"); + goto done; + } + + meta_lock = 1; + di = (struct ocfs2_dinode *)bh->b_data; + + if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { + mlog(ML_ERROR, "Journal file size (%lld) is too small!\n", + inode->i_size); + status = -EINVAL; + goto done; + } + + mlog(0, "inode->i_size = %lld\n", inode->i_size); + mlog(0, "inode->i_blocks = %lu\n", inode->i_blocks); + mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters); + + /* call the kernels journal init function now */ + j_journal = journal_init_inode(inode); + if (j_journal == NULL) { + mlog(ML_ERROR, "Linux journal layer error\n"); + status = -EINVAL; + goto done; + } + + mlog(0, "Returned from journal_init_inode\n"); + mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen); + + *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & + OCFS2_JOURNAL_DIRTY_FL); + + journal->j_journal = j_journal; + journal->j_inode = inode; + journal->j_bh = bh; + + ocfs2_set_journal_params(osb); + + journal->j_state = OCFS2_JOURNAL_LOADED; + + status = 0; +done: + if (status < 0) { + if (meta_lock) + ocfs2_meta_unlock(inode, 1); + if (bh != NULL) + brelse(bh); + if (inode) { + OCFS2_I(inode)->ip_open_count--; + iput(inode); + } + } + + mlog_exit(status); + return status; +} + +static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, + int dirty) +{ + int status; + unsigned int flags; + struct ocfs2_journal *journal = osb->journal; + struct 
buffer_head *bh = journal->j_bh; + struct ocfs2_dinode *fe; + + mlog_entry_void(); + + fe = (struct ocfs2_dinode *)bh->b_data; + if (!OCFS2_IS_VALID_DINODE(fe)) { + /* This is called from startup/shutdown which will + * handle the errors in a specific manner, so no need + * to call ocfs2_error() here. */ + mlog(ML_ERROR, "Journal dinode %"MLFu64" has invalid " + "signature: %.*s", fe->i_blkno, 7, fe->i_signature); + status = -EIO; + goto out; + } + + flags = le32_to_cpu(fe->id1.journal1.ij_flags); + if (dirty) + flags |= OCFS2_JOURNAL_DIRTY_FL; + else + flags &= ~OCFS2_JOURNAL_DIRTY_FL; + fe->id1.journal1.ij_flags = cpu_to_le32(flags); + + status = ocfs2_write_block(osb, bh, journal->j_inode); + if (status < 0) + mlog_errno(status); + +out: + mlog_exit(status); + return status; +} + +/* + * If the journal has been kmalloc'd it needs to be freed after this + * call. + */ +void ocfs2_journal_shutdown(struct ocfs2_super *osb) +{ + struct ocfs2_journal *journal = NULL; + int status = 0; + struct inode *inode = NULL; + int num_running_trans = 0; + + mlog_entry_void(); + + if (!osb) + BUG(); + + journal = osb->journal; + if (!journal) + goto done; + + inode = journal->j_inode; + + if (journal->j_state != OCFS2_JOURNAL_LOADED) + goto done; + + /* need to inc inode use count as journal_destroy will iput. */ + if (!igrab(inode)) + BUG(); + + num_running_trans = atomic_read(&(osb->journal->j_num_trans)); + if (num_running_trans > 0) + mlog(0, "Shutting down journal: must wait on %d " + "running transactions!\n", + num_running_trans); + + /* Do a commit_cache here. It will flush our journal, *and* + * release any locks that are still held. + * set the SHUTDOWN flag and release the trans lock. + * the commit thread will take the trans lock for us below. */ + journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN; + + /* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not + * drop the trans_lock (which we want to hold until we + * completely destroy the journal. 
*/ + if (osb->commit_task) { + /* Wait for the commit thread */ + mlog(0, "Waiting for ocfs2commit to exit....\n"); + kthread_stop(osb->commit_task); + osb->commit_task = NULL; + } + + BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); + + status = ocfs2_journal_toggle_dirty(osb, 0); + if (status < 0) + mlog_errno(status); + + /* Shutdown the kernel journal system */ + journal_destroy(journal->j_journal); + + OCFS2_I(inode)->ip_open_count--; + + /* unlock our journal */ + ocfs2_meta_unlock(inode, 1); + + brelse(journal->j_bh); + journal->j_bh = NULL; + + journal->j_state = OCFS2_JOURNAL_FREE; + +// up_write(&journal->j_trans_barrier); +done: + if (inode) + iput(inode); + mlog_exit_void(); +} + +static void ocfs2_clear_journal_error(struct super_block *sb, + journal_t *journal, + int slot) +{ + int olderr; + + olderr = journal_errno(journal); + if (olderr) { + mlog(ML_ERROR, "File system error %d recorded in " + "journal %u.\n", olderr, slot); + mlog(ML_ERROR, "File system on device %s needs checking.\n", + sb->s_id); + + journal_ack_err(journal); + journal_clear_err(journal); + } +} + +int ocfs2_journal_load(struct ocfs2_journal *journal) +{ + int status = 0; + struct ocfs2_super *osb; + + mlog_entry_void(); + + if (!journal) + BUG(); + + osb = journal->j_osb; + + status = journal_load(journal->j_journal); + if (status < 0) { + mlog(ML_ERROR, "Failed to load journal!\n"); + goto done; + } + + ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); + + status = ocfs2_journal_toggle_dirty(osb, 1); + if (status < 0) { + mlog_errno(status); + goto done; + } + + /* Launch the commit thread */ + osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt-%d", + osb->osb_id); + if (IS_ERR(osb->commit_task)) { + status = PTR_ERR(osb->commit_task); + osb->commit_task = NULL; + mlog(ML_ERROR, "unable to launch ocfs2commit thread, error=%d", + status); + goto done; + } + +done: + mlog_exit(status); + return status; +} + + +/* 'full' flag tells us 
whether we clear out all blocks or if we just + * mark the journal clean */ +int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) +{ + int status; + + mlog_entry_void(); + + if (!journal) + BUG(); + + status = journal_wipe(journal->j_journal, full); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_toggle_dirty(journal->j_osb, 0); + if (status < 0) + mlog_errno(status); + +bail: + mlog_exit(status); + return status; +} + +/* + * JBD Might read a cached version of another nodes journal file. We + * don't want this as this file changes often and we get no + * notification on those changes. The only way to be sure that we've + * got the most up to date version of those blocks then is to force + * read them off disk. Just searching through the buffer cache won't + * work as there may be pages backing this file which are still marked + * up to date. We know things can't change on this file underneath us + * as we have the lock by now :) + */ +static int ocfs2_force_read_journal(struct inode *inode) +{ + int status = 0; + int i, p_blocks; + u64 v_blkno, p_blkno; +#define CONCURRENT_JOURNAL_FILL 32 + struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; + + mlog_entry_void(); + + BUG_ON(inode->i_blocks != + ocfs2_align_bytes_to_sectors(i_size_read(inode))); + + memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); + + mlog(0, "Force reading %lu blocks\n", + (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))); + + v_blkno = 0; + while (v_blkno < + (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) { + + status = ocfs2_extent_map_get_blocks(inode, v_blkno, + 1, &p_blkno, + &p_blocks); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (p_blocks > CONCURRENT_JOURNAL_FILL) + p_blocks = CONCURRENT_JOURNAL_FILL; + + status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb), + p_blkno, p_blocks, bhs, 0, + inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + for(i = 0; i < p_blocks; 
i++) { + brelse(bhs[i]); + bhs[i] = NULL; + } + + v_blkno += p_blocks; + } + +bail: + for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) + if (bhs[i]) + brelse(bhs[i]); + mlog_exit(status); + return status; +} + +struct ocfs2_la_recovery_item { + struct list_head lri_list; + int lri_slot; + struct ocfs2_dinode *lri_la_dinode; + struct ocfs2_dinode *lri_tl_dinode; +}; + +/* Does the second half of the recovery process. By this point, the + * node is marked clean and can actually be considered recovered, + * hence it's no longer in the recovery map, but there's still some + * cleanup we can do which shouldn't happen within the recovery thread + * as locking in that context becomes very difficult if we are to take + * recovering nodes into account. + * + * NOTE: This function can and will sleep on recovery of other nodes + * during cluster locking, just like any other ocfs2 process. + */ +void ocfs2_complete_recovery(void *data) +{ + int ret; + struct ocfs2_super *osb = data; + struct ocfs2_journal *journal = osb->journal; + struct ocfs2_dinode *la_dinode, *tl_dinode; + struct ocfs2_la_recovery_item *item; + struct list_head *p, *n; + LIST_HEAD(tmp_la_list); + + mlog_entry_void(); + + mlog(0, "completing recovery from keventd\n"); + + spin_lock(&journal->j_lock); + list_splice_init(&journal->j_la_cleanups, &tmp_la_list); + spin_unlock(&journal->j_lock); + + list_for_each_safe(p, n, &tmp_la_list) { + item = list_entry(p, struct ocfs2_la_recovery_item, lri_list); + list_del_init(&item->lri_list); + + mlog(0, "Complete recovery for slot %d\n", item->lri_slot); + + la_dinode = item->lri_la_dinode; + if (la_dinode) { + mlog(0, "Clean up local alloc %"MLFu64"\n", + la_dinode->i_blkno); + + ret = ocfs2_complete_local_alloc_recovery(osb, + la_dinode); + if (ret < 0) + mlog_errno(ret); + + kfree(la_dinode); + } + + tl_dinode = item->lri_tl_dinode; + if (tl_dinode) { + mlog(0, "Clean up truncate log %"MLFu64"\n", + tl_dinode->i_blkno); + + ret = 
ocfs2_complete_truncate_log_recovery(osb, + tl_dinode); + if (ret < 0) + mlog_errno(ret); + + kfree(tl_dinode); + } + + ret = ocfs2_recover_orphans(osb, item->lri_slot); + if (ret < 0) + mlog_errno(ret); + + kfree(item); + } + + mlog(0, "Recovery completion\n"); + mlog_exit_void(); +} + +/* NOTE: This function always eats your references to la_dinode and + * tl_dinode, either manually on error, or by passing them to + * ocfs2_complete_recovery */ +static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, + int slot_num, + struct ocfs2_dinode *la_dinode, + struct ocfs2_dinode *tl_dinode) +{ + struct ocfs2_la_recovery_item *item; + + item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_KERNEL); + if (!item) { + /* Though we wish to avoid it, we are in fact safe in + * skipping local alloc cleanup as fsck.ocfs2 is more + * than capable of reclaiming unused space. */ + if (la_dinode) + kfree(la_dinode); + + if (tl_dinode) + kfree(tl_dinode); + + mlog_errno(-ENOMEM); + return; + } + + INIT_LIST_HEAD(&item->lri_list); + item->lri_la_dinode = la_dinode; + item->lri_slot = slot_num; + item->lri_tl_dinode = tl_dinode; + + spin_lock(&journal->j_lock); + list_add_tail(&item->lri_list, &journal->j_la_cleanups); + queue_work(ocfs2_wq, &journal->j_recovery_work); + spin_unlock(&journal->j_lock); +} + +/* Called by the mount code to queue recovery the last part of + * recovery for it's own slot. */ +void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) +{ + struct ocfs2_journal *journal = osb->journal; + + if (osb->dirty) { + /* No need to queue up our truncate_log as regular + * cleanup will catch that. 
/*
 * Body of the recovery kthread started by ocfs2_recovery_thread().
 *
 * Waits for the mount to settle, then, under the super block cluster
 * lock, recovers every node whose bit is set in osb->recovery_map.
 * Afterwards it queues recovery completion for our own slot and, if
 * more nodes were added to the map in the meantime, starts over.
 *
 * @arg is the struct ocfs2_super of the mounted volume. The thread
 * terminates itself through complete_and_exit(); the trailing return
 * is never reached.
 */
static int __ocfs2_recovery_thread(void *arg)
{
	int status, node_num;
	struct ocfs2_super *osb = arg;

	mlog_entry_void();

	/* Fails (negative status) when the volume ended up disabled
	 * instead of mounted; in that case nothing is recovered. */
	status = ocfs2_wait_on_mount(osb);
	if (status < 0) {
		goto bail;
	}

restart:
	status = ocfs2_super_lock(osb, 1);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
		node_num = ocfs2_node_map_first_set_bit(osb,
							&osb->recovery_map);
		if (node_num == O2NM_INVALID_NODE_NUM) {
			mlog(0, "Out of nodes to recover.\n");
			break;
		}

		status = ocfs2_recover_node(osb, node_num);
		if (status < 0) {
			mlog(ML_ERROR,
			     "Error %d recovering node %d on device (%u,%u)!\n",
			     status, node_num,
			     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
			mlog(ML_ERROR, "Volume requires unmount.\n");
			/* NOTE(review): the node's bit is deliberately
			 * left set, so waiters never see a failed node as
			 * recovered. The loop therefore presumably picks
			 * the same node again on the next pass - confirm
			 * that a persistent failure cannot spin here. */
			continue;
		}

		/* Only a successfully recovered node leaves the map. */
		ocfs2_recovery_map_clear(osb, node_num);
	}
	ocfs2_super_unlock(osb, 1);

	/* We always run recovery on our own orphan dir - the dead
	 * node(s) may have voted "no" on an inode delete earlier. A
	 * revote is therefore required. */
	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
					NULL);

bail:
	/* recovery_lock is the same lock ocfs2_recovery_thread() holds
	 * while adding a node and checking recovery_thread_task, so the
	 * "map empty?" test and the clearing of the task pointer below
	 * cannot interleave with a new request. */
	down(&osb->recovery_lock);
	if (!status &&
	    !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
		up(&osb->recovery_lock);
		goto restart;
	}

	osb->recovery_thread_task = NULL;
	mb(); /* sync with ocfs2_recovery_thread_running */
	wake_up(&osb->recovery_event);

	up(&osb->recovery_lock);

	mlog_exit(status);
	/* no one is calling kthread_stop() for us so the kthread() api
	 * requires that we call do_exit().  And it isn't exported, but
	 * complete_and_exit() seems to be a minimal wrapper around it. */
	complete_and_exit(NULL, status);
	return status;
}
/*
 * Replay the journal that lives in @slot_num on behalf of the dead
 * node @node_num, then mark that journal's inode clean on disk.
 *
 * Nothing is replayed when the journal inode does not carry
 * OCFS2_JOURNAL_DIRTY_FL. @node_num is used for log messages only.
 *
 * Returns a negative status on failure: -EACCES when the journal
 * inode cannot be obtained or is bad, -EIO when journal_init_inode()
 * fails, otherwise the status of the failing call.
 */
static int ocfs2_replay_journal(struct ocfs2_super *osb,
				int node_num,
				int slot_num)
{
	int status;
	int got_lock = 0;
	unsigned int flags;
	struct inode *inode = NULL;
	struct ocfs2_dinode *fe;
	journal_t *journal = NULL;
	struct buffer_head *bh = NULL;

	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
					    slot_num);
	if (inode == NULL) {
		status = -EACCES;
		mlog_errno(status);
		goto done;
	}
	if (is_bad_inode(inode)) {
		status = -EACCES;
		/* Drop the reference here and clear the pointer so the
		 * common exit path does not iput() it a second time. */
		iput(inode);
		inode = NULL;
		mlog_errno(status);
		goto done;
	}
	SET_INODE_JOURNAL(inode);

	/* Lock the dead node's journal; bh receives the buffer holding
	 * its on-disk inode, which is read and rewritten below. */
	status = ocfs2_meta_lock_full(inode, NULL, &bh, 1,
				      OCFS2_META_LOCK_RECOVERY);
	if (status < 0) {
		mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);
		if (status != -ERESTARTSYS)
			mlog(ML_ERROR, "Could not lock journal!\n");
		goto done;
	}
	got_lock = 1;

	fe = (struct ocfs2_dinode *) bh->b_data;

	flags = le32_to_cpu(fe->id1.journal1.ij_flags);

	/* A clean journal needs no replay; status still holds the
	 * non-negative result of the lock call. */
	if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
		mlog(0, "No recovery required for node %d\n", node_num);
		goto done;
	}

	mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
	     node_num, slot_num,
	     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));

	/* Refresh the in-memory cluster count from the disk inode
	 * before the journal file is read. */
	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);

	/* Make sure JBD sees the on-disk blocks, not stale cache. */
	status = ocfs2_force_read_journal(inode);
	if (status < 0) {
		mlog_errno(status);
		goto done;
	}

	mlog(0, "calling journal_init_inode\n");
	journal = journal_init_inode(inode);
	if (journal == NULL) {
		mlog(ML_ERROR, "Linux journal layer error\n");
		status = -EIO;
		goto done;
	}

	status = journal_load(journal);
	if (status < 0) {
		mlog_errno(status);
		/* Take an extra reference before journal_destroy():
		 * presumably it drops one on the journal inode, and the
		 * exit path below still does its own iput() - confirm. */
		if (!igrab(inode))
			BUG();
		journal_destroy(journal);
		goto done;
	}

	ocfs2_clear_journal_error(osb->sb, journal, slot_num);

	/* Flush everything in the journal out to its final location. */
	mlog(0, "flushing the journal.\n");
	journal_lock_updates(journal);
	status = journal_flush(journal);
	journal_unlock_updates(journal);
	/* NOTE(review): a flush failure is only logged; status is
	 * overwritten by the block write below and the dirty flag is
	 * cleared regardless. */
	if (status < 0)
		mlog_errno(status);

	/* This will mark the node clean */
	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
	flags &= ~OCFS2_JOURNAL_DIRTY_FL;
	fe->id1.journal1.ij_flags = cpu_to_le32(flags);

	status = ocfs2_write_block(osb, bh, inode);
	if (status < 0)
		mlog_errno(status);

	/* Same extra reference as on the journal_load() failure path. */
	if (!igrab(inode))
		BUG();

	journal_destroy(journal);

done:
	/* drop the lock on this node's journal */
	if (got_lock)
		ocfs2_meta_unlock(inode, 1);

	if (inode)
		iput(inode);

	if (bh)
		brelse(bh);

	mlog_exit(status);
	return status;
}
*/ + status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy); + if (status < 0) { + mlog_errno(status); + goto done; + } + + /* An error from begin_truncate_log_recovery is not + * serious enough to warrant halting the rest of + * recovery. */ + status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy); + if (status < 0) + mlog_errno(status); + + /* Likewise, this would be a strange but ultimately not so + * harmful place to get an error... */ + ocfs2_clear_slot(si, slot_num); + status = ocfs2_update_disk_slots(osb, si); + if (status < 0) + mlog_errno(status); + + /* This will kfree the memory pointed to by la_copy and tl_copy */ + ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, + tl_copy); + + status = 0; +done: + + mlog_exit(status); + return status; +} + +/* Test node liveness by trylocking his journal. If we get the lock, + * we drop it here. Return 0 if we got the lock, -EAGAIN if node is + * still alive (we couldn't get the lock) and < 0 on error. */ +static int ocfs2_trylock_journal(struct ocfs2_super *osb, + int slot_num) +{ + int status, flags; + struct inode *inode = NULL; + + inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, + slot_num); + if (inode == NULL) { + mlog(ML_ERROR, "access error\n"); + status = -EACCES; + goto bail; + } + if (is_bad_inode(inode)) { + mlog(ML_ERROR, "access error (bad inode)\n"); + iput(inode); + inode = NULL; + status = -EACCES; + goto bail; + } + SET_INODE_JOURNAL(inode); + + flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE; + status = ocfs2_meta_lock_full(inode, NULL, NULL, 1, flags); + if (status < 0) { + if (status != -EAGAIN) + mlog_errno(status); + goto bail; + } + + ocfs2_meta_unlock(inode, 1); +bail: + if (inode) + iput(inode); + + return status; +} + +/* Call this underneath ocfs2_super_lock. It also assumes that the + * slot info struct has been updated from disk. 
*/ +int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) +{ + int status, i, node_num; + struct ocfs2_slot_info *si = osb->slot_info; + + /* This is called with the super block cluster lock, so we + * know that the slot map can't change underneath us. */ + + spin_lock(&si->si_lock); + for(i = 0; i < si->si_num_slots; i++) { + if (i == osb->slot_num) + continue; + if (ocfs2_is_empty_slot(si, i)) + continue; + + node_num = si->si_global_node_nums[i]; + if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num)) + continue; + spin_unlock(&si->si_lock); + + /* Ok, we have a slot occupied by another node which + * is not in the recovery map. We trylock his journal + * file here to test if he's alive. */ + status = ocfs2_trylock_journal(osb, i); + if (!status) { + /* Since we're called from mount, we know that + * the recovery thread can't race us on + * setting / checking the recovery bits. */ + ocfs2_recovery_thread(osb, node_num); + } else if ((status < 0) && (status != -EAGAIN)) { + mlog_errno(status); + goto bail; + } + + spin_lock(&si->si_lock); + } + spin_unlock(&si->si_lock); + + status = 0; +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_recover_orphans(struct ocfs2_super *osb, + int slot) +{ + int status = 0; + int have_disk_lock = 0; + struct inode *inode = NULL; + struct inode *iter; + struct inode *orphan_dir_inode = NULL; + unsigned long offset, blk, local; + struct buffer_head *bh = NULL; + struct ocfs2_dir_entry *de; + struct super_block *sb = osb->sb; + struct ocfs2_inode_info *oi; + + mlog(0, "Recover inodes from orphan dir in slot %d\n", slot); + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + slot); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + goto out; + } + + down(&orphan_dir_inode->i_sem); + status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0); + if (status < 0) { + up(&orphan_dir_inode->i_sem); + mlog_errno(status); + goto out; + } + have_disk_lock = 1; + + 
offset = 0; + iter = NULL; + while(offset < i_size_read(orphan_dir_inode)) { + blk = offset >> sb->s_blocksize_bits; + + bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0); + if (!bh) + status = -EINVAL; + if (status < 0) { + up(&orphan_dir_inode->i_sem); + if (bh) + brelse(bh); + mlog_errno(status); + goto out; + } + + local = 0; + while(offset < i_size_read(orphan_dir_inode) + && local < sb->s_blocksize) { + de = (struct ocfs2_dir_entry *) (bh->b_data + local); + + if (!ocfs2_check_dir_entry(orphan_dir_inode, + de, bh, local)) { + up(&orphan_dir_inode->i_sem); + status = -EINVAL; + mlog_errno(status); + brelse(bh); + goto out; + } + + local += le16_to_cpu(de->rec_len); + offset += le16_to_cpu(de->rec_len); + + /* I guess we silently fail on no inode? */ + if (!le64_to_cpu(de->inode)) + continue; + if (de->file_type > OCFS2_FT_MAX) { + mlog(ML_ERROR, + "block %llu contains invalid de: " + "inode = %"MLFu64", rec_len = %u, " + "name_len = %u, file_type = %u, " + "name='%.*s'\n", + (unsigned long long)bh->b_blocknr, + le64_to_cpu(de->inode), + le16_to_cpu(de->rec_len), + de->name_len, + de->file_type, + de->name_len, + de->name); + continue; + } + if (de->name_len == 1 && !strncmp(".", de->name, 1)) + continue; + if (de->name_len == 2 && !strncmp("..", de->name, 2)) + continue; + + iter = ocfs2_iget(osb, le64_to_cpu(de->inode)); + if (IS_ERR(iter)) + continue; + + mlog(0, "queue orphan %"MLFu64"\n", + OCFS2_I(iter)->ip_blkno); + OCFS2_I(iter)->ip_next_orphan = inode; + inode = iter; + } + brelse(bh); + } + up(&orphan_dir_inode->i_sem); + + ocfs2_meta_unlock(orphan_dir_inode, 0); + have_disk_lock = 0; + + iput(orphan_dir_inode); + orphan_dir_inode = NULL; + + while (inode) { + oi = OCFS2_I(inode); + mlog(0, "iput orphan %"MLFu64"\n", oi->ip_blkno); + + iter = oi->ip_next_orphan; + + spin_lock(&oi->ip_lock); + /* Delete voting may have set these on the assumption + * that the other node would wipe them successfully. 
+ * If they are still in the node's orphan dir, we need + * to reset that state. */ + oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); + + /* Set the proper information to get us going into + * ocfs2_delete_inode. */ + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + oi->ip_orphaned_slot = slot; + spin_unlock(&oi->ip_lock); + + iput(inode); + + inode = iter; + } + +out: + if (have_disk_lock) + ocfs2_meta_unlock(orphan_dir_inode, 0); + + if (orphan_dir_inode) + iput(orphan_dir_inode); + + return status; +} + +static int ocfs2_wait_on_mount(struct ocfs2_super *osb) +{ + /* This check is good because ocfs2 will wait on our recovery + * thread before changing it to something other than MOUNTED + * or DISABLED. */ + wait_event(osb->osb_mount_event, + atomic_read(&osb->vol_state) == VOLUME_MOUNTED || + atomic_read(&osb->vol_state) == VOLUME_DISABLED); + + /* If there's an error on mount, then we may never get to the + * MOUNTED flag, but this is set right before + * dismount_volume() so we can trust it. */ + if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) { + mlog(0, "mount error, exiting!\n"); + return -EBUSY; + } + + return 0; +} + +static int ocfs2_commit_thread(void *arg) +{ + int status; + struct ocfs2_super *osb = arg; + struct ocfs2_journal *journal = osb->journal; + + /* we can trust j_num_trans here because _should_stop() is only set in + * shutdown and nobody other than ourselves should be able to start + * transactions. 
committing on shutdown might take a few iterations + * as final transactions put deleted inodes on the list */ + while (!(kthread_should_stop() && + atomic_read(&journal->j_num_trans) == 0)) { + + wait_event_interruptible_timeout(osb->checkpoint_event, + atomic_read(&journal->j_num_trans) + || kthread_should_stop(), + OCFS2_CHECKPOINT_INTERVAL); + + status = ocfs2_commit_cache(osb); + if (status < 0) + mlog_errno(status); + + if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){ + mlog(ML_KTHREAD, + "commit_thread: %u transactions pending on " + "shutdown\n", + atomic_read(&journal->j_num_trans)); + } + } + + return 0; +} + +/* Look for a dirty journal without taking any cluster locks. Used for + * hard readonly access to determine whether the file system journals + * require recovery. */ +int ocfs2_check_journals_nolocks(struct ocfs2_super *osb) +{ + int ret = 0; + unsigned int slot; + struct buffer_head *di_bh; + struct ocfs2_dinode *di; + struct inode *journal = NULL; + + for(slot = 0; slot < osb->max_slots; slot++) { + journal = ocfs2_get_system_file_inode(osb, + JOURNAL_SYSTEM_INODE, + slot); + if (!journal || is_bad_inode(journal)) { + ret = -EACCES; + mlog_errno(ret); + goto out; + } + + di_bh = NULL; + ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh, + 0, journal); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + di = (struct ocfs2_dinode *) di_bh->b_data; + + if (le32_to_cpu(di->id1.journal1.ij_flags) & + OCFS2_JOURNAL_DIRTY_FL) + ret = -EROFS; + + brelse(di_bh); + if (ret) + break; + } + +out: + if (journal) + iput(journal); + + return ret; +} diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h new file mode 100644 index 00000000000..7d0a816184f --- /dev/null +++ b/fs/ocfs2/journal.h @@ -0,0 +1,457 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * journal.h + * + * Defines journalling api and structures. + * + * Copyright (C) 2003, 2005 Oracle. All rights reserved. 
/* Per-volume journal bookkeeping, reached through osb->journal. */
struct ocfs2_journal {
	enum ocfs2_journal_state   j_state;    /* Journals current state   */

	journal_t                 *j_journal; /* The kernels journal type */
	struct inode              *j_inode;   /* Kernel inode pointing to
					       * this journal             */
	struct ocfs2_super        *j_osb;     /* pointer to the super
					       * block for the node
					       * we're currently
					       * running on -- not
					       * necessarily the super
					       * block from the node
					       * which we usually run
					       * from (recovery,
					       * etc)                     */
	struct buffer_head        *j_bh;      /* Journal disk inode block */
	atomic_t                  j_num_trans; /* Number of transactions
					        * currently in the system. */
	unsigned long             j_trans_id; /* Current transaction id.
					       * Read and advanced under
					       * trans_inc_lock;
					       * ocfs2_inc_trans_id()
					       * never leaves it at
					       * zero.                    */
	struct rw_semaphore       j_trans_barrier; /* NOTE(review): users
						    * are not visible in
						    * this part of the
						    * file.               */
	wait_queue_head_t         j_checkpointed; /* ocfs2_checkpoint_inode()
						   * sleeps here until the
						   * inode is fully
						   * checkpointed.        */

	spinlock_t                j_lock;     /* Held while adding to or
					       * splicing j_la_cleanups.  */
	struct list_head          j_la_cleanups; /* Pending
						  * ocfs2_la_recovery_item
						  * entries, consumed by
						  * ocfs2_complete_recovery(). */
	struct work_struct        j_recovery_work; /* Queued on ocfs2_wq by
						    * ocfs2_queue_recovery_completion(). */
};
*/ +static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j) +{ + unsigned long old_id; + spin_lock(&trans_inc_lock); + old_id = j->j_trans_id++; + if (unlikely(!j->j_trans_id)) + j->j_trans_id = 1; + spin_unlock(&trans_inc_lock); + return old_id; +} + +static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal, + struct inode *inode) +{ + spin_lock(&trans_inc_lock); + OCFS2_I(inode)->ip_last_trans = journal->j_trans_id; + spin_unlock(&trans_inc_lock); +} + +/* Used to figure out whether it's safe to drop a metadata lock on an + * inode. Returns true if all the inodes changes have been + * checkpointed to disk. You should be holding the spinlock on the + * metadata lock while calling this to be sure that nobody can take + * the lock and put it on another transaction. */ +static inline int ocfs2_inode_fully_checkpointed(struct inode *inode) +{ + int ret; + struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal; + + spin_lock(&trans_inc_lock); + ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans); + spin_unlock(&trans_inc_lock); + return ret; +} + +/* convenience function to check if an inode is still new (has never + * hit disk) Will do you a favor and set created_trans = 0 when you've + * been checkpointed. returns '1' if the inode is still new. */ +static inline int ocfs2_inode_is_new(struct inode *inode) +{ + int ret; + + /* System files are never "new" as they're written out by + * mkfs. This helps us early during mount, before we have the + * journal open and j_trans_id could be junk. 
*/ + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) + return 0; + spin_lock(&trans_inc_lock); + ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id, + OCFS2_I(inode)->ip_created_trans)); + if (!ret) + OCFS2_I(inode)->ip_created_trans = 0; + spin_unlock(&trans_inc_lock); + return ret; +} + +static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, + struct inode *inode) +{ + spin_lock(&trans_inc_lock); + OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id; + spin_unlock(&trans_inc_lock); +} + +extern kmem_cache_t *ocfs2_lock_cache; + +struct ocfs2_journal_lock { + struct inode *jl_inode; + struct list_head jl_lock_list; +}; + +struct ocfs2_journal_handle { + handle_t *k_handle; /* kernel handle. */ + struct ocfs2_journal *journal; + u32 flags; /* see flags below. */ + int max_buffs; /* Buffs reserved by this handle */ + + /* The following two fields are for ocfs2_handle_add_lock */ + int num_locks; + struct list_head locks; /* A bunch of locks to + * release on commit. This + * should be a list_head */ + + struct list_head inode_list; +}; + +#define OCFS2_HANDLE_STARTED 1 +/* should we sync-commit this handle? */ +#define OCFS2_HANDLE_SYNC 2 +static inline int ocfs2_handle_started(struct ocfs2_journal_handle *handle) +{ + return handle->flags & OCFS2_HANDLE_STARTED; +} + +static inline void ocfs2_handle_set_sync(struct ocfs2_journal_handle *handle, int sync) +{ + if (sync) + handle->flags |= OCFS2_HANDLE_SYNC; + else + handle->flags &= ~OCFS2_HANDLE_SYNC; +} + +/* Exported only for the journal struct init code in super.c. Do not call. */ +void ocfs2_complete_recovery(void *data); + +/* + * Journal Control: + * Initialize, Load, Shutdown, Wipe a journal. + * + * ocfs2_journal_init - Initialize journal structures in the OSB. + * ocfs2_journal_load - Load the given journal off disk. Replay it if + * there's transactions still in there. 
+ * ocfs2_journal_shutdown - Shutdown a journal, this will flush all + * uncommitted, uncheckpointed transactions. + * ocfs2_journal_wipe - Wipe transactions from a journal. Optionally + * zero out each block. + * ocfs2_recovery_thread - Perform recovery on a node. osb is our own osb. + * ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat + * event on. + * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint. + */ +void ocfs2_set_journal_params(struct ocfs2_super *osb); +int ocfs2_journal_init(struct ocfs2_journal *journal, + int *dirty); +void ocfs2_journal_shutdown(struct ocfs2_super *osb); +int ocfs2_journal_wipe(struct ocfs2_journal *journal, + int full); +int ocfs2_journal_load(struct ocfs2_journal *journal); +int ocfs2_check_journals_nolocks(struct ocfs2_super *osb); +void ocfs2_recovery_thread(struct ocfs2_super *osb, + int node_num); +int ocfs2_mark_dead_nodes(struct ocfs2_super *osb); +void ocfs2_complete_mount_recovery(struct ocfs2_super *osb); + +static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) +{ + atomic_set(&osb->needs_checkpoint, 1); + wake_up(&osb->checkpoint_event); +} + +static inline void ocfs2_checkpoint_inode(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!ocfs2_inode_fully_checkpointed(inode)) { + /* WARNING: This only kicks off a single + * checkpoint. If someone races you and adds more + * metadata to the journal, you won't know, and will + * wind up waiting *alot* longer than necessary. Right + * now we only use this in clear_inode so that's + * OK. */ + ocfs2_start_checkpoint(osb); + + wait_event(osb->journal->j_checkpointed, + ocfs2_inode_fully_checkpointed(inode)); + } +} + +/* + * Transaction Handling: + * Manage the lifetime of a transaction handle. + * + * ocfs2_alloc_handle - Only allocate a handle so we can start putting + * cluster locks on it. 
To actually change blocks, + * call ocfs2_start_trans with the handle returned + * from this function. You may call ocfs2_commit_trans + * at any time in the lifetime of a handle. + * ocfs2_start_trans - Begin a transaction. Give it an upper estimate of + * the number of blocks that will be changed during + * this handle. + * ocfs2_commit_trans - Complete a handle. + * ocfs2_extend_trans - Extend a handle by nblocks credits. This may + * commit the handle to disk in the process, but will + * not release any locks taken during the transaction. + * ocfs2_journal_access - Notify the handle that we want to journal this + * buffer. Will have to call ocfs2_journal_dirty once + * we've actually dirtied it. Type is one of . or . + * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. + * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before + * the current handle commits. + * ocfs2_handle_add_lock - Sometimes we need to delay lock release + * until after a transaction has been completed. Use + * ocfs2_handle_add_lock to indicate that a lock needs + * to be released at the end of that handle. Locks + * will be released in the order that they are added. + * ocfs2_handle_add_inode - Add a locked inode to a transaction. + */ + +/* You must always start_trans with a number of buffs > 0, but it's + * perfectly legal to go through an entire transaction without having + * dirtied any buffers. */ +struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb); +struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + int max_buffs); +void ocfs2_commit_trans(struct ocfs2_journal_handle *handle); +int ocfs2_extend_trans(struct ocfs2_journal_handle *handle, + int nblocks); + +/* + * Create access is for when we get a newly created buffer and we're + * not gonna read it off disk, but rather fill it ourselves. 
Right + * now, we don't do anything special with this (it turns into a write + * request), but this is a good placeholder in case we do... + * + * Write access is for when we read a block off disk and are going to + * modify it. This way the journalling layer knows it may need to make + * a copy of that block (if it's part of another, uncommitted + * transaction) before we do so. + */ +#define OCFS2_JOURNAL_ACCESS_CREATE 0 +#define OCFS2_JOURNAL_ACCESS_WRITE 1 +#define OCFS2_JOURNAL_ACCESS_UNDO 2 + +int ocfs2_journal_access(struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *bh, + int type); +/* + * A word about the journal_access/journal_dirty "dance". It is + * entirely legal to journal_access a buffer more than once (as long + * as the access type is the same -- I'm not sure what will happen if + * access type is different but this should never happen anyway) It is + * also legal to journal_dirty a buffer more than once. In fact, you + * can even journal_access a buffer after you've done a + * journal_access/journal_dirty pair. The only thing you cannot do + * however, is journal_dirty a buffer which you haven't yet passed to + * journal_access at least once. + * + * That said, 99% of the time this doesn't matter and this is what the + * path looks like: + * + * <read a bh> + * ocfs2_journal_access(handle, bh, OCFS2_JOURNAL_ACCESS_WRITE); + * <modify the bh> + * ocfs2_journal_dirty(handle, bh); + */ +int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle, + struct buffer_head *bh); +int ocfs2_journal_dirty_data(handle_t *handle, + struct buffer_head *bh); +int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle, + struct inode *inode); +/* + * Use this to protect from other processes reading buffer state while + * it's in flight. + */ +void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle, + struct inode *inode); + +/* + * Credit Macros: + * Convenience macros to calculate number of credits needed. 
+ * + * For convenience sake, I have a set of macros here which calculate + * the *maximum* number of sectors which will be changed for various + * metadata updates. + */ + +/* simple file updates like chmod, etc. */ +#define OCFS2_INODE_UPDATE_CREDITS 1 + +/* get one bit out of a suballocator: dinode + group descriptor + + * prev. group desc. if we relink. */ +#define OCFS2_SUBALLOC_ALLOC (3) + +/* dinode + group descriptor update. We don't relink on free yet. */ +#define OCFS2_SUBALLOC_FREE (2) + +#define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS +#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ + + OCFS2_TRUNCATE_LOG_UPDATE) + +/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + + * bitmap block for the new bit) */ +#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) + +/* parent fe, parent block, new file entry, inode alloc fe, inode alloc + * group descriptor + mkdir/symlink blocks */ +#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \ + + OCFS2_DIR_LINK_ADDITIONAL_CREDITS) + +/* local alloc metadata change + main bitmap updates */ +#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \ + + OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE) + +/* used when we don't need an allocation change for a dir extend. One + * for the dinode, one for the new block. 
*/ +#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) + +/* file update (nlink, etc) + dir entry block */ +#define OCFS2_LINK_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* inode + dir inode (if we unlink a dir), + dir entry block + orphan + * dir inode link */ +#define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \ + + OCFS2_LINK_CREDITS) + +/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + + * inode alloc group descriptor */ +#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1) + +/* dinode update, old dir dinode update, new dir dinode update, old + * dir dir entry, new dir dir entry, dir entry update for renaming + * directory + target unlink */ +#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \ + + OCFS2_UNLINK_CREDITS) +/* + * Worst-case credit count for extending the extent tree rooted at + * fe: one suballocator allocation, the bitmap updates for any new + * metadata blocks (as reported by ocfs2_extend_meta_needed), and the + * dinode plus the blocks along the edge of the tree. Neither sb nor + * bits_wanted is consulted by the calculation. + */ +static inline int ocfs2_calc_extend_credits(struct super_block *sb, + struct ocfs2_dinode *fe, + u32 bits_wanted) +{ + int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks; + + /* bitmap dinode, group desc. + relinked group. */ + bitmap_blocks = OCFS2_SUBALLOC_ALLOC; + + /* we might need to shift tree depth so lets assume an + * absolute worst case of complete fragmentation. Even with + * that, we only need one update for the dinode, and then + * however many metadata chunks needed * a remaining suballoc + * alloc. */ + sysfile_bitmap_blocks = 1 + + (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe); + + /* this does not include *new* metadata blocks, which are + * accounted for in sysfile_bitmap_blocks. fe + + * prev. last_eb_blk + blocks along edge of tree. + * calc_symlink_credits passes because we just need 1 + * credit for the dinode there. 
*/ + dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth); + + return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks; +} +/* + * Credits for creating a symlink: everything a mknod needs plus one + * credit per block of a single cluster. + */ +static inline int ocfs2_calc_symlink_credits(struct super_block *sb) +{ + int blocks = OCFS2_MKNOD_CREDITS; + + /* links can be longer than one block so we may update many + * within our single allocated extent. */ + blocks += ocfs2_clusters_to_blocks(sb, 1); + + return blocks; +} +/* + * Credits for allocating a new block group. The result is a + * constant; neither sb nor cpg is consulted. + */ +static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb, + unsigned int cpg) +{ + int blocks; + int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1; + /* parent inode update + new block group header + bitmap inode update + + bitmap blocks affected */ + blocks = 1 + 1 + 1 + bitmap_blocks; + return blocks; +} +/* + * Worst-case credits for one truncate pass over the tree rooted at + * fe, where last_el is the extent list the clusters are removed + * from. Extra credits are added when the only record in use in + * last_el covers exactly clusters_to_del clusters and the tree has + * depth, since metadata blocks may then be deleted. BUG()s if + * last_el->l_next_free_rec is zero. sb is not consulted. + */ +static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, + unsigned int clusters_to_del, + struct ocfs2_dinode *fe, + struct ocfs2_extent_list *last_el) +{ + /* for dinode + all headers in this pass + update to next leaf */ + u16 next_free = le16_to_cpu(last_el->l_next_free_rec); + u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth); + int credits = 1 + tree_depth + 1; + int i; + + i = next_free - 1; + BUG_ON(i < 0); + + /* We may be deleting metadata blocks, so metadata alloc dinode + + one desc. block for each possible delete. */ + if (tree_depth && next_free == 1 && + le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del) + credits += 1 + tree_depth; + + /* update to the truncate log. */ + credits += OCFS2_TRUNCATE_LOG_UPDATE; + + return credits; +} + +#endif /* OCFS2_JOURNAL_H */ diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c new file mode 100644 index 00000000000..fe373a2101d --- /dev/null +++ b/fs/ocfs2/localalloc.c @@ -0,0 +1,983 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * localalloc.c + * + * Node local data allocation + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/bitops.h> + +#define MLOG_MASK_PREFIX ML_DISK_ALLOC +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "super.h" +#include "sysfile.h" + +#include "buffer_head_io.h" + +#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) + +static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb); + +static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); + +static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc, + u32 numbits); + +static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc); + +static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_dinode *alloc, + struct inode *main_bm_inode, + struct buffer_head *main_bm_bh); + +static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context **ac, + struct inode **bitmap_inode, + struct buffer_head **bitmap_bh); + +static int 
ocfs2_local_alloc_new_window(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac); + +static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, + struct inode *local_alloc_inode); + +/* + * Determine how large our local alloc window should be, in bits. + * + * These values (and the behavior in ocfs2_alloc_should_use_local) have + * been chosen so that most allocations, including new block groups go + * through local alloc. + * + * The window is 2048 bits (one bit per cluster) for 4K clusters and + * halves each time the cluster size doubles. Cluster sizes below 4K + * (s_clustersize_bits < 12) trip the BUG_ON. + */ +static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) +{ + BUG_ON(osb->s_clustersize_bits < 12); + + return 2048 >> (osb->s_clustersize_bits - 12); +} + +/* + * Tell us whether a given allocation should use the local alloc + * file. Otherwise, it has to go to the main bitmap. + * + * Returns 1 when the local alloc is enabled and bits is no more than + * half a window, 0 otherwise. + */ +int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) +{ + int la_bits = ocfs2_local_alloc_window_bits(osb); + + if (osb->local_alloc_state != OCFS2_LA_ENABLED) + return 0; + + /* la_bits should be at least twice the size (in clusters) of + * a new block group. We want to be sure block group + * allocations go through the local alloc, so allow an + * allocation to take up to half the bitmap. 
*/ + if (bits > (la_bits / 2)) + return 0; + + return 1; +} + +int ocfs2_load_local_alloc(struct ocfs2_super *osb) +{ + int status = 0; + struct ocfs2_dinode *alloc = NULL; + struct buffer_head *alloc_bh = NULL; + u32 num_used; + struct inode *inode = NULL; + struct ocfs2_local_alloc *la; + + mlog_entry_void(); + + /* read the alloc off disk */ + inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, + &alloc_bh, 0, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + alloc = (struct ocfs2_dinode *) alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + if (!(le32_to_cpu(alloc->i_flags) & + (OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) { + mlog(ML_ERROR, "Invalid local alloc inode, %"MLFu64"\n", + OCFS2_I(inode)->ip_blkno); + status = -EINVAL; + goto bail; + } + + if ((la->la_size == 0) || + (le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) { + mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n", + le16_to_cpu(la->la_size)); + status = -EINVAL; + goto bail; + } + + /* do a little verification. */ + num_used = ocfs2_local_alloc_count_bits(alloc); + + /* hopefully the local alloc has always been recovered before + * we load it. */ + if (num_used + || alloc->id1.bitmap1.i_used + || alloc->id1.bitmap1.i_total + || la->la_bm_off) + mlog(ML_ERROR, "Local alloc hasn't been recovered!\n" + "found = %u, set = %u, taken = %u, off = %u\n", + num_used, le32_to_cpu(alloc->id1.bitmap1.i_used), + le32_to_cpu(alloc->id1.bitmap1.i_total), + OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); + + osb->local_alloc_bh = alloc_bh; + osb->local_alloc_state = OCFS2_LA_ENABLED; + +bail: + if (status < 0) + if (alloc_bh) + brelse(alloc_bh); + if (inode) + iput(inode); + + mlog_exit(status); + return status; +} + +/* + * return any unused bits to the bitmap and write out a clean + * local_alloc. 
+ * + * The local alloc buffer cached at osb->local_alloc_bh is released + * and the pointer cleared once the cleared copy has been journaled; + * on an earlier failure it is left in place. Does nothing when the + * local alloc state is OCFS2_LA_UNUSED. Errors are logged but not + * returned. + */ +void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) +{ + int status = 0; + struct ocfs2_journal_handle *handle = NULL; + struct inode *local_alloc_inode = NULL; + struct buffer_head *bh = NULL; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct ocfs2_dinode *alloc_copy = NULL; + struct ocfs2_dinode *alloc = NULL; + + mlog_entry_void(); + + if (osb->local_alloc_state == OCFS2_LA_UNUSED) + goto bail; + + local_alloc_inode = + ocfs2_get_system_file_inode(osb, + LOCAL_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!local_alloc_inode) { + status = -ENOENT; + mlog_errno(status); + goto bail; + } + + /* from here on ocfs2_alloc_should_use_local() returns 0 */ + osb->local_alloc_state = OCFS2_LA_DISABLED; + + handle = ocfs2_alloc_handle(osb); + if (!handle) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + ocfs2_handle_add_inode(handle, main_bm_inode); + status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* WINDOW_MOVE_CREDITS is a bit heavy... 
*/ + handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + bh = osb->local_alloc_bh; + alloc = (struct ocfs2_dinode *) bh->b_data; + + /* keep a copy of the window so we still know which bits to + * free after the live local alloc has been cleared. */ + alloc_copy = kmalloc(bh->b_size, GFP_KERNEL); + if (!alloc_copy) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + memcpy(alloc_copy, alloc, bh->b_size); + + status = ocfs2_journal_access(handle, local_alloc_inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_clear_local_alloc(alloc); + + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + brelse(bh); + osb->local_alloc_bh = NULL; + osb->local_alloc_state = OCFS2_LA_UNUSED; + + /* hand the unclaimed bits recorded in the copy back to the + * main bitmap. */ + status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, + main_bm_inode, main_bm_bh); + if (status < 0) + mlog_errno(status); + +bail: + if (handle) + ocfs2_commit_trans(handle); + + if (main_bm_bh) + brelse(main_bm_bh); + + if (main_bm_inode) + iput(main_bm_inode); + + if (local_alloc_inode) + iput(local_alloc_inode); + + /* kfree(NULL) is a no-op */ + kfree(alloc_copy); + + mlog_exit_void(); +} + +/* + * We want to free the bitmap bits outside of any recovery context as + * we'll need a cluster lock to do so, but we must clear the local + * alloc before giving up the recovered nodes journal. 
To solve this, + * we kmalloc a copy of the local alloc before it's change for the + * caller to process with ocfs2_complete_local_alloc_recovery + */ +int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **alloc_copy) +{ + int status = 0; + struct buffer_head *alloc_bh = NULL; + struct inode *inode = NULL; + struct ocfs2_dinode *alloc; + + mlog_entry("(slot_num = %d)\n", slot_num); + + *alloc_copy = NULL; + + inode = ocfs2_get_system_file_inode(osb, + LOCAL_ALLOC_SYSTEM_INODE, + slot_num); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + down(&inode->i_sem); + + status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, + &alloc_bh, 0, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL); + if (!(*alloc_copy)) { + status = -ENOMEM; + goto bail; + } + memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size); + + alloc = (struct ocfs2_dinode *) alloc_bh->b_data; + ocfs2_clear_local_alloc(alloc); + + status = ocfs2_write_block(osb, alloc_bh, inode); + if (status < 0) + mlog_errno(status); + +bail: + if ((status < 0) && (*alloc_copy)) { + kfree(*alloc_copy); + *alloc_copy = NULL; + } + + if (alloc_bh) + brelse(alloc_bh); + + if (inode) { + up(&inode->i_sem); + iput(inode); + } + + mlog_exit(status); + return status; +} + +/* + * Step 2: By now, we've completed the journal recovery, we've stamped + * a clean local alloc on disk and dropped the node out of the + * recovery map. Dlm locks will no longer stall, so lets clear out the + * main bitmap. 
+ */ +int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc) +{ + int status; + struct ocfs2_journal_handle *handle = NULL; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + + mlog_entry_void(); + /* alloc is only read here (it is handed to + * ocfs2_sync_local_to_main) and is not freed by this function. */ + handle = ocfs2_alloc_handle(osb); + if (!handle) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + /* take the meta lock on the global bitmap inode and get its + * buffer in main_bm_bh before the transaction is started. */ + ocfs2_handle_add_inode(handle, main_bm_inode); + status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + /* we want the bitmap change to be recorded on disk asap */ + ocfs2_handle_set_sync(handle, 1); + + status = ocfs2_sync_local_to_main(osb, handle, alloc, + main_bm_inode, main_bm_bh); + if (status < 0) + mlog_errno(status); + +bail: + if (handle) + ocfs2_commit_trans(handle); + + if (main_bm_bh) + brelse(main_bm_bh); + + if (main_bm_inode) + iput(main_bm_inode); + + mlog_exit(status); + return status; +} + +/* + * make sure we've got at least bits_wanted contiguous bits in the + * local alloc. You lose them when you drop i_sem. + * + * We will add ourselves to the transaction passed in, but may start + * our own in order to shift windows. 
+ */ +int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, + struct ocfs2_journal_handle *passed_handle, + u32 bits_wanted, + struct ocfs2_alloc_context *ac) +{ + int status; + struct ocfs2_dinode *alloc; + struct inode *local_alloc_inode; + unsigned int free_bits; + + mlog_entry_void(); + + BUG_ON(!passed_handle); + BUG_ON(!ac); + BUG_ON(passed_handle->flags & OCFS2_HANDLE_STARTED); + + local_alloc_inode = + ocfs2_get_system_file_inode(osb, + LOCAL_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!local_alloc_inode) { + status = -ENOENT; + mlog_errno(status); + goto bail; + } + ocfs2_handle_add_inode(passed_handle, local_alloc_inode); + + if (osb->local_alloc_state != OCFS2_LA_ENABLED) { + status = -ENOSPC; + goto bail; + } + + if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) { + mlog(0, "Asking for more than my max window size!\n"); + status = -ENOSPC; + goto bail; + } + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + + if (le32_to_cpu(alloc->id1.bitmap1.i_used) != + ocfs2_local_alloc_count_bits(alloc)) { + ocfs2_error(osb->sb, "local alloc inode %"MLFu64" says it has " + "%u free bits, but a count shows %u", + le64_to_cpu(alloc->i_blkno), + le32_to_cpu(alloc->id1.bitmap1.i_used), + ocfs2_local_alloc_count_bits(alloc)); + status = -EIO; + goto bail; + } + + free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) - + le32_to_cpu(alloc->id1.bitmap1.i_used); + if (bits_wanted > free_bits) { + /* uhoh, window change time. 
*/ + status = + ocfs2_local_alloc_slide_window(osb, local_alloc_inode); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + } + + ac->ac_inode = igrab(local_alloc_inode); + get_bh(osb->local_alloc_bh); + ac->ac_bh = osb->local_alloc_bh; + ac->ac_which = OCFS2_AC_USE_LOCAL; + status = 0; +bail: + if (local_alloc_inode) + iput(local_alloc_inode); + + mlog_exit(status); + return status; +} + +int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u32 min_bits, + u32 *bit_off, + u32 *num_bits) +{ + int status, start; + struct inode *local_alloc_inode; + u32 bits_wanted; + void *bitmap; + struct ocfs2_dinode *alloc; + struct ocfs2_local_alloc *la; + + mlog_entry_void(); + BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); + + bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; + local_alloc_inode = ac->ac_inode; + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); + if (start == -1) { + /* TODO: Shouldn't we just BUG here? */ + status = -ENOSPC; + mlog_errno(status); + goto bail; + } + + bitmap = la->la_bitmap; + *bit_off = le32_to_cpu(la->la_bm_off) + start; + /* local alloc is always contiguous by nature -- we never + * delete bits from it! 
*/ + *num_bits = bits_wanted; + + status = ocfs2_journal_access(handle, local_alloc_inode, + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + /* mark the run as taken and bump i_used by the same amount */ + while(bits_wanted--) + ocfs2_set_bit(start++, bitmap); + + alloc->id1.bitmap1.i_used = cpu_to_le32(*num_bits + + le32_to_cpu(alloc->id1.bitmap1.i_used)); + + status = ocfs2_journal_dirty(handle, osb->local_alloc_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + mlog_exit(status); + return status; +} +/* Count the bits set in the first la_size bytes of the local alloc + * bitmap. */ +static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) +{ + int i; + u8 *buffer; + u32 count = 0; + struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); + + mlog_entry_void(); + + buffer = la->la_bitmap; + for (i = 0; i < le16_to_cpu(la->la_size); i++) + count += hweight8(buffer[i]); + + mlog_exit(count); + return count; +} +/* + * Look for numbits contiguous clear bits within the first i_total + * bits of the local alloc bitmap. Returns the offset of the first + * bit of the run, or -1 when the window is empty or holds no such + * run. osb is not consulted; lastzero is assigned but never read. + */ +static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc, + u32 numbits) +{ + int numfound, bitoff, left, startoff, lastzero; + void *bitmap = NULL; + + mlog_entry("(numbits wanted = %u)\n", numbits); + + if (!alloc->id1.bitmap1.i_total) { + mlog(0, "No bits in my window!\n"); + bitoff = -1; + goto bail; + } + + bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; + /* startoff is the next bit to examine, numfound the length of + * the run of clear bits that ends just before startoff. */ + numfound = bitoff = startoff = 0; + lastzero = -1; + left = le32_to_cpu(alloc->id1.bitmap1.i_total); + while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) { + if (bitoff == left) { + /* mlog(0, "bitoff (%d) == left", bitoff); */ + break; + } + /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, " + "numfound = %d\n", bitoff, startoff, numfound);*/ + + /* Ok, we found a zero bit... is it contig. 
or do we + * start over?*/ + if (bitoff == startoff) { + /* we found a zero */ + numfound++; + startoff++; + } else { + /* got a zero after some ones */ + numfound = 1; + startoff = bitoff+1; + } + /* we got everything we needed */ + if (numfound == numbits) { + /* mlog(0, "Found it all!\n"); */ + break; + } + } + + mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff, + numfound); + /* startoff sits one past the end of the run, so step back */ + if (numfound == numbits) + bitoff = startoff - numfound; + else + bitoff = -1; + +bail: + mlog_exit(bitoff); + return bitoff; +} +/* + * Reset a local alloc to the empty state: zero i_total, i_used and + * la_bm_off and clear the first la_size bytes of the bitmap. Only + * the in-memory structure is touched; the callers in this file + * journal or write out the buffer themselves. + */ +static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc) +{ + struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); + int i; + mlog_entry_void(); + + alloc->id1.bitmap1.i_total = 0; + alloc->id1.bitmap1.i_used = 0; + la->la_bm_off = 0; + for(i = 0; i < le16_to_cpu(la->la_size); i++) + la->la_bitmap[i] = 0; + + mlog_exit_void(); +} + +#if 0 +/* turn this on and uncomment below to aid debugging window shifts. + * BUG()s if any of the count bits starting at start is set. */ +static void ocfs2_verify_zero_bits(unsigned long *bitmap, + unsigned int start, + unsigned int count) +{ + unsigned int tmp = count; + while(tmp--) { + if (ocfs2_test_bit(start + tmp, bitmap)) { + printk("ocfs2_verify_zero_bits: start = %u, count = " + "%u\n", start, count); + printk("ocfs2_verify_zero_bits: bit %u is set!", + start + tmp); + BUG(); + } + } +} +#endif + +/* + * sync the local alloc to main bitmap. + * + * assumes you've already locked the main bitmap -- the bitmap inode + * passed is used for caching. 
+ */ +static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_dinode *alloc, + struct inode *main_bm_inode, + struct buffer_head *main_bm_bh) +{ + int status = 0; + int bit_off, left, count, start; + u64 la_start_blk; + u64 blkno; + void *bitmap; + struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); + + mlog_entry("total = %u, COUNT = %u, used = %u\n", + le32_to_cpu(alloc->id1.bitmap1.i_total), + ocfs2_local_alloc_count_bits(alloc), + le32_to_cpu(alloc->id1.bitmap1.i_used)); + + if (!alloc->id1.bitmap1.i_total) { + mlog(0, "nothing to sync!\n"); + goto bail; + } + + if (le32_to_cpu(alloc->id1.bitmap1.i_used) == + le32_to_cpu(alloc->id1.bitmap1.i_total)) { + mlog(0, "all bits were taken!\n"); + goto bail; + } + /* walk the first i_total bits of the window, collecting each run + * of clear (never handed out) bits and freeing it in the main + * bitmap. count is the length of the current run and start the + * next bit to examine. */ + la_start_blk = ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(la->la_bm_off)); + bitmap = la->la_bitmap; + start = count = bit_off = 0; + left = le32_to_cpu(alloc->id1.bitmap1.i_total); + + while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) + != -1) { + if ((bit_off < left) && (bit_off == start)) { /* run continues */ + count++; + start++; + continue; + } + if (count) { /* a run just ended at start: free it */ + blkno = la_start_blk + + ocfs2_clusters_to_blocks(osb->sb, + start - count); + + mlog(0, "freeing %u bits starting at local " + "alloc bit %u (la_start_blk = %"MLFu64", " + "blkno = %"MLFu64")\n", count, start - count, + la_start_blk, blkno); + + status = ocfs2_free_clusters(handle, main_bm_inode, + main_bm_bh, blkno, count); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + if (bit_off >= left) /* no clear bits left in the window */ + break; + count = 1; /* bit_off opens a new run */ + start = bit_off + 1; + } + +bail: + mlog_exit(status); + return status; +} +/* + * Allocate an alloc context asking for one full window of bits and + * reserve them from the cluster bitmap. On success *ac is the new + * context and *bitmap_inode / *bitmap_bh are the context's inode and + * buffer with an extra reference taken on each; on failure the + * context is freed and *ac is NULL. + */ +static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context **ac, + struct inode **bitmap_inode, + struct buffer_head **bitmap_bh) +{ + int status; + + *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = 
-ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_handle = handle; + (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb); + + status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + *bitmap_inode = (*ac)->ac_inode; + igrab(*bitmap_inode); + *bitmap_bh = (*ac)->ac_bh; + get_bh(*bitmap_bh); + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + mlog_exit(status); + return status; +} + +/* + * pass it the bitmap lock in lock_bh if you have it. + */ +static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac) +{ + int status = 0; + u32 cluster_off, cluster_count; + struct ocfs2_dinode *alloc = NULL; + struct ocfs2_local_alloc *la; + + mlog_entry_void(); + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + if (alloc->id1.bitmap1.i_total) + mlog(0, "asking me to alloc a new window over a non-empty " + "one\n"); + + mlog(0, "Allocating %u clusters for a new window.\n", + ocfs2_local_alloc_window_bits(osb)); + /* we used the generic suballoc reserve function, but we set + * everything up nicely, so there's no reason why we can't use + * the more specific cluster api to claim bits. */ + status = ocfs2_claim_clusters(osb, handle, ac, + ocfs2_local_alloc_window_bits(osb), + &cluster_off, &cluster_count); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + la->la_bm_off = cpu_to_le32(cluster_off); + alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count); + /* just in case... In the future when we find space ourselves, + * we don't have to get all contiguous -- but we'll have to + * set all previously used bits in bitmap and update + * la_bits_set before setting the bits in the main bitmap. 
*/ + alloc->id1.bitmap1.i_used = 0; + memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, + le16_to_cpu(la->la_size)); + + mlog(0, "New window allocated:\n"); + mlog(0, "window la_bm_off = %u\n", + OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); + mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total)); + +bail: + mlog_exit(status); + return status; +} + +/* Note that we do *NOT* lock the local alloc inode here as + * it's been locked already for us. */ +static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, + struct inode *local_alloc_inode) +{ + int status = 0; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct ocfs2_journal_handle *handle = NULL; + struct ocfs2_dinode *alloc; + struct ocfs2_dinode *alloc_copy = NULL; + struct ocfs2_alloc_context *ac = NULL; + + mlog_entry_void(); + + handle = ocfs2_alloc_handle(osb); + if (!handle) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* This will lock the main bitmap for us. */ + status = ocfs2_local_alloc_reserve_for_window(osb, + handle, + &ac, + &main_bm_inode, + &main_bm_bh); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + + /* We want to clear the local alloc before doing anything + * else, so that if we error later during this operation, + * local alloc shutdown won't try to double free main bitmap + * bits. Make a copy so the sync function knows which bits to + * free. 
*/ + alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_KERNEL); + if (!alloc_copy) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); + /* get journal write access before wiping the live window */ + status = ocfs2_journal_access(handle, local_alloc_inode, + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_clear_local_alloc(alloc); + + status = ocfs2_journal_dirty(handle, osb->local_alloc_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + /* free the old window's unclaimed bits, as recorded in the copy, + * in the main bitmap */ + status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, + main_bm_inode, main_bm_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + /* claim a fresh window out of the reservation held in ac */ + status = ocfs2_local_alloc_new_window(osb, handle, ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + /* count the completed window move */ + atomic_inc(&osb->alloc_stats.moves); + + status = 0; +bail: + if (handle) + ocfs2_commit_trans(handle); + /* drop the extra references that + * ocfs2_local_alloc_reserve_for_window took on our behalf */ + if (main_bm_bh) + brelse(main_bm_bh); + + if (main_bm_inode) + iput(main_bm_inode); + + if (alloc_copy) + kfree(alloc_copy); + + if (ac) + ocfs2_free_alloc_context(ac); + + mlog_exit(status); + return status; +} + diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h new file mode 100644 index 00000000000..30f88ce14e4 --- /dev/null +++ b/fs/ocfs2/localalloc.h @@ -0,0 +1,56 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * localalloc.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +#ifndef OCFS2_LOCALALLOC_H +#define OCFS2_LOCALALLOC_H + +int ocfs2_load_local_alloc(struct ocfs2_super *osb); + +void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb); + +int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, + int node_num, + struct ocfs2_dinode **alloc_copy); + +int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc); + +int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, + u64 bits); + +struct ocfs2_alloc_context; +int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, + struct ocfs2_journal_handle *passed_handle, + u32 bits_wanted, + struct ocfs2_alloc_context *ac); + +int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u32 min_bits, + u32 *bit_off, + u32 *num_bits); + +#endif /* OCFS2_LOCALALLOC_H */ diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c new file mode 100644 index 00000000000..afdeec4b0ee --- /dev/null +++ b/fs/ocfs2/mmap.c @@ -0,0 +1,102 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * mmap.c + * + * Code to deal with the mess that is clustered mmap. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/uio.h> +#include <linux/signal.h> +#include <linux/rbtree.h> + +#define MLOG_MASK_PREFIX ML_FILE_IO +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "dlmglue.h" +#include "file.h" +#include "inode.h" +#include "mmap.h" +/* + * ->nopage handler: runs filemap_nopage() with every signal blocked + * and restores the previous signal mask afterwards. Returns whatever + * filemap_nopage() returned, or NOPAGE_SIGBUS if the signals could + * not be blocked. inode is only used for the entry log message. + */ +static struct page *ocfs2_nopage(struct vm_area_struct * area, + unsigned long address, + int *type) +{ + struct inode *inode = area->vm_file->f_dentry->d_inode; + struct page *page = NOPAGE_SIGBUS; + sigset_t blocked, oldset; + int ret; + + mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address); + + /* The best way to deal with signals in this path is + * to block them upfront, rather than allowing the + * locking paths to return -ERESTARTSYS. */ + sigfillset(&blocked); + + /* We should technically never get a bad ret return + * from sigprocmask */ + ret = sigprocmask(SIG_BLOCK, &blocked, &oldset); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + page = filemap_nopage(area, address, type); + /* put the caller's signal mask back; a failure here is only + * logged and the page is still returned */ + ret = sigprocmask(SIG_SETMASK, &oldset, NULL); + if (ret < 0) + mlog_errno(ret); +out: + mlog_exit_ptr(page); + return page; +} +/* vm operations installed on every mapping set up by ocfs2_mmap */ +static struct vm_operations_struct ocfs2_file_vm_ops = { + .nopage = ocfs2_nopage, +}; +/* + * mmap entry point. Refuses with -EINVAL any mapping that is (or may + * become) shared and is (or may become) writable; otherwise updates + * the inode's atime, installs ocfs2_file_vm_ops and returns 0. + */ +int ocfs2_mmap(struct file *file, + struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + + /* We don't want to support shared writable mappings yet. 
*/ + if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) + && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { + mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); + /* This is -EINVAL because generic_file_readonly_mmap + * returns it in a similar situation. */ + return -EINVAL; + } + + update_atime(inode); + vma->vm_ops = &ocfs2_file_vm_ops; + return 0; +} + diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h new file mode 100644 index 00000000000..1274ee0f1fe --- /dev/null +++ b/fs/ocfs2/mmap.h @@ -0,0 +1,6 @@ +#ifndef OCFS2_MMAP_H +#define OCFS2_MMAP_H + +int ocfs2_mmap(struct file *file, struct vm_area_struct *vma); + +#endif /* OCFS2_MMAP_H */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c new file mode 100644 index 00000000000..f6b77ff1d2b --- /dev/null +++ b/fs/ocfs2/namei.c @@ -0,0 +1,2264 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * namei.c + * + * Create and rename file, directory, symlinks + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * Portions of this code from linux/fs/ext3/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> + +#define MLOG_MASK_PREFIX ML_NAMEI +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dcache.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "namei.h" +#include "suballoc.h" +#include "symlink.h" +#include "sysfile.h" +#include "uptodate.h" +#include "vote.h" + +#include "buffer_head_io.h" + +#define NAMEI_RA_CHUNKS 2 +#define NAMEI_RA_BLOCKS 4 +#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) +#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + +static int inline ocfs2_search_dirblock(struct buffer_head *bh, + struct inode *dir, + const char *name, int namelen, + unsigned long offset, + struct ocfs2_dir_entry **res_dir); + +static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle, + struct inode *dir, + struct ocfs2_dir_entry *de_del, + struct buffer_head *bh); + +static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle, + struct inode *dir, + const char *name, int namelen, + struct inode *inode, u64 blkno, + struct buffer_head *parent_fe_bh, + struct buffer_head *insert_bh); + +static int ocfs2_mknod_locked(struct ocfs2_super *osb, + struct inode *dir, + struct dentry *dentry, int mode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + struct ocfs2_journal_handle *handle, + struct inode **ret_inode, + struct ocfs2_alloc_context *inode_ac); + +static int ocfs2_fill_new_dir(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context 
*data_ac); + +static int ocfs2_double_lock(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct buffer_head **bh1, + struct inode *inode1, + struct buffer_head **bh2, + struct inode *inode2); + +static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + char *name, + struct buffer_head **de_bh); + +static int ocfs2_orphan_add(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct ocfs2_dinode *fe, + char *name, + struct buffer_head *de_bh); + +static int ocfs2_create_symlink_data(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + const char *symname); + +static inline int ocfs2_add_entry(struct ocfs2_journal_handle *handle, + struct dentry *dentry, + struct inode *inode, u64 blkno, + struct buffer_head *parent_fe_bh, + struct buffer_head *insert_bh) +{ + return __ocfs2_add_entry(handle, dentry->d_parent->d_inode, + dentry->d_name.name, dentry->d_name.len, + inode, blkno, parent_fe_bh, insert_bh); +} + +/* An orphan dir name is an 8 byte value, printed as a hex string */ +#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) + +static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int status; + u64 blkno; + struct buffer_head *dirent_bh = NULL; + struct inode *inode = NULL; + struct dentry *ret; + struct ocfs2_dir_entry *dirent; + struct ocfs2_inode_info *oi; + + mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, + dentry->d_name.len, dentry->d_name.name); + + if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) { + ret = ERR_PTR(-ENAMETOOLONG); + goto bail; + } + + mlog(0, "find name %.*s in directory %"MLFu64"\n", dentry->d_name.len, + dentry->d_name.name, OCFS2_I(dir)->ip_blkno); + + status = ocfs2_meta_lock(dir, NULL, NULL, 0); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + ret = ERR_PTR(status); + goto bail; + } + + status = 
ocfs2_find_files_on_disk(dentry->d_name.name, + dentry->d_name.len, &blkno, + dir, &dirent_bh, &dirent); + if (status < 0) + goto bail_add; + + inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno); + if (IS_ERR(inode)) { + mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno); + ret = ERR_PTR(-EACCES); + goto bail_unlock; + } + + oi = OCFS2_I(inode); + /* Clear any orphaned state... If we were able to look up the + * inode from a directory, it certainly can't be orphaned. We + * might have the bad state from a node which intended to + * orphan this inode but crashed before it could commit the + * unlink. */ + spin_lock(&oi->ip_lock); + oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; + oi->ip_orphaned_slot = OCFS2_INVALID_SLOT; + spin_unlock(&oi->ip_lock); + +bail_add: + + dentry->d_op = &ocfs2_dentry_ops; + ret = d_splice_alias(inode, dentry); + +bail_unlock: + /* Don't drop the cluster lock until *after* the d_add -- + * unlink on another node will message us to remove that + * dentry under this lock so otherwise we can race this with + * the vote thread and have a stale dentry. 
*/ + ocfs2_meta_unlock(dir, 0); + +bail: + if (dirent_bh) + brelse(dirent_bh); + + mlog_exit_ptr(ret); + + return ret; +} + +static int ocfs2_fill_new_dir(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *data_ac) +{ + int status; + struct buffer_head *new_bh = NULL; + struct ocfs2_dir_entry *de = NULL; + + mlog_entry_void(); + + status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, + data_ac, NULL, &new_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_set_new_buffer_uptodate(inode, new_bh); + + status = ocfs2_journal_access(handle, inode, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + memset(new_bh->b_data, 0, osb->sb->s_blocksize); + + de = (struct ocfs2_dir_entry *) new_bh->b_data; + de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); + de->name_len = 1; + de->rec_len = + cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); + strcpy(de->name, "."); + ocfs2_set_de_type(de, S_IFDIR); + de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len)); + de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno); + de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize - + OCFS2_DIR_REC_LEN(1)); + de->name_len = 2; + strcpy(de->name, ".."); + ocfs2_set_de_type(de, S_IFDIR); + + status = ocfs2_journal_dirty(handle, new_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + i_size_write(inode, inode->i_sb->s_blocksize); + inode->i_nlink = 2; + inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize); + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + if (new_bh) + brelse(new_bh); + + mlog_exit(status); + return status; +} + +static int ocfs2_mknod(struct inode *dir, + struct dentry *dentry, + int mode, + dev_t dev) +{ + int status = 0; + struct buffer_head 
*parent_fe_bh = NULL; + struct ocfs2_journal_handle *handle = NULL; + struct ocfs2_super *osb; + struct ocfs2_dinode *dirfe; + struct buffer_head *new_fe_bh = NULL; + struct buffer_head *de_bh = NULL; + struct inode *inode = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + + mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, + (unsigned long)dev, dentry->d_name.len, + dentry->d_name.name); + + /* get our super block */ + osb = OCFS2_SB(dir->i_sb); + + if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) { + mlog(ML_ERROR, "inode %"MLFu64" has i_nlink of %u\n", + OCFS2_I(dir)->ip_blkno, dir->i_nlink); + status = -EMLINK; + goto leave; + } + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto leave; + } + + dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + if (!dirfe->i_links_count) { + /* can't make a file in a deleted directory. */ + status = -ENOENT; + goto leave; + } + + status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (status) + goto leave; + + /* get a spot inside the dir. */ + status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &de_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* reserve an inode spot */ + status = ocfs2_reserve_new_inode(osb, handle, &inode_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + /* are we making a directory? If so, reserve a cluster for his + * 1st extent. 
*/ + if (S_ISDIR(mode)) { + status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_MKNOD_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + + /* do the real work now. */ + status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev, + &new_fe_bh, parent_fe_bh, handle, + &inode, inode_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (S_ISDIR(mode)) { + status = ocfs2_fill_new_dir(osb, handle, dir, inode, + new_fe_bh, data_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access(handle, dir, parent_fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + le16_add_cpu(&dirfe->i_links_count, 1); + status = ocfs2_journal_dirty(handle, parent_fe_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + dir->i_nlink++; + } + + status = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, parent_fe_bh, + de_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + insert_inode_hash(inode); + dentry->d_op = &ocfs2_dentry_ops; + d_instantiate(dentry, inode); + status = 0; +leave: + if (handle) + ocfs2_commit_trans(handle); + + if (status == -ENOSPC) + mlog(0, "Disk is full\n"); + + if (new_fe_bh) + brelse(new_fe_bh); + + if (de_bh) + brelse(de_bh); + + if (parent_fe_bh) + brelse(parent_fe_bh); + + if ((status < 0) && inode) + iput(inode); + + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + mlog_exit(status); + + return status; +} + +static int ocfs2_mknod_locked(struct ocfs2_super *osb, + struct inode *dir, + struct dentry *dentry, int mode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + struct ocfs2_journal_handle *handle, + struct 
inode **ret_inode, + struct ocfs2_alloc_context *inode_ac) +{ + int status = 0; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_extent_list *fel; + u64 fe_blkno = 0; + u16 suballoc_bit; + struct inode *inode = NULL; + + mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, + (unsigned long)dev, dentry->d_name.len, + dentry->d_name.name); + + *new_fe_bh = NULL; + *ret_inode = NULL; + + status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, + &fe_blkno); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + inode = new_inode(dir->i_sb); + /* new_inode() reports failure by returning NULL, never an + * ERR_PTR, so an IS_ERR() test would let a failed allocation + * through to the dereferences below. */ + if (!inode) { + status = -ENOMEM; + mlog(ML_ERROR, "new_inode failed!\n"); + goto leave; + } + + /* populate as many fields early on as possible - many of + * these are used by the support functions here and in + * callers. */ + inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); + OCFS2_I(inode)->ip_blkno = fe_blkno; + if (S_ISDIR(mode)) + inode->i_nlink = 2; + else + inode->i_nlink = 1; + inode->i_mode = mode; + spin_lock(&osb->osb_lock); + inode->i_generation = osb->s_next_generation++; + spin_unlock(&osb->osb_lock); + + *new_fe_bh = sb_getblk(osb->sb, fe_blkno); + if (!*new_fe_bh) { + status = -EIO; + mlog_errno(status); + goto leave; + } + ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); + + status = ocfs2_journal_access(handle, inode, *new_fe_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data; + memset(fe, 0, osb->sb->s_blocksize); + + fe->i_generation = cpu_to_le32(inode->i_generation); + fe->i_fs_generation = cpu_to_le32(osb->fs_generation); + fe->i_blkno = cpu_to_le64(fe_blkno); + fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); + fe->i_suballoc_slot = cpu_to_le16(osb->slot_num); + fe->i_uid = cpu_to_le32(current->fsuid); + if (dir->i_mode & S_ISGID) { + fe->i_gid = cpu_to_le32(dir->i_gid); + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + fe->i_gid = 
cpu_to_le32(current->fsgid); + fe->i_mode = cpu_to_le16(mode); + if (S_ISCHR(mode) || S_ISBLK(mode)) + fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); + + fe->i_links_count = cpu_to_le16(inode->i_nlink); + + fe->i_last_eb_blk = 0; + strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); + le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL); + fe->i_atime = fe->i_ctime = fe->i_mtime = + cpu_to_le64(CURRENT_TIME.tv_sec); + fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = + cpu_to_le32(CURRENT_TIME.tv_nsec); + fe->i_dtime = 0; + + fel = &fe->id2.i_list; + fel->l_tree_depth = 0; + fel->l_next_free_rec = 0; + fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); + + status = ocfs2_journal_dirty(handle, *new_fe_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (ocfs2_populate_inode(inode, fe, 1) < 0) { + mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, " + "i_blkno=%"MLFu64", i_ino=%lu\n", + (unsigned long long) (*new_fe_bh)->b_blocknr, + fe->i_blkno, inode->i_ino); + BUG(); + } + + ocfs2_inode_set_new(osb, inode); + status = ocfs2_create_new_inode_locks(inode); + if (status < 0) + mlog_errno(status); + + status = 0; /* error in ocfs2_create_new_inode_locks is not + * critical */ + + *ret_inode = inode; +leave: + if (status < 0) { + if (*new_fe_bh) { + brelse(*new_fe_bh); + *new_fe_bh = NULL; + } + if (inode) + iput(inode); + } + + mlog_exit(status); + return status; +} + +static int ocfs2_mkdir(struct inode *dir, + struct dentry *dentry, + int mode) +{ + int ret; + + mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode, + dentry->d_name.len, dentry->d_name.name); + ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0); + mlog_exit(ret); + + return ret; +} + +static int ocfs2_create(struct inode *dir, + struct dentry *dentry, + int mode, + struct nameidata *nd) +{ + int ret; + + mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode, + dentry->d_name.len, dentry->d_name.name); + ret = ocfs2_mknod(dir, dentry, mode | 
S_IFREG, 0); + mlog_exit(ret); + + return ret; +} + +static int ocfs2_link(struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + struct ocfs2_journal_handle *handle = NULL; + struct inode *inode = old_dentry->d_inode; + int err; + struct buffer_head *fe_bh = NULL; + struct buffer_head *parent_fe_bh = NULL; + struct buffer_head *de_bh = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + + mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, + old_dentry->d_name.len, old_dentry->d_name.name, + dentry->d_name.len, dentry->d_name.name); + + if (S_ISDIR(inode->i_mode)) { + err = -EPERM; + goto bail; + } + + if (inode->i_nlink >= OCFS2_LINK_MAX) { + err = -EMLINK; + goto bail; + } + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + err = -ENOMEM; + goto bail; + } + + err = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1); + if (err < 0) { + if (err != -ENOENT) + mlog_errno(err); + goto bail; + } + + err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (err) + goto bail; + + err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &de_bh); + if (err < 0) { + mlog_errno(err); + goto bail; + } + + err = ocfs2_meta_lock(inode, handle, &fe_bh, 1); + if (err < 0) { + if (err != -ENOENT) + mlog_errno(err); + goto bail; + } + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) { + err = -EMLINK; + goto bail; + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_LINK_CREDITS); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + handle = NULL; + mlog_errno(err); + goto bail; + } + + err = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (err < 0) { + mlog_errno(err); + goto bail; + } + + inode->i_nlink++; + inode->i_ctime = CURRENT_TIME; + fe->i_links_count = cpu_to_le16(inode->i_nlink); + fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + 
fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + err = ocfs2_journal_dirty(handle, fe_bh); + if (err < 0) { + le16_add_cpu(&fe->i_links_count, -1); + inode->i_nlink--; + mlog_errno(err); + goto bail; + } + + err = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, + parent_fe_bh, de_bh); + if (err) { + le16_add_cpu(&fe->i_links_count, -1); + inode->i_nlink--; + mlog_errno(err); + goto bail; + } + + atomic_inc(&inode->i_count); + dentry->d_op = &ocfs2_dentry_ops; + d_instantiate(dentry, inode); +bail: + if (handle) + ocfs2_commit_trans(handle); + if (de_bh) + brelse(de_bh); + if (fe_bh) + brelse(fe_bh); + if (parent_fe_bh) + brelse(parent_fe_bh); + + mlog_exit(err); + + return err; +} + +static int ocfs2_unlink(struct inode *dir, + struct dentry *dentry) +{ + int status; + unsigned int saved_nlink = 0; + struct inode *inode = dentry->d_inode; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + u64 blkno; + struct ocfs2_dinode *fe = NULL; + struct buffer_head *fe_bh = NULL; + struct buffer_head *parent_node_bh = NULL; + struct ocfs2_journal_handle *handle = NULL; + struct ocfs2_dir_entry *dirent = NULL; + struct buffer_head *dirent_bh = NULL; + char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; + struct buffer_head *orphan_entry_bh = NULL; + + mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, + dentry->d_name.len, dentry->d_name.name); + + BUG_ON(dentry->d_parent->d_inode != dir); + + mlog(0, "ino = %"MLFu64"\n", OCFS2_I(inode)->ip_blkno); + + if (inode == osb->root_inode) { + mlog(0, "Cannot delete the root directory\n"); + status = -EPERM; + goto leave; + } + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + status = ocfs2_meta_lock(dir, handle, &parent_node_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto leave; + } + + status = ocfs2_find_files_on_disk(dentry->d_name.name, + dentry->d_name.len, &blkno, + dir, &dirent_bh, &dirent); + if 
(status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto leave; + } + + if (OCFS2_I(inode)->ip_blkno != blkno) { + status = -ENOENT; + + mlog(0, "ip_blkno (%"MLFu64") != dirent blkno (%"MLFu64") " + "ip_flags = %x\n", OCFS2_I(inode)->ip_blkno, blkno, + OCFS2_I(inode)->ip_flags); + goto leave; + } + + status = ocfs2_meta_lock(inode, handle, &fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto leave; + } + + if (S_ISDIR(inode->i_mode)) { + if (!ocfs2_empty_dir(inode)) { + status = -ENOTEMPTY; + goto leave; + } else if (inode->i_nlink != 2) { + status = -ENOTEMPTY; + goto leave; + } + } + + /* There are still a few steps left until we can consider the + * unlink to have succeeded. Save off nlink here before + * modification so we can set it back in case we hit an issue + * before commit. */ + saved_nlink = inode->i_nlink; + if (S_ISDIR(inode->i_mode)) + inode->i_nlink = 0; + else + inode->i_nlink--; + + status = ocfs2_request_unlink_vote(inode, dentry, + (unsigned int) inode->i_nlink); + if (status < 0) { + /* This vote should succeed under all normal + * circumstances. 
*/ + mlog_errno(status); + goto leave; + } + + if (!inode->i_nlink) { + status = ocfs2_prepare_orphan_dir(osb, handle, inode, + orphan_name, + &orphan_entry_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_UNLINK_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + if (!inode->i_nlink) { + status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, + orphan_entry_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + } + + /* delete the name from the parent dir */ + status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* We can set nlink on the dinode now. clear the saved version + * so that it doesn't get set later. */ + fe->i_links_count = cpu_to_le16(inode->i_nlink); + saved_nlink = 0; + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (S_ISDIR(inode->i_mode)) { + dir->i_nlink--; + status = ocfs2_mark_inode_dirty(handle, dir, + parent_node_bh); + if (status < 0) { + mlog_errno(status); + dir->i_nlink++; + } + } + +leave: + if (status < 0 && saved_nlink) + inode->i_nlink = saved_nlink; + + if (handle) + ocfs2_commit_trans(handle); + + if (fe_bh) + brelse(fe_bh); + + if (dirent_bh) + brelse(dirent_bh); + + if (parent_node_bh) + brelse(parent_node_bh); + + if (orphan_entry_bh) + brelse(orphan_entry_bh); + + mlog_exit(status); + + return status; +} + +/* + * The only place this should be used is rename! + * if they have the same id, then the 1st one is the only one locked. 
+ */ +static int ocfs2_double_lock(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct buffer_head **bh1, + struct inode *inode1, + struct buffer_head **bh2, + struct inode *inode2) +{ + int status; + struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); + struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); + struct buffer_head **tmpbh; + struct inode *tmpinode; + + mlog_entry("(inode1 = %"MLFu64", inode2 = %"MLFu64")\n", + oi1->ip_blkno, oi2->ip_blkno); + + BUG_ON(!handle); + + if (*bh1) + *bh1 = NULL; + if (*bh2) + *bh2 = NULL; + + /* we always want to lock the one with the lower lockid first. */ + if (oi1->ip_blkno != oi2->ip_blkno) { + if (oi1->ip_blkno < oi2->ip_blkno) { + /* switch id1 and id2 around */ + mlog(0, "switching them around...\n"); + tmpbh = bh2; + bh2 = bh1; + bh1 = tmpbh; + + tmpinode = inode2; + inode2 = inode1; + inode1 = tmpinode; + } + /* lock id2 */ + status = ocfs2_meta_lock(inode2, handle, bh2, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + } + /* lock id1 */ + status = ocfs2_meta_lock(inode1, handle, bh1, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } +bail: + mlog_exit(status); + return status; +} + +#define PARENT_INO(buffer) \ + ((struct ocfs2_dir_entry *) \ + ((char *)buffer + \ + le16_to_cpu(((struct ocfs2_dir_entry *)buffer)->rec_len)))->inode + +static int ocfs2_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry) +{ + int status = 0, rename_lock = 0; + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct ocfs2_dinode *newfe = NULL; + char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; + struct buffer_head *orphan_entry_bh = NULL; + struct buffer_head *newfe_bh = NULL; + struct buffer_head *insert_entry_bh = NULL; + struct ocfs2_super *osb = NULL; + u64 newfe_blkno; + struct ocfs2_journal_handle *handle = NULL; + struct buffer_head 
*old_dir_bh = NULL; + struct buffer_head *new_dir_bh = NULL; + struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry + // and new_dentry + struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above + struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir, + // this is the 1st dirent bh + nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink; + unsigned int links_count; + + /* At some point it might be nice to break this function up a + * bit. */ + + mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n", + old_dir, old_dentry, new_dir, new_dentry, + old_dentry->d_name.len, old_dentry->d_name.name, + new_dentry->d_name.len, new_dentry->d_name.name); + + osb = OCFS2_SB(old_dir->i_sb); + + if (new_inode) { + if (!igrab(new_inode)) + BUG(); + } + + if (atomic_read(&old_dentry->d_count) > 2) { + shrink_dcache_parent(old_dentry); + if (atomic_read(&old_dentry->d_count) > 2) { + status = -EBUSY; + goto bail; + } + } + + /* Assume a directory heirarchy thusly: + * a/b/c + * a/d + * a,b,c, and d are all directories. + * + * from cwd of 'a' on both nodes: + * node1: mv b/c d + * node2: mv d b/c + * + * And that's why, just like the VFS, we need a file system + * rename lock. */ + if (old_dentry != new_dentry) { + status = ocfs2_rename_lock(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + rename_lock = 1; + } + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* if old and new are the same, this'll just do one lock. 
*/ + status = ocfs2_double_lock(osb, handle, + &old_dir_bh, old_dir, + &new_dir_bh, new_dir); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* make sure both dirs have bhs + * get an extra ref on old_dir_bh if old==new */ + if (!new_dir_bh) { + if (old_dir_bh) { + new_dir_bh = old_dir_bh; + get_bh(new_dir_bh); + } else { + mlog(ML_ERROR, "no old_dir_bh!\n"); + status = -EIO; + goto bail; + } + } + + if (S_ISDIR(old_inode->i_mode)) { + /* Directories actually require metadata updates to + * the directory info so we can't get away with not + * doing node locking on it. */ + status = ocfs2_meta_lock(old_inode, handle, NULL, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + + status = ocfs2_request_rename_vote(old_inode, old_dentry); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = -EIO; + old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0); + if (!old_inode_de_bh) + goto bail; + + status = -EIO; + if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) != + OCFS2_I(old_dir)->ip_blkno) + goto bail; + status = -EMLINK; + if (!new_inode && new_dir!=old_dir && + new_dir->i_nlink >= OCFS2_LINK_MAX) + goto bail; + } else { + /* Ah, the simple case - we're a file so just send a + * message. */ + status = ocfs2_request_rename_vote(old_inode, old_dentry); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + status = -ENOENT; + old_de_bh = ocfs2_find_entry(old_dentry->d_name.name, + old_dentry->d_name.len, + old_dir, &old_de); + if (!old_de_bh) + goto bail; + + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. 
Goodbye sticky bit ;-< + */ + if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno) + goto bail; + + /* check if the target already exists (in which case we need + * to delete it */ + status = ocfs2_find_files_on_disk(new_dentry->d_name.name, + new_dentry->d_name.len, + &newfe_blkno, new_dir, &new_de_bh, + &new_de); + /* The only error we allow here is -ENOENT because the new + * file not existing is perfectly valid. */ + if ((status < 0) && (status != -ENOENT)) { + /* If we cannot find the file specified we should just */ + /* return the error... */ + mlog_errno(status); + goto bail; + } + + if (!new_de && new_inode) + mlog(ML_ERROR, "inode %lu does not exist in it's parent " + "directory!", new_inode->i_ino); + + /* In case we need to overwrite an existing file, we blow it + * away first */ + if (new_de) { + /* VFS didn't think there existed an inode here, but + * someone else in the cluster must have raced our + * rename to create one. Today we error cleanly, in + * the future we should consider calling iget to build + * a new struct inode for this entry. */ + if (!new_inode) { + status = -EACCES; + + mlog(0, "We found an inode for name %.*s but VFS " + "didn't give us one.\n", new_dentry->d_name.len, + new_dentry->d_name.name); + goto bail; + } + + if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) { + status = -EACCES; + + mlog(0, "Inode blkno (%"MLFu64") and dir (%"MLFu64") " + "disagree. 
ip_flags = %x\n", + OCFS2_I(new_inode)->ip_blkno, newfe_blkno, + OCFS2_I(new_inode)->ip_flags); + goto bail; + } + + status = ocfs2_meta_lock(new_inode, handle, &newfe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + + if (S_ISDIR(new_inode->i_mode)) + links_count = 0; + else + links_count = (unsigned int) (new_inode->i_nlink - 1); + + status = ocfs2_request_unlink_vote(new_inode, new_dentry, + links_count); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + newfe = (struct ocfs2_dinode *) newfe_bh->b_data; + + mlog(0, "aha rename over existing... new_de=%p " + "new_blkno=%"MLFu64" newfebh=%p bhblocknr=%llu\n", + new_de, newfe_blkno, newfe_bh, newfe_bh ? + (unsigned long long)newfe_bh->b_blocknr : 0ULL); + + if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { + status = ocfs2_prepare_orphan_dir(osb, handle, + new_inode, + orphan_name, + &orphan_entry_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + } else { + BUG_ON(new_dentry->d_parent->d_inode != new_dir); + + status = ocfs2_check_dir_for_entry(new_dir, + new_dentry->d_name.name, + new_dentry->d_name.len); + if (status) + goto bail; + + status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh, + new_dentry->d_name.name, + new_dentry->d_name.len, + &insert_entry_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_RENAME_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + if (new_de) { + if (S_ISDIR(new_inode->i_mode)) { + if (!ocfs2_empty_dir(new_inode) || + new_inode->i_nlink != 2) { + status = -ENOTEMPTY; + goto bail; + } + } + status = ocfs2_journal_access(handle, new_inode, newfe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (S_ISDIR(new_inode->i_mode) || + (newfe->i_links_count == cpu_to_le16(1))){ + status = 
ocfs2_orphan_add(osb, handle, new_inode, + newfe, orphan_name, + orphan_entry_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* change the dirent to point to the correct inode */ + status = ocfs2_journal_access(handle, new_dir, new_de_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno); + new_de->file_type = old_de->file_type; + new_dir->i_version++; + status = ocfs2_journal_dirty(handle, new_de_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (S_ISDIR(new_inode->i_mode)) + newfe->i_links_count = 0; + else + le16_add_cpu(&newfe->i_links_count, -1); + + status = ocfs2_journal_dirty(handle, newfe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } else { + /* if the name was not found in new_dir, add it now */ + status = ocfs2_add_entry(handle, new_dentry, old_inode, + OCFS2_I(old_inode)->ip_blkno, + new_dir_bh, insert_entry_bh); + /* The old name is deleted just below; if the new name did + * not go in we must stop here, or the inode would be left + * without any directory entry. */ + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + old_inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(old_inode); + + /* now that the name has been added to new_dir, remove the old name */ + status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (new_inode) { + new_inode->i_nlink--; + new_inode->i_ctime = CURRENT_TIME; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; + if (old_inode_de_bh) { + status = ocfs2_journal_access(handle, old_inode, + old_inode_de_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + PARENT_INO(old_inode_de_bh->b_data) = + cpu_to_le64(OCFS2_I(new_dir)->ip_blkno); + status = ocfs2_journal_dirty(handle, old_inode_de_bh); + old_dir->i_nlink--; + if (new_inode) { + new_inode->i_nlink--; + } else { + new_dir->i_nlink++; + mark_inode_dirty(new_dir); + } + } + mark_inode_dirty(old_dir); + if (new_inode) + mark_inode_dirty(new_inode); + + if (old_dir != new_dir) + if (new_dir_nlink != new_dir->i_nlink) { + if (!new_dir_bh) { + 
mlog(ML_ERROR, "need to change nlink for new " + "dir %"MLFu64" from %d to %d but bh is " + "NULL\n", OCFS2_I(new_dir)->ip_blkno, + (int)new_dir_nlink, new_dir->i_nlink); + } else { + struct ocfs2_dinode *fe; + status = ocfs2_journal_access(handle, + new_dir, + new_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + fe = (struct ocfs2_dinode *) new_dir_bh->b_data; + fe->i_links_count = cpu_to_le16(new_dir->i_nlink); + status = ocfs2_journal_dirty(handle, new_dir_bh); + } + } + + if (old_dir_nlink != old_dir->i_nlink) { + if (!old_dir_bh) { + mlog(ML_ERROR, "need to change nlink for old dir " + "%"MLFu64" from %d to %d but bh is NULL!\n", + OCFS2_I(old_dir)->ip_blkno, + (int)old_dir_nlink, + old_dir->i_nlink); + } else { + struct ocfs2_dinode *fe; + status = ocfs2_journal_access(handle, old_dir, + old_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + fe = (struct ocfs2_dinode *) old_dir_bh->b_data; + fe->i_links_count = cpu_to_le16(old_dir->i_nlink); + status = ocfs2_journal_dirty(handle, old_dir_bh); + } + } + + status = 0; +bail: + if (rename_lock) + ocfs2_rename_unlock(osb); + + if (handle) + ocfs2_commit_trans(handle); + + if (new_inode) + sync_mapping_buffers(old_inode->i_mapping); + + if (new_inode) + iput(new_inode); + if (newfe_bh) + brelse(newfe_bh); + if (old_dir_bh) + brelse(old_dir_bh); + if (new_dir_bh) + brelse(new_dir_bh); + if (new_de_bh) + brelse(new_de_bh); + if (old_de_bh) + brelse(old_de_bh); + if (old_inode_de_bh) + brelse(old_inode_de_bh); + if (orphan_entry_bh) + brelse(orphan_entry_bh); + if (insert_entry_bh) + brelse(insert_entry_bh); + + mlog_exit(status); + + return status; +} + +/* + * we expect i_size = strlen(symname). Copy symname into the file + * data, including the null terminator. 
+ */ +static int ocfs2_create_symlink_data(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + const char *symname) +{ + struct buffer_head **bhs = NULL; + const char *c; + struct super_block *sb = osb->sb; + u64 p_blkno; + int p_blocks; + int virtual, blocks, status, i, bytes_left; + + bytes_left = i_size_read(inode) + 1; + /* we can't trust i_blocks because we're actually going to + * write i_size + 1 bytes. */ + blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + + mlog_entry("i_blocks = %lu, i_size = %llu, blocks = %d\n", + inode->i_blocks, i_size_read(inode), blocks); + + /* Sanity check -- make sure we're going to fit. */ + if (bytes_left > + ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) { + status = -EIO; + mlog_errno(status); + goto bail; + } + + bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL); + if (!bhs) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno, + &p_blocks); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* links can never be larger than one cluster so we know this + * is all going to be contiguous, but do a sanity check + * anyway. */ + if ((p_blocks << sb->s_blocksize_bits) < bytes_left) { + status = -EIO; + mlog_errno(status); + goto bail; + } + + virtual = 0; + while(bytes_left > 0) { + c = &symname[virtual * sb->s_blocksize]; + + bhs[virtual] = sb_getblk(sb, p_blkno); + if (!bhs[virtual]) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]); + + status = ocfs2_journal_access(handle, inode, bhs[virtual], + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bhs[virtual]->b_data, 0, sb->s_blocksize); + + memcpy(bhs[virtual]->b_data, c, + (bytes_left > sb->s_blocksize) ? 
sb->s_blocksize : + bytes_left); + + status = ocfs2_journal_dirty(handle, bhs[virtual]); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + virtual++; + p_blkno++; + bytes_left -= sb->s_blocksize; + } + + status = 0; +bail: + + if (bhs) { + for(i = 0; i < blocks; i++) + if (bhs[i]) + brelse(bhs[i]); + kfree(bhs); + } + + mlog_exit(status); + return status; +} + +static int ocfs2_symlink(struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + int status, l, credits; + u64 newsize; + struct ocfs2_super *osb = NULL; + struct inode *inode = NULL; + struct super_block *sb; + struct buffer_head *new_fe_bh = NULL; + struct buffer_head *de_bh = NULL; + struct buffer_head *parent_fe_bh = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_dinode *dirfe; + struct ocfs2_journal_handle *handle = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + + mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, + dentry, symname, dentry->d_name.len, dentry->d_name.name); + + sb = dir->i_sb; + osb = OCFS2_SB(sb); + + l = strlen(symname) + 1; + + credits = ocfs2_calc_symlink_credits(sb); + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* lock the parent directory */ + status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + + dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + if (!dirfe->i_links_count) { + /* can't make a file in a deleted directory. 
*/ + status = -ENOENT; + goto bail; + } + + status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (status) + goto bail; + + status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &de_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_reserve_new_inode(osb, handle, &inode_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + /* don't reserve bitmap space for fast symlinks. */ + if (l > ocfs2_fast_symlink_chars(sb)) { + status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + } + + handle = ocfs2_start_trans(osb, handle, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_mknod_locked(osb, dir, dentry, + S_IFLNK | S_IRWXUGO, 0, + &new_fe_bh, parent_fe_bh, handle, + &inode, inode_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + fe = (struct ocfs2_dinode *) new_fe_bh->b_data; + inode->i_rdev = 0; + newsize = l - 1; + if (l > ocfs2_fast_symlink_chars(sb)) { + inode->i_op = &ocfs2_symlink_inode_operations; + status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh, + handle, data_ac, NULL, + NULL); + if (status < 0) { + if (status != -ENOSPC && status != -EINTR) { + mlog(ML_ERROR, "Failed to extend file to " + "%"MLFu64"\n", + newsize); + mlog_errno(status); + status = -ENOSPC; + } + goto bail; + } + i_size_write(inode, newsize); + inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize); + } else { + inode->i_op = &ocfs2_fast_symlink_inode_operations; + memcpy((char *) fe->id2.i_symlink, symname, l); + i_size_write(inode, newsize); + inode->i_blocks = 0; + } + + status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (!ocfs2_inode_is_fast_symlink(inode)) { 
+ status = ocfs2_create_symlink_data(osb, handle, inode, + symname); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + status = ocfs2_add_entry(handle, dentry, inode, + le64_to_cpu(fe->i_blkno), parent_fe_bh, + de_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + insert_inode_hash(inode); + dentry->d_op = &ocfs2_dentry_ops; + d_instantiate(dentry, inode); +bail: + if (handle) + ocfs2_commit_trans(handle); + if (new_fe_bh) + brelse(new_fe_bh); + if (parent_fe_bh) + brelse(parent_fe_bh); + if (de_bh) + brelse(de_bh); + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if ((status < 0) && inode) + iput(inode); + + mlog_exit(status); + + return status; +} + +int ocfs2_check_dir_entry(struct inode * dir, + struct ocfs2_dir_entry * de, + struct buffer_head * bh, + unsigned long offset) +{ + const char *error_msg = NULL; + const int rlen = le16_to_cpu(de->rec_len); + + if (rlen < OCFS2_DIR_REC_LEN(1)) + error_msg = "rec_len is smaller than minimal"; + else if (rlen % 4 != 0) + error_msg = "rec_len % 4 != 0"; + else if (rlen < OCFS2_DIR_REC_LEN(de->name_len)) + error_msg = "rec_len is too small for name_len"; + else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + error_msg = "directory entry across blocks"; + + if (error_msg != NULL) + mlog(ML_ERROR, "bad entry in directory #%"MLFu64": %s - " + "offset=%lu, inode=%"MLFu64", rec_len=%d, name_len=%d\n", + OCFS2_I(dir)->ip_blkno, error_msg, offset, + le64_to_cpu(de->inode), rlen, de->name_len); + return error_msg == NULL ? 1 : 0; +} + +/* we don't always have a dentry for what we want to add, so people + * like orphan dir can call this instead. + * + * If you pass me insert_bh, I'll skip the search of the other dir + * blocks and put the record in there. 
+ */ +static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle, + struct inode *dir, + const char *name, int namelen, + struct inode *inode, u64 blkno, + struct buffer_head *parent_fe_bh, + struct buffer_head *insert_bh) +{ + unsigned long offset; + unsigned short rec_len; + struct ocfs2_dir_entry *de, *de1; + struct super_block *sb; + int retval, status; + + mlog_entry_void(); + + sb = dir->i_sb; + + if (!namelen) + return -EINVAL; + + rec_len = OCFS2_DIR_REC_LEN(namelen); + offset = 0; + de = (struct ocfs2_dir_entry *) insert_bh->b_data; + while (1) { + BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data); + /* These checks should've already been passed by the + * prepare function, but I guess we can leave them + * here anyway. */ + if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) { + retval = -ENOENT; + goto bail; + } + if (ocfs2_match(namelen, name, de)) { + retval = -EEXIST; + goto bail; + } + if (((le64_to_cpu(de->inode) == 0) && + (le16_to_cpu(de->rec_len) >= rec_len)) || + (le16_to_cpu(de->rec_len) >= + (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) { + status = ocfs2_journal_access(handle, dir, insert_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + /* By now the buffer is marked for journaling */ + offset += le16_to_cpu(de->rec_len); + if (le64_to_cpu(de->inode)) { + de1 = (struct ocfs2_dir_entry *)((char *) de + + OCFS2_DIR_REC_LEN(de->name_len)); + de1->rec_len = + cpu_to_le16(le16_to_cpu(de->rec_len) - + OCFS2_DIR_REC_LEN(de->name_len)); + de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); + de = de1; + } + de->file_type = OCFS2_FT_UNKNOWN; + if (blkno) { + de->inode = cpu_to_le64(blkno); + ocfs2_set_de_type(de, inode->i_mode); + } else + de->inode = 0; + de->name_len = namelen; + memcpy(de->name, name, namelen); + + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_version++; + status = ocfs2_journal_dirty(handle, insert_bh); + retval = 0; + goto bail; + } + offset += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *) 
((char *) de + le16_to_cpu(de->rec_len)); + } + + /* when you think about it, the assert above should prevent us + * from ever getting here. */ + retval = -ENOSPC; +bail: + + mlog_exit(retval); + return retval; +} + + +/* + * ocfs2_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle, + struct inode *dir, + struct ocfs2_dir_entry *de_del, + struct buffer_head *bh) +{ + struct ocfs2_dir_entry *de, *pde; + int i, status = -ENOENT; + + mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh); + + i = 0; + pde = NULL; + de = (struct ocfs2_dir_entry *) bh->b_data; + while (i < bh->b_size) { + if (!ocfs2_check_dir_entry(dir, de, bh, i)) { + status = -EIO; + mlog_errno(status); + goto bail; + } + if (de == de_del) { + status = ocfs2_journal_access(handle, dir, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + status = -EIO; + mlog_errno(status); + goto bail; + } + if (pde) + pde->rec_len = + cpu_to_le16(le16_to_cpu(pde->rec_len) + + le16_to_cpu(de->rec_len)); + else + de->inode = 0; + dir->i_version++; + status = ocfs2_journal_dirty(handle, bh); + goto bail; + } + i += le16_to_cpu(de->rec_len); + pde = de; + de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len)); + } +bail: + mlog_exit(status); + return status; +} + +/* + * Returns 0 if not found, -1 on failure, and 1 on success + */ +static int inline ocfs2_search_dirblock(struct buffer_head *bh, + struct inode *dir, + const char *name, int namelen, + unsigned long offset, + struct ocfs2_dir_entry **res_dir) +{ + struct ocfs2_dir_entry *de; + char *dlimit, *de_buf; + int de_len; + int ret = 0; + + mlog_entry_void(); + + de_buf = bh->b_data; + dlimit = de_buf + dir->i_sb->s_blocksize; + + while (de_buf < dlimit) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + + de = (struct ocfs2_dir_entry *) de_buf; + + if (de_buf + namelen <= dlimit && + 
ocfs2_match(namelen, name, de)) { + /* found a match - just to be sure, do a full check */ + if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { + ret = -1; + goto bail; + } + *res_dir = de; + ret = 1; + goto bail; + } + + /* prevent looping on a bad block */ + de_len = le16_to_cpu(de->rec_len); + if (de_len <= 0) { + ret = -1; + goto bail; + } + + de_buf += de_len; + offset += de_len; + } + +bail: + mlog_exit(ret); + return ret; +} + +struct buffer_head *ocfs2_find_entry(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dir_entry **res_dir) +{ + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; + struct buffer_head *bh, *ret = NULL; + unsigned long start, block, b; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead + buffer */ + int num = 0; + int nblocks, i, err; + + mlog_entry_void(); + + *res_dir = NULL; + sb = dir->i_sb; + + nblocks = i_size_read(dir) >> sb->s_blocksize_bits; + start = OCFS2_I(dir)->ip_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; + +restart: + do { + /* + * We deal with the read-ahead logic here. + */ + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + b = block; + for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { + /* + * Terminate if we reach the end of the + * directory and must wrap, or if our + * search has finished at this block. + */ + if (b >= nblocks || (num && block == start)) { + bh_use[ra_max] = NULL; + break; + } + num++; + + /* XXX: questionable readahead stuff here */ + bh = ocfs2_bread(dir, b++, &err, 1); + bh_use[ra_max] = bh; +#if 0 // ??? 
+ if (bh) + ll_rw_block(READ, 1, &bh); +#endif + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* read error, skip block & hope for the best */ + brelse(bh); + goto next; + } + i = ocfs2_search_dirblock(bh, dir, name, namelen, + block << sb->s_blocksize_bits, + res_dir); + if (i == 1) { + OCFS2_I(dir)->ip_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) + goto cleanup_and_exit; + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. + */ + block = nblocks; + nblocks = i_size_read(dir) >> sb->s_blocksize_bits; + if (block < nblocks) { + start = 0; + goto restart; + } + +cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse(bh_use[ra_ptr]); + + mlog_exit_ptr(ret); + return ret; +} + +static int ocfs2_blkno_stringify(u64 blkno, char *name) +{ + int status, namelen; + + mlog_entry_void(); + + namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016"MLFx64, + blkno); + if (namelen <= 0) { + if (namelen) + status = namelen; + else + status = -EINVAL; + mlog_errno(status); + goto bail; + } + if (namelen != OCFS2_ORPHAN_NAMELEN) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name, + namelen); + + status = 0; +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + char *name, + struct buffer_head **de_bh) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + int status = 0; + + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + orphan_dir_inode = 
ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + osb->slot_num); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + goto leave; + } + + ocfs2_handle_add_inode(handle, orphan_dir_inode); + status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, + orphan_dir_bh, name, + OCFS2_ORPHAN_NAMELEN, de_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + +leave: + if (orphan_dir_inode) + iput(orphan_dir_inode); + + if (orphan_dir_bh) + brelse(orphan_dir_bh); + + mlog_exit(status); + return status; +} + +static int ocfs2_orphan_add(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct ocfs2_dinode *fe, + char *name, + struct buffer_head *de_bh) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + int status = 0; + struct ocfs2_dinode *orphan_fe; + + mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + osb->slot_num); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + goto leave; + } + + status = ocfs2_read_block(osb, + OCFS2_I(orphan_dir_inode)->ip_blkno, + &orphan_dir_bh, OCFS2_BH_CACHED, + orphan_dir_inode); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* we're a cluster, and nlink can change on disk from + * underneath us... 
*/ + orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; + if (S_ISDIR(inode->i_mode)) + le16_add_cpu(&orphan_fe->i_links_count, 1); + orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); + + status = ocfs2_journal_dirty(handle, orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = __ocfs2_add_entry(handle, orphan_dir_inode, name, + OCFS2_ORPHAN_NAMELEN, inode, + OCFS2_I(inode)->ip_blkno, + orphan_dir_bh, de_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); + + /* Record which orphan dir our inode now resides + * in. delete_inode will use this to determine which orphan + * dir to lock. */ + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num; + spin_unlock(&OCFS2_I(inode)->ip_lock); + + mlog(0, "Inode %"MLFu64" orphaned in slot %d\n", + OCFS2_I(inode)->ip_blkno, osb->slot_num); + +leave: + if (orphan_dir_inode) + iput(orphan_dir_inode); + + if (orphan_dir_bh) + brelse(orphan_dir_bh); + + mlog_exit(status); + return status; +} + +/* unlike orphan_add, we expect the orphan dir to already be locked here. 
*/ +int ocfs2_orphan_del(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *orphan_dir_inode, + struct inode *inode, + struct buffer_head *orphan_dir_bh) +{ + char name[OCFS2_ORPHAN_NAMELEN + 1]; + struct ocfs2_dinode *orphan_fe; + int status = 0; + struct buffer_head *target_de_bh = NULL; + struct ocfs2_dir_entry *target_de = NULL; + + mlog_entry_void(); + + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + mlog(0, "removing '%s' from orphan dir %"MLFu64" (namelen=%d)\n", + name, OCFS2_I(orphan_dir_inode)->ip_blkno, OCFS2_ORPHAN_NAMELEN); + + /* find it's spot in the orphan directory */ + target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, + orphan_dir_inode, &target_de); + if (!target_de_bh) { + status = -ENOENT; + mlog_errno(status); + goto leave; + } + + /* remove it from the orphan directory */ + status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de, + target_de_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access(handle,orphan_dir_inode, orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* do the i_nlink dance! 
:) */ + orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; + if (S_ISDIR(inode->i_mode)) + le16_add_cpu(&orphan_fe->i_links_count, -1); + orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); + + status = ocfs2_journal_dirty(handle, orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + +leave: + if (target_de_bh) + brelse(target_de_bh); + + mlog_exit(status); + return status; +} + +struct inode_operations ocfs2_dir_iops = { + .create = ocfs2_create, + .lookup = ocfs2_lookup, + .link = ocfs2_link, + .unlink = ocfs2_unlink, + .rmdir = ocfs2_unlink, + .symlink = ocfs2_symlink, + .mkdir = ocfs2_mkdir, + .mknod = ocfs2_mknod, + .rename = ocfs2_rename, + .setattr = ocfs2_setattr, + .getattr = ocfs2_getattr, +}; diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h new file mode 100644 index 00000000000..deaaa97dbf0 --- /dev/null +++ b/fs/ocfs2/namei.h @@ -0,0 +1,58 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * namei.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_NAMEI_H +#define OCFS2_NAMEI_H + +extern struct inode_operations ocfs2_dir_iops; + +struct dentry *ocfs2_get_parent(struct dentry *child); + +int ocfs2_check_dir_entry (struct inode *dir, + struct ocfs2_dir_entry *de, + struct buffer_head *bh, + unsigned long offset); +struct buffer_head *ocfs2_find_entry(const char *name, + int namelen, + struct inode *dir, + struct ocfs2_dir_entry **res_dir); +int ocfs2_orphan_del(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *orphan_dir_inode, + struct inode *inode, + struct buffer_head *orphan_dir_bh); + +static inline int ocfs2_match(int len, + const char * const name, + struct ocfs2_dir_entry *de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +#endif /* OCFS2_NAMEI_H */ diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h new file mode 100644 index 00000000000..0b499bccec5 --- /dev/null +++ b/fs/ocfs2/ocfs1_fs_compat.h @@ -0,0 +1,109 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs1_fs_compat.h + * + * OCFS1 volume header definitions. OCFS2 creates valid but unmountable + * OCFS1 volume headers on the first two sectors of an OCFS2 volume. + * This allows an OCFS1 volume to see the partition and cleanly fail to + * mount it. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _OCFS1_FS_COMPAT_H +#define _OCFS1_FS_COMPAT_H + +#define OCFS1_MAX_VOL_SIGNATURE_LEN 128 +#define OCFS1_MAX_MOUNT_POINT_LEN 128 +#define OCFS1_MAX_VOL_ID_LENGTH 16 +#define OCFS1_MAX_VOL_LABEL_LEN 64 +#define OCFS1_MAX_CLUSTER_NAME_LEN 64 + +#define OCFS1_MAJOR_VERSION (2) +#define OCFS1_MINOR_VERSION (0) +#define OCFS1_VOLUME_SIGNATURE "OracleCFS" + +/* + * OCFS1 superblock. Lives at sector 0. + */ +struct ocfs1_vol_disk_hdr +{ +/*00*/ __u32 minor_version; + __u32 major_version; +/*08*/ __u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN]; +/*88*/ __u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN]; +/*108*/ __u64 serial_num; +/*110*/ __u64 device_size; + __u64 start_off; +/*120*/ __u64 bitmap_off; + __u64 publ_off; +/*130*/ __u64 vote_off; + __u64 root_bitmap_off; +/*140*/ __u64 data_start_off; + __u64 root_bitmap_size; +/*150*/ __u64 root_off; + __u64 root_size; +/*160*/ __u64 cluster_size; + __u64 num_nodes; +/*170*/ __u64 num_clusters; + __u64 dir_node_size; +/*180*/ __u64 file_node_size; + __u64 internal_off; +/*190*/ __u64 node_cfg_off; + __u64 node_cfg_size; +/*1A0*/ __u64 new_cfg_off; + __u32 prot_bits; + __s32 excl_mount; +/*1B0*/ +}; + + +struct ocfs1_disk_lock +{ +/*00*/ __u32 curr_master; + __u8 file_lock; + __u8 compat_pad[3]; /* Not in orignal definition. Used to + make the already existing alignment + explicit */ + __u64 last_write_time; +/*10*/ __u64 last_read_time; + __u32 writer_node_num; + __u32 reader_node_num; +/*20*/ __u64 oin_node_map; + __u64 dlock_seq_num; +/*30*/ +}; + +/* + * OCFS1 volume label. Lives at sector 1. 
+ */ +struct ocfs1_vol_label +{ +/*00*/ struct ocfs1_disk_lock disk_lock; +/*30*/ __u8 label[OCFS1_MAX_VOL_LABEL_LEN]; +/*70*/ __u16 label_len; +/*72*/ __u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH]; +/*82*/ __u16 vol_id_len; +/*84*/ __u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN]; +/*A4*/ __u16 cluster_name_len; +/*A6*/ +}; + + +#endif /* _OCFS1_FS_COMPAT_H */ + diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h new file mode 100644 index 00000000000..f468c600cf9 --- /dev/null +++ b/fs/ocfs2/ocfs2.h @@ -0,0 +1,464 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2.h + * + * Defines macros and structures used in OCFS2 + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_H +#define OCFS2_H + +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/workqueue.h> +#include <linux/kref.h> + +#include "cluster/nodemanager.h" +#include "cluster/heartbeat.h" +#include "cluster/tcp.h" + +#include "dlm/dlmapi.h" + +#include "ocfs2_fs.h" +#include "endian.h" +#include "ocfs2_lockid.h" + +struct ocfs2_extent_map { + u32 em_clusters; + struct rb_root em_extents; +}; + +/* Most user visible OCFS2 inodes will have very few pieces of + * metadata, but larger files (including bitmaps, etc) must be taken + * into account when designing an access scheme. We allow a small + * amount of inlined blocks to be stored on an array and grow the + * structure into a rb tree when necessary. */ +#define OCFS2_INODE_MAX_CACHE_ARRAY 2 + +struct ocfs2_caching_info { + unsigned int ci_num_cached; + union { + sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY]; + struct rb_root ci_tree; + } ci_cache; +}; + +/* this limits us to 256 nodes + * if we need more, we can do a kmalloc for the map */ +#define OCFS2_NODE_MAP_MAX_NODES 256 +struct ocfs2_node_map { + u16 num_nodes; + unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)]; +}; + +enum ocfs2_ast_action { + OCFS2_AST_INVALID = 0, + OCFS2_AST_ATTACH, + OCFS2_AST_CONVERT, + OCFS2_AST_DOWNCONVERT, +}; + +/* actions for an unlockast function to take. */ +enum ocfs2_unlock_action { + OCFS2_UNLOCK_INVALID = 0, + OCFS2_UNLOCK_CANCEL_CONVERT, + OCFS2_UNLOCK_DROP_LOCK, +}; + +/* ocfs2_lock_res->l_flags flags. 
*/ +#define OCFS2_LOCK_ATTACHED (0x00000001) /* have we initialized + * the lvb */ +#define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in + * dlm_lock */ +#define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to + * downconvert*/ +#define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */ +#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010) +#define OCFS2_LOCK_REFRESHING (0x00000020) +#define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization + * for shutdown paths */ +#define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track + * when to skip queueing + * a lock because it's + * about to be + * dropped. */ +#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ + +struct ocfs2_lock_res_ops; + +typedef void (*ocfs2_lock_callback)(int status, unsigned long data); + +struct ocfs2_lock_res { + void *l_priv; + struct ocfs2_lock_res_ops *l_ops; + spinlock_t l_lock; + + struct list_head l_blocked_list; + struct list_head l_mask_waiters; + + enum ocfs2_lock_type l_type; + unsigned long l_flags; + char l_name[OCFS2_LOCK_ID_MAX_LEN]; + int l_level; + unsigned int l_ro_holders; + unsigned int l_ex_holders; + struct dlm_lockstatus l_lksb; + + /* used from AST/BAST funcs. 
*/ + enum ocfs2_ast_action l_action; + enum ocfs2_unlock_action l_unlock_action; + int l_requested; + int l_blocking; + + wait_queue_head_t l_event; + + struct list_head l_debug_list; +}; + +struct ocfs2_dlm_debug { + struct kref d_refcnt; + struct dentry *d_locking_state; + struct list_head d_lockres_tracking; +}; + +enum ocfs2_vol_state +{ + VOLUME_INIT = 0, + VOLUME_MOUNTED, + VOLUME_DISMOUNTED, + VOLUME_DISABLED +}; + +struct ocfs2_alloc_stats +{ + atomic_t moves; + atomic_t local_data; + atomic_t bitmap_data; + atomic_t bg_allocs; + atomic_t bg_extends; +}; + +enum ocfs2_local_alloc_state +{ + OCFS2_LA_UNUSED = 0, + OCFS2_LA_ENABLED, + OCFS2_LA_DISABLED +}; + +enum ocfs2_mount_options +{ + OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */ + OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ + OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ + OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ + OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ +#ifdef OCFS2_ORACORE_WORKAROUNDS + OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */ +#endif +}; + +#define OCFS2_OSB_SOFT_RO 0x0001 +#define OCFS2_OSB_HARD_RO 0x0002 +#define OCFS2_OSB_ERROR_FS 0x0004 + +struct ocfs2_journal; +struct ocfs2_journal_handle; +struct ocfs2_super +{ + u32 osb_id; /* id used by the proc interface */ + struct task_struct *commit_task; + struct super_block *sb; + struct inode *root_inode; + struct inode *sys_root_inode; + struct inode *system_inodes[NUM_SYSTEM_INODES]; + + struct ocfs2_slot_info *slot_info; + + spinlock_t node_map_lock; + struct ocfs2_node_map mounted_map; + struct ocfs2_node_map recovery_map; + struct ocfs2_node_map umount_map; + + u32 num_clusters; + u64 root_blkno; + u64 system_dir_blkno; + u64 bitmap_blkno; + u32 bitmap_cpg; + u8 *uuid; + char *uuid_str; + u8 *vol_label; + u64 first_cluster_group_blkno; + u32 fs_generation; + + u32 s_feature_compat; + u32 s_feature_incompat; + u32 s_feature_ro_compat; + + /* 
Protects s_next_generaion, osb_flags. Could protect more on + * osb as it's very short lived. */ + spinlock_t osb_lock; + u32 s_next_generation; + unsigned long osb_flags; + + unsigned long s_mount_opt; + + u16 max_slots; + u16 num_nodes; + s16 node_num; + s16 slot_num; + int s_sectsize_bits; + int s_clustersize; + int s_clustersize_bits; + struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */ + + atomic_t vol_state; + struct semaphore recovery_lock; + struct task_struct *recovery_thread_task; + int disable_recovery; + wait_queue_head_t checkpoint_event; + atomic_t needs_checkpoint; + struct ocfs2_journal *journal; + + enum ocfs2_local_alloc_state local_alloc_state; + struct buffer_head *local_alloc_bh; + + /* Next two fields are for local node slot recovery during + * mount. */ + int dirty; + struct ocfs2_dinode *local_alloc_copy; + + struct ocfs2_alloc_stats alloc_stats; + char dev_str[20]; /* "major,minor" of the device */ + + struct dlm_ctxt *dlm; + struct ocfs2_lock_res osb_super_lockres; + struct ocfs2_lock_res osb_rename_lockres; + struct dlm_eviction_cb osb_eviction_cb; + struct ocfs2_dlm_debug *osb_dlm_debug; + + struct dentry *osb_debug_root; + + wait_queue_head_t recovery_event; + + spinlock_t vote_task_lock; + struct task_struct *vote_task; + wait_queue_head_t vote_event; + unsigned long vote_wake_sequence; + unsigned long vote_work_sequence; + + struct list_head blocked_lock_list; + unsigned long blocked_lock_count; + + struct list_head vote_list; + int vote_count; + + u32 net_key; + spinlock_t net_response_lock; + unsigned int net_response_ids; + struct list_head net_response_list; + + struct o2hb_callback_func osb_hb_up; + struct o2hb_callback_func osb_hb_down; + + struct list_head osb_net_handlers; + + wait_queue_head_t osb_mount_event; + + /* Truncate log info */ + struct inode *osb_tl_inode; + struct buffer_head *osb_tl_bh; + struct work_struct osb_truncate_log_wq; +}; + +#define OCFS2_SB(sb) ((struct ocfs2_super 
*)(sb)->s_fs_info) +#define OCFS2_MAX_OSB_ID 65536 + +static inline int ocfs2_should_order_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) + return 0; + return 1; +} + +/* set / clear functions because cluster events can make these happen + * in parallel so we want the transitions to be atomic. this also + * means that any future flags osb_flags must be protected by spinlock + * too! */ +static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, + unsigned long flag) +{ + spin_lock(&osb->osb_lock); + osb->osb_flags |= flag; + spin_unlock(&osb->osb_lock); +} + +static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, + int hard) +{ + spin_lock(&osb->osb_lock); + osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO); + if (hard) + osb->osb_flags |= OCFS2_OSB_HARD_RO; + else + osb->osb_flags |= OCFS2_OSB_SOFT_RO; + spin_unlock(&osb->osb_lock); +} + +static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb) +{ + int ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & OCFS2_OSB_HARD_RO; + spin_unlock(&osb->osb_lock); + + return ret; +} + +static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) +{ + int ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & OCFS2_OSB_SOFT_RO; + spin_unlock(&osb->osb_lock); + + return ret; +} + +#define OCFS2_IS_VALID_DINODE(ptr) \ + (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) + +/* Report a dinode whose signature check failed through ocfs2_error(). + * The expansion ends without a ';' so that "MACRO(sb, di);" is exactly + * one statement and stays safe inside an unbraced if/else. */ +#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di) do { \ + typeof(__di) ____di = (__di); \ + ocfs2_error((__sb), \ + "Dinode # %"MLFu64" has bad signature %.*s", \ + (____di)->i_blkno, 7, \ + (____di)->i_signature); \ +} while (0) + +#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ + (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) + +/* Same as OCFS2_RO_ON_INVALID_DINODE, for a bad extent block signature. */ +#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb) do { \ + typeof(__eb) ____eb = (__eb); \ + ocfs2_error((__sb), \ + "Extent Block # %"MLFu64" has bad signature %.*s", \ + 
(____eb)->h_blkno, 7, \ + (____eb)->h_signature); \ +} while (0) + +#define OCFS2_IS_VALID_GROUP_DESC(ptr) \ + (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) + +/* Same as OCFS2_RO_ON_INVALID_DINODE, for a bad group descriptor signature. */ +#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd) do { \ + typeof(__gd) ____gd = (__gd); \ + ocfs2_error((__sb), \ + "Group Descriptor # %"MLFu64" has bad signature %.*s", \ + (____gd)->bg_blkno, 7, \ + (____gd)->bg_signature); \ +} while (0) + +static inline unsigned long ino_from_blkno(struct super_block *sb, + u64 blkno) +{ + return (unsigned long)(blkno & (u64)ULONG_MAX); +} + +static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb, + u32 clusters) +{ + int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits - + sb->s_blocksize_bits; + + return (u64)clusters << c_to_b_bits; +} + +static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, + u64 blocks) +{ + int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits - + sb->s_blocksize_bits; + + return (u32)(blocks >> b_to_c_bits); +} + +static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + bytes += OCFS2_SB(sb)->s_clustersize - 1; + /* OCFS2 just cannot have enough clusters to overflow this */ + clusters = (unsigned int)(bytes >> cl_bits); + + return clusters; +} + +static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, + u64 bytes) +{ + bytes += sb->s_blocksize - 1; + return bytes >> sb->s_blocksize_bits; +} + +static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb, + u32 clusters) +{ + return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; +} + +static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + clusters = ocfs2_clusters_for_bytes(sb, bytes); + return (u64)clusters << cl_bits; +} + +static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb, + u64 bytes) +{ + u64 blocks; 
+ + blocks = ocfs2_blocks_for_bytes(sb, bytes); + return blocks << sb->s_blocksize_bits; +} + +static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes) +{ + return (unsigned long)((bytes + 511) >> 9); +} + +#define ocfs2_set_bit ext2_set_bit +#define ocfs2_clear_bit ext2_clear_bit +#define ocfs2_test_bit ext2_test_bit +#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit +#endif /* OCFS2_H */ + diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h new file mode 100644 index 00000000000..dfb8a5bedfc --- /dev/null +++ b/fs/ocfs2/ocfs2_fs.h @@ -0,0 +1,638 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_fs.h + * + * On-disk structures for OCFS2. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _OCFS2_FS_H +#define _OCFS2_FS_H + +/* Version */ +#define OCFS2_MAJOR_REV_LEVEL 0 +#define OCFS2_MINOR_REV_LEVEL 90 + +/* + * An OCFS2 volume starts this way: + * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS. + * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS. + * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock. + * + * All other structures are found from the superblock information. + * + * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors. 
eg, for a + * blocksize of 2K, it is 4096 bytes into disk. + */ +#define OCFS2_SUPER_BLOCK_BLKNO 2 + +/* + * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could + * grow if needed. + */ +#define OCFS2_MIN_CLUSTERSIZE 4096 +#define OCFS2_MAX_CLUSTERSIZE 1048576 + +/* + * Blocks cannot be bigger than clusters, so the maximum blocksize is the + * minimum cluster size. + */ +#define OCFS2_MIN_BLOCKSIZE 512 +#define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE + +/* Filesystem magic number */ +#define OCFS2_SUPER_MAGIC 0x7461636f + +/* Object signatures */ +#define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2" +#define OCFS2_INODE_SIGNATURE "INODE01" +#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" +#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" + +/* Compatibility flags */ +#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ + ( OCFS2_SB(sb)->s_feature_compat & (mask) ) +#define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ( OCFS2_SB(sb)->s_feature_ro_compat & (mask) ) +#define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask) \ + ( OCFS2_SB(sb)->s_feature_incompat & (mask) ) +#define OCFS2_SET_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_compat |= (mask) +#define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_ro_compat |= (mask) +#define OCFS2_SET_INCOMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_incompat |= (mask) +#define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_compat &= ~(mask) +#define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask) +#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_incompat &= ~(mask) + +#define OCFS2_FEATURE_COMPAT_SUPP 0 +#define OCFS2_FEATURE_INCOMPAT_SUPP 0 +#define OCFS2_FEATURE_RO_COMPAT_SUPP 0 + +/* + * Heartbeat-only devices are missing journals and other files. The + * filesystem driver can't load them, but the library can. Never put + * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*. 
+ */ +#define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002 + + +/* + * Flags on ocfs2_dinode.i_flags + */ +#define OCFS2_VALID_FL (0x00000001) /* Inode is valid */ +#define OCFS2_UNUSED2_FL (0x00000002) +#define OCFS2_ORPHANED_FL (0x00000004) /* On the orphan list */ +#define OCFS2_UNUSED3_FL (0x00000008) +/* System inode flags */ +#define OCFS2_SYSTEM_FL (0x00000010) /* System inode */ +#define OCFS2_SUPER_BLOCK_FL (0x00000020) /* Super block */ +#define OCFS2_LOCAL_ALLOC_FL (0x00000040) /* Slot local alloc bitmap */ +#define OCFS2_BITMAP_FL (0x00000080) /* Allocation bitmap */ +#define OCFS2_JOURNAL_FL (0x00000100) /* Slot local journal */ +#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ +#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ +#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ + +/* + * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) + */ +#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ + +/* + * superblock s_state flags + */ +#define OCFS2_ERROR_FS (0x00000001) /* FS saw errors */ + +/* Limit of space in ocfs2_dir_entry */ +#define OCFS2_MAX_FILENAME_LEN 255 + +/* Maximum slots on an ocfs2 file system */ +#define OCFS2_MAX_SLOTS 255 + +/* Slot map indicator for an empty slot */ +#define OCFS2_INVALID_SLOT -1 + +#define OCFS2_VOL_UUID_LEN 16 +#define OCFS2_MAX_VOL_LABEL_LEN 64 + +/* Journal limits (in bytes) */ +#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) +#define OCFS2_MAX_JOURNAL_SIZE (500 * 1024 * 1024) + +struct ocfs2_system_inode_info { + char *si_name; + int si_iflags; + int si_mode; +}; + +/* System file index */ +enum { + BAD_BLOCK_SYSTEM_INODE = 0, + GLOBAL_INODE_ALLOC_SYSTEM_INODE, + SLOT_MAP_SYSTEM_INODE, +#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE + HEARTBEAT_SYSTEM_INODE, + GLOBAL_BITMAP_SYSTEM_INODE, +#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE + ORPHAN_DIR_SYSTEM_INODE, + EXTENT_ALLOC_SYSTEM_INODE, + INODE_ALLOC_SYSTEM_INODE, + 
JOURNAL_SYSTEM_INODE, + LOCAL_ALLOC_SYSTEM_INODE, + TRUNCATE_LOG_SYSTEM_INODE, + NUM_SYSTEM_INODES +}; + +static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { + /* Global system inodes (single copy) */ + /* The first two are only used from userspace mkfs/tunefs */ + [BAD_BLOCK_SYSTEM_INODE] = { "bad_blocks", 0, S_IFREG | 0644 }, + [GLOBAL_INODE_ALLOC_SYSTEM_INODE] = { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, + + /* These are used by the running filesystem */ + [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, + [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, + [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, + + /* Slot-specific system inodes (one copy per slot) */ + [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, + [EXTENT_ALLOC_SYSTEM_INODE] = { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, + [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, + [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, + [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, + [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 } +}; + +/* Parameter passed from mount.ocfs2 to module */ +#define OCFS2_HB_NONE "heartbeat=none" +#define OCFS2_HB_LOCAL "heartbeat=local" + +/* + * OCFS2 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. 
+ */ +#define OCFS2_FT_UNKNOWN 0 +#define OCFS2_FT_REG_FILE 1 +#define OCFS2_FT_DIR 2 +#define OCFS2_FT_CHRDEV 3 +#define OCFS2_FT_BLKDEV 4 +#define OCFS2_FT_FIFO 5 +#define OCFS2_FT_SOCK 6 +#define OCFS2_FT_SYMLINK 7 + +#define OCFS2_FT_MAX 8 + +/* + * OCFS2_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define OCFS2_DIR_PAD 4 +#define OCFS2_DIR_ROUND (OCFS2_DIR_PAD - 1) +#define OCFS2_DIR_MEMBER_LEN offsetof(struct ocfs2_dir_entry, name) +#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ + OCFS2_DIR_ROUND) & \ + ~OCFS2_DIR_ROUND) + +#define OCFS2_LINK_MAX 32000 + +#define S_SHIFT 12 +static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR, + [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK, + [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK, +}; + + +/* + * Convenience casts + */ +#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) + +/* + * On disk extent record for OCFS2 + * It describes a range of clusters on disk. + */ +struct ocfs2_extent_rec { +/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ + __le32 e_clusters; /* Clusters covered by this extent */ + __le64 e_blkno; /* Physical disk offset, in blocks */ +/*10*/ +}; + +struct ocfs2_chain_rec { + __le32 c_free; /* Number of free bits in this chain. */ + __le32 c_total; /* Number of total bits in this chain */ + __le64 c_blkno; /* Physical disk offset (blocks) of 1st group */ +}; + +struct ocfs2_truncate_rec { + __le32 t_start; /* 1st cluster in this log */ + __le32 t_clusters; /* Number of total clusters covered */ +}; + +/* + * On disk extent list for OCFS2 (node in the tree). 
Note that this + * is contained inside ocfs2_dinode or ocfs2_extent_block, so the + * offsets are relative to ocfs2_dinode.id2.i_list or + * ocfs2_extent_block.h_list, respectively. + */ +struct ocfs2_extent_list { +/*00*/ __le16 l_tree_depth; /* Extent tree depth from this + point. 0 means data extents + hang directly off this + header (a leaf) */ + __le16 l_count; /* Number of extent records */ + __le16 l_next_free_rec; /* Next unused extent slot */ + __le16 l_reserved1; + __le64 l_reserved2; /* Pad to + sizeof(ocfs2_extent_rec) */ +/*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */ +}; + +/* + * On disk allocation chain list for OCFS2. Note that this is + * contained inside ocfs2_dinode, so the offsets are relative to + * ocfs2_dinode.id2.i_chain. + */ +struct ocfs2_chain_list { +/*00*/ __le16 cl_cpg; /* Clusters per Block Group */ + __le16 cl_bpc; /* Bits per cluster */ + __le16 cl_count; /* Total chains in this list */ + __le16 cl_next_free_rec; /* Next unused chain slot */ + __le64 cl_reserved1; +/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */ +}; + +/* + * On disk deallocation log for OCFS2. Note that this is + * contained inside ocfs2_dinode, so the offsets are relative to + * ocfs2_dinode.id2.i_dealloc. 
+ */ +struct ocfs2_truncate_log { +/*00*/ __le16 tl_count; /* Total records in this log */ + __le16 tl_used; /* Number of records in use */ + __le32 tl_reserved1; +/*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */ +}; + +/* + * On disk extent block (indirect block) for OCFS2 + */ +struct ocfs2_extent_block +{ +/*00*/ __u8 h_signature[8]; /* Signature for verification */ + __le64 h_reserved1; +/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this + extent_header belongs to */ + __le16 h_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 h_fs_generation; /* Must match super block */ + __le64 h_blkno; /* Offset on disk, in blocks */ +/*20*/ __le64 h_reserved3; + __le64 h_next_leaf_blk; /* Offset on disk, in blocks, + of next leaf header pointing + to data */ +/*30*/ struct ocfs2_extent_list h_list; /* Extent record list */ +/* Actual on-disk size is one block */ +}; + +/* + * On disk superblock for OCFS2 + * Note that it is contained inside an ocfs2_dinode, so all offsets + * are relative to the start of ocfs2_dinode.id2. 
+ */ +struct ocfs2_super_block { +/*00*/ __le16 s_major_rev_level; + __le16 s_minor_rev_level; + __le16 s_mnt_count; + __le16 s_max_mnt_count; + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le32 s_checkinterval; /* Max time between checks */ +/*10*/ __le64 s_lastcheck; /* Time of last check */ + __le32 s_creator_os; /* OS */ + __le32 s_feature_compat; /* Compatible feature set */ +/*20*/ __le32 s_feature_incompat; /* Incompatible feature set */ + __le32 s_feature_ro_compat; /* Readonly-compatible feature set */ + __le64 s_root_blkno; /* Offset, in blocks, of root directory + dinode */ +/*30*/ __le64 s_system_dir_blkno; /* Offset, in blocks, of system + directory dinode */ + __le32 s_blocksize_bits; /* Blocksize for this fs */ + __le32 s_clustersize_bits; /* Clustersize for this fs */ +/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts + before tunefs required */ + __le16 s_reserved1; + __le32 s_reserved2; + __le64 s_first_cluster_group; /* Block offset of 1st cluster + * group header */ +/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ +/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ +/*A0*/ +}; + +/* + * Local allocation bitmap for OCFS2 slots + * Note that it exists inside an ocfs2_dinode, so all offsets are + * relative to the start of ocfs2_dinode.id2. 
+ */ +struct ocfs2_local_alloc +{ +/*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */ + __le16 la_size; /* Size of included bitmap, in bytes */ + __le16 la_reserved1; + __le64 la_reserved2; +/*10*/ __u8 la_bitmap[0]; +}; + +/* + * On disk inode for OCFS2 + */ +struct ocfs2_dinode { +/*00*/ __u8 i_signature[8]; /* Signature for validation */ + __le32 i_generation; /* Generation number */ + __le16 i_suballoc_slot; /* Slot suballocator this inode + belongs to */ + __le16 i_suballoc_bit; /* Bit offset in suballocator + block group */ +/*10*/ __le32 i_reserved0; + __le32 i_clusters; /* Cluster count */ + __le32 i_uid; /* Owner UID */ + __le32 i_gid; /* Owning GID */ +/*20*/ __le64 i_size; /* Size in bytes */ + __le16 i_mode; /* File mode */ + __le16 i_links_count; /* Links count */ + __le32 i_flags; /* File flags */ +/*30*/ __le64 i_atime; /* Access time */ + __le64 i_ctime; /* Creation time */ +/*40*/ __le64 i_mtime; /* Modification time */ + __le64 i_dtime; /* Deletion time */ +/*50*/ __le64 i_blkno; /* Offset on disk, in blocks */ + __le64 i_last_eb_blk; /* Pointer to last extent + block */ +/*60*/ __le32 i_fs_generation; /* Generation per fs-instance */ + __le32 i_atime_nsec; + __le32 i_ctime_nsec; + __le32 i_mtime_nsec; +/*70*/ __le64 i_reserved1[9]; +/*B8*/ union { + __le64 i_pad1; /* Generic way to refer to this + 64bit union */ + struct { + __le64 i_rdev; /* Device number */ + } dev1; + struct { /* Info for bitmap system + inodes */ + __le32 i_used; /* Bits (ie, clusters) used */ + __le32 i_total; /* Total bits (clusters) + available */ + } bitmap1; + struct { /* Info for journal system + inodes */ + __le32 ij_flags; /* Mounted, version, etc. 
*/ + __le32 ij_pad; + } journal1; + } id1; /* Inode type dependant 1 */ +/*C0*/ union { + struct ocfs2_super_block i_super; + struct ocfs2_local_alloc i_lab; + struct ocfs2_chain_list i_chain; + struct ocfs2_extent_list i_list; + struct ocfs2_truncate_log i_dealloc; + __u8 i_symlink[0]; + } id2; +/* Actual on-disk size is one block */ +}; + +/* + * On-disk directory entry structure for OCFS2 + * + * Packed as this structure could be accessed unaligned on 64-bit platforms + */ +struct ocfs2_dir_entry { +/*00*/ __le64 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; +/*0C*/ char name[OCFS2_MAX_FILENAME_LEN]; /* File name */ +/* Actual on-disk length specified by rec_len */ +} __attribute__ ((packed)); + +/* + * On disk allocator group structure for OCFS2 + */ +struct ocfs2_group_desc +{ +/*00*/ __u8 bg_signature[8]; /* Signature for validation */ + __le16 bg_size; /* Size of included bitmap in + bytes. */ + __le16 bg_bits; /* Bits represented by this + group. */ + __le16 bg_free_bits_count; /* Free bits count */ + __le16 bg_chain; /* What chain I am in. 
*/ +/*10*/ __le32 bg_generation; + __le32 bg_reserved1; + __le64 bg_next_group; /* Next group in my list, in + blocks */ +/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in + blocks */ + __le64 bg_blkno; /* Offset on disk, in blocks */ +/*30*/ __le64 bg_reserved2[2]; +/*40*/ __u8 bg_bitmap[0]; +}; + +#ifdef __KERNEL__ +static inline int ocfs2_fast_symlink_chars(struct super_block *sb) +{ + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_symlink); +} + +static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); + + return size / sizeof(struct ocfs2_chain_rec); +} + +static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_extent_block, h_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline u16 ocfs2_local_alloc_size(struct super_block *sb) +{ + u16 size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); + + return size; +} + +static inline int ocfs2_group_bitmap_size(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_group_desc, bg_bitmap); + + return size; +} + +static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); + + return size / sizeof(struct ocfs2_truncate_rec); +} +#else +static inline int ocfs2_fast_symlink_chars(int blocksize) +{ + return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); +} + +static inline int ocfs2_extent_recs_per_inode(int blocksize) +{ + int size; + + size = blocksize - + 
offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_chain_recs_per_inode(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); + + return size / sizeof(struct ocfs2_chain_rec); +} + +static inline int ocfs2_extent_recs_per_eb(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_extent_block, h_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_local_alloc_size(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); + + return size; +} + +static inline int ocfs2_group_bitmap_size(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_group_desc, bg_bitmap); + + return size; +} + +static inline int ocfs2_truncate_recs_per_inode(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); + + return size / sizeof(struct ocfs2_truncate_rec); +} +#endif /* __KERNEL__ */ + + +static inline int ocfs2_system_inode_is_global(int type) +{ + return ((type >= 0) && + (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)); +} + +static inline int ocfs2_sprintf_system_inode_name(char *buf, int len, + int type, int slot) +{ + int chars; + + /* + * Global system inodes can only have one copy. Everything + * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode + * list has a copy per slot. 
+ */ + /* Global names are plain text: print them through "%s" so the + * table entry is never interpreted as a format string. Per-slot + * names are themselves the format and consume the slot number. */ + if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE) + chars = snprintf(buf, len, "%s", + ocfs2_system_inodes[type].si_name); + else + chars = snprintf(buf, len, + ocfs2_system_inodes[type].si_name, + slot); + + return chars; +} + +static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de, + umode_t mode) +{ + de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +#endif /* _OCFS2_FS_H */ + diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h new file mode 100644 index 00000000000..7dd9e1e705b --- /dev/null +++ b/fs/ocfs2/ocfs2_lockid.h @@ -0,0 +1,73 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_lockid.h + * + * Defines OCFS2 lockid bits. + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_LOCKID_H +#define OCFS2_LOCKID_H + +/* lock ids are made up in the following manner: + * name[0] --> type + * name[1-6] --> 6 pad characters, reserved for now + * name[7-22] --> block number, expressed in hex as 16 chars + * name[23-30] --> i_generation, expressed in hex 8 chars + * name[31] --> '\0' */ +#define OCFS2_LOCK_ID_MAX_LEN 32 +#define OCFS2_LOCK_ID_PAD "000000" + +enum ocfs2_lock_type { + OCFS2_LOCK_TYPE_META = 0, + OCFS2_LOCK_TYPE_DATA, + OCFS2_LOCK_TYPE_SUPER, + OCFS2_LOCK_TYPE_RENAME, + OCFS2_LOCK_TYPE_RW, + OCFS2_NUM_LOCK_TYPES +}; + +static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) +{ + char c; + switch (type) { + case OCFS2_LOCK_TYPE_META: + c = 'M'; + break; + case OCFS2_LOCK_TYPE_DATA: + c = 'D'; + break; + case OCFS2_LOCK_TYPE_SUPER: + c = 'S'; + break; + case OCFS2_LOCK_TYPE_RENAME: + c = 'R'; + break; + case OCFS2_LOCK_TYPE_RW: + c = 'W'; + break; + default: + c = '\0'; + } + + return c; +} + +#endif /* OCFS2_LOCKID_H */ diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c new file mode 100644 index 00000000000..871627961d6 --- /dev/null +++ b/fs/ocfs2/slot_map.c @@ -0,0 +1,303 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * slot_map.c + * + * + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/smp_lock.h> + +#define MLOG_MASK_PREFIX ML_SUPER +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "dlmglue.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "slot_map.h" +#include "super.h" +#include "sysfile.h" + +#include "buffer_head_io.h" + +static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + s16 global); +static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, + s16 slot_num, + s16 node_num); + +/* Use the slot information we've collected to create a map of mounted + * nodes. Should be holding an EX on super block. assumes slot info is + * up to date. Note that we call this *after* we find a slot, so our + * own node should be set in the map too... */ +void ocfs2_populate_mounted_map(struct ocfs2_super *osb) +{ + int i; + struct ocfs2_slot_info *si = osb->slot_info; + + spin_lock(&si->si_lock); + + for (i = 0; i < si->si_size; i++) + if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT) + ocfs2_node_map_set_bit(osb, &osb->mounted_map, + si->si_global_node_nums[i]); + + spin_unlock(&si->si_lock); +} + +/* post the slot information on disk into our slot_info struct. */ +void ocfs2_update_slot_info(struct ocfs2_slot_info *si) +{ + int i; + __le16 *disk_info; + + /* we don't read the slot block here as ocfs2_super_lock + * should've made sure we have the most recent copy. */ + spin_lock(&si->si_lock); + disk_info = (__le16 *) si->si_bh->b_data; + + for (i = 0; i < si->si_size; i++) + si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]); + + spin_unlock(&si->si_lock); +} + +/* post our slot info into its destination bh and write it + * out. 
*/ +int ocfs2_update_disk_slots(struct ocfs2_super *osb, + struct ocfs2_slot_info *si) +{ + int status, i; + __le16 *disk_info = (__le16 *) si->si_bh->b_data; + + spin_lock(&si->si_lock); + for (i = 0; i < si->si_size; i++) + disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]); + spin_unlock(&si->si_lock); + + status = ocfs2_write_block(osb, si->si_bh, si->si_inode); + if (status < 0) + mlog_errno(status); + + return status; +} + +/* try to find global node in the slot info. Returns + * OCFS2_INVALID_SLOT if nothing is found. */ +static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + s16 global) +{ + int i; + s16 ret = OCFS2_INVALID_SLOT; + + for(i = 0; i < si->si_num_slots; i++) { + if (global == si->si_global_node_nums[i]) { + ret = (s16) i; + break; + } + } + return ret; +} + +static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si) +{ + int i; + s16 ret = OCFS2_INVALID_SLOT; + + for(i = 0; i < si->si_num_slots; i++) { + if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { + ret = (s16) i; + break; + } + } + return ret; +} + +s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + s16 global) +{ + s16 ret; + + spin_lock(&si->si_lock); + ret = __ocfs2_node_num_to_slot(si, global); + spin_unlock(&si->si_lock); + return ret; +} + +static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, + s16 slot_num, + s16 node_num) +{ + BUG_ON(slot_num == OCFS2_INVALID_SLOT); + BUG_ON(slot_num >= si->si_num_slots); + BUG_ON((node_num != O2NM_INVALID_NODE_NUM) && + (node_num >= O2NM_MAX_NODES)); + + si->si_global_node_nums[slot_num] = node_num; +} + +void ocfs2_clear_slot(struct ocfs2_slot_info *si, + s16 slot_num) +{ + spin_lock(&si->si_lock); + __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT); + spin_unlock(&si->si_lock); +} + +int ocfs2_init_slot_info(struct ocfs2_super *osb) +{ + int status, i; + u64 blkno; + struct inode *inode = NULL; + struct buffer_head *bh = NULL; + struct ocfs2_slot_info *si; + + si = kcalloc(1, sizeof(struct 
ocfs2_slot_info), GFP_KERNEL); + if (!si) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + spin_lock_init(&si->si_lock); + si->si_num_slots = osb->max_slots; + si->si_size = OCFS2_MAX_SLOTS; + + for(i = 0; i < si->si_num_slots; i++) + si->si_global_node_nums[i] = OCFS2_INVALID_SLOT; + + inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_read_block(osb, blkno, &bh, 0, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + si->si_inode = inode; + si->si_bh = bh; + osb->slot_info = si; +bail: + if (status < 0 && si) + ocfs2_free_slot_info(si); + + return status; +} + +void ocfs2_free_slot_info(struct ocfs2_slot_info *si) +{ + if (si->si_inode) + iput(si->si_inode); + if (si->si_bh) + brelse(si->si_bh); + kfree(si); +} + +int ocfs2_find_slot(struct ocfs2_super *osb) +{ + int status; + s16 slot; + struct ocfs2_slot_info *si; + + mlog_entry_void(); + + si = osb->slot_info; + + ocfs2_update_slot_info(si); + + spin_lock(&si->si_lock); + /* search for ourselves first and take the slot if it already + * exists. Perhaps we need to mark this in a variable for our + * own journal recovery? Possibly not, though we certainly + * need to warn to the user */ + slot = __ocfs2_node_num_to_slot(si, osb->node_num); + if (slot == OCFS2_INVALID_SLOT) { + /* if no slot yet, then just take 1st available + * one. 
*/ + slot = __ocfs2_find_empty_slot(si); + if (slot == OCFS2_INVALID_SLOT) { + spin_unlock(&si->si_lock); + mlog(ML_ERROR, "no free slots available!\n"); + status = -EINVAL; + goto bail; + } + } else + mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", + slot); + + __ocfs2_fill_slot(si, slot, osb->node_num); + osb->slot_num = slot; + spin_unlock(&si->si_lock); + + mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num); + + status = ocfs2_update_disk_slots(osb, si); + if (status < 0) + mlog_errno(status); + +bail: + mlog_exit(status); + return status; +} + +void ocfs2_put_slot(struct ocfs2_super *osb) +{ + int status; + struct ocfs2_slot_info *si = osb->slot_info; + + if (!si) + return; + + ocfs2_update_slot_info(si); + + spin_lock(&si->si_lock); + __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT); + osb->slot_num = OCFS2_INVALID_SLOT; + spin_unlock(&si->si_lock); + + status = ocfs2_update_disk_slots(osb, si); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + osb->slot_info = NULL; + ocfs2_free_slot_info(si); +} + diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h new file mode 100644 index 00000000000..d8c8ceed031 --- /dev/null +++ b/fs/ocfs2/slot_map.h @@ -0,0 +1,66 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * slotmap.h + * + * description here + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + + +#ifndef SLOTMAP_H +#define SLOTMAP_H + +struct ocfs2_slot_info { + spinlock_t si_lock; + + struct inode *si_inode; + struct buffer_head *si_bh; + unsigned int si_num_slots; + unsigned int si_size; + s16 si_global_node_nums[OCFS2_MAX_SLOTS]; +}; + +int ocfs2_init_slot_info(struct ocfs2_super *osb); +void ocfs2_free_slot_info(struct ocfs2_slot_info *si); + +int ocfs2_find_slot(struct ocfs2_super *osb); +void ocfs2_put_slot(struct ocfs2_super *osb); + +void ocfs2_update_slot_info(struct ocfs2_slot_info *si); +int ocfs2_update_disk_slots(struct ocfs2_super *osb, + struct ocfs2_slot_info *si); + +s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + s16 global); +void ocfs2_clear_slot(struct ocfs2_slot_info *si, + s16 slot_num); + +void ocfs2_populate_mounted_map(struct ocfs2_super *osb); + +static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, + int slot_num) +{ + BUG_ON(slot_num == OCFS2_INVALID_SLOT); + assert_spin_locked(&si->si_lock); + + return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT; +} + +#endif diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c new file mode 100644 index 00000000000..c46c164aefb --- /dev/null +++ b/fs/ocfs2/suballoc.c @@ -0,0 +1,1651 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * suballoc.c + * + * metadata alloc and free + * Inspired by ext3 block groups. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> + +#define MLOG_MASK_PREFIX ML_DISK_ALLOC +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" + +#include "buffer_head_io.h" + +static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); +static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); +static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); +static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct buffer_head *bg_bh, + u64 group_blkno, + u16 my_chain, + struct ocfs2_chain_list *cl); +static int ocfs2_block_group_alloc(struct ocfs2_super *osb, + struct inode *alloc_inode, + struct buffer_head *bh); + +static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac); + +static int ocfs2_cluster_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u16 *bit_off, u16 *bits_found); +static int ocfs2_block_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u16 *bit_off, u16 *bits_found); +static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 min_bits, + u16 *bit_off, + unsigned int *num_bits, + 
u64 *bg_blkno); +static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 min_bits, + u16 *bit_off, + unsigned int *num_bits, + u64 *bg_blkno); +static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, + int nr); +static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, + struct buffer_head *bg_bh, + unsigned int bits_wanted, + u16 *bit_off, + u16 *bits_found); +static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits); +static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits); + +static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct buffer_head *fe_bh, + struct buffer_head *bg_bh, + struct buffer_head *prev_bg_bh, + u16 chain); +static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, + u32 wanted); +static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count); +static inline u64 ocfs2_which_suballoc_group(u64 block, + unsigned int bit); +static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, + u64 bg_blkno, + u16 bg_bit_off); +static inline u64 ocfs2_which_cluster_group(struct inode *inode, + u32 cluster); +static inline void ocfs2_block_to_cluster_group(struct inode *inode, + u64 data_blkno, + u64 *bg_blkno, + u16 *bg_bit_off); + +void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) +{ + if (ac->ac_inode) + iput(ac->ac_inode); + if (ac->ac_bh) + brelse(ac->ac_bh); + kfree(ac); +} + +static u32 
ocfs2_bits_per_group(struct ocfs2_chain_list *cl) +{ + return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); +} + +static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct buffer_head *bg_bh, + u64 group_blkno, + u16 my_chain, + struct ocfs2_chain_list *cl) +{ + int status = 0; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + struct super_block * sb = alloc_inode->i_sb; + + mlog_entry_void(); + + if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { + ocfs2_error(alloc_inode->i_sb, "group block (%"MLFu64") " + "!= b_blocknr (%llu)", group_blkno, + (unsigned long long) bg_bh->b_blocknr); + status = -EIO; + goto bail; + } + + status = ocfs2_journal_access(handle, + alloc_inode, + bg_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bg, 0, sb->s_blocksize); + strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); + bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb)); + bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); + bg->bg_chain = cpu_to_le16(my_chain); + bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; + bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); + bg->bg_blkno = cpu_to_le64(group_blkno); + /* set the 1st bit in the bitmap to account for the descriptor block */ + ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); + bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); + + status = ocfs2_journal_dirty(handle, bg_bh); + if (status < 0) + mlog_errno(status); + + /* There is no need to zero out or otherwise initialize the + * other blocks in a group - All valid FS metadata in a block + * group stores the superblock fs_generation value at + * allocation time. 
*/ + +bail: + mlog_exit(status); + return status; +} + +static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) +{ + u16 curr, best; + + best = curr = 0; + while (curr < le16_to_cpu(cl->cl_count)) { + if (le32_to_cpu(cl->cl_recs[best].c_total) > + le32_to_cpu(cl->cl_recs[curr].c_total)) + best = curr; + curr++; + } + return best; +} + +/* + * We expect the block group allocator to already be locked. + */ +static int ocfs2_block_group_alloc(struct ocfs2_super *osb, + struct inode *alloc_inode, + struct buffer_head *bh) +{ + int status, credits; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; + struct ocfs2_chain_list *cl; + struct ocfs2_alloc_context *ac = NULL; + struct ocfs2_journal_handle *handle = NULL; + u32 bit_off, num_bits; + u16 alloc_rec; + u64 bg_blkno; + struct buffer_head *bg_bh = NULL; + struct ocfs2_group_desc *bg; + + BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode)); + + mlog_entry_void(); + + handle = ocfs2_alloc_handle(osb); + if (!handle) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + cl = &fe->id2.i_chain; + status = ocfs2_reserve_clusters(osb, + handle, + le16_to_cpu(cl->cl_cpg), + &ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + credits = ocfs2_calc_group_alloc_credits(osb->sb, + le16_to_cpu(cl->cl_cpg)); + handle = ocfs2_start_trans(osb, handle, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_claim_clusters(osb, + handle, + ac, + le16_to_cpu(cl->cl_cpg), + &bit_off, + &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + alloc_rec = ocfs2_find_smallest_chain(cl); + + /* setup the group */ + bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); + mlog(0, "new descriptor, record %u, at block %"MLFu64"\n", + alloc_rec, bg_blkno); + + bg_bh = sb_getblk(osb->sb, bg_blkno); + if (!bg_bh) { + status = -EIO; + 
mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh); + + status = ocfs2_block_group_fill(handle, + alloc_inode, + bg_bh, + bg_blkno, + alloc_rec, + cl); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + bg = (struct ocfs2_group_desc *) bg_bh->b_data; + + status = ocfs2_journal_access(handle, alloc_inode, + bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, + le16_to_cpu(bg->bg_free_bits_count)); + le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); + cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); + if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) + le16_add_cpu(&cl->cl_next_free_rec, 1); + + le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) - + le16_to_cpu(bg->bg_free_bits_count)); + le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); + le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); + + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + spin_lock(&OCFS2_I(alloc_inode)->ip_lock); + OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, + le32_to_cpu(fe->i_clusters))); + spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); + i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); + alloc_inode->i_blocks = + ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode)); + + status = 0; +bail: + if (handle) + ocfs2_commit_trans(handle); + + if (ac) + ocfs2_free_alloc_context(ac); + + if (bg_bh) + brelse(bg_bh); + + mlog_exit(status); + return status; +} + +static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + int status; + u32 bits_wanted = ac->ac_bits_wanted; + struct inode *alloc_inode = ac->ac_inode; + struct buffer_head *bh = NULL; + struct ocfs2_journal_handle *handle = 
ac->ac_handle; + struct ocfs2_dinode *fe; + u32 free_bits; + + mlog_entry_void(); + + BUG_ON(handle->flags & OCFS2_HANDLE_STARTED); + + ocfs2_handle_add_inode(handle, alloc_inode); + status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + fe = (struct ocfs2_dinode *) bh->b_data; + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); + status = -EIO; + goto bail; + } + if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { + ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator " + "# %"MLFu64, le64_to_cpu(fe->i_blkno)); + status = -EIO; + goto bail; + } + + free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) - + le32_to_cpu(fe->id1.bitmap1.i_used); + + if (bits_wanted > free_bits) { + /* cluster bitmap never grows */ + if (ocfs2_is_cluster_bitmap(alloc_inode)) { + mlog(0, "Disk Full: wanted=%u, free_bits=%u\n", + bits_wanted, free_bits); + status = -ENOSPC; + goto bail; + } + + status = ocfs2_block_group_alloc(osb, alloc_inode, bh); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + atomic_inc(&osb->alloc_stats.bg_extends); + + /* You should never ask for this much metadata */ + BUG_ON(bits_wanted > + (le32_to_cpu(fe->id1.bitmap1.i_total) + - le32_to_cpu(fe->id1.bitmap1.i_used))); + } + + get_bh(bh); + ac->ac_bh = bh; +bail: + if (bh) + brelse(bh); + + mlog_exit(status); + return status; +} + +int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_dinode *fe, + struct ocfs2_alloc_context **ac) +{ + int status; + struct inode *alloc_inode = NULL; + + *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); + (*ac)->ac_handle = handle; + (*ac)->ac_which = OCFS2_AC_USE_META; + +#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS + alloc_inode = 
ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + 0); +#else + alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + osb->slot_num); +#endif + if (!alloc_inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_inode = igrab(alloc_inode); + (*ac)->ac_group_search = ocfs2_block_group_search; + + status = ocfs2_reserve_suballoc_bits(osb, (*ac)); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + if (alloc_inode) + iput(alloc_inode); + + mlog_exit(status); + return status; +} + +int ocfs2_reserve_new_inode(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context **ac) +{ + int status; + struct inode *alloc_inode = NULL; + + *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_bits_wanted = 1; + (*ac)->ac_handle = handle; + (*ac)->ac_which = OCFS2_AC_USE_INODE; + + alloc_inode = ocfs2_get_system_file_inode(osb, + INODE_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!alloc_inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_inode = igrab(alloc_inode); + (*ac)->ac_group_search = ocfs2_block_group_search; + + status = ocfs2_reserve_suballoc_bits(osb, *ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + if (alloc_inode) + iput(alloc_inode); + + mlog_exit(status); + return status; +} + +/* local alloc code has to do the same thing, so rather than do this + * twice.. 
*/ +int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + int status; + + ac->ac_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!ac->ac_inode) { + status = -EINVAL; + mlog(ML_ERROR, "Could not get bitmap inode!\n"); + goto bail; + } + ac->ac_which = OCFS2_AC_USE_MAIN; + ac->ac_group_search = ocfs2_cluster_group_search; + + status = ocfs2_reserve_suballoc_bits(osb, ac); + if (status < 0 && status != -ENOSPC) + mlog_errno(status); +bail: + return status; +} + +/* Callers don't need to care which bitmap (local alloc or main) to + * use so we figure it out for them, but unfortunately this clutters + * things a bit. */ +int ocfs2_reserve_clusters(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + u32 bits_wanted, + struct ocfs2_alloc_context **ac) +{ + int status; + + mlog_entry_void(); + + BUG_ON(!handle); + + *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_bits_wanted = bits_wanted; + (*ac)->ac_handle = handle; + + status = -ENOSPC; + if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { + status = ocfs2_reserve_local_alloc_bits(osb, + handle, + bits_wanted, + *ac); + if ((status < 0) && (status != -ENOSPC)) { + mlog_errno(status); + goto bail; + } else if (status == -ENOSPC) { + /* reserve_local_bits will return enospc with + * the local alloc inode still locked, so we + * can change this safely here. 
*/ + mlog(0, "Disabling local alloc\n"); + /* We set to OCFS2_LA_DISABLED so that umount + * can clean up what's left of the local + * allocation */ + osb->local_alloc_state = OCFS2_LA_DISABLED; + } + } + + if (status == -ENOSPC) { + status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + } + + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + mlog_exit(status); + return status; +} + +/* + * More or less lifted from ext3. I'll leave their description below: + * + * "For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This + * prevents deletes from freeing up the page for reuse until we have + * committed the delete transaction. + * + * If we didn't do this, then deleting something and reallocating it as + * data would allow the old block to be overwritten before the + * transaction committed (because we force data to disk before commit). + * This would lead to corruption if we crashed between overwriting the + * data and committing the delete. + * + * @@@ We may want to make this allocation behaviour conditional on + * data-writes at some point, and disable it for metadata allocations or + * sync-data inodes." + * + * Note: OCFS2 already does this differently for metadata vs data + * allocations, as those bitmaps are seperate and undo access is never + * called on a metadata group descriptor. 
+ */ +static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, + int nr) +{ + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + + if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) + return 0; + if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data) + return 1; + + bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; + return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); +} + +static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, + struct buffer_head *bg_bh, + unsigned int bits_wanted, + u16 *bit_off, + u16 *bits_found) +{ + void *bitmap; + u16 best_offset, best_size; + int offset, start, found, status = 0; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + + if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg); + return -EIO; + } + + found = start = best_offset = best_size = 0; + bitmap = bg->bg_bitmap; + + while((offset = ocfs2_find_next_zero_bit(bitmap, + le16_to_cpu(bg->bg_bits), + start)) != -1) { + if (offset == le16_to_cpu(bg->bg_bits)) + break; + + if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { + /* We found a zero, but we can't use it as it + * hasn't been put to disk yet! 
*/ + found = 0; + start = offset + 1; + } else if (offset == start) { + /* we found a zero */ + found++; + /* move start to the next bit to test */ + start++; + } else { + /* got a zero after some ones */ + found = 1; + start = offset + 1; + } + if (found > best_size) { + best_size = found; + best_offset = start - found; + } + /* we got everything we needed */ + if (found == bits_wanted) { + /* mlog(0, "Found it all!\n"); */ + break; + } + } + + /* XXX: I think the first clause is equivalent to the second + * - jlbec */ + if (found == bits_wanted) { + *bit_off = start - found; + *bits_found = found; + } else if (best_size) { + *bit_off = best_offset; + *bits_found = best_size; + } else { + status = -ENOSPC; + /* No error log here -- see the comment above + * ocfs2_test_bg_bit_allocatable */ + } + + return status; +} + +static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits) +{ + int status; + void *bitmap = bg->bg_bitmap; + int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; + + mlog_entry_void(); + + if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); + status = -EIO; + goto bail; + } + BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); + + mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, + num_bits); + + if (ocfs2_is_cluster_bitmap(alloc_inode)) + journal_type = OCFS2_JOURNAL_ACCESS_UNDO; + + status = ocfs2_journal_access(handle, + alloc_inode, + group_bh, + journal_type); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + le16_add_cpu(&bg->bg_free_bits_count, -num_bits); + + while(num_bits--) + ocfs2_set_bit(bit_off++, bitmap); + + status = ocfs2_journal_dirty(handle, + group_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + mlog_exit(status); + return status; +} + +/* find the one with the most empty bits 
*/ +static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl) +{ + u16 curr, best; + + BUG_ON(!cl->cl_next_free_rec); + + best = curr = 0; + while (curr < le16_to_cpu(cl->cl_next_free_rec)) { + if (le32_to_cpu(cl->cl_recs[curr].c_free) > + le32_to_cpu(cl->cl_recs[best].c_free)) + best = curr; + curr++; + } + + BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec)); + return best; +} + +static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct buffer_head *fe_bh, + struct buffer_head *bg_bh, + struct buffer_head *prev_bg_bh, + u16 chain) +{ + int status; + /* there is a really tiny chance the journal calls could fail, + * but we wouldn't want inconsistent blocks in *any* case. */ + u64 fe_ptr, bg_ptr, prev_bg_ptr; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; + + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); + status = -EIO; + goto out; + } + if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); + status = -EIO; + goto out; + } + if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg); + status = -EIO; + goto out; + } + + mlog(0, "In suballoc %"MLFu64", chain %u, move group %"MLFu64" to " + "top, prev = %"MLFu64"\n", + fe->i_blkno, chain, bg->bg_blkno, prev_bg->bg_blkno); + + fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); + bg_ptr = le64_to_cpu(bg->bg_next_group); + prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); + + status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + prev_bg->bg_next_group = bg->bg_next_group; + + status = ocfs2_journal_dirty(handle, prev_bg_bh); + if 
(status < 0) { + mlog_errno(status); + goto out_rollback; + } + + status = ocfs2_journal_access(handle, alloc_inode, bg_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; + + status = ocfs2_journal_dirty(handle, bg_bh); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + status = ocfs2_journal_access(handle, alloc_inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + status = 0; +out_rollback: + if (status < 0) { + fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); + bg->bg_next_group = cpu_to_le64(bg_ptr); + prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); + } +out: + mlog_exit(status); + return status; +} + +static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, + u32 wanted) +{ + return le16_to_cpu(bg->bg_free_bits_count) > wanted; +} + +/* return 0 on success, -ENOSPC to keep searching and any other < 0 + * value on error. */ +static int ocfs2_cluster_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u16 *bit_off, u16 *bits_found) +{ + int search = -ENOSPC; + int ret; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; + u16 tmp_off, tmp_found; + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + if (bg->bg_free_bits_count) { + ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), + group_bh, bits_wanted, + &tmp_off, &tmp_found); + if (ret) + return ret; + + /* ocfs2_block_group_find_clear_bits() might + * return success, but we still want to return + * -ENOSPC unless it found the minimum number + * of bits. 
*/ + if (min_bits <= tmp_found) { + *bit_off = tmp_off; + *bits_found = tmp_found; + search = 0; /* success */ + } + } + + return search; +} + +static int ocfs2_block_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u16 *bit_off, u16 *bits_found) +{ + int ret = -ENOSPC; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; + + BUG_ON(min_bits != 1); + BUG_ON(ocfs2_is_cluster_bitmap(inode)); + + if (bg->bg_free_bits_count) + ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), + group_bh, bits_wanted, + bit_off, bits_found); + + return ret; +} + +static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 min_bits, + u16 *bit_off, + unsigned int *num_bits, + u64 *bg_blkno) +{ + int status; + u16 chain, tmp_bits; + u32 tmp_used; + u64 next_group; + struct ocfs2_journal_handle *handle = ac->ac_handle; + struct inode *alloc_inode = ac->ac_inode; + struct buffer_head *group_bh = NULL; + struct buffer_head *prev_group_bh = NULL; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; + struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; + struct ocfs2_group_desc *bg; + + chain = ac->ac_chain; + mlog(0, "trying to alloc %u bits from chain %u, inode %"MLFu64"\n", + bits_wanted, chain, OCFS2_I(alloc_inode)->ip_blkno); + + status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), + le64_to_cpu(cl->cl_recs[chain].c_blkno), + &group_bh, OCFS2_BH_CACHED, alloc_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + bg = (struct ocfs2_group_desc *) group_bh->b_data; + if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); + status = -EIO; + goto bail; + } + + status = -ENOSPC; + /* for now, the chain search is a bit simplistic. We just use + * the 1st group with any empty bits. 
*/ + while ((status = ac->ac_group_search(alloc_inode, group_bh, + bits_wanted, min_bits, bit_off, + &tmp_bits)) == -ENOSPC) { + if (!bg->bg_next_group) + break; + + if (prev_group_bh) { + brelse(prev_group_bh); + prev_group_bh = NULL; + } + next_group = le64_to_cpu(bg->bg_next_group); + prev_group_bh = group_bh; + group_bh = NULL; + status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), + next_group, &group_bh, + OCFS2_BH_CACHED, alloc_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + bg = (struct ocfs2_group_desc *) group_bh->b_data; + if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); + status = -EIO; + goto bail; + } + } + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + mlog(0, "alloc succeeds: we give %u bits from block group %"MLFu64"\n", + tmp_bits, bg->bg_blkno); + + *num_bits = tmp_bits; + + BUG_ON(*num_bits == 0); + + /* + * Keep track of previous block descriptor read. When + * we find a target, if we have read more than X + * number of descriptors, and the target is reasonably + * empty, relink him to top of his chain. + * + * We've read 0 extra blocks and only send one more to + * the transaction, yet the next guy to search has a + * much easier time. + * + * Do this *after* figuring out how many bits we're taking out + * of our target group. 
+ */ + if (ac->ac_allow_chain_relink && + (prev_group_bh) && + (ocfs2_block_group_reasonably_empty(bg, *num_bits))) { + status = ocfs2_relink_block_group(handle, alloc_inode, + ac->ac_bh, group_bh, + prev_group_bh, chain); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* Ok, claim our bits now: set the info on dinode, chainlist + * and then the group */ + status = ocfs2_journal_access(handle, + alloc_inode, + ac->ac_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); + fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used); + le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits)); + + status = ocfs2_journal_dirty(handle, + ac->ac_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_block_group_set_bits(handle, + alloc_inode, + bg, + group_bh, + *bit_off, + *num_bits); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + mlog(0, "Allocated %u bits from suballocator %"MLFu64"\n", + *num_bits, fe->i_blkno); + + *bg_blkno = le64_to_cpu(bg->bg_blkno); +bail: + if (group_bh) + brelse(group_bh); + if (prev_group_bh) + brelse(prev_group_bh); + + mlog_exit(status); + return status; +} + +/* will give out up to bits_wanted contiguous bits. 
*/ +static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 min_bits, + u16 *bit_off, + unsigned int *num_bits, + u64 *bg_blkno) +{ + int status; + u16 victim, i; + struct ocfs2_chain_list *cl; + struct ocfs2_dinode *fe; + + mlog_entry_void(); + + BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); + BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given)); + BUG_ON(!ac->ac_bh); + + fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe); + status = -EIO; + goto bail; + } + if (le32_to_cpu(fe->id1.bitmap1.i_used) >= + le32_to_cpu(fe->id1.bitmap1.i_total)) { + ocfs2_error(osb->sb, "Chain allocator dinode %"MLFu64" has %u" + "used bits but only %u total.", + le64_to_cpu(fe->i_blkno), + le32_to_cpu(fe->id1.bitmap1.i_used), + le32_to_cpu(fe->id1.bitmap1.i_total)); + status = -EIO; + goto bail; + } + + cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; + + victim = ocfs2_find_victim_chain(cl); + ac->ac_chain = victim; + ac->ac_allow_chain_relink = 1; + + status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off, + num_bits, bg_blkno); + if (!status) + goto bail; + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + + mlog(0, "Search of victim chain %u came up with nothing, " + "trying all chains now.\n", victim); + + /* If we didn't pick a good victim, then just default to + * searching each chain in order. Don't allow chain relinking + * because we only calculate enough journal credits for one + * relink per alloc. 
*/ + ac->ac_allow_chain_relink = 0; + for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { + if (i == victim) + continue; + if (!cl->cl_recs[i].c_free) + continue; + + ac->ac_chain = i; + status = ocfs2_search_chain(ac, bits_wanted, min_bits, + bit_off, num_bits, + bg_blkno); + if (!status) + break; + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + } +bail: + + mlog_exit(status); + return status; +} + +int ocfs2_claim_metadata(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u16 *suballoc_bit_start, + unsigned int *num_bits, + u64 *blkno_start) +{ + int status; + u64 bg_blkno; + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); + BUG_ON(ac->ac_which != OCFS2_AC_USE_META); + BUG_ON(ac->ac_handle != handle); + + status = ocfs2_claim_suballoc_bits(osb, + ac, + bits_wanted, + 1, + suballoc_bit_start, + num_bits, + &bg_blkno); + if (status < 0) { + mlog_errno(status); + goto bail; + } + atomic_inc(&osb->alloc_stats.bg_allocs); + + *blkno_start = bg_blkno + (u64) *suballoc_bit_start; + ac->ac_bits_given += (*num_bits); + status = 0; +bail: + mlog_exit(status); + return status; +} + +int ocfs2_claim_new_inode(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u16 *suballoc_bit, + u64 *fe_blkno) +{ + int status; + unsigned int num_bits; + u64 bg_blkno; + + mlog_entry_void(); + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_given != 0); + BUG_ON(ac->ac_bits_wanted != 1); + BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); + BUG_ON(ac->ac_handle != handle); + + status = ocfs2_claim_suballoc_bits(osb, + ac, + 1, + 1, + suballoc_bit, + &num_bits, + &bg_blkno); + if (status < 0) { + mlog_errno(status); + goto bail; + } + atomic_inc(&osb->alloc_stats.bg_allocs); + + BUG_ON(num_bits != 1); + + *fe_blkno = bg_blkno + (u64) (*suballoc_bit); + ac->ac_bits_given++; + status = 0; +bail: + mlog_exit(status); + return 
status; +} + +/* translate a group desc. blkno and its bitmap offset into + * disk cluster offset. */ +static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, + u64 bg_blkno, + u16 bg_bit_off) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 cluster = 0; + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + if (bg_blkno != osb->first_cluster_group_blkno) + cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno); + cluster += (u32) bg_bit_off; + return cluster; +} + +/* given a cluster offset, calculate which block group it belongs to + * and return that block offset. */ +static inline u64 ocfs2_which_cluster_group(struct inode *inode, + u32 cluster) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 group_no; + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + group_no = cluster / osb->bitmap_cpg; + if (!group_no) + return osb->first_cluster_group_blkno; + return ocfs2_clusters_to_blocks(inode->i_sb, + group_no * osb->bitmap_cpg); +} + +/* given the block number of a cluster start, calculate which cluster + * group and descriptor bitmap offset that corresponds to. */ +static inline void ocfs2_block_to_cluster_group(struct inode *inode, + u64 data_blkno, + u64 *bg_blkno, + u16 *bg_bit_off) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno); + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + *bg_blkno = ocfs2_which_cluster_group(inode, + data_cluster); + + if (*bg_blkno == osb->first_cluster_group_blkno) + *bg_bit_off = (u16) data_cluster; + else + *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb, + data_blkno - *bg_blkno); +} + +/* + * min_bits - minimum contiguous chunk from this total allocation we + * can handle. set to what we asked for originally for a full + * contig. allocation, set to '1' to indicate we can deal with extents + * of any size. 
+ */ +int ocfs2_claim_clusters(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u32 min_clusters, + u32 *cluster_start, + u32 *num_clusters) +{ + int status; + unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; + u64 bg_blkno; + u16 bg_bit_off; + + mlog_entry_void(); + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); + + BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL + && ac->ac_which != OCFS2_AC_USE_MAIN); + BUG_ON(ac->ac_handle != handle); + + if (ac->ac_which == OCFS2_AC_USE_LOCAL) { + status = ocfs2_claim_local_alloc_bits(osb, + handle, + ac, + bits_wanted, + cluster_start, + num_clusters); + if (!status) + atomic_inc(&osb->alloc_stats.local_data); + } else { + if (min_clusters > (osb->bitmap_cpg - 1)) { + /* The only paths asking for contiguousness + * should know about this already. */ + mlog(ML_ERROR, "minimum allocation requested exceeds " + "group bitmap size!"); + status = -ENOSPC; + goto bail; + } + /* clamp the current request down to a realistic size. 
*/ + if (bits_wanted > (osb->bitmap_cpg - 1)) + bits_wanted = osb->bitmap_cpg - 1; + + status = ocfs2_claim_suballoc_bits(osb, + ac, + bits_wanted, + min_clusters, + &bg_bit_off, + num_clusters, + &bg_blkno); + if (!status) { + *cluster_start = + ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, + bg_blkno, + bg_bit_off); + atomic_inc(&osb->alloc_stats.bitmap_data); + } + } + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + ac->ac_bits_given += *num_clusters; + +bail: + mlog_exit(status); + return status; +} + +static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits) +{ + int status; + unsigned int tmp; + int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; + struct ocfs2_group_desc *undo_bg = NULL; + + mlog_entry_void(); + + if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); + status = -EIO; + goto bail; + } + + mlog(0, "off = %u, num = %u\n", bit_off, num_bits); + + if (ocfs2_is_cluster_bitmap(alloc_inode)) + journal_type = OCFS2_JOURNAL_ACCESS_UNDO; + + status = ocfs2_journal_access(handle, alloc_inode, group_bh, + journal_type); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (ocfs2_is_cluster_bitmap(alloc_inode)) + undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data; + + tmp = num_bits; + while(tmp--) { + ocfs2_clear_bit((bit_off + tmp), + (unsigned long *) bg->bg_bitmap); + if (ocfs2_is_cluster_bitmap(alloc_inode)) + ocfs2_set_bit(bit_off + tmp, + (unsigned long *) undo_bg->bg_bitmap); + } + le16_add_cpu(&bg->bg_free_bits_count, num_bits); + + status = ocfs2_journal_dirty(handle, group_bh); + if (status < 0) + mlog_errno(status); +bail: + return status; +} + +/* + * expects the suballoc inode to already be locked. 
+ */ +static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count) +{ + int status = 0; + u32 tmp_used; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; + struct ocfs2_chain_list *cl = &fe->id2.i_chain; + struct buffer_head *group_bh = NULL; + struct ocfs2_group_desc *group; + + mlog_entry_void(); + + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); + status = -EIO; + goto bail; + } + BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); + + mlog(0, "suballocator %"MLFu64": freeing %u bits from group %"MLFu64 + ", starting at %u\n", + OCFS2_I(alloc_inode)->ip_blkno, count, bg_blkno, + start_bit); + + status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED, + alloc_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + group = (struct ocfs2_group_desc *) group_bh->b_data; + if (!OCFS2_IS_VALID_GROUP_DESC(group)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group); + status = -EIO; + goto bail; + } + BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); + + status = ocfs2_block_group_clear_bits(handle, alloc_inode, + group, group_bh, + start_bit, count); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_access(handle, alloc_inode, alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free, + count); + tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); + fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); + + status = ocfs2_journal_dirty(handle, alloc_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + if (group_bh) + brelse(group_bh); + + mlog_exit(status); + return status; +} + +static inline u64 
ocfs2_which_suballoc_group(u64 block, unsigned int bit) +{ + u64 group = block - (u64) bit; + + return group; +} + +int ocfs2_free_dinode(struct ocfs2_journal_handle *handle, + struct inode *inode_alloc_inode, + struct buffer_head *inode_alloc_bh, + struct ocfs2_dinode *di) +{ + u64 blk = le64_to_cpu(di->i_blkno); + u16 bit = le16_to_cpu(di->i_suballoc_bit); + u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, + inode_alloc_bh, bit, bg_blkno, 1); +} + +int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle, + struct inode *eb_alloc_inode, + struct buffer_head *eb_alloc_bh, + struct ocfs2_extent_block *eb) +{ + u64 blk = le64_to_cpu(eb->h_blkno); + u16 bit = le16_to_cpu(eb->h_suballoc_bit); + u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh, + bit, bg_blkno, 1); +} + +int ocfs2_free_clusters(struct ocfs2_journal_handle *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters) +{ + int status; + u16 bg_start_bit; + u64 bg_blkno; + struct ocfs2_dinode *fe; + + /* You can't ever have a contiguous set of clusters + * bigger than a block group bitmap so we never have to worry + * about looping on them. */ + + mlog_entry_void(); + + /* This is expensive. We can safely remove once this stuff has + * gotten tested really well. 
*/ + BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); + + fe = (struct ocfs2_dinode *) bitmap_bh->b_data; + + ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, + &bg_start_bit); + + mlog(0, "want to free %u clusters starting at block %"MLFu64"\n", + num_clusters, start_blk); + mlog(0, "bg_blkno = %"MLFu64", bg_start_bit = %u\n", + bg_blkno, bg_start_bit); + + status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, + bg_start_bit, bg_blkno, + num_clusters); + if (status < 0) + mlog_errno(status); + + mlog_exit(status); + return status; +} + +static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) +{ + printk("Block Group:\n"); + printk("bg_signature: %s\n", bg->bg_signature); + printk("bg_size: %u\n", bg->bg_size); + printk("bg_bits: %u\n", bg->bg_bits); + printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count); + printk("bg_chain: %u\n", bg->bg_chain); + printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation)); + printk("bg_next_group: %"MLFu64"\n", bg->bg_next_group); + printk("bg_parent_dinode: %"MLFu64"\n", bg->bg_parent_dinode); + printk("bg_blkno: %"MLFu64"\n", bg->bg_blkno); +} + +static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) +{ + int i; + + printk("Suballoc Inode %"MLFu64":\n", fe->i_blkno); + printk("i_signature: %s\n", fe->i_signature); + printk("i_size: %"MLFu64"\n", fe->i_size); + printk("i_clusters: %u\n", fe->i_clusters); + printk("i_generation: %u\n", + le32_to_cpu(fe->i_generation)); + printk("id1.bitmap1.i_used: %u\n", + le32_to_cpu(fe->id1.bitmap1.i_used)); + printk("id1.bitmap1.i_total: %u\n", + le32_to_cpu(fe->id1.bitmap1.i_total)); + printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg); + printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc); + printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count); + printk("id2.i_chain.cl_next_free_rec: %u\n", + fe->id2.i_chain.cl_next_free_rec); + 
for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) { + printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i, + fe->id2.i_chain.cl_recs[i].c_free); + printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i, + fe->id2.i_chain.cl_recs[i].c_total); + printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %"MLFu64"\n", i, + fe->id2.i_chain.cl_recs[i].c_blkno); + } +} diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h new file mode 100644 index 00000000000..a76c82a7cea --- /dev/null +++ b/fs/ocfs2/suballoc.h @@ -0,0 +1,132 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * suballoc.h + * + * Defines sub allocator api + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _CHAINALLOC_H_ +#define _CHAINALLOC_H_ + +typedef int (group_search_t)(struct inode *, + struct buffer_head *, + u32, + u32, + u16 *, + u16 *); + +struct ocfs2_alloc_context { + struct inode *ac_inode; /* which bitmap are we allocating from? 
*/ + struct buffer_head *ac_bh; /* file entry bh */ + u32 ac_bits_wanted; + u32 ac_bits_given; +#define OCFS2_AC_USE_LOCAL 1 +#define OCFS2_AC_USE_MAIN 2 +#define OCFS2_AC_USE_INODE 3 +#define OCFS2_AC_USE_META 4 + u32 ac_which; + struct ocfs2_journal_handle *ac_handle; + + /* these are used by the chain search */ + u16 ac_chain; + int ac_allow_chain_relink; + group_search_t *ac_group_search; +}; + +void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); +static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) +{ + return ac->ac_bits_wanted - ac->ac_bits_given; +} + +int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_dinode *fe, + struct ocfs2_alloc_context **ac); +int ocfs2_reserve_new_inode(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context **ac); +int ocfs2_reserve_clusters(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + u32 bits_wanted, + struct ocfs2_alloc_context **ac); + +int ocfs2_claim_metadata(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u16 *suballoc_bit_start, + u32 *num_bits, + u64 *blkno_start); +int ocfs2_claim_new_inode(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u16 *suballoc_bit, + u64 *fe_blkno); +int ocfs2_claim_clusters(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u32 min_clusters, + u32 *cluster_start, + u32 *num_clusters); + +int ocfs2_free_dinode(struct ocfs2_journal_handle *handle, + struct inode *inode_alloc_inode, + struct buffer_head *inode_alloc_bh, + struct ocfs2_dinode *di); +int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle, + struct inode *eb_alloc_inode, + struct buffer_head *eb_alloc_bh, + struct ocfs2_extent_block *eb); +int ocfs2_free_clusters(struct ocfs2_journal_handle *handle, + 
struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters); + +static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb, + u64 bg_blkno) +{ + /* This should work for all block group descriptors as only + * the 1st group descriptor of the cluster bitmap is + * different. */ + + if (bg_blkno == osb->first_cluster_group_blkno) + return 0; + + /* the rest of the block groups are located at the beginning + * of their 1st cluster, so a direct translation just + * works. */ + return ocfs2_blocks_to_clusters(osb->sb, bg_blkno); +} + +static inline int ocfs2_is_cluster_bitmap(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno; +} + +/* This is for local alloc ONLY. Others should use the task-specific + * apis above. */ +int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac); + +#endif /* _CHAINALLOC_H_ */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c new file mode 100644 index 00000000000..48bf7f0ce54 --- /dev/null +++ b/fs/ocfs2/super.c @@ -0,0 +1,1733 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * super.c + * + * load/unload driver, mount/dismount volumes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/utsname.h> +#include <linux/init.h> +#include <linux/random.h> +#include <linux/statfs.h> +#include <linux/moduleparam.h> +#include <linux/blkdev.h> +#include <linux/socket.h> +#include <linux/inet.h> +#include <linux/parser.h> +#include <linux/crc32.h> +#include <linux/debugfs.h> + +#include <cluster/nodemanager.h> + +#define MLOG_MASK_PREFIX ML_SUPER +#include <cluster/masklog.h> + +#include "ocfs2.h" + +/* this should be the only file to include a version 1 header */ +#include "ocfs1_fs_compat.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "export.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "namei.h" +#include "slot_map.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" +#include "ver.h" +#include "vote.h" + +#include "buffer_head_io.h" + +/* + * Globals + */ +static spinlock_t ocfs2_globals_lock = SPIN_LOCK_UNLOCKED; + +static u32 osb_id; /* Keeps track of next available OSB Id */ + +static kmem_cache_t *ocfs2_inode_cachep = NULL; + +kmem_cache_t *ocfs2_lock_cache = NULL; + +/* OCFS2 needs to schedule several different types of work which + * require cluster locking, disk I/O, recovery waits, etc. Since these + * types of work tend to be heavy we avoid using the kernel events + * workqueue and schedule on our own. 
*/ +struct workqueue_struct *ocfs2_wq = NULL; + +static struct dentry *ocfs2_debugfs_root = NULL; + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); + +static int ocfs2_parse_options(struct super_block *sb, char *options, + unsigned long *mount_opt, int is_remount); +static void ocfs2_put_super(struct super_block *sb); +static int ocfs2_mount_volume(struct super_block *sb); +static int ocfs2_remount(struct super_block *sb, int *flags, char *data); +static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err); +static int ocfs2_initialize_mem_caches(void); +static void ocfs2_free_mem_caches(void); +static void ocfs2_delete_osb(struct ocfs2_super *osb); + +static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf); + +static int ocfs2_sync_fs(struct super_block *sb, int wait); + +static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); +static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); +static int ocfs2_release_system_inodes(struct ocfs2_super *osb); +static int ocfs2_fill_local_node_info(struct ocfs2_super *osb); +static int ocfs2_check_volume(struct ocfs2_super *osb); +static int ocfs2_verify_volume(struct ocfs2_dinode *di, + struct buffer_head *bh, + u32 sectsize); +static int ocfs2_initialize_super(struct super_block *sb, + struct buffer_head *bh, + int sector_size); +static int ocfs2_get_sector(struct super_block *sb, + struct buffer_head **bh, + int block, + int sect_size); +static void ocfs2_write_super(struct super_block *sb); +static struct inode *ocfs2_alloc_inode(struct super_block *sb); +static void ocfs2_destroy_inode(struct inode *inode); + +static unsigned long long ocfs2_max_file_offset(unsigned int blockshift); + +static struct super_operations ocfs2_sops = { + .statfs = ocfs2_statfs, + .alloc_inode = ocfs2_alloc_inode, + .destroy_inode = ocfs2_destroy_inode, + .drop_inode = ocfs2_drop_inode, + .clear_inode = ocfs2_clear_inode, + .delete_inode = ocfs2_delete_inode, + .sync_fs = ocfs2_sync_fs, + 
.write_super = ocfs2_write_super, + .put_super = ocfs2_put_super, + .remount_fs = ocfs2_remount, +}; + +enum { + Opt_barrier, + Opt_err_panic, + Opt_err_ro, + Opt_intr, + Opt_nointr, + Opt_hb_none, + Opt_hb_local, + Opt_data_ordered, + Opt_data_writeback, + Opt_err, +}; + +static match_table_t tokens = { + {Opt_barrier, "barrier=%u"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_intr, "intr"}, + {Opt_nointr, "nointr"}, + {Opt_hb_none, OCFS2_HB_NONE}, + {Opt_hb_local, OCFS2_HB_LOCAL}, + {Opt_data_ordered, "data=ordered"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_err, NULL} +}; + +/* + * write_super and sync_fs ripped right out of ext3. + */ +static void ocfs2_write_super(struct super_block *sb) +{ + if (down_trylock(&sb->s_lock) == 0) + BUG(); + sb->s_dirt = 0; +} + +static int ocfs2_sync_fs(struct super_block *sb, int wait) +{ + int status = 0; + tid_t target; + struct ocfs2_super *osb = OCFS2_SB(sb); + + sb->s_dirt = 0; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (wait) { + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + } else { + ocfs2_schedule_truncate_log_flush(osb, 0); + } + + if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) { + if (wait) + log_wait_commit(OCFS2_SB(sb)->journal->j_journal, + target); + } + return 0; +} + +static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) +{ + struct inode *new = NULL; + int status = 0; + int i; + + mlog_entry_void(); + + new = ocfs2_iget(osb, osb->root_blkno); + if (IS_ERR(new)) { + status = PTR_ERR(new); + mlog_errno(status); + goto bail; + } + osb->root_inode = new; + + new = ocfs2_iget(osb, osb->system_dir_blkno); + if (IS_ERR(new)) { + status = PTR_ERR(new); + mlog_errno(status); + goto bail; + } + osb->sys_root_inode = new; + + for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; + i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { + new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); + if (!new) { + 
ocfs2_release_system_inodes(osb); + status = -EINVAL; + mlog_errno(status); + /* FIXME: Should ERROR_RO_FS */ + mlog(ML_ERROR, "Unable to load system inode %d, " + "possibly corrupt fs?", i); + goto bail; + } + // the array now has one ref, so drop this one + iput(new); + } + +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) +{ + struct inode *new = NULL; + int status = 0; + int i; + + mlog_entry_void(); + + for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; + i < NUM_SYSTEM_INODES; + i++) { + new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); + if (!new) { + ocfs2_release_system_inodes(osb); + status = -EINVAL; + mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", + status, i, osb->slot_num); + goto bail; + } + /* the array now has one ref, so drop this one */ + iput(new); + } + +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_release_system_inodes(struct ocfs2_super *osb) +{ + int status = 0, i; + struct inode *inode; + + mlog_entry_void(); + + for (i = 0; i < NUM_SYSTEM_INODES; i++) { + inode = osb->system_inodes[i]; + if (inode) { + iput(inode); + osb->system_inodes[i] = NULL; + } + } + + inode = osb->sys_root_inode; + if (inode) { + iput(inode); + osb->sys_root_inode = NULL; + } + + inode = osb->root_inode; + if (inode) { + iput(inode); + osb->root_inode = NULL; + } + + mlog_exit(status); + return status; +} + +/* We're allocating fs objects, use GFP_NOFS */ +static struct inode *ocfs2_alloc_inode(struct super_block *sb) +{ + struct ocfs2_inode_info *oi; + + oi = kmem_cache_alloc(ocfs2_inode_cachep, SLAB_NOFS); + if (!oi) + return NULL; + + return &oi->vfs_inode; +} + +static void ocfs2_destroy_inode(struct inode *inode) +{ + kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); +} + +/* From xfs_super.c:xfs_max_file_offset + * Copyright (c) 2000-2004 Silicon Graphics, Inc. 
+ */ +static unsigned long long ocfs2_max_file_offset(unsigned int blockshift) +{ + unsigned int pagefactor = 1; + unsigned int bitshift = BITS_PER_LONG - 1; + + /* Figure out maximum filesize, on Linux this can depend on + * the filesystem blocksize (on 32 bit platforms). + * __block_prepare_write does this in an [unsigned] long... + * page->index << (PAGE_CACHE_SHIFT - bbits) + * So, for page sized blocks (4K on 32 bit platforms), + * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is + * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) + * but for smaller blocksizes it is less (bbits = log2 bsize). + * Note1: get_block_t takes a long (implicit cast from above) + * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch + * can optionally convert the [unsigned] long from above into + * an [unsigned] long long. + */ + +#if BITS_PER_LONG == 32 +# if defined(CONFIG_LBD) + BUG_ON(sizeof(sector_t) != 8); + pagefactor = PAGE_CACHE_SIZE; + bitshift = BITS_PER_LONG; +# else + pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift); +# endif +#endif + + return (((unsigned long long)pagefactor) << bitshift) - 1; +} + +static int ocfs2_remount(struct super_block *sb, int *flags, char *data) +{ + int incompat_features; + int ret = 0; + unsigned long parsed_options; + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { + ret = -EINVAL; + goto out; + } + + if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) != + (parsed_options & OCFS2_MOUNT_HB_LOCAL)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); + goto out; + } + + if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) != + (parsed_options & OCFS2_MOUNT_DATA_WRITEBACK)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot change data mode on remount\n"); + goto out; + } + + /* We're going to/from readonly mode. 
*/ + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + /* Lock here so the check of HARD_RO and the potential + * setting of SOFT_RO is atomic. */ + spin_lock(&osb->osb_lock); + if (osb->osb_flags & OCFS2_OSB_HARD_RO) { + mlog(ML_ERROR, "Remount on readonly device is forbidden.\n"); + ret = -EROFS; + goto unlock_osb; + } + + if (*flags & MS_RDONLY) { + mlog(0, "Going to ro mode.\n"); + sb->s_flags |= MS_RDONLY; + osb->osb_flags |= OCFS2_OSB_SOFT_RO; + } else { + mlog(0, "Making ro filesystem writeable.\n"); + + if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { + mlog(ML_ERROR, "Cannot remount RDWR " + "filesystem due to previous errors.\n"); + ret = -EROFS; + goto unlock_osb; + } + incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP); + if (incompat_features) { + mlog(ML_ERROR, "Cannot remount RDWR because " + "of unsupported optional features " + "(%x).\n", incompat_features); + ret = -EINVAL; + goto unlock_osb; + } + sb->s_flags &= ~MS_RDONLY; + osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; + } +unlock_osb: + spin_unlock(&osb->osb_lock); + } + + if (!ret) { + if (!ocfs2_is_hard_readonly(osb)) + ocfs2_set_journal_params(osb); + + /* Only save off the new mount options in case of a successful + * remount. */ + osb->s_mount_opt = parsed_options; + } +out: + return ret; +} + +static int ocfs2_sb_probe(struct super_block *sb, + struct buffer_head **bh, + int *sector_size) +{ + int status = 0, tmpstat; + struct ocfs1_vol_disk_hdr *hdr; + struct ocfs2_dinode *di; + int blksize; + + *bh = NULL; + + /* may be > 512 */ + *sector_size = bdev_hardsect_size(sb->s_bdev); + if (*sector_size > OCFS2_MAX_BLOCKSIZE) { + mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", + *sector_size, OCFS2_MAX_BLOCKSIZE); + status = -EINVAL; + goto bail; + } + + /* Can this really happen? 
*/ + if (*sector_size < OCFS2_MIN_BLOCKSIZE) + *sector_size = OCFS2_MIN_BLOCKSIZE; + + /* check block zero for old format */ + status = ocfs2_get_sector(sb, bh, 0, *sector_size); + if (status < 0) { + mlog_errno(status); + goto bail; + } + hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data; + if (hdr->major_version == OCFS1_MAJOR_VERSION) { + mlog(ML_ERROR, "incompatible version: %u.%u\n", + hdr->major_version, hdr->minor_version); + status = -EINVAL; + } + if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE, + strlen(OCFS1_VOLUME_SIGNATURE)) == 0) { + mlog(ML_ERROR, "incompatible volume signature: %8s\n", + hdr->signature); + status = -EINVAL; + } + brelse(*bh); + *bh = NULL; + if (status < 0) { + mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be " + "upgraded before mounting with ocfs v2\n"); + goto bail; + } + + /* + * Now check at magic offset for 512, 1024, 2048, 4096 + * blocksizes. 4096 is the maximum blocksize because it is + * the minimum clustersize. + */ + status = -EINVAL; + for (blksize = *sector_size; + blksize <= OCFS2_MAX_BLOCKSIZE; + blksize <<= 1) { + tmpstat = ocfs2_get_sector(sb, bh, + OCFS2_SUPER_BLOCK_BLKNO, + blksize); + if (tmpstat < 0) { + status = tmpstat; + mlog_errno(status); + goto bail; + } + di = (struct ocfs2_dinode *) (*bh)->b_data; + status = ocfs2_verify_volume(di, *bh, blksize); + if (status >= 0) + goto bail; + brelse(*bh); + *bh = NULL; + if (status != -EAGAIN) + break; + } + +bail: + return status; +} + +static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) +{ + struct dentry *root; + int status, sector_size; + unsigned long parsed_opt; + struct inode *inode = NULL; + struct ocfs2_super *osb = NULL; + struct buffer_head *bh = NULL; + + mlog_entry("%p, %p, %i", sb, data, silent); + + /* for now we only have one cluster/node, make sure we see it + * in the heartbeat universe */ + if (!o2hb_check_local_node_heartbeating()) { + status = -EINVAL; + goto read_super_error; + } + + /* probe for superblock 
*/ + status = ocfs2_sb_probe(sb, &bh, &sector_size); + if (status < 0) { + mlog(ML_ERROR, "superblock probe failed!\n"); + goto read_super_error; + } + + status = ocfs2_initialize_super(sb, bh, sector_size); + osb = OCFS2_SB(sb); + if (status < 0) { + mlog_errno(status); + goto read_super_error; + } + brelse(bh); + bh = NULL; + + if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) { + status = -EINVAL; + goto read_super_error; + } + osb->s_mount_opt = parsed_opt; + + sb->s_magic = OCFS2_SUPER_MAGIC; + + /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, + * heartbeat=none */ + if (bdev_read_only(sb->s_bdev)) { + if (!(sb->s_flags & MS_RDONLY)) { + status = -EACCES; + mlog(ML_ERROR, "Readonly device detected but readonly " + "mount was not specified.\n"); + goto read_super_error; + } + + /* You should not be able to start a local heartbeat + * on a readonly device. */ + if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { + status = -EROFS; + mlog(ML_ERROR, "Local heartbeat specified on readonly " + "device.\n"); + goto read_super_error; + } + + status = ocfs2_check_journals_nolocks(osb); + if (status < 0) { + if (status == -EROFS) + mlog(ML_ERROR, "Recovery required on readonly " + "file system, but write access is " + "unavailable.\n"); + else + mlog_errno(status); + goto read_super_error; + } + + ocfs2_set_ro_flag(osb, 1); + + printk(KERN_NOTICE "Readonly device detected. No cluster " + "services will be utilized for this mount. Recovery " + "will be skipped.\n"); + } + + if (!ocfs2_is_hard_readonly(osb)) { + /* If this isn't a hard readonly mount, then we need + * to make sure that heartbeat is in a valid state, + * and that we mark ourselves soft readonly if -oro + * was specified. 
*/ + if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { + mlog(ML_ERROR, "No heartbeat for device (%s)\n", + sb->s_id); + status = -EINVAL; + goto read_super_error; + } + + if (sb->s_flags & MS_RDONLY) + ocfs2_set_ro_flag(osb, 0); + } + + osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, + ocfs2_debugfs_root); + if (!osb->osb_debug_root) { + status = -EINVAL; + mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); + goto read_super_error; + } + + status = ocfs2_mount_volume(sb); + if (osb->root_inode) + inode = igrab(osb->root_inode); + + if (status < 0) + goto read_super_error; + + if (!inode) { + status = -EIO; + mlog_errno(status); + goto read_super_error; + } + + root = d_alloc_root(inode); + if (!root) { + status = -ENOMEM; + mlog_errno(status); + goto read_super_error; + } + + sb->s_root = root; + + ocfs2_complete_mount_recovery(osb); + + printk("ocfs2: Mounting device (%u,%u) on (node %d, slot %d) with %s " + "data mode.\n", + MAJOR(sb->s_dev), MINOR(sb->s_dev), osb->node_num, + osb->slot_num, + osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : + "ordered"); + + atomic_set(&osb->vol_state, VOLUME_MOUNTED); + wake_up(&osb->osb_mount_event); + + mlog_exit(status); + return status; + +read_super_error: + if (bh != NULL) + brelse(bh); + + if (inode) + iput(inode); + + if (osb) { + atomic_set(&osb->vol_state, VOLUME_DISABLED); + wake_up(&osb->osb_mount_event); + ocfs2_dismount_volume(sb, 1); + } + + mlog_exit(status); + return status; +} + +static struct super_block *ocfs2_get_sb(struct file_system_type *fs_type, + int flags, + const char *dev_name, + void *data) +{ + return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); +} + +static struct file_system_type ocfs2_fs_type = { + .owner = THIS_MODULE, + .name = "ocfs2", + .get_sb = ocfs2_get_sb, /* is this called when we mount + * the fs? */ + .kill_sb = kill_block_super, /* set to the generic one + * right now, but do we + * need to change that? 
*/ + .fs_flags = FS_REQUIRES_DEV, + .next = NULL +}; + +static int ocfs2_parse_options(struct super_block *sb, + char *options, + unsigned long *mount_opt, + int is_remount) +{ + int status; + char *p; + + mlog_entry("remount: %d, options: \"%s\"\n", is_remount, + options ? options : "(none)"); + + *mount_opt = 0; + + if (!options) { + status = 1; + goto bail; + } + + while ((p = strsep(&options, ",")) != NULL) { + int token, option; + substring_t args[MAX_OPT_ARGS]; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_hb_local: + *mount_opt |= OCFS2_MOUNT_HB_LOCAL; + break; + case Opt_hb_none: + *mount_opt &= ~OCFS2_MOUNT_HB_LOCAL; + break; + case Opt_barrier: + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option) + *mount_opt |= OCFS2_MOUNT_BARRIER; + else + *mount_opt &= ~OCFS2_MOUNT_BARRIER; + break; + case Opt_intr: + *mount_opt &= ~OCFS2_MOUNT_NOINTR; + break; + case Opt_nointr: + *mount_opt |= OCFS2_MOUNT_NOINTR; + break; + case Opt_err_panic: + *mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; + break; + case Opt_err_ro: + *mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; + break; + case Opt_data_ordered: + *mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; + break; + case Opt_data_writeback: + *mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; + break; + default: + mlog(ML_ERROR, + "Unrecognized mount option \"%s\" " + "or missing value\n", p); + status = 0; + goto bail; + } + } + + status = 1; + +bail: + mlog_exit(status); + return status; +} + +static int __init ocfs2_init(void) +{ + int status; + + mlog_entry_void(); + + ocfs2_print_version(); + + if (init_ocfs2_extent_maps()) + return -ENOMEM; + + status = init_ocfs2_uptodate_cache(); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_initialize_mem_caches(); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); + if (!ocfs2_wq) { + status = -ENOMEM; + goto leave; + } + + 
spin_lock(&ocfs2_globals_lock); + osb_id = 0; + spin_unlock(&ocfs2_globals_lock); + + ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); + if (!ocfs2_debugfs_root) { + status = -EFAULT; + mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); + } + +leave: + if (status < 0) { + /* Undo what was set up above. The workqueue may already + * exist when a later step (debugfs) fails, so tear it down + * here instead of leaking it. */ + if (ocfs2_wq) { + destroy_workqueue(ocfs2_wq); + ocfs2_wq = NULL; + } + ocfs2_free_mem_caches(); + exit_ocfs2_uptodate_cache(); + exit_ocfs2_extent_maps(); + } + + mlog_exit(status); + + /* On failure hand the real errno back to the module loader + * rather than a bare -1. */ + if (status >= 0) { + return register_filesystem(&ocfs2_fs_type); + } else + return status; +} + +static void __exit ocfs2_exit(void) +{ + mlog_entry_void(); + + if (ocfs2_wq) { + flush_workqueue(ocfs2_wq); + destroy_workqueue(ocfs2_wq); + } + + debugfs_remove(ocfs2_debugfs_root); + + ocfs2_free_mem_caches(); + + unregister_filesystem(&ocfs2_fs_type); + + exit_ocfs2_extent_maps(); + + exit_ocfs2_uptodate_cache(); + + mlog_exit_void(); +} + +static void ocfs2_put_super(struct super_block *sb) +{ + mlog_entry("(0x%p)\n", sb); + + ocfs2_sync_blockdev(sb); + ocfs2_dismount_volume(sb, 0); + + mlog_exit_void(); +} + +static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf) +{ + struct ocfs2_super *osb; + u32 numbits, freebits; + int status; + struct ocfs2_dinode *bm_lock; + struct buffer_head *bh = NULL; + struct inode *inode = NULL; + + mlog_entry("(%p, %p)\n", sb, buf); + + osb = OCFS2_SB(sb); + + inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!inode) { + mlog(ML_ERROR, "failed to get bitmap inode\n"); + status = -EIO; + goto bail; + } + + status = ocfs2_meta_lock(inode, NULL, &bh, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + bm_lock = (struct ocfs2_dinode *) bh->b_data; + + numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total); + freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used); + + buf->f_type = OCFS2_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_namelen = OCFS2_MAX_FILENAME_LEN; + buf->f_blocks = ((sector_t) numbits) * + (osb->s_clustersize >> osb->sb->s_blocksize_bits); 
+ buf->f_bfree = ((sector_t) freebits) * + (osb->s_clustersize >> osb->sb->s_blocksize_bits); + buf->f_bavail = buf->f_bfree; + buf->f_files = numbits; + buf->f_ffree = freebits; + + brelse(bh); + + ocfs2_meta_unlock(inode, 0); + status = 0; +bail: + if (inode) + iput(inode); + + mlog_exit(status); + + return status; +} + +static void ocfs2_inode_init_once(void *data, + kmem_cache_t *cachep, + unsigned long flags) +{ + struct ocfs2_inode_info *oi = data; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + oi->ip_flags = 0; + oi->ip_open_count = 0; + spin_lock_init(&oi->ip_lock); + ocfs2_extent_map_init(&oi->vfs_inode); + INIT_LIST_HEAD(&oi->ip_handle_list); + INIT_LIST_HEAD(&oi->ip_io_markers); + oi->ip_handle = NULL; + oi->ip_created_trans = 0; + oi->ip_last_trans = 0; + oi->ip_dir_start_lookup = 0; + + init_rwsem(&oi->ip_alloc_sem); + init_MUTEX(&(oi->ip_io_sem)); + + oi->ip_blkno = 0ULL; + oi->ip_clusters = 0; + + ocfs2_lock_res_init_once(&oi->ip_rw_lockres); + ocfs2_lock_res_init_once(&oi->ip_meta_lockres); + ocfs2_lock_res_init_once(&oi->ip_data_lockres); + + ocfs2_metadata_cache_init(&oi->vfs_inode); + + inode_init_once(&oi->vfs_inode); + } +} + +static int ocfs2_initialize_mem_caches(void) +{ + ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache", + sizeof(struct ocfs2_inode_info), + 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, + ocfs2_inode_init_once, NULL); + if (!ocfs2_inode_cachep) + return -ENOMEM; + + ocfs2_lock_cache = kmem_cache_create("ocfs2_lock", + sizeof(struct ocfs2_journal_lock), + 0, + SLAB_NO_REAP|SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!ocfs2_lock_cache) + return -ENOMEM; + + return 0; +} + +static void ocfs2_free_mem_caches(void) +{ + if (ocfs2_inode_cachep) + kmem_cache_destroy(ocfs2_inode_cachep); + if (ocfs2_lock_cache) + kmem_cache_destroy(ocfs2_lock_cache); + + ocfs2_inode_cachep = NULL; + ocfs2_lock_cache = NULL; +} + +static int ocfs2_get_sector(struct super_block *sb, + struct buffer_head 
**bh, + int block, + int sect_size) +{ + if (!sb_set_blocksize(sb, sect_size)) { + mlog(ML_ERROR, "unable to set blocksize\n"); + return -EIO; + } + + *bh = sb_getblk(sb, block); + if (!*bh) { + mlog_errno(-EIO); + return -EIO; + } + /* Drop the uptodate bit on a clean buffer so the read below + * really goes to disk instead of being satisfied from cache. */ + lock_buffer(*bh); + if (!buffer_dirty(*bh)) + clear_buffer_uptodate(*bh); + unlock_buffer(*bh); + ll_rw_block(READ, 1, bh); + wait_on_buffer(*bh); + /* The read's outcome is only visible in the buffer state; + * without this check a failed read would return 0 and hand + * the caller a buffer with stale contents. */ + if (!buffer_uptodate(*bh)) { + mlog_errno(-EIO); + brelse(*bh); + *bh = NULL; + return -EIO; + } + return 0; +} + +/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */ +static int ocfs2_fill_local_node_info(struct ocfs2_super *osb) +{ + int status; + + /* XXX hold a ref on the node while mounted? easy enough, if + * desirable. */ + osb->node_num = o2nm_this_node(); + if (osb->node_num == O2NM_MAX_NODES) { + mlog(ML_ERROR, "could not find this host's node number\n"); + status = -ENOENT; + goto bail; + } + + mlog(ML_NOTICE, "I am node %d\n", osb->node_num); + + status = 0; +bail: + return status; +} + +static int ocfs2_mount_volume(struct super_block *sb) +{ + int status = 0; + int unlock_super = 0; + struct ocfs2_super *osb = OCFS2_SB(sb); + + mlog_entry_void(); + + if (ocfs2_is_hard_readonly(osb)) + goto leave; + + status = ocfs2_fill_local_node_info(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_register_hb_callbacks(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_dlm_init(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* requires vote_thread to be running. */ + status = ocfs2_register_net_handlers(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_super_lock(osb, 1); + if (status < 0) { + mlog_errno(status); + goto leave; + } + unlock_super = 1; + + /* This will load up the node map and add ourselves to it. 
*/ + status = ocfs2_find_slot(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + ocfs2_populate_mounted_map(osb); + + /* load all node-local system inodes */ + status = ocfs2_init_local_system_inodes(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_check_volume(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_truncate_log_init(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* This should be sent *after* we recovered our journal as it + * will cause other nodes to unmark us as needing + * recovery. However, we need to send it *before* dropping the + * super block lock as otherwise their recovery threads might + * try to clean us up while we're live! */ + status = ocfs2_request_mount_vote(osb); + if (status < 0) + mlog_errno(status); + +leave: + if (unlock_super) + ocfs2_super_unlock(osb, 1); + + mlog_exit(status); + return status; +} + +/* we can't grab the goofy sem lock from inside wait_event, so we use + * memory barriers to make sure that we'll see the null task before + * being woken up */ +static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) +{ + mb(); + return osb->recovery_thread_task != NULL; +} + +static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) +{ + int tmp; + struct ocfs2_super *osb = NULL; + + mlog_entry("(0x%p)\n", sb); + + BUG_ON(!sb); + osb = OCFS2_SB(sb); + BUG_ON(!osb); + + ocfs2_shutdown_local_alloc(osb); + + ocfs2_truncate_log_shutdown(osb); + + /* disable any new recovery threads and wait for any currently + * running ones to exit. Do this before setting the vol_state. */ + down(&osb->recovery_lock); + osb->disable_recovery = 1; + up(&osb->recovery_lock); + wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); + + /* At this point, we know that no more recovery threads can be + * launched, so wait for any recovery completion work to + * complete. 
*/ + flush_workqueue(ocfs2_wq); + + ocfs2_journal_shutdown(osb); + + ocfs2_sync_blockdev(sb); + + /* No dlm means we've failed during mount, so skip all the + * steps which depended on that to complete. */ + if (osb->dlm) { + tmp = ocfs2_super_lock(osb, 1); + if (tmp < 0) { + mlog_errno(tmp); + return; + } + + tmp = ocfs2_request_umount_vote(osb); + if (tmp < 0) + mlog_errno(tmp); + + if (osb->slot_num != OCFS2_INVALID_SLOT) + ocfs2_put_slot(osb); + + ocfs2_super_unlock(osb, 1); + } + + ocfs2_release_system_inodes(osb); + + if (osb->dlm) { + ocfs2_unregister_net_handlers(osb); + + ocfs2_dlm_shutdown(osb); + } + + ocfs2_clear_hb_callbacks(osb); + + debugfs_remove(osb->osb_debug_root); + + if (!mnt_err) + ocfs2_stop_heartbeat(osb); + + atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); + + printk("ocfs2: Unmounting device (%u,%u) on (node %d)\n", + MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev), osb->node_num); + + ocfs2_delete_osb(osb); + kfree(osb); + sb->s_dev = 0; + sb->s_fs_info = NULL; +} + +static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid, + unsigned uuid_bytes) +{ + int i, ret; + char *ptr; + + BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN); + + osb->uuid_str = kcalloc(1, OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL); + if (osb->uuid_str == NULL) + return -ENOMEM; + + memcpy(osb->uuid, uuid, OCFS2_VOL_UUID_LEN); + + for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) { + /* print with null */ + ret = snprintf(ptr, 3, "%02X", uuid[i]); + if (ret != 2) /* drop super cleans up */ + return -EINVAL; + /* then only advance past the last char */ + ptr += 2; + } + + return 0; +} + +static int ocfs2_initialize_super(struct super_block *sb, + struct buffer_head *bh, + int sector_size) +{ + int status = 0; + int i; + struct ocfs2_dinode *di = NULL; + struct inode *inode = NULL; + struct buffer_head *bitmap_bh = NULL; + struct ocfs2_journal *journal; + __le32 uuid_net_key; + struct ocfs2_super *osb; + + mlog_entry_void(); + + osb = kcalloc(1, 
sizeof(struct ocfs2_super), GFP_KERNEL); + if (!osb) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + sb->s_fs_info = osb; + sb->s_op = &ocfs2_sops; + sb->s_export_op = &ocfs2_export_ops; + sb->s_flags |= MS_NOATIME; + /* this is needed to support O_LARGEFILE */ + sb->s_maxbytes = ocfs2_max_file_offset(sb->s_blocksize_bits); + + osb->sb = sb; + /* Save off for ocfs2_rw_direct */ + osb->s_sectsize_bits = blksize_bits(sector_size); + if (!osb->s_sectsize_bits) + BUG(); + + osb->net_response_ids = 0; + spin_lock_init(&osb->net_response_lock); + INIT_LIST_HEAD(&osb->net_response_list); + + INIT_LIST_HEAD(&osb->osb_net_handlers); + init_waitqueue_head(&osb->recovery_event); + spin_lock_init(&osb->vote_task_lock); + init_waitqueue_head(&osb->vote_event); + osb->vote_work_sequence = 0; + osb->vote_wake_sequence = 0; + INIT_LIST_HEAD(&osb->blocked_lock_list); + osb->blocked_lock_count = 0; + INIT_LIST_HEAD(&osb->vote_list); + spin_lock_init(&osb->osb_lock); + + atomic_set(&osb->alloc_stats.moves, 0); + atomic_set(&osb->alloc_stats.local_data, 0); + atomic_set(&osb->alloc_stats.bitmap_data, 0); + atomic_set(&osb->alloc_stats.bg_allocs, 0); + atomic_set(&osb->alloc_stats.bg_extends, 0); + + ocfs2_init_node_maps(osb); + + snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", + MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + + init_MUTEX(&osb->recovery_lock); + + osb->disable_recovery = 0; + osb->recovery_thread_task = NULL; + + init_waitqueue_head(&osb->checkpoint_event); + atomic_set(&osb->needs_checkpoint, 0); + + osb->node_num = O2NM_INVALID_NODE_NUM; + osb->slot_num = OCFS2_INVALID_SLOT; + + osb->local_alloc_state = OCFS2_LA_UNUSED; + osb->local_alloc_bh = NULL; + + ocfs2_setup_hb_callbacks(osb); + + init_waitqueue_head(&osb->osb_mount_event); + + osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); + if (!osb->vol_label) { + mlog(ML_ERROR, "unable to alloc vol label\n"); + status = -ENOMEM; + goto bail; + } + + osb->uuid = 
kmalloc(OCFS2_VOL_UUID_LEN, GFP_KERNEL); + if (!osb->uuid) { + mlog(ML_ERROR, "unable to alloc uuid\n"); + status = -ENOMEM; + goto bail; + } + + di = (struct ocfs2_dinode *)bh->b_data; + + osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); + if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { + mlog(ML_ERROR, "Invalid number of node slots (%u)\n", + osb->max_slots); + status = -EINVAL; + goto bail; + } + mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots); + + osb->s_feature_compat = + le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); + osb->s_feature_ro_compat = + le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat); + osb->s_feature_incompat = + le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat); + + if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) { + mlog(ML_ERROR, "couldn't mount because of unsupported " + "optional features (%x).\n", i); + status = -EINVAL; + goto bail; + } + if (!(osb->sb->s_flags & MS_RDONLY) && + (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { + mlog(ML_ERROR, "couldn't mount RDWR because of " + "unsupported optional features (%x).\n", i); + status = -EINVAL; + goto bail; + } + + get_random_bytes(&osb->s_next_generation, sizeof(u32)); + + /* FIXME + * This should be done in ocfs2_journal_init(), but unknown + * ordering issues will cause the filesystem to crash. + * If anyone wants to figure out what part of the code + * refers to osb->journal before ocfs2_journal_init() is run, + * be my guest. 
+ */ + /* initialize our journal structure */ + + journal = kcalloc(1, sizeof(struct ocfs2_journal), GFP_KERNEL); + if (!journal) { + mlog(ML_ERROR, "unable to alloc journal\n"); + status = -ENOMEM; + goto bail; + } + osb->journal = journal; + journal->j_osb = osb; + + atomic_set(&journal->j_num_trans, 0); + init_rwsem(&journal->j_trans_barrier); + init_waitqueue_head(&journal->j_checkpointed); + spin_lock_init(&journal->j_lock); + journal->j_trans_id = (unsigned long) 1; + INIT_LIST_HEAD(&journal->j_la_cleanups); + INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery, osb); + journal->j_state = OCFS2_JOURNAL_FREE; + + /* get some pseudo constants for clustersize bits */ + osb->s_clustersize_bits = + le32_to_cpu(di->id2.i_super.s_clustersize_bits); + osb->s_clustersize = 1 << osb->s_clustersize_bits; + mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits); + + if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE || + osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) { + mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", + osb->s_clustersize); + status = -EINVAL; + goto bail; + } + + if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1) + > (u32)~0UL) { + mlog(ML_ERROR, "Volume might try to write to blocks beyond " + "what jbd can address in 32 bits.\n"); + status = -EINVAL; + goto bail; + } + + if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, + sizeof(di->id2.i_super.s_uuid))) { + mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); + status = -ENOMEM; + goto bail; + } + + /* The net key is taken from the leading bytes of the volume + * uuid. Read from the start of the buffer explicitly: 'i' is + * a leftover of the feature checks above and is zero here + * only as a side effect of them having passed. Size the copy + * by the destination so it can never be overrun. */ + memcpy(&uuid_net_key, osb->uuid, sizeof(uuid_net_key)); + osb->net_key = le32_to_cpu(uuid_net_key); + + strncpy(osb->vol_label, di->id2.i_super.s_label, 63); + osb->vol_label[63] = '\0'; + osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); + osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); + osb->first_cluster_group_blkno = + le64_to_cpu(di->id2.i_super.s_first_cluster_group); + osb->fs_generation = 
le32_to_cpu(di->i_fs_generation); + mlog(0, "vol_label: %s\n", osb->vol_label); + mlog(0, "uuid: %s\n", osb->uuid_str); + mlog(0, "root_blkno=%"MLFu64", system_dir_blkno=%"MLFu64"\n", + osb->root_blkno, osb->system_dir_blkno); + + osb->osb_dlm_debug = ocfs2_new_dlm_debug(); + if (!osb->osb_dlm_debug) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + atomic_set(&osb->vol_state, VOLUME_INIT); + + /* load root, system_dir, and all global system inodes */ + status = ocfs2_init_global_system_inodes(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* + * global bitmap + */ + inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; + + status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0, + inode); + iput(inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + di = (struct ocfs2_dinode *) bitmap_bh->b_data; + osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg); + osb->num_clusters = le32_to_cpu(di->id1.bitmap1.i_total); + brelse(bitmap_bh); + mlog(0, "cluster bitmap inode: %"MLFu64", clusters per group: %u\n", + osb->bitmap_blkno, osb->bitmap_cpg); + + status = ocfs2_init_slot_info(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* Link this osb onto the global linked list of all osb structures. */ + /* The Global Link List is mainted for the whole driver . 
*/ + spin_lock(&ocfs2_globals_lock); + osb->osb_id = osb_id; + if (osb_id < OCFS2_MAX_OSB_ID) + osb_id++; + else { + mlog(ML_ERROR, "Too many volumes mounted\n"); + status = -ENOMEM; + } + spin_unlock(&ocfs2_globals_lock); + +bail: + mlog_exit(status); + return status; +} + +/* + * Validate a candidate superblock read with block size 'blksz'. + * + * will return: -EAGAIN if it is ok to keep searching for superblocks + * -EINVAL if there is a bad superblock + * 0 on success + */ +static int ocfs2_verify_volume(struct ocfs2_dinode *di, + struct buffer_head *bh, + u32 blksz) +{ + int status = -EAGAIN; + + mlog_entry_void(); + + if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, + strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { + /* Signature matched: from here on any failed check means + * a bad superblock, not "keep looking". */ + status = -EINVAL; + if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { + mlog(ML_ERROR, "found superblock with incorrect block " + "size: found %u, should be %u\n", + 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits), + blksz); + } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != + OCFS2_MAJOR_REV_LEVEL || + le16_to_cpu(di->id2.i_super.s_minor_rev_level) != + OCFS2_MINOR_REV_LEVEL) { + mlog(ML_ERROR, "found superblock with bad version: " + "found %u.%u, should be %u.%u\n", + le16_to_cpu(di->id2.i_super.s_major_rev_level), + le16_to_cpu(di->id2.i_super.s_minor_rev_level), + OCFS2_MAJOR_REV_LEVEL, + OCFS2_MINOR_REV_LEVEL); + } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) { + /* i_blkno is little-endian on disk; convert it for the + * message just as the comparison above does. */ + mlog(ML_ERROR, "bad block number on superblock: " + "found %"MLFu64", should be %llu\n", + le64_to_cpu(di->i_blkno), (unsigned long long)bh->b_blocknr); + } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 || + le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) { + mlog(ML_ERROR, "bad cluster size found: %u\n", + 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits)); + } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) { + mlog(ML_ERROR, "bad root_blkno: 0\n"); + } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) { + mlog(ML_ERROR, "bad system_dir_blkno: 0\n"); + } else if 
(le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) { + mlog(ML_ERROR, + "Superblock slots found greater than file system " + "maximum: found %u, max %u\n", + le16_to_cpu(di->id2.i_super.s_max_slots), + OCFS2_MAX_SLOTS); + } else { + /* found it! */ + status = 0; + } + } + + mlog_exit(status); + return status; +} + +static int ocfs2_check_volume(struct ocfs2_super *osb) +{ + int status = 0; + int dirty; + struct ocfs2_dinode *local_alloc = NULL; /* only used if we + * recover + * ourselves. */ + + mlog_entry_void(); + + /* Init our journal object. */ + status = ocfs2_journal_init(osb->journal, &dirty); + if (status < 0) { + mlog(ML_ERROR, "Could not initialize journal!\n"); + goto finally; + } + + /* If the journal was unmounted cleanly then we don't want to + * recover anything. Otherwise, journal_load will do that + * dirty work for us :) */ + if (!dirty) { + status = ocfs2_journal_wipe(osb->journal, 0); + if (status < 0) { + mlog_errno(status); + goto finally; + } + } else { + mlog(ML_NOTICE, "File system was not unmounted cleanly, " + "recovering volume.\n"); + } + + /* will play back anything left in the journal. */ + ocfs2_journal_load(osb->journal); + + if (dirty) { + /* recover my local alloc if we didn't unmount cleanly. */ + status = ocfs2_begin_local_alloc_recovery(osb, + osb->slot_num, + &local_alloc); + if (status < 0) { + mlog_errno(status); + goto finally; + } + /* we complete the recovery process after we've marked + * ourselves as mounted. */ + } + + mlog(0, "Journal loaded.\n"); + + status = ocfs2_load_local_alloc(osb); + if (status < 0) { + mlog_errno(status); + goto finally; + } + + if (dirty) { + /* Recovery will be completed after we've mounted the + * rest of the volume. */ + osb->dirty = 1; + osb->local_alloc_copy = local_alloc; + local_alloc = NULL; + } + + /* go through each journal, trylock it and if you get the + * lock, and it's marked as dirty, set the bit in the recover + * map and launch a recovery thread for it. 
*/ + status = ocfs2_mark_dead_nodes(osb); + if (status < 0) + mlog_errno(status); + +finally: + if (local_alloc) + kfree(local_alloc); + + mlog_exit(status); + return status; +} + +/* + * The routine gets called from dismount or close whenever a dismount on + * volume is requested and the osb open count becomes 1. + * It will remove the osb from the global list and also free up all the + * initialized resources and fileobject. + * + * The osb structure itself is zeroed but not freed here; that is left + * to the caller. + */ +static void ocfs2_delete_osb(struct ocfs2_super *osb) +{ + mlog_entry_void(); + + /* This function assumes that the caller has the main osb resource */ + + if (osb->slot_info) + ocfs2_free_slot_info(osb->slot_info); + + /* FIXME + * This belongs in journal shutdown, but because we have to + * allocate osb->journal in ocfs2_initialize_super(), + * we free it here. + */ + kfree(osb->journal); + if (osb->local_alloc_copy) + kfree(osb->local_alloc_copy); + kfree(osb->uuid_str); + /* uuid and vol_label are kmalloc'ed in ocfs2_initialize_super(); + * release them before the memset below wipes the pointers. + * Both are NULL (and kfree is a no-op) if setup failed early. */ + kfree(osb->uuid); + kfree(osb->vol_label); + ocfs2_put_dlm_debug(osb->osb_dlm_debug); + memset(osb, 0, sizeof(struct ocfs2_super)); + + mlog_exit_void(); +} + +/* Put OCFS2 into a readonly state, or (if the user specifies it), + * panic(). We do not support continue-on-error operation. */ +static void ocfs2_handle_error(struct super_block *sb) +{ + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) + panic("OCFS2: (device %s): panic forced after error\n", + sb->s_id); + + ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); + + if (sb->s_flags & MS_RDONLY && + (ocfs2_is_soft_readonly(osb) || + ocfs2_is_hard_readonly(osb))) + return; + + printk(KERN_CRIT "File system is now read-only due to the potential " + "of on-disk corruption. Please run fsck.ocfs2 once the file " + "system is unmounted.\n"); + sb->s_flags |= MS_RDONLY; + ocfs2_set_ro_flag(osb, 0); +} + +static char error_buf[1024]; + +void __ocfs2_error(struct super_block *sb, + const char *function, + const char *fmt, ...) 
+{ + va_list args; + + /* Bound the write: error_buf is a fixed-size static buffer and + * the formatted message length is caller-controlled. + * NOTE(review): error_buf is shared with __ocfs2_abort() and no + * locking is visible here - confirm concurrent callers are safe. */ + va_start(args, fmt); + vsnprintf(error_buf, sizeof(error_buf), fmt, args); + va_end(args); + + /* Not using mlog here because we want to show the actual + * function the error came from. */ + printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n", + sb->s_id, function, error_buf); + + ocfs2_handle_error(sb); +} + +/* Handle critical errors. This is intentionally more drastic than + * ocfs2_handle_error, so we only use for things like journal errors, + * etc. */ +void __ocfs2_abort(struct super_block* sb, + const char *function, + const char *fmt, ...) +{ + va_list args; + + /* Bounded for the same reason as in __ocfs2_error(). */ + va_start(args, fmt); + vsnprintf(error_buf, sizeof(error_buf), fmt, args); + va_end(args); + + printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n", + sb->s_id, function, error_buf); + + /* We don't have the cluster support yet to go straight to + * hard readonly in here. Until then, we want to keep + * ocfs2_abort() so that we can at least mark critical + * errors. + * + * TODO: This should abort the journal and alert other nodes + * that our slot needs recovery. */ + + /* Force a panic(). This stinks, but it's better than letting + * things continue without having a proper hard readonly + * here. */ + OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; + ocfs2_handle_error(sb); +} + +module_init(ocfs2_init); +module_exit(ocfs2_exit); diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h new file mode 100644 index 00000000000..c564177dfbd --- /dev/null +++ b/fs/ocfs2/super.h @@ -0,0 +1,44 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * super.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_SUPER_H +#define OCFS2_SUPER_H + +extern struct workqueue_struct *ocfs2_wq; + +int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, + int node_num); + +void __ocfs2_error(struct super_block *sb, + const char *function, + const char *fmt, ...); +#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) + +void __ocfs2_abort(struct super_block *sb, + const char *function, + const char *fmt, ...); +#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) + +#endif /* OCFS2_SUPER_H */ diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c new file mode 100644 index 00000000000..f6986bd79e7 --- /dev/null +++ b/fs/ocfs2/symlink.c @@ -0,0 +1,180 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * linux/cluster/ssi/cfs/symlink.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE + * or NON INFRINGEMENT. See the GNU General Public License for more + * details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Questions/Comments/Bugfixes to ssic-linux-devel@lists.sourceforge.net + * + * Copyright (C) 1992 Rick Sladkey + * + * Optimization changes Copyright (C) 1994 Florian La Roche + * + * Jun 7 1999, cache symlink lookups in the page cache. -DaveM + * + * Portions Copyright (C) 2001 Compaq Computer Corporation + * + * ocfs2 symlink handling code. + * + * Copyright (C) 2004, 2005 Oracle. + * + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/utsname.h> + +#define MLOG_MASK_PREFIX ML_NAMEI +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "symlink.h" + +#include "buffer_head_io.h" + +static char *ocfs2_page_getlink(struct dentry * dentry, + struct page **ppage); +static char *ocfs2_fast_symlink_getlink(struct inode *inode, + struct buffer_head **bh); + +/* get the link contents into pagecache + * + * On success the page is stored in *ppage and its kmap()ed address is + * returned. On failure an ERR_PTR is returned and *ppage is not written; + * a page that was read but is not uptodate is released here. */ +static char *ocfs2_page_getlink(struct dentry * dentry, + struct page **ppage) +{ + struct page * page; + struct address_space *mapping = dentry->d_inode->i_mapping; + page = read_cache_page(mapping, 0, + (filler_t *)mapping->a_ops->readpage, NULL); + if (IS_ERR(page)) + goto sync_fail; + wait_on_page_locked(page); + if (!PageUptodate(page)) + goto async_fail; + *ppage = page; + return kmap(page); + +async_fail: + page_cache_release(page); + return ERR_PTR(-EIO); + +sync_fail: + return (char*)page; +} + +/* Read the inode's own block (ip_blkno) and return fe->id2.i_symlink from + * it. The returned pointer lives inside (*bh)->b_data. Returns + * ERR_PTR(status) when ocfs2_read_block() fails. */ +static char *ocfs2_fast_symlink_getlink(struct inode *inode, + struct buffer_head **bh) +{ + int status; + char *link = NULL; + struct ocfs2_dinode *fe; + + mlog_entry_void(); + + status = ocfs2_read_block(OCFS2_SB(inode->i_sb), + OCFS2_I(inode)->ip_blkno, + bh, + OCFS2_BH_CACHED, + inode); + if (status < 0) 
{ + mlog_errno(status); + link = ERR_PTR(status); + goto bail; + } + + fe = (struct ocfs2_dinode *) (*bh)->b_data; + link = (char *) fe->id2.i_symlink; +bail: + mlog_exit(status); + + return link; +} + +/* readlink for fast symlinks: fetch the target from the inode block and + * pass it to vfs_readlink(), then drop the buffer_head. */ +static int ocfs2_readlink(struct dentry *dentry, + char __user *buffer, + int buflen) +{ + int ret; + char *link; + struct buffer_head *bh = NULL; + struct inode *inode = dentry->d_inode; + + mlog_entry_void(); + + link = ocfs2_fast_symlink_getlink(inode, &bh); + if (IS_ERR(link)) { + ret = PTR_ERR(link); + goto out; + } + + ret = vfs_readlink(dentry, buffer, buflen, link); + + brelse(bh); +out: + mlog_exit(ret); + return ret; +} + +/* follow_link for both symlink flavours: a fast symlink is read from the + * inode block, anything else through the page cache. Whichever of page + * or bh was obtained is released before returning ERR_PTR(status). */ +static void *ocfs2_follow_link(struct dentry *dentry, + struct nameidata *nd) +{ + int status; + char *link; + struct inode *inode = dentry->d_inode; + struct page *page = NULL; + struct buffer_head *bh = NULL; + + if (ocfs2_inode_is_fast_symlink(inode)) + link = ocfs2_fast_symlink_getlink(inode, &bh); + else + link = ocfs2_page_getlink(dentry, &page); + if (IS_ERR(link)) { + status = PTR_ERR(link); + mlog_errno(status); + goto bail; + } + + status = vfs_follow_link(nd, link); + if (status) + mlog_errno(status); +bail: + if (page) { + kunmap(page); + page_cache_release(page); + } + if (bh) + brelse(bh); + + return ERR_PTR(status); +} + +struct inode_operations ocfs2_symlink_inode_operations = { + .readlink = page_readlink, + .follow_link = ocfs2_follow_link, + .getattr = ocfs2_getattr, +}; +struct inode_operations ocfs2_fast_symlink_inode_operations = { + .readlink = ocfs2_readlink, + .follow_link = ocfs2_follow_link, + .getattr = ocfs2_getattr, +}; diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h new file mode 100644 index 00000000000..1ea9e4d9e9e --- /dev/null +++ b/fs/ocfs2/symlink.h @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * symlink.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_SYMLINK_H +#define OCFS2_SYMLINK_H + +extern struct inode_operations ocfs2_symlink_inode_operations; +extern struct inode_operations ocfs2_fast_symlink_inode_operations; + +/* + * Test whether an inode is a fast symlink: it must be a symlink + * (S_ISLNK) whose i_blocks is zero. Returns nonzero if so. + */ +static inline int ocfs2_inode_is_fast_symlink(struct inode *inode) +{ + return (S_ISLNK(inode->i_mode) && + inode->i_blocks == 0); +} + + +#endif /* OCFS2_SYMLINK_H */ diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c new file mode 100644 index 00000000000..600a8bc5b54 --- /dev/null +++ b/fs/ocfs2/sysfile.c @@ -0,0 +1,131 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sysfile.c + * + * Initialize, read, write, etc. system files. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> + +#include "ocfs2.h" + +#define MLOG_MASK_PREFIX ML_INODE +#include <cluster/masklog.h> + +#include "alloc.h" +#include "dir.h" +#include "inode.h" +#include "journal.h" +#include "sysfile.h" + +#include "buffer_head_io.h" + +static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot); + +static inline int is_global_system_inode(int type); +static inline int is_in_system_inode_array(struct ocfs2_super *osb, + int type, + u32 slot); + +/* Nonzero when 'type' lies between OCFS2_FIRST_ONLINE_SYSTEM_INODE and + * OCFS2_LAST_GLOBAL_SYSTEM_INODE, inclusive. */ +static inline int is_global_system_inode(int type) +{ + return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE && + type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; +} + +/* Nonzero when the inode may be kept in osb->system_inodes[]: it is + * either a global system inode or belongs to slot osb->slot_num. */ +static inline int is_in_system_inode_array(struct ocfs2_super *osb, + int type, + u32 slot) +{ + return slot == osb->slot_num || is_global_system_inode(type); +} + +/* Return the system inode for (type, slot) with a reference held for the + * caller, or NULL if it could not be looked up. Inodes eligible for + * osb->system_inodes[] are served from there, and are stored there (with + * an extra igrab) the first time they are read. */ +struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot) +{ + struct inode *inode = NULL; + struct inode **arr = NULL; + + /* avoid the lookup if cached in local system file array */ + if (is_in_system_inode_array(osb, type, slot)) + arr = &(osb->system_inodes[type]); + + if (arr && ((inode = *arr) != NULL)) { + /* get a ref in addition to the array ref */ + inode = igrab(inode); + if (!inode) + BUG(); + + return inode; + } + + /* this gets one ref thru iget */ + inode = _ocfs2_get_system_file_inode(osb, type, slot); + + /* add one more if putting into array for first time */ + if (arr && inode) { + *arr = igrab(inode); + if (!*arr) + BUG(); + } + return inode; +} + +/* Build the system file's name from (type, slot), look it up in + * osb->sys_root_inode and ocfs2_iget() the block that was found. + * Returns NULL on any failure; a failed iget is logged, a failed + * lookup is not. */ +static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot) +{ 
+ char namebuf[40]; + struct inode *inode = NULL; + u64 blkno; + struct buffer_head *dirent_bh = NULL; + struct ocfs2_dir_entry *de = NULL; + int status = 0; + + ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, slot); + + status = ocfs2_find_files_on_disk(namebuf, strlen(namebuf), + &blkno, osb->sys_root_inode, + &dirent_bh, &de); + if (status < 0) { + goto bail; + } + + inode = ocfs2_iget(osb, blkno); + if (IS_ERR(inode)) { + mlog_errno(PTR_ERR(inode)); + inode = NULL; + goto bail; + } +bail: + /* only the block number was needed from the directory entry */ + if (dirent_bh) + brelse(dirent_bh); + return inode; +} + diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h new file mode 100644 index 00000000000..cc9ea661ffc --- /dev/null +++ b/fs/ocfs2/sysfile.h @@ -0,0 +1,33 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sysfile.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_SYSFILE_H +#define OCFS2_SYSFILE_H + +struct inode * ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot); + +#endif /* OCFS2_SYSFILE_H */ diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c new file mode 100644 index 00000000000..3a0458fd3e1 --- /dev/null +++ b/fs/ocfs2/uptodate.c @@ -0,0 +1,544 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * uptodate.c + * + * Tracking the up-to-date-ness of a local buffer_head with respect to + * the cluster. + * + * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Standard buffer head caching flags (uptodate, etc) are insufficient + * in a clustered environment - a buffer may be marked up to date on + * our local node but could have been modified by another cluster + * member. As a result an additional (and performant) caching scheme + * is required. A further requirement is that we consume as little + * memory as possible - we never pin buffer_head structures in order + * to cache them. + * + * We track the existence of up to date buffers on the inodes which + * are associated with them. 
Because we don't want to pin + * buffer_heads, this is only a (strong) hint and several other checks + * are made in the I/O path to ensure that we don't use a stale or + * invalid buffer without going to disk: + * - buffer_jbd is used liberally - if a bh is in the journal on + * this node then it *must* be up to date. + * - the standard buffer_uptodate() macro is used to detect buffers + * which may be invalid (even if we have an up to date tracking + * item for them) + * + * For a full understanding of how this code works together, one + * should read the callers in dlmglue.c, the I/O functions in + * buffer_head_io.c and ocfs2_journal_access in journal.c + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/buffer_head.h> +#include <linux/rbtree.h> +#include <linux/jbd.h> + +#define MLOG_MASK_PREFIX ML_UPTODATE + +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "inode.h" +#include "uptodate.h" + +/* One tracked block in the rb-tree form of an inode's metadata cache. */ +struct ocfs2_meta_cache_item { + struct rb_node c_node; /* linkage in the per-inode rb-tree */ + sector_t c_block; /* block number; the tree is ordered by it */ +}; + +/* Slab cache that every ocfs2_meta_cache_item is allocated from. */ +static kmem_cache_t *ocfs2_uptodate_cachep = NULL; + +/* Put the inode's metadata cache into its empty state: inline-array + * mode with zero entries. Takes no lock itself. */ +void ocfs2_metadata_cache_init(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; + + oi->ip_flags |= OCFS2_INODE_CACHE_INLINE; + ci->ci_num_cached = 0; +} + +/* Erase and free every item in 'root', returning how many were freed. + * + * No lock taken here as 'root' is not expected to be visible to other + * processes. */ +static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root) +{ + unsigned int purged = 0; + struct rb_node *node; + struct ocfs2_meta_cache_item *item; + + while ((node = rb_last(root)) != NULL) { + item = rb_entry(node, struct ocfs2_meta_cache_item, c_node); + + mlog(0, "Purge item %llu\n", + (unsigned long long) item->c_block); + + rb_erase(&item->c_node, root); + kmem_cache_free(ocfs2_uptodate_cachep, item); + + purged++; + } + return purged; +} + +/* Called from locking and called from ocfs2_clear_inode. 
Dump the + * cache for a given inode. + * + * This function is a few more lines longer than necessary due to some + * accounting done here, but I think it's worth tracking down those + * bugs sooner -- Mark + * + * The cache is reset to the empty inline state under ip_lock; any tree + * items are freed afterwards, outside the lock. */ +void ocfs2_metadata_cache_purge(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + unsigned int tree, to_purge, purged; + struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; + struct rb_root root = RB_ROOT; + + spin_lock(&oi->ip_lock); + /* 'tree' is nonzero when the cache is NOT inline, i.e. an rb-tree */ + tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE); + to_purge = ci->ci_num_cached; + + mlog(0, "Purge %u %s items from Inode %"MLFu64"\n", to_purge, + tree ? "tree" : "array", oi->ip_blkno); + + /* If we're a tree, save off the root so that we can safely + * initialize the cache. We do the work to free tree members + * without the spinlock. */ + if (tree) + root = ci->ci_cache.ci_tree; + + ocfs2_metadata_cache_init(inode); + spin_unlock(&oi->ip_lock); + + purged = ocfs2_purge_copied_metadata_tree(&root); + /* If possible, track the number wiped so that we can more + * easily detect counting errors. Unfortunately, this is only + * meaningful for trees. */ + if (tree && purged != to_purge) + mlog(ML_ERROR, "Inode %"MLFu64", count = %u, purged = %u\n", + oi->ip_blkno, to_purge, purged); +} + +/* Returns the index in the cache array, -1 if not found. + * Requires ip_lock. */ +static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci, + sector_t item) +{ + int i; + + for (i = 0; i < ci->ci_num_cached; i++) { + if (item == ci->ci_cache.ci_array[i]) + return i; + } + + return -1; +} + +/* Returns the cache item if found, otherwise NULL. + * Requires ip_lock. 
*/ +static struct ocfs2_meta_cache_item * +ocfs2_search_cache_tree(struct ocfs2_caching_info *ci, + sector_t block) +{ + struct rb_node * n = ci->ci_cache.ci_tree.rb_node; + struct ocfs2_meta_cache_item *item = NULL; + + while (n) { + item = rb_entry(n, struct ocfs2_meta_cache_item, c_node); + + if (block < item->c_block) + n = n->rb_left; + else if (block > item->c_block) + n = n->rb_right; + else + return item; + } + + return NULL; +} + +/* Report whether bh->b_blocknr is currently tracked in oi's metadata + * cache, searching the inline array or the tree as selected by + * OCFS2_INODE_CACHE_INLINE. Takes and drops ip_lock around the lookup. */ +static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi, + struct buffer_head *bh) +{ + int index = -1; + struct ocfs2_meta_cache_item *item = NULL; + + spin_lock(&oi->ip_lock); + + mlog(0, "Inode %"MLFu64", query block %llu (inline = %u)\n", + oi->ip_blkno, (unsigned long long) bh->b_blocknr, + !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE)); + + if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) + index = ocfs2_search_cache_array(&oi->ip_metadata_cache, + bh->b_blocknr); + else + item = ocfs2_search_cache_tree(&oi->ip_metadata_cache, + bh->b_blocknr); + + spin_unlock(&oi->ip_lock); + + mlog(0, "index = %d, item = %p\n", index, item); + + return (index != -1) || (item != NULL); +} + +/* Warning: even if it returns true, this does *not* guarantee that + * the block is stored in our inode metadata cache. */ +int ocfs2_buffer_uptodate(struct inode *inode, + struct buffer_head *bh) +{ + /* Doesn't matter if the bh is in our cache or not -- if it's + * not marked uptodate then we know it can't have correct + * data. */ + if (!buffer_uptodate(bh)) + return 0; + + /* OCFS2 does not allow multiple nodes to be changing the same + * block at the same time. */ + if (buffer_jbd(bh)) + return 1; + + /* Ok, locally the buffer is marked as up to date, now search + * our cache to see if we can trust that. 
*/ + return ocfs2_buffer_cached(OCFS2_I(inode), bh); +} + +/* Store 'block' at position ci_num_cached of the inline array and bump + * the count; BUGs if the array is already full. + * Requires ip_lock */ +static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci, + sector_t block) +{ + BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY); + + mlog(0, "block %llu takes position %u\n", (unsigned long long) block, + ci->ci_num_cached); + + ci->ci_cache.ci_array[ci->ci_num_cached] = block; + ci->ci_num_cached++; +} + +/* Link 'new' into the tree at the position given by new->c_block and + * bump ci_num_cached. + * + * By now the caller should have checked that the item does *not* + * exist in the tree. + * Requires ip_lock. */ +static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci, + struct ocfs2_meta_cache_item *new) +{ + sector_t block = new->c_block; + struct rb_node *parent = NULL; + struct rb_node **p = &ci->ci_cache.ci_tree.rb_node; + struct ocfs2_meta_cache_item *tmp; + + mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block, + ci->ci_num_cached); + + while(*p) { + parent = *p; + + tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node); + + if (block < tmp->c_block) + p = &(*p)->rb_left; + else if (block > tmp->c_block) + p = &(*p)->rb_right; + else { + /* This should never happen! */ + mlog(ML_ERROR, "Duplicate block %llu cached!\n", + (unsigned long long) block); + BUG(); + } + } + + rb_link_node(&new->c_node, parent, p); + rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree); + ci->ci_num_cached++; +} + +/* Nonzero while the cache is still an inline array with at least one + * free slot. The caller must hold ip_lock (asserted below). */ +static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi, + struct ocfs2_caching_info *ci) +{ + assert_spin_locked(&oi->ip_lock); + + return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) && + (ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY); +} + +/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the + * pointers in tree after we use them - this allows caller to detect + * when to free in case of error. 
*/ +static void ocfs2_expand_cache(struct ocfs2_inode_info *oi, + struct ocfs2_meta_cache_item **tree) +{ + int i; + struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; + + mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY, + "Inode %"MLFu64", num cached = %u, should be %u\n", + oi->ip_blkno, ci->ci_num_cached, + OCFS2_INODE_MAX_CACHE_ARRAY); + mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), + "Inode %"MLFu64" not marked as inline anymore!\n", + oi->ip_blkno); + assert_spin_locked(&oi->ip_lock); + + /* Be careful to initialize the tree members *first* because + * once the ci_tree is used, the array is junk... */ + for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) + tree[i]->c_block = ci->ci_cache.ci_array[i]; + + oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE; + ci->ci_cache.ci_tree = RB_ROOT; + /* this will be set again by __ocfs2_insert_cache_tree */ + ci->ci_num_cached = 0; + + for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { + __ocfs2_insert_cache_tree(ci, tree[i]); + tree[i] = NULL; + } + + mlog(0, "Expanded %"MLFu64" to a tree cache: flags 0x%x, num = %u\n", + oi->ip_blkno, oi->ip_flags, ci->ci_num_cached); +} + +/* Slow path function - memory allocation is necessary. See the + * comment above ocfs2_set_buffer_uptodate for more information. + * + * Every item is allocated with GFP_KERNEL before ip_lock is taken; + * whatever ends up unused is freed at out_free. If an allocation + * fails the block is simply not inserted. */ +static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi, + sector_t block, + int expand_tree) +{ + int i; + struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; + struct ocfs2_meta_cache_item *new = NULL; + struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] = + { NULL, }; + + mlog(0, "Inode %"MLFu64", block %llu, expand = %d\n", + oi->ip_blkno, (unsigned long long) block, expand_tree); + + new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_KERNEL); + if (!new) { + mlog_errno(-ENOMEM); + return; + } + new->c_block = block; + + if (expand_tree) { + /* Do *not* allocate an array here - the removal code + * has no way of tracking that. 
*/ + for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { + tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep, + GFP_KERNEL); + if (!tree[i]) { + mlog_errno(-ENOMEM); + goto out_free; + } + + /* These are initialized in ocfs2_expand_cache! */ + } + } + + spin_lock(&oi->ip_lock); + if (ocfs2_insert_can_use_array(oi, ci)) { + mlog(0, "Someone cleared the tree underneath us\n"); + /* Ok, items were removed from the cache in between + * locks. Detect this and revert back to the fast path */ + ocfs2_append_cache_array(ci, block); + spin_unlock(&oi->ip_lock); + goto out_free; + } + + if (expand_tree) + ocfs2_expand_cache(oi, tree); + + __ocfs2_insert_cache_tree(ci, new); + spin_unlock(&oi->ip_lock); + + /* 'new' now belongs to the tree - keep out_free from freeing it */ + new = NULL; +out_free: + if (new) + kmem_cache_free(ocfs2_uptodate_cachep, new); + + /* If these were used, then ocfs2_expand_cache re-set them to + * NULL for us. tree[] is filled from index 0 upwards, so a NULL + * first slot means there is nothing left to free. */ + if (tree[0]) { + for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) + if (tree[i]) + kmem_cache_free(ocfs2_uptodate_cachep, + tree[i]); + } +} + +/* Item insertion is guarded by ip_io_sem, so the insertion path takes + * advantage of this by not rechecking for a duplicate insert during + * the slow case. Additionally, if the cache needs to be bumped up to + * a tree, the code will not recheck after acquiring the lock -- + * multiple paths cannot be expanding to a tree at the same time. + * + * The slow path takes into account that items can be removed + * (including the whole tree wiped and reset) when this process is out + * allocating memory. In those cases, it reverts back to the fast + * path. + * + * Note that this function may actually fail to insert the block if + * memory cannot be allocated. 
This is not fatal however (but may + * result in a performance penalty) */ +void ocfs2_set_buffer_uptodate(struct inode *inode, + struct buffer_head *bh) +{ + int expand; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; + + /* The block may very well exist in our cache already, so avoid + * doing any more work in that case. */ + if (ocfs2_buffer_cached(oi, bh)) + return; + + mlog(0, "Inode %"MLFu64", inserting block %llu\n", oi->ip_blkno, + (unsigned long long) bh->b_blocknr); + + /* No need to recheck under spinlock - insertion is guarded by + * ip_io_sem */ + spin_lock(&oi->ip_lock); + if (ocfs2_insert_can_use_array(oi, ci)) { + /* Fast case - it's an array and there's a free + * spot. */ + ocfs2_append_cache_array(ci, bh->b_blocknr); + spin_unlock(&oi->ip_lock); + return; + } + + /* Not the fast case: either the inline array is full (expand to a + * tree) or the cache already is a tree (plain tree insert). */ + expand = 0; + if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) { + /* We need to bump things up to a tree. */ + expand = 1; + } + /* drop ip_lock first - the slow path allocates memory */ + spin_unlock(&oi->ip_lock); + + __ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand); +} + +/* Called against a newly allocated buffer. Most likely nobody should + * be able to read this sort of metadata while it's still being + * allocated, but this is careful to take ip_io_sem anyway. */ +void ocfs2_set_new_buffer_uptodate(struct inode *inode, + struct buffer_head *bh) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + /* This should definitely *not* exist in our cache */ + BUG_ON(ocfs2_buffer_cached(oi, bh)); + + /* mark the bh itself valid; the call below only records its + * block number in the inode's cache */ + set_buffer_uptodate(bh); + + down(&oi->ip_io_sem); + ocfs2_set_buffer_uptodate(inode, bh); + up(&oi->ip_io_sem); +} + +/* Requires ip_lock. 
*/ +static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci, + int index) +{ + sector_t *array = ci->ci_cache.ci_array; + int bytes; + + BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY); + BUG_ON(index >= ci->ci_num_cached); + BUG_ON(!ci->ci_num_cached); + + mlog(0, "remove index %d (num_cached = %u)\n", index, + ci->ci_num_cached); + + ci->ci_num_cached--; + + /* don't need to copy if the array is now empty, or if we + * removed at the tail */ + if (ci->ci_num_cached && index < ci->ci_num_cached) { + /* ci_num_cached is already decremented, so this counts + * exactly the entries that sat after 'index' */ + bytes = sizeof(sector_t) * (ci->ci_num_cached - index); + memmove(&array[index], &array[index + 1], bytes); + } +} + +/* Unlink 'item' from the tree and drop the cached count. The item + * itself is not freed here. + * Requires ip_lock. */ +static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci, + struct ocfs2_meta_cache_item *item) +{ + mlog(0, "remove block %llu from tree\n", + (unsigned long long) item->c_block); + + rb_erase(&item->c_node, &ci->ci_cache.ci_tree); + ci->ci_num_cached--; +} + +/* Called when we remove a chunk of metadata from an inode. We don't + * bother reverting things to an inlined array in the case of a remove + * which moves us back under the limit. 
*/ +void ocfs2_remove_from_cache(struct inode *inode, + struct buffer_head *bh) +{ + int index; + sector_t block = bh->b_blocknr; + struct ocfs2_meta_cache_item *item = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; + + spin_lock(&oi->ip_lock); + mlog(0, "Inode %"MLFu64", remove %llu, items = %u, array = %u\n", + oi->ip_blkno, (unsigned long long) block, ci->ci_num_cached, + oi->ip_flags & OCFS2_INODE_CACHE_INLINE); + + if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) { + index = ocfs2_search_cache_array(ci, block); + if (index != -1) + ocfs2_remove_metadata_array(ci, index); + } else { + item = ocfs2_search_cache_tree(ci, block); + if (item) + ocfs2_remove_metadata_tree(ci, item); + } + spin_unlock(&oi->ip_lock); + + /* a tree item was only unlinked above; free it now that ip_lock + * has been dropped */ + if (item) + kmem_cache_free(ocfs2_uptodate_cachep, item); +} + +/* Create the slab cache backing ocfs2_meta_cache_item. Returns 0 on + * success or -ENOMEM if the cache could not be created. */ +int __init init_ocfs2_uptodate_cache(void) +{ + ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate", + sizeof(struct ocfs2_meta_cache_item), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!ocfs2_uptodate_cachep) + return -ENOMEM; + + mlog(0, "%u inlined cache items per inode.\n", + OCFS2_INODE_MAX_CACHE_ARRAY); + + return 0; +} + +/* Destroy the slab cache, if one was created. */ +void __exit exit_ocfs2_uptodate_cache(void) +{ + if (ocfs2_uptodate_cachep) + kmem_cache_destroy(ocfs2_uptodate_cachep); +} diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h new file mode 100644 index 00000000000..e5aacdf4eab --- /dev/null +++ b/fs/ocfs2/uptodate.h @@ -0,0 +1,44 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * uptodate.h + * + * Cluster uptodate tracking + * + * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_UPTODATE_H +#define OCFS2_UPTODATE_H + +int __init init_ocfs2_uptodate_cache(void); +void __exit exit_ocfs2_uptodate_cache(void); + +void ocfs2_metadata_cache_init(struct inode *inode); +void ocfs2_metadata_cache_purge(struct inode *inode); + +int ocfs2_buffer_uptodate(struct inode *inode, + struct buffer_head *bh); +void ocfs2_set_buffer_uptodate(struct inode *inode, + struct buffer_head *bh); +void ocfs2_set_new_buffer_uptodate(struct inode *inode, + struct buffer_head *bh); +void ocfs2_remove_from_cache(struct inode *inode, + struct buffer_head *bh); + +#endif /* OCFS2_UPTODATE_H */ diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c new file mode 100644 index 00000000000..5405ce121c9 --- /dev/null +++ b/fs/ocfs2/ver.c @@ -0,0 +1,43 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ver.c + * + * version string + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/kernel.h> + +#include "ver.h" + +#define OCFS2_BUILD_VERSION "1.3.3" + +#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION + +/* Print the version banner (VERSION_STR) at KERN_INFO. */ +void ocfs2_print_version(void) +{ + printk(KERN_INFO "%s\n", VERSION_STR); +} + +MODULE_DESCRIPTION(VERSION_STR); + +MODULE_VERSION(OCFS2_BUILD_VERSION); diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h new file mode 100644 index 00000000000..d7395cb91d2 --- /dev/null +++ b/fs/ocfs2/ver.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ver.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_VER_H +#define OCFS2_VER_H + +void ocfs2_print_version(void); + +#endif /* OCFS2_VER_H */ diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c new file mode 100644 index 00000000000..021978e0576 --- /dev/null +++ b/fs/ocfs2/vote.c @@ -0,0 +1,1202 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * vote.c + * + * Inter-node vote messaging: builds and broadcasts delete, unlink, + * rename, mount and umount vote requests to the other mounted nodes, + * and processes the vote requests and responses received from them. + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/smp_lock.h> +#include <linux/kthread.h> + +#include <cluster/heartbeat.h> +#include <cluster/nodemanager.h> +#include <cluster/tcp.h> + +#include <dlm/dlmapi.h> + +#define MLOG_MASK_PREFIX ML_VOTE +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "slot_map.h" +#include "vote.h" + +#include "buffer_head_io.h" +/* o2net message types handled by this file */ +#define OCFS2_MESSAGE_TYPE_VOTE (0x1) +#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2) +struct ocfs2_msg_hdr /* header common to vote requests and responses */ +{ + __be32 h_response_id; /* used to lookup message handle on sending + * node. 
*/ + __be32 h_request; /* enum ocfs2_vote_request value (left zero in responses) */ + __be64 h_blkno; /* block number of the inode voted on; 0 for mount and umount votes */ + __be32 h_generation; /* generation of that inode */ + __be32 h_node_num; /* node sending this particular message. */ +}; + +/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this + * for the network. */ +#define OCFS2_VOTE_FILENAME_LEN 256 +struct ocfs2_vote_msg +{ + struct ocfs2_msg_hdr v_hdr; + union { + __be32 v_generic1; + __be32 v_orphaned_slot; /* Used during delete votes */ + __be32 v_nlink; /* Used during unlink votes */ + } md1; /* Message type dependent 1 */ + __be32 v_unlink_namelen; /* byte length of the name in v_unlink_dirent */ + __be64 v_unlink_parent; /* block number of the parent directory */ + u8 v_unlink_dirent[OCFS2_VOTE_FILENAME_LEN]; /* name being unlinked or renamed */ +}; + +/* Responses are given these values to maintain backwards + * compatibility with older ocfs2 versions */ +#define OCFS2_RESPONSE_OK (0) +#define OCFS2_RESPONSE_BUSY (-16) +#define OCFS2_RESPONSE_BAD_MSG (-22) + +struct ocfs2_response_msg +{ + struct ocfs2_msg_hdr r_hdr; + __be32 r_response; /* one of the OCFS2_RESPONSE_* values */ + __be32 r_orphaned_slot; /* orphaned slot as known to the responder; set for delete votes */ +}; + +struct ocfs2_vote_work { + struct list_head w_list; /* entry on osb->vote_list */ + struct ocfs2_vote_msg w_msg; /* copy of the received vote message */ +}; + +enum ocfs2_vote_request { + OCFS2_VOTE_REQ_INVALID = 0, + OCFS2_VOTE_REQ_DELETE, + OCFS2_VOTE_REQ_UNLINK, + OCFS2_VOTE_REQ_RENAME, + OCFS2_VOTE_REQ_MOUNT, + OCFS2_VOTE_REQ_UMOUNT, + OCFS2_VOTE_REQ_LAST /* sentinel, not a valid request */ +}; +/* Nonzero if request lies strictly between OCFS2_VOTE_REQ_INVALID and + * OCFS2_VOTE_REQ_LAST. */ +static inline int ocfs2_is_valid_vote_request(int request) +{ + return OCFS2_VOTE_REQ_INVALID < request && + request < OCFS2_VOTE_REQ_LAST; +} + +typedef void (*ocfs2_net_response_callback)(void *priv, + struct ocfs2_response_msg *resp); +struct ocfs2_net_response_cb { + ocfs2_net_response_callback rc_cb; /* invoked for each response received */ + void *rc_priv; /* handed to rc_cb as priv */ +}; + +struct ocfs2_net_wait_ctxt { + struct list_head n_list; /* entry on osb->net_response_list */ + u32 n_response_id; /* matched against h_response_id of incoming responses */ + wait_queue_head_t n_event; /* woken when n_node_map becomes empty */ + struct ocfs2_node_map n_node_map; /* nodes we still await a response from */ + int n_response; /* an aggregate response. 0 if + * all nodes are go, < 0 on any + * negative response from any + * node or network error. 
*/ + struct ocfs2_net_response_cb *n_callback; /* may be NULL */ +}; +/* Handle a MOUNT vote from node_num: call ocfs2_recovery_map_clear for it, + * set it in mounted_map and clear it from umount_map. */ +static void ocfs2_process_mount_request(struct ocfs2_super *osb, + unsigned int node_num) +{ + mlog(0, "MOUNT vote from node %u\n", node_num); + /* The other node only sends us this message when he has an EX + * on the superblock, so our recovery threads (if having been + * launched) are waiting on it.*/ + ocfs2_recovery_map_clear(osb, node_num); + ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num); + + /* We clear the umount map here because a node may have been + * previously mounted, safely unmounted but never stopped + * heartbeating - in which case we'd have a stale entry. */ + ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num); +} +/* Handle an UMOUNT vote: move node_num from mounted_map to umount_map. */ +static void ocfs2_process_umount_request(struct ocfs2_super *osb, + unsigned int node_num) +{ + mlog(0, "UMOUNT vote from node %u\n", node_num); + ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num); + ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num); +} +/* Flag the inode as deleted by another node. Caller must hold ip_lock. */ +void ocfs2_mark_inode_remotely_deleted(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + assert_spin_locked(&oi->ip_lock); + /* We set the SKIP_DELETE flag on the inode so we don't try to + * delete it in delete_inode ourselves, thus avoiding + * unnecessary lock pinging. If the other node failed to wipe + * the inode as a result of a crash, then recovery will pick + * up the slack. */ + oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE; +} +/* Decide our vote on a remote delete of inode. Returns OCFS2_RESPONSE_OK + * when the delete may proceed and OCFS2_RESPONSE_BUSY otherwise. + * *orphaned_slot is both input (the slot the requester believes in) and + * output (our recorded slot when the requester sent OCFS2_INVALID_SLOT). */ +static int ocfs2_process_delete_request(struct inode *inode, + int *orphaned_slot) +{ + int response = OCFS2_RESPONSE_BUSY; + + mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n", + inode->i_ino, inode->i_nlink, *orphaned_slot); + + spin_lock(&OCFS2_I(inode)->ip_lock); + + /* Whatever our vote response is, we want to make sure that + * the orphaned slot is recorded properly on this node *and* + * on the requesting node. 
Technically, if the requesting node + * did not know which slot the inode is orphaned in but we + * respond with BUSY he doesn't actually need the orphaned + * slot, but it doesn't hurt to do it here anyway. */ + if ((*orphaned_slot) != OCFS2_INVALID_SLOT) { + mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != + OCFS2_INVALID_SLOT && + OCFS2_I(inode)->ip_orphaned_slot != + (*orphaned_slot), + "Inode %"MLFu64": This node thinks it's " + "orphaned in slot %d, messaged it's in %d\n", + OCFS2_I(inode)->ip_blkno, + OCFS2_I(inode)->ip_orphaned_slot, + *orphaned_slot); + + mlog(0, "Setting orphaned slot for inode %"MLFu64" to %d\n", + OCFS2_I(inode)->ip_blkno, *orphaned_slot); + + OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot; + } else { + mlog(0, "Sending back orphaned slot %d for inode %"MLFu64"\n", + OCFS2_I(inode)->ip_orphaned_slot, + OCFS2_I(inode)->ip_blkno); + + *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; + } + + /* vote no if the file is still open. */ + if (OCFS2_I(inode)->ip_open_count) { + mlog(0, "open count = %u\n", + OCFS2_I(inode)->ip_open_count); + spin_unlock(&OCFS2_I(inode)->ip_lock); + goto done; + } + spin_unlock(&OCFS2_I(inode)->ip_lock); + + /* directories are a bit ugly... What if someone is sitting in + * it? We want to make sure the inode is removed completely as + * a result of the iput in process_vote. */ + if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) { + mlog(0, "i_count = %u\n", atomic_read(&inode->i_count)); + goto done; + } + /* Start writeback of dirty pages (a failure here keeps the BUSY vote), + * then sync the mapping buffers and drop the cached pages and extent + * map of this inode. */ + if (filemap_fdatawrite(inode->i_mapping)) { + mlog(ML_ERROR, "Could not sync inode %"MLFu64" for delete!\n", + OCFS2_I(inode)->ip_blkno); + goto done; + } + sync_mapping_buffers(inode->i_mapping); + truncate_inode_pages(inode->i_mapping, 0); + ocfs2_extent_map_trunc(inode, 0); + + spin_lock(&OCFS2_I(inode)->ip_lock); + /* double check open count - someone might have raced this + * thread into ocfs2_file_open while we were writing out + * data. 
If we're to allow a wipe of this inode now, we *must* + * hold the spinlock until we've marked it. */ + if (OCFS2_I(inode)->ip_open_count) { + mlog(0, "Raced to wipe! open count = %u\n", + OCFS2_I(inode)->ip_open_count); + spin_unlock(&OCFS2_I(inode)->ip_lock); + goto done; + } + + /* Mark the inode as being wiped from disk. */ + ocfs2_mark_inode_remotely_deleted(inode); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + /* Not sure this is necessary anymore. */ + d_prune_aliases(inode); + + /* If we get here, then we're voting 'yes', so commit the + * delete on our side. */ + response = OCFS2_RESPONSE_OK; +done: + return response; +} +/* Return 1 if dentry lives in the directory whose block number is + * parent_blkno and its name equals the namelen bytes at name, else 0. */ +static int ocfs2_match_dentry(struct dentry *dentry, + u64 parent_blkno, + unsigned int namelen, + const char *name) +{ + struct inode *parent; + + if (!dentry->d_parent) { + mlog(0, "Detached from parent.\n"); + return 0; + } + + parent = dentry->d_parent->d_inode; + /* Negative parent dentry? */ + if (!parent) + return 0; + + /* Name is in a different directory. */ + if (OCFS2_I(parent)->ip_blkno != parent_blkno) + return 0; + + if (dentry->d_name.len != namelen) + return 0; + + /* comparison above guarantees this is safe. */ + if (memcmp(dentry->d_name.name, name, namelen)) + return 0; + + return 1; +} +/* Act on a remote unlink or rename of one name of inode: delete the + * matching local dentry if there is one and, for unlink votes reporting a + * new link count of zero, flag the inode OCFS2_INODE_MAYBE_ORPHANED. */ +static void ocfs2_process_dentry_request(struct inode *inode, + int rename, + unsigned int new_nlink, + u64 parent_blkno, + unsigned int namelen, + const char *name) +{ + struct dentry *dentry = NULL; + struct list_head *p; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + mlog(0, "parent %"MLFu64", namelen = %u, name = %.*s\n", parent_blkno, + namelen, namelen, name); + + spin_lock(&dcache_lock); + + /* Another node is removing this name from the system. It is + * up to us to find the corresponding dentry and if it exists, + * unhash it from the dcache. 
*/ + list_for_each(p, &inode->i_dentry) { + dentry = list_entry(p, struct dentry, d_alias); + + if (ocfs2_match_dentry(dentry, parent_blkno, namelen, name)) { + mlog(0, "dentry found: %.*s\n", + dentry->d_name.len, dentry->d_name.name); + + dget_locked(dentry); + break; + } + + dentry = NULL; /* no match; leave NULL if the list ends here */ + } + + spin_unlock(&dcache_lock); + /* Delete the matched name and drop the reference taken by dget_locked. */ + if (dentry) { + d_delete(dentry); + dput(dentry); + } + + /* rename votes don't send link counts */ + if (!rename) { + mlog(0, "new_nlink = %u\n", new_nlink); + + /* We don't have the proper locks here to directly + * change i_nlink and besides, the vote is sent + * *before* the operation so it may have failed on the + * other node. This passes a hint to ocfs2_drop_inode + * to force ocfs2_delete_inode, who will take the + * proper cluster locks to sort things out. */ + if (new_nlink == 0) { + spin_lock(&oi->ip_lock); + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&OCFS2_I(inode)->ip_lock); /* same lock as oi->ip_lock taken above */ + } + } +} +/* Process one queued vote message: decode it, act on the request and send + * a response message back to the requesting node. */ +static void ocfs2_process_vote(struct ocfs2_super *osb, + struct ocfs2_vote_msg *msg) +{ + int net_status, vote_response; + int orphaned_slot = 0; + int rename = 0; + unsigned int node_num, generation, new_nlink, namelen; + u64 blkno, parent_blkno; + enum ocfs2_vote_request request; + struct inode *inode = NULL; + struct ocfs2_msg_hdr *hdr = &msg->v_hdr; + struct ocfs2_response_msg response; + + /* decode the network mumbo jumbo into local variables. 
*/ + request = be32_to_cpu(hdr->h_request); + blkno = be64_to_cpu(hdr->h_blkno); + generation = be32_to_cpu(hdr->h_generation); + node_num = be32_to_cpu(hdr->h_node_num); + if (request == OCFS2_VOTE_REQ_DELETE) + orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot); + + mlog(0, "processing vote: request = %u, blkno = %"MLFu64", " + "generation = %u, node_num = %u, priv1 = %u\n", request, + blkno, generation, node_num, be32_to_cpu(msg->md1.v_generic1)); + + if (!ocfs2_is_valid_vote_request(request)) { + mlog(ML_ERROR, "Invalid vote request %d from node %u\n", + request, node_num); + vote_response = OCFS2_RESPONSE_BAD_MSG; + goto respond; + } + + vote_response = OCFS2_RESPONSE_OK; + /* Mount and umount votes carry no inode; handle them and respond at once. */ + switch (request) { + case OCFS2_VOTE_REQ_UMOUNT: + ocfs2_process_umount_request(osb, node_num); + goto respond; + case OCFS2_VOTE_REQ_MOUNT: + ocfs2_process_mount_request(osb, node_num); + goto respond; + default: + /* inode requests are handled further down; this case also avoids a gcc warning */ + break; + } + + /* We cannot process the remaining message types before we're + * fully mounted. It's perfectly safe however to send a 'yes' + * response as we can't possibly have any of the state they're + * asking us to modify yet. */ + if (atomic_read(&osb->vol_state) == VOLUME_INIT) + goto respond; + + /* If we get here, then the request is against an inode. */ + inode = ocfs2_ilookup_for_vote(osb, blkno, + request == OCFS2_VOTE_REQ_DELETE); + + /* Not finding the inode is perfectly valid - it means we're + * not interested in what the other node is about to do to it + * so in those cases we automatically respond with an + * affirmative. Cluster locking ensures that we won't race + * interest in the inode with this vote request. */ + if (!inode) + goto respond; + + /* Check generation values. It's possible for us to get a + * request against a stale inode. If so then we proceed as if + * we had not found an inode in the first place. 
*/ + if (inode->i_generation != generation) { + mlog(0, "generation passed %u != inode generation = %u, " + "ip_flags = %x, ip_blkno = %"MLFu64", msg %"MLFu64", " + "i_count = %u, message type = %u\n", + generation, inode->i_generation, OCFS2_I(inode)->ip_flags, + OCFS2_I(inode)->ip_blkno, blkno, + atomic_read(&inode->i_count), request); + iput(inode); + inode = NULL; /* so the final iput after the response is skipped */ + goto respond; + } + + switch (request) { + case OCFS2_VOTE_REQ_DELETE: + vote_response = ocfs2_process_delete_request(inode, + &orphaned_slot); + break; + case OCFS2_VOTE_REQ_RENAME: + rename = 1; + /* fall through */ + case OCFS2_VOTE_REQ_UNLINK: + parent_blkno = be64_to_cpu(msg->v_unlink_parent); + namelen = be32_to_cpu(msg->v_unlink_namelen); /* NOTE(review): not range-checked against OCFS2_VOTE_FILENAME_LEN here */ + /* new_nlink will be ignored in case of a rename vote */ + new_nlink = be32_to_cpu(msg->md1.v_nlink); + ocfs2_process_dentry_request(inode, rename, new_nlink, + parent_blkno, namelen, + msg->v_unlink_dirent); + break; + default: + mlog(ML_ERROR, "node %u, invalid request: %u\n", + node_num, request); + vote_response = OCFS2_RESPONSE_BAD_MSG; + } + +respond: + /* Response structure is small so we just put it on the stack + * and stuff it inline. */ + memset(&response, 0, sizeof(struct ocfs2_response_msg)); + response.r_hdr.h_response_id = hdr->h_response_id; /* id, blkno and generation are echoed as received, still big-endian */ + response.r_hdr.h_blkno = hdr->h_blkno; + response.r_hdr.h_generation = hdr->h_generation; + response.r_hdr.h_node_num = cpu_to_be32(osb->node_num); + response.r_response = cpu_to_be32(vote_response); + response.r_orphaned_slot = cpu_to_be32(orphaned_slot); + + net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE, + osb->net_key, + &response, + sizeof(struct ocfs2_response_msg), + node_num, + NULL); + /* We still want to error print for ENOPROTOOPT here. 
The + * sending node shouldn't have unregistered his net handler + * without sending an unmount vote 1st */ + if (net_status < 0 + && net_status != -ETIMEDOUT + && net_status != -ENOTCONN) + mlog(ML_ERROR, "message to node %u fails with error %d!\n", + node_num, net_status); + + if (inode) + iput(inode); +} +/* One pass of the vote thread: service the blocked locks that were queued + * when the pass began, then drain the list of queued vote messages. */ +static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb) +{ + unsigned long processed; + struct ocfs2_lock_res *lockres; + struct ocfs2_vote_work *work; + + mlog_entry_void(); + + spin_lock(&osb->vote_task_lock); + /* grab this early so we know to try again if a state change and + * wake happens part-way through our work */ + osb->vote_work_sequence = osb->vote_wake_sequence; + + processed = osb->blocked_lock_count; /* snapshot; the loop below runs exactly this many times */ + while (processed) { + BUG_ON(list_empty(&osb->blocked_lock_list)); + + lockres = list_entry(osb->blocked_lock_list.next, + struct ocfs2_lock_res, l_blocked_list); + list_del_init(&lockres->l_blocked_list); + osb->blocked_lock_count--; + spin_unlock(&osb->vote_task_lock); + + BUG_ON(!processed); + processed--; + + ocfs2_process_blocked_lock(osb, lockres); + + spin_lock(&osb->vote_task_lock); + } + /* vote_task_lock is dropped while each message is processed and freed. */ + while (osb->vote_count) { + BUG_ON(list_empty(&osb->vote_list)); + work = list_entry(osb->vote_list.next, + struct ocfs2_vote_work, w_list); + list_del(&work->w_list); + osb->vote_count--; + spin_unlock(&osb->vote_task_lock); + + ocfs2_process_vote(osb, &work->w_msg); + kfree(work); + + spin_lock(&osb->vote_task_lock); + } + spin_unlock(&osb->vote_task_lock); + + mlog_exit_void(); +} +/* Nonzero when neither blocked locks nor vote messages are queued. */ +static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb) +{ + int empty = 0; + + spin_lock(&osb->vote_task_lock); + if (list_empty(&osb->blocked_lock_list) && + list_empty(&osb->vote_list)) + empty = 1; + + spin_unlock(&osb->vote_task_lock); + return empty; +} +/* Nonzero when vote_wake_sequence differs from the value the last work + * pass recorded in vote_work_sequence. */ +static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb) +{ + int should_wake = 0; + + spin_lock(&osb->vote_task_lock); + if (osb->vote_work_sequence != osb->vote_wake_sequence) + 
should_wake = 1; + spin_unlock(&osb->vote_task_lock); + + return should_wake; +} +/* Kernel thread body (arg is the ocfs2_super): sleep until woken with new + * work or asked to stop, then run one pass of work. Always returns 0. */ +int ocfs2_vote_thread(void *arg) +{ + int status = 0; + struct ocfs2_super *osb = arg; + + /* only quit once we've been asked to stop and there is no more + * work available */ + while (!(kthread_should_stop() && + ocfs2_vote_thread_lists_empty(osb))) { + + wait_event_interruptible(osb->vote_event, + ocfs2_vote_thread_should_wake(osb) || + kthread_should_stop()); + + mlog(0, "vote_thread: awoken\n"); + + ocfs2_vote_thread_do_work(osb); + } + + osb->vote_task = NULL; + return status; +} +/* Allocate a zeroed wait context for response_id. Returns NULL if the + * allocation fails. */ +static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id) +{ + struct ocfs2_net_wait_ctxt *w; + + w = kcalloc(1, sizeof(*w), GFP_KERNEL); + if (!w) { + mlog_errno(-ENOMEM); + goto bail; + } + + INIT_LIST_HEAD(&w->n_list); + init_waitqueue_head(&w->n_event); + ocfs2_node_map_init(&w->n_node_map); + w->n_response_id = response_id; + w->n_callback = NULL; +bail: + return w; +} +/* Hand out the next response id, serialized by net_response_lock. */ +static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb) +{ + unsigned int ret; + + spin_lock(&osb->net_response_lock); + ret = ++osb->net_response_ids; + spin_unlock(&osb->net_response_lock); + + return ret; +} +/* Unlink w from osb->net_response_list. */ +static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb, + struct ocfs2_net_wait_ctxt *w) +{ + spin_lock(&osb->net_response_lock); + list_del(&w->n_list); + spin_unlock(&osb->net_response_lock); +} +/* Append w to osb->net_response_list so incoming responses can find it. */ +static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb, + struct ocfs2_net_wait_ctxt *w) +{ + spin_lock(&osb->net_response_lock); + list_add_tail(&w->n_list, + &osb->net_response_list); + spin_unlock(&osb->net_response_lock); +} +/* Clear node_num in the pending map of w and wake the waiter once no node + * is outstanding. Caller must hold net_response_lock. */ +static void __ocfs2_mark_node_responded(struct ocfs2_super *osb, + struct ocfs2_net_wait_ctxt *w, + int node_num) +{ + assert_spin_locked(&osb->net_response_lock); + + ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num); + if (ocfs2_node_map_is_empty(osb, &w->n_node_map)) + wake_up(&w->n_event); +} + +/* Intended to be called from the 
node down callback, we fake remove + * the node from all our response contexts */ +void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, + int node_num) +{ + struct list_head *p; + struct ocfs2_net_wait_ctxt *w = NULL; + + spin_lock(&osb->net_response_lock); + + list_for_each(p, &osb->net_response_list) { + w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list); + + __ocfs2_mark_node_responded(osb, w, node_num); /* may wake the waiter of w */ + } + + spin_unlock(&osb->net_response_lock); +} +/* Send request to every other node in mounted_map and wait until each one + * has responded or been removed by ocfs2_remove_node_from_vote_queues(). + * On success returns 0 and stores the aggregate answer in *response; on a + * send failure returns a negative errno (-EAGAIN after a timeout). */ +static int ocfs2_broadcast_vote(struct ocfs2_super *osb, + struct ocfs2_vote_msg *request, + unsigned int response_id, + int *response, + struct ocfs2_net_response_cb *callback) +{ + int status, i, remote_err; + struct ocfs2_net_wait_ctxt *w = NULL; + int dequeued = 0; + + mlog_entry_void(); + + w = ocfs2_new_net_wait_ctxt(response_id); + if (!w) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + w->n_callback = callback; /* may be NULL */ + + /* we're pretty much ready to go at this point, and this fills + * in n_response which we need anyway... 
*/ + ocfs2_queue_net_wait_ctxt(osb, w); + + i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0); + + while (i != O2NM_INVALID_NODE_NUM) { + if (i != osb->node_num) { + mlog(0, "trying to send request to node %i\n", i); + ocfs2_node_map_set_bit(osb, &w->n_node_map, i); + + remote_err = 0; + status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE, + osb->net_key, + request, + sizeof(*request), + i, + &remote_err); + if (status == -ETIMEDOUT) { + mlog(0, "remote node %d timed out!\n", i); + status = -EAGAIN; /* callers loop and resend on -EAGAIN */ + goto bail; + } + if (remote_err < 0) { + status = remote_err; + mlog(0, "remote error %d on node %d!\n", + remote_err, i); + mlog_errno(status); + goto bail; + } + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + i++; /* continue the scan after the node just visited */ + i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i); + mlog(0, "next is %d, i am %d\n", i, osb->node_num); + } + mlog(0, "done sending, now waiting on responses...\n"); + + wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map)); + + ocfs2_dequeue_net_wait_ctxt(osb, w); + dequeued = 1; + + *response = w->n_response; + status = 0; +bail: + if (w) { + if (!dequeued) + ocfs2_dequeue_net_wait_ctxt(osb, w); + kfree(w); /* NOTE(review): ocfs2_handle_response_message drops net_response_lock around its callback while still using w - confirm w cannot be freed here meanwhile */ + } + + mlog_exit(status); + return status; +} +/* Allocate a zeroed vote message and fill in its header (sender node, + * request type, blkno, generation) plus the type dependent word priv, all + * converted to big-endian. Returns NULL if the allocation fails. + * h_response_id is not set here. */ +static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, + u64 blkno, + unsigned int generation, + enum ocfs2_vote_request type, + u32 priv) +{ + struct ocfs2_vote_msg *request; + struct ocfs2_msg_hdr *hdr; + + BUG_ON(!ocfs2_is_valid_vote_request(type)); + + request = kcalloc(1, sizeof(*request), GFP_KERNEL); + if (!request) { + mlog_errno(-ENOMEM); + } else { + hdr = &request->v_hdr; + hdr->h_node_num = cpu_to_be32(osb->node_num); + hdr->h_request = cpu_to_be32(type); + hdr->h_blkno = cpu_to_be64(blkno); + hdr->h_generation = cpu_to_be32(generation); + + request->md1.v_generic1 = cpu_to_be32(priv); + } + + return request; +} + +/* Complete the buildup of a new vote request and process the + * broadcast return value. 
*/ +static int ocfs2_do_request_vote(struct ocfs2_super *osb, + struct ocfs2_vote_msg *request, + struct ocfs2_net_response_cb *callback) +{ + int status, response; + unsigned int response_id; + struct ocfs2_msg_hdr *hdr; + + response_id = ocfs2_new_response_id(osb); + + hdr = &request->v_hdr; + hdr->h_response_id = cpu_to_be32(response_id); + + status = ocfs2_broadcast_vote(osb, request, response_id, &response, + callback); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = response; /* aggregate answer collected by the broadcast */ +bail: + + return status; +} +/* Send an inode vote. Returns 0 at once when ocfs2_inode_is_new() is true + * for the inode; otherwise takes the super lock, broadcasts unless we are + * the only mounted node, and retries while the result is -EAGAIN. Returns + * -ERESTARTSYS if a signal is pending and OCFS2_MOUNT_NOINTR is not set. */ +static int ocfs2_request_vote(struct inode *inode, + struct ocfs2_vote_msg *request, + struct ocfs2_net_response_cb *callback) +{ + int status; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (ocfs2_inode_is_new(inode)) + return 0; + + status = -EAGAIN; + while (status == -EAGAIN) { + if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) && + signal_pending(current)) + return -ERESTARTSYS; + + status = ocfs2_super_lock(osb, 0); + if (status < 0) { + mlog_errno(status); + break; + } + + status = 0; + if (!ocfs2_node_map_is_only(osb, &osb->mounted_map, + osb->node_num)) + status = ocfs2_do_request_vote(osb, request, callback); + + ocfs2_super_unlock(osb, 0); + } + return status; +} +/* Response callback for delete votes; priv is the inode being deleted. + * Records the orphaned slot reported by the responding node, if any. */ +static void ocfs2_delete_response_cb(void *priv, + struct ocfs2_response_msg *resp) +{ + int orphaned_slot, node; + struct inode *inode = priv; + + orphaned_slot = be32_to_cpu(resp->r_orphaned_slot); + node = be32_to_cpu(resp->r_hdr.h_node_num); + mlog(0, "node %d tells us that inode %"MLFu64" is orphaned in slot " + "%d\n", node, OCFS2_I(inode)->ip_blkno, orphaned_slot); + + /* The other node may not actually know which slot the inode + * is orphaned in. */ + if (orphaned_slot == OCFS2_INVALID_SLOT) + return; + + /* Ok, the responding node knows which slot this inode is + * orphaned in. We verify that the information is correct and + * then record this in the inode. ocfs2_delete_inode will use + * this information to determine which lock to take. 
*/ + spin_lock(&OCFS2_I(inode)->ip_lock); + mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot && + OCFS2_I(inode)->ip_orphaned_slot + != OCFS2_INVALID_SLOT, "Inode %"MLFu64": Node %d " + "says it's orphaned in slot %d, we think it's in %d\n", + OCFS2_I(inode)->ip_blkno, + be32_to_cpu(resp->r_hdr.h_node_num), + orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot); + + OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot; + spin_unlock(&OCFS2_I(inode)->ip_lock); +} +/* Ask the other nodes whether inode may be deleted, sending along the + * orphaned slot currently recorded for it. Returns the vote result (0 if + * nobody objected) or a negative errno. */ +int ocfs2_request_delete_vote(struct inode *inode) +{ + int orphaned_slot, status; + struct ocfs2_net_response_cb delete_cb; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_vote_msg *request; + + spin_lock(&OCFS2_I(inode)->ip_lock); + orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; + spin_unlock(&OCFS2_I(inode)->ip_lock); + + delete_cb.rc_cb = ocfs2_delete_response_cb; + delete_cb.rc_priv = inode; + + mlog(0, "Inode %"MLFu64", we start thinking orphaned slot is %d\n", + OCFS2_I(inode)->ip_blkno, orphaned_slot); + + status = -ENOMEM; /* result if the request allocation fails */ + request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, + inode->i_generation, + OCFS2_VOTE_REQ_DELETE, orphaned_slot); + if (request) { + status = ocfs2_request_vote(inode, request, &delete_cb); + + kfree(request); + } + + return status; +} +/* Fill in the unlink and rename specific fields of request from dentry. */ +static void ocfs2_setup_unlink_vote(struct ocfs2_vote_msg *request, + struct dentry *dentry) +{ + struct inode *parent = dentry->d_parent->d_inode; + + /* We need some values which will uniquely identify a dentry + * on the other nodes so that they can find it and run + * d_delete against it. Parent directory block and full name + * should suffice. 
*/ + + mlog(0, "unlink/rename request: parent: %"MLFu64" name: %.*s\n", + OCFS2_I(parent)->ip_blkno, dentry->d_name.len, + dentry->d_name.name); + + request->v_unlink_parent = cpu_to_be64(OCFS2_I(parent)->ip_blkno); + request->v_unlink_namelen = cpu_to_be32(dentry->d_name.len); + memcpy(request->v_unlink_dirent, dentry->d_name.name, + dentry->d_name.len); /* exactly d_name.len bytes; no terminator is added */ +} +/* Tell the other nodes that the name dentry of inode is being unlinked and + * pass along nlink as the resulting link count. Returns -ENAMETOOLONG if + * the name is longer than OCFS2_VOTE_FILENAME_LEN. */ +int ocfs2_request_unlink_vote(struct inode *inode, + struct dentry *dentry, + unsigned int nlink) +{ + int status; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_vote_msg *request; + + if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN) + return -ENAMETOOLONG; + + status = -ENOMEM; + request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, + inode->i_generation, + OCFS2_VOTE_REQ_UNLINK, nlink); + if (request) { + ocfs2_setup_unlink_vote(request, dentry); + + status = ocfs2_request_vote(inode, request, NULL); + + kfree(request); + } + return status; +} +/* Like ocfs2_request_unlink_vote() but for a rename; no link count is sent. */ +int ocfs2_request_rename_vote(struct inode *inode, + struct dentry *dentry) +{ + int status; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_vote_msg *request; + + if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN) + return -ENAMETOOLONG; + + status = -ENOMEM; + request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, + inode->i_generation, + OCFS2_VOTE_REQ_RENAME, 0); + if (request) { + ocfs2_setup_unlink_vote(request, dentry); + + status = ocfs2_request_vote(inode, request, NULL); + + kfree(request); + } + return status; +} +/* Broadcast a MOUNT vote, retrying while the result is -EAGAIN. Returns 0 + * without sending anything when we are the only node in mounted_map. */ +int ocfs2_request_mount_vote(struct ocfs2_super *osb) +{ + int status; + struct ocfs2_vote_msg *request = NULL; + + request = ocfs2_new_vote_request(osb, 0ULL, 0, + OCFS2_VOTE_REQ_MOUNT, 0); + if (!request) { + status = -ENOMEM; + goto bail; + } + + status = -EAGAIN; + while (status == -EAGAIN) { + if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) && + signal_pending(current)) { + status = -ERESTARTSYS; + goto bail; + } + + if (ocfs2_node_map_is_only(osb, &osb->mounted_map, + osb->node_num)) 
{ + status = 0; + goto bail; + } + + status = ocfs2_do_request_vote(osb, request, NULL); + } + +bail: + if (request) + kfree(request); + + return status; +} +/* Broadcast an UMOUNT vote, retrying while the result is -EAGAIN. Unlike + * the mount vote this loop does not check for pending signals. */ +int ocfs2_request_umount_vote(struct ocfs2_super *osb) +{ + int status; + struct ocfs2_vote_msg *request = NULL; + + request = ocfs2_new_vote_request(osb, 0ULL, 0, + OCFS2_VOTE_REQ_UMOUNT, 0); + if (!request) { + status = -ENOMEM; + goto bail; + } + + status = -EAGAIN; + while (status == -EAGAIN) { + /* Do not check signals on this vote... We really want + * this one to go all the way through. */ + + if (ocfs2_node_map_is_only(osb, &osb->mounted_map, + osb->node_num)) { + status = 0; + goto bail; + } + + status = ocfs2_do_request_vote(osb, request, NULL); + } + +bail: + if (request) + kfree(request); + + return status; +} + +/* Find the wait context queued under response_id on + * osb->net_response_list, or return NULL if there is none. + * TODO: This should eventually be a hash table! */ +static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb, + u32 response_id) +{ + struct list_head *p; + struct ocfs2_net_wait_ctxt *w = NULL; + + list_for_each(p, &osb->net_response_list) { + w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list); + if (response_id == w->n_response_id) + break; + w = NULL; + } + + return w; +} + +/* Translate response codes into local node errno values; anything other + * than OK or BUSY becomes -EINVAL */ +static inline int ocfs2_translate_response(int response) +{ + int ret; + + switch (response) { + case OCFS2_RESPONSE_OK: + ret = 0; + break; + + case OCFS2_RESPONSE_BUSY: + ret = -EBUSY; + break; + + default: + ret = -EINVAL; + } + + return ret; +} +/* o2net handler for OCFS2_MESSAGE_TYPE_RESPONSE; data is the ocfs2_super. + * Folds the response into the matching wait context and marks the sending + * node as having responded. Always returns 0. */ +static int ocfs2_handle_response_message(struct o2net_msg *msg, + u32 len, + void *data) +{ + unsigned int response_id, node_num; + int response_status; + struct ocfs2_super *osb = data; + struct ocfs2_response_msg *resp; + struct ocfs2_net_wait_ctxt * w; + struct ocfs2_net_response_cb *resp_cb; + + resp = (struct ocfs2_response_msg *) msg->buf; + + response_id = be32_to_cpu(resp->r_hdr.h_response_id); + node_num = be32_to_cpu(resp->r_hdr.h_node_num); + response_status = + 
ocfs2_translate_response(be32_to_cpu(resp->r_response)); + + mlog(0, "received response message:\n"); + mlog(0, "h_response_id = %u\n", response_id); + mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request)); + mlog(0, "h_blkno = %"MLFu64"\n", be64_to_cpu(resp->r_hdr.h_blkno)); + mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation)); + mlog(0, "h_node_num = %u\n", node_num); + mlog(0, "r_response = %d\n", response_status); + + spin_lock(&osb->net_response_lock); + w = __ocfs2_find_net_wait_ctxt(osb, response_id); + if (!w) { + mlog(0, "request not found!\n"); + goto bail; + } + resp_cb = w->n_callback; + + if (response_status && (!w->n_response)) { + /* we only really need one negative response so don't + * set it twice. */ + w->n_response = response_status; + } + + if (resp_cb) { /* the callback runs with net_response_lock dropped; NOTE(review): w is used again afterwards - confirm it cannot have been freed */ + spin_unlock(&osb->net_response_lock); + + resp_cb->rc_cb(resp_cb->rc_priv, resp); + + spin_lock(&osb->net_response_lock); + } + + __ocfs2_mark_node_responded(osb, w, node_num); +bail: + spin_unlock(&osb->net_response_lock); + + return 0; +} +/* o2net handler for OCFS2_MESSAGE_TYPE_VOTE; data is the ocfs2_super. + * Copies the message into a work item, queues it on osb->vote_list and + * kicks the vote thread. Returns 0, or -ENOMEM if the work item cannot be + * allocated. */ +static int ocfs2_handle_vote_message(struct o2net_msg *msg, + u32 len, + void *data) +{ + int status; + struct ocfs2_super *osb = data; + struct ocfs2_vote_work *work; + + work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_KERNEL); + if (!work) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + INIT_LIST_HEAD(&work->w_list); + memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg)); /* NOTE(review): copies a full ocfs2_vote_msg regardless of len - confirm o2net guarantees that much data */ + + mlog(0, "scheduling vote request:\n"); + mlog(0, "h_response_id = %u\n", + be32_to_cpu(work->w_msg.v_hdr.h_response_id)); + mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request)); + mlog(0, "h_blkno = %"MLFu64"\n", + be64_to_cpu(work->w_msg.v_hdr.h_blkno)); + mlog(0, "h_generation = %u\n", + be32_to_cpu(work->w_msg.v_hdr.h_generation)); + mlog(0, "h_node_num = %u\n", + be32_to_cpu(work->w_msg.v_hdr.h_node_num)); + mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1)); + + 
spin_lock(&osb->vote_task_lock); + list_add_tail(&work->w_list, &osb->vote_list); + osb->vote_count++; /* counts the entries queued on vote_list */ + spin_unlock(&osb->vote_task_lock); + + ocfs2_kick_vote_thread(osb); + + status = 0; +bail: + return status; +} +/* Unregister our o2net handlers and clear net_key. Does nothing when + * net_key is already zero. */ +void ocfs2_unregister_net_handlers(struct ocfs2_super *osb) +{ + if (!osb->net_key) + return; + + o2net_unregister_handler_list(&osb->osb_net_handlers); + + if (!list_empty(&osb->net_response_list)) + mlog(ML_ERROR, "net response list not empty!\n"); + + osb->net_key = 0; +} +/* Register the response and vote message handlers under osb->net_key. + * Returns 0 on success; on a negative error the handlers registered so far + * are unregistered again, which also clears net_key. */ +int ocfs2_register_net_handlers(struct ocfs2_super *osb) +{ + int status = 0; + + status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE, + osb->net_key, + sizeof(struct ocfs2_response_msg), + ocfs2_handle_response_message, + osb, &osb->osb_net_handlers); + if (status) { + mlog_errno(status); + goto bail; + } + + status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE, + osb->net_key, + sizeof(struct ocfs2_vote_msg), + ocfs2_handle_vote_message, + osb, &osb->osb_net_handlers); + if (status) { + mlog_errno(status); + goto bail; + } +bail: + if (status < 0) + ocfs2_unregister_net_handlers(osb); + + return status; +} diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h new file mode 100644 index 00000000000..9cce6070346 --- /dev/null +++ b/fs/ocfs2/vote.h @@ -0,0 +1,56 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * vote.h + * + * Interface to the inter-node vote messaging implemented in vote.c + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + + +#ifndef VOTE_H +#define VOTE_H + +int ocfs2_vote_thread(void *arg); +static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb) +{ + spin_lock(&osb->vote_task_lock); + /* make sure the voting thread gets a swipe at whatever changes + * the caller may have made to the voting state */ + osb->vote_wake_sequence++; + spin_unlock(&osb->vote_task_lock); + wake_up(&osb->vote_event); +} + +int ocfs2_request_delete_vote(struct inode *inode); +int ocfs2_request_unlink_vote(struct inode *inode, + struct dentry *dentry, + unsigned int nlink); +int ocfs2_request_rename_vote(struct inode *inode, + struct dentry *dentry); +int ocfs2_request_mount_vote(struct ocfs2_super *osb); +int ocfs2_request_umount_vote(struct ocfs2_super *osb); +int ocfs2_register_net_handlers(struct ocfs2_super *osb); +void ocfs2_unregister_net_handlers(struct ocfs2_super *osb); + +void ocfs2_mark_inode_remotely_deleted(struct inode *inode); + +void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, + int node_num); +#endif |