diff options
Diffstat (limited to 'fs')
231 files changed, 12563 insertions, 6541 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 10b7d3c9dba..8c92a9ba833 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -259,7 +259,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf) if (v9fs_proto_dotl(v9ses)) { res = p9_client_statfs(fid, &rs); if (res == 0) { - buf->f_type = V9FS_MAGIC; + buf->f_type = rs.type; buf->f_bsize = rs.bsize; buf->f_blocks = rs.blocks; buf->f_bfree = rs.bfree; @@ -305,15 +305,18 @@ out_freectx: return ERR_PTR(err); } -/* aio_cancel_all +/* kill_ctx * Cancels all outstanding aio requests on an aio context. Used * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ -static void aio_cancel_all(struct kioctx *ctx) +static void kill_ctx(struct kioctx *ctx) { int (*cancel)(struct kiocb *, struct io_event *); + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); struct io_event res; + spin_lock_irq(&ctx->ctx_lock); ctx->dead = 1; while (!list_empty(&ctx->active_reqs)) { @@ -329,15 +332,7 @@ static void aio_cancel_all(struct kioctx *ctx) spin_lock_irq(&ctx->ctx_lock); } } - spin_unlock_irq(&ctx->ctx_lock); -} - -static void wait_for_all_aios(struct kioctx *ctx) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - spin_lock_irq(&ctx->ctx_lock); if (!ctx->reqs_active) goto out; @@ -387,9 +382,7 @@ void exit_aio(struct mm_struct *mm) ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); hlist_del_rcu(&ctx->list); - aio_cancel_all(ctx); - - wait_for_all_aios(ctx); + kill_ctx(ctx); if (1 != atomic_read(&ctx->users)) printk(KERN_DEBUG @@ -1269,8 +1262,7 @@ static void io_destroy(struct kioctx *ioctx) if (likely(!was_dead)) put_ioctx(ioctx); /* twice for the list */ - aio_cancel_all(ioctx); - wait_for_all_aios(ioctx); + kill_ctx(ioctx); /* * Wake up any waiters. The setting of ctx->dead must be seen @@ -1278,7 +1270,6 @@ static void io_destroy(struct kioctx *ioctx) * locking done by the above calls to ensure this consistency. */ wake_up_all(&ioctx->wait); - put_ioctx(ioctx); /* once for the lookup */ } /* sys_io_setup: @@ -1315,11 +1306,9 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); - if (!ret) { - put_ioctx(ioctx); - return 0; - } - io_destroy(ioctx); + if (ret) + io_destroy(ioctx); + put_ioctx(ioctx); } out: @@ -1337,6 +1326,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { io_destroy(ioctx); + put_ioctx(ioctx); return 0; } pr_debug("EINVAL: io_destroy: invalid context id\n"); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 85f1fcdb30e..9dacb858670 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -230,7 +230,7 @@ static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) fdt = files_fdtable(files); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); - FD_SET(fd, fdt->close_on_exec); + __set_close_on_exec(fd, fdt); spin_unlock(&files->file_lock); } diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 4d5e6d26578..2eb12f13593 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -26,7 +26,6 @@ #include <linux/coredump.h> #include <linux/slab.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> #include <asm/a.out-core.h> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 504b6eee50a..48ffb3dc610 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -35,6 +35,7 @@ #include <asm/uaccess.h> #include <asm/param.h> #include <asm/page.h> +#include <asm/exec.h> static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); static int load_elf_library(struct file *); @@ -1414,6 +1415,22 @@ static void do_thread_regset_writeback(struct task_struct *task, regset->writeback(task, regset, 1); } +#ifndef PR_REG_SIZE +#define PR_REG_SIZE(S) sizeof(S) +#endif + +#ifndef PRSTATUS_SIZE +#define PRSTATUS_SIZE(S) sizeof(S) +#endif + +#ifndef PR_REG_PTR +#define PR_REG_PTR(S) (&((S)->pr_reg)) +#endif + +#ifndef SET_PR_FPVALID +#define SET_PR_FPVALID(S, V) ((S)->pr_fpvalid = (V)) +#endif + static int fill_thread_core_info(struct elf_thread_core_info *t, const struct user_regset_view *view, long signr, size_t *total) @@ -1428,11 +1445,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, */ fill_prstatus(&t->prstatus, t->task, signr); (void) view->regsets[0].get(t->task, &view->regsets[0], - 0, sizeof(t->prstatus.pr_reg), - &t->prstatus.pr_reg, NULL); + 0, PR_REG_SIZE(t->prstatus.pr_reg), + PR_REG_PTR(&t->prstatus), NULL); fill_note(&t->notes[0], "CORE", NT_PRSTATUS, - sizeof(t->prstatus), &t->prstatus); + PRSTATUS_SIZE(t->prstatus), &t->prstatus); *total += notesize(&t->notes[0]); do_thread_regset_writeback(t->task, &view->regsets[0]); @@ -1462,7 +1479,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, regset->core_note_type, size, data); else { - t->prstatus.pr_fpvalid = 1; + SET_PR_FPVALID(&t->prstatus, 1); fill_note(&t->notes[i], "CORE", NT_PRFPREG, size, data); } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index c64bf5ee2df..9bd5612a822 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -39,6 +39,7 @@ #include <asm/uaccess.h> #include <asm/param.h> #include <asm/pgalloc.h> +#include <asm/exec.h> typedef char *elf_caddr_t; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 5979027451b..024d20ee3ca 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -37,7 +37,6 @@ #include <linux/syscalls.h> #include <asm/byteorder.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/unaligned.h> #include <asm/cacheflush.h> diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 0cc20b35c1c..42704149b72 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -171,11 +171,11 @@ out: spin_unlock_irqrestore(&workers->lock, flags); } -static noinline int run_ordered_completions(struct btrfs_workers *workers, +static noinline void run_ordered_completions(struct btrfs_workers *workers, struct btrfs_work *work) { if (!workers->ordered) - return 0; + return; set_bit(WORK_DONE_BIT, &work->flags); @@ -213,7 +213,6 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, } spin_unlock(&workers->order_lock); - return 0; } static void put_worker(struct btrfs_worker_thread *worker) @@ -399,7 +398,7 @@ again: /* * this will wait for all the worker threads to shutdown */ -int btrfs_stop_workers(struct btrfs_workers *workers) +void btrfs_stop_workers(struct btrfs_workers *workers) { struct list_head *cur; struct btrfs_worker_thread *worker; @@ -427,7 +426,6 @@ int btrfs_stop_workers(struct btrfs_workers *workers) put_worker(worker); } spin_unlock_irq(&workers->lock); - return 0; } /* @@ -615,14 +613,14 @@ found: * it was taken from. It is intended for use with long running work functions * that make some progress and want to give the cpu up for others. */ -int btrfs_requeue_work(struct btrfs_work *work) +void btrfs_requeue_work(struct btrfs_work *work) { struct btrfs_worker_thread *worker = work->worker; unsigned long flags; int wake = 0; if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) - goto out; + return; spin_lock_irqsave(&worker->lock, flags); if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) @@ -649,9 +647,6 @@ int btrfs_requeue_work(struct btrfs_work *work) if (wake) wake_up_process(worker->task); spin_unlock_irqrestore(&worker->lock, flags); -out: - - return 0; } void btrfs_set_work_high_prio(struct btrfs_work *work) diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index f34cc31fa3c..063698b90ce 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -111,9 +111,9 @@ struct btrfs_workers { void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); int btrfs_start_workers(struct btrfs_workers *workers); -int btrfs_stop_workers(struct btrfs_workers *workers); +void btrfs_stop_workers(struct btrfs_workers *workers); void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, struct btrfs_workers *async_starter); -int btrfs_requeue_work(struct btrfs_work *work); +void btrfs_requeue_work(struct btrfs_work *work); void btrfs_set_work_high_prio(struct btrfs_work *work); #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 0436c12da8c..f4e90748940 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -116,6 +116,7 @@ add_parent: * to a logical address */ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, + int search_commit_root, struct __prelim_ref *ref, struct ulist *parents) { @@ -131,6 +132,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->search_commit_root = !!search_commit_root; root_key.objectid = ref->root_id; root_key.type = BTRFS_ROOT_ITEM_KEY; @@ -188,6 +190,7 @@ out: * resolve all indirect backrefs from the list */ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, + int search_commit_root, struct list_head *head) { int err; @@ -212,7 +215,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, continue; if (ref->count == 0) continue; - err = __resolve_indirect_ref(fs_info, ref, parents); + err = __resolve_indirect_ref(fs_info, search_commit_root, + ref, parents); if (err) { if (ret == 0) ret = err; @@ -586,6 +590,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head; int info_level = 0; int ret; + int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT); struct list_head prefs_delayed; struct list_head prefs; struct __prelim_ref *ref; @@ -600,6 +605,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->search_commit_root = !!search_commit_root; /* * grab both a lock on the path and a lock on the delayed ref head. @@ -614,35 +620,39 @@ again: goto out; BUG_ON(ret == 0); - /* - * look if there are updates for this ref queued and lock the head - */ - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(trans, bytenr); - if (head) { - if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); - spin_unlock(&delayed_refs->lock); - - btrfs_release_path(path); - - /* - * Mutex was contended, block until it's - * released and try again - */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); - goto again; - } - ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed); - if (ret) { - spin_unlock(&delayed_refs->lock); - goto out; + if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) { + /* + * look if there are updates for this ref queued and lock the + * head + */ + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's + * released and try again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + goto again; + } + ret = __add_delayed_refs(head, seq, &info_key, + &prefs_delayed); + if (ret) { + spin_unlock(&delayed_refs->lock); + goto out; + } } + spin_unlock(&delayed_refs->lock); } - spin_unlock(&delayed_refs->lock); if (path->slots[0]) { struct extent_buffer *leaf; @@ -679,7 +689,7 @@ again: if (ret) goto out; - ret = __resolve_indirect_refs(fs_info, &prefs); + ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs); if (ret) goto out; @@ -1074,8 +1084,7 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, return 0; } -static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 logical, +static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical, u64 orig_extent_item_objectid, u64 extent_item_pos, u64 root, iterate_extent_inodes_t *iterate, void *ctx) @@ -1143,35 +1152,38 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, * calls iterate() for every inode that references the extent identified by * the given parameters. * when the iterator function returns a non-zero value, iteration stops. - * path is guaranteed to be in released state when iterate() is called. */ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 extent_item_objectid, u64 extent_item_pos, + int search_commit_root, iterate_extent_inodes_t *iterate, void *ctx) { int ret; struct list_head data_refs = LIST_HEAD_INIT(data_refs); struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); struct btrfs_trans_handle *trans; - struct ulist *refs; - struct ulist *roots; + struct ulist *refs = NULL; + struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; struct ulist_node *root_node = NULL; struct seq_list seq_elem; - struct btrfs_delayed_ref_root *delayed_refs; - - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); + struct btrfs_delayed_ref_root *delayed_refs = NULL; pr_debug("resolving all inodes for extent %llu\n", extent_item_objectid); - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - btrfs_get_delayed_seq(delayed_refs, &seq_elem); - spin_unlock(&delayed_refs->lock); + if (search_commit_root) { + trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT; + } else { + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + btrfs_get_delayed_seq(delayed_refs, &seq_elem); + spin_unlock(&delayed_refs->lock); + } ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, extent_item_pos, seq_elem.seq, @@ -1188,7 +1200,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, while (!ret && (root_node = ulist_next(roots, root_node))) { pr_debug("root %llu references leaf %llu\n", root_node->val, ref_node->val); - ret = iterate_leaf_refs(fs_info, path, ref_node->val, + ret = iterate_leaf_refs(fs_info, ref_node->val, extent_item_objectid, extent_item_pos, root_node->val, iterate, ctx); @@ -1198,8 +1210,11 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ulist_free(refs); ulist_free(roots); out: - btrfs_put_delayed_seq(delayed_refs, &seq_elem); - btrfs_end_transaction(trans, fs_info->extent_root); + if (!search_commit_root) { + btrfs_put_delayed_seq(delayed_refs, &seq_elem); + btrfs_end_transaction(trans, fs_info->extent_root); + } + return ret; } @@ -1210,6 +1225,7 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int ret; u64 extent_item_pos; struct btrfs_key found_key; + int search_commit_root = path->search_commit_root; ret = extent_from_logical(fs_info, logical, path, &found_key); @@ -1220,8 +1236,9 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, return ret; extent_item_pos = logical - found_key.objectid; - ret = iterate_extent_inodes(fs_info, path, found_key.objectid, - extent_item_pos, iterate, ctx); + ret = iterate_extent_inodes(fs_info, found_key.objectid, + extent_item_pos, search_commit_root, + iterate, ctx); return ret; } @@ -1342,12 +1359,6 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) inode_to_path, ipath); } -/* - * allocates space to return multiple file system paths for an inode. - * total_bytes to allocate are passed, note that space usable for actual path - * information will be total_bytes - sizeof(struct inode_fs_paths). - * the returned pointer must be freed with free_ipath() in the end. - */ struct btrfs_data_container *init_data_container(u32 total_bytes) { struct btrfs_data_container *data; @@ -1403,5 +1414,6 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, void free_ipath(struct inode_fs_paths *ipath) { + kfree(ipath->fspath); kfree(ipath); } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index d00dfa9ca93..57ea2e959e4 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -22,6 +22,8 @@ #include "ioctl.h" #include "ulist.h" +#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0) + struct inode_fs_paths { struct btrfs_path *btrfs_path; struct btrfs_root *fs_root; @@ -44,9 +46,8 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, u64 *out_root, u8 *out_level); int iterate_extent_inodes(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 extent_item_objectid, - u64 extent_offset, + u64 extent_offset, int search_commit_root, iterate_extent_inodes_t *iterate, void *ctx); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b805afb37fa..d286b40a567 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -226,8 +226,8 @@ out: * Clear the writeback bits on all of the file * pages for a compressed write */ -static noinline int end_compressed_writeback(struct inode *inode, u64 start, - unsigned long ram_size) +static noinline void end_compressed_writeback(struct inode *inode, u64 start, + unsigned long ram_size) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; @@ -253,7 +253,6 @@ static noinline int end_compressed_writeback(struct inode *inode, u64 start, index += ret; } /* the inode may be gone now */ - return 0; } /* @@ -392,16 +391,16 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, */ atomic_inc(&cb->pending_bios); ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { ret = btrfs_csum_one_bio(root, inode, bio, start, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } ret = btrfs_map_bio(root, WRITE, bio, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ bio_put(bio); @@ -421,15 +420,15 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_get(bio); ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { ret = btrfs_csum_one_bio(root, inode, bio, start, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } ret = btrfs_map_bio(root, WRITE, bio, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ bio_put(bio); return 0; @@ -497,7 +496,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, * sure they map to this compressed extent on disk. */ set_page_extent_mapped(page); - lock_extent(tree, last_offset, end, GFP_NOFS); + lock_extent(tree, last_offset, end); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, last_offset, PAGE_CACHE_SIZE); @@ -507,7 +506,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || (em->block_start >> 9) != cb->orig_bio->bi_sector) { free_extent_map(em); - unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_extent(tree, last_offset, end); unlock_page(page); page_cache_release(page); break; @@ -535,7 +534,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, nr_pages++; page_cache_release(page); } else { - unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_extent(tree, last_offset, end); unlock_page(page); page_cache_release(page); break; @@ -662,7 +661,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_get(comp_bio); ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ /* * inc the count before we submit the bio so @@ -675,14 +674,14 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } sums += (comp_bio->bi_size + root->sectorsize - 1) / root->sectorsize; ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ bio_put(comp_bio); @@ -698,15 +697,15 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_get(comp_bio); ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ bio_put(comp_bio); return 0; @@ -734,7 +733,7 @@ struct btrfs_compress_op *btrfs_compress_op[] = { &btrfs_lzo_compress, }; -int __init btrfs_init_compress(void) +void __init btrfs_init_compress(void) { int i; @@ -744,7 +743,6 @@ int __init btrfs_init_compress(void) atomic_set(&comp_alloc_workspace[i], 0); init_waitqueue_head(&comp_workspace_wait[i]); } - return 0; } /* diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index a12059f4f0f..9afb0a62ae8 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -19,7 +19,7 @@ #ifndef __BTRFS_COMPRESSION_ #define __BTRFS_COMPRESSION_ -int btrfs_init_compress(void); +void btrfs_init_compress(void); void btrfs_exit_compress(void); int btrfs_compress_pages(int type, struct address_space *mapping, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0639a555e16..e801f226d7e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -36,7 +36,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); -static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, +static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); struct btrfs_path *btrfs_alloc_path(void) @@ -156,10 +156,23 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) { struct extent_buffer *eb; - rcu_read_lock(); - eb = rcu_dereference(root->node); - extent_buffer_get(eb); - rcu_read_unlock(); + while (1) { + rcu_read_lock(); + eb = rcu_dereference(root->node); + + /* + * RCU really hurts here, we could free up the root node because + * it was cow'ed but we may not get the new root node yet so do + * the inc_not_zero dance and if it doesn't work then + * synchronize_rcu and try again. + */ + if (atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + break; + } + rcu_read_unlock(); + synchronize_rcu(); + } return eb; } @@ -331,8 +344,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (btrfs_block_can_be_shared(root, buf)) { ret = btrfs_lookup_extent_info(trans, root, buf->start, buf->len, &refs, &flags); - BUG_ON(ret); - BUG_ON(refs == 0); + if (ret) + return ret; + if (refs == 0) { + ret = -EROFS; + btrfs_std_error(root->fs_info, ret); + return ret; + } } else { refs = 1; if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || @@ -351,14 +369,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { ret = btrfs_inc_ref(trans, root, buf, 1, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_dec_ref(trans, root, buf, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_inc_ref(trans, root, cow, 1, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; } else { @@ -368,14 +386,15 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_inc_ref(trans, root, cow, 1, 1); else ret = btrfs_inc_ref(trans, root, cow, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } if (new_flags != 0) { ret = btrfs_set_disk_extent_flags(trans, root, buf->start, buf->len, new_flags, 0); - BUG_ON(ret); + if (ret) + return ret; } } else { if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { @@ -384,9 +403,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_inc_ref(trans, root, cow, 1, 1); else ret = btrfs_inc_ref(trans, root, cow, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, buf, 1, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } clean_tree_block(trans, root, buf); *last_ref = 1; @@ -415,7 +434,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, { struct btrfs_disk_key disk_key; struct extent_buffer *cow; - int level; + int level, ret; int last_ref = 0; int unlock_orig = 0; u64 parent_start; @@ -467,7 +486,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_fsid(cow), BTRFS_FSID_SIZE); - update_ref_for_cow(trans, root, buf, cow, &last_ref); + ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } if (root->ref_cows) btrfs_reloc_cow_block(trans, root, buf, cow); @@ -504,7 +527,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, } if (unlock_orig) btrfs_tree_unlock(buf); - free_extent_buffer(buf); + free_extent_buffer_stale(buf); btrfs_mark_buffer_dirty(cow); *cow_ret = cow; return 0; @@ -934,7 +957,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); - BUG_ON(!child); + if (!child) { + ret = -EROFS; + btrfs_std_error(root->fs_info, ret); + goto enospc; + } + btrfs_tree_lock(child); btrfs_set_lock_blocking(child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child); @@ -959,7 +987,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1, 0); /* once for the root ptr */ - free_extent_buffer(mid); + free_extent_buffer_stale(mid); return 0; } if (btrfs_header_nritems(mid) > @@ -1010,13 +1038,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { clean_tree_block(trans, root, right); btrfs_tree_unlock(right); - wret = del_ptr(trans, root, path, level + 1, pslot + - 1); - if (wret) - ret = wret; + del_ptr(trans, root, path, level + 1, pslot + 1); root_sub_used(root, right->len); btrfs_free_tree_block(trans, root, right, 0, 1, 0); - free_extent_buffer(right); + free_extent_buffer_stale(right); right = NULL; } else { struct btrfs_disk_key right_key; @@ -1035,7 +1060,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, * otherwise we would have pulled some pointers from the * right */ - BUG_ON(!left); + if (!left) { + ret = -EROFS; + btrfs_std_error(root->fs_info, ret); + goto enospc; + } wret = balance_node_right(trans, root, mid, left); if (wret < 0) { ret = wret; @@ -1051,12 +1080,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { clean_tree_block(trans, root, mid); btrfs_tree_unlock(mid); - wret = del_ptr(trans, root, path, level + 1, pslot); - if (wret) - ret = wret; + del_ptr(trans, root, path, level + 1, pslot); root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1, 0); - free_extent_buffer(mid); + free_extent_buffer_stale(mid); mid = NULL; } else { /* update the parent key to reflect our changes */ @@ -1382,7 +1409,8 @@ static noinline int reada_for_balance(struct btrfs_root *root, * if lowest_unlock is 1, level 0 won't be unlocked */ static noinline void unlock_up(struct btrfs_path *path, int level, - int lowest_unlock) + int lowest_unlock, int min_write_lock_level, + int *write_lock_level) { int i; int skip_level = level; @@ -1414,6 +1442,11 @@ static noinline void unlock_up(struct btrfs_path *path, int level, if (i >= lowest_unlock && i > skip_level && path->locks[i]) { btrfs_tree_unlock_rw(t, path->locks[i]); path->locks[i] = 0; + if (write_lock_level && + i > min_write_lock_level && + i <= *write_lock_level) { + *write_lock_level = i - 1; + } } } } @@ -1637,6 +1670,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root /* everything at write_lock_level or lower must be write locked */ int write_lock_level = 0; u8 lowest_level = 0; + int min_write_lock_level; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); @@ -1664,6 +1698,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root if (cow && (p->keep_locks || p->lowest_level)) write_lock_level = BTRFS_MAX_LEVEL; + min_write_lock_level = write_lock_level; + again: /* * we try very hard to do read locks on the root @@ -1795,7 +1831,8 @@ cow_done: goto again; } - unlock_up(p, level, lowest_unlock); + unlock_up(p, level, lowest_unlock, + min_write_lock_level, &write_lock_level); if (level == lowest_level) { if (dec) @@ -1857,7 +1894,8 @@ cow_done: } } if (!p->search_for_split) - unlock_up(p, level, lowest_unlock); + unlock_up(p, level, lowest_unlock, + min_write_lock_level, &write_lock_level); goto done; } } @@ -1881,15 +1919,12 @@ done: * fixing up pointers when a given leaf/node is not in slot 0 of the * higher levels * - * If this fails to write a tree block, it returns -1, but continues - * fixing up the blocks in ram so the tree is consistent. */ -static int fixup_low_keys(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_disk_key *key, int level) +static void fixup_low_keys(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, int level) { int i; - int ret = 0; struct extent_buffer *t; for (i = level; i < BTRFS_MAX_LEVEL; i++) { @@ -1902,7 +1937,6 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans, if (tslot != 0) break; } - return ret; } /* @@ -1911,9 +1945,9 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans, * This function isn't completely safe. It's the caller's responsibility * that the new key won't break the order */ -int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *new_key) +void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key) { struct btrfs_disk_key disk_key; struct extent_buffer *eb; @@ -1923,13 +1957,11 @@ int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, slot = path->slots[0]; if (slot > 0) { btrfs_item_key(eb, &disk_key, slot - 1); - if (comp_keys(&disk_key, new_key) >= 0) - return -1; + BUG_ON(comp_keys(&disk_key, new_key) >= 0); } if (slot < btrfs_header_nritems(eb) - 1) { btrfs_item_key(eb, &disk_key, slot + 1); - if (comp_keys(&disk_key, new_key) <= 0) - return -1; + BUG_ON(comp_keys(&disk_key, new_key) <= 0); } btrfs_cpu_key_to_disk(&disk_key, new_key); @@ -1937,7 +1969,6 @@ int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(eb); if (slot == 0) fixup_low_keys(trans, root, path, &disk_key, 1); - return 0; } /* @@ -2140,12 +2171,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * * slot and level indicate where you want the key to go, and * blocknr is the block the key points to. - * - * returns zero on success and < 0 on any error */ -static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, struct btrfs_disk_key - *key, u64 bytenr, int slot, int level) +static void insert_ptr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, u64 bytenr, + int slot, int level) { struct extent_buffer *lower; int nritems; @@ -2155,8 +2185,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); BUG_ON(slot > nritems); - if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) - BUG(); + BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); if (slot != nritems) { memmove_extent_buffer(lower, btrfs_node_key_ptr_offset(slot + 1), @@ -2169,7 +2198,6 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_node_ptr_generation(lower, slot, trans->transid); btrfs_set_header_nritems(lower, nritems + 1); btrfs_mark_buffer_dirty(lower); - return 0; } /* @@ -2190,7 +2218,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; int mid; int ret; - int wret; u32 c_nritems; c = path->nodes[level]; @@ -2247,11 +2274,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); - wret = insert_ptr(trans, root, path, &disk_key, split->start, - path->slots[level + 1] + 1, - level + 1); - if (wret) - ret = wret; + insert_ptr(trans, root, path, &disk_key, split->start, + path->slots[level + 1] + 1, level + 1); if (path->slots[level] >= mid) { path->slots[level] -= mid; @@ -2320,6 +2344,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *upper = path->nodes[1]; + struct btrfs_map_token token; struct btrfs_disk_key disk_key; int slot; u32 i; @@ -2331,6 +2356,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, u32 data_end; u32 this_item_size; + btrfs_init_map_token(&token); + if (empty) nr = 0; else @@ -2408,8 +2435,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, push_space = BTRFS_LEAF_DATA_SIZE(root); for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(right, i); - push_space -= btrfs_item_size(right, item); - btrfs_set_item_offset(right, item, push_space); + push_space -= btrfs_token_item_size(right, item, &token); + btrfs_set_token_item_offset(right, item, push_space, &token); } left_nritems -= push_items; @@ -2537,9 +2564,11 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, u32 old_left_nritems; u32 nr; int ret = 0; - int wret; u32 this_item_size; u32 old_left_item_size; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); if (empty) nr = min(right_nritems, max_slot); @@ -2600,9 +2629,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, item = btrfs_item_nr(left, i); - ioff = btrfs_item_offset(left, item); - btrfs_set_item_offset(left, item, - ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); + ioff = btrfs_token_item_offset(left, item, &token); + btrfs_set_token_item_offset(left, item, + ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size), + &token); } btrfs_set_header_nritems(left, old_left_nritems + push_items); @@ -2632,8 +2662,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(right, i); - push_space = push_space - btrfs_item_size(right, item); - btrfs_set_item_offset(right, item, push_space); + push_space = push_space - btrfs_token_item_size(right, + item, &token); + btrfs_set_token_item_offset(right, item, push_space, &token); } btrfs_mark_buffer_dirty(left); @@ -2643,9 +2674,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, clean_tree_block(trans, root, right); btrfs_item_key(right, &disk_key, 0); - wret = fixup_low_keys(trans, root, path, &disk_key, 1); - if (wret) - ret = wret; + fixup_low_keys(trans, root, path, &disk_key, 1); /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { @@ -2716,7 +2745,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root path->nodes[1], slot - 1, &left); if (ret) { /* we hit -ENOSPC, but it isn't fatal here */ - ret = 1; + if (ret == -ENOSPC) + ret = 1; goto out; } @@ -2738,22 +2768,21 @@ out: /* * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. - * - * returns 0 if all went well and < 0 on failure. */ -static noinline int copy_for_split(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *l, - struct extent_buffer *right, - int slot, int mid, int nritems) +static noinline void copy_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *l, + struct extent_buffer *right, + int slot, int mid, int nritems) { int data_copy_size; int rt_data_off; int i; - int ret = 0; - int wret; struct btrfs_disk_key disk_key; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); @@ -2775,17 +2804,15 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, struct btrfs_item *item = btrfs_item_nr(right, i); u32 ioff; - ioff = btrfs_item_offset(right, item); - btrfs_set_item_offset(right, item, ioff + rt_data_off); + ioff = btrfs_token_item_offset(right, item, &token); + btrfs_set_token_item_offset(right, item, + ioff + rt_data_off, &token); } btrfs_set_header_nritems(l, mid); - ret = 0; btrfs_item_key(right, &disk_key, 0); - wret = insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; + insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); btrfs_mark_buffer_dirty(right); btrfs_mark_buffer_dirty(l); @@ -2803,8 +2830,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, } BUG_ON(path->slots[0] < 0); - - return ret; } /* @@ -2993,12 +3018,8 @@ again: if (split == 0) { if (mid <= slot) { btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; - + insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3006,29 +3027,21 @@ again: path->slots[1] += 1; } else { btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, - right->start, + insert_ptr(trans, root, path, &disk_key, right->start, path->slots[1], 1); - if (wret) - ret = wret; btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; path->slots[0] = 0; - if (path->slots[1] == 0) { - wret = fixup_low_keys(trans, root, - path, &disk_key, 1); - if (wret) - ret = wret; - } + if (path->slots[1] == 0) + fixup_low_keys(trans, root, path, + &disk_key, 1); } btrfs_mark_buffer_dirty(right); return ret; } - ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); - BUG_ON(ret); + copy_for_split(trans, root, path, l, right, slot, mid, nritems); if (split == 2) { BUG_ON(num_doubles != 0); @@ -3036,7 +3049,7 @@ again: goto again; } - return ret; + return 0; push_for_double: push_for_double_split(trans, root, path, data_size); @@ -3238,11 +3251,9 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - ret = setup_items_for_insert(trans, root, path, new_key, &item_size, - item_size, item_size + - sizeof(struct btrfs_item), 1); - BUG_ON(ret); - + setup_items_for_insert(trans, root, path, new_key, &item_size, + item_size, item_size + + sizeof(struct btrfs_item), 1); leaf = path->nodes[0]; memcpy_extent_buffer(leaf, btrfs_item_ptr_offset(leaf, path->slots[0]), @@ -3257,10 +3268,10 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, * off the end of the item or if we shift the item to chop bytes off * the front. */ -int btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u32 new_size, int from_end) +void btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; @@ -3271,13 +3282,16 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, unsigned int old_size; unsigned int size_diff; int i; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; slot = path->slots[0]; old_size = btrfs_item_size_nr(leaf, slot); if (old_size == new_size) - return 0; + return; nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(root, leaf); @@ -3297,8 +3311,9 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff + size_diff); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff + size_diff, &token); } /* shift the data */ @@ -3350,15 +3365,14 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, btrfs_print_leaf(root, leaf); BUG(); } - return 0; } /* * make the item pointed to by the path bigger, data_size is the new size. */ -int btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - u32 data_size) +void btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u32 data_size) { int slot; struct extent_buffer *leaf; @@ -3368,6 +3382,9 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, unsigned int old_data; unsigned int old_size; int i; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; @@ -3397,8 +3414,9 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff - data_size); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - data_size, &token); } /* shift the data */ @@ -3416,7 +3434,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, btrfs_print_leaf(root, leaf); BUG(); } - return 0; } /* @@ -3441,6 +3458,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, unsigned int data_end; struct btrfs_disk_key disk_key; struct btrfs_key found_key; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); for (i = 0; i < nr; i++) { if (total_size + data_size[i] + sizeof(struct btrfs_item) > @@ -3506,8 +3526,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff - total_data); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - total_data, &token); } /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), @@ -3534,9 +3555,10 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(leaf, slot + i); - btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + btrfs_set_token_item_offset(leaf, item, + data_end - data_size[i], &token); data_end -= data_size[i]; - btrfs_set_item_size(leaf, item, data_size[i]); + btrfs_set_token_item_size(leaf, item, data_size[i], &token); } btrfs_set_header_nritems(leaf, nritems + nr); btrfs_mark_buffer_dirty(leaf); @@ -3544,7 +3566,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, ret = 0; if (slot == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); - ret = fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(trans, root, path, &disk_key, 1); } if (btrfs_leaf_free_space(root, leaf) < 0) { @@ -3562,19 +3584,21 @@ out: * to save stack depth by doing the bulk of the work in a function * that doesn't call btrfs_search_slot */ -int setup_items_for_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr) +void setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr) { struct btrfs_item *item; int i; u32 nritems; unsigned int data_end; struct btrfs_disk_key disk_key; - int ret; struct extent_buffer *leaf; int slot; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; slot = path->slots[0]; @@ -3606,8 +3630,9 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff - total_data); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - total_data, &token); } /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), @@ -3626,17 +3651,17 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(leaf, slot + i); - btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + btrfs_set_token_item_offset(leaf, item, + data_end - data_size[i], &token); data_end -= data_size[i]; - btrfs_set_item_size(leaf, item, data_size[i]); + btrfs_set_token_item_size(leaf, item, data_size[i], &token); } btrfs_set_header_nritems(leaf, nritems + nr); - ret = 0; if (slot == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); - ret = fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(trans, root, path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(leaf); @@ -3645,7 +3670,6 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_print_leaf(root, leaf); BUG(); } - return ret; } /* @@ -3672,16 +3696,14 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, if (ret == 0) return -EEXIST; if (ret < 0) - goto out; + return ret; slot = path->slots[0]; BUG_ON(slot < 0); - ret = setup_items_for_insert(trans, root, path, cpu_key, data_size, + setup_items_for_insert(trans, root, path, cpu_key, data_size, total_data, total_size, nr); - -out: - return ret; + return 0; } /* @@ -3717,13 +3739,11 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root * the tree should have been previously balanced so the deletion does not * empty a node. */ -static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot) +static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; - int ret = 0; - int wret; nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { @@ -3743,12 +3763,9 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_node_key(parent, &disk_key, 0); - wret = fixup_low_keys(trans, root, path, &disk_key, level + 1); - if (wret) - ret = wret; + fixup_low_keys(trans, root, path, &disk_key, level + 1); } btrfs_mark_buffer_dirty(parent); - return ret; } /* @@ -3761,17 +3778,13 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, * The path must have already been setup for deleting the leaf, including * all the proper balancing. path->nodes[1] must be locked. */ -static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *leaf) +static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf) { - int ret; - WARN_ON(btrfs_header_generation(leaf) != trans->transid); - ret = del_ptr(trans, root, path, 1, path->slots[1]); - if (ret) - return ret; + del_ptr(trans, root, path, 1, path->slots[1]); /* * btrfs_free_extent is expensive, we want to make sure we @@ -3781,8 +3794,9 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); + extent_buffer_get(leaf); btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); - return 0; + free_extent_buffer_stale(leaf); } /* * delete the item at the leaf level in path. If that empties @@ -3799,6 +3813,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, int wret; int i; u32 nritems; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); @@ -3820,8 +3837,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff + dsize); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff + dsize, &token); } memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), @@ -3839,8 +3857,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } else { btrfs_set_path_blocking(path); clean_tree_block(trans, root, leaf); - ret = btrfs_del_leaf(trans, root, path, leaf); - BUG_ON(ret); + btrfs_del_leaf(trans, root, path, leaf); } } else { int used = leaf_space_used(leaf, 0, nritems); @@ -3848,10 +3865,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_item_key(leaf, &disk_key, 0); - wret = fixup_low_keys(trans, root, path, - &disk_key, 1); - if (wret) - ret = wret; + fixup_low_keys(trans, root, path, &disk_key, 1); } /* delete the leaf if it is mostly empty */ @@ -3879,9 +3893,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot; - ret = btrfs_del_leaf(trans, root, path, leaf); - BUG_ON(ret); + btrfs_del_leaf(trans, root, path, leaf); free_extent_buffer(leaf); + ret = 0; } else { /* if we're still in the path, make sure * we're dirty. Otherwise, one of the @@ -4059,18 +4073,18 @@ find_next_key: path->slots[level] = slot; if (level == path->lowest_level) { ret = 0; - unlock_up(path, level, 1); + unlock_up(path, level, 1, 0, NULL); goto out; } btrfs_set_path_blocking(path); cur = read_node_slot(root, cur, slot); - BUG_ON(!cur); + BUG_ON(!cur); /* -ENOMEM */ btrfs_tree_read_lock(cur); path->locks[level - 1] = BTRFS_READ_LOCK; path->nodes[level - 1] = cur; - unlock_up(path, level, 1); + unlock_up(path, level, 1, 0, NULL); btrfs_clear_path_blocking(path, NULL, 0); } out: @@ -4306,7 +4320,7 @@ again: } ret = 0; done: - unlock_up(path, 0, 1); + unlock_up(path, 0, 1, 0, NULL); path->leave_spinning = old_spinning; if (!old_spinning) btrfs_set_path_blocking(path); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 80b6486fd5e..5b8ef8eb352 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -48,6 +48,8 @@ struct btrfs_ordered_sum; #define BTRFS_MAGIC "_BHRfS_M" +#define BTRFS_MAX_MIRRORS 2 + #define BTRFS_MAX_LEVEL 8 #define BTRFS_COMPAT_EXTENT_TREE_V0 @@ -138,6 +140,12 @@ struct btrfs_ordered_sum; #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 /* + * the max metadata block size. This limit is somewhat artificial, + * but the memmove costs go through the roof for larger blocks. + */ +#define BTRFS_MAX_METADATA_BLOCKSIZE 65536 + +/* * we can actually store much bigger names, but lets not confuse the rest * of linux */ @@ -461,6 +469,19 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) #define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) +/* + * some patches floated around with a second compression method + * lets save that incompat here for when they do get in + * Note we don't actually support it, we're just reserving the + * number + */ +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4) + +/* + * older kernels tried to do bigger metadata blocks, but the + * code was pretty buggy. Lets not let them try anymore. + */ +#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL @@ -468,6 +489,7 @@ struct btrfs_super_block { (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) /* @@ -829,6 +851,21 @@ struct btrfs_csum_item { */ #define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) +#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ + BTRFS_AVAIL_ALLOC_BIT_SINGLE) + +static inline u64 chunk_to_extended(u64 flags) +{ + if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0) + flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + return flags; +} +static inline u64 extended_to_chunk(u64 flags) +{ + return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; +} + struct btrfs_block_group_item { __le64 used; __le64 chunk_objectid; @@ -1503,6 +1540,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) #define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) +#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1526,6 +1564,17 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) +struct btrfs_map_token { + struct extent_buffer *eb; + char *kaddr; + unsigned long offset; +}; + +static inline void btrfs_init_map_token (struct btrfs_map_token *token) +{ + memset(token, 0, sizeof(*token)); +} + /* some macros to generate set/get funcs for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: @@ -1549,20 +1598,22 @@ struct btrfs_ioctl_defrag_range_args { #ifndef BTRFS_SETGET_FUNCS #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, struct btrfs_map_token *token); \ +void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token);\ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); #endif #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(struct extent_buffer *eb) \ { \ - type *p = page_address(eb->first_page); \ + type *p = page_address(eb->pages[0]); \ u##bits res = le##bits##_to_cpu(p->member); \ return res; \ } \ static inline void btrfs_set_##name(struct extent_buffer *eb, \ u##bits val) \ { \ - type *p = page_address(eb->first_page); \ + type *p = page_address(eb->pages[0]); \ p->member = cpu_to_le##bits(val); \ } @@ -2466,8 +2517,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, - u64 data); + struct btrfs_key *ins, u64 data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref, int for_cow); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -2484,8 +2534,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, u64 start, u64 len); -int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root); +void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, @@ -2548,8 +2598,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root, u64 num_bytes); int btrfs_set_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); -int btrfs_set_block_group_rw(struct btrfs_root *root, - struct btrfs_block_group_cache *cache); +void btrfs_set_block_group_rw(struct btrfs_root *root, + struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_root *root, @@ -2568,9 +2618,9 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); -int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *new_key); +void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, @@ -2590,12 +2640,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 new_root_objectid); int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf); -int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, u32 data_size); -int btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u32 new_size, int from_end); +void btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u32 data_size); +void btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -2629,10 +2680,10 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, return btrfs_del_items(trans, root, path, path->slots[0], 1); } -int setup_items_for_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr); +void setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, @@ -2659,9 +2710,9 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) } int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); -void btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref, - int for_reloc); +int __must_check btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + int update_ref, int for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, @@ -2687,24 +2738,6 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->super_for_commit); kfree(fs_info); } -/** - * profile_is_valid - tests whether a given profile is valid and reduced - * @flags: profile to validate - * @extended: if true @flags is treated as an extended profile - */ -static inline int profile_is_valid(u64 flags, int extended) -{ - u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK; - - flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; - if (extended) - mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - if (flags & mask) - return 0; - /* true if zero or exactly one bit set */ - return (flags & (~flags + 1)) == flags; -} /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, @@ -2723,9 +2756,10 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_root_item *item); -int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *key, struct btrfs_root_item - *item); +int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_root_item *item); int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct btrfs_root_item *item, struct btrfs_key *key); int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); @@ -2909,7 +2943,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root); void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); -int btrfs_invalidate_inodes(struct btrfs_root *root); +void btrfs_invalidate_inodes(struct btrfs_root *root); void btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_root *root); int btrfs_prealloc_file_range(struct inode *inode, int mode, @@ -2961,13 +2995,41 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); /* super.c */ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); +void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...); void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno); + unsigned int line, int errno, const char *fmt, ...); + +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *function, + unsigned int line, int errno); + +#define btrfs_abort_transaction(trans, root, errno) \ +do { \ + __btrfs_abort_transaction(trans, root, __func__, \ + __LINE__, errno); \ +} while (0) #define btrfs_std_error(fs_info, errno) \ do { \ if ((errno)) \ - __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\ + __btrfs_std_error((fs_info), __func__, \ + __LINE__, (errno), NULL); \ +} while (0) + +#define btrfs_error(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_std_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args); \ +} while (0) + +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); + +#define btrfs_panic(fs_info, errno, fmt, args...) \ +do { \ + struct btrfs_fs_info *_i = (fs_info); \ + __btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args); \ + BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)); \ } while (0) /* acl.c */ @@ -3003,16 +3065,17 @@ void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve); -void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, +int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); /* scrub.c */ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly); -int btrfs_scrub_pause(struct btrfs_root *root); -int btrfs_scrub_pause_super(struct btrfs_root *root); -int btrfs_scrub_continue(struct btrfs_root *root); -int btrfs_scrub_continue_super(struct btrfs_root *root); +void btrfs_scrub_pause(struct btrfs_root *root); +void btrfs_scrub_pause_super(struct btrfs_root *root); +void btrfs_scrub_continue(struct btrfs_root *root); +void btrfs_scrub_continue_super(struct btrfs_root *root); +int __btrfs_scrub_cancel(struct btrfs_fs_info *info); int btrfs_scrub_cancel(struct btrfs_root *root); int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index fe4cd0f1cef..03e3748d84d 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -115,6 +115,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode) return NULL; } +/* Will return either the node or PTR_ERR(-ENOMEM) */ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( struct inode *inode) { @@ -836,10 +837,8 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans, btrfs_clear_path_blocking(path, NULL, 0); /* insert the keys of the items */ - ret = setup_items_for_insert(trans, root, path, keys, data_size, - total_data_size, total_size, nitems); - if (ret) - goto error; + setup_items_for_insert(trans, root, path, keys, data_size, + total_data_size, total_size, nitems); /* insert the dir index items */ slot = path->slots[0]; @@ -1108,16 +1107,25 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, return 0; } -/* Called when committing the transaction. */ +/* + * Called when committing the transaction. + * Returns 0 on success. + * Returns < 0 on error and returns with an aborted transaction with any + * outstanding delayed items cleaned up. + */ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + struct btrfs_root *curr_root = root; struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_path *path; struct btrfs_block_rsv *block_rsv; int ret = 0; + if (trans->aborted) + return -EIO; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1130,17 +1138,18 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, curr_node = btrfs_first_delayed_node(delayed_root); while (curr_node) { - root = curr_node->root; - ret = btrfs_insert_delayed_items(trans, path, root, + curr_root = curr_node->root; + ret = btrfs_insert_delayed_items(trans, path, curr_root, curr_node); if (!ret) - ret = btrfs_delete_delayed_items(trans, path, root, - curr_node); + ret = btrfs_delete_delayed_items(trans, path, + curr_root, curr_node); if (!ret) - ret = btrfs_update_delayed_inode(trans, root, path, - curr_node); + ret = btrfs_update_delayed_inode(trans, curr_root, + path, curr_node); if (ret) { btrfs_release_delayed_node(curr_node); + btrfs_abort_transaction(trans, root, ret); break; } @@ -1151,6 +1160,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, btrfs_free_path(path); trans->block_rsv = block_rsv; + return ret; } @@ -1371,6 +1381,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root) btrfs_wq_run_delayed_node(delayed_root, root, 0); } +/* Will return 0 or -ENOMEM */ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, struct inode *dir, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 66e4f29505a..69f22e3ab3b 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -420,7 +420,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * this does all the dirty work in terms of maintaining the correct * overall modification count. */ -static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, +static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, @@ -487,20 +487,19 @@ static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(ref); + kfree(head_ref); } else { delayed_refs->num_heads++; delayed_refs->num_heads_ready++; delayed_refs->num_entries++; trans->delayed_ref_updates++; } - return 0; } /* * helper to insert a delayed tree ref into the rbtree. */ -static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, +static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, @@ -549,18 +548,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(ref); + kfree(full_ref); } else { delayed_refs->num_entries++; trans->delayed_ref_updates++; } - return 0; } /* * helper to insert a delayed data ref into the rbtree. */ -static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, +static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, @@ -611,12 +609,11 @@ static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(ref); + kfree(full_ref); } else { delayed_refs->num_entries++; trans->delayed_ref_updates++; } - return 0; } /* @@ -634,7 +631,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; - int ret; BUG_ON(extent_op && extent_op->is_data); ref = kmalloc(sizeof(*ref), GFP_NOFS); @@ -656,14 +652,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, num_bytes, action, 0); - BUG_ON(ret); - ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, + add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, level, action, for_cow); - BUG_ON(ret); if (!need_ref_seq(for_cow, ref_root) && waitqueue_active(&delayed_refs->seq_wait)) wake_up(&delayed_refs->seq_wait); @@ -685,7 +679,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; - int ret; BUG_ON(extent_op && !extent_op->is_data); ref = kmalloc(sizeof(*ref), GFP_NOFS); @@ -707,14 +700,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, num_bytes, action, 1); - BUG_ON(ret); - ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, + add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, action, for_cow); - BUG_ON(ret); if (!need_ref_seq(for_cow, ref_root) && waitqueue_active(&delayed_refs->seq_wait)) wake_up(&delayed_refs->seq_wait); @@ -729,7 +720,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, { struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; - int ret; head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); if (!head_ref) @@ -740,10 +730,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, num_bytes, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); - BUG_ON(ret); if (waitqueue_active(&delayed_refs->seq_wait)) wake_up(&delayed_refs->seq_wait); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 31d84e78129..c1a074d0696 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -49,9 +49,8 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle di = btrfs_match_dir_item_name(root, path, name, name_len); if (di) return ERR_PTR(-EEXIST); - ret = btrfs_extend_item(trans, root, path, data_size); - } - if (ret < 0) + btrfs_extend_item(trans, root, path, data_size); + } else if (ret < 0) return ERR_PTR(ret); WARN_ON(ret > 0); leaf = path->nodes[0]; @@ -116,6 +115,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, * 'location' is the key to stuff into the directory item, 'type' is the * type of the inode we're pointing to, and 'index' is the sequence number * to use for the second index (if one is created). + * Will return 0 or -ENOMEM */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, @@ -383,8 +383,8 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_len - (ptr + sub_item_len - start)); - ret = btrfs_truncate_item(trans, root, path, - item_len - sub_item_len, 1); + btrfs_truncate_item(trans, root, path, + item_len - sub_item_len, 1); } return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 534266fe505..20196f41120 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -48,20 +48,19 @@ static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); -static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, +static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only); -static int btrfs_destroy_ordered_operations(struct btrfs_root *root); -static int btrfs_destroy_ordered_extents(struct btrfs_root *root); +static void btrfs_destroy_ordered_operations(struct btrfs_root *root); +static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); -static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); -static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root); +static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); static int btrfs_destroy_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); static int btrfs_destroy_pinned_extent(struct btrfs_root *root, struct extent_io_tree *pinned_extents); -static int btrfs_cleanup_transaction(struct btrfs_root *root); /* * end_io_wq structs are used to do processing in task context when an IO is @@ -99,6 +98,7 @@ struct async_submit_bio { */ u64 bio_offset; struct btrfs_work work; + int error; }; /* @@ -332,8 +332,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, return 0; lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, - 0, &cached_state, GFP_NOFS); - if (extent_buffer_uptodate(io_tree, eb, cached_state) && + 0, &cached_state); + if (extent_buffer_uptodate(eb) && btrfs_header_generation(eb) == parent_transid) { ret = 0; goto out; @@ -344,7 +344,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, (unsigned long long)parent_transid, (unsigned long long)btrfs_header_generation(eb)); ret = 1; - clear_extent_buffer_uptodate(io_tree, eb, &cached_state); + clear_extent_buffer_uptodate(eb); out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state, GFP_NOFS); @@ -360,9 +360,11 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, u64 start, u64 parent_transid) { struct extent_io_tree *io_tree; + int failed = 0; int ret; int num_copies = 0; int mirror_num = 0; + int failed_mirror = 0; clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; @@ -370,9 +372,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, ret = read_extent_buffer_pages(io_tree, eb, start, WAIT_COMPLETE, btree_get_extent, mirror_num); - if (!ret && - !verify_parent_transid(io_tree, eb, parent_transid)) - return ret; + if (!ret && !verify_parent_transid(io_tree, eb, parent_transid)) + break; /* * This buffer's crc is fine, but its contents are corrupted, so @@ -380,18 +381,31 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, * any less wrong. */ if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) - return ret; + break; + + if (!failed_mirror) { + failed = 1; + printk(KERN_ERR "failed mirror was %d\n", eb->failed_mirror); + failed_mirror = eb->failed_mirror; + } num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, eb->start, eb->len); if (num_copies == 1) - return ret; + break; mirror_num++; + if (mirror_num == failed_mirror) + mirror_num++; + if (mirror_num > num_copies) - return ret; + break; } - return -EIO; + + if (failed && !ret) + repair_eb_io_failure(root, eb, failed_mirror); + + return ret; } /* @@ -404,50 +418,27 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) struct extent_io_tree *tree; u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 found_start; - unsigned long len; struct extent_buffer *eb; - int ret; tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) { - WARN_ON(1); - goto out; - } - if (!page->private) { - WARN_ON(1); - goto out; - } - len = page->private >> 2; - WARN_ON(len == 0); - - eb = alloc_extent_buffer(tree, start, len, page); - if (eb == NULL) { - WARN_ON(1); - goto out; - } - ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, - btrfs_header_generation(eb)); - BUG_ON(ret); - WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)); - + eb = (struct extent_buffer *)page->private; + if (page != eb->pages[0]) + return 0; found_start = btrfs_header_bytenr(eb); if (found_start != start) { WARN_ON(1); - goto err; + return 0; } - if (eb->first_page != page) { + if (eb->pages[0] != page) { WARN_ON(1); - goto err; + return 0; } if (!PageUptodate(page)) { WARN_ON(1); - goto err; + return 0; } csum_tree_block(root, eb, 0); -err: - free_extent_buffer(eb); -out: return 0; } @@ -537,34 +528,74 @@ static noinline int check_leaf(struct btrfs_root *root, return 0; } +struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree, + struct page *page, int max_walk) +{ + struct extent_buffer *eb; + u64 start = page_offset(page); + u64 target = start; + u64 min_start; + + if (start < max_walk) + min_start = 0; + else + min_start = start - max_walk; + + while (start >= min_start) { + eb = find_extent_buffer(tree, start, 0); + if (eb) { + /* + * we found an extent buffer and it contains our page + * horray! + */ + if (eb->start <= target && + eb->start + eb->len > target) + return eb; + + /* we found an extent buffer that wasn't for us */ + free_extent_buffer(eb); + return NULL; + } + if (start == 0) + break; + start -= PAGE_CACHE_SIZE; + } + return NULL; +} + static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) { struct extent_io_tree *tree; u64 found_start; int found_level; - unsigned long len; struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; int ret = 0; + int reads_done; - tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) - goto out; if (!page->private) goto out; - len = page->private >> 2; - WARN_ON(len == 0); + tree = &BTRFS_I(page->mapping->host)->io_tree; + eb = (struct extent_buffer *)page->private; - eb = alloc_extent_buffer(tree, start, len, page); - if (eb == NULL) { + /* the pending IO might have been the only thing that kept this buffer + * in memory. Make sure we have a ref for all this other checks + */ + extent_buffer_get(eb); + + reads_done = atomic_dec_and_test(&eb->io_pages); + if (!reads_done) + goto err; + + if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { ret = -EIO; - goto out; + goto err; } found_start = btrfs_header_bytenr(eb); - if (found_start != start) { + if (found_start != eb->start) { printk_ratelimited(KERN_INFO "btrfs bad tree block start " "%llu %llu\n", (unsigned long long)found_start, @@ -572,13 +603,6 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, ret = -EIO; goto err; } - if (eb->first_page != page) { - printk(KERN_INFO "btrfs bad first page %lu %lu\n", - eb->first_page->index, page->index); - WARN_ON(1); - ret = -EIO; - goto err; - } if (check_tree_block_fsid(root, eb)) { printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n", (unsigned long long)eb->start); @@ -606,48 +630,31 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, ret = -EIO; } - end = min_t(u64, eb->len, PAGE_CACHE_SIZE); - end = eb->start + end - 1; + if (!ret) + set_extent_buffer_uptodate(eb); err: if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); btree_readahead_hook(root, eb, eb->start, ret); } + if (ret) + clear_extent_buffer_uptodate(eb); free_extent_buffer(eb); out: return ret; } -static int btree_io_failed_hook(struct bio *failed_bio, - struct page *page, u64 start, u64 end, - int mirror_num, struct extent_state *state) +static int btree_io_failed_hook(struct page *page, int failed_mirror) { - struct extent_io_tree *tree; - unsigned long len; struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) - goto out; - if (!page->private) - goto out; - - len = page->private >> 2; - WARN_ON(len == 0); - - eb = alloc_extent_buffer(tree, start, len, page); - if (eb == NULL) - goto out; - - if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { - clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); + eb = (struct extent_buffer *)page->private; + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + eb->failed_mirror = failed_mirror; + if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) btree_readahead_hook(root, eb, eb->start, -EIO); - } - free_extent_buffer(eb); - -out: return -EIO; /* we fixed nothing */ } @@ -719,11 +726,14 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; + int ret; async = container_of(work, struct async_submit_bio, work); - async->submit_bio_start(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags, - async->bio_offset); + ret = async->submit_bio_start(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags, + async->bio_offset); + if (ret) + async->error = ret; } static void run_one_async_done(struct btrfs_work *work) @@ -744,6 +754,12 @@ static void run_one_async_done(struct btrfs_work *work) waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); + /* If an error occured we just want to clean up the bio and move on */ + if (async->error) { + bio_endio(async->bio, async->error); + return; + } + async->submit_bio_done(async->inode, async->rw, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); @@ -785,6 +801,8 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->bio_flags = bio_flags; async->bio_offset = bio_offset; + async->error = 0; + atomic_inc(&fs_info->nr_async_submits); if (rw & REQ_SYNC) @@ -806,15 +824,18 @@ static int btree_csum_one_bio(struct bio *bio) struct bio_vec *bvec = bio->bi_io_vec; int bio_index = 0; struct btrfs_root *root; + int ret = 0; WARN_ON(bio->bi_vcnt <= 0); while (bio_index < bio->bi_vcnt) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; - csum_dirty_buffer(root, bvec->bv_page); + ret = csum_dirty_buffer(root, bvec->bv_page); + if (ret) + break; bio_index++; bvec++; } - return 0; + return ret; } static int __btree_submit_bio_start(struct inode *inode, int rw, @@ -826,8 +847,7 @@ static int __btree_submit_bio_start(struct inode *inode, int rw, * when we're called for a write, we're already in the async * submission context. Just jump into btrfs_map_bio */ - btree_csum_one_bio(bio); - return 0; + return btree_csum_one_bio(bio); } static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, @@ -847,15 +867,16 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, { int ret; - ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, - bio, 1); - BUG_ON(ret); - if (!(rw & REQ_WRITE)) { + /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads */ + ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, + bio, 1); + if (ret) + return ret; return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 0); } @@ -893,34 +914,6 @@ static int btree_migratepage(struct address_space *mapping, } #endif -static int btree_writepage(struct page *page, struct writeback_control *wbc) -{ - struct extent_io_tree *tree; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct extent_buffer *eb; - int was_dirty; - - tree = &BTRFS_I(page->mapping->host)->io_tree; - if (!(current->flags & PF_MEMALLOC)) { - return extent_write_full_page(tree, page, - btree_get_extent, wbc); - } - - redirty_page_for_writepage(wbc, page); - eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE); - WARN_ON(!eb); - - was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - if (!was_dirty) { - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE; - spin_unlock(&root->fs_info->delalloc_lock); - } - free_extent_buffer(eb); - - unlock_page(page); - return 0; -} static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) @@ -940,7 +933,7 @@ static int btree_writepages(struct address_space *mapping, if (num_dirty < thresh) return 0; } - return extent_writepages(tree, mapping, btree_get_extent, wbc); + return btree_write_cache_pages(mapping, wbc); } static int btree_readpage(struct file *file, struct page *page) @@ -952,16 +945,8 @@ static int btree_readpage(struct file *file, struct page *page) static int btree_releasepage(struct page *page, gfp_t gfp_flags) { - struct extent_io_tree *tree; - struct extent_map_tree *map; - int ret; - if (PageWriteback(page) || PageDirty(page)) return 0; - - tree = &BTRFS_I(page->mapping->host)->io_tree; - map = &BTRFS_I(page->mapping->host)->extent_tree; - /* * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing * slab allocation from alloc_extent_state down the callchain where @@ -969,18 +954,7 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) */ gfp_flags &= ~GFP_SLAB_BUG_MASK; - ret = try_release_extent_state(map, tree, page, gfp_flags); - if (!ret) - return 0; - - ret = try_release_extent_buffer(tree, page); - if (ret == 1) { - ClearPagePrivate(page); - set_page_private(page, 0); - page_cache_release(page); - } - - return ret; + return try_release_extent_buffer(page, gfp_flags); } static void btree_invalidatepage(struct page *page, unsigned long offset) @@ -998,15 +972,28 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) } } +static int btree_set_page_dirty(struct page *page) +{ + struct extent_buffer *eb; + + BUG_ON(!PagePrivate(page)); + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(!atomic_read(&eb->refs)); + btrfs_assert_tree_locked(eb); + return __set_page_dirty_nobuffers(page); +} + static const struct address_space_operations btree_aops = { .readpage = btree_readpage, - .writepage = btree_writepage, .writepages = btree_writepages, .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, #ifdef CONFIG_MIGRATION .migratepage = btree_migratepage, #endif + .set_page_dirty = btree_set_page_dirty, }; int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -1049,7 +1036,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { free_extent_buffer(buf); return -EIO; - } else if (extent_buffer_uptodate(io_tree, buf, NULL)) { + } else if (extent_buffer_uptodate(buf)) { *eb = buf; } else { free_extent_buffer(buf); @@ -1074,20 +1061,20 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, struct extent_buffer *eb; eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, - bytenr, blocksize, NULL); + bytenr, blocksize); return eb; } int btrfs_write_tree_block(struct extent_buffer *buf) { - return filemap_fdatawrite_range(buf->first_page->mapping, buf->start, + return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, buf->start + buf->len - 1); } int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { - return filemap_fdatawait_range(buf->first_page->mapping, + return filemap_fdatawait_range(buf->pages[0]->mapping, buf->start, buf->start + buf->len - 1); } @@ -1102,17 +1089,13 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, return NULL; ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); - - if (ret == 0) - set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); return buf; } -int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf) +void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf) { - struct inode *btree_inode = root->fs_info->btree_inode; if (btrfs_header_generation(buf) == root->fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); @@ -1121,23 +1104,27 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, spin_lock(&root->fs_info->delalloc_lock); if (root->fs_info->dirty_metadata_bytes >= buf->len) root->fs_info->dirty_metadata_bytes -= buf->len; - else - WARN_ON(1); + else { + spin_unlock(&root->fs_info->delalloc_lock); + btrfs_panic(root->fs_info, -EOVERFLOW, + "Can't clear %lu bytes from " + " dirty_mdatadata_bytes (%lu)", + buf->len, + root->fs_info->dirty_metadata_bytes); + } spin_unlock(&root->fs_info->delalloc_lock); } /* ugh, clear_extent_buffer_dirty needs to lock the page */ btrfs_set_lock_blocking(buf); - clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, - buf); + clear_extent_buffer_dirty(buf); } - return 0; } -static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, - u32 stripesize, struct btrfs_root *root, - struct btrfs_fs_info *fs_info, - u64 objectid) +static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + u32 stripesize, struct btrfs_root *root, + struct btrfs_fs_info *fs_info, + u64 objectid) { root->node = NULL; root->commit_root = NULL; @@ -1189,13 +1176,12 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->defrag_running = 0; root->root_key.objectid = objectid; root->anon_dev = 0; - return 0; } -static int find_and_setup_root(struct btrfs_root *tree_root, - struct btrfs_fs_info *fs_info, - u64 objectid, - struct btrfs_root *root) +static int __must_check find_and_setup_root(struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info, + u64 objectid, + struct btrfs_root *root) { int ret; u32 blocksize; @@ -1208,7 +1194,8 @@ static int find_and_setup_root(struct btrfs_root *tree_root, &root->root_item, &root->root_key); if (ret > 0) return -ENOENT; - BUG_ON(ret); + else if (ret < 0) + return ret; generation = btrfs_root_generation(&root->root_item); blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); @@ -1377,7 +1364,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); root->commit_root = btrfs_root_node(root); - BUG_ON(!root->node); + BUG_ON(!root->node); /* -ENOMEM */ out: if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { root->ref_cows = 1; @@ -1513,41 +1500,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) return 0; } -static int bio_ready_for_csum(struct bio *bio) -{ - u64 length = 0; - u64 buf_len = 0; - u64 start = 0; - struct page *page; - struct extent_io_tree *io_tree = NULL; - struct bio_vec *bvec; - int i; - int ret; - - bio_for_each_segment(bvec, bio, i) { - page = bvec->bv_page; - if (page->private == EXTENT_PAGE_PRIVATE) { - length += bvec->bv_len; - continue; - } - if (!page->private) { - length += bvec->bv_len; - continue; - } - length = bvec->bv_len; - buf_len = page->private >> 2; - start = page_offset(page) + bvec->bv_offset; - io_tree = &BTRFS_I(page->mapping->host)->io_tree; - } - /* are we fully contained in this bio? */ - if (buf_len <= length) - return 1; - - ret = extent_range_uptodate(io_tree, start + length, - start + buf_len - 1); - return ret; -} - /* * called by the kthread helper functions to finally call the bio end_io * functions. This is where read checksum verification actually happens @@ -1563,17 +1515,6 @@ static void end_workqueue_fn(struct btrfs_work *work) bio = end_io_wq->bio; fs_info = end_io_wq->info; - /* metadata bio reads are special because the whole tree block must - * be checksummed at once. This makes sure the entire block is in - * ram and up to date before trying to verify things. For - * blocksize <= pagesize, it is basically a noop - */ - if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata && - !bio_ready_for_csum(bio)) { - btrfs_queue_worker(&fs_info->endio_meta_workers, - &end_io_wq->work); - return; - } error = end_io_wq->error; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; @@ -1614,9 +1555,10 @@ static int transaction_kthread(void *arg) u64 transid; unsigned long now; unsigned long delay; - int ret; + bool cannot_commit; do { + cannot_commit = false; delay = HZ * 30; vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); @@ -1638,11 +1580,14 @@ static int transaction_kthread(void *arg) transid = cur->transid; spin_unlock(&root->fs_info->trans_lock); + /* If the file system is aborted, this will always fail. */ trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + cannot_commit = true; + goto sleep; + } if (transid == trans->transid) { - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); + btrfs_commit_transaction(trans, root); } else { btrfs_end_transaction(trans, root); } @@ -1653,7 +1598,8 @@ sleep: if (!try_to_freeze()) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop() && - !btrfs_transaction_blocked(root->fs_info)) + (!btrfs_transaction_blocked(root->fs_info) || + cannot_commit)) schedule_timeout(delay); __set_current_state(TASK_RUNNING); } @@ -2042,6 +1988,7 @@ int open_ctree(struct super_block *sb, RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, fs_info->btree_inode->i_mapping); + BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0; extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; @@ -2084,6 +2031,7 @@ int open_ctree(struct super_block *sb, __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); + invalidate_bdev(fs_devices->latest_bdev); bh = btrfs_read_dev_super(fs_devices->latest_bdev); if (!bh) { err = -EINVAL; @@ -2104,7 +2052,12 @@ int open_ctree(struct super_block *sb, /* check FS state, whether FS is broken. */ fs_info->fs_state |= btrfs_super_flags(disk_super); - btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + if (ret) { + printk(KERN_ERR "btrfs: superblock contains fatal errors\n"); + err = ret; + goto fail_alloc; + } /* * run through our array of backup supers and setup @@ -2135,10 +2088,55 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } + if (btrfs_super_leafsize(disk_super) != + btrfs_super_nodesize(disk_super)) { + printk(KERN_ERR "BTRFS: couldn't mount because metadata " + "blocksizes don't match. node %d leaf %d\n", + btrfs_super_nodesize(disk_super), + btrfs_super_leafsize(disk_super)); + err = -EINVAL; + goto fail_alloc; + } + if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { + printk(KERN_ERR "BTRFS: couldn't mount because metadata " + "blocksize (%d) was too large\n", + btrfs_super_leafsize(disk_super)); + err = -EINVAL; + goto fail_alloc; + } + features = btrfs_super_incompat_flags(disk_super); features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + + /* + * flag our filesystem as having big metadata blocks if + * they are bigger than the page size + */ + if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) { + if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) + printk(KERN_INFO "btrfs flagging fs with big metadata feature\n"); + features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; + } + + nodesize = btrfs_super_nodesize(disk_super); + leafsize = btrfs_super_leafsize(disk_super); + sectorsize = btrfs_super_sectorsize(disk_super); + stripesize = btrfs_super_stripesize(disk_super); + + /* + * mixed block groups end up with duplicate but slightly offset + * extent buffers for the same range. It leads to corruptions + */ + if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && + (sectorsize != leafsize)) { + printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes " + "are not allowed for mixed block groups on %s\n", + sb->s_id); + goto fail_alloc; + } + btrfs_set_super_incompat_flags(disk_super, features); features = btrfs_super_compat_ro_flags(disk_super) & @@ -2242,10 +2240,6 @@ int open_ctree(struct super_block *sb, fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 4 * 1024 * 1024 / PAGE_CACHE_SIZE); - nodesize = btrfs_super_nodesize(disk_super); - leafsize = btrfs_super_leafsize(disk_super); - sectorsize = btrfs_super_sectorsize(disk_super); - stripesize = btrfs_super_stripesize(disk_super); tree_root->nodesize = nodesize; tree_root->leafsize = leafsize; tree_root->sectorsize = sectorsize; @@ -2285,7 +2279,7 @@ int open_ctree(struct super_block *sb, chunk_root->node = read_tree_block(chunk_root, btrfs_super_chunk_root(disk_super), blocksize, generation); - BUG_ON(!chunk_root->node); + BUG_ON(!chunk_root->node); /* -ENOMEM */ if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", sb->s_id); @@ -2425,21 +2419,31 @@ retry_root_backup: log_tree_root->node = read_tree_block(tree_root, bytenr, blocksize, generation + 1); + /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); - BUG_ON(ret); + if (ret) { + btrfs_error(tree_root->fs_info, ret, + "Failed to recover log tree"); + free_extent_buffer(log_tree_root->node); + kfree(log_tree_root); + goto fail_trans_kthread; + } if (sb->s_flags & MS_RDONLY) { - ret = btrfs_commit_super(tree_root); - BUG_ON(ret); + ret = btrfs_commit_super(tree_root); + if (ret) + goto fail_trans_kthread; } } ret = btrfs_find_orphan_roots(tree_root); - BUG_ON(ret); + if (ret) + goto fail_trans_kthread; if (!(sb->s_flags & MS_RDONLY)) { ret = btrfs_cleanup_fs_roots(fs_info); - BUG_ON(ret); + if (ret) { + } ret = btrfs_recover_relocation(tree_root); if (ret < 0) { @@ -2859,6 +2863,8 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) if (total_errors > max_errors) { printk(KERN_ERR "btrfs: %d errors while writing supers\n", total_errors); + + /* This shouldn't happen. FUA is masked off if unsupported */ BUG(); } @@ -2875,9 +2881,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) } mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { - printk(KERN_ERR "btrfs: %d errors while writing supers\n", - total_errors); - BUG(); + btrfs_error(root->fs_info, -EIO, + "%d errors while writing supers", total_errors); + return -EIO; } return 0; } @@ -2891,7 +2897,20 @@ int write_ctree_super(struct btrfs_trans_handle *trans, return ret; } -int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +/* Kill all outstanding I/O */ +void btrfs_abort_devices(struct btrfs_root *root) +{ + struct list_head *head; + struct btrfs_device *dev; + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + head = &root->fs_info->fs_devices->devices; + list_for_each_entry_rcu(dev, head, dev_list) { + blk_abort_queue(dev->bdev->bd_disk->queue); + } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); +} + +void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, @@ -2904,7 +2923,6 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) __btrfs_remove_free_space_cache(root->free_ino_pinned); __btrfs_remove_free_space_cache(root->free_ino_ctl); free_fs_root(root); - return 0; } static void free_fs_root(struct btrfs_root *root) @@ -2921,7 +2939,7 @@ static void free_fs_root(struct btrfs_root *root) kfree(root); } -static int del_fs_roots(struct btrfs_fs_info *fs_info) +static void del_fs_roots(struct btrfs_fs_info *fs_info) { int ret; struct btrfs_root *gang[8]; @@ -2950,7 +2968,6 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info) for (i = 0; i < ret; i++) btrfs_free_fs_root(fs_info, gang[i]); } - return 0; } int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) @@ -2999,14 +3016,21 @@ int btrfs_commit_super(struct btrfs_root *root) if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); + if (ret) + return ret; /* run commit again to drop the original snapshot */ trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; ret = btrfs_write_and_wait_transaction(NULL, root); - BUG_ON(ret); + if (ret) { + btrfs_error(root->fs_info, ret, + "Failed to sync btree inode to disk."); + return ret; + } ret = write_ctree_super(NULL, root, 0); return ret; @@ -3122,10 +3146,9 @@ int close_ctree(struct btrfs_root *root) int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) { int ret; - struct inode *btree_inode = buf->first_page->mapping->host; + struct inode *btree_inode = buf->pages[0]->mapping->host; - ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf, - NULL); + ret = extent_buffer_uptodate(buf); if (!ret) return ret; @@ -3136,16 +3159,13 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) int btrfs_set_buffer_uptodate(struct extent_buffer *buf) { - struct inode *btree_inode = buf->first_page->mapping->host; - return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, - buf); + return set_extent_buffer_uptodate(buf); } void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { - struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; u64 transid = btrfs_header_generation(buf); - struct inode *btree_inode = root->fs_info->btree_inode; int was_dirty; btrfs_assert_tree_locked(buf); @@ -3157,8 +3177,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) (unsigned long long)root->fs_info->generation); WARN_ON(1); } - was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, - buf); + was_dirty = set_extent_buffer_dirty(buf); if (!was_dirty) { spin_lock(&root->fs_info->delalloc_lock); root->fs_info->dirty_metadata_bytes += buf->len; @@ -3212,12 +3231,8 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) { - struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; - int ret; - ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); - if (ret == 0) - set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); - return ret; + struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; + return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); } static int btree_lock_page_hook(struct page *page, void *data, @@ -3225,17 +3240,21 @@ static int btree_lock_page_hook(struct page *page, void *data, { struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_buffer *eb; - unsigned long len; - u64 bytenr = page_offset(page); - if (page->private == EXTENT_PAGE_PRIVATE) + /* + * We culled this eb but the page is still hanging out on the mapping, + * carry on. + */ + if (!PagePrivate(page)) goto out; - len = page->private >> 2; - eb = find_extent_buffer(io_tree, bytenr, len); - if (!eb) + eb = (struct extent_buffer *)page->private; + if (!eb) { + WARN_ON(1); + goto out; + } + if (page != eb->pages[0]) goto out; if (!btrfs_try_tree_write_lock(eb)) { @@ -3254,7 +3273,6 @@ static int btree_lock_page_hook(struct page *page, void *data, } btrfs_tree_unlock(eb); - free_extent_buffer(eb); out: if (!trylock_page(page)) { flush_fn(data); @@ -3263,15 +3281,23 @@ out: return 0; } -static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, +static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only) { + if (btrfs_super_csum_type(fs_info->super_copy) >= ARRAY_SIZE(btrfs_csum_sizes)) { + printk(KERN_ERR "btrfs: unsupported checksum algorithm\n"); + return -EINVAL; + } + if (read_only) - return; + return 0; - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { printk(KERN_WARNING "warning: mount fs with errors, " "running btrfsck is recommended\n"); + } + + return 0; } int btrfs_error_commit_super(struct btrfs_root *root) @@ -3293,7 +3319,7 @@ int btrfs_error_commit_super(struct btrfs_root *root) return ret; } -static int btrfs_destroy_ordered_operations(struct btrfs_root *root) +static void btrfs_destroy_ordered_operations(struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; struct list_head splice; @@ -3315,11 +3341,9 @@ static int btrfs_destroy_ordered_operations(struct btrfs_root *root) spin_unlock(&root->fs_info->ordered_extent_lock); mutex_unlock(&root->fs_info->ordered_operations_mutex); - - return 0; } -static int btrfs_destroy_ordered_extents(struct btrfs_root *root) +static void btrfs_destroy_ordered_extents(struct btrfs_root *root) { struct list_head splice; struct btrfs_ordered_extent *ordered; @@ -3351,12 +3375,10 @@ static int btrfs_destroy_ordered_extents(struct btrfs_root *root) } spin_unlock(&root->fs_info->ordered_extent_lock); - - return 0; } -static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_root *root) +int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root) { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; @@ -3365,6 +3387,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, delayed_refs = &trans->delayed_refs; +again: spin_lock(&delayed_refs->lock); if (delayed_refs->num_entries == 0) { spin_unlock(&delayed_refs->lock); @@ -3386,6 +3409,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_delayed_ref_head *head; head = btrfs_delayed_node_to_head(ref); + spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); kfree(head->extent_op); delayed_refs->num_heads--; @@ -3393,8 +3417,9 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, delayed_refs->num_heads_ready--; list_del_init(&head->cluster); mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(ref); + goto again; } - spin_unlock(&delayed_refs->lock); btrfs_put_delayed_ref(ref); @@ -3407,7 +3432,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, return ret; } -static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) { struct btrfs_pending_snapshot *snapshot; struct list_head splice; @@ -3425,11 +3450,9 @@ static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) kfree(snapshot); } - - return 0; } -static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root) +static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; struct list_head splice; @@ -3449,8 +3472,6 @@ static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root) } spin_unlock(&root->fs_info->delalloc_lock); - - return 0; } static int btrfs_destroy_marked_extents(struct btrfs_root *root, @@ -3541,13 +3562,43 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root, return 0; } -static int btrfs_cleanup_transaction(struct btrfs_root *root) +void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, + struct btrfs_root *root) +{ + btrfs_destroy_delayed_refs(cur_trans, root); + btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, + cur_trans->dirty_pages.dirty_bytes); + + /* FIXME: cleanup wait for commit */ + cur_trans->in_commit = 1; + cur_trans->blocked = 1; + if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) + wake_up(&root->fs_info->transaction_blocked_wait); + + cur_trans->blocked = 0; + if (waitqueue_active(&root->fs_info->transaction_wait)) + wake_up(&root->fs_info->transaction_wait); + + cur_trans->commit_done = 1; + if (waitqueue_active(&cur_trans->commit_wait)) + wake_up(&cur_trans->commit_wait); + + btrfs_destroy_pending_snapshots(cur_trans); + + btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages, + EXTENT_DIRTY); + + /* + memset(cur_trans, 0, sizeof(*cur_trans)); + kmem_cache_free(btrfs_transaction_cachep, cur_trans); + */ +} + +int btrfs_cleanup_transaction(struct btrfs_root *root) { struct btrfs_transaction *t; LIST_HEAD(list); - WARN_ON(1); - mutex_lock(&root->fs_info->transaction_kthread_mutex); spin_lock(&root->fs_info->trans_lock); @@ -3612,6 +3663,17 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) return 0; } +static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state) +{ + struct super_block *sb = page->mapping->host->i_sb; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + btrfs_error(fs_info, -EIO, + "Error occured while writing out btree at %llu", start); + return -EIO; +} + static struct extent_io_ops btree_extent_io_ops = { .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, @@ -3619,4 +3681,5 @@ static struct extent_io_ops btree_extent_io_ops = { .submit_bio_hook = btree_submit_bio_hook, /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, + .writepage_io_failed_hook = btree_writepage_io_failed_hook, }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index e4bc4741319..a7ace1a2dd1 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -44,8 +44,8 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); -int clean_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf); +void clean_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf); int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); @@ -64,7 +64,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); -int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); +void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); int btrfs_set_buffer_uptodate(struct extent_buffer *buf); @@ -85,6 +85,10 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_cleanup_transaction(struct btrfs_root *root); +void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, + struct btrfs_root *root); +void btrfs_abort_devices(struct btrfs_root *root); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 5f77166fd01..e887ee62b6d 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -193,7 +193,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child) if (ret < 0) goto fail; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Key with offset of -1 found */ if (path->slots[0] == 0) { ret = -ENOENT; goto fail; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 37e0a800d34..a84420491c1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -245,7 +245,7 @@ static int exclude_super_stripes(struct btrfs_root *root, cache->bytes_super += stripe_len; ret = add_excluded_extent(root, cache->key.objectid, stripe_len); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -253,13 +253,13 @@ static int exclude_super_stripes(struct btrfs_root *root, ret = btrfs_rmap_block(&root->fs_info->mapping_tree, cache->key.objectid, bytenr, 0, &logical, &nr, &stripe_len); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ while (nr--) { cache->bytes_super += stripe_len; ret = add_excluded_extent(root, logical[nr], stripe_len); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } kfree(logical); @@ -321,7 +321,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, total_added += size; ret = btrfs_add_free_space(block_group, start, size); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or logic error */ start = extent_end + 1; } else { break; @@ -332,7 +332,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, size = end - start; total_added += size; ret = btrfs_add_free_space(block_group, start, size); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or logic error */ } return total_added; @@ -474,7 +474,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, int ret = 0; caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); - BUG_ON(!caching_ctl); + if (!caching_ctl) + return -ENOMEM; INIT_LIST_HEAD(&caching_ctl->list); mutex_init(&caching_ctl->mutex); @@ -982,7 +983,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans, ret = btrfs_next_leaf(root, path); if (ret < 0) return ret; - BUG_ON(ret > 0); + BUG_ON(ret > 0); /* Corruption */ leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, @@ -1008,9 +1009,9 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans, new_size + extra_size, 1); if (ret < 0) return ret; - BUG_ON(ret); + BUG_ON(ret); /* Corruption */ - ret = btrfs_extend_item(trans, root, path, new_size); + btrfs_extend_item(trans, root, path, new_size); leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -1478,7 +1479,11 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, err = ret; goto out; } - BUG_ON(ret); + if (ret && !insert) { + err = -ENOENT; + goto out; + } + BUG_ON(ret); /* Corruption */ leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); @@ -1592,13 +1597,13 @@ out: * helper to add new inline back ref */ static noinline_for_stack -int setup_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add, - struct btrfs_delayed_extent_op *extent_op) +void setup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -1608,7 +1613,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans, u64 refs; int size; int type; - int ret; leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -1617,7 +1621,7 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans, type = extent_ref_type(parent, owner); size = btrfs_extent_inline_ref_size(type); - ret = btrfs_extend_item(trans, root, path, size); + btrfs_extend_item(trans, root, path, size); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); @@ -1652,7 +1656,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans, btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } btrfs_mark_buffer_dirty(leaf); - return 0; } static int lookup_extent_backref(struct btrfs_trans_handle *trans, @@ -1687,12 +1690,12 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, * helper to update/remove inline back ref */ static noinline_for_stack -int update_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op) +void update_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_mod, + struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -1703,7 +1706,6 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans, u32 item_size; int size; int type; - int ret; u64 refs; leaf = path->nodes[0]; @@ -1745,10 +1747,9 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans, memmove_extent_buffer(leaf, ptr, ptr + size, end - ptr - size); item_size -= size; - ret = btrfs_truncate_item(trans, root, path, item_size, 1); + btrfs_truncate_item(trans, root, path, item_size, 1); } btrfs_mark_buffer_dirty(leaf); - return 0; } static noinline_for_stack @@ -1768,13 +1769,13 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, root_objectid, owner, offset, 1); if (ret == 0) { BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); - ret = update_inline_extent_backref(trans, root, path, iref, - refs_to_add, extent_op); + update_inline_extent_backref(trans, root, path, iref, + refs_to_add, extent_op); } else if (ret == -ENOENT) { - ret = setup_inline_extent_backref(trans, root, path, iref, - parent, root_objectid, - owner, offset, refs_to_add, - extent_op); + setup_inline_extent_backref(trans, root, path, iref, parent, + root_objectid, owner, offset, + refs_to_add, extent_op); + ret = 0; } return ret; } @@ -1804,12 +1805,12 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_extent_inline_ref *iref, int refs_to_drop, int is_data) { - int ret; + int ret = 0; BUG_ON(!is_data && refs_to_drop != 1); if (iref) { - ret = update_inline_extent_backref(trans, root, path, iref, - -refs_to_drop, NULL); + update_inline_extent_backref(trans, root, path, iref, + -refs_to_drop, NULL); } else if (is_data) { ret = remove_extent_data_ref(trans, root, path, refs_to_drop); } else { @@ -1835,6 +1836,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, bytenr, &num_bytes, &bbio, 0); + /* Error condition is -ENOMEM */ if (!ret) { struct btrfs_bio_stripe *stripe = bbio->stripes; int i; @@ -1850,7 +1852,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, if (!ret) discarded_bytes += stripe->length; else if (ret != -EOPNOTSUPP) - break; + break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ /* * Just in case we get back EOPNOTSUPP for some reason, @@ -1869,6 +1871,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, return ret; } +/* Can return -ENOMEM */ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -1944,7 +1947,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ret = insert_extent_backref(trans, root->fs_info->extent_root, path, bytenr, parent, root_objectid, owner, offset, refs_to_add); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, root, ret); out: btrfs_free_path(path); return err; @@ -2031,6 +2035,9 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, int ret; int err = 0; + if (trans->aborted) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2128,7 +2135,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op, int insert_reserved) { - int ret; + int ret = 0; + + if (trans->aborted) + return 0; + if (btrfs_delayed_ref_is_head(node)) { struct btrfs_delayed_ref_head *head; /* @@ -2146,11 +2157,10 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, ret = btrfs_del_csums(trans, root, node->bytenr, node->num_bytes); - BUG_ON(ret); } } mutex_unlock(&head->mutex); - return 0; + return ret; } if (node->type == BTRFS_TREE_BLOCK_REF_KEY || @@ -2197,6 +2207,10 @@ again: return NULL; } +/* + * Returns 0 on success or if called with an already aborted transaction. + * Returns -ENOMEM or -EIO on failure and will abort the transaction. + */ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct list_head *cluster) @@ -2285,9 +2299,13 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ret = run_delayed_extent_op(trans, root, ref, extent_op); - BUG_ON(ret); kfree(extent_op); + if (ret) { + printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); + return ret; + } + goto next; } @@ -2308,11 +2326,16 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ret = run_one_delayed_ref(trans, root, ref, extent_op, must_insert_reserved); - BUG_ON(ret); btrfs_put_delayed_ref(ref); kfree(extent_op); count++; + + if (ret) { + printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); + return ret; + } + next: do_chunk_alloc(trans, root->fs_info->extent_root, 2 * 1024 * 1024, @@ -2347,6 +2370,9 @@ static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, * 0, which means to process everything in the tree at the start * of the run (but not newly added entries), or it can be some target * number you'd like to process. + * + * Returns 0 on success or if called with an aborted transaction + * Returns <0 on error and aborts the transaction */ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count) @@ -2362,6 +2388,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long num_refs = 0; int consider_waiting; + /* We'll clean this up in btrfs_cleanup_transaction */ + if (trans->aborted) + return 0; + if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; @@ -2419,7 +2449,11 @@ again: } ret = run_clustered_refs(trans, root, &cluster); - BUG_ON(ret < 0); + if (ret < 0) { + spin_unlock(&delayed_refs->lock); + btrfs_abort_transaction(trans, root, ret); + return ret; + } count -= min_t(unsigned long, ret, count); @@ -2584,7 +2618,7 @@ static noinline int check_committed_ref(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ ret = -ENOENT; if (path->slots[0] == 0) @@ -2738,7 +2772,6 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, } return 0; fail: - BUG(); return ret; } @@ -2767,7 +2800,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); if (ret < 0) goto fail; - BUG_ON(ret); + BUG_ON(ret); /* Corruption */ leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -2775,8 +2808,10 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); fail: - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); return ret; + } return 0; } @@ -2949,7 +2984,8 @@ again: if (last == 0) { err = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(err); + if (err) /* File system offline */ + goto out; } cache = btrfs_lookup_first_block_group(root->fs_info, last); @@ -2976,7 +3012,9 @@ again: last = cache->key.objectid + cache->key.offset; err = write_one_cache_group(trans, root, path, cache); - BUG_ON(err); + if (err) /* File system offline */ + goto out; + btrfs_put_block_group(cache); } @@ -2989,7 +3027,8 @@ again: if (last == 0) { err = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(err); + if (err) /* File system offline */ + goto out; } cache = btrfs_lookup_first_block_group(root->fs_info, last); @@ -3014,20 +3053,21 @@ again: continue; } - btrfs_write_out_cache(root, trans, cache, path); + err = btrfs_write_out_cache(root, trans, cache, path); /* * If we didn't have an error then the cache state is still * NEED_WRITE, so we can set it to WRITTEN. */ - if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) + if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) cache->disk_cache_state = BTRFS_DC_WRITTEN; last = cache->key.objectid + cache->key.offset; btrfs_put_block_group(cache); } +out: btrfs_free_path(path); - return 0; + return err; } int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) @@ -3098,11 +3138,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { - u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; - - /* chunk -> extended profile */ - if (extra_flags == 0) - extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits |= extra_flags; @@ -3113,6 +3150,35 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } /* + * returns target flags in extended format or 0 if restripe for this + * chunk_type is not in progress + */ +static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + u64 target = 0; + + BUG_ON(!mutex_is_locked(&fs_info->volume_mutex) && + !spin_is_locked(&fs_info->balance_lock)); + + if (!bctl) + return 0; + + if (flags & BTRFS_BLOCK_GROUP_DATA && + bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && + bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if (flags & BTRFS_BLOCK_GROUP_METADATA && + bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + return target; +} + +/* * @flags: available profiles in extended format (see ctree.h) * * Returns reduced profile in chunk format. If profile changing is in @@ -3128,31 +3194,19 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) */ u64 num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; + u64 target; - /* pick restriper's target profile if it's available */ + /* + * see if restripe for this chunk_type is in progress, if so + * try to reduce to the target profile + */ spin_lock(&root->fs_info->balance_lock); - if (root->fs_info->balance_ctl) { - struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; - u64 tgt = 0; - - if ((flags & BTRFS_BLOCK_GROUP_DATA) && - (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (flags & bctl->data.target)) { - tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; - } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) && - (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (flags & bctl->sys.target)) { - tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; - } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) && - (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (flags & bctl->meta.target)) { - tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; - } - - if (tgt) { + target = get_restripe_target(root->fs_info, flags); + if (target) { + /* pick target profile only if it's already available */ + if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { spin_unlock(&root->fs_info->balance_lock); - flags = tgt; - goto out; + return extended_to_chunk(target); } } spin_unlock(&root->fs_info->balance_lock); @@ -3180,10 +3234,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) flags &= ~BTRFS_BLOCK_GROUP_RAID0; } -out: - /* extended -> chunk profile */ - flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; - return flags; + return extended_to_chunk(flags); } static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) @@ -3312,8 +3363,7 @@ commit_trans: } data_sinfo->bytes_may_use += bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", - (u64)(unsigned long)data_sinfo, - bytes, 1); + data_sinfo->flags, bytes, 1); spin_unlock(&data_sinfo->lock); return 0; @@ -3334,8 +3384,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", - (u64)(unsigned long)data_sinfo, - bytes, 0); + data_sinfo->flags, bytes, 0); spin_unlock(&data_sinfo->lock); } @@ -3396,6 +3445,50 @@ static int should_alloc_chunk(struct btrfs_root *root, return 1; } +static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) +{ + u64 num_dev; + + if (type & BTRFS_BLOCK_GROUP_RAID10 || + type & BTRFS_BLOCK_GROUP_RAID0) + num_dev = root->fs_info->fs_devices->rw_devices; + else if (type & BTRFS_BLOCK_GROUP_RAID1) + num_dev = 2; + else + num_dev = 1; /* DUP or single */ + + /* metadata for updaing devices and chunk tree */ + return btrfs_calc_trans_metadata_size(root, num_dev + 1); +} + +static void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type) +{ + struct btrfs_space_info *info; + u64 left; + u64 thresh; + + info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + spin_lock(&info->lock); + left = info->total_bytes - info->bytes_used - info->bytes_pinned - + info->bytes_reserved - info->bytes_readonly; + spin_unlock(&info->lock); + + thresh = get_system_chunk_thresh(root, type); + if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { + printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n", + left, thresh, type); + dump_space_info(info, 0, 0); + } + + if (left < thresh) { + u64 flags; + + flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); + btrfs_alloc_chunk(trans, root, flags); + } +} + static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force) @@ -3405,15 +3498,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, int wait_for_alloc = 0; int ret = 0; - BUG_ON(!profile_is_valid(flags, 0)); - space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { ret = update_space_info(extent_root->fs_info, flags, 0, 0, &space_info); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } - BUG_ON(!space_info); + BUG_ON(!space_info); /* Logic error */ again: spin_lock(&space_info->lock); @@ -3468,6 +3559,12 @@ again: force_metadata_allocation(fs_info); } + /* + * Check if we have enough space in SYSTEM chunk because we may need + * to update devices. + */ + check_system_chunk(trans, extent_root, flags); + ret = btrfs_alloc_chunk(trans, extent_root, flags); if (ret < 0 && ret != -ENOSPC) goto out; @@ -3678,8 +3775,10 @@ again: ret = wait_event_interruptible(space_info->wait, !space_info->flush); /* Must have been interrupted, return */ - if (ret) + if (ret) { + printk(KERN_DEBUG "btrfs: %s returning -EINTR\n", __func__); return -EINTR; + } spin_lock(&space_info->lock); } @@ -3700,9 +3799,7 @@ again: if (used + orig_bytes <= space_info->total_bytes) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, - "space_info", - (u64)(unsigned long)space_info, - orig_bytes, 1); + "space_info", space_info->flags, orig_bytes, 1); ret = 0; } else { /* @@ -3771,9 +3868,7 @@ again: if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, - "space_info", - (u64)(unsigned long)space_info, - orig_bytes, 1); + "space_info", space_info->flags, orig_bytes, 1); ret = 0; } else { wait_ordered = true; @@ -3836,8 +3931,9 @@ out: return ret; } -static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static struct btrfs_block_rsv *get_block_rsv( + const struct btrfs_trans_handle *trans, + const struct btrfs_root *root) { struct btrfs_block_rsv *block_rsv = NULL; @@ -3918,8 +4014,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); space_info->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)(unsigned long)space_info, - num_bytes, 0); + space_info->flags, num_bytes, 0); space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -4137,14 +4232,14 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) block_rsv->reserved += num_bytes; sinfo->bytes_may_use += num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)(unsigned long)sinfo, num_bytes, 1); + sinfo->flags, num_bytes, 1); } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)(unsigned long)sinfo, num_bytes, 0); + sinfo->flags, num_bytes, 0); sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; @@ -4198,12 +4293,12 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, return; trace_btrfs_space_reservation(root->fs_info, "transaction", - (u64)(unsigned long)trans, - trans->bytes_reserved, 0); + trans->transid, trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } +/* Can only return 0 or -ENOSPC */ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode) { @@ -4540,7 +4635,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, while (total) { cache = btrfs_lookup_block_group(info, bytenr); if (!cache) - return -1; + return -ENOENT; if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) @@ -4643,7 +4738,7 @@ int btrfs_pin_extent(struct btrfs_root *root, struct btrfs_block_group_cache *cache; cache = btrfs_lookup_block_group(root->fs_info, bytenr); - BUG_ON(!cache); + BUG_ON(!cache); /* Logic error */ pin_down_extent(root, cache, bytenr, num_bytes, reserved); @@ -4661,7 +4756,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *cache; cache = btrfs_lookup_block_group(root->fs_info, bytenr); - BUG_ON(!cache); + BUG_ON(!cache); /* Logic error */ /* * pull in the free space cache (if any) so that our pin @@ -4706,6 +4801,7 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, { struct btrfs_space_info *space_info = cache->space_info; int ret = 0; + spin_lock(&space_info->lock); spin_lock(&cache->lock); if (reserve != RESERVE_FREE) { @@ -4716,9 +4812,8 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, space_info->bytes_reserved += num_bytes; if (reserve == RESERVE_ALLOC) { trace_btrfs_space_reservation(cache->fs_info, - "space_info", - (u64)(unsigned long)space_info, - num_bytes, 0); + "space_info", space_info->flags, + num_bytes, 0); space_info->bytes_may_use -= num_bytes; } } @@ -4734,7 +4829,7 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, return ret; } -int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, +void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4764,7 +4859,6 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, up_write(&fs_info->extent_commit_sem); update_global_block_rsv(fs_info); - return 0; } static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) @@ -4779,7 +4873,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) if (cache) btrfs_put_block_group(cache); cache = btrfs_lookup_block_group(fs_info, start); - BUG_ON(!cache); + BUG_ON(!cache); /* Logic error */ } len = cache->key.objectid + cache->key.offset - start; @@ -4816,6 +4910,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, u64 end; int ret; + if (trans->aborted) + return 0; + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) unpin = &fs_info->freed_extents[1]; else @@ -4901,7 +4998,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, is_data); - BUG_ON(ret); + if (ret) + goto abort; btrfs_release_path(path); path->leave_spinning = 1; @@ -4919,10 +5017,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); } - BUG_ON(ret); + if (ret < 0) + goto abort; extent_slot = path->slots[0]; } - } else { + } else if (ret == -ENOENT) { btrfs_print_leaf(extent_root, path->nodes[0]); WARN_ON(1); printk(KERN_ERR "btrfs unable to find ref byte nr %llu " @@ -4932,6 +5031,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, (unsigned long long)root_objectid, (unsigned long long)owner_objectid, (unsigned long long)owner_offset); + } else { + goto abort; } leaf = path->nodes[0]; @@ -4941,7 +5042,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(found_extent || extent_slot != path->slots[0]); ret = convert_extent_item_v0(trans, extent_root, path, owner_objectid, 0); - BUG_ON(ret < 0); + if (ret < 0) + goto abort; btrfs_release_path(path); path->leave_spinning = 1; @@ -4958,7 +5060,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, (unsigned long long)bytenr); btrfs_print_leaf(extent_root, path->nodes[0]); } - BUG_ON(ret); + if (ret < 0) + goto abort; extent_slot = path->slots[0]; leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, extent_slot); @@ -4995,7 +5098,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, is_data); - BUG_ON(ret); + if (ret) + goto abort; } } else { if (found_extent) { @@ -5012,23 +5116,27 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); - BUG_ON(ret); + if (ret) + goto abort; btrfs_release_path(path); if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); - BUG_ON(ret); - } else { - invalidate_mapping_pages(info->btree_inode->i_mapping, - bytenr >> PAGE_CACHE_SHIFT, - (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); + if (ret) + goto abort; } ret = update_block_group(trans, root, bytenr, num_bytes, 0); - BUG_ON(ret); + if (ret) + goto abort; } +out: btrfs_free_path(path); return ret; + +abort: + btrfs_abort_transaction(trans, extent_root, ret); + goto out; } /* @@ -5124,7 +5232,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, parent, root->root_key.objectid, btrfs_header_level(buf), BTRFS_DROP_DELAYED_REF, NULL, for_cow); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } if (!last_ref) @@ -5158,6 +5266,7 @@ out: btrfs_put_block_group(cache); } +/* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int for_cow) @@ -5179,14 +5288,12 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, num_bytes, parent, root_objectid, (int)owner, BTRFS_DROP_DELAYED_REF, NULL, for_cow); - BUG_ON(ret); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, offset, BTRFS_DROP_DELAYED_REF, NULL, for_cow); - BUG_ON(ret); } return ret; } @@ -5243,28 +5350,34 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) return 0; } -static int get_block_group_index(struct btrfs_block_group_cache *cache) +static int __get_block_group_index(u64 flags) { int index; - if (cache->flags & BTRFS_BLOCK_GROUP_RAID10) + + if (flags & BTRFS_BLOCK_GROUP_RAID10) index = 0; - else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1) + else if (flags & BTRFS_BLOCK_GROUP_RAID1) index = 1; - else if (cache->flags & BTRFS_BLOCK_GROUP_DUP) + else if (flags & BTRFS_BLOCK_GROUP_DUP) index = 2; - else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0) + else if (flags & BTRFS_BLOCK_GROUP_RAID0) index = 3; else index = 4; + return index; } +static int get_block_group_index(struct btrfs_block_group_cache *cache) +{ + return __get_block_group_index(cache->flags); +} + enum btrfs_loop_type { - LOOP_FIND_IDEAL = 0, - LOOP_CACHING_NOWAIT = 1, - LOOP_CACHING_WAIT = 2, - LOOP_ALLOC_CHUNK = 3, - LOOP_NO_EMPTY_SIZE = 4, + LOOP_CACHING_NOWAIT = 0, + LOOP_CACHING_WAIT = 1, + LOOP_ALLOC_CHUNK = 2, + LOOP_NO_EMPTY_SIZE = 3, }; /* @@ -5278,7 +5391,6 @@ enum btrfs_loop_type { static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *orig_root, u64 num_bytes, u64 empty_size, - u64 search_start, u64 search_end, u64 hint_byte, struct btrfs_key *ins, u64 data) { @@ -5287,6 +5399,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; struct btrfs_block_group_cache *used_block_group; + u64 search_start = 0; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; int done_chunk_alloc = 0; @@ -5300,8 +5413,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, bool failed_alloc = false; bool use_cluster = true; bool have_caching_bg = false; - u64 ideal_cache_percent = 0; - u64 ideal_cache_offset = 0; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -5351,7 +5462,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, empty_cluster = 0; if (search_start == hint_byte) { -ideal_cache: block_group = btrfs_lookup_block_group(root->fs_info, search_start); used_block_group = block_group; @@ -5363,8 +5473,7 @@ ideal_cache: * picked out then we don't care that the block group is cached. */ if (block_group && block_group_bits(block_group, data) && - (block_group->cached != BTRFS_CACHE_NO || - search_start == ideal_cache_offset)) { + block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || block_group->ro) { @@ -5418,44 +5527,13 @@ search: have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { - u64 free_percent; - found_uncached_bg = true; ret = cache_block_group(block_group, trans, - orig_root, 1); - if (block_group->cached == BTRFS_CACHE_FINISHED) - goto alloc; - - free_percent = btrfs_block_group_used(&block_group->item); - free_percent *= 100; - free_percent = div64_u64(free_percent, - block_group->key.offset); - free_percent = 100 - free_percent; - if (free_percent > ideal_cache_percent && - likely(!block_group->ro)) { - ideal_cache_offset = block_group->key.objectid; - ideal_cache_percent = free_percent; - } - - /* - * The caching workers are limited to 2 threads, so we - * can queue as much work as we care to. - */ - if (loop > LOOP_FIND_IDEAL) { - ret = cache_block_group(block_group, trans, - orig_root, 0); - BUG_ON(ret); - } - - /* - * If loop is set for cached only, try the next block - * group. - */ - if (loop == LOOP_FIND_IDEAL) - goto loop; + orig_root, 0); + BUG_ON(ret < 0); + ret = 0; } -alloc: if (unlikely(block_group->ro)) goto loop; @@ -5606,11 +5684,6 @@ unclustered_alloc: } checks: search_start = stripe_align(root, offset); - /* move on to the next group */ - if (search_start + num_bytes >= search_end) { - btrfs_add_free_space(used_block_group, offset, num_bytes); - goto loop; - } /* move on to the next group */ if (search_start + num_bytes > @@ -5661,9 +5734,7 @@ loop: if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) goto search; - /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for - * for them to make caching progress. Also - * determine the best possible bg to cache + /* * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching @@ -5673,50 +5744,17 @@ loop: */ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { index = 0; - if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { - found_uncached_bg = false; - loop++; - if (!ideal_cache_percent) - goto search; - - /* - * 1 of the following 2 things have happened so far - * - * 1) We found an ideal block group for caching that - * is mostly full and will cache quickly, so we might - * as well wait for it. - * - * 2) We searched for cached only and we didn't find - * anything, and we didn't start any caching kthreads - * either, so chances are we will loop through and - * start a couple caching kthreads, and then come back - * around and just wait for them. This will be slower - * because we will have 2 caching kthreads reading at - * the same time when we could have just started one - * and waited for it to get far enough to give us an - * allocation, so go ahead and go to the wait caching - * loop. - */ - loop = LOOP_CACHING_WAIT; - search_start = ideal_cache_offset; - ideal_cache_percent = 0; - goto ideal_cache; - } else if (loop == LOOP_FIND_IDEAL) { - /* - * Didn't find a uncached bg, wait on anything we find - * next. - */ - loop = LOOP_CACHING_WAIT; - goto search; - } - loop++; - if (loop == LOOP_ALLOC_CHUNK) { if (allowed_chunk_alloc) { ret = do_chunk_alloc(trans, root, num_bytes + 2 * 1024 * 1024, data, CHUNK_ALLOC_LIMITED); + if (ret < 0) { + btrfs_abort_transaction(trans, + root, ret); + goto out; + } allowed_chunk_alloc = 0; if (ret == 1) done_chunk_alloc = 1; @@ -5745,6 +5783,7 @@ loop: } else if (ins->objectid) { ret = 0; } +out: return ret; } @@ -5798,12 +5837,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, - u64 data) + struct btrfs_key *ins, u64 data) { bool final_tried = false; int ret; - u64 search_start = 0; data = btrfs_get_alloc_profile(root, data); again: @@ -5811,23 +5848,31 @@ again: * the only place that sets empty_size is btrfs_realloc_node, which * is not called recursively on allocations */ - if (empty_size || root->ref_cows) + if (empty_size || root->ref_cows) { ret = do_chunk_alloc(trans, root->fs_info->extent_root, num_bytes + 2 * 1024 * 1024, data, CHUNK_ALLOC_NO_FORCE); + if (ret < 0 && ret != -ENOSPC) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + } WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, - search_start, search_end, hint_byte, - ins, data); + hint_byte, ins, data); if (ret == -ENOSPC) { if (!final_tried) { num_bytes = num_bytes >> 1; num_bytes = num_bytes & ~(root->sectorsize - 1); num_bytes = max(num_bytes, min_alloc_size); - do_chunk_alloc(trans, root->fs_info->extent_root, + ret = do_chunk_alloc(trans, root->fs_info->extent_root, num_bytes, data, CHUNK_ALLOC_FORCE); + if (ret < 0 && ret != -ENOSPC) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } if (num_bytes == min_alloc_size) final_tried = true; goto again; @@ -5838,7 +5883,8 @@ again: printk(KERN_ERR "btrfs allocation failed flags %llu, " "wanted %llu\n", (unsigned long long)data, (unsigned long long)num_bytes); - dump_space_info(sinfo, num_bytes, 1); + if (sinfo) + dump_space_info(sinfo, num_bytes, 1); } } @@ -5917,7 +5963,10 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); - BUG_ON(ret); + if (ret) { + btrfs_free_path(path); + return ret; + } leaf = path->nodes[0]; extent_item = btrfs_item_ptr(leaf, path->slots[0], @@ -5947,7 +5996,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_free_path(path); ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); - if (ret) { + if (ret) { /* -ENOENT, logic error */ printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, (unsigned long long)ins->offset); @@ -5978,7 +6027,10 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); - BUG_ON(ret); + if (ret) { + btrfs_free_path(path); + return ret; + } leaf = path->nodes[0]; extent_item = btrfs_item_ptr(leaf, path->slots[0], @@ -6008,7 +6060,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_free_path(path); ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); - if (ret) { + if (ret) { /* -ENOENT, logic error */ printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, (unsigned long long)ins->offset); @@ -6056,28 +6108,28 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, if (!caching_ctl) { BUG_ON(!block_group_cache_done(block_group)); ret = btrfs_remove_free_space(block_group, start, num_bytes); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } else { mutex_lock(&caching_ctl->mutex); if (start >= caching_ctl->progress) { ret = add_excluded_extent(root, start, num_bytes); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } else if (start + num_bytes <= caching_ctl->progress) { ret = btrfs_remove_free_space(block_group, start, num_bytes); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } else { num_bytes = caching_ctl->progress - start; ret = btrfs_remove_free_space(block_group, start, num_bytes); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ start = caching_ctl->progress; num_bytes = ins->objectid + ins->offset - caching_ctl->progress; ret = add_excluded_extent(root, start, num_bytes); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } mutex_unlock(&caching_ctl->mutex); @@ -6086,7 +6138,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_update_reserved_bytes(block_group, ins->offset, RESERVE_ALLOC_NO_ACCOUNT); - BUG_ON(ret); + BUG_ON(ret); /* logic error */ btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); @@ -6107,6 +6159,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); clean_tree_block(trans, root, buf); + clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); btrfs_set_buffer_uptodate(buf); @@ -6214,7 +6267,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return ERR_CAST(block_rsv); ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, - empty_size, hint, (u64)-1, &ins, 0); + empty_size, hint, &ins, 0); if (ret) { unuse_block_rsv(root->fs_info, block_rsv, blocksize); return ERR_PTR(ret); @@ -6222,7 +6275,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize, level); - BUG_ON(IS_ERR(buf)); + BUG_ON(IS_ERR(buf)); /* -ENOMEM */ if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) @@ -6234,7 +6287,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_delayed_extent_op *extent_op; extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); + BUG_ON(!extent_op); /* -ENOMEM */ if (key) memcpy(&extent_op->key, key, sizeof(extent_op->key)); else @@ -6249,7 +6302,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, extent_op, for_cow); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } return buf; } @@ -6319,7 +6372,9 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, /* We don't lock the tree block, it's OK to be racy here */ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, &refs, &flags); - BUG_ON(ret); + /* We don't care about errors in readahead. */ + if (ret < 0) + continue; BUG_ON(refs == 0); if (wc->stage == DROP_REFERENCE) { @@ -6386,7 +6441,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, eb->start, eb->len, &wc->refs[level], &wc->flags[level]); - BUG_ON(ret); + BUG_ON(ret == -ENOMEM); + if (ret) + return ret; BUG_ON(wc->refs[level] == 0); } @@ -6405,12 +6462,12 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, if (!(wc->flags[level] & flag)) { BUG_ON(!path->locks[level]); ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_set_disk_extent_flags(trans, root, eb->start, eb->len, flag, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } @@ -6482,7 +6539,11 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, &wc->refs[level - 1], &wc->flags[level - 1]); - BUG_ON(ret); + if (ret < 0) { + btrfs_tree_unlock(next); + return ret; + } + BUG_ON(wc->refs[level - 1] == 0); *lookup_info = 0; @@ -6551,7 +6612,7 @@ skip: ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, root->root_key.objectid, level - 1, 0, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } btrfs_tree_unlock(next); free_extent_buffer(next); @@ -6609,7 +6670,10 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, eb->start, eb->len, &wc->refs[level], &wc->flags[level]); - BUG_ON(ret); + if (ret < 0) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + return ret; + } BUG_ON(wc->refs[level] == 0); if (wc->refs[level] == 1) { btrfs_tree_unlock_rw(eb, path->locks[level]); @@ -6629,7 +6693,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, else ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } /* make block locked assertion in clean_tree_block happy */ if (!path->locks[level] && @@ -6738,7 +6802,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * also make sure backrefs for the shared block and all lower level * blocks are properly updated. */ -void btrfs_drop_snapshot(struct btrfs_root *root, +int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int update_ref, int for_reloc) { @@ -6766,7 +6830,10 @@ void btrfs_drop_snapshot(struct btrfs_root *root, } trans = btrfs_start_transaction(tree_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } if (block_rsv) trans->block_rsv = block_rsv; @@ -6791,7 +6858,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root, path->lowest_level = 0; if (ret < 0) { err = ret; - goto out_free; + goto out_end_trans; } WARN_ON(ret > 0); @@ -6811,7 +6878,10 @@ void btrfs_drop_snapshot(struct btrfs_root *root, path->nodes[level]->len, &wc->refs[level], &wc->flags[level]); - BUG_ON(ret); + if (ret < 0) { + err = ret; + goto out_end_trans; + } BUG_ON(wc->refs[level] == 0); if (level == root_item->drop_level) @@ -6862,26 +6932,40 @@ void btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, tree_root, ret); + err = ret; + goto out_end_trans; + } btrfs_end_transaction_throttle(trans, tree_root); trans = btrfs_start_transaction(tree_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } if (block_rsv) trans->block_rsv = block_rsv; } } btrfs_release_path(path); - BUG_ON(err); + if (err) + goto out_end_trans; ret = btrfs_del_root(trans, tree_root, &root->root_key); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, tree_root, ret); + goto out_end_trans; + } if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_find_last_root(tree_root, root->root_key.objectid, NULL, NULL); - BUG_ON(ret < 0); - if (ret > 0) { + if (ret < 0) { + btrfs_abort_transaction(trans, tree_root, ret); + err = ret; + goto out_end_trans; + } else if (ret > 0) { /* if we fail to delete the orphan item this time * around, it'll get picked up the next time. * @@ -6899,14 +6983,15 @@ void btrfs_drop_snapshot(struct btrfs_root *root, free_extent_buffer(root->commit_root); kfree(root); } -out_free: +out_end_trans: btrfs_end_transaction_throttle(trans, tree_root); +out_free: kfree(wc); btrfs_free_path(path); out: if (err) btrfs_std_error(root->fs_info, err); - return; + return err; } /* @@ -6983,31 +7068,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) { u64 num_devices; - u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + u64 stripped; - if (root->fs_info->balance_ctl) { - struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; - u64 tgt = 0; - - /* pick restriper's target profile and return */ - if (flags & BTRFS_BLOCK_GROUP_DATA && - bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { - tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; - } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && - bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { - tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; - } else if (flags & BTRFS_BLOCK_GROUP_METADATA && - bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { - tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; - } - - if (tgt) { - /* extended -> chunk profile */ - tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; - return tgt; - } - } + /* + * if restripe for this chunk_type is on pick target profile and + * return, otherwise do the usual balance + */ + stripped = get_restripe_target(root->fs_info, flags); + if (stripped) + return extended_to_chunk(stripped); /* * we add in the count of missing devices because we want @@ -7017,6 +7086,9 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; + stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + if (num_devices == 1) { stripped |= BTRFS_BLOCK_GROUP_DUP; stripped = flags & ~stripped; @@ -7029,7 +7101,6 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) if (flags & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) return stripped | BTRFS_BLOCK_GROUP_DUP; - return flags; } else { /* they already had raid on here, just return */ if (flags & stripped) @@ -7042,9 +7113,9 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) if (flags & BTRFS_BLOCK_GROUP_DUP) return stripped | BTRFS_BLOCK_GROUP_RAID1; - /* turn single device chunks into raid0 */ - return stripped | BTRFS_BLOCK_GROUP_RAID0; + /* this is drive concat, leave it alone */ } + return flags; } @@ -7103,12 +7174,16 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, BUG_ON(cache->ro); trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); alloc_flags = update_block_group_flags(root, cache->flags); - if (alloc_flags != cache->flags) - do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, - CHUNK_ALLOC_FORCE); + if (alloc_flags != cache->flags) { + ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + CHUNK_ALLOC_FORCE); + if (ret < 0) + goto out; + } ret = set_block_group_ro(cache, 0); if (!ret) @@ -7188,7 +7263,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) return free_bytes; } -int btrfs_set_block_group_rw(struct btrfs_root *root, +void btrfs_set_block_group_rw(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { struct btrfs_space_info *sinfo = cache->space_info; @@ -7204,7 +7279,6 @@ int btrfs_set_block_group_rw(struct btrfs_root *root, cache->ro = 0; spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); - return 0; } /* @@ -7222,6 +7296,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) u64 min_free; u64 dev_min = 1; u64 dev_nr = 0; + u64 target; int index; int full = 0; int ret = 0; @@ -7262,13 +7337,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) /* * ok we don't have enough space, but maybe we have free space on our * devices to allocate new chunks for relocation, so loop through our - * alloc devices and guess if we have enough space. However, if we - * were marked as full, then we know there aren't enough chunks, and we - * can just return. + * alloc devices and guess if we have enough space. if this block + * group is going to be restriped, run checks against the target + * profile instead of the current one. */ ret = -1; - if (full) - goto out; /* * index: @@ -7278,7 +7351,20 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * 3: raid0 * 4: single */ - index = get_block_group_index(block_group); + target = get_restripe_target(root->fs_info, block_group->flags); + if (target) { + index = __get_block_group_index(extended_to_chunk(target)); + } else { + /* + * this is just a balance, so if we were marked as full + * we know there is no space for a new chunk + */ + if (full) + goto out; + + index = get_block_group_index(block_group); + } + if (index == 0) { dev_min = 4; /* Divide by 2 */ @@ -7572,7 +7658,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) ret = update_space_info(info, cache->flags, found_key.offset, btrfs_block_group_used(&cache->item), &space_info); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ cache->space_info = space_info; spin_lock(&cache->space_info->lock); cache->space_info->bytes_readonly += cache->bytes_super; @@ -7581,7 +7667,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) __link_block_group(space_info, cache); ret = btrfs_add_block_group_cache(root->fs_info, cache); - BUG_ON(ret); + BUG_ON(ret); /* Logic error */ set_avail_alloc_bits(root->fs_info, cache->flags); if (btrfs_chunk_readonly(root, cache->key.objectid)) @@ -7663,7 +7749,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ update_global_block_rsv(root->fs_info); spin_lock(&cache->space_info->lock); @@ -7673,11 +7759,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, __link_block_group(cache->space_info, cache); ret = btrfs_add_block_group_cache(root->fs_info, cache); - BUG_ON(ret); + BUG_ON(ret); /* Logic error */ ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, sizeof(cache->item)); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + return ret; + } set_avail_alloc_bits(extent_root->fs_info, type); @@ -7686,11 +7775,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { - u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; - - /* chunk -> extended profile */ - if (extra_flags == 0) - extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits &= ~extra_flags; @@ -7758,7 +7844,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, inode = lookup_free_space_inode(tree_root, block_group, path); if (!IS_ERR(inode)) { ret = btrfs_orphan_add(trans, inode); - BUG_ON(ret); + if (ret) { + btrfs_add_delayed_iput(inode); + goto out; + } clear_nlink(inode); /* One for the block groups ref */ spin_lock(&block_group->lock); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2862454bcdb..8d904dd7ea9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -19,6 +19,7 @@ #include "btrfs_inode.h" #include "volumes.h" #include "check-integrity.h" +#include "locking.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -53,6 +54,13 @@ struct extent_page_data { unsigned int sync_io:1; }; +static noinline void flush_write_bio(void *data); +static inline struct btrfs_fs_info * +tree_fs_info(struct extent_io_tree *tree) +{ + return btrfs_sb(tree->mapping->host->i_sb); +} + int __init extent_io_init(void) { extent_state_cache = kmem_cache_create("extent_state", @@ -136,6 +144,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) #endif atomic_set(&state->refs, 1); init_waitqueue_head(&state->wq); + trace_alloc_extent_state(state, mask, _RET_IP_); return state; } @@ -153,6 +162,7 @@ void free_extent_state(struct extent_state *state) list_del(&state->leak_list); spin_unlock_irqrestore(&leak_lock, flags); #endif + trace_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); } } @@ -439,6 +449,13 @@ alloc_extent_state_atomic(struct extent_state *prealloc) return prealloc; } +void extent_io_tree_panic(struct extent_io_tree *tree, int err) +{ + btrfs_panic(tree_fs_info(tree), err, "Locking error: " + "Extent tree was modified by another " + "thread while locked."); +} + /* * clear some bits on a range in the tree. This may require splitting * or inserting elements in the tree, so the gfp mask is used to @@ -449,8 +466,7 @@ alloc_extent_state_atomic(struct extent_state *prealloc) * * the range [start, end] is inclusive. * - * This takes the tree lock, and returns < 0 on error, > 0 if any of the - * bits were already set, or zero if none of the bits were already set. + * This takes the tree lock, and returns 0 on success and < 0 on error. */ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, int wake, int delete, @@ -464,7 +480,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct rb_node *node; u64 last_end; int err; - int set = 0; int clear = 0; if (delete) @@ -542,12 +557,14 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; if (err) goto out; if (state->end <= end) { - set |= clear_state_bit(tree, state, &bits, wake); + clear_state_bit(tree, state, &bits, wake); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -564,17 +581,19 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); + if (wake) wake_up(&state->wq); - set |= clear_state_bit(tree, prealloc, &bits, wake); + clear_state_bit(tree, prealloc, &bits, wake); prealloc = NULL; goto out; } - set |= clear_state_bit(tree, state, &bits, wake); + clear_state_bit(tree, state, &bits, wake); next: if (last_end == (u64)-1) goto out; @@ -591,7 +610,7 @@ out: if (prealloc) free_extent_state(prealloc); - return set; + return 0; search_again: if (start > end) @@ -602,8 +621,8 @@ search_again: goto again; } -static int wait_on_state(struct extent_io_tree *tree, - struct extent_state *state) +static void wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) __releases(tree->lock) __acquires(tree->lock) { @@ -613,7 +632,6 @@ static int wait_on_state(struct extent_io_tree *tree, schedule(); spin_lock(&tree->lock); finish_wait(&state->wq, &wait); - return 0; } /* @@ -621,7 +639,7 @@ static int wait_on_state(struct extent_io_tree *tree, * The range [start, end] is inclusive. * The tree lock is taken by this function */ -int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) { struct extent_state *state; struct rb_node *node; @@ -658,7 +676,6 @@ again: } out: spin_unlock(&tree->lock); - return 0; } static void set_state_bits(struct extent_io_tree *tree, @@ -706,9 +723,10 @@ static void uncache_state(struct extent_state **cached_ptr) * [start, end] is inclusive This takes the tree lock. */ -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask) +static int __must_check +__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -742,8 +760,10 @@ again: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = insert_state(tree, prealloc, start, end, &bits); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; - BUG_ON(err == -EEXIST); goto out; } state = rb_entry(node, struct extent_state, rb_node); @@ -809,7 +829,9 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; if (err) goto out; @@ -846,12 +868,9 @@ hit_next: */ err = insert_state(tree, prealloc, start, this_end, &bits); - BUG_ON(err == -EEXIST); - if (err) { - free_extent_state(prealloc); - prealloc = NULL; - goto out; - } + if (err) + extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); prealloc = NULL; start = this_end + 1; @@ -873,7 +892,8 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); set_state_bits(tree, prealloc, &bits); cache_state(prealloc, cached_state); @@ -900,6 +920,15 @@ search_again: goto again; } +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, + u64 *failed_start, struct extent_state **cached_state, + gfp_t mask) +{ + return __set_extent_bit(tree, start, end, bits, 0, failed_start, + cached_state, mask); +} + + /** * convert_extent - convert all bits in a given range from one bit to another * @tree: the io tree to search @@ -946,7 +975,8 @@ again: } err = insert_state(tree, prealloc, start, end, &bits); prealloc = NULL; - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); goto out; } state = rb_entry(node, struct extent_state, rb_node); @@ -1002,7 +1032,8 @@ hit_next: goto out; } err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); prealloc = NULL; if (err) goto out; @@ -1041,12 +1072,8 @@ hit_next: */ err = insert_state(tree, prealloc, start, this_end, &bits); - BUG_ON(err == -EEXIST); - if (err) { - free_extent_state(prealloc); - prealloc = NULL; - goto out; - } + if (err) + extent_io_tree_panic(tree, err); prealloc = NULL; start = this_end + 1; goto search_again; @@ -1065,7 +1092,8 @@ hit_next: } err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); set_state_bits(tree, prealloc, &bits); clear_state_bit(tree, prealloc, &clear_bits, 0); @@ -1095,14 +1123,14 @@ search_again: int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, + return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, NULL, mask); } int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask) { - return set_extent_bit(tree, start, end, bits, 0, NULL, + return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); } @@ -1117,7 +1145,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE, - 0, NULL, cached_state, mask); + NULL, cached_state, mask); } int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, @@ -1131,7 +1159,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask); } @@ -1139,7 +1167,7 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, - NULL, cached_state, mask); + cached_state, mask); } static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, @@ -1155,42 +1183,40 @@ static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, * us if waiting is desired. */ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, struct extent_state **cached_state, gfp_t mask) + int bits, struct extent_state **cached_state) { int err; u64 failed_start; while (1) { - err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, - EXTENT_LOCKED, &failed_start, - cached_state, mask); - if (err == -EEXIST && (mask & __GFP_WAIT)) { + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, + EXTENT_LOCKED, &failed_start, + cached_state, GFP_NOFS); + if (err == -EEXIST) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; - } else { + } else break; - } WARN_ON(start > end); } return err; } -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) { - return lock_extent_bits(tree, start, end, 0, NULL, mask); + return lock_extent_bits(tree, start, end, 0, NULL); } -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) { int err; u64 failed_start; - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, mask); + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, + &failed_start, NULL, GFP_NOFS); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, 1, 0, NULL, mask); + EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); return 0; } return 1; @@ -1203,10 +1229,10 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, mask); } -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) { return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, - mask); + GFP_NOFS); } /* @@ -1220,7 +1246,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) while (index <= end_index) { page = find_get_page(tree->mapping, index); - BUG_ON(!page); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ set_page_writeback(page); page_cache_release(page); index++; @@ -1343,9 +1369,9 @@ out: return found; } -static noinline int __unlock_for_delalloc(struct inode *inode, - struct page *locked_page, - u64 start, u64 end) +static noinline void __unlock_for_delalloc(struct inode *inode, + struct page *locked_page, + u64 start, u64 end) { int ret; struct page *pages[16]; @@ -1355,7 +1381,7 @@ static noinline int __unlock_for_delalloc(struct inode *inode, int i; if (index == locked_page->index && end_index == index) - return 0; + return; while (nr_pages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, @@ -1370,7 +1396,6 @@ static noinline int __unlock_for_delalloc(struct inode *inode, index += ret; cond_resched(); } - return 0; } static noinline int lock_delalloc_pages(struct inode *inode, @@ -1500,11 +1525,10 @@ again: goto out_failed; } } - BUG_ON(ret); + BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ /* step three, lock the state bits for the whole range */ - lock_extent_bits(tree, delalloc_start, delalloc_end, - 0, &cached_state, GFP_NOFS); + lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, @@ -1761,39 +1785,34 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, * helper function to set a given page up to date if all the * extents in the tree for that page are up to date */ -static int check_page_uptodate(struct extent_io_tree *tree, - struct page *page) +static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) { u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) SetPageUptodate(page); - return 0; } /* * helper function to unlock a page if all the extents in the tree * for that page are unlocked */ -static int check_page_locked(struct extent_io_tree *tree, - struct page *page) +static void check_page_locked(struct extent_io_tree *tree, struct page *page) { u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) unlock_page(page); - return 0; } /* * helper function to end page writeback if all the extents * in the tree for that page are done with writeback */ -static int check_page_writeback(struct extent_io_tree *tree, - struct page *page) +static void check_page_writeback(struct extent_io_tree *tree, + struct page *page) { end_page_writeback(page); - return 0; } /* @@ -1912,6 +1931,26 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, return 0; } +int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, + int mirror_num) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + u64 start = eb->start; + unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); + int ret; + + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, + start, p, mirror_num); + if (ret) + break; + start += PAGE_CACHE_SIZE; + } + + return ret; +} + /* * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record @@ -2258,6 +2297,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) u64 start; u64 end; int whole_page; + int failed_mirror; int ret; if (err) @@ -2304,9 +2344,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err) else clean_io_failure(start, page); } - if (!uptodate) { - int failed_mirror; + + if (!uptodate) failed_mirror = (int)(unsigned long)bio->bi_bdev; + + if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook(page, failed_mirror); + if (!ret && !err && + test_bit(BIO_UPTODATE, &bio->bi_flags)) + uptodate = 1; + } else if (!uptodate) { /* * The generic bio_readpage_error handles errors the * following way: If possible, new read requests are @@ -2320,7 +2367,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err) ret = bio_readpage_error(bio, page, start, end, failed_mirror, NULL); if (ret == 0) { -error_handled: uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) @@ -2328,16 +2374,9 @@ error_handled: uncache_state(&cached); continue; } - if (tree->ops && tree->ops->readpage_io_failed_hook) { - ret = tree->ops->readpage_io_failed_hook( - bio, page, start, end, - failed_mirror, state); - if (ret == 0) - goto error_handled; - } } - if (uptodate) { + if (uptodate && tree->track_uptodate) { set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); } @@ -2386,8 +2425,12 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, return bio; } -static int submit_one_bio(int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags) +/* + * Since writes are async, they will only return -ENOMEM. + * Reads can return the full range of I/O error conditions. + */ +static int __must_check submit_one_bio(int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) { int ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -2413,6 +2456,19 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, return ret; } +static int merge_bio(struct extent_io_tree *tree, struct page *page, + unsigned long offset, size_t size, struct bio *bio, + unsigned long bio_flags) +{ + int ret = 0; + if (tree->ops && tree->ops->merge_bio_hook) + ret = tree->ops->merge_bio_hook(page, offset, size, bio, + bio_flags); + BUG_ON(ret < 0); + return ret; + +} + static int submit_extent_page(int rw, struct extent_io_tree *tree, struct page *page, sector_t sector, size_t size, unsigned long offset, @@ -2441,12 +2497,12 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, sector; if (prev_bio_flags != bio_flags || !contig || - (tree->ops && tree->ops->merge_bio_hook && - tree->ops->merge_bio_hook(page, offset, page_size, bio, - bio_flags)) || + merge_bio(tree, page, offset, page_size, bio, bio_flags) || bio_add_page(bio, page, page_size, offset) < page_size) { ret = submit_one_bio(rw, bio, mirror_num, prev_bio_flags); + if (ret < 0) + return ret; bio = NULL; } else { return 0; @@ -2473,25 +2529,31 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, return ret; } -void set_page_extent_mapped(struct page *page) +void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) { if (!PagePrivate(page)) { SetPagePrivate(page); page_cache_get(page); - set_page_private(page, EXTENT_PAGE_PRIVATE); + set_page_private(page, (unsigned long)eb); + } else { + WARN_ON(page->private != (unsigned long)eb); } } -static void set_page_extent_head(struct page *page, unsigned long len) +void set_page_extent_mapped(struct page *page) { - WARN_ON(!PagePrivate(page)); - set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, EXTENT_PAGE_PRIVATE); + } } /* * basic readpage implementation. Locked extent state structs are inserted * into the tree that are removed when the IO is done (by the end_io * handlers) + * XXX JDM: This needs looking at to ensure proper page locking */ static int __extent_read_full_page(struct extent_io_tree *tree, struct page *page, @@ -2531,11 +2593,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree, end = page_end; while (1) { - lock_extent(tree, start, end, GFP_NOFS); + lock_extent(tree, start, end); ordered = btrfs_lookup_ordered_extent(inode, start); if (!ordered) break; - unlock_extent(tree, start, end, GFP_NOFS); + unlock_extent(tree, start, end); btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); } @@ -2572,7 +2634,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, end - cur + 1, 0); if (IS_ERR_OR_NULL(em)) { SetPageError(page); - unlock_extent(tree, cur, end, GFP_NOFS); + unlock_extent(tree, cur, end); break; } extent_offset = cur - em->start; @@ -2624,7 +2686,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -2634,7 +2696,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, */ if (block_start == EXTENT_MAP_INLINE) { SetPageError(page); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -2654,6 +2716,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag); + BUG_ON(ret == -ENOMEM); nr++; *bio_flags = this_bio_flag; } @@ -2795,7 +2858,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_end, &page_started, &nr_written); - BUG_ON(ret); + /* File system has been set read-only */ + if (ret) { + SetPageError(page); + goto done; + } /* * delalloc_end is already one less than the total * length, so we don't subtract one from @@ -2968,6 +3035,275 @@ done_unlocked: return 0; } +static int eb_wait(void *word) +{ + io_schedule(); + return 0; +} + +static void wait_on_extent_buffer_writeback(struct extent_buffer *eb) +{ + wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, + TASK_UNINTERRUPTIBLE); +} + +static int lock_extent_buffer_for_io(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct extent_page_data *epd) +{ + unsigned long i, num_pages; + int flush = 0; + int ret = 0; + + if (!btrfs_try_tree_write_lock(eb)) { + flush = 1; + flush_write_bio(epd); + btrfs_tree_lock(eb); + } + + if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { + btrfs_tree_unlock(eb); + if (!epd->sync_io) + return 0; + if (!flush) { + flush_write_bio(epd); + flush = 1; + } + while (1) { + wait_on_extent_buffer_writeback(eb); + btrfs_tree_lock(eb); + if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) + break; + btrfs_tree_unlock(eb); + } + } + + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + spin_lock(&fs_info->delalloc_lock); + if (fs_info->dirty_metadata_bytes >= eb->len) + fs_info->dirty_metadata_bytes -= eb->len; + else + WARN_ON(1); + spin_unlock(&fs_info->delalloc_lock); + ret = 1; + } + + btrfs_tree_unlock(eb); + + if (!ret) + return ret; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + + if (!trylock_page(p)) { + if (!flush) { + flush_write_bio(epd); + flush = 1; + } + lock_page(p); + } + } + + return ret; +} + +static void end_extent_buffer_writeback(struct extent_buffer *eb) +{ + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + smp_mb__after_clear_bit(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); +} + +static void end_bio_extent_buffer_writepage(struct bio *bio, int err) +{ + int uptodate = err == 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_buffer *eb; + int done; + + do { + struct page *page = bvec->bv_page; + + bvec--; + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + done = atomic_dec_and_test(&eb->io_pages); + + if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + ClearPageUptodate(page); + SetPageError(page); + } + + end_page_writeback(page); + + if (!done) + continue; + + end_extent_buffer_writeback(eb); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); + +} + +static int write_one_eb(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct writeback_control *wbc, + struct extent_page_data *epd) +{ + struct block_device *bdev = fs_info->fs_devices->latest_bdev; + u64 offset = eb->start; + unsigned long i, num_pages; + int rw = (epd->sync_io ? WRITE_SYNC : WRITE); + int ret; + + clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + atomic_set(&eb->io_pages, num_pages); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + + clear_page_dirty_for_io(p); + set_page_writeback(p); + ret = submit_extent_page(rw, eb->tree, p, offset >> 9, + PAGE_CACHE_SIZE, 0, bdev, &epd->bio, + -1, end_bio_extent_buffer_writepage, + 0, 0, 0); + if (ret) { + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + SetPageError(p); + if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) + end_extent_buffer_writeback(eb); + ret = -EIO; + break; + } + offset += PAGE_CACHE_SIZE; + update_nr_written(p, wbc, 1); + unlock_page(p); + } + + if (unlikely(ret)) { + for (; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + unlock_page(p); + } + } + + return ret; +} + +int btree_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; + struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; + struct extent_buffer *eb, *prev_eb = NULL; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, + }; + int ret = 0; + int done = 0; + int nr_to_write_done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int tag; + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + scanned = 1; + } + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; +retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); + while (!done && !nr_to_write_done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (!PagePrivate(page)) + continue; + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + break; + } + + eb = (struct extent_buffer *)page->private; + if (!eb) { + WARN_ON(1); + continue; + } + + if (eb == prev_eb) + continue; + + if (!atomic_inc_not_zero(&eb->refs)) { + WARN_ON(1); + continue; + } + + prev_eb = eb; + ret = lock_extent_buffer_for_io(eb, fs_info, &epd); + if (!ret) { + free_extent_buffer(eb); + continue; + } + + ret = write_one_eb(eb, fs_info, wbc, &epd); + if (ret) { + done = 1; + free_extent_buffer(eb); + break; + } + free_extent_buffer(eb); + + /* + * the filesystem may choose to bump up nr_to_write. + * We have to make sure to honor the new nr_to_write + * at any time + */ + nr_to_write_done = wbc->nr_to_write <= 0; + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + flush_write_bio(&epd); + return ret; +} + /** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write @@ -3099,10 +3435,14 @@ retry: static void flush_epd_write_bio(struct extent_page_data *epd) { if (epd->bio) { + int rw = WRITE; + int ret; + if (epd->sync_io) - submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); - else - submit_one_bio(WRITE, epd->bio, 0, 0); + rw = WRITE_SYNC; + + ret = submit_one_bio(rw, epd->bio, 0, 0); + BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; } } @@ -3219,7 +3559,7 @@ int extent_readpages(struct extent_io_tree *tree, } BUG_ON(!list_empty(pages)); if (bio) - submit_one_bio(READ, bio, 0, bio_flags); + return submit_one_bio(READ, bio, 0, bio_flags); return 0; } @@ -3240,7 +3580,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); + lock_extent_bits(tree, start, end, 0, &cached_state); wait_on_page_writeback(page); clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | @@ -3454,7 +3794,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, - &cached_state, GFP_NOFS); + &cached_state); em = get_extent_skip_holes(inode, start, last_for_get_extent, get_extent); @@ -3548,26 +3888,7 @@ out: inline struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i) { - struct page *p; - struct address_space *mapping; - - if (i == 0) - return eb->first_page; - i += eb->start >> PAGE_CACHE_SHIFT; - mapping = eb->first_page->mapping; - if (!mapping) - return NULL; - - /* - * extent_buffer_page is only called after pinning the page - * by increasing the reference count. So we know the page must - * be in the radix tree. - */ - rcu_read_lock(); - p = radix_tree_lookup(&mapping->page_tree, i); - rcu_read_unlock(); - - return p; + return eb->pages[i]; } inline unsigned long num_extent_pages(u64 start, u64 len) @@ -3576,6 +3897,19 @@ inline unsigned long num_extent_pages(u64 start, u64 len) (start >> PAGE_CACHE_SHIFT); } +static void __free_extent_buffer(struct extent_buffer *eb) +{ +#if LEAK_DEBUG + unsigned long flags; + spin_lock_irqsave(&leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + if (eb->pages && eb->pages != eb->inline_pages) + kfree(eb->pages); + kmem_cache_free(extent_buffer_cache, eb); +} + static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len, @@ -3591,6 +3925,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return NULL; eb->start = start; eb->len = len; + eb->tree = tree; rwlock_init(&eb->lock); atomic_set(&eb->write_locks, 0); atomic_set(&eb->read_locks, 0); @@ -3607,20 +3942,32 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, list_add(&eb->leak_list, &buffers); spin_unlock_irqrestore(&leak_lock, flags); #endif + spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); + atomic_set(&eb->io_pages, 0); + + if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { + struct page **pages; + int num_pages = (len + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + pages = kzalloc(num_pages, mask); + if (!pages) { + __free_extent_buffer(eb); + return NULL; + } + eb->pages = pages; + } else { + eb->pages = eb->inline_pages; + } return eb; } -static void __free_extent_buffer(struct extent_buffer *eb) +static int extent_buffer_under_io(struct extent_buffer *eb) { -#if LEAK_DEBUG - unsigned long flags; - spin_lock_irqsave(&leak_lock, flags); - list_del(&eb->leak_list); - spin_unlock_irqrestore(&leak_lock, flags); -#endif - kmem_cache_free(extent_buffer_cache, eb); + return (atomic_read(&eb->io_pages) || + test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || + test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } /* @@ -3632,8 +3979,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, unsigned long index; struct page *page; - if (!eb->first_page) - return; + BUG_ON(extent_buffer_under_io(eb)); index = num_extent_pages(eb->start, eb->len); if (start_idx >= index) @@ -3642,8 +3988,34 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, do { index--; page = extent_buffer_page(eb, index); - if (page) + if (page) { + spin_lock(&page->mapping->private_lock); + /* + * We do this since we'll remove the pages after we've + * removed the eb from the radix tree, so we could race + * and have this page now attached to the new eb. So + * only clear page_private if it's still connected to + * this eb. + */ + if (PagePrivate(page) && + page->private == (unsigned long)eb) { + BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(PageDirty(page)); + BUG_ON(PageWriteback(page)); + /* + * We need to make sure we haven't be attached + * to a new eb. + */ + ClearPagePrivate(page); + set_page_private(page, 0); + /* One for the page private */ + page_cache_release(page); + } + spin_unlock(&page->mapping->private_lock); + + /* One for when we alloced the page */ page_cache_release(page); + } } while (index != start_idx); } @@ -3656,9 +4028,50 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) __free_extent_buffer(eb); } +static void check_buffer_tree_ref(struct extent_buffer *eb) +{ + /* the ref bit is tricky. We have to make sure it is set + * if we have the buffer dirty. Otherwise the + * code to free a buffer can end up dropping a dirty + * page + * + * Once the ref bit is set, it won't go away while the + * buffer is dirty or in writeback, and it also won't + * go away while we have the reference count on the + * eb bumped. + * + * We can't just set the ref bit without bumping the + * ref on the eb because free_extent_buffer might + * see the ref bit and try to clear it. If this happens + * free_extent_buffer might end up dropping our original + * ref by mistake and freeing the page before we are able + * to add one more ref. + * + * So bump the ref count first, then set the bit. If someone + * beat us to it, drop the ref we added. + */ + if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + atomic_inc(&eb->refs); + if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + } +} + +static void mark_extent_buffer_accessed(struct extent_buffer *eb) +{ + unsigned long num_pages, i; + + check_buffer_tree_ref(eb); + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + mark_page_accessed(p); + } +} + struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len, - struct page *page0) + u64 start, unsigned long len) { unsigned long num_pages = num_extent_pages(start, len); unsigned long i; @@ -3674,7 +4087,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); - mark_page_accessed(eb->first_page); + mark_extent_buffer_accessed(eb); return eb; } rcu_read_unlock(); @@ -3683,32 +4096,43 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, if (!eb) return NULL; - if (page0) { - eb->first_page = page0; - i = 1; - index++; - page_cache_get(page0); - mark_page_accessed(page0); - set_page_extent_mapped(page0); - set_page_extent_head(page0, len); - uptodate = PageUptodate(page0); - } else { - i = 0; - } - for (; i < num_pages; i++, index++) { + for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, GFP_NOFS); if (!p) { WARN_ON(1); goto free_eb; } - set_page_extent_mapped(p); - mark_page_accessed(p); - if (i == 0) { - eb->first_page = p; - set_page_extent_head(p, len); - } else { - set_page_private(p, EXTENT_PAGE_PRIVATE); + + spin_lock(&mapping->private_lock); + if (PagePrivate(p)) { + /* + * We could have already allocated an eb for this page + * and attached one so lets see if we can get a ref on + * the existing eb, and if we can we know it's good and + * we can just return that one, else we know we can just + * overwrite page->private. + */ + exists = (struct extent_buffer *)p->private; + if (atomic_inc_not_zero(&exists->refs)) { + spin_unlock(&mapping->private_lock); + unlock_page(p); + mark_extent_buffer_accessed(exists); + goto free_eb; + } + + /* + * Do this so attach doesn't complain and we need to + * drop the ref the old guy had. + */ + ClearPagePrivate(p); + WARN_ON(PageDirty(p)); + page_cache_release(p); } + attach_extent_buffer_page(eb, p); + spin_unlock(&mapping->private_lock); + WARN_ON(PageDirty(p)); + mark_page_accessed(p); + eb->pages[i] = p; if (!PageUptodate(p)) uptodate = 0; @@ -3716,12 +4140,10 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, * see below about how we avoid a nasty race with release page * and why we unlock later */ - if (i != 0) - unlock_page(p); } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - +again: ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); if (ret) goto free_eb; @@ -3731,14 +4153,21 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, if (ret == -EEXIST) { exists = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - /* add one reference for the caller */ - atomic_inc(&exists->refs); + if (!atomic_inc_not_zero(&exists->refs)) { + spin_unlock(&tree->buffer_lock); + radix_tree_preload_end(); + exists = NULL; + goto again; + } spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); + mark_extent_buffer_accessed(exists); goto free_eb; } /* add one reference for the tree */ - atomic_inc(&eb->refs); + spin_lock(&eb->refs_lock); + check_buffer_tree_ref(eb); + spin_unlock(&eb->refs_lock); spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); @@ -3751,15 +4180,20 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, * after the extent buffer is in the radix tree so * it doesn't get lost */ - set_page_extent_mapped(eb->first_page); - set_page_extent_head(eb->first_page, eb->len); - if (!page0) - unlock_page(eb->first_page); + SetPageChecked(eb->pages[0]); + for (i = 1; i < num_pages; i++) { + p = extent_buffer_page(eb, i); + ClearPageChecked(p); + unlock_page(p); + } + unlock_page(eb->pages[0]); return eb; free_eb: - if (eb->first_page && !page0) - unlock_page(eb->first_page); + for (i = 0; i < num_pages; i++) { + if (eb->pages[i]) + unlock_page(eb->pages[i]); + } if (!atomic_dec_and_test(&eb->refs)) return exists; @@ -3776,7 +4210,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); - mark_page_accessed(eb->first_page); + mark_extent_buffer_accessed(eb); return eb; } rcu_read_unlock(); @@ -3784,19 +4218,71 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, return NULL; } +static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) +{ + struct extent_buffer *eb = + container_of(head, struct extent_buffer, rcu_head); + + __free_extent_buffer(eb); +} + +/* Expects to have eb->eb_lock already held */ +static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask) +{ + WARN_ON(atomic_read(&eb->refs) == 0); + if (atomic_dec_and_test(&eb->refs)) { + struct extent_io_tree *tree = eb->tree; + + spin_unlock(&eb->refs_lock); + + spin_lock(&tree->buffer_lock); + radix_tree_delete(&tree->buffer, + eb->start >> PAGE_CACHE_SHIFT); + spin_unlock(&tree->buffer_lock); + + /* Should be safe to release our pages at this point */ + btrfs_release_extent_buffer_page(eb, 0); + + call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); + return; + } + spin_unlock(&eb->refs_lock); +} + void free_extent_buffer(struct extent_buffer *eb) { if (!eb) return; - if (!atomic_dec_and_test(&eb->refs)) + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) == 2 && + test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && + !extent_buffer_under_io(eb) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + + /* + * I know this is terrible, but it's temporary until we stop tracking + * the uptodate bits and such for the extent buffers. + */ + release_extent_buffer(eb, GFP_ATOMIC); +} + +void free_extent_buffer_stale(struct extent_buffer *eb) +{ + if (!eb) return; - WARN_ON(1); + spin_lock(&eb->refs_lock); + set_bit(EXTENT_BUFFER_STALE, &eb->bflags); + + if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + release_extent_buffer(eb, GFP_NOFS); } -int clear_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb) +void clear_extent_buffer_dirty(struct extent_buffer *eb) { unsigned long i; unsigned long num_pages; @@ -3812,10 +4298,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, lock_page(page); WARN_ON(!PagePrivate(page)); - set_page_extent_mapped(page); - if (i == 0) - set_page_extent_head(page, eb->len); - clear_page_dirty_for_io(page); spin_lock_irq(&page->mapping->tree_lock); if (!PageDirty(page)) { @@ -3827,24 +4309,29 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, ClearPageError(page); unlock_page(page); } - return 0; + WARN_ON(atomic_read(&eb->refs) == 0); } -int set_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb) +int set_extent_buffer_dirty(struct extent_buffer *eb) { unsigned long i; unsigned long num_pages; int was_dirty = 0; + check_buffer_tree_ref(eb); + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + WARN_ON(atomic_read(&eb->refs) == 0); + WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + for (i = 0; i < num_pages; i++) - __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); + set_page_dirty(extent_buffer_page(eb, i)); return was_dirty; } -static int __eb_straddles_pages(u64 start, u64 len) +static int range_straddles_pages(u64 start, u64 len) { if (len < PAGE_CACHE_SIZE) return 1; @@ -3855,25 +4342,14 @@ static int __eb_straddles_pages(u64 start, u64 len) return 0; } -static int eb_straddles_pages(struct extent_buffer *eb) -{ - return __eb_straddles_pages(eb->start, eb->len); -} - -int clear_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state **cached_state) +int clear_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; unsigned long num_pages; - num_pages = num_extent_pages(eb->start, eb->len); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - - clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - cached_state, GFP_NOFS); - + num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) @@ -3882,27 +4358,16 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, return 0; } -int set_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb) +int set_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; unsigned long num_pages; + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); - - if (eb_straddles_pages(eb)) { - set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - NULL, GFP_NOFS); - } for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || - ((i == num_pages - 1) && - ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { - check_page_uptodate(tree, page); - continue; - } SetPageUptodate(page); } return 0; @@ -3917,7 +4382,7 @@ int extent_range_uptodate(struct extent_io_tree *tree, int uptodate; unsigned long index; - if (__eb_straddles_pages(start, end - start + 1)) { + if (range_straddles_pages(start, end - start + 1)) { ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); if (ret) @@ -3939,35 +4404,9 @@ int extent_range_uptodate(struct extent_io_tree *tree, return pg_uptodate; } -int extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state *cached_state) +int extent_buffer_uptodate(struct extent_buffer *eb) { - int ret = 0; - unsigned long num_pages; - unsigned long i; - struct page *page; - int pg_uptodate = 1; - - if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - return 1; - - if (eb_straddles_pages(eb)) { - ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, cached_state); - if (ret) - return ret; - } - - num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - if (!PageUptodate(page)) { - pg_uptodate = 0; - break; - } - } - return pg_uptodate; + return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); } int read_extent_buffer_pages(struct extent_io_tree *tree, @@ -3981,21 +4420,14 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, int ret = 0; int locked_pages = 0; int all_uptodate = 1; - int inc_all_pages = 0; unsigned long num_pages; + unsigned long num_reads = 0; struct bio *bio = NULL; unsigned long bio_flags = 0; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; - if (eb_straddles_pages(eb)) { - if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, NULL)) { - return 0; - } - } - if (start) { WARN_ON(start < eb->start); start_i = (start >> PAGE_CACHE_SHIFT) - @@ -4014,8 +4446,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, lock_page(page); } locked_pages++; - if (!PageUptodate(page)) + if (!PageUptodate(page)) { + num_reads++; all_uptodate = 0; + } } if (all_uptodate) { if (start_i == 0) @@ -4023,20 +4457,12 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, goto unlock_exit; } + clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + eb->failed_mirror = 0; + atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); - - WARN_ON(!PagePrivate(page)); - - set_page_extent_mapped(page); - if (i == 0) - set_page_extent_head(page, eb->len); - - if (inc_all_pages) - page_cache_get(page); if (!PageUptodate(page)) { - if (start_i == 0) - inc_all_pages = 1; ClearPageError(page); err = __extent_read_full_page(tree, page, get_extent, &bio, @@ -4048,8 +4474,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } } - if (bio) - submit_one_bio(READ, bio, mirror_num, bio_flags); + if (bio) { + err = submit_one_bio(READ, bio, mirror_num, bio_flags); + if (err) + return err; + } if (ret || wait != WAIT_COMPLETE) return ret; @@ -4061,8 +4490,6 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ret = -EIO; } - if (!ret) - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); return ret; unlock_exit: @@ -4304,15 +4731,20 @@ static void copy_pages(struct page *dst_page, struct page *src_page, { char *dst_kaddr = page_address(dst_page); char *src_kaddr; + int must_memmove = 0; if (dst_page != src_page) { src_kaddr = page_address(src_page); } else { src_kaddr = dst_kaddr; - BUG_ON(areas_overlap(src_off, dst_off, len)); + if (areas_overlap(src_off, dst_off, len)) + must_memmove = 1; } - memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); + if (must_memmove) + memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); + else + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); } void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, @@ -4382,7 +4814,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, "len %lu len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } - if (!areas_overlap(src_offset, dst_offset, len)) { + if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; } @@ -4408,47 +4840,48 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } } -static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) +int try_release_extent_buffer(struct page *page, gfp_t mask) { - struct extent_buffer *eb = - container_of(head, struct extent_buffer, rcu_head); - - btrfs_release_extent_buffer(eb); -} - -int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) -{ - u64 start = page_offset(page); struct extent_buffer *eb; - int ret = 1; - spin_lock(&tree->buffer_lock); - eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - if (!eb) { - spin_unlock(&tree->buffer_lock); - return ret; + /* + * We need to make sure noboody is attaching this page to an eb right + * now. + */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&page->mapping->private_lock); + return 1; } - if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - ret = 0; - goto out; - } + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); /* - * set @eb->refs to 0 if it is already 1, and then release the @eb. - * Or go back. + * This is a little awful but should be ok, we need to make sure that + * the eb doesn't disappear out from under us while we're looking at + * this page. */ - if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { - ret = 0; - goto out; + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + spin_unlock(&eb->refs_lock); + spin_unlock(&page->mapping->private_lock); + return 0; } + spin_unlock(&page->mapping->private_lock); - radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); -out: - spin_unlock(&tree->buffer_lock); + if ((mask & GFP_NOFS) == GFP_NOFS) + mask = GFP_NOFS; - /* at this point we can safely release the extent buffer */ - if (atomic_read(&eb->refs) == 0) - call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); - return ret; + /* + * If tree ref isn't set then we know the ref on this eb is a real ref, + * so just return, this page will likely be freed soon anyway. + */ + if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + spin_unlock(&eb->refs_lock); + return 0; + } + release_extent_buffer(eb, mask); + + return 1; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index cecc3518c12..faf10eb57f7 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -35,6 +35,10 @@ #define EXTENT_BUFFER_DIRTY 2 #define EXTENT_BUFFER_CORRUPT 3 #define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ +#define EXTENT_BUFFER_TREE_REF 5 +#define EXTENT_BUFFER_STALE 6 +#define EXTENT_BUFFER_WRITEBACK 7 +#define EXTENT_BUFFER_IOERR 8 /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 @@ -54,6 +58,7 @@ #define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 struct extent_state; +struct btrfs_root; typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, struct bio *bio, int mirror_num, @@ -69,9 +74,7 @@ struct extent_io_ops { size_t size, struct bio *bio, unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); - int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, int failed_mirror, - struct extent_state *state); + int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, u64 start, u64 end, struct extent_state *state); @@ -97,6 +100,7 @@ struct extent_io_tree { struct radix_tree_root buffer; struct address_space *mapping; u64 dirty_bytes; + int track_uptodate; spinlock_t lock; spinlock_t buffer_lock; struct extent_io_ops *ops; @@ -119,16 +123,21 @@ struct extent_state { struct list_head leak_list; }; +#define INLINE_EXTENT_BUFFER_PAGES 16 +#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE) struct extent_buffer { u64 start; unsigned long len; unsigned long map_start; unsigned long map_len; - struct page *first_page; unsigned long bflags; + struct extent_io_tree *tree; + spinlock_t refs_lock; + atomic_t refs; + atomic_t io_pages; + int failed_mirror; struct list_head leak_list; struct rcu_head rcu_head; - atomic_t refs; pid_t lock_owner; /* count of read lock holders on the extent buffer */ @@ -152,6 +161,9 @@ struct extent_buffer { * to unlock */ wait_queue_head_t read_lock_wq; + wait_queue_head_t lock_wq; + struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES]; + struct page **pages; }; static inline void extent_set_compress_type(unsigned long *bio_flags, @@ -178,18 +190,17 @@ void extent_io_tree_init(struct extent_io_tree *tree, int try_release_extent_mapping(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); -int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page); +int try_release_extent_buffer(struct page *page, gfp_t mask); int try_release_extent_state(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, struct extent_state **cached, gfp_t mask); -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); + int bits, struct extent_state **cached); +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end); int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached, gfp_t mask); -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num); int __init extent_io_init(void); @@ -210,7 +221,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive_bits, u64 *failed_start, + int bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); @@ -240,6 +251,8 @@ int extent_writepages(struct extent_io_tree *tree, struct address_space *mapping, get_extent_t *get_extent, struct writeback_control *wbc); +int btree_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc); int extent_readpages(struct extent_io_tree *tree, struct address_space *mapping, struct list_head *pages, unsigned nr_pages, @@ -251,11 +264,11 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len, - struct page *page0); + u64 start, unsigned long len); struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); void free_extent_buffer(struct extent_buffer *eb); +void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_NONE 0 #define WAIT_COMPLETE 1 #define WAIT_PAGE_LOCK 2 @@ -287,19 +300,12 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len); void memset_extent_buffer(struct extent_buffer *eb, char c, unsigned long start, unsigned long len); -int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); -int clear_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb); -int set_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb); -int set_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb); -int clear_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state **cached_state); -int extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state *cached_state); +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); +void clear_extent_buffer_dirty(struct extent_buffer *eb); +int set_extent_buffer_dirty(struct extent_buffer *eb); +int set_extent_buffer_uptodate(struct extent_buffer *eb); +int clear_extent_buffer_uptodate(struct extent_buffer *eb); +int extent_buffer_uptodate(struct extent_buffer *eb); int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long min_len, char **map, unsigned long *map_start, @@ -320,4 +326,6 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, u64 length, u64 logical, struct page *page, int mirror_num); int end_extent_writepage(struct page *page, int err, u64 start, u64 end); +int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, + int mirror_num); #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 078b4fd5450..5d158d32023 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -25,10 +25,12 @@ #include "transaction.h" #include "print-tree.h" -#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ +#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ size) - 1)) +#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE)) + #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ sizeof(struct btrfs_sector_sum) * \ @@ -59,7 +61,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, sizeof(*item)); if (ret < 0) goto out; - BUG_ON(ret); + BUG_ON(ret); /* Can't happen */ leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -284,6 +286,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_ordered_sum *sums; struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; + LIST_HEAD(tmplist); unsigned long offset; int ret; size_t size; @@ -358,7 +361,10 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), GFP_NOFS); - BUG_ON(!sums); + if (!sums) { + ret = -ENOMEM; + goto fail; + } sector_sum = sums->sums; sums->bytenr = start; @@ -380,12 +386,19 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, offset += csum_size; sector_sum++; } - list_add_tail(&sums->list, list); + list_add_tail(&sums->list, &tmplist); } path->slots[0]++; } ret = 0; fail: + while (ret < 0 && !list_empty(&tmplist)) { + sums = list_entry(&tmplist, struct btrfs_ordered_sum, list); + list_del(&sums->list); + kfree(sums); + } + list_splice_tail(&tmplist, list); + btrfs_free_path(path); return ret; } @@ -420,7 +433,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, offset = page_offset(bvec->bv_page) + bvec->bv_offset; ordered = btrfs_lookup_ordered_extent(inode, offset); - BUG_ON(!ordered); + BUG_ON(!ordered); /* Logic error */ sums->bytenr = ordered->start; while (bio_index < bio->bi_vcnt) { @@ -439,11 +452,11 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), GFP_NOFS); - BUG_ON(!sums); + BUG_ON(!sums); /* -ENOMEM */ sector_sum = sums->sums; sums->len = bytes_left; ordered = btrfs_lookup_ordered_extent(inode, offset); - BUG_ON(!ordered); + BUG_ON(!ordered); /* Logic error */ sums->bytenr = ordered->start; } @@ -483,18 +496,17 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, * This calls btrfs_truncate_item with the correct args based on the * overlap, and fixes up the key as required. */ -static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *key, - u64 bytenr, u64 len) +static noinline void truncate_one_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u64 bytenr, u64 len) { struct extent_buffer *leaf; u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); u64 csum_end; u64 end_byte = bytenr + len; u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; - int ret; leaf = path->nodes[0]; csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; @@ -510,7 +522,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, */ u32 new_size = (bytenr - key->offset) >> blocksize_bits; new_size *= csum_size; - ret = btrfs_truncate_item(trans, root, path, new_size, 1); + btrfs_truncate_item(trans, root, path, new_size, 1); } else if (key->offset >= bytenr && csum_end > end_byte && end_byte > key->offset) { /* @@ -522,15 +534,13 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, u32 new_size = (csum_end - end_byte) >> blocksize_bits; new_size *= csum_size; - ret = btrfs_truncate_item(trans, root, path, new_size, 0); + btrfs_truncate_item(trans, root, path, new_size, 0); key->offset = end_byte; - ret = btrfs_set_item_key_safe(trans, root, path, key); - BUG_ON(ret); + btrfs_set_item_key_safe(trans, root, path, key); } else { BUG(); } - return 0; } /* @@ -635,13 +645,14 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, * item changed size or key */ ret = btrfs_split_item(trans, root, path, &key, offset); - BUG_ON(ret && ret != -EAGAIN); + if (ret && ret != -EAGAIN) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } key.offset = end_byte - 1; } else { - ret = truncate_one_csum(trans, root, path, - &key, bytenr, len); - BUG_ON(ret); + truncate_one_csum(trans, root, path, &key, bytenr, len); if (key.offset < bytenr) break; } @@ -772,7 +783,7 @@ again: if (diff != csum_size) goto insert; - ret = btrfs_extend_item(trans, root, path, diff); + btrfs_extend_item(trans, root, path, diff); goto csum; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e8d06b6b919..d83260d7498 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -452,7 +452,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split = alloc_extent_map(); if (!split2) split2 = alloc_extent_map(); - BUG_ON(!split || !split2); + BUG_ON(!split || !split2); /* -ENOMEM */ write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -494,7 +494,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->flags = flags; split->compress_type = em->compress_type; ret = add_extent_mapping(em_tree, split); - BUG_ON(ret); + BUG_ON(ret); /* Logic error */ free_extent_map(split); split = split2; split2 = NULL; @@ -520,7 +520,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, } ret = add_extent_mapping(em_tree, split); - BUG_ON(ret); + BUG_ON(ret); /* Logic error */ free_extent_map(split); split = NULL; } @@ -679,7 +679,7 @@ next_slot: root->root_key.objectid, new_key.objectid, start - extent_offset, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ *hint_byte = disk_bytenr; } key.offset = start; @@ -754,7 +754,7 @@ next_slot: root->root_key.objectid, key.objectid, key.offset - extent_offset, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ inode_sub_bytes(inode, extent_end - key.offset); *hint_byte = disk_bytenr; @@ -770,7 +770,10 @@ next_slot: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } del_nr = 0; del_slot = 0; @@ -782,11 +785,13 @@ next_slot: BUG_ON(1); } - if (del_nr > 0) { + if (!ret && del_nr > 0) { ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, root, ret); } +out: btrfs_free_path(path); return ret; } @@ -944,7 +949,10 @@ again: btrfs_release_path(path); goto again; } - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0] - 1, @@ -963,7 +971,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (split == start) { key.offset = start; @@ -990,7 +998,7 @@ again: ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } other_start = 0; other_end = start; @@ -1007,7 +1015,7 @@ again: ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } if (del_nr == 0) { fi = btrfs_item_ptr(leaf, path->slots[0], @@ -1025,7 +1033,10 @@ again: btrfs_mark_buffer_dirty(leaf); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - BUG_ON(ret); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } } out: btrfs_free_path(path); @@ -1105,8 +1116,7 @@ again: if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, 0, &cached_state, - GFP_NOFS); + start_pos, last_pos - 1, 0, &cached_state); ordered = btrfs_lookup_first_ordered_extent(inode, last_pos - 1); if (ordered && @@ -1638,7 +1648,7 @@ static long btrfs_fallocate(struct file *file, int mode, * transaction */ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, 0, &cached_state, GFP_NOFS); + locked_end, 0, &cached_state); ordered = btrfs_lookup_first_ordered_extent(inode, alloc_end - 1); if (ordered && @@ -1667,7 +1677,13 @@ static long btrfs_fallocate(struct file *file, int mode, em = btrfs_get_extent(inode, NULL, 0, cur_offset, alloc_end - cur_offset, 0); - BUG_ON(IS_ERR_OR_NULL(em)); + if (IS_ERR_OR_NULL(em)) { + if (!em) + ret = -ENOMEM; + else + ret = PTR_ERR(em); + break; + } last_byte = min(extent_map_end(em), alloc_end); actual_end = min_t(u64, extent_map_end(em), offset + len); last_byte = (last_byte + mask) & ~mask; @@ -1737,7 +1753,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) return -ENXIO; lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, - &cached_state, GFP_NOFS); + &cached_state); /* * Delalloc is such a pain. If we have a hole and we have pending diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index b02e379b14c..e88330d3df5 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -230,11 +230,13 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, if (ret) { trans->block_rsv = rsv; - WARN_ON(1); + btrfs_abort_transaction(trans, root, ret); return ret; } ret = btrfs_update_inode(trans, root, inode); + if (ret) + btrfs_abort_transaction(trans, root, ret); trans->block_rsv = rsv; return ret; @@ -869,7 +871,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, io_ctl_prepare_pages(&io_ctl, inode, 0); lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - 0, &cached_state, GFP_NOFS); + 0, &cached_state); node = rb_first(&ctl->free_space_offset); if (!node && cluster) { @@ -1948,14 +1950,14 @@ again: */ ret = btrfs_add_free_space(block_group, old_start, offset - old_start); - WARN_ON(ret); + WARN_ON(ret); /* -ENOMEM */ goto out; } ret = remove_from_bitmap(ctl, info, &offset, &bytes); if (ret == -EAGAIN) goto again; - BUG_ON(ret); + BUG_ON(ret); /* logic error */ out_lock: spin_unlock(&ctl->tree_lock); out: @@ -2346,7 +2348,7 @@ again: rb_erase(&entry->offset_index, &ctl->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 1); - BUG_ON(ret); + BUG_ON(ret); /* -EEXIST; Logic error */ trace_btrfs_setup_cluster(block_group, cluster, total_found * block_group->sectorsize, 1); @@ -2439,7 +2441,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); total_size += entry->bytes; - BUG_ON(ret); + BUG_ON(ret); /* -EEXIST; Logic error */ } while (node && entry != last); cluster->max_size = max_extent; @@ -2830,6 +2832,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) int ret; ret = search_bitmap(ctl, entry, &offset, &count); + /* Logic error; Should be empty if it can't find anything */ BUG_ON(ret); ino = offset; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index baa74f3db69..a13cf1a96c7 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -19,6 +19,7 @@ #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "print-tree.h" static int find_name_in_backref(struct btrfs_path *path, const char *name, int name_len, struct btrfs_inode_ref **ref_ret) @@ -128,13 +129,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); - ret = btrfs_truncate_item(trans, root, path, + btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1); out: btrfs_free_path(path); return ret; } +/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, @@ -165,7 +167,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, goto out; old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - ret = btrfs_extend_item(trans, root, path, ins_len); + btrfs_extend_item(trans, root, path, ins_len); ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index ee15d88b33d..b1a1c929ba8 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -178,7 +178,7 @@ static void start_caching(struct btrfs_root *root) tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", root->root_key.objectid); - BUG_ON(IS_ERR(tsk)); + BUG_ON(IS_ERR(tsk)); /* -ENOMEM */ } int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) @@ -271,7 +271,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) break; info = rb_entry(n, struct btrfs_free_space, offset_index); - BUG_ON(info->bitmap); + BUG_ON(info->bitmap); /* Logic error */ if (info->offset > root->cache_progress) goto free; @@ -439,17 +439,16 @@ int btrfs_save_ino_cache(struct btrfs_root *root, if (ret) goto out; trace_btrfs_space_reservation(root->fs_info, "ino_cache", - (u64)(unsigned long)trans, - trans->bytes_reserved, 1); + trans->transid, trans->bytes_reserved, 1); again: inode = lookup_free_ino_inode(root, path); - if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { + if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) { ret = PTR_ERR(inode); goto out_release; } if (IS_ERR(inode)) { - BUG_ON(retry); + BUG_ON(retry); /* Logic error */ retry = true; ret = create_free_ino_inode(root, trans, path); @@ -460,12 +459,17 @@ again: BTRFS_I(inode)->generation = 0; ret = btrfs_update_inode(trans, root, inode); - WARN_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_put; + } if (i_size_read(inode) > 0) { ret = btrfs_truncate_free_space_cache(root, trans, path, inode); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); goto out_put; + } } spin_lock(&root->cache_lock); @@ -502,8 +506,7 @@ out_put: iput(inode); out_release: trace_btrfs_space_reservation(root->fs_info, "ino_cache", - (u64)(unsigned long)trans, - trans->bytes_reserved, 0); + trans->transid, trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: trans->block_rsv = rsv; @@ -532,7 +535,7 @@ static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) goto error; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ if (path->slots[0] > 0) { slot = path->slots[0] - 1; l = path->nodes[0]; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3a0b5c1f9d3..115bc05e42b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -150,7 +150,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, inode_add_bytes(inode, size); ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); - BUG_ON(ret); if (ret) { err = ret; goto fail; @@ -206,9 +205,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, * could end up racing with unlink. */ BTRFS_I(inode)->disk_i_size = inode->i_size; - btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, inode); - return 0; + return ret; fail: btrfs_free_path(path); return err; @@ -250,14 +249,18 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, ret = btrfs_drop_extents(trans, inode, start, aligned_end, &hint_byte, 1); - BUG_ON(ret); + if (ret) + return ret; if (isize > actual_end) inline_len = min_t(u64, isize, actual_end); ret = insert_inline_extent(trans, root, inode, start, inline_len, compressed_size, compress_type, compressed_pages); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; @@ -293,7 +296,7 @@ static noinline int add_async_extent(struct async_cow *cow, struct async_extent *async_extent; async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); - BUG_ON(!async_extent); + BUG_ON(!async_extent); /* -ENOMEM */ async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; @@ -344,8 +347,9 @@ static noinline int compress_file_range(struct inode *inode, int will_compress; int compress_type = root->fs_info->compress_type; - /* if this is a small write inside eof, kick off a defragbot */ - if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024) + /* if this is a small write inside eof, kick off a defrag */ + if ((end - start + 1) < 16 * 1024 && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); actual_end = min_t(u64, isize, end + 1); @@ -433,7 +437,11 @@ again: cont: if (start == 0) { trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto cleanup_and_out; + } trans->block_rsv = &root->fs_info->delalloc_block_rsv; /* lets try to make an inline extent */ @@ -450,11 +458,11 @@ cont: total_compressed, compress_type, pages); } - if (ret == 0) { + if (ret <= 0) { /* - * inline extent creation worked, we don't need - * to create any more async work items. Unlock - * and free up our temp pages. + * inline extent creation worked or returned error, + * we don't need to create any more async work items. + * Unlock and free up our temp pages. */ extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, @@ -547,7 +555,7 @@ cleanup_and_bail_uncompressed: } out: - return 0; + return ret; free_pages_out: for (i = 0; i < nr_pages_ret; i++) { @@ -557,6 +565,20 @@ free_pages_out: kfree(pages); goto out; + +cleanup_and_out: + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_DIRTY | + EXTENT_CLEAR_DELALLOC | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); + if (!trans || IS_ERR(trans)) + btrfs_error(root->fs_info, ret, "Failed to join transaction"); + else + btrfs_abort_transaction(trans, root, ret); + goto free_pages_out; } /* @@ -597,7 +619,7 @@ retry: lock_extent(io_tree, async_extent->start, async_extent->start + - async_extent->ram_size - 1, GFP_NOFS); + async_extent->ram_size - 1); /* allocate blocks */ ret = cow_file_range(inode, async_cow->locked_page, @@ -606,6 +628,8 @@ retry: async_extent->ram_size - 1, &page_started, &nr_written, 0); + /* JDM XXX */ + /* * if page_started, cow_file_range inserted an * inline extent and took care of all the unlocking @@ -625,18 +649,21 @@ retry: } lock_extent(io_tree, async_extent->start, - async_extent->start + async_extent->ram_size - 1, - GFP_NOFS); + async_extent->start + async_extent->ram_size - 1); trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_reserve_extent(trans, root, + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + } else { + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + ret = btrfs_reserve_extent(trans, root, async_extent->compressed_size, async_extent->compressed_size, - 0, alloc_hint, - (u64)-1, &ins, 1); - btrfs_end_transaction(trans, root); + 0, alloc_hint, &ins, 1); + if (ret) + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + } if (ret) { int i; @@ -649,8 +676,10 @@ retry: async_extent->pages = NULL; unlock_extent(io_tree, async_extent->start, async_extent->start + - async_extent->ram_size - 1, GFP_NOFS); - goto retry; + async_extent->ram_size - 1); + if (ret == -ENOSPC) + goto retry; + goto out_free; /* JDM: Requeue? */ } /* @@ -662,7 +691,7 @@ retry: async_extent->ram_size - 1, 0); em = alloc_extent_map(); - BUG_ON(!em); + BUG_ON(!em); /* -ENOMEM */ em->start = async_extent->start; em->len = async_extent->ram_size; em->orig_start = em->start; @@ -694,7 +723,7 @@ retry: ins.offset, BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ /* * clear dirty, set writeback and unlock the pages. @@ -716,13 +745,17 @@ retry: ins.offset, async_extent->pages, async_extent->nr_pages); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ alloc_hint = ins.objectid + ins.offset; kfree(async_extent); cond_resched(); } - - return 0; + ret = 0; +out: + return ret; +out_free: + kfree(async_extent); + goto out; } static u64 get_extent_allocation_hint(struct inode *inode, u64 start, @@ -791,7 +824,18 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(btrfs_is_free_space_inode(root, inode)); trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); + return PTR_ERR(trans); + } trans->block_rsv = &root->fs_info->delalloc_block_rsv; num_bytes = (end - start + blocksize) & ~(blocksize - 1); @@ -800,7 +844,8 @@ static noinline int cow_file_range(struct inode *inode, ret = 0; /* if this is a small write inside eof, kick off defrag */ - if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024) + if (num_bytes < 64 * 1024 && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(trans, inode); if (start == 0) { @@ -821,8 +866,10 @@ static noinline int cow_file_range(struct inode *inode, *nr_written = *nr_written + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; *page_started = 1; - ret = 0; goto out; + } else if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out_unlock; } } @@ -838,11 +885,14 @@ static noinline int cow_file_range(struct inode *inode, cur_alloc_size = disk_num_bytes; ret = btrfs_reserve_extent(trans, root, cur_alloc_size, root->sectorsize, 0, alloc_hint, - (u64)-1, &ins, 1); - BUG_ON(ret); + &ins, 1); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out_unlock; + } em = alloc_extent_map(); - BUG_ON(!em); + BUG_ON(!em); /* -ENOMEM */ em->start = start; em->orig_start = em->start; ram_size = ins.offset; @@ -868,13 +918,16 @@ static noinline int cow_file_range(struct inode *inode, cur_alloc_size = ins.offset; ret = btrfs_add_ordered_extent(inode, start, ins.objectid, ram_size, cur_alloc_size, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_unlock; + } } if (disk_num_bytes < cur_alloc_size) @@ -899,11 +952,23 @@ static noinline int cow_file_range(struct inode *inode, alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; } -out: ret = 0; +out: btrfs_end_transaction(trans, root); return ret; +out_unlock: + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); + + goto out; } /* @@ -969,7 +1034,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, 1, 0, NULL, GFP_NOFS); while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); - BUG_ON(!async_cow); + BUG_ON(!async_cow); /* -ENOMEM */ async_cow->inode = inode; async_cow->root = root; async_cow->locked_page = locked_page; @@ -1060,7 +1125,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 disk_bytenr; u64 num_bytes; int extent_type; - int ret; + int ret, err; int type; int nocow; int check_prev = 1; @@ -1078,7 +1143,11 @@ static noinline int run_delalloc_nocow(struct inode *inode, else trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; cow_start = (u64)-1; @@ -1086,7 +1155,10 @@ static noinline int run_delalloc_nocow(struct inode *inode, while (1) { ret = btrfs_lookup_file_extent(trans, root, path, ino, cur_offset, 0); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto error; + } if (ret > 0 && path->slots[0] > 0 && check_prev) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, @@ -1100,8 +1172,10 @@ next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) - BUG_ON(1); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto error; + } if (ret > 0) break; leaf = path->nodes[0]; @@ -1189,7 +1263,10 @@ out_check: ret = cow_file_range(inode, locked_page, cow_start, found_key.offset - 1, page_started, nr_written, 1); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto error; + } cow_start = (u64)-1; } @@ -1198,7 +1275,7 @@ out_check: struct extent_map_tree *em_tree; em_tree = &BTRFS_I(inode)->extent_tree; em = alloc_extent_map(); - BUG_ON(!em); + BUG_ON(!em); /* -ENOMEM */ em->start = cur_offset; em->orig_start = em->start; em->len = num_bytes; @@ -1224,13 +1301,16 @@ out_check: ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, num_bytes, num_bytes, type); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, cur_offset, num_bytes); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto error; + } } extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, @@ -1249,18 +1329,23 @@ out_check: if (cow_start != (u64)-1) { ret = cow_file_range(inode, locked_page, cow_start, end, page_started, nr_written, 1); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto error; + } } +error: if (nolock) { - ret = btrfs_end_transaction_nolock(trans, root); - BUG_ON(ret); + err = btrfs_end_transaction_nolock(trans, root); } else { - ret = btrfs_end_transaction(trans, root); - BUG_ON(ret); + err = btrfs_end_transaction(trans, root); } + if (!ret) + ret = err; + btrfs_free_path(path); - return 0; + return ret; } /* @@ -1425,10 +1510,11 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, map_length = length; ret = btrfs_map_block(map_tree, READ, logical, &map_length, NULL, 0); - + /* Will always return 0 or 1 with map_multi == NULL */ + BUG_ON(ret < 0); if (map_length < length + size) return 1; - return ret; + return 0; } /* @@ -1448,7 +1534,7 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw, int ret = 0; ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ return 0; } @@ -1479,14 +1565,16 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; int skip_sum; + int metadata = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; if (btrfs_is_free_space_inode(root, inode)) - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2); - else - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - BUG_ON(ret); + metadata = 2; + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); + if (ret) + return ret; if (!(rw & REQ_WRITE)) { if (bio_flags & EXTENT_BIO_COMPRESSED) { @@ -1571,7 +1659,7 @@ again: page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, - &cached_state, GFP_NOFS); + &cached_state); /* already ordered? We're done */ if (PagePrivate2(page)) @@ -1675,13 +1763,15 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, */ ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, &hint, 0); - BUG_ON(ret); + if (ret) + goto out; ins.objectid = btrfs_ino(inode); ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); - BUG_ON(ret); + if (ret) + goto out; leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -1709,10 +1799,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_alloc_reserved_file_extent(trans, root, root->root_key.objectid, btrfs_ino(inode), file_pos, &ins); - BUG_ON(ret); +out: btrfs_free_path(path); - return 0; + return ret; } /* @@ -1740,35 +1830,41 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) end - start + 1); if (!ret) return 0; - BUG_ON(!ordered_extent); + BUG_ON(!ordered_extent); /* Logic error */ nolock = btrfs_is_free_space_inode(root, inode); if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { - BUG_ON(!list_empty(&ordered_extent->list)); + BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret) { if (nolock) trans = btrfs_join_transaction_nolock(root); else trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode_fallback(trans, root, inode); - BUG_ON(ret); + if (ret) /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, root, ret); } goto out; } lock_extent_bits(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, - 0, &cached_state, GFP_NOFS); + 0, &cached_state); if (nolock) trans = btrfs_join_transaction_nolock(root); else trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out_unlock; + } trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) @@ -1779,7 +1875,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len); - BUG_ON(ret); } else { BUG_ON(root == root->fs_info->tree_root); ret = insert_reserved_file_extent(trans, inode, @@ -1793,11 +1888,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) unpin_extent_cache(&BTRFS_I(inode)->extent_tree, ordered_extent->file_offset, ordered_extent->len); - BUG_ON(ret); } unlock_extent_cached(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, &cached_state, GFP_NOFS); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } add_pending_csums(trans, inode, ordered_extent->file_offset, &ordered_extent->list); @@ -1805,7 +1903,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { ret = btrfs_update_inode_fallback(trans, root, inode); - BUG_ON(ret); + if (ret) { /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, root, ret); + goto out; + } } ret = 0; out: @@ -1824,6 +1925,11 @@ out: btrfs_put_ordered_extent(ordered_extent); return 0; +out_unlock: + unlock_extent_cached(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, &cached_state, GFP_NOFS); + goto out; } static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, @@ -1905,6 +2011,8 @@ struct delayed_iput { struct inode *inode; }; +/* JDM: If this is fs-wide, why can't we add a pointer to + * btrfs_inode instead and avoid the allocation? */ void btrfs_add_delayed_iput(struct inode *inode) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; @@ -2051,20 +2159,27 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) /* grab metadata reservation from transaction handle */ if (reserve) { ret = btrfs_orphan_reserve_metadata(trans, inode); - BUG_ON(ret); + BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */ } /* insert an orphan item to track this unlinked/truncated file */ if (insert >= 1) { ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); - BUG_ON(ret && ret != -EEXIST); + if (ret && ret != -EEXIST) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + ret = 0; } /* insert an orphan item to track subvolume contains orphan files */ if (insert >= 2) { ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); - BUG_ON(ret); + if (ret && ret != -EEXIST) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } } return 0; } @@ -2094,7 +2209,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) if (trans && delete_item) { ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ } if (release_rsv) @@ -2228,7 +2343,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) } ret = btrfs_del_orphan_item(trans, root, found_key.objectid); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ btrfs_end_transaction(trans, root); continue; } @@ -2610,16 +2725,22 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, printk(KERN_INFO "btrfs failed to delete reference to %.*s, " "inode %llu parent %llu\n", name_len, name, (unsigned long long)ino, (unsigned long long)dir_ino); + btrfs_abort_transaction(trans, root, ret); goto err; } ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); goto err; + } ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, dir_ino); - BUG_ON(ret != 0 && ret != -ENOENT); + if (ret != 0 && ret != -ENOENT) { + btrfs_abort_transaction(trans, root, ret); + goto err; + } ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); @@ -2777,7 +2898,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, err = ret; goto out; } - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ if (check_path_shared(root, path)) goto out; btrfs_release_path(path); @@ -2810,7 +2931,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, err = PTR_ERR(ref); goto out; } - BUG_ON(!ref); + BUG_ON(!ref); /* Logic error */ if (check_path_shared(root, path)) goto out; index = btrfs_inode_ref_index(path->nodes[0], ref); @@ -2917,23 +3038,42 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, name_len, -1); - BUG_ON(IS_ERR_OR_NULL(di)); + if (IS_ERR_OR_NULL(di)) { + if (!di) + ret = -ENOENT; + else + ret = PTR_ERR(di); + goto out; + } leaf = path->nodes[0]; btrfs_dir_item_key_to_cpu(leaf, di, &key); WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ret = btrfs_delete_one_dir_name(trans, root, path, di); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } btrfs_release_path(path); ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, objectid, root->root_key.objectid, dir_ino, &index, name, name_len); if (ret < 0) { - BUG_ON(ret != -ENOENT); + if (ret != -ENOENT) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } di = btrfs_search_dir_index_item(root, path, dir_ino, name, name_len); - BUG_ON(IS_ERR_OR_NULL(di)); + if (IS_ERR_OR_NULL(di)) { + if (!di) + ret = -ENOENT; + else + ret = PTR_ERR(di); + btrfs_abort_transaction(trans, root, ret); + goto out; + } leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); @@ -2943,15 +3083,19 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_release_path(path); ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } btrfs_i_size_write(dir, dir->i_size - name_len * 2); dir->i_mtime = dir->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, dir); - BUG_ON(ret); - + if (ret) + btrfs_abort_transaction(trans, root, ret); +out: btrfs_free_path(path); - return 0; + return ret; } static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) @@ -3161,8 +3305,8 @@ search_again: } size = btrfs_file_extent_calc_inline_size(size); - ret = btrfs_truncate_item(trans, root, path, - size, 1); + btrfs_truncate_item(trans, root, path, + size, 1); } else if (root->ref_cows) { inode_sub_bytes(inode, item_end + 1 - found_key.offset); @@ -3210,7 +3354,11 @@ delete: ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, + root, ret); + goto error; + } pending_del_nr = 0; } btrfs_release_path(path); @@ -3223,8 +3371,10 @@ out: if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, root, ret); } +error: btrfs_free_path(path); return err; } @@ -3282,8 +3432,7 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, - GFP_NOFS); + lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); set_page_extent_mapped(page); ordered = btrfs_lookup_ordered_extent(inode, page_start); @@ -3359,7 +3508,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) btrfs_wait_ordered_range(inode, hole_start, block_end - hole_start); lock_extent_bits(io_tree, hole_start, block_end - 1, 0, - &cached_state, GFP_NOFS); + &cached_state); ordered = btrfs_lookup_ordered_extent(inode, hole_start); if (!ordered) break; @@ -3372,7 +3521,10 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) while (1) { em = btrfs_get_extent(inode, NULL, 0, cur_offset, block_end - cur_offset, 0); - BUG_ON(IS_ERR_OR_NULL(em)); + if (IS_ERR(em)) { + err = PTR_ERR(em); + break; + } last_byte = min(extent_map_end(em), block_end); last_byte = (last_byte + mask) & ~mask; if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { @@ -3389,7 +3541,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) cur_offset + hole_size, &hint_byte, 1); if (err) { - btrfs_update_inode(trans, root, inode); + btrfs_abort_transaction(trans, root, err); btrfs_end_transaction(trans, root); break; } @@ -3399,7 +3551,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) 0, hole_size, 0, hole_size, 0, 0, 0); if (err) { - btrfs_update_inode(trans, root, inode); + btrfs_abort_transaction(trans, root, err); btrfs_end_transaction(trans, root); break; } @@ -3779,7 +3931,7 @@ static void inode_tree_del(struct inode *inode) } } -int btrfs_invalidate_inodes(struct btrfs_root *root) +void btrfs_invalidate_inodes(struct btrfs_root *root) { struct rb_node *node; struct rb_node *prev; @@ -3839,7 +3991,6 @@ again: node = rb_next(node); } spin_unlock(&root->inode_lock); - return 0; } static int btrfs_init_locked_inode(struct inode *inode, void *p) @@ -4581,18 +4732,26 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, parent_ino, index); } - if (ret == 0) { - ret = btrfs_insert_dir_item(trans, root, name, name_len, - parent_inode, &key, - btrfs_inode_type(inode), index); - if (ret) - goto fail_dir_item; + /* Nothing to clean up yet */ + if (ret) + return ret; - btrfs_i_size_write(parent_inode, parent_inode->i_size + - name_len * 2); - parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; - ret = btrfs_update_inode(trans, root, parent_inode); + ret = btrfs_insert_dir_item(trans, root, name, name_len, + parent_inode, &key, + btrfs_inode_type(inode), index); + if (ret == -EEXIST) + goto fail_dir_item; + else if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; } + + btrfs_i_size_write(parent_inode, parent_inode->i_size + + name_len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, parent_inode); + if (ret) + btrfs_abort_transaction(trans, root, ret); return ret; fail_dir_item: @@ -4806,7 +4965,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } else { struct dentry *parent = dentry->d_parent; err = btrfs_update_inode(trans, root, inode); - BUG_ON(err); + if (err) + goto fail; d_instantiate(dentry, inode); btrfs_log_new_name(trans, inode, NULL, parent); } @@ -5137,7 +5297,7 @@ again: ret = uncompress_inline(path, inode, page, pg_offset, extent_offset, item); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } else { map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, @@ -5252,6 +5412,7 @@ out: free_extent_map(em); return ERR_PTR(err); } + BUG_ON(!em); /* Error is always set */ return em; } @@ -5414,7 +5575,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, alloc_hint = get_extent_allocation_hint(inode, start, len); ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, - alloc_hint, (u64)-1, &ins, 1); + alloc_hint, &ins, 1); if (ret) { em = ERR_PTR(ret); goto out; @@ -5602,7 +5763,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, free_extent_map(em); /* DIO will do one hole at a time, so just unlock a sector */ unlock_extent(&BTRFS_I(inode)->io_tree, start, - start + root->sectorsize - 1, GFP_NOFS); + start + root->sectorsize - 1); return 0; } @@ -5743,7 +5904,7 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) } while (bvec <= bvec_end); unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, - dip->logical_offset + dip->bytes - 1, GFP_NOFS); + dip->logical_offset + dip->bytes - 1); bio->bi_private = dip->private; kfree(dip->csums); @@ -5794,7 +5955,7 @@ again: lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, ordered->file_offset + ordered->len - 1, 0, - &cached_state, GFP_NOFS); + &cached_state); if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { ret = btrfs_mark_extent_written(trans, inode, @@ -5868,7 +6029,7 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, int ret; struct btrfs_root *root = BTRFS_I(inode)->root; ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ return 0; } @@ -6209,7 +6370,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, while (1) { lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - 0, &cached_state, GFP_NOFS); + 0, &cached_state); /* * We're concerned with the entire range that we're going to be * doing DIO to, so we need to make sure theres no ordered @@ -6233,7 +6394,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, if (writing) { write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING; ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - EXTENT_DELALLOC, 0, NULL, &cached_state, + EXTENT_DELALLOC, NULL, &cached_state, GFP_NOFS); if (ret) { clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, @@ -6363,8 +6524,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) btrfs_releasepage(page, GFP_NOFS); return; } - lock_extent_bits(tree, page_start, page_end, 0, &cached_state, - GFP_NOFS); + lock_extent_bits(tree, page_start, page_end, 0, &cached_state); ordered = btrfs_lookup_ordered_extent(page->mapping->host, page_offset(page)); if (ordered) { @@ -6386,8 +6546,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) } btrfs_put_ordered_extent(ordered); cached_state = NULL; - lock_extent_bits(tree, page_start, page_end, 0, &cached_state, - GFP_NOFS); + lock_extent_bits(tree, page_start, page_end, 0, &cached_state); } clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | @@ -6462,8 +6621,7 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, - GFP_NOFS); + lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); set_page_extent_mapped(page); /* @@ -6737,10 +6895,9 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, btrfs_i_size_write(inode, 0); err = btrfs_update_inode(trans, new_root, inode); - BUG_ON(err); iput(inode); - return 0; + return err; } struct inode *btrfs_alloc_inode(struct super_block *sb) @@ -6783,6 +6940,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_map_tree_init(&ei->extent_tree); extent_io_tree_init(&ei->io_tree, &inode->i_data); extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); + ei->io_tree.track_uptodate = 1; + ei->io_failure_tree.track_uptodate = 1; mutex_init(&ei->log_mutex); mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); @@ -7072,7 +7231,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!ret) ret = btrfs_update_inode(trans, root, old_inode); } - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } if (new_inode) { new_inode->i_ctime = CURRENT_TIME; @@ -7090,11 +7252,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dentry->d_name.name, new_dentry->d_name.len); } - BUG_ON(ret); - if (new_inode->i_nlink == 0) { + if (!ret && new_inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, new_dentry->d_inode); BUG_ON(ret); } + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } } fixup_inode_flags(new_dir, old_inode); @@ -7102,7 +7267,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, ret = btrfs_add_link(trans, new_dir, old_inode, new_dentry->d_name.name, new_dentry->d_name.len, 0, index); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { struct dentry *parent = new_dentry->d_parent; @@ -7315,7 +7483,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, - 0, *alloc_hint, (u64)-1, &ins, 1); + 0, *alloc_hint, &ins, 1); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); @@ -7327,7 +7495,12 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, ins.offset, ins.offset, ins.offset, 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + if (own_trans) + btrfs_end_transaction(trans, root); + break; + } btrfs_drop_extent_cache(inode, cur_offset, cur_offset + ins.offset -1, 0); @@ -7349,7 +7522,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); + + if (ret) { + btrfs_abort_transaction(trans, root, ret); + if (own_trans) + btrfs_end_transaction(trans, root); + break; + } if (own_trans) btrfs_end_transaction(trans, root); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d8b54715c2d..18cc23d164a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -425,22 +425,37 @@ static noinline int create_subvol(struct btrfs_root *root, key.offset = (u64)-1; new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); - BUG_ON(IS_ERR(new_root)); + if (IS_ERR(new_root)) { + btrfs_abort_transaction(trans, root, PTR_ERR(new_root)); + ret = PTR_ERR(new_root); + goto fail; + } btrfs_record_root_in_trans(trans, new_root); ret = btrfs_create_subvol_root(trans, new_root, new_dirid); + if (ret) { + /* We potentially lose an unused inode item here */ + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + /* * insert the directory item */ ret = btrfs_set_inode_index(dir, &index); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } ret = btrfs_insert_dir_item(trans, root, name, namelen, dir, &key, BTRFS_FT_DIR, index); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); goto fail; + } btrfs_i_size_write(dir, dir->i_size + namelen * 2); ret = btrfs_update_inode(trans, root, dir); @@ -769,6 +784,31 @@ none: return -ENOENT; } +/* + * Validaty check of prev em and next em: + * 1) no prev/next em + * 2) prev/next em is an hole/inline extent + */ +static int check_adjacent_extents(struct inode *inode, struct extent_map *em) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *prev = NULL, *next = NULL; + int ret = 0; + + read_lock(&em_tree->lock); + prev = lookup_extent_mapping(em_tree, em->start - 1, (u64)-1); + next = lookup_extent_mapping(em_tree, em->start + em->len, (u64)-1); + read_unlock(&em_tree->lock); + + if ((!prev || prev->block_start >= EXTENT_MAP_LAST_BYTE) && + (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)) + ret = 1; + free_extent_map(prev); + free_extent_map(next); + + return ret; +} + static int should_defrag_range(struct inode *inode, u64 start, u64 len, int thresh, u64 *last_len, u64 *skip, u64 *defrag_end) @@ -797,17 +837,25 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, if (!em) { /* get the big lock and read metadata off disk */ - lock_extent(io_tree, start, start + len - 1, GFP_NOFS); + lock_extent(io_tree, start, start + len - 1); em = btrfs_get_extent(inode, NULL, 0, start, len, 0); - unlock_extent(io_tree, start, start + len - 1, GFP_NOFS); + unlock_extent(io_tree, start, start + len - 1); if (IS_ERR(em)) return 0; } /* this will cover holes, and inline extents */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { ret = 0; + goto out; + } + + /* If we have nothing to merge with us, just skip. */ + if (check_adjacent_extents(inode, em)) { + ret = 0; + goto out; + } /* * we hit a real extent, if it is big don't bother defragging it again @@ -815,6 +863,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh) ret = 0; +out: /* * last_len ends up being a counter of how many bytes we've defragged. * every time we choose not to defrag an extent, we reset *last_len @@ -856,6 +905,7 @@ static int cluster_pages_for_defrag(struct inode *inode, u64 isize = i_size_read(inode); u64 page_start; u64 page_end; + u64 page_cnt; int ret; int i; int i_done; @@ -864,19 +914,21 @@ static int cluster_pages_for_defrag(struct inode *inode, struct extent_io_tree *tree; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - if (isize == 0) - return 0; file_end = (isize - 1) >> PAGE_CACHE_SHIFT; + if (!isize || start_index > file_end) + return 0; + + page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); ret = btrfs_delalloc_reserve_space(inode, - num_pages << PAGE_CACHE_SHIFT); + page_cnt << PAGE_CACHE_SHIFT); if (ret) return ret; i_done = 0; tree = &BTRFS_I(inode)->io_tree; /* step one, lock all the pages */ - for (i = 0; i < num_pages; i++) { + for (i = 0; i < page_cnt; i++) { struct page *page; again: page = find_or_create_page(inode->i_mapping, @@ -887,10 +939,10 @@ again: page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; while (1) { - lock_extent(tree, page_start, page_end, GFP_NOFS); + lock_extent(tree, page_start, page_end); ordered = btrfs_lookup_ordered_extent(inode, page_start); - unlock_extent(tree, page_start, page_end, GFP_NOFS); + unlock_extent(tree, page_start, page_end); if (!ordered) break; @@ -898,6 +950,15 @@ again: btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); lock_page(page); + /* + * we unlocked the page above, so we need check if + * it was released or not. + */ + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } } if (!PageUptodate(page)) { @@ -911,15 +972,6 @@ again: } } - isize = i_size_read(inode); - file_end = (isize - 1) >> PAGE_CACHE_SHIFT; - if (!isize || page->index > file_end) { - /* whoops, we blew past eof, skip this page */ - unlock_page(page); - page_cache_release(page); - break; - } - if (page->mapping != inode->i_mapping) { unlock_page(page); page_cache_release(page); @@ -946,19 +998,18 @@ again: page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE; lock_extent_bits(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, 0, &cached_state, - GFP_NOFS); + page_start, page_end - 1, 0, &cached_state); clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, GFP_NOFS); - if (i_done != num_pages) { + if (i_done != page_cnt) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, - (num_pages - i_done) << PAGE_CACHE_SHIFT); + (page_cnt - i_done) << PAGE_CACHE_SHIFT); } @@ -983,7 +1034,7 @@ out: unlock_page(pages[i]); page_cache_release(pages[i]); } - btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT); + btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT); return ret; } @@ -1089,12 +1140,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (!(inode->i_sb->s_flags & MS_ACTIVE)) break; - if (!newer_than && - !should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, - extent_thresh, - &last_len, &skip, - &defrag_end)) { + if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, extent_thresh, + &last_len, &skip, &defrag_end)) { unsigned long next; /* * the should_defrag function tells us how much to skip @@ -1123,17 +1171,24 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index += max_cluster; } + mutex_lock(&inode->i_mutex); ret = cluster_pages_for_defrag(inode, pages, i, cluster); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&inode->i_mutex); goto out_ra; + } defrag_count += ret; balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); + mutex_unlock(&inode->i_mutex); if (newer_than) { if (newer_off == (u64)-1) break; + if (ret > 0) + i += ret; + newer_off = max(newer_off + 1, (u64)i << PAGE_CACHE_SHIFT); @@ -1966,7 +2021,11 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dest->root_key.objectid, dentry->d_name.name, dentry->d_name.len); - BUG_ON(ret); + if (ret) { + err = ret; + btrfs_abort_transaction(trans, root, ret); + goto out_end_trans; + } btrfs_record_root_in_trans(trans, dest); @@ -1979,11 +2038,16 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, dest->root_key.objectid); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + err = ret; + goto out_end_trans; + } } - +out_end_trans: ret = btrfs_end_transaction(trans, root); - BUG_ON(ret); + if (ret && !err) + err = ret; inode->i_flags |= S_DEAD; out_up_write: up_write(&root->fs_info->subvol_sem); @@ -2326,13 +2390,13 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, another, and lock file content */ while (1) { struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + lock_extent(&BTRFS_I(src)->io_tree, off, off+len); ordered = btrfs_lookup_first_ordered_extent(src, off+len); if (!ordered && !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len, EXTENT_DELALLOC, 0, NULL)) break; - unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); if (ordered) btrfs_put_ordered_extent(ordered); btrfs_wait_ordered_range(src, off, len); @@ -2447,11 +2511,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, new_key.offset, new_key.offset + datal, &hint_byte, 1); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, + ret); + btrfs_end_transaction(trans, root); + goto out; + } ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, + ret); + btrfs_end_transaction(trans, root); + goto out; + } leaf = path->nodes[0]; slot = path->slots[0]; @@ -2478,7 +2552,15 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_ino(inode), new_key.offset - datao, 0); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, + root, + ret); + btrfs_end_transaction(trans, + root); + goto out; + + } } } else if (type == BTRFS_FILE_EXTENT_INLINE) { u64 skip = 0; @@ -2503,11 +2585,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, new_key.offset, new_key.offset + datal, &hint_byte, 1); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, + ret); + btrfs_end_transaction(trans, root); + goto out; + } ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, + ret); + btrfs_end_transaction(trans, root); + goto out; + } if (skip) { u32 start = @@ -2541,8 +2633,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_i_size_write(inode, endoff); ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); - btrfs_end_transaction(trans, root); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + ret = btrfs_end_transaction(trans, root); } next: btrfs_release_path(path); @@ -2551,7 +2647,7 @@ next: ret = 0; out: btrfs_release_path(path); - unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); out_unlock: mutex_unlock(&src->i_mutex); mutex_unlock(&inode->i_mutex); @@ -3066,8 +3162,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, goto out; extent_item_pos = loi->logical - key.objectid; - ret = iterate_extent_inodes(root->fs_info, path, key.objectid, - extent_item_pos, build_ino_list, + ret = iterate_extent_inodes(root->fs_info, key.objectid, + extent_item_pos, 0, build_ino_list, inodes); if (ret < 0) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 5e178d8f716..272f911203f 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -208,7 +208,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) * take a spinning write lock. This will wait for both * blocking readers or writers */ -int btrfs_tree_lock(struct extent_buffer *eb) +void btrfs_tree_lock(struct extent_buffer *eb) { again: wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); @@ -230,13 +230,12 @@ again: atomic_inc(&eb->spinning_writers); atomic_inc(&eb->write_locks); eb->lock_owner = current->pid; - return 0; } /* * drop a spinning or a blocking write lock. */ -int btrfs_tree_unlock(struct extent_buffer *eb) +void btrfs_tree_unlock(struct extent_buffer *eb) { int blockers = atomic_read(&eb->blocking_writers); @@ -255,7 +254,6 @@ int btrfs_tree_unlock(struct extent_buffer *eb) atomic_dec(&eb->spinning_writers); write_unlock(&eb->lock); } - return 0; } void btrfs_assert_tree_locked(struct extent_buffer *eb) diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 17247ddb81a..ca52681e5f4 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -24,8 +24,8 @@ #define BTRFS_WRITE_LOCK_BLOCKING 3 #define BTRFS_READ_LOCK_BLOCKING 4 -int btrfs_tree_lock(struct extent_buffer *eb); -int btrfs_tree_unlock(struct extent_buffer *eb); +void btrfs_tree_lock(struct extent_buffer *eb); +void btrfs_tree_unlock(struct extent_buffer *eb); int btrfs_try_spin_lock(struct extent_buffer *eb); void btrfs_tree_read_lock(struct extent_buffer *eb); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a1c94042530..bbf6d0d9aeb 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -59,6 +59,14 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, return NULL; } +static void ordered_data_tree_panic(struct inode *inode, int errno, + u64 offset) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset " + "%llu\n", (unsigned long long)offset); +} + /* * look for a given offset in the tree, and if it can't be found return the * first lesser offset @@ -207,7 +215,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, spin_lock(&tree->lock); node = tree_insert(&tree->tree, file_offset, &entry->rb_node); - BUG_ON(node); + if (node) + ordered_data_tree_panic(inode, -EEXIST, file_offset); spin_unlock(&tree->lock); spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); @@ -215,7 +224,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, &BTRFS_I(inode)->root->fs_info->ordered_extents); spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); - BUG_ON(node); return 0; } @@ -249,9 +257,9 @@ int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, * when an ordered extent is finished. If the list covers more than one * ordered extent, it is split across multiples. */ -int btrfs_add_ordered_sum(struct inode *inode, - struct btrfs_ordered_extent *entry, - struct btrfs_ordered_sum *sum) +void btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum) { struct btrfs_ordered_inode_tree *tree; @@ -259,7 +267,6 @@ int btrfs_add_ordered_sum(struct inode *inode, spin_lock(&tree->lock); list_add_tail(&sum->list, &entry->list); spin_unlock(&tree->lock); - return 0; } /* @@ -384,7 +391,7 @@ out: * used to drop a reference on an ordered extent. This will free * the extent if the last reference is dropped */ -int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) +void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) { struct list_head *cur; struct btrfs_ordered_sum *sum; @@ -400,7 +407,6 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) } kfree(entry); } - return 0; } /* @@ -408,8 +414,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) * and you must wake_up entry->wait. You must hold the tree lock * while you call this function. */ -static int __btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) +static void __btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) { struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -436,35 +442,30 @@ static int __btrfs_remove_ordered_extent(struct inode *inode, list_del_init(&BTRFS_I(inode)->ordered_operations); } spin_unlock(&root->fs_info->ordered_extent_lock); - - return 0; } /* * remove an ordered extent from the tree. No references are dropped * but any waiters are woken. */ -int btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) +void btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) { struct btrfs_ordered_inode_tree *tree; - int ret; tree = &BTRFS_I(inode)->ordered_tree; spin_lock(&tree->lock); - ret = __btrfs_remove_ordered_extent(inode, entry); + __btrfs_remove_ordered_extent(inode, entry); spin_unlock(&tree->lock); wake_up(&entry->wait); - - return ret; } /* * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -int btrfs_wait_ordered_extents(struct btrfs_root *root, - int nocow_only, int delay_iput) +void btrfs_wait_ordered_extents(struct btrfs_root *root, + int nocow_only, int delay_iput) { struct list_head splice; struct list_head *cur; @@ -512,7 +513,6 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, spin_lock(&root->fs_info->ordered_extent_lock); } spin_unlock(&root->fs_info->ordered_extent_lock); - return 0; } /* @@ -525,7 +525,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, * extra check to make sure the ordered operation list really is empty * before we return */ -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) { struct btrfs_inode *btrfs_inode; struct inode *inode; @@ -573,8 +573,6 @@ again: spin_unlock(&root->fs_info->ordered_extent_lock); mutex_unlock(&root->fs_info->ordered_operations_mutex); - - return 0; } /* @@ -609,7 +607,7 @@ void btrfs_start_ordered_extent(struct inode *inode, /* * Used to wait on ordered extents across a large range of bytes. */ -int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) { u64 end; u64 orig_end; @@ -664,7 +662,6 @@ again: schedule_timeout(1); goto again; } - return 0; } /* @@ -948,9 +945,8 @@ out: * If trans is not null, we'll do a friendly check for a transaction that * is already flushing things and force the IO down ourselves. */ -int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode) +void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) { u64 last_mod; @@ -961,7 +957,7 @@ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, * commit, we can safely return without doing anything */ if (last_mod < root->fs_info->last_trans_committed) - return 0; + return; /* * the transaction is already committing. Just start the IO and @@ -969,7 +965,7 @@ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, */ if (trans && root->fs_info->running_transaction->blocked) { btrfs_wait_ordered_range(inode, 0, (u64)-1); - return 0; + return; } spin_lock(&root->fs_info->ordered_extent_lock); @@ -978,6 +974,4 @@ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, &root->fs_info->ordered_operations); } spin_unlock(&root->fs_info->ordered_extent_lock); - - return 0; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index ff1f69aa188..c355ad4dc1a 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -138,8 +138,8 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) t->last = NULL; } -int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); -int btrfs_remove_ordered_extent(struct inode *inode, +void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); +void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry); int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, @@ -154,14 +154,14 @@ int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type, int compress_type); -int btrfs_add_ordered_sum(struct inode *inode, - struct btrfs_ordered_extent *entry, - struct btrfs_ordered_sum *sum); +void btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, u64 file_offset); void btrfs_start_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry, int wait); -int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); +void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, @@ -170,10 +170,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); -int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode); -int btrfs_wait_ordered_extents(struct btrfs_root *root, - int nocow_only, int delay_iput); +void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); +void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); +void btrfs_wait_ordered_extents(struct btrfs_root *root, + int nocow_only, int delay_iput); #endif diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index f8be250963a..24cad1695af 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -58,7 +58,7 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; - if (ret) { + if (ret) { /* JDM: Really? */ ret = -ENOENT; goto out; } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 22db04550f6..dc5d33146fd 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -54,7 +54,6 @@ * than the 2 started one after another. */ -#define MAX_MIRRORS 2 #define MAX_IN_FLIGHT 6 struct reada_extctl { @@ -71,7 +70,7 @@ struct reada_extent { struct list_head extctl; struct kref refcnt; spinlock_t lock; - struct reada_zone *zones[MAX_MIRRORS]; + struct reada_zone *zones[BTRFS_MAX_MIRRORS]; int nzones; struct btrfs_device *scheduled_for; }; @@ -84,7 +83,8 @@ struct reada_zone { spinlock_t lock; int locked; struct btrfs_device *device; - struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */ + struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl + * self */ int ndevs; struct kref refcnt; }; @@ -365,9 +365,9 @@ again: if (ret || !bbio || length < blocksize) goto error; - if (bbio->num_stripes > MAX_MIRRORS) { + if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { printk(KERN_ERR "btrfs readahead: more than %d copies not " - "supported", MAX_MIRRORS); + "supported", BTRFS_MAX_MIRRORS); goto error; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8c1aae2c845..017281dbb2a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -326,6 +326,19 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) return NULL; } +void backref_tree_panic(struct rb_node *rb_node, int errno, + u64 bytenr) +{ + + struct btrfs_fs_info *fs_info = NULL; + struct backref_node *bnode = rb_entry(rb_node, struct backref_node, + rb_node); + if (bnode->root) + fs_info = bnode->root->fs_info; + btrfs_panic(fs_info, errno, "Inconsistency in backref cache " + "found at offset %llu\n", (unsigned long long)bytenr); +} + /* * walk up backref nodes until reach node presents tree root */ @@ -452,7 +465,8 @@ static void update_backref_node(struct backref_cache *cache, rb_erase(&node->rb_node, &cache->rb_root); node->bytenr = bytenr; rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, bytenr); } /* @@ -999,7 +1013,8 @@ next: if (!cowonly) { rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, node->bytenr); list_add_tail(&node->lower, &cache->leaves); } @@ -1034,7 +1049,9 @@ next: if (!cowonly) { rb_node = tree_insert(&cache->rb_root, upper->bytenr, &upper->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, + upper->bytenr); } list_add_tail(&edge->list[UPPER], &upper->lower); @@ -1180,7 +1197,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, rb_node = tree_insert(&cache->rb_root, new_node->bytenr, &new_node->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, new_node->bytenr); if (!new_node->lowest) { list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { @@ -1203,14 +1221,15 @@ fail: /* * helper to add 'address of tree root -> reloc tree' mapping */ -static int __add_reloc_root(struct btrfs_root *root) +static int __must_check __add_reloc_root(struct btrfs_root *root) { struct rb_node *rb_node; struct mapping_node *node; struct reloc_control *rc = root->fs_info->reloc_ctl; node = kmalloc(sizeof(*node), GFP_NOFS); - BUG_ON(!node); + if (!node) + return -ENOMEM; node->bytenr = root->node->start; node->data = root; @@ -1219,7 +1238,12 @@ static int __add_reloc_root(struct btrfs_root *root) rb_node = tree_insert(&rc->reloc_root_tree.rb_root, node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); - BUG_ON(rb_node); + if (rb_node) { + kfree(node); + btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found " + "for start=%llu while inserting into relocation " + "tree\n"); + } list_add_tail(&root->root_list, &rc->reloc_roots); return 0; @@ -1252,7 +1276,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del) rb_node = tree_insert(&rc->reloc_root_tree.rb_root, node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, node->bytenr); } else { list_del_init(&root->root_list); kfree(node); @@ -1334,6 +1359,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *reloc_root; struct reloc_control *rc = root->fs_info->reloc_ctl; int clear_rsv = 0; + int ret; if (root->reloc_root) { reloc_root = root->reloc_root; @@ -1353,7 +1379,8 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, if (clear_rsv) trans->block_rsv = NULL; - __add_reloc_root(reloc_root); + ret = __add_reloc_root(reloc_root); + BUG_ON(ret < 0); root->reloc_root = reloc_root; return 0; } @@ -1577,15 +1604,14 @@ int replace_file_extents(struct btrfs_trans_handle *trans, WARN_ON(!IS_ALIGNED(end, root->sectorsize)); end--; ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, - GFP_NOFS); + key.offset, end); if (!ret) continue; btrfs_drop_extent_cache(inode, key.offset, end, 1); unlock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, GFP_NOFS); + key.offset, end); } } @@ -1956,9 +1982,9 @@ static int invalidate_extent_cache(struct btrfs_root *root, } /* the lock_extent waits for readpage to complete */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + lock_extent(&BTRFS_I(inode)->io_tree, start, end); btrfs_drop_extent_cache(inode, start, end, 1); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end); } return 0; } @@ -2246,7 +2272,8 @@ again: } else { list_del_init(&reloc_root->root_list); } - btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); + ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); + BUG_ON(ret < 0); } if (found) { @@ -2862,12 +2889,12 @@ int prealloc_file_extent_cluster(struct inode *inode, else end = cluster->end - offset; - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + lock_extent(&BTRFS_I(inode)->io_tree, start, end); num_bytes = end + 1 - start; ret = btrfs_prealloc_file_range(inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end); if (ret) break; nr++; @@ -2899,7 +2926,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + lock_extent(&BTRFS_I(inode)->io_tree, start, end); while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); @@ -2910,7 +2937,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, } btrfs_drop_extent_cache(inode, start, end, 0); } - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end); return ret; } @@ -2990,8 +3017,7 @@ static int relocate_file_extent_cluster(struct inode *inode, page_start = (u64)page->index << PAGE_CACHE_SHIFT; page_end = page_start + PAGE_CACHE_SIZE - 1; - lock_extent(&BTRFS_I(inode)->io_tree, - page_start, page_end, GFP_NOFS); + lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); set_page_extent_mapped(page); @@ -3007,7 +3033,7 @@ static int relocate_file_extent_cluster(struct inode *inode, set_page_dirty(page); unlock_extent(&BTRFS_I(inode)->io_tree, - page_start, page_end, GFP_NOFS); + page_start, page_end); unlock_page(page); page_cache_release(page); @@ -3154,7 +3180,8 @@ static int add_tree_block(struct reloc_control *rc, block->key_ready = 0; rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, block->bytenr); return 0; } @@ -3426,7 +3453,9 @@ static int find_data_references(struct reloc_control *rc, block->key_ready = 1; rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, + block->bytenr); } if (counted) added = 1; @@ -4073,10 +4102,11 @@ out: static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) { struct btrfs_trans_handle *trans; - int ret; + int ret, err; trans = btrfs_start_transaction(root->fs_info->tree_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); memset(&root->root_item.drop_progress, 0, sizeof(root->root_item.drop_progress)); @@ -4084,11 +4114,11 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) btrfs_set_root_refs(&root->root_item, 0); ret = btrfs_update_root(trans, root->fs_info->tree_root, &root->root_key, &root->root_item); - BUG_ON(ret); - ret = btrfs_end_transaction(trans, root->fs_info->tree_root); - BUG_ON(ret); - return 0; + err = btrfs_end_transaction(trans, root->fs_info->tree_root); + if (err) + return err; + return ret; } /* @@ -4156,7 +4186,11 @@ int btrfs_recover_relocation(struct btrfs_root *root) err = ret; goto out; } - mark_garbage_root(reloc_root); + ret = mark_garbage_root(reloc_root); + if (ret < 0) { + err = ret; + goto out; + } } } @@ -4202,13 +4236,19 @@ int btrfs_recover_relocation(struct btrfs_root *root) fs_root = read_fs_root(root->fs_info, reloc_root->root_key.offset); - BUG_ON(IS_ERR(fs_root)); + if (IS_ERR(fs_root)) { + err = PTR_ERR(fs_root); + goto out_free; + } - __add_reloc_root(reloc_root); + err = __add_reloc_root(reloc_root); + BUG_ON(err < 0); /* -ENOMEM or logic error */ fs_root->reloc_root = reloc_root; } - btrfs_commit_transaction(trans, rc->extent_root); + err = btrfs_commit_transaction(trans, rc->extent_root); + if (err) + goto out_free; merge_reloc_roots(rc); @@ -4218,7 +4258,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (IS_ERR(trans)) err = PTR_ERR(trans); else - btrfs_commit_transaction(trans, rc->extent_root); + err = btrfs_commit_transaction(trans, rc->extent_root); out_free: kfree(rc); out: @@ -4267,6 +4307,8 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, disk_bytenr + len - 1, &list, 0); + if (ret) + goto out; while (!list_empty(&list)) { sums = list_entry(list.next, struct btrfs_ordered_sum, list); @@ -4284,6 +4326,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) btrfs_add_ordered_sum(inode, ordered, sums); } +out: btrfs_put_ordered_extent(ordered); return ret; } @@ -4380,7 +4423,7 @@ void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, * called after snapshot is created. migrate block reservation * and create reloc root for the newly created snapshot */ -void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, +int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending) { struct btrfs_root *root = pending->root; @@ -4390,7 +4433,7 @@ void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, int ret; if (!root->reloc_root) - return; + return 0; rc = root->fs_info->reloc_ctl; rc->merging_rsv_size += rc->nodes_relocated; @@ -4399,18 +4442,21 @@ void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_block_rsv_migrate(&pending->block_rsv, rc->block_rsv, rc->nodes_relocated); - BUG_ON(ret); + if (ret) + return ret; } new_root = pending->snap; reloc_root = create_reloc_root(trans, root->reloc_root, new_root->root_key.objectid); + if (IS_ERR(reloc_root)) + return PTR_ERR(reloc_root); - __add_reloc_root(reloc_root); + ret = __add_reloc_root(reloc_root); + BUG_ON(ret < 0); new_root->reloc_root = reloc_root; - if (rc->create_reloc_tree) { + if (rc->create_reloc_tree) ret = clone_backref_node(trans, rc, root, reloc_root); - BUG_ON(ret); - } + return ret; } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index f4099904565..24fb8ce4e07 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -93,10 +93,14 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root unsigned long ptr; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, root, key, path, 0, 1); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); goto out; + } if (ret != 0) { btrfs_print_leaf(root, path->nodes[0]); @@ -116,13 +120,10 @@ out: return ret; } -int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *key, struct btrfs_root_item - *item) +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_root_item *item) { - int ret; - ret = btrfs_insert_item(trans, root, key, item, sizeof(*item)); - return ret; + return btrfs_insert_item(trans, root, key, item, sizeof(*item)); } /* @@ -384,6 +385,8 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root, * * For a back ref the root_id is the id of the subvol or snapshot and * ref_id is the id of the tree referencing it. + * + * Will return 0, -ENOMEM, or anything from the CoW path */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root, @@ -407,7 +410,11 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, sizeof(*ref) + name_len); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, tree_root, ret); + btrfs_free_path(path); + return ret; + } leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 390e7102b0f..90acc82046c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -36,37 +36,30 @@ * Future enhancements: * - In case an unrepairable extent is encountered, track which files are * affected and report them - * - In case of a read error on files with nodatasum, map the file and read - * the extent to trigger a writeback of the good copy * - track and record media errors, throw out bad devices * - add a mode to also read unallocated space */ -struct scrub_bio; -struct scrub_page; +struct scrub_block; struct scrub_dev; -static void scrub_bio_end_io(struct bio *bio, int err); -static void scrub_checksum(struct btrfs_work *work); -static int scrub_checksum_data(struct scrub_dev *sdev, - struct scrub_page *spag, void *buffer); -static int scrub_checksum_tree_block(struct scrub_dev *sdev, - struct scrub_page *spag, u64 logical, - void *buffer); -static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer); -static int scrub_fixup_check(struct scrub_bio *sbio, int ix); -static void scrub_fixup_end_io(struct bio *bio, int err); -static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, - struct page *page); -static void scrub_fixup(struct scrub_bio *sbio, int ix); #define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ #define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ +#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ struct scrub_page { + struct scrub_block *sblock; + struct page *page; + struct block_device *bdev; u64 flags; /* extent flags */ u64 generation; - int mirror_num; - int have_csum; + u64 logical; + u64 physical; + struct { + unsigned int mirror_num:8; + unsigned int have_csum:1; + unsigned int io_error:1; + }; u8 csum[BTRFS_CSUM_SIZE]; }; @@ -77,12 +70,25 @@ struct scrub_bio { int err; u64 logical; u64 physical; - struct scrub_page spag[SCRUB_PAGES_PER_BIO]; - u64 count; + struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; + int page_count; int next_free; struct btrfs_work work; }; +struct scrub_block { + struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; + int page_count; + atomic_t outstanding_pages; + atomic_t ref_count; /* free mem on transition to zero */ + struct scrub_dev *sdev; + struct { + unsigned int header_error:1; + unsigned int checksum_error:1; + unsigned int no_io_error_seen:1; + }; +}; + struct scrub_dev { struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; struct btrfs_device *dev; @@ -96,6 +102,10 @@ struct scrub_dev { struct list_head csum_list; atomic_t cancel_req; int readonly; + int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ + u32 sectorsize; + u32 nodesize; + u32 leafsize; /* * statistics */ @@ -124,6 +134,43 @@ struct scrub_warning { int scratch_bufsize; }; + +static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); +static int scrub_setup_recheck_block(struct scrub_dev *sdev, + struct btrfs_mapping_tree *map_tree, + u64 length, u64 logical, + struct scrub_block *sblock); +static int scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size); +static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, + int is_metadata, int have_csum, + const u8 *csum, u64 generation, + u16 csum_size); +static void scrub_complete_bio_end_io(struct bio *bio, int err); +static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int force_write); +static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int page_num, int force_write); +static int scrub_checksum_data(struct scrub_block *sblock); +static int scrub_checksum_tree_block(struct scrub_block *sblock); +static int scrub_checksum_super(struct scrub_block *sblock); +static void scrub_block_get(struct scrub_block *sblock); +static void scrub_block_put(struct scrub_block *sblock); +static int scrub_add_page_to_bio(struct scrub_dev *sdev, + struct scrub_page *spage); +static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, + u64 physical, u64 flags, u64 gen, int mirror_num, + u8 *csum, int force); +static void scrub_bio_end_io(struct bio *bio, int err); +static void scrub_bio_end_io_worker(struct btrfs_work *work); +static void scrub_block_complete(struct scrub_block *sblock); + + static void scrub_free_csums(struct scrub_dev *sdev) { while (!list_empty(&sdev->csum_list)) { @@ -135,23 +182,6 @@ static void scrub_free_csums(struct scrub_dev *sdev) } } -static void scrub_free_bio(struct bio *bio) -{ - int i; - struct page *last_page = NULL; - - if (!bio) - return; - - for (i = 0; i < bio->bi_vcnt; ++i) { - if (bio->bi_io_vec[i].bv_page == last_page) - continue; - last_page = bio->bi_io_vec[i].bv_page; - __free_page(last_page); - } - bio_put(bio); -} - static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) { int i; @@ -159,13 +189,23 @@ static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) if (!sdev) return; + /* this can happen when scrub is cancelled */ + if (sdev->curr != -1) { + struct scrub_bio *sbio = sdev->bios[sdev->curr]; + + for (i = 0; i < sbio->page_count; i++) { + BUG_ON(!sbio->pagev[i]); + BUG_ON(!sbio->pagev[i]->page); + scrub_block_put(sbio->pagev[i]->sblock); + } + bio_put(sbio->bio); + } + for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { struct scrub_bio *sbio = sdev->bios[i]; if (!sbio) break; - - scrub_free_bio(sbio->bio); kfree(sbio); } @@ -179,11 +219,16 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) struct scrub_dev *sdev; int i; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + int pages_per_bio; + pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, + bio_get_nr_vecs(dev->bdev)); sdev = kzalloc(sizeof(*sdev), GFP_NOFS); if (!sdev) goto nomem; sdev->dev = dev; + sdev->pages_per_bio = pages_per_bio; + sdev->curr = -1; for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { struct scrub_bio *sbio; @@ -194,8 +239,8 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) sbio->index = i; sbio->sdev = sdev; - sbio->count = 0; - sbio->work.func = scrub_checksum; + sbio->page_count = 0; + sbio->work.func = scrub_bio_end_io_worker; if (i != SCRUB_BIOS_PER_DEV-1) sdev->bios[i]->next_free = i + 1; @@ -203,7 +248,9 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) sdev->bios[i]->next_free = -1; } sdev->first_free = 0; - sdev->curr = -1; + sdev->nodesize = dev->dev_root->nodesize; + sdev->leafsize = dev->dev_root->leafsize; + sdev->sectorsize = dev->dev_root->sectorsize; atomic_set(&sdev->in_flight, 0); atomic_set(&sdev->fixup_cnt, 0); atomic_set(&sdev->cancel_req, 0); @@ -294,10 +341,9 @@ err: return 0; } -static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, - int ix) +static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) { - struct btrfs_device *dev = sbio->sdev->dev; + struct btrfs_device *dev = sblock->sdev->dev; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; struct btrfs_path *path; struct btrfs_key found_key; @@ -316,8 +362,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); - swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9; - swarn.logical = sbio->logical + ix * PAGE_SIZE; + BUG_ON(sblock->page_count < 1); + swarn.sector = (sblock->pagev[0].physical) >> 9; + swarn.logical = sblock->pagev[0].logical; swarn.errstr = errstr; swarn.dev = dev; swarn.msg_bufsize = bufsize; @@ -342,7 +389,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, do { ret = tree_backref_for_extent(&ptr, eb, ei, item_size, &ref_root, &ref_level); - printk(KERN_WARNING "%s at logical %llu on dev %s, " + printk(KERN_WARNING + "btrfs: %s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " "%llu\n", errstr, swarn.logical, dev->name, (unsigned long long)swarn.sector, @@ -352,8 +400,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, } while (ret != 1); } else { swarn.path = path; - iterate_extent_inodes(fs_info, path, found_key.objectid, - extent_item_pos, + iterate_extent_inodes(fs_info, found_key.objectid, + extent_item_pos, 1, scrub_print_warning_inode, &swarn); } @@ -531,9 +579,9 @@ out: spin_lock(&sdev->stat_lock); ++sdev->stat.uncorrectable_errors; spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR "btrfs: unable to fixup " - "(nodatasum) error at logical %llu\n", - fixup->logical); + printk_ratelimited(KERN_ERR + "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", + (unsigned long long)fixup->logical, sdev->dev->name); } btrfs_free_path(path); @@ -550,91 +598,168 @@ out: } /* - * scrub_recheck_error gets called when either verification of the page - * failed or the bio failed to read, e.g. with EIO. In the latter case, - * recheck_error gets called for every page in the bio, even though only - * one may be bad + * scrub_handle_errored_block gets called when either verification of the + * pages failed or the bio failed to read, e.g. with EIO. In the latter + * case, this function handles all pages in the bio, even though only one + * may be bad. + * The goal of this function is to repair the errored block by using the + * contents of one of the mirrors. */ -static int scrub_recheck_error(struct scrub_bio *sbio, int ix) +static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) { - struct scrub_dev *sdev = sbio->sdev; - u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + struct scrub_dev *sdev = sblock_to_check->sdev; + struct btrfs_fs_info *fs_info; + u64 length; + u64 logical; + u64 generation; + unsigned int failed_mirror_index; + unsigned int is_metadata; + unsigned int have_csum; + u8 *csum; + struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */ + struct scrub_block *sblock_bad; + int ret; + int mirror_index; + int page_num; + int success; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); + DEFAULT_RATELIMIT_BURST); + + BUG_ON(sblock_to_check->page_count < 1); + fs_info = sdev->dev->dev_root->fs_info; + length = sblock_to_check->page_count * PAGE_SIZE; + logical = sblock_to_check->pagev[0].logical; + generation = sblock_to_check->pagev[0].generation; + BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); + failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; + is_metadata = !(sblock_to_check->pagev[0].flags & + BTRFS_EXTENT_FLAG_DATA); + have_csum = sblock_to_check->pagev[0].have_csum; + csum = sblock_to_check->pagev[0].csum; - if (sbio->err) { - if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, - sbio->bio->bi_io_vec[ix].bv_page) == 0) { - if (scrub_fixup_check(sbio, ix) == 0) - return 0; - } - if (__ratelimit(&_rs)) - scrub_print_warning("i/o error", sbio, ix); - } else { - if (__ratelimit(&_rs)) - scrub_print_warning("checksum error", sbio, ix); + /* + * read all mirrors one after the other. This includes to + * re-read the extent or metadata block that failed (that was + * the cause that this fixup code is called) another time, + * page by page this time in order to know which pages + * caused I/O errors and which ones are good (for all mirrors). + * It is the goal to handle the situation when more than one + * mirror contains I/O errors, but the errors do not + * overlap, i.e. the data can be repaired by selecting the + * pages from those mirrors without I/O error on the + * particular pages. One example (with blocks >= 2 * PAGE_SIZE) + * would be that mirror #1 has an I/O error on the first page, + * the second page is good, and mirror #2 has an I/O error on + * the second page, but the first page is good. + * Then the first page of the first mirror can be repaired by + * taking the first page of the second mirror, and the + * second page of the second mirror can be repaired by + * copying the contents of the 2nd page of the 1st mirror. + * One more note: if the pages of one mirror contain I/O + * errors, the checksum cannot be verified. In order to get + * the best data for repairing, the first attempt is to find + * a mirror without I/O errors and with a validated checksum. + * Only if this is not possible, the pages are picked from + * mirrors with I/O errors without considering the checksum. + * If the latter is the case, at the end, the checksum of the + * repaired area is verified in order to correctly maintain + * the statistics. + */ + + sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS * + sizeof(*sblocks_for_recheck), + GFP_NOFS); + if (!sblocks_for_recheck) { + spin_lock(&sdev->stat_lock); + sdev->stat.malloc_errors++; + sdev->stat.read_errors++; + sdev->stat.uncorrectable_errors++; + spin_unlock(&sdev->stat_lock); + goto out; } - spin_lock(&sdev->stat_lock); - ++sdev->stat.read_errors; - spin_unlock(&sdev->stat_lock); + /* setup the context, map the logical blocks and alloc the pages */ + ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, + logical, sblocks_for_recheck); + if (ret) { + spin_lock(&sdev->stat_lock); + sdev->stat.read_errors++; + sdev->stat.uncorrectable_errors++; + spin_unlock(&sdev->stat_lock); + goto out; + } + BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); + sblock_bad = sblocks_for_recheck + failed_mirror_index; - scrub_fixup(sbio, ix); - return 1; -} + /* build and submit the bios for the failed mirror, check checksums */ + ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, + csum, generation, sdev->csum_size); + if (ret) { + spin_lock(&sdev->stat_lock); + sdev->stat.read_errors++; + sdev->stat.uncorrectable_errors++; + spin_unlock(&sdev->stat_lock); + goto out; + } -static int scrub_fixup_check(struct scrub_bio *sbio, int ix) -{ - int ret = 1; - struct page *page; - void *buffer; - u64 flags = sbio->spag[ix].flags; + if (!sblock_bad->header_error && !sblock_bad->checksum_error && + sblock_bad->no_io_error_seen) { + /* + * the error disappeared after reading page by page, or + * the area was part of a huge bio and other parts of the + * bio caused I/O errors, or the block layer merged several + * read requests into one and the error is caused by a + * different bio (usually one of the two latter cases is + * the cause) + */ + spin_lock(&sdev->stat_lock); + sdev->stat.unverified_errors++; + spin_unlock(&sdev->stat_lock); - page = sbio->bio->bi_io_vec[ix].bv_page; - buffer = kmap_atomic(page); - if (flags & BTRFS_EXTENT_FLAG_DATA) { - ret = scrub_checksum_data(sbio->sdev, - sbio->spag + ix, buffer); - } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ret = scrub_checksum_tree_block(sbio->sdev, - sbio->spag + ix, - sbio->logical + ix * PAGE_SIZE, - buffer); - } else { - WARN_ON(1); + goto out; } - kunmap_atomic(buffer); - return ret; -} + if (!sblock_bad->no_io_error_seen) { + spin_lock(&sdev->stat_lock); + sdev->stat.read_errors++; + spin_unlock(&sdev->stat_lock); + if (__ratelimit(&_rs)) + scrub_print_warning("i/o error", sblock_to_check); + } else if (sblock_bad->checksum_error) { + spin_lock(&sdev->stat_lock); + sdev->stat.csum_errors++; + spin_unlock(&sdev->stat_lock); + if (__ratelimit(&_rs)) + scrub_print_warning("checksum error", sblock_to_check); + } else if (sblock_bad->header_error) { + spin_lock(&sdev->stat_lock); + sdev->stat.verify_errors++; + spin_unlock(&sdev->stat_lock); + if (__ratelimit(&_rs)) + scrub_print_warning("checksum/header error", + sblock_to_check); + } -static void scrub_fixup_end_io(struct bio *bio, int err) -{ - complete((struct completion *)bio->bi_private); -} + if (sdev->readonly) + goto did_not_correct_error; -static void scrub_fixup(struct scrub_bio *sbio, int ix) -{ - struct scrub_dev *sdev = sbio->sdev; - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct btrfs_bio *bbio = NULL; - struct scrub_fixup_nodatasum *fixup; - u64 logical = sbio->logical + ix * PAGE_SIZE; - u64 length; - int i; - int ret; - DECLARE_COMPLETION_ONSTACK(complete); - - if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && - (sbio->spag[ix].have_csum == 0)) { - fixup = kzalloc(sizeof(*fixup), GFP_NOFS); - if (!fixup) - goto uncorrectable; - fixup->sdev = sdev; - fixup->logical = logical; - fixup->root = fs_info->extent_root; - fixup->mirror_num = sbio->spag[ix].mirror_num; + if (!is_metadata && !have_csum) { + struct scrub_fixup_nodatasum *fixup_nodatasum; + + /* + * !is_metadata and !have_csum, this means that the data + * might not be COW'ed, that it might be modified + * concurrently. The general strategy to work on the + * commit root does not help in the case when COW is not + * used. + */ + fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); + if (!fixup_nodatasum) + goto did_not_correct_error; + fixup_nodatasum->sdev = sdev; + fixup_nodatasum->logical = logical; + fixup_nodatasum->root = fs_info->extent_root; + fixup_nodatasum->mirror_num = failed_mirror_index + 1; /* * increment scrubs_running to prevent cancel requests from * completing as long as a fixup worker is running. we must also @@ -649,235 +774,528 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) atomic_inc(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); atomic_inc(&sdev->fixup_cnt); - fixup->work.func = scrub_fixup_nodatasum; - btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work); - return; + fixup_nodatasum->work.func = scrub_fixup_nodatasum; + btrfs_queue_worker(&fs_info->scrub_workers, + &fixup_nodatasum->work); + goto out; } - length = PAGE_SIZE; - ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, - &bbio, 0); - if (ret || !bbio || length < PAGE_SIZE) { - printk(KERN_ERR - "scrub_fixup: btrfs_map_block failed us for %llu\n", - (unsigned long long)logical); - WARN_ON(1); - kfree(bbio); - return; + /* + * now build and submit the bios for the other mirrors, check + * checksums + */ + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + if (mirror_index == failed_mirror_index) + continue; + + /* build and submit the bios, check checksums */ + ret = scrub_recheck_block(fs_info, + sblocks_for_recheck + mirror_index, + is_metadata, have_csum, csum, + generation, sdev->csum_size); + if (ret) + goto did_not_correct_error; } - if (bbio->num_stripes == 1) - /* there aren't any replicas */ - goto uncorrectable; + /* + * first try to pick the mirror which is completely without I/O + * errors and also does not have a checksum error. + * If one is found, and if a checksum is present, the full block + * that is known to contain an error is rewritten. Afterwards + * the block is known to be corrected. + * If a mirror is found which is completely correct, and no + * checksum is present, only those pages are rewritten that had + * an I/O error in the block to be repaired, since it cannot be + * determined, which copy of the other pages is better (and it + * could happen otherwise that a correct page would be + * overwritten by a bad one). + */ + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + struct scrub_block *sblock_other = sblocks_for_recheck + + mirror_index; + + if (!sblock_other->header_error && + !sblock_other->checksum_error && + sblock_other->no_io_error_seen) { + int force_write = is_metadata || have_csum; + + ret = scrub_repair_block_from_good_copy(sblock_bad, + sblock_other, + force_write); + if (0 == ret) + goto corrected_error; + } + } /* - * first find a good copy + * in case of I/O errors in the area that is supposed to be + * repaired, continue by picking good copies of those pages. + * Select the good pages from mirrors to rewrite bad pages from + * the area to fix. Afterwards verify the checksum of the block + * that is supposed to be repaired. This verification step is + * only done for the purpose of statistic counting and for the + * final scrub report, whether errors remain. + * A perfect algorithm could make use of the checksum and try + * all possible combinations of pages from the different mirrors + * until the checksum verification succeeds. For example, when + * the 2nd page of mirror #1 faces I/O errors, and the 2nd page + * of mirror #2 is readable but the final checksum test fails, + * then the 2nd page of mirror #3 could be tried, whether now + * the final checksum succeedes. But this would be a rare + * exception and is therefore not implemented. At least it is + * avoided that the good copy is overwritten. + * A more useful improvement would be to pick the sectors + * without I/O error based on sector sizes (512 bytes on legacy + * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one + * mirror could be repaired by taking 512 byte of a different + * mirror, even if other 512 byte sectors in the same PAGE_SIZE + * area are unreadable. */ - for (i = 0; i < bbio->num_stripes; ++i) { - if (i + 1 == sbio->spag[ix].mirror_num) - continue; - if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, - bbio->stripes[i].physical >> 9, - sbio->bio->bi_io_vec[ix].bv_page)) { - /* I/O-error, this is not a good copy */ + /* can only fix I/O errors from here on */ + if (sblock_bad->no_io_error_seen) + goto did_not_correct_error; + + success = 1; + for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + struct scrub_page *page_bad = sblock_bad->pagev + page_num; + + if (!page_bad->io_error) continue; + + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + struct scrub_block *sblock_other = sblocks_for_recheck + + mirror_index; + struct scrub_page *page_other = sblock_other->pagev + + page_num; + + if (!page_other->io_error) { + ret = scrub_repair_page_from_good_copy( + sblock_bad, sblock_other, page_num, 0); + if (0 == ret) { + page_bad->io_error = 0; + break; /* succeeded for this page */ + } + } } - if (scrub_fixup_check(sbio, ix) == 0) - break; + if (page_bad->io_error) { + /* did not find a mirror to copy the page from */ + success = 0; + } } - if (i == bbio->num_stripes) - goto uncorrectable; - if (!sdev->readonly) { - /* - * bi_io_vec[ix].bv_page now contains good data, write it back - */ - if (scrub_fixup_io(WRITE, sdev->dev->bdev, - (sbio->physical + ix * PAGE_SIZE) >> 9, - sbio->bio->bi_io_vec[ix].bv_page)) { - /* I/O-error, writeback failed, give up */ - goto uncorrectable; + if (success) { + if (is_metadata || have_csum) { + /* + * need to verify the checksum now that all + * sectors on disk are repaired (the write + * request for data to be repaired is on its way). + * Just be lazy and use scrub_recheck_block() + * which re-reads the data before the checksum + * is verified, but most likely the data comes out + * of the page cache. + */ + ret = scrub_recheck_block(fs_info, sblock_bad, + is_metadata, have_csum, csum, + generation, sdev->csum_size); + if (!ret && !sblock_bad->header_error && + !sblock_bad->checksum_error && + sblock_bad->no_io_error_seen) + goto corrected_error; + else + goto did_not_correct_error; + } else { +corrected_error: + spin_lock(&sdev->stat_lock); + sdev->stat.corrected_errors++; + spin_unlock(&sdev->stat_lock); + printk_ratelimited(KERN_ERR + "btrfs: fixed up error at logical %llu on dev %s\n", + (unsigned long long)logical, sdev->dev->name); } + } else { +did_not_correct_error: + spin_lock(&sdev->stat_lock); + sdev->stat.uncorrectable_errors++; + spin_unlock(&sdev->stat_lock); + printk_ratelimited(KERN_ERR + "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", + (unsigned long long)logical, sdev->dev->name); } - kfree(bbio); - spin_lock(&sdev->stat_lock); - ++sdev->stat.corrected_errors; - spin_unlock(&sdev->stat_lock); +out: + if (sblocks_for_recheck) { + for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; + mirror_index++) { + struct scrub_block *sblock = sblocks_for_recheck + + mirror_index; + int page_index; + + for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; + page_index++) + if (sblock->pagev[page_index].page) + __free_page( + sblock->pagev[page_index].page); + } + kfree(sblocks_for_recheck); + } - printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n", - (unsigned long long)logical); - return; + return 0; +} -uncorrectable: - kfree(bbio); - spin_lock(&sdev->stat_lock); - ++sdev->stat.uncorrectable_errors; - spin_unlock(&sdev->stat_lock); +static int scrub_setup_recheck_block(struct scrub_dev *sdev, + struct btrfs_mapping_tree *map_tree, + u64 length, u64 logical, + struct scrub_block *sblocks_for_recheck) +{ + int page_index; + int mirror_index; + int ret; + + /* + * note: the three members sdev, ref_count and outstanding_pages + * are not used (and not set) in the blocks that are used for + * the recheck procedure + */ + + page_index = 0; + while (length > 0) { + u64 sublen = min_t(u64, length, PAGE_SIZE); + u64 mapped_length = sublen; + struct btrfs_bio *bbio = NULL; + + /* + * with a length of PAGE_SIZE, each returned stripe + * represents one mirror + */ + ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, + &bbio, 0); + if (ret || !bbio || mapped_length < sublen) { + kfree(bbio); + return -EIO; + } - printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at " - "logical %llu\n", (unsigned long long)logical); + BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); + for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; + mirror_index++) { + struct scrub_block *sblock; + struct scrub_page *page; + + if (mirror_index >= BTRFS_MAX_MIRRORS) + continue; + + sblock = sblocks_for_recheck + mirror_index; + page = sblock->pagev + page_index; + page->logical = logical; + page->physical = bbio->stripes[mirror_index].physical; + page->bdev = bbio->stripes[mirror_index].dev->bdev; + page->mirror_num = mirror_index + 1; + page->page = alloc_page(GFP_NOFS); + if (!page->page) { + spin_lock(&sdev->stat_lock); + sdev->stat.malloc_errors++; + spin_unlock(&sdev->stat_lock); + return -ENOMEM; + } + sblock->page_count++; + } + kfree(bbio); + length -= sublen; + logical += sublen; + page_index++; + } + + return 0; } -static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, - struct page *page) +/* + * this function will check the on disk data for checksum errors, header + * errors and read I/O errors. If any I/O errors happen, the exact pages + * which are errored are marked as being bad. The goal is to enable scrub + * to take those pages that are not errored from all the mirrors so that + * the pages that are errored in the just handled mirror can be repaired. + */ +static int scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size) { - struct bio *bio = NULL; - int ret; - DECLARE_COMPLETION_ONSTACK(complete); + int page_num; + + sblock->no_io_error_seen = 1; + sblock->header_error = 0; + sblock->checksum_error = 0; + + for (page_num = 0; page_num < sblock->page_count; page_num++) { + struct bio *bio; + int ret; + struct scrub_page *page = sblock->pagev + page_num; + DECLARE_COMPLETION_ONSTACK(complete); + + BUG_ON(!page->page); + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_bdev = page->bdev; + bio->bi_sector = page->physical >> 9; + bio->bi_end_io = scrub_complete_bio_end_io; + bio->bi_private = &complete; + + ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); + if (PAGE_SIZE != ret) { + bio_put(bio); + return -EIO; + } + btrfsic_submit_bio(READ, bio); - bio = bio_alloc(GFP_NOFS, 1); - bio->bi_bdev = bdev; - bio->bi_sector = sector; - bio_add_page(bio, page, PAGE_SIZE, 0); - bio->bi_end_io = scrub_fixup_end_io; - bio->bi_private = &complete; - btrfsic_submit_bio(rw, bio); + /* this will also unplug the queue */ + wait_for_completion(&complete); - /* this will also unplug the queue */ - wait_for_completion(&complete); + page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags); + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + sblock->no_io_error_seen = 0; + bio_put(bio); + } - ret = !test_bit(BIO_UPTODATE, &bio->bi_flags); - bio_put(bio); - return ret; + if (sblock->no_io_error_seen) + scrub_recheck_block_checksum(fs_info, sblock, is_metadata, + have_csum, csum, generation, + csum_size); + + return 0; } -static void scrub_bio_end_io(struct bio *bio, int err) +static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, + int is_metadata, int have_csum, + const u8 *csum, u64 generation, + u16 csum_size) { - struct scrub_bio *sbio = bio->bi_private; - struct scrub_dev *sdev = sbio->sdev; - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + int page_num; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + struct btrfs_root *root = fs_info->extent_root; + void *mapped_buffer; + + BUG_ON(!sblock->pagev[0].page); + if (is_metadata) { + struct btrfs_header *h; + + mapped_buffer = kmap_atomic(sblock->pagev[0].page); + h = (struct btrfs_header *)mapped_buffer; + + if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || + generation != le64_to_cpu(h->generation) || + memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || + memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, + BTRFS_UUID_SIZE)) + sblock->header_error = 1; + csum = h->csum; + } else { + if (!have_csum) + return; - sbio->err = err; - sbio->bio = bio; + mapped_buffer = kmap_atomic(sblock->pagev[0].page); + } - btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); + for (page_num = 0;;) { + if (page_num == 0 && is_metadata) + crc = btrfs_csum_data(root, + ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE, + crc, PAGE_SIZE - BTRFS_CSUM_SIZE); + else + crc = btrfs_csum_data(root, mapped_buffer, crc, + PAGE_SIZE); + + kunmap_atomic(mapped_buffer); + page_num++; + if (page_num >= sblock->page_count) + break; + BUG_ON(!sblock->pagev[page_num].page); + + mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); + } + + btrfs_csum_final(crc, calculated_csum); + if (memcmp(calculated_csum, csum, csum_size)) + sblock->checksum_error = 1; } -static void scrub_checksum(struct btrfs_work *work) +static void scrub_complete_bio_end_io(struct bio *bio, int err) { - struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); - struct scrub_dev *sdev = sbio->sdev; - struct page *page; - void *buffer; - int i; - u64 flags; - u64 logical; - int ret; + complete((struct completion *)bio->bi_private); +} - if (sbio->err) { - ret = 0; - for (i = 0; i < sbio->count; ++i) - ret |= scrub_recheck_error(sbio, i); - if (!ret) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.unverified_errors; - spin_unlock(&sdev->stat_lock); - } +static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int force_write) +{ + int page_num; + int ret = 0; - sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); - sbio->bio->bi_flags |= 1 << BIO_UPTODATE; - sbio->bio->bi_phys_segments = 0; - sbio->bio->bi_idx = 0; + for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + int ret_sub; - for (i = 0; i < sbio->count; i++) { - struct bio_vec *bi; - bi = &sbio->bio->bi_io_vec[i]; - bi->bv_offset = 0; - bi->bv_len = PAGE_SIZE; - } - goto out; + ret_sub = scrub_repair_page_from_good_copy(sblock_bad, + sblock_good, + page_num, + force_write); + if (ret_sub) + ret = ret_sub; } - for (i = 0; i < sbio->count; ++i) { - page = sbio->bio->bi_io_vec[i].bv_page; - buffer = kmap_atomic(page); - flags = sbio->spag[i].flags; - logical = sbio->logical + i * PAGE_SIZE; - ret = 0; - if (flags & BTRFS_EXTENT_FLAG_DATA) { - ret = scrub_checksum_data(sdev, sbio->spag + i, buffer); - } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ret = scrub_checksum_tree_block(sdev, sbio->spag + i, - logical, buffer); - } else if (flags & BTRFS_EXTENT_FLAG_SUPER) { - BUG_ON(i); - (void)scrub_checksum_super(sbio, buffer); - } else { - WARN_ON(1); - } - kunmap_atomic(buffer); - if (ret) { - ret = scrub_recheck_error(sbio, i); - if (!ret) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.unverified_errors; - spin_unlock(&sdev->stat_lock); - } + + return ret; +} + +static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int page_num, int force_write) +{ + struct scrub_page *page_bad = sblock_bad->pagev + page_num; + struct scrub_page *page_good = sblock_good->pagev + page_num; + + BUG_ON(sblock_bad->pagev[page_num].page == NULL); + BUG_ON(sblock_good->pagev[page_num].page == NULL); + if (force_write || sblock_bad->header_error || + sblock_bad->checksum_error || page_bad->io_error) { + struct bio *bio; + int ret; + DECLARE_COMPLETION_ONSTACK(complete); + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_bdev = page_bad->bdev; + bio->bi_sector = page_bad->physical >> 9; + bio->bi_end_io = scrub_complete_bio_end_io; + bio->bi_private = &complete; + + ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); + if (PAGE_SIZE != ret) { + bio_put(bio); + return -EIO; } + btrfsic_submit_bio(WRITE, bio); + + /* this will also unplug the queue */ + wait_for_completion(&complete); + bio_put(bio); } -out: - scrub_free_bio(sbio->bio); - sbio->bio = NULL; - spin_lock(&sdev->list_lock); - sbio->next_free = sdev->first_free; - sdev->first_free = sbio->index; - spin_unlock(&sdev->list_lock); - atomic_dec(&sdev->in_flight); - wake_up(&sdev->list_wait); + return 0; +} + +static void scrub_checksum(struct scrub_block *sblock) +{ + u64 flags; + int ret; + + BUG_ON(sblock->page_count < 1); + flags = sblock->pagev[0].flags; + ret = 0; + if (flags & BTRFS_EXTENT_FLAG_DATA) + ret = scrub_checksum_data(sblock); + else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + ret = scrub_checksum_tree_block(sblock); + else if (flags & BTRFS_EXTENT_FLAG_SUPER) + (void)scrub_checksum_super(sblock); + else + WARN_ON(1); + if (ret) + scrub_handle_errored_block(sblock); } -static int scrub_checksum_data(struct scrub_dev *sdev, - struct scrub_page *spag, void *buffer) +static int scrub_checksum_data(struct scrub_block *sblock) { + struct scrub_dev *sdev = sblock->sdev; u8 csum[BTRFS_CSUM_SIZE]; + u8 *on_disk_csum; + struct page *page; + void *buffer; u32 crc = ~(u32)0; int fail = 0; struct btrfs_root *root = sdev->dev->dev_root; + u64 len; + int index; - if (!spag->have_csum) + BUG_ON(sblock->page_count < 1); + if (!sblock->pagev[0].have_csum) return 0; - crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE); + on_disk_csum = sblock->pagev[0].csum; + page = sblock->pagev[0].page; + buffer = kmap_atomic(page); + + len = sdev->sectorsize; + index = 0; + for (;;) { + u64 l = min_t(u64, len, PAGE_SIZE); + + crc = btrfs_csum_data(root, buffer, crc, l); + kunmap_atomic(buffer); + len -= l; + if (len == 0) + break; + index++; + BUG_ON(index >= sblock->page_count); + BUG_ON(!sblock->pagev[index].page); + page = sblock->pagev[index].page; + buffer = kmap_atomic(page); + } + btrfs_csum_final(crc, csum); - if (memcmp(csum, spag->csum, sdev->csum_size)) + if (memcmp(csum, on_disk_csum, sdev->csum_size)) fail = 1; - spin_lock(&sdev->stat_lock); - ++sdev->stat.data_extents_scrubbed; - sdev->stat.data_bytes_scrubbed += PAGE_SIZE; - if (fail) + if (fail) { + spin_lock(&sdev->stat_lock); ++sdev->stat.csum_errors; - spin_unlock(&sdev->stat_lock); + spin_unlock(&sdev->stat_lock); + } return fail; } -static int scrub_checksum_tree_block(struct scrub_dev *sdev, - struct scrub_page *spag, u64 logical, - void *buffer) +static int scrub_checksum_tree_block(struct scrub_block *sblock) { + struct scrub_dev *sdev = sblock->sdev; struct btrfs_header *h; struct btrfs_root *root = sdev->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; - u8 csum[BTRFS_CSUM_SIZE]; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + u8 on_disk_csum[BTRFS_CSUM_SIZE]; + struct page *page; + void *mapped_buffer; + u64 mapped_size; + void *p; u32 crc = ~(u32)0; int fail = 0; int crc_fail = 0; + u64 len; + int index; + + BUG_ON(sblock->page_count < 1); + page = sblock->pagev[0].page; + mapped_buffer = kmap_atomic(page); + h = (struct btrfs_header *)mapped_buffer; + memcpy(on_disk_csum, h->csum, sdev->csum_size); /* * we don't use the getter functions here, as we * a) don't have an extent buffer and * b) the page is already kmapped */ - h = (struct btrfs_header *)buffer; - if (logical != le64_to_cpu(h->bytenr)) + if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) ++fail; - if (spag->generation != le64_to_cpu(h->generation)) + if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) ++fail; if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -887,51 +1305,99 @@ static int scrub_checksum_tree_block(struct scrub_dev *sdev, BTRFS_UUID_SIZE)) ++fail; - crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, - PAGE_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, csum); - if (memcmp(csum, h->csum, sdev->csum_size)) + BUG_ON(sdev->nodesize != sdev->leafsize); + len = sdev->nodesize - BTRFS_CSUM_SIZE; + mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; + p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; + index = 0; + for (;;) { + u64 l = min_t(u64, len, mapped_size); + + crc = btrfs_csum_data(root, p, crc, l); + kunmap_atomic(mapped_buffer); + len -= l; + if (len == 0) + break; + index++; + BUG_ON(index >= sblock->page_count); + BUG_ON(!sblock->pagev[index].page); + page = sblock->pagev[index].page; + mapped_buffer = kmap_atomic(page); + mapped_size = PAGE_SIZE; + p = mapped_buffer; + } + + btrfs_csum_final(crc, calculated_csum); + if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) ++crc_fail; - spin_lock(&sdev->stat_lock); - ++sdev->stat.tree_extents_scrubbed; - sdev->stat.tree_bytes_scrubbed += PAGE_SIZE; - if (crc_fail) - ++sdev->stat.csum_errors; - if (fail) - ++sdev->stat.verify_errors; - spin_unlock(&sdev->stat_lock); + if (crc_fail || fail) { + spin_lock(&sdev->stat_lock); + if (crc_fail) + ++sdev->stat.csum_errors; + if (fail) + ++sdev->stat.verify_errors; + spin_unlock(&sdev->stat_lock); + } return fail || crc_fail; } -static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) +static int scrub_checksum_super(struct scrub_block *sblock) { struct btrfs_super_block *s; - u64 logical; - struct scrub_dev *sdev = sbio->sdev; + struct scrub_dev *sdev = sblock->sdev; struct btrfs_root *root = sdev->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; - u8 csum[BTRFS_CSUM_SIZE]; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + u8 on_disk_csum[BTRFS_CSUM_SIZE]; + struct page *page; + void *mapped_buffer; + u64 mapped_size; + void *p; u32 crc = ~(u32)0; int fail = 0; + u64 len; + int index; - s = (struct btrfs_super_block *)buffer; - logical = sbio->logical; + BUG_ON(sblock->page_count < 1); + page = sblock->pagev[0].page; + mapped_buffer = kmap_atomic(page); + s = (struct btrfs_super_block *)mapped_buffer; + memcpy(on_disk_csum, s->csum, sdev->csum_size); - if (logical != le64_to_cpu(s->bytenr)) + if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) ++fail; - if (sbio->spag[0].generation != le64_to_cpu(s->generation)) + if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) ++fail; if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) ++fail; - crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, - PAGE_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, csum); - if (memcmp(csum, s->csum, sbio->sdev->csum_size)) + len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; + mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; + p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; + index = 0; + for (;;) { + u64 l = min_t(u64, len, mapped_size); + + crc = btrfs_csum_data(root, p, crc, l); + kunmap_atomic(mapped_buffer); + len -= l; + if (len == 0) + break; + index++; + BUG_ON(index >= sblock->page_count); + BUG_ON(!sblock->pagev[index].page); + page = sblock->pagev[index].page; + mapped_buffer = kmap_atomic(page); + mapped_size = PAGE_SIZE; + p = mapped_buffer; + } + + btrfs_csum_final(crc, calculated_csum); + if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) ++fail; if (fail) { @@ -948,29 +1414,42 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) return fail; } -static int scrub_submit(struct scrub_dev *sdev) +static void scrub_block_get(struct scrub_block *sblock) +{ + atomic_inc(&sblock->ref_count); +} + +static void scrub_block_put(struct scrub_block *sblock) +{ + if (atomic_dec_and_test(&sblock->ref_count)) { + int i; + + for (i = 0; i < sblock->page_count; i++) + if (sblock->pagev[i].page) + __free_page(sblock->pagev[i].page); + kfree(sblock); + } +} + +static void scrub_submit(struct scrub_dev *sdev) { struct scrub_bio *sbio; if (sdev->curr == -1) - return 0; + return; sbio = sdev->bios[sdev->curr]; - sbio->err = 0; sdev->curr = -1; atomic_inc(&sdev->in_flight); btrfsic_submit_bio(READ, sbio->bio); - - return 0; } -static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num, - u8 *csum, int force) +static int scrub_add_page_to_bio(struct scrub_dev *sdev, + struct scrub_page *spage) { + struct scrub_block *sblock = spage->sblock; struct scrub_bio *sbio; - struct page *page; int ret; again: @@ -983,7 +1462,7 @@ again: if (sdev->curr != -1) { sdev->first_free = sdev->bios[sdev->curr]->next_free; sdev->bios[sdev->curr]->next_free = -1; - sdev->bios[sdev->curr]->count = 0; + sdev->bios[sdev->curr]->page_count = 0; spin_unlock(&sdev->list_lock); } else { spin_unlock(&sdev->list_lock); @@ -991,62 +1470,200 @@ again: } } sbio = sdev->bios[sdev->curr]; - if (sbio->count == 0) { + if (sbio->page_count == 0) { struct bio *bio; - sbio->physical = physical; - sbio->logical = logical; - bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); - if (!bio) - return -ENOMEM; + sbio->physical = spage->physical; + sbio->logical = spage->logical; + bio = sbio->bio; + if (!bio) { + bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); + if (!bio) + return -ENOMEM; + sbio->bio = bio; + } bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; bio->bi_bdev = sdev->dev->bdev; - bio->bi_sector = sbio->physical >> 9; + bio->bi_sector = spage->physical >> 9; sbio->err = 0; - sbio->bio = bio; - } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || - sbio->logical + sbio->count * PAGE_SIZE != logical) { - ret = scrub_submit(sdev); - if (ret) - return ret; + } else if (sbio->physical + sbio->page_count * PAGE_SIZE != + spage->physical || + sbio->logical + sbio->page_count * PAGE_SIZE != + spage->logical) { + scrub_submit(sdev); goto again; } - sbio->spag[sbio->count].flags = flags; - sbio->spag[sbio->count].generation = gen; - sbio->spag[sbio->count].have_csum = 0; - sbio->spag[sbio->count].mirror_num = mirror_num; - - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0); - if (!ret) { - __free_page(page); - ret = scrub_submit(sdev); - if (ret) - return ret; + sbio->pagev[sbio->page_count] = spage; + ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); + if (ret != PAGE_SIZE) { + if (sbio->page_count < 1) { + bio_put(sbio->bio); + sbio->bio = NULL; + return -EIO; + } + scrub_submit(sdev); goto again; } - if (csum) { - sbio->spag[sbio->count].have_csum = 1; - memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); + scrub_block_get(sblock); /* one for the added page */ + atomic_inc(&sblock->outstanding_pages); + sbio->page_count++; + if (sbio->page_count == sdev->pages_per_bio) + scrub_submit(sdev); + + return 0; +} + +static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, + u64 physical, u64 flags, u64 gen, int mirror_num, + u8 *csum, int force) +{ + struct scrub_block *sblock; + int index; + + sblock = kzalloc(sizeof(*sblock), GFP_NOFS); + if (!sblock) { + spin_lock(&sdev->stat_lock); + sdev->stat.malloc_errors++; + spin_unlock(&sdev->stat_lock); + return -ENOMEM; } - ++sbio->count; - if (sbio->count == SCRUB_PAGES_PER_BIO || force) { + + /* one ref inside this function, plus one for each page later on */ + atomic_set(&sblock->ref_count, 1); + sblock->sdev = sdev; + sblock->no_io_error_seen = 1; + + for (index = 0; len > 0; index++) { + struct scrub_page *spage = sblock->pagev + index; + u64 l = min_t(u64, len, PAGE_SIZE); + + BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + spage->page = alloc_page(GFP_NOFS); + if (!spage->page) { + spin_lock(&sdev->stat_lock); + sdev->stat.malloc_errors++; + spin_unlock(&sdev->stat_lock); + while (index > 0) { + index--; + __free_page(sblock->pagev[index].page); + } + kfree(sblock); + return -ENOMEM; + } + spage->sblock = sblock; + spage->bdev = sdev->dev->bdev; + spage->flags = flags; + spage->generation = gen; + spage->logical = logical; + spage->physical = physical; + spage->mirror_num = mirror_num; + if (csum) { + spage->have_csum = 1; + memcpy(spage->csum, csum, sdev->csum_size); + } else { + spage->have_csum = 0; + } + sblock->page_count++; + len -= l; + logical += l; + physical += l; + } + + BUG_ON(sblock->page_count == 0); + for (index = 0; index < sblock->page_count; index++) { + struct scrub_page *spage = sblock->pagev + index; int ret; - ret = scrub_submit(sdev); - if (ret) + ret = scrub_add_page_to_bio(sdev, spage); + if (ret) { + scrub_block_put(sblock); return ret; + } } + if (force) + scrub_submit(sdev); + + /* last one frees, either here or in bio completion for last page */ + scrub_block_put(sblock); return 0; } +static void scrub_bio_end_io(struct bio *bio, int err) +{ + struct scrub_bio *sbio = bio->bi_private; + struct scrub_dev *sdev = sbio->sdev; + struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + + sbio->err = err; + sbio->bio = bio; + + btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); +} + +static void scrub_bio_end_io_worker(struct btrfs_work *work) +{ + struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); + struct scrub_dev *sdev = sbio->sdev; + int i; + + BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); + if (sbio->err) { + for (i = 0; i < sbio->page_count; i++) { + struct scrub_page *spage = sbio->pagev[i]; + + spage->io_error = 1; + spage->sblock->no_io_error_seen = 0; + } + } + + /* now complete the scrub_block items that have all pages completed */ + for (i = 0; i < sbio->page_count; i++) { + struct scrub_page *spage = sbio->pagev[i]; + struct scrub_block *sblock = spage->sblock; + + if (atomic_dec_and_test(&sblock->outstanding_pages)) + scrub_block_complete(sblock); + scrub_block_put(sblock); + } + + if (sbio->err) { + /* what is this good for??? */ + sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); + sbio->bio->bi_flags |= 1 << BIO_UPTODATE; + sbio->bio->bi_phys_segments = 0; + sbio->bio->bi_idx = 0; + + for (i = 0; i < sbio->page_count; i++) { + struct bio_vec *bi; + bi = &sbio->bio->bi_io_vec[i]; + bi->bv_offset = 0; + bi->bv_len = PAGE_SIZE; + } + } + + bio_put(sbio->bio); + sbio->bio = NULL; + spin_lock(&sdev->list_lock); + sbio->next_free = sdev->first_free; + sdev->first_free = sbio->index; + spin_unlock(&sdev->list_lock); + atomic_dec(&sdev->in_flight); + wake_up(&sdev->list_wait); +} + +static void scrub_block_complete(struct scrub_block *sblock) +{ + if (!sblock->no_io_error_seen) + scrub_handle_errored_block(sblock); + else + scrub_checksum(sblock); +} + static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, u8 *csum) { @@ -1054,7 +1671,6 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, int ret = 0; unsigned long i; unsigned long num_sectors; - u32 sectorsize = sdev->dev->dev_root->sectorsize; while (!list_empty(&sdev->csum_list)) { sum = list_first_entry(&sdev->csum_list, @@ -1072,7 +1688,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, if (!sum) return 0; - num_sectors = sum->len / sectorsize; + num_sectors = sum->len / sdev->sectorsize; for (i = 0; i < num_sectors; ++i) { if (sum->sums[i].bytenr == logical) { memcpy(csum, &sum->sums[i].sum, sdev->csum_size); @@ -1093,9 +1709,28 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, { int ret; u8 csum[BTRFS_CSUM_SIZE]; + u32 blocksize; + + if (flags & BTRFS_EXTENT_FLAG_DATA) { + blocksize = sdev->sectorsize; + spin_lock(&sdev->stat_lock); + sdev->stat.data_extents_scrubbed++; + sdev->stat.data_bytes_scrubbed += len; + spin_unlock(&sdev->stat_lock); + } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + BUG_ON(sdev->nodesize != sdev->leafsize); + blocksize = sdev->nodesize; + spin_lock(&sdev->stat_lock); + sdev->stat.tree_extents_scrubbed++; + sdev->stat.tree_bytes_scrubbed += len; + spin_unlock(&sdev->stat_lock); + } else { + blocksize = sdev->sectorsize; + BUG_ON(1); + } while (len) { - u64 l = min_t(u64, len, PAGE_SIZE); + u64 l = min_t(u64, len, blocksize); int have_csum = 0; if (flags & BTRFS_EXTENT_FLAG_DATA) { @@ -1104,8 +1739,8 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, if (have_csum == 0) ++sdev->stat.no_csum; } - ret = scrub_page(sdev, logical, l, physical, flags, gen, - mirror_num, have_csum ? csum : NULL, 0); + ret = scrub_pages(sdev, logical, l, physical, flags, gen, + mirror_num, have_csum ? csum : NULL, 0); if (ret) return ret; len -= l; @@ -1170,6 +1805,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, if (!path) return -ENOMEM; + /* + * work on commit root. The related disk blocks are static as + * long as COW is applied. This means, it is save to rewrite + * them to repair disk errors without any race conditions + */ path->search_commit_root = 1; path->skip_locking = 1; @@ -1516,15 +2156,18 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) struct btrfs_device *device = sdev->dev; struct btrfs_root *root = device->dev_root; + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + return -EIO; + gen = root->fs_info->last_trans_committed; for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) + if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) break; - ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr, - BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); + ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, + BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); if (ret) return ret; } @@ -1583,10 +2226,30 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, /* * check some assumptions */ - if (root->sectorsize != PAGE_SIZE || - root->sectorsize != root->leafsize || - root->sectorsize != root->nodesize) { - printk(KERN_ERR "btrfs_scrub: size assumptions fail\n"); + if (root->nodesize != root->leafsize) { + printk(KERN_ERR + "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", + root->nodesize, root->leafsize); + return -EINVAL; + } + + if (root->nodesize > BTRFS_STRIPE_LEN) { + /* + * in this case scrub is unable to calculate the checksum + * the way scrub is implemented. Do not handle this + * situation at all because it won't ever happen. + */ + printk(KERN_ERR + "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", + root->nodesize, BTRFS_STRIPE_LEN); + return -EINVAL; + } + + if (root->sectorsize != PAGE_SIZE) { + /* not supported for data w/o checksums */ + printk(KERN_ERR + "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", + root->sectorsize, (unsigned long long)PAGE_SIZE); return -EINVAL; } @@ -1656,7 +2319,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, return ret; } -int btrfs_scrub_pause(struct btrfs_root *root) +void btrfs_scrub_pause(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1671,34 +2334,28 @@ int btrfs_scrub_pause(struct btrfs_root *root) mutex_lock(&fs_info->scrub_lock); } mutex_unlock(&fs_info->scrub_lock); - - return 0; } -int btrfs_scrub_continue(struct btrfs_root *root) +void btrfs_scrub_continue(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; atomic_dec(&fs_info->scrub_pause_req); wake_up(&fs_info->scrub_pause_wait); - return 0; } -int btrfs_scrub_pause_super(struct btrfs_root *root) +void btrfs_scrub_pause_super(struct btrfs_root *root) { down_write(&root->fs_info->scrub_super_lock); - return 0; } -int btrfs_scrub_continue_super(struct btrfs_root *root) +void btrfs_scrub_continue_super(struct btrfs_root *root) { up_write(&root->fs_info->scrub_super_lock); - return 0; } -int btrfs_scrub_cancel(struct btrfs_root *root) +int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = root->fs_info; mutex_lock(&fs_info->scrub_lock); if (!atomic_read(&fs_info->scrubs_running)) { @@ -1719,6 +2376,11 @@ int btrfs_scrub_cancel(struct btrfs_root *root) return 0; } +int btrfs_scrub_cancel(struct btrfs_root *root) +{ + return __btrfs_scrub_cancel(root->fs_info); +} + int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1741,6 +2403,7 @@ int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) return 0; } + int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid) { struct btrfs_fs_info *fs_info = root->fs_info; diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index bc1f6ad1844..c6ffa581241 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -44,8 +44,9 @@ #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \ -u##bits btrfs_##name(struct extent_buffer *eb, \ - type *s) \ +void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token); \ +u##bits btrfs_token_##name(struct extent_buffer *eb, \ + type *s, struct btrfs_map_token *token) \ { \ unsigned long part_offset = (unsigned long)s; \ unsigned long offset = part_offset + offsetof(type, member); \ @@ -54,9 +55,18 @@ u##bits btrfs_##name(struct extent_buffer *eb, \ char *kaddr; \ unsigned long map_start; \ unsigned long map_len; \ + unsigned long mem_len = sizeof(((type *)0)->member); \ u##bits res; \ + if (token && token->kaddr && token->offset <= offset && \ + token->eb == eb && \ + (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \ + kaddr = token->kaddr; \ + p = (type *)(kaddr + part_offset - token->offset); \ + res = le##bits##_to_cpu(p->member); \ + return res; \ + } \ err = map_private_extent_buffer(eb, offset, \ - sizeof(((type *)0)->member), \ + mem_len, \ &kaddr, &map_start, &map_len); \ if (err) { \ __le##bits leres; \ @@ -65,10 +75,15 @@ u##bits btrfs_##name(struct extent_buffer *eb, \ } \ p = (type *)(kaddr + part_offset - map_start); \ res = le##bits##_to_cpu(p->member); \ + if (token) { \ + token->kaddr = kaddr; \ + token->offset = map_start; \ + token->eb = eb; \ + } \ return res; \ } \ -void btrfs_set_##name(struct extent_buffer *eb, \ - type *s, u##bits val) \ +void btrfs_set_token_##name(struct extent_buffer *eb, \ + type *s, u##bits val, struct btrfs_map_token *token) \ { \ unsigned long part_offset = (unsigned long)s; \ unsigned long offset = part_offset + offsetof(type, member); \ @@ -77,8 +92,17 @@ void btrfs_set_##name(struct extent_buffer *eb, \ char *kaddr; \ unsigned long map_start; \ unsigned long map_len; \ + unsigned long mem_len = sizeof(((type *)0)->member); \ + if (token && token->kaddr && token->offset <= offset && \ + token->eb == eb && \ + (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \ + kaddr = token->kaddr; \ + p = (type *)(kaddr + part_offset - token->offset); \ + p->member = cpu_to_le##bits(val); \ + return; \ + } \ err = map_private_extent_buffer(eb, offset, \ - sizeof(((type *)0)->member), \ + mem_len, \ &kaddr, &map_start, &map_len); \ if (err) { \ __le##bits val2; \ @@ -88,7 +112,22 @@ void btrfs_set_##name(struct extent_buffer *eb, \ } \ p = (type *)(kaddr + part_offset - map_start); \ p->member = cpu_to_le##bits(val); \ -} + if (token) { \ + token->kaddr = kaddr; \ + token->offset = map_start; \ + token->eb = eb; \ + } \ +} \ +void btrfs_set_##name(struct extent_buffer *eb, \ + type *s, u##bits val) \ +{ \ + btrfs_set_token_##name(eb, s, val, NULL); \ +} \ +u##bits btrfs_##name(struct extent_buffer *eb, \ + type *s) \ +{ \ + return btrfs_token_##name(eb, s, NULL); \ +} \ #include "ctree.h" diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 81df3fec6a6..8d5d380f7bd 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -76,6 +76,9 @@ static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, case -EROFS: errstr = "Readonly filesystem"; break; + case -EEXIST: + errstr = "Object already exists"; + break; default: if (nbuf) { if (snprintf(nbuf, 16, "error %d", -errno) >= 0) @@ -116,6 +119,8 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { sb->s_flags |= MS_RDONLY; printk(KERN_INFO "btrfs is forced readonly\n"); + __btrfs_scrub_cancel(fs_info); +// WARN_ON(1); } } @@ -124,25 +129,132 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) * invokes the approciate error response. */ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno) + unsigned int line, int errno, const char *fmt, ...) { struct super_block *sb = fs_info->sb; char nbuf[16]; const char *errstr; + va_list args; + va_start(args, fmt); /* * Special case: if the error is EROFS, and we're already * under MS_RDONLY, then it is safe here. */ if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) + return; + + errstr = btrfs_decode_error(fs_info, errno, nbuf); + if (fmt) { + struct va_format vaf = { + .fmt = fmt, + .va = &args, + }; + + printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n", + sb->s_id, function, line, errstr, &vaf); + } else { + printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + } + + /* Don't go through full error handling during mount */ + if (sb->s_flags & MS_BORN) { + save_error_info(fs_info); + btrfs_handle_error(fs_info); + } + va_end(args); +} + +const char *logtypes[] = { + "emergency", + "alert", + "critical", + "error", + "warning", + "notice", + "info", + "debug", +}; + +void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ + struct super_block *sb = fs_info->sb; + char lvl[4]; + struct va_format vaf; + va_list args; + const char *type = logtypes[4]; + + va_start(args, fmt); + + if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') { + strncpy(lvl, fmt, 3); + fmt += 3; + type = logtypes[fmt[1] - '0']; + } else + *lvl = '\0'; + + vaf.fmt = fmt; + vaf.va = &args; + printk("%sBTRFS %s (device %s): %pV", lvl, type, sb->s_id, &vaf); +} + +/* + * We only mark the transaction aborted and then set the file system read-only. + * This will prevent new transactions from starting or trying to join this + * one. + * + * This means that error recovery at the call site is limited to freeing + * any local memory allocations and passing the error code up without + * further cleanup. The transaction should complete as it normally would + * in the call path but will return -EIO. + * + * We'll complete the cleanup in btrfs_end_transaction and + * btrfs_commit_transaction. + */ +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *function, + unsigned int line, int errno) +{ + WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted"); + trans->aborted = errno; + /* Nothing used. The other threads that have joined this + * transaction may be able to continue. */ + if (!trans->blocks_used) { + btrfs_printk(root->fs_info, "Aborting unused transaction.\n"); return; + } + trans->transaction->aborted = errno; + __btrfs_std_error(root->fs_info, function, line, errno, NULL); +} +/* + * __btrfs_panic decodes unexpected, fatal errors from the caller, + * issues an alert, and either panics or BUGs, depending on mount options. + */ +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + char nbuf[16]; + char *s_id = "<unknown>"; + const char *errstr; + struct va_format vaf = { .fmt = fmt }; + va_list args; - errstr = btrfs_decode_error(fs_info, errno, nbuf); - printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", - sb->s_id, function, line, errstr); - save_error_info(fs_info); + if (fs_info) + s_id = fs_info->sb->s_id; - btrfs_handle_error(fs_info); + va_start(args, fmt); + vaf.va = &args; + + errstr = btrfs_decode_error(fs_info, errno, nbuf); + if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR) + panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", + s_id, function, line, &vaf, errstr); + + printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", + s_id, function, line, &vaf, errstr); + va_end(args); + /* Caller calls BUG() */ } static void btrfs_put_super(struct super_block *sb) @@ -166,7 +278,7 @@ enum { Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_skip_balance, Opt_check_integrity, Opt_check_integrity_including_extent_data, - Opt_check_integrity_print_mask, + Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_err, }; @@ -206,12 +318,14 @@ static match_table_t tokens = { {Opt_check_integrity, "check_int"}, {Opt_check_integrity_including_extent_data, "check_int_data"}, {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, + {Opt_fatal_errors, "fatal_errors=%s"}, {Opt_err, NULL}, }; /* * Regular mount options parser. Everything that is needed only when * reading in a new superblock is parsed here. + * XXX JDM: This needs to be cleaned up for remount. */ int btrfs_parse_options(struct btrfs_root *root, char *options) { @@ -438,6 +552,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) ret = -EINVAL; goto out; #endif + case Opt_fatal_errors: + if (strcmp(args[0].from, "panic") == 0) + btrfs_set_opt(info->mount_opt, + PANIC_ON_FATAL_ERROR); + else if (strcmp(args[0].from, "bug") == 0) + btrfs_clear_opt(info->mount_opt, + PANIC_ON_FATAL_ERROR); + else { + ret = -EINVAL; + goto out; + } + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -762,6 +888,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",inode_cache"); if (btrfs_test_opt(root, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); + if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR)) + seq_puts(seq, ",fatal_errors=panic"); return 0; } @@ -995,11 +1123,20 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; + unsigned old_flags = sb->s_flags; + unsigned long old_opts = fs_info->mount_opt; + unsigned long old_compress_type = fs_info->compress_type; + u64 old_max_inline = fs_info->max_inline; + u64 old_alloc_start = fs_info->alloc_start; + int old_thread_pool_size = fs_info->thread_pool_size; + unsigned int old_metadata_ratio = fs_info->metadata_ratio; int ret; ret = btrfs_parse_options(root, data); - if (ret) - return -EINVAL; + if (ret) { + ret = -EINVAL; + goto restore; + } if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; @@ -1007,26 +1144,44 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (*flags & MS_RDONLY) { sb->s_flags |= MS_RDONLY; - ret = btrfs_commit_super(root); - WARN_ON(ret); + ret = btrfs_commit_super(root); + if (ret) + goto restore; } else { if (fs_info->fs_devices->rw_devices == 0) - return -EACCES; + ret = -EACCES; + goto restore; if (btrfs_super_log_root(fs_info->super_copy) != 0) - return -EINVAL; + ret = -EINVAL; + goto restore; ret = btrfs_cleanup_fs_roots(fs_info); - WARN_ON(ret); + if (ret) + goto restore; /* recover relocation */ ret = btrfs_recover_relocation(root); - WARN_ON(ret); + if (ret) + goto restore; sb->s_flags &= ~MS_RDONLY; } return 0; + +restore: + /* We've hit an error - don't reset MS_RDONLY */ + if (sb->s_flags & MS_RDONLY) + old_flags |= MS_RDONLY; + sb->s_flags = old_flags; + fs_info->mount_opt = old_opts; + fs_info->compress_type = old_compress_type; + fs_info->max_inline = old_max_inline; + fs_info->alloc_start = old_alloc_start; + fs_info->thread_pool_size = old_thread_pool_size; + fs_info->metadata_ratio = old_metadata_ratio; + return ret; } /* Used to sort the devices by max_avail(descending sort) */ @@ -1356,9 +1511,7 @@ static int __init init_btrfs_fs(void) if (err) return err; - err = btrfs_init_compress(); - if (err) - goto free_sysfs; + btrfs_init_compress(); err = btrfs_init_cachep(); if (err) @@ -1384,6 +1537,8 @@ static int __init init_btrfs_fs(void) if (err) goto unregister_ioctl; + btrfs_init_lockdep(); + printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION); return 0; @@ -1399,7 +1554,6 @@ free_cachep: btrfs_destroy_cachep(); free_compress: btrfs_exit_compress(); -free_sysfs: btrfs_exit_sysfs(); return err; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 04b77e3ceb7..8da29e8e4de 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -31,7 +31,7 @@ #define BTRFS_ROOT_TRANS_TAG 0 -static noinline void put_transaction(struct btrfs_transaction *transaction) +void put_transaction(struct btrfs_transaction *transaction) { WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { @@ -58,6 +58,12 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) spin_lock(&root->fs_info->trans_lock); loop: + /* The file system has been taken offline. No new transactions. */ + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + spin_unlock(&root->fs_info->trans_lock); + return -EROFS; + } + if (root->fs_info->trans_no_join) { if (!nofail) { spin_unlock(&root->fs_info->trans_lock); @@ -67,6 +73,8 @@ loop: cur_trans = root->fs_info->running_transaction; if (cur_trans) { + if (cur_trans->aborted) + return cur_trans->aborted; atomic_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); cur_trans->num_joined++; @@ -123,6 +131,7 @@ loop: root->fs_info->generation++; cur_trans->transid = root->fs_info->generation; root->fs_info->running_transaction = cur_trans; + cur_trans->aborted = 0; spin_unlock(&root->fs_info->trans_lock); return 0; @@ -318,6 +327,7 @@ again: h->use_count = 1; h->block_rsv = NULL; h->orig_rsv = NULL; + h->aborted = 0; smp_mb(); if (cur_trans->blocked && may_wait_transaction(root, type)) { @@ -327,8 +337,7 @@ again: if (num_bytes) { trace_btrfs_space_reservation(root->fs_info, "transaction", - (u64)(unsigned long)h, - num_bytes, 1); + h->transid, num_bytes, 1); h->block_rsv = &root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; } @@ -440,6 +449,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_block_rsv *rsv = trans->block_rsv; int updates; + int err; smp_mb(); if (cur_trans->blocked || cur_trans->delayed_refs.flushing) @@ -453,8 +463,11 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, updates = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; - if (updates) - btrfs_run_delayed_refs(trans, root, updates); + if (updates) { + err = btrfs_run_delayed_refs(trans, root, updates); + if (err) /* Error code will also eval true */ + return err; + } trans->block_rsv = rsv; @@ -525,6 +538,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (throttle) btrfs_run_delayed_iputs(root); + if (trans->aborted || + root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + return -EIO; + } + return 0; } @@ -690,11 +708,13 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, ret = btrfs_update_root(trans, tree_root, &root->root_key, &root->root_item); - BUG_ON(ret); + if (ret) + return ret; old_root_used = btrfs_root_used(&root->root_item); ret = btrfs_write_dirty_block_groups(trans, root); - BUG_ON(ret); + if (ret) + return ret; } if (root != root->fs_info->extent_root) @@ -705,6 +725,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, /* * update all the cowonly tree roots on disk + * + * The error handling in this function may not be obvious. Any of the + * failures will cause the file system to go offline. We still need + * to clean up the delayed refs. */ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -715,22 +739,30 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, int ret; ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + if (ret) + return ret; eb = btrfs_lock_root_node(fs_info->tree_root); - btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); + ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, + 0, &eb); btrfs_tree_unlock(eb); free_extent_buffer(eb); + if (ret) + return ret; + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + if (ret) + return ret; while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; list_del_init(next); root = list_entry(next, struct btrfs_root, dirty_list); - update_cowonly_root(trans, root); + ret = update_cowonly_root(trans, root); + if (ret) + return ret; } down_write(&fs_info->extent_commit_sem); @@ -874,7 +906,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { - pending->error = -ENOMEM; + ret = pending->error = -ENOMEM; goto fail; } @@ -911,21 +943,24 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * insert the directory item */ ret = btrfs_set_inode_index(parent_inode, &index); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_insert_dir_item(trans, parent_root, dentry->d_name.name, dentry->d_name.len, parent_inode, &key, BTRFS_FT_DIR, index); - if (ret) { + if (ret == -EEXIST) { pending->error = -EEXIST; dput(parent); goto fail; + } else if (ret) { + goto abort_trans_dput; } btrfs_i_size_write(parent_inode, parent_inode->i_size + dentry->d_name.len * 2); ret = btrfs_update_inode(trans, parent_root, parent_inode); - BUG_ON(ret); + if (ret) + goto abort_trans_dput; /* * pull in the delayed directory update @@ -934,7 +969,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * snapshot */ ret = btrfs_run_delayed_items(trans, root); - BUG_ON(ret); + if (ret) { /* Transaction aborted */ + dput(parent); + goto fail; + } record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); @@ -949,12 +987,21 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_flags(new_root_item, root_flags); old = btrfs_lock_root_node(root); - btrfs_cow_block(trans, root, old, NULL, 0, &old); + ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); + if (ret) { + btrfs_tree_unlock(old); + free_extent_buffer(old); + goto abort_trans_dput; + } + btrfs_set_lock_blocking(old); - btrfs_copy_root(trans, root, old, &tmp, objectid); + ret = btrfs_copy_root(trans, root, old, &tmp, objectid); + /* clean up in any case */ btrfs_tree_unlock(old); free_extent_buffer(old); + if (ret) + goto abort_trans_dput; /* see comments in should_cow_block() */ root->force_cow = 1; @@ -966,7 +1013,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); free_extent_buffer(tmp); - BUG_ON(ret); + if (ret) + goto abort_trans_dput; /* * insert root back/forward references @@ -975,19 +1023,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent_root->root_key.objectid, btrfs_ino(parent_inode), index, dentry->d_name.name, dentry->d_name.len); - BUG_ON(ret); dput(parent); + if (ret) + goto fail; key.offset = (u64)-1; pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); - BUG_ON(IS_ERR(pending->snap)); + if (IS_ERR(pending->snap)) { + ret = PTR_ERR(pending->snap); + goto abort_trans; + } - btrfs_reloc_post_snapshot(trans, pending); + ret = btrfs_reloc_post_snapshot(trans, pending); + if (ret) + goto abort_trans; + ret = 0; fail: kfree(new_root_item); trans->block_rsv = rsv; btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); - return 0; + return ret; + +abort_trans_dput: + dput(parent); +abort_trans: + btrfs_abort_transaction(trans, root, ret); + goto fail; } /* @@ -1124,6 +1185,33 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, return 0; } + +static void cleanup_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + + WARN_ON(trans->use_count > 1); + + spin_lock(&root->fs_info->trans_lock); + list_del_init(&cur_trans->list); + spin_unlock(&root->fs_info->trans_lock); + + btrfs_cleanup_one_transaction(trans->transaction, root); + + put_transaction(cur_trans); + put_transaction(cur_trans); + + trace_btrfs_transaction_commit(root); + + btrfs_scrub_continue(root); + + if (current->journal_info == trans) + current->journal_info = NULL; + + kmem_cache_free(btrfs_trans_handle_cachep, trans); +} + /* * btrfs_transaction state sequence: * in_commit = 0, blocked = 0 (initial) @@ -1135,10 +1223,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { unsigned long joined = 0; - struct btrfs_transaction *cur_trans; + struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; DEFINE_WAIT(wait); - int ret; + int ret = -EIO; int should_grow = 0; unsigned long now = get_seconds(); int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); @@ -1148,13 +1236,18 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; + if (cur_trans->aborted) + goto cleanup_transaction; + /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ ret = btrfs_run_delayed_refs(trans, root, 0); - BUG_ON(ret); + if (ret) + goto cleanup_transaction; cur_trans = trans->transaction; + /* * set the flushing flag so procs in this transaction have to * start sending their work down. @@ -1162,19 +1255,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, cur_trans->delayed_refs.flushing = 1; ret = btrfs_run_delayed_refs(trans, root, 0); - BUG_ON(ret); + if (ret) + goto cleanup_transaction; spin_lock(&cur_trans->commit_lock); if (cur_trans->in_commit) { spin_unlock(&cur_trans->commit_lock); atomic_inc(&cur_trans->use_count); - btrfs_end_transaction(trans, root); + ret = btrfs_end_transaction(trans, root); wait_for_commit(root, cur_trans); put_transaction(cur_trans); - return 0; + return ret; } trans->transaction->in_commit = 1; @@ -1214,12 +1308,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (flush_on_commit || snap_pending) { btrfs_start_delalloc_inodes(root, 1); - ret = btrfs_wait_ordered_extents(root, 0, 1); - BUG_ON(ret); + btrfs_wait_ordered_extents(root, 0, 1); } ret = btrfs_run_delayed_items(trans, root); - BUG_ON(ret); + if (ret) + goto cleanup_transaction; /* * rename don't use btrfs_join_transaction, so, once we @@ -1261,13 +1355,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_lock(&root->fs_info->reloc_mutex); ret = btrfs_run_delayed_items(trans, root); - BUG_ON(ret); + if (ret) { + mutex_unlock(&root->fs_info->reloc_mutex); + goto cleanup_transaction; + } ret = create_pending_snapshots(trans, root->fs_info); - BUG_ON(ret); + if (ret) { + mutex_unlock(&root->fs_info->reloc_mutex); + goto cleanup_transaction; + } ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + if (ret) { + mutex_unlock(&root->fs_info->reloc_mutex); + goto cleanup_transaction; + } /* * make sure none of the code above managed to slip in a @@ -1294,7 +1397,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_lock(&root->fs_info->tree_log_mutex); ret = commit_fs_roots(trans, root); - BUG_ON(ret); + if (ret) { + mutex_unlock(&root->fs_info->tree_log_mutex); + goto cleanup_transaction; + } /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots @@ -1302,7 +1408,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_free_log_root_tree(trans, root->fs_info); ret = commit_cowonly_roots(trans, root); - BUG_ON(ret); + if (ret) { + mutex_unlock(&root->fs_info->tree_log_mutex); + goto cleanup_transaction; + } btrfs_prepare_extent_commit(trans, root); @@ -1336,8 +1445,18 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wake_up(&root->fs_info->transaction_wait); ret = btrfs_write_and_wait_transaction(trans, root); - BUG_ON(ret); - write_ctree_super(trans, root, 0); + if (ret) { + btrfs_error(root->fs_info, ret, + "Error while writing out transaction."); + mutex_unlock(&root->fs_info->tree_log_mutex); + goto cleanup_transaction; + } + + ret = write_ctree_super(trans, root, 0); + if (ret) { + mutex_unlock(&root->fs_info->tree_log_mutex); + goto cleanup_transaction; + } /* * the super is written, we can safely allow the tree-loggers @@ -1373,6 +1492,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_run_delayed_iputs(root); return ret; + +cleanup_transaction: + btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); +// WARN_ON(1); + if (current->journal_info == trans) + current->journal_info = NULL; + cleanup_transaction(trans, root); + + return ret; } /* @@ -1388,6 +1516,8 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) spin_unlock(&fs_info->trans_lock); while (!list_empty(&list)) { + int ret; + root = list_entry(list.next, struct btrfs_root, root_list); list_del(&root->root_list); @@ -1395,9 +1525,10 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) - btrfs_drop_snapshot(root, NULL, 0, 0); + ret = btrfs_drop_snapshot(root, NULL, 0, 0); else - btrfs_drop_snapshot(root, NULL, 1, 0); + ret =btrfs_drop_snapshot(root, NULL, 1, 0); + BUG_ON(ret < 0); } return 0; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 02564e6230a..fe27379e368 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -43,6 +43,7 @@ struct btrfs_transaction { wait_queue_head_t commit_wait; struct list_head pending_snapshots; struct btrfs_delayed_ref_root delayed_refs; + int aborted; }; struct btrfs_trans_handle { @@ -55,6 +56,7 @@ struct btrfs_trans_handle { struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; + int aborted; }; struct btrfs_pending_snapshot { @@ -114,4 +116,5 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); +void put_transaction(struct btrfs_transaction *transaction); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 966cc74f5d6..d017283ae6f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -212,14 +212,13 @@ int btrfs_pin_log_trans(struct btrfs_root *root) * indicate we're done making changes to the log tree * and wake up anyone waiting to do a sync */ -int btrfs_end_log_trans(struct btrfs_root *root) +void btrfs_end_log_trans(struct btrfs_root *root) { if (atomic_dec_and_test(&root->log_writers)) { smp_mb(); if (waitqueue_active(&root->log_writer_wait)) wake_up(&root->log_writer_wait); } - return 0; } @@ -378,12 +377,11 @@ insert: u32 found_size; found_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - if (found_size > item_size) { + if (found_size > item_size) btrfs_truncate_item(trans, root, path, item_size, 1); - } else if (found_size < item_size) { - ret = btrfs_extend_item(trans, root, path, - item_size - found_size); - } + else if (found_size < item_size) + btrfs_extend_item(trans, root, path, + item_size - found_size); } else if (ret) { return ret; } @@ -1763,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_and_pin_reserved_extent(root, bytenr, blocksize); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or logic errors */ } free_extent_buffer(next); continue; @@ -1871,20 +1869,26 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, wret = walk_down_log_tree(trans, log, path, &level, wc); if (wret > 0) break; - if (wret < 0) + if (wret < 0) { ret = wret; + goto out; + } wret = walk_up_log_tree(trans, log, path, &level, wc); if (wret > 0) break; - if (wret < 0) + if (wret < 0) { ret = wret; + goto out; + } } /* was the root node processed? if not, catch it here */ if (path->nodes[orig_level]) { - wc->process_func(log, path->nodes[orig_level], wc, + ret = wc->process_func(log, path->nodes[orig_level], wc, btrfs_header_generation(path->nodes[orig_level])); + if (ret) + goto out; if (wc->free) { struct extent_buffer *next; @@ -1900,10 +1904,11 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_and_pin_reserved_extent(log, next->start, next->len); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or logic errors */ } } +out: for (i = 0; i <= orig_level; i++) { if (path->nodes[i]) { free_extent_buffer(path->nodes[i]); @@ -1963,8 +1968,8 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, return 0; } -static int wait_for_writer(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static void wait_for_writer(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { DEFINE_WAIT(wait); while (root->fs_info->last_trans_log_full_commit != @@ -1978,7 +1983,6 @@ static int wait_for_writer(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); } - return 0; } /* @@ -2046,7 +2050,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * wait for them until later. */ ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + mutex_unlock(&root->log_mutex); + goto out; + } btrfs_set_root_node(&log->root_item, log->node); @@ -2077,7 +2085,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } if (ret) { - BUG_ON(ret != -ENOSPC); + if (ret != -ENOSPC) { + btrfs_abort_transaction(trans, root, ret); + mutex_unlock(&log_root_tree->log_mutex); + goto out; + } root->fs_info->last_trans_log_full_commit = trans->transid; btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); mutex_unlock(&log_root_tree->log_mutex); @@ -2117,7 +2129,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_marked_extents(log_root_tree, &log_root_tree->dirty_log_pages, EXTENT_DIRTY | EXTENT_NEW); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + mutex_unlock(&log_root_tree->log_mutex); + goto out_wake_log_root; + } btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_set_super_log_root(root->fs_info->super_for_commit, @@ -2326,7 +2342,9 @@ out_unlock: if (ret == -ENOSPC) { root->fs_info->last_trans_log_full_commit = trans->transid; ret = 0; - } + } else if (ret < 0) + btrfs_abort_transaction(trans, root, ret); + btrfs_end_log_trans(root); return err; @@ -2357,7 +2375,8 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, if (ret == -ENOSPC) { root->fs_info->last_trans_log_full_commit = trans->transid; ret = 0; - } + } else if (ret < 0 && ret != -ENOENT) + btrfs_abort_transaction(trans, root, ret); btrfs_end_log_trans(root); return ret; @@ -3169,13 +3188,20 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) fs_info->log_root_recovering = 1; trans = btrfs_start_transaction(fs_info->tree_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto error; + } wc.trans = trans; wc.pin = 1; ret = walk_log_tree(trans, log_root_tree, &wc); - BUG_ON(ret); + if (ret) { + btrfs_error(fs_info, ret, "Failed to pin buffers while " + "recovering log root tree."); + goto error; + } again: key.objectid = BTRFS_TREE_LOG_OBJECTID; @@ -3184,8 +3210,12 @@ again: while (1) { ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); - if (ret < 0) - break; + + if (ret < 0) { + btrfs_error(fs_info, ret, + "Couldn't find tree log root."); + goto error; + } if (ret > 0) { if (path->slots[0] == 0) break; @@ -3199,14 +3229,24 @@ again: log = btrfs_read_fs_root_no_radix(log_root_tree, &found_key); - BUG_ON(IS_ERR(log)); + if (IS_ERR(log)) { + ret = PTR_ERR(log); + btrfs_error(fs_info, ret, + "Couldn't read tree log root."); + goto error; + } tmp_key.objectid = found_key.offset; tmp_key.type = BTRFS_ROOT_ITEM_KEY; tmp_key.offset = (u64)-1; wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); - BUG_ON(IS_ERR_OR_NULL(wc.replay_dest)); + if (IS_ERR(wc.replay_dest)) { + ret = PTR_ERR(wc.replay_dest); + btrfs_error(fs_info, ret, "Couldn't read target root " + "for tree log recovery."); + goto error; + } wc.replay_dest->log_root = log; btrfs_record_root_in_trans(trans, wc.replay_dest); @@ -3254,6 +3294,10 @@ again: kfree(log_root_tree); return 0; + +error: + btrfs_free_path(path); + return ret; } /* diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 2270ac58d74..862ac813f6b 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -38,7 +38,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, struct inode *inode, u64 dirid); -int btrfs_end_log_trans(struct btrfs_root *root); +void btrfs_end_log_trans(struct btrfs_root *root); int btrfs_pin_log_trans(struct btrfs_root *root); int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ef41f285a47..a872b48be0a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -67,7 +67,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) kfree(fs_devices); } -int btrfs_cleanup_fs_uuids(void) +void btrfs_cleanup_fs_uuids(void) { struct btrfs_fs_devices *fs_devices; @@ -77,7 +77,6 @@ int btrfs_cleanup_fs_uuids(void) list_del(&fs_devices->list); free_fs_devices(fs_devices); } - return 0; } static noinline struct btrfs_device *__find_device(struct list_head *head, @@ -130,7 +129,7 @@ static void requeue_list(struct btrfs_pending_bios *pending_bios, * the list if the block device is congested. This way, multiple devices * can make progress from a single worker thread. */ -static noinline int run_scheduled_bios(struct btrfs_device *device) +static noinline void run_scheduled_bios(struct btrfs_device *device) { struct bio *pending; struct backing_dev_info *bdi; @@ -316,7 +315,6 @@ loop_lock: done: blk_finish_plug(&plug); - return 0; } static void pending_bios_fn(struct btrfs_work *work) @@ -455,7 +453,7 @@ error: return ERR_PTR(-ENOMEM); } -int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) +void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *next; @@ -503,7 +501,6 @@ again: fs_devices->latest_trans = latest_transid; mutex_unlock(&uuid_mutex); - return 0; } static void __free_device(struct work_struct *work) @@ -552,10 +549,10 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) fs_devices->num_can_discard--; new_device = kmalloc(sizeof(*new_device), GFP_NOFS); - BUG_ON(!new_device); + BUG_ON(!new_device); /* -ENOMEM */ memcpy(new_device, device, sizeof(*new_device)); new_device->name = kstrdup(device->name, GFP_NOFS); - BUG_ON(device->name && !new_device->name); + BUG_ON(device->name && !new_device->name); /* -ENOMEM */ new_device->bdev = NULL; new_device->writeable = 0; new_device->in_fs_metadata = 0; @@ -625,6 +622,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, printk(KERN_INFO "open %s failed\n", device->name); goto error; } + filemap_write_and_wait(bdev->bd_inode->i_mapping); + invalidate_bdev(bdev); set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); @@ -1039,8 +1038,10 @@ again: leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); + } else { + btrfs_error(root->fs_info, ret, "Slot search failed"); + goto out; } - BUG_ON(ret); if (device->bytes_used > 0) { u64 len = btrfs_dev_extent_length(leaf, extent); @@ -1050,7 +1051,10 @@ again: spin_unlock(&root->fs_info->free_chunk_lock); } ret = btrfs_del_item(trans, root, path); - + if (ret) { + btrfs_error(root->fs_info, ret, + "Failed to remove dev extent item"); + } out: btrfs_free_path(path); return ret; @@ -1078,7 +1082,8 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, key.type = BTRFS_DEV_EXTENT_KEY; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); - BUG_ON(ret); + if (ret) + goto out; leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], @@ -1093,6 +1098,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_length(leaf, extent, num_bytes); btrfs_mark_buffer_dirty(leaf); +out: btrfs_free_path(path); return ret; } @@ -1118,7 +1124,7 @@ static noinline int find_next_chunk(struct btrfs_root *root, if (ret < 0) goto error; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); if (ret) { @@ -1162,7 +1168,7 @@ static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) if (ret < 0) goto error; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, BTRFS_DEV_ITEM_KEY); @@ -1350,6 +1356,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } set_blocksize(bdev, 4096); + invalidate_bdev(bdev); bh = btrfs_read_dev_super(bdev); if (!bh) { ret = -EINVAL; @@ -1596,7 +1603,7 @@ next_slot: (unsigned long)btrfs_device_fsid(dev_item), BTRFS_UUID_SIZE); device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); - BUG_ON(!device); + BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) { btrfs_set_device_generation(leaf, dev_item, @@ -1706,7 +1713,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (seeding_dev) { sb->s_flags &= ~MS_RDONLY; ret = btrfs_prepare_sprout(root); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } device->fs_devices = root->fs_info->fs_devices; @@ -1744,11 +1751,15 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (seeding_dev) { ret = init_first_rw_device(trans, root, device); - BUG_ON(ret); + if (ret) + goto error_trans; ret = btrfs_finish_sprout(trans, root); - BUG_ON(ret); + if (ret) + goto error_trans; } else { ret = btrfs_add_device(trans, root, device); + if (ret) + goto error_trans; } /* @@ -1758,17 +1769,31 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) btrfs_clear_space_info_full(root->fs_info); unlock_chunks(root); - btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); + if (ret) /* transaction commit */ + return ret; + ret = btrfs_relocate_sys_chunks(root); - BUG_ON(ret); + if (ret < 0) + btrfs_error(root->fs_info, ret, + "Failed to relocate sys chunks after " + "device initialization. This can be fixed " + "using the \"btrfs balance\" command."); } return ret; + +error_trans: + unlock_chunks(root); + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + kfree(device->name); + kfree(device); error: blkdev_put(bdev, FMODE_EXCL); if (seeding_dev) { @@ -1876,10 +1901,20 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, key.type = BTRFS_CHUNK_ITEM_KEY; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - BUG_ON(ret); + if (ret < 0) + goto out; + else if (ret > 0) { /* Logic error or corruption */ + btrfs_error(root->fs_info, -ENOENT, + "Failed lookup while freeing chunk."); + ret = -ENOENT; + goto out; + } ret = btrfs_del_item(trans, root, path); - + if (ret < 0) + btrfs_error(root->fs_info, ret, + "Failed to delete chunk item."); +out: btrfs_free_path(path); return ret; } @@ -2041,7 +2076,7 @@ again: ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) goto error; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ ret = btrfs_previous_item(chunk_root, path, key.objectid, key.type); @@ -2250,15 +2285,13 @@ static void unset_balance_control(struct btrfs_fs_info *fs_info) * Balance filters. Return 1 if chunk should be filtered out * (should not be balanced). */ -static int chunk_profiles_filter(u64 chunk_profile, +static int chunk_profiles_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { - chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; - - if (chunk_profile == 0) - chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + chunk_type = chunk_to_extended(chunk_type) & + BTRFS_EXTENDED_PROFILE_MASK; - if (bargs->profiles & chunk_profile) + if (bargs->profiles & chunk_type) return 0; return 1; @@ -2365,18 +2398,16 @@ static int chunk_vrange_filter(struct extent_buffer *leaf, return 1; } -static int chunk_soft_convert_filter(u64 chunk_profile, +static int chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) return 0; - chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; + chunk_type = chunk_to_extended(chunk_type) & + BTRFS_EXTENDED_PROFILE_MASK; - if (chunk_profile == 0) - chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - if (bargs->target & chunk_profile) + if (bargs->target == chunk_type) return 1; return 0; @@ -2602,6 +2633,30 @@ error: return ret; } +/** + * alloc_profile_is_valid - see if a given profile is valid and reduced + * @flags: profile to validate + * @extended: if true @flags is treated as an extended profile + */ +static int alloc_profile_is_valid(u64 flags, int extended) +{ + u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : + BTRFS_BLOCK_GROUP_PROFILE_MASK); + + flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; + + /* 1) check that all other bits are zeroed */ + if (flags & ~mask) + return 0; + + /* 2) see if profile is reduced */ + if (flags == 0) + return !extended; /* "0" is valid for usual profiles */ + + /* true if exactly one bit set */ + return (flags & (flags - 1)) == 0; +} + static inline int balance_need_close(struct btrfs_fs_info *fs_info) { /* cancel requested || normal exit path */ @@ -2630,6 +2685,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, { struct btrfs_fs_info *fs_info = bctl->fs_info; u64 allowed; + int mixed = 0; int ret; if (btrfs_fs_closing(fs_info) || @@ -2639,13 +2695,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl, goto out; } + allowed = btrfs_super_incompat_flags(fs_info->super_copy); + if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = 1; + /* * In case of mixed groups both data and meta should be picked, * and identical options should be given for both of them. */ - allowed = btrfs_super_incompat_flags(fs_info->super_copy); - if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && - (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) { + allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; + if (mixed && (bctl->flags & allowed)) { if (!(bctl->flags & BTRFS_BALANCE_DATA) || !(bctl->flags & BTRFS_BALANCE_METADATA) || memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { @@ -2656,14 +2715,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } - /* - * Profile changing sanity checks. Skip them if a simple - * balance is requested. - */ - if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) & - BTRFS_BALANCE_ARGS_CONVERT)) - goto do_balance; - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; if (fs_info->fs_devices->num_devices == 1) allowed |= BTRFS_BLOCK_GROUP_DUP; @@ -2673,24 +2724,27 @@ int btrfs_balance(struct btrfs_balance_control *bctl, allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10); - if (!profile_is_valid(bctl->data.target, 1) || - bctl->data.target & ~allowed) { + if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (!alloc_profile_is_valid(bctl->data.target, 1) || + (bctl->data.target & ~allowed))) { printk(KERN_ERR "btrfs: unable to start balance with target " "data profile %llu\n", (unsigned long long)bctl->data.target); ret = -EINVAL; goto out; } - if (!profile_is_valid(bctl->meta.target, 1) || - bctl->meta.target & ~allowed) { + if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (!alloc_profile_is_valid(bctl->meta.target, 1) || + (bctl->meta.target & ~allowed))) { printk(KERN_ERR "btrfs: unable to start balance with target " "metadata profile %llu\n", (unsigned long long)bctl->meta.target); ret = -EINVAL; goto out; } - if (!profile_is_valid(bctl->sys.target, 1) || - bctl->sys.target & ~allowed) { + if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (!alloc_profile_is_valid(bctl->sys.target, 1) || + (bctl->sys.target & ~allowed))) { printk(KERN_ERR "btrfs: unable to start balance with target " "system profile %llu\n", (unsigned long long)bctl->sys.target); @@ -2698,7 +2752,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl, goto out; } - if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) { + /* allow dup'ed data chunks only in mixed mode */ + if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { printk(KERN_ERR "btrfs: dup for data is not allowed\n"); ret = -EINVAL; goto out; @@ -2724,7 +2780,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } -do_balance: ret = insert_balance_item(fs_info->tree_root, bctl); if (ret && ret != -EEXIST) goto out; @@ -2967,7 +3022,7 @@ again: key.offset = (u64)-1; key.type = BTRFS_DEV_EXTENT_KEY; - while (1) { + do { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto done; @@ -3009,8 +3064,7 @@ again: goto done; if (ret == -ENOSPC) failed++; - key.offset -= 1; - } + } while (key.offset-- > 0); if (failed && !retried) { failed = 0; @@ -3128,11 +3182,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, int i; int j; - if ((type & BTRFS_BLOCK_GROUP_RAID1) && - (type & BTRFS_BLOCK_GROUP_DUP)) { - WARN_ON(1); - type &= ~BTRFS_BLOCK_GROUP_DUP; - } + BUG_ON(!alloc_profile_is_valid(type, 0)); if (list_empty(&fs_devices->alloc_list)) return -ENOSPC; @@ -3328,13 +3378,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); - BUG_ON(ret); free_extent_map(em); + if (ret) + goto error; ret = btrfs_make_block_group(trans, extent_root, 0, type, BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes); - BUG_ON(ret); + if (ret) + goto error; for (i = 0; i < map->num_stripes; ++i) { struct btrfs_device *device; @@ -3347,7 +3399,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, info->chunk_root->root_key.objectid, BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, dev_offset, stripe_size); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto error; + } } kfree(devices_info); @@ -3383,7 +3438,8 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, device = map->stripes[index].dev; device->bytes_used += stripe_size; ret = btrfs_update_device(trans, device); - BUG_ON(ret); + if (ret) + goto out_free; index++; } @@ -3420,16 +3476,19 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, key.offset = chunk_offset; ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); - BUG_ON(ret); - if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + /* + * TODO: Cleanup of inserted chunk root in case of + * failure. + */ ret = btrfs_add_system_chunk(chunk_root, &key, chunk, item_size); - BUG_ON(ret); } +out_free: kfree(chunk); - return 0; + return ret; } /* @@ -3461,7 +3520,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, chunk_size, stripe_size); - BUG_ON(ret); + if (ret) + return ret; return 0; } @@ -3493,7 +3553,8 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, &stripe_size, chunk_offset, alloc_profile); - BUG_ON(ret); + if (ret) + return ret; sys_chunk_offset = chunk_offset + chunk_size; @@ -3504,10 +3565,12 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, &sys_chunk_size, &sys_stripe_size, sys_chunk_offset, alloc_profile); - BUG_ON(ret); + if (ret) + goto abort; ret = btrfs_add_device(trans, fs_info->chunk_root, device); - BUG_ON(ret); + if (ret) + goto abort; /* * Modifying chunk tree needs allocating new blocks from both @@ -3517,13 +3580,20 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, */ ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, chunk_size, stripe_size); - BUG_ON(ret); + if (ret) + goto abort; ret = __finish_chunk_alloc(trans, extent_root, sys_map, sys_chunk_offset, sys_chunk_size, sys_stripe_size); - BUG_ON(ret); + if (ret) + goto abort; + return 0; + +abort: + btrfs_abort_transaction(trans, root, ret); + return ret; } int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) @@ -3874,7 +3944,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, do_div(length, map->num_stripes); buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); - BUG_ON(!buf); + BUG_ON(!buf); /* -ENOMEM */ for (i = 0; i < map->num_stripes; i++) { if (devid && map->stripes[i].dev->devid != devid) @@ -3967,7 +4037,7 @@ struct async_sched { * This will add one bio to the pending list for a device and make sure * the work struct is scheduled. */ -static noinline int schedule_bio(struct btrfs_root *root, +static noinline void schedule_bio(struct btrfs_root *root, struct btrfs_device *device, int rw, struct bio *bio) { @@ -3979,7 +4049,7 @@ static noinline int schedule_bio(struct btrfs_root *root, bio_get(bio); btrfsic_submit_bio(rw, bio); bio_put(bio); - return 0; + return; } /* @@ -4013,7 +4083,6 @@ static noinline int schedule_bio(struct btrfs_root *root, if (should_queue) btrfs_queue_worker(&root->fs_info->submit_workers, &device->work); - return 0; } int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, @@ -4036,7 +4105,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, mirror_num); - BUG_ON(ret); + if (ret) /* -ENOMEM */ + return ret; total_devs = bbio->num_stripes; if (map_length < length) { @@ -4055,7 +4125,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, while (dev_nr < total_devs) { if (dev_nr < total_devs - 1) { bio = bio_clone(first_bio, GFP_NOFS); - BUG_ON(!bio); + BUG_ON(!bio); /* -ENOMEM */ } else { bio = first_bio; } @@ -4209,13 +4279,13 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, write_lock(&map_tree->map_tree.lock); ret = add_extent_mapping(&map_tree->map_tree, em); write_unlock(&map_tree->map_tree.lock); - BUG_ON(ret); + BUG_ON(ret); /* Tree corruption */ free_extent_map(em); return 0; } -static int fill_device_from_item(struct extent_buffer *leaf, +static void fill_device_from_item(struct extent_buffer *leaf, struct btrfs_dev_item *dev_item, struct btrfs_device *device) { @@ -4232,8 +4302,6 @@ static int fill_device_from_item(struct extent_buffer *leaf, ptr = (unsigned long)btrfs_device_uuid(dev_item); read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); - - return 0; } static int open_seed_devices(struct btrfs_root *root, u8 *fsid) @@ -4384,7 +4452,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) * to silence the warning eg. on PowerPC 64. */ if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) - SetPageUptodate(sb->first_page); + SetPageUptodate(sb->pages[0]); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 19ac95048b8..bb6b03f97aa 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -260,12 +260,12 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); -int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); +void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); int btrfs_add_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); int btrfs_rm_device(struct btrfs_root *root, char *device_path); -int btrfs_cleanup_fs_uuids(void); +void btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); diff --git a/fs/buffer.c b/fs/buffer.c index 70e2017edd7..36d66653b93 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1384,10 +1384,23 @@ static void invalidate_bh_lru(void *arg) } put_cpu_var(bh_lrus); } + +static bool has_bh_in_lru(int cpu, void *dummy) +{ + struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu); + int i; + for (i = 0; i < BH_LRU_SIZE; i++) { + if (b->bhs[i]) + return 1; + } + + return 0; +} + void invalidate_bh_lrus(void) { - on_each_cpu(invalidate_bh_lru, NULL, 1); + on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL); } EXPORT_SYMBOL_GPL(invalidate_bh_lrus); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2c489378b4c..9fff9f3b17e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -677,18 +677,19 @@ static int fill_inode(struct inode *inode, case S_IFLNK: inode->i_op = &ceph_symlink_iops; if (!ci->i_symlink) { - int symlen = iinfo->symlink_len; + u32 symlen = iinfo->symlink_len; char *sym; - BUG_ON(symlen != inode->i_size); spin_unlock(&ci->i_ceph_lock); + err = -EINVAL; + if (WARN_ON(symlen != inode->i_size)) + goto out; + err = -ENOMEM; - sym = kmalloc(symlen+1, GFP_NOFS); + sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); if (!sym) goto out; - memcpy(sym, iinfo->symlink, symlen); - sym[symlen] = 0; spin_lock(&ci->i_ceph_lock); if (!ci->i_symlink) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 866e8d7ca37..89971e137aa 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -402,7 +402,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, spin_lock_init(&s->s_gen_ttl_lock); s->s_cap_gen = 0; - s->s_cap_ttl = 0; + s->s_cap_ttl = jiffies - 1; spin_lock_init(&s->s_cap_lock); s->s_renew_requested = 0; @@ -1083,8 +1083,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc, int wake = 0; spin_lock(&session->s_cap_lock); - was_stale = is_renew && (session->s_cap_ttl == 0 || - time_after_eq(jiffies, session->s_cap_ttl)); + was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl); session->s_cap_ttl = session->s_renew_requested + mdsc->mdsmap->m_session_timeout*HZ; @@ -2332,7 +2331,7 @@ static void handle_session(struct ceph_mds_session *session, session->s_mds); spin_lock(&session->s_gen_ttl_lock); session->s_cap_gen++; - session->s_cap_ttl = 0; + session->s_cap_ttl = jiffies - 1; spin_unlock(&session->s_gen_ttl_lock); send_renew_caps(mdsc, session); break; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index a559c80f127..f04c0961f99 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -331,7 +331,7 @@ static int build_snap_context(struct ceph_snap_realm *realm) /* alloc new snap context */ err = -ENOMEM; - if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc)) + if (num > (ULONG_MAX - sizeof(*snapc)) / sizeof(u64)) goto fail; snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS); if (!snapc) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 256f8522192..1e67dd7305a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -130,10 +130,12 @@ enum { Opt_nodirstat, Opt_rbytes, Opt_norbytes, + Opt_asyncreaddir, Opt_noasyncreaddir, Opt_dcache, Opt_nodcache, Opt_ino32, + Opt_noino32, }; static match_table_t fsopt_tokens = { @@ -153,10 +155,12 @@ static match_table_t fsopt_tokens = { {Opt_nodirstat, "nodirstat"}, {Opt_rbytes, "rbytes"}, {Opt_norbytes, "norbytes"}, + {Opt_asyncreaddir, "asyncreaddir"}, {Opt_noasyncreaddir, "noasyncreaddir"}, {Opt_dcache, "dcache"}, {Opt_nodcache, "nodcache"}, {Opt_ino32, "ino32"}, + {Opt_noino32, "noino32"}, {-1, NULL} }; @@ -232,6 +236,9 @@ static int parse_fsopt_token(char *c, void *private) case Opt_norbytes: fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; break; + case Opt_asyncreaddir: + fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; + break; case Opt_noasyncreaddir: fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; break; @@ -244,6 +251,9 @@ static int parse_fsopt_token(char *c, void *private) case Opt_ino32: fsopt->flags |= CEPH_MOUNT_OPT_INO32; break; + case Opt_noino32: + fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; + break; default: BUG_ON(token); } @@ -334,10 +344,12 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, *path += 2; dout("server path '%s'\n", *path); - err = ceph_parse_options(popt, options, dev_name, dev_name_end, + *popt = ceph_parse_options(options, dev_name, dev_name_end, parse_fsopt_token, (void *)fsopt); - if (err) + if (IS_ERR(*popt)) { + err = PTR_ERR(*popt); goto out; + } /* success */ *pfsopt = fsopt; @@ -926,6 +938,7 @@ static int __init init_ceph(void) if (ret) goto out; + ceph_xattr_init(); ret = register_filesystem(&ceph_fs_type); if (ret) goto out_icache; @@ -935,6 +948,7 @@ static int __init init_ceph(void) return 0; out_icache: + ceph_xattr_exit(); destroy_caches(); out: return ret; @@ -944,6 +958,7 @@ static void __exit exit_ceph(void) { dout("exit_ceph\n"); unregister_filesystem(&ceph_fs_type); + ceph_xattr_exit(); destroy_caches(); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 1421f3d875a..fc35036d258 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -367,7 +367,7 @@ static inline u32 ceph_ino_to_ino32(__u64 vino) u32 ino = vino & 0xffffffff; ino ^= vino >> 32; if (!ino) - ino = 1; + ino = 2; return ino; } @@ -733,6 +733,8 @@ extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); extern int ceph_removexattr(struct dentry *, const char *); extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci); extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); +extern void __init ceph_xattr_init(void); +extern void ceph_xattr_exit(void); /* caps.c */ extern const char *ceph_cap_string(int c); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index a76f697303d..35b86331d8a 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -8,9 +8,12 @@ #include <linux/xattr.h> #include <linux/slab.h> +#define XATTR_CEPH_PREFIX "ceph." +#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) + static bool ceph_is_valid_xattr(const char *name) { - return !strncmp(name, "ceph.", 5) || + return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || @@ -21,79 +24,91 @@ static bool ceph_is_valid_xattr(const char *name) * These define virtual xattrs exposing the recursive directory * statistics and layout metadata. */ -struct ceph_vxattr_cb { - bool readonly; +struct ceph_vxattr { char *name; + size_t name_size; /* strlen(name) + 1 (for '\0') */ size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, size_t size); + bool readonly; }; /* directories */ -static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs); } -static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_files); } -static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_subdirs); } -static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs); } -static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rfiles); } -static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rsubdirs); } -static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rbytes); } -static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec, + return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec, (long)ci->i_rctime.tv_nsec); } -static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { - { true, "ceph.dir.entries", ceph_vxattrcb_entries}, - { true, "ceph.dir.files", ceph_vxattrcb_files}, - { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs}, - { true, "ceph.dir.rentries", ceph_vxattrcb_rentries}, - { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles}, - { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, - { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes}, - { true, "ceph.dir.rctime", ceph_vxattrcb_rctime}, - { true, NULL, NULL } +#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name + +#define XATTR_NAME_CEPH(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .readonly = true, \ + } + +static struct ceph_vxattr ceph_dir_vxattrs[] = { + XATTR_NAME_CEPH(dir, entries), + XATTR_NAME_CEPH(dir, files), + XATTR_NAME_CEPH(dir, subdirs), + XATTR_NAME_CEPH(dir, rentries), + XATTR_NAME_CEPH(dir, rfiles), + XATTR_NAME_CEPH(dir, rsubdirs), + XATTR_NAME_CEPH(dir, rbytes), + XATTR_NAME_CEPH(dir, rctime), + { 0 } /* Required table terminator */ }; +static size_t ceph_dir_vxattrs_name_size; /* total size of all names */ /* files */ -static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val, size_t size) { int ret; @@ -103,21 +118,32 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, (unsigned long long)ceph_file_layout_su(ci->i_layout), (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); - if (ceph_file_layout_pg_preferred(ci->i_layout)) - ret += snprintf(val + ret, size, "preferred_osd=%lld\n", + + if (ceph_file_layout_pg_preferred(ci->i_layout) >= 0) { + val += ret; + size -= ret; + ret += snprintf(val, size, "preferred_osd=%lld\n", (unsigned long long)ceph_file_layout_pg_preferred( ci->i_layout)); + } + return ret; } -static struct ceph_vxattr_cb ceph_file_vxattrs[] = { - { true, "ceph.file.layout", ceph_vxattrcb_layout}, +static struct ceph_vxattr ceph_file_vxattrs[] = { + XATTR_NAME_CEPH(file, layout), /* The following extended attribute name is deprecated */ - { true, "ceph.layout", ceph_vxattrcb_layout}, - { true, NULL, NULL } + { + .name = XATTR_CEPH_PREFIX "layout", + .name_size = sizeof (XATTR_CEPH_PREFIX "layout"), + .getxattr_cb = ceph_vxattrcb_file_layout, + .readonly = true, + }, + { 0 } /* Required table terminator */ }; +static size_t ceph_file_vxattrs_name_size; /* total size of all names */ -static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) +static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode) { if (S_ISDIR(inode->i_mode)) return ceph_dir_vxattrs; @@ -126,14 +152,59 @@ static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) return NULL; } -static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr, +static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs) +{ + if (vxattrs == ceph_dir_vxattrs) + return ceph_dir_vxattrs_name_size; + if (vxattrs == ceph_file_vxattrs) + return ceph_file_vxattrs_name_size; + BUG(); + + return 0; +} + +/* + * Compute the aggregate size (including terminating '\0') of all + * virtual extended attribute names in the given vxattr table. + */ +static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs) +{ + struct ceph_vxattr *vxattr; + size_t size = 0; + + for (vxattr = vxattrs; vxattr->name; vxattr++) + size += vxattr->name_size; + + return size; +} + +/* Routines called at initialization and exit time */ + +void __init ceph_xattr_init(void) +{ + ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs); + ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs); +} + +void ceph_xattr_exit(void) +{ + ceph_dir_vxattrs_name_size = 0; + ceph_file_vxattrs_name_size = 0; +} + +static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, const char *name) { - do { - if (strcmp(vxattr->name, name) == 0) - return vxattr; - vxattr++; - } while (vxattr->name); + struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode); + + if (vxattr) { + while (vxattr->name) { + if (!strcmp(vxattr->name, name)) + return vxattr; + vxattr++; + } + } + return NULL; } @@ -502,17 +573,15 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, { struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); int err; struct ceph_inode_xattr *xattr; - struct ceph_vxattr_cb *vxattr = NULL; + struct ceph_vxattr *vxattr = NULL; if (!ceph_is_valid_xattr(name)) return -ENODATA; /* let's see if a virtual xattr was requested */ - if (vxattrs) - vxattr = ceph_match_vxattr(vxattrs, name); + vxattr = ceph_match_vxattr(inode, name); spin_lock(&ci->i_ceph_lock); dout("getxattr %p ver=%lld index_ver=%lld\n", inode, @@ -568,7 +637,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); u32 vir_namelen = 0; u32 namelen; int err; @@ -596,11 +665,12 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) goto out; list_xattr: - vir_namelen = 0; - /* include virtual dir xattrs */ - if (vxattrs) - for (i = 0; vxattrs[i].name; i++) - vir_namelen += strlen(vxattrs[i].name) + 1; + /* + * Start with virtual dir xattr names (if any) (including + * terminating '\0' characters for each). + */ + vir_namelen = ceph_vxattrs_name_size(vxattrs); + /* adding 1 byte per each variable due to the null termination */ namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; err = -ERANGE; @@ -698,17 +768,17 @@ int ceph_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; + struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + int issued; int err; + int dirty; int name_len = strlen(name); int val_len = size; char *newname = NULL; char *newval = NULL; struct ceph_inode_xattr *xattr = NULL; - int issued; int required_blob_size; - int dirty; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; @@ -716,12 +786,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name, if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; - if (vxattrs) { - struct ceph_vxattr_cb *vxattr = - ceph_match_vxattr(vxattrs, name); - if (vxattr && vxattr->readonly) - return -EOPNOTSUPP; - } + vxattr = ceph_match_vxattr(inode, name); + if (vxattr && vxattr->readonly) + return -EOPNOTSUPP; /* preallocate memory for xattr name, value, index node */ err = -ENOMEM; @@ -730,11 +797,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name, goto out; if (val_len) { - newval = kmalloc(val_len + 1, GFP_NOFS); + newval = kmemdup(value, val_len, GFP_NOFS); if (!newval) goto out; - memcpy(newval, value, val_len); - newval[val_len] = '\0'; } xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS); @@ -744,6 +809,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name, spin_lock(&ci->i_ceph_lock); retry: issued = __ceph_caps_issued(ci, NULL); + dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); if (!(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; __build_xattrs(inode); @@ -752,7 +818,7 @@ retry: if (!ci->i_xattrs.prealloc_blob || required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { - struct ceph_buffer *blob = NULL; + struct ceph_buffer *blob; spin_unlock(&ci->i_ceph_lock); dout(" preaallocating new blob size=%d\n", required_blob_size); @@ -766,12 +832,13 @@ retry: goto retry; } - dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); err = __set_xattr(ci, newname, name_len, newval, val_len, 1, 1, 1, &xattr); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; + spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); @@ -816,8 +883,8 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) int ceph_removexattr(struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; + struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); int issued; int err; int required_blob_size; @@ -829,22 +896,19 @@ int ceph_removexattr(struct dentry *dentry, const char *name) if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; - if (vxattrs) { - struct ceph_vxattr_cb *vxattr = - ceph_match_vxattr(vxattrs, name); - if (vxattr && vxattr->readonly) - return -EOPNOTSUPP; - } + vxattr = ceph_match_vxattr(inode, name); + if (vxattr && vxattr->readonly) + return -EOPNOTSUPP; err = -ENOMEM; spin_lock(&ci->i_ceph_lock); - __build_xattrs(inode); retry: issued = __ceph_caps_issued(ci, NULL); dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); if (!(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; + __build_xattrs(inode); required_blob_size = __get_required_blob_size(ci, 0, 0); @@ -865,10 +929,10 @@ retry: } err = __remove_xattr_by_name(ceph_inode(inode), name); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; - spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 573b899b5a5..27046462941 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -58,15 +58,16 @@ cifs_dump_mem(char *label, void *data, int length) } #ifdef CONFIG_CIFS_DEBUG2 -void cifs_dump_detail(struct smb_hdr *smb) +void cifs_dump_detail(void *buf) { + struct smb_hdr *smb = (struct smb_hdr *)buf; + cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d", smb->Command, smb->Status.CifsError, smb->Flags, smb->Flags2, smb->Mid, smb->Pid); cERROR(1, "smb buf %p len %d", smb, smbCalcSize(smb)); } - void cifs_dump_mids(struct TCP_Server_Info *server) { struct list_head *tmp; @@ -79,15 +80,15 @@ void cifs_dump_mids(struct TCP_Server_Info *server) spin_lock(&GlobalMid_Lock); list_for_each(tmp, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %d", - mid_entry->midState, - (int)mid_entry->command, + cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu", + mid_entry->mid_state, + le16_to_cpu(mid_entry->command), mid_entry->pid, mid_entry->callback_data, mid_entry->mid); #ifdef CONFIG_CIFS_STATS2 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld", - mid_entry->largeBuf, + mid_entry->large_buf, mid_entry->resp_buf, mid_entry->when_received, jiffies); @@ -217,12 +218,12 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) mid_entry = list_entry(tmp3, struct mid_q_entry, qhead); seq_printf(m, "\tState: %d com: %d pid:" - " %d cbdata: %p mid %d\n", - mid_entry->midState, - (int)mid_entry->command, - mid_entry->pid, - mid_entry->callback_data, - mid_entry->mid); + " %d cbdata: %p mid %llu\n", + mid_entry->mid_state, + le16_to_cpu(mid_entry->command), + mid_entry->pid, + mid_entry->callback_data, + mid_entry->mid); } spin_unlock(&GlobalMid_Lock); } @@ -417,7 +418,6 @@ static const struct file_operations cifs_stats_proc_fops = { static struct proc_dir_entry *proc_fs_cifs; static const struct file_operations cifsFYI_proc_fops; -static const struct file_operations cifs_oplock_proc_fops; static const struct file_operations cifs_lookup_cache_proc_fops; static const struct file_operations traceSMB_proc_fops; static const struct file_operations cifs_multiuser_mount_proc_fops; @@ -438,7 +438,6 @@ cifs_proc_init(void) #endif /* STATS */ proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops); proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops); - proc_create("OplockEnabled", 0, proc_fs_cifs, &cifs_oplock_proc_fops); proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs, &cifs_linux_ext_proc_fops); proc_create("MultiuserMount", 0, proc_fs_cifs, @@ -462,7 +461,6 @@ cifs_proc_clean(void) remove_proc_entry("Stats", proc_fs_cifs); #endif remove_proc_entry("MultiuserMount", proc_fs_cifs); - remove_proc_entry("OplockEnabled", proc_fs_cifs); remove_proc_entry("SecurityFlags", proc_fs_cifs); remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); @@ -508,46 +506,6 @@ static const struct file_operations cifsFYI_proc_fops = { .write = cifsFYI_proc_write, }; -static int cifs_oplock_proc_show(struct seq_file *m, void *v) -{ - seq_printf(m, "%d\n", enable_oplocks); - return 0; -} - -static int cifs_oplock_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, cifs_oplock_proc_show, NULL); -} - -static ssize_t cifs_oplock_proc_write(struct file *file, - const char __user *buffer, size_t count, loff_t *ppos) -{ - char c; - int rc; - - printk(KERN_WARNING "CIFS: The /proc/fs/cifs/OplockEnabled interface " - "will be removed in kernel version 3.4. Please migrate to " - "using the 'enable_oplocks' module parameter in cifs.ko.\n"); - rc = get_user(c, buffer); - if (rc) - return rc; - if (c == '0' || c == 'n' || c == 'N') - enable_oplocks = false; - else if (c == '1' || c == 'y' || c == 'Y') - enable_oplocks = true; - - return count; -} - -static const struct file_operations cifs_oplock_proc_fops = { - .owner = THIS_MODULE, - .open = cifs_oplock_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifs_oplock_proc_write, -}; - static int cifs_linux_ext_proc_show(struct seq_file *m, void *v) { seq_printf(m, "%d\n", linuxExtEnabled); diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index 8942b28cf80..566e0ae8dc2 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -26,13 +26,13 @@ void cifs_dump_mem(char *label, void *data, int length); #ifdef CONFIG_CIFS_DEBUG2 #define DBG2 2 -void cifs_dump_detail(struct smb_hdr *); +void cifs_dump_detail(void *); void cifs_dump_mids(struct TCP_Server_Info *); #else #define DBG2 0 #endif extern int traceSMB; /* flag which enables the function below */ -void dump_smb(struct smb_hdr *, int); +void dump_smb(void *, int); #define CIFS_INFO 0x01 #define CIFS_RC 0x02 #define CIFS_TIMER 0x04 diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index eee522c56ef..d3421282244 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -85,6 +85,8 @@ extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; +struct workqueue_struct *cifsiod_wq; + static int cifs_read_super(struct super_block *sb) { @@ -1111,9 +1113,15 @@ init_cifs(void) cFYI(1, "cifs_max_pending set to max of %u", CIFS_MAX_REQ); } + cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + if (!cifsiod_wq) { + rc = -ENOMEM; + goto out_clean_proc; + } + rc = cifs_fscache_register(); if (rc) - goto out_clean_proc; + goto out_destroy_wq; rc = cifs_init_inodecache(); if (rc) @@ -1161,6 +1169,8 @@ out_destroy_inodecache: cifs_destroy_inodecache(); out_unreg_fscache: cifs_fscache_unregister(); +out_destroy_wq: + destroy_workqueue(cifsiod_wq); out_clean_proc: cifs_proc_clean(); return rc; @@ -1183,6 +1193,7 @@ exit_cifs(void) cifs_destroy_mids(); cifs_destroy_inodecache(); cifs_fscache_unregister(); + destroy_workqueue(cifsiod_wq); cifs_proc_clean(); } diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index fe5ecf1b422..d1389bb33ce 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -125,5 +125,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "1.76" +#define CIFS_VERSION "1.77" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 339ebe3ebc0..4ff6313f0a9 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -230,6 +230,12 @@ struct cifs_mnt_data { int flags; }; +static inline unsigned int +get_rfc1002_length(void *buf) +{ + return be32_to_cpu(*((__be32 *)buf)); +} + struct TCP_Server_Info { struct list_head tcp_ses_list; struct list_head smb_ses_list; @@ -276,7 +282,7 @@ struct TCP_Server_Info { vcnumbers */ int capabilities; /* allow selective disabling of caps by smb sess */ int timeAdj; /* Adjust for difference in server time zone in sec */ - __u16 CurrentMid; /* multiplex id - rotating counter */ + __u64 CurrentMid; /* multiplex id - rotating counter */ char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ /* 16th byte of RFC1001 workstation name is always null */ char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; @@ -335,6 +341,18 @@ has_credits(struct TCP_Server_Info *server, int *credits) return num > 0; } +static inline size_t +header_size(void) +{ + return sizeof(struct smb_hdr); +} + +static inline size_t +max_header_size(void) +{ + return MAX_CIFS_HDR_SIZE; +} + /* * Macros to allow the TCP_Server_Info->net field and related code to drop out * when CONFIG_NET_NS isn't set. @@ -583,9 +601,11 @@ struct cifs_io_parms { * Take a reference on the file private data. Must be called with * cifs_file_list_lock held. */ -static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file) +static inline +struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file) { ++cifs_file->count; + return cifs_file; } void cifsFileInfo_put(struct cifsFileInfo *cifs_file); @@ -606,7 +626,7 @@ struct cifsInodeInfo { bool delete_pending; /* DELETE_ON_CLOSE is set */ bool invalid_mapping; /* pagecache is invalid */ unsigned long time; /* jiffies of last update of inode */ - u64 server_eof; /* current file size on server */ + u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ u64 createtime; /* creation time on server */ #ifdef CONFIG_CIFS_FSCACHE @@ -713,8 +733,8 @@ typedef void (mid_callback_t)(struct mid_q_entry *mid); /* one of these for every pending CIFS request to the server */ struct mid_q_entry { struct list_head qhead; /* mids waiting on reply from this server */ - __u16 mid; /* multiplex id */ - __u16 pid; /* process id */ + __u64 mid; /* multiplex id */ + __u32 pid; /* process id */ __u32 sequence_number; /* for CIFS signing */ unsigned long when_alloc; /* when mid was created */ #ifdef CONFIG_CIFS_STATS2 @@ -724,10 +744,10 @@ struct mid_q_entry { mid_receive_t *receive; /* call receive callback */ mid_callback_t *callback; /* call completion callback */ void *callback_data; /* general purpose pointer for callback */ - struct smb_hdr *resp_buf; /* pointer to received SMB header */ - int midState; /* wish this were enum but can not pass to wait_event */ - __u8 command; /* smb command code */ - bool largeBuf:1; /* if valid response, is pointer to large buf */ + void *resp_buf; /* pointer to received SMB header */ + int mid_state; /* wish this were enum but can not pass to wait_event */ + __le16 command; /* smb command code */ + bool large_buf:1; /* if valid response, is pointer to large buf */ bool multiRsp:1; /* multiple trans2 responses for one request */ bool multiEnd:1; /* both received */ }; @@ -1052,5 +1072,6 @@ GLOBAL_EXTERN spinlock_t gidsidlock; void cifs_oplock_break(struct work_struct *work); extern const struct slow_work_ops cifs_oplock_break_ops; +extern struct workqueue_struct *cifsiod_wq; #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 503e73d8bdb..96192c1e380 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -77,7 +77,7 @@ extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *, struct smb_hdr * /* out */ , int * /* bytes returned */ , const int long_op); extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses, - struct smb_hdr *in_buf, int flags); + char *in_buf, int flags); extern int cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error); extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, @@ -91,9 +91,8 @@ extern int SendReceiveBlockingLock(const unsigned int xid, extern void cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add); extern void cifs_set_credits(struct TCP_Server_Info *server, const int val); -extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length); -extern bool is_valid_oplock_break(struct smb_hdr *smb, - struct TCP_Server_Info *); +extern int checkSMB(char *buf, unsigned int length); +extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *); extern bool backup_cred(struct cifs_sb_info *); extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, @@ -107,7 +106,7 @@ extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port); extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, const unsigned short int port); -extern int map_smb_to_linux_error(struct smb_hdr *smb, bool logErr); +extern int map_smb_to_linux_error(char *buf, bool logErr); extern void header_assemble(struct smb_hdr *, char /* command */ , const struct cifs_tcon *, int /* length of fixed section (word count) in two byte units */); @@ -116,7 +115,7 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct, void **request_buf); extern int CIFS_SessSetup(unsigned int xid, struct cifs_ses *ses, const struct nls_table *nls_cp); -extern __u16 GetNextMid(struct TCP_Server_Info *server); +extern __u64 GetNextMid(struct TCP_Server_Info *server); extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, @@ -484,18 +483,25 @@ int cifs_async_readv(struct cifs_readdata *rdata); /* asynchronous write support */ struct cifs_writedata { struct kref refcount; + struct list_head list; + struct completion done; enum writeback_sync_modes sync_mode; struct work_struct work; struct cifsFileInfo *cfile; __u64 offset; + pid_t pid; unsigned int bytes; int result; + void (*marshal_iov) (struct kvec *iov, + struct cifs_writedata *wdata); unsigned int nr_pages; struct page *pages[1]; }; int cifs_async_writev(struct cifs_writedata *wdata); -struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages); +void cifs_writev_complete(struct work_struct *work); +struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages, + work_func_t complete); void cifs_writedata_release(struct kref *refcount); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 70aac35c398..8fecc99be34 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -696,7 +696,7 @@ CIFSSMBTDis(const int xid, struct cifs_tcon *tcon) if (rc) return rc; - rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0); + rc = SendReceiveNoRsp(xid, tcon->ses, (char *)smb_buffer, 0); if (rc) cFYI(1, "Tree disconnect failed %d", rc); @@ -792,7 +792,7 @@ CIFSSMBLogoff(const int xid, struct cifs_ses *ses) pSMB->hdr.Uid = ses->Suid; pSMB->AndXCommand = 0xFF; - rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0); + rc = SendReceiveNoRsp(xid, ses, (char *) pSMB, 0); session_already_dead: mutex_unlock(&ses->session_mutex); @@ -1414,8 +1414,7 @@ cifs_readdata_free(struct cifs_readdata *rdata) static int cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) { - READ_RSP *rsp = (READ_RSP *)server->smallbuf; - unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length); + unsigned int rfclen = get_rfc1002_length(server->smallbuf); int remaining = rfclen + 4 - server->total_read; struct cifs_readdata *rdata = mid->callback_data; @@ -1424,7 +1423,7 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) length = cifs_read_from_socket(server, server->bigbuf, min_t(unsigned int, remaining, - CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)); + CIFSMaxBufSize + max_header_size())); if (length < 0) return length; server->total_read += length; @@ -1435,19 +1434,40 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) return 0; } +static inline size_t +read_rsp_size(void) +{ + return sizeof(READ_RSP); +} + +static inline unsigned int +read_data_offset(char *buf) +{ + READ_RSP *rsp = (READ_RSP *)buf; + return le16_to_cpu(rsp->DataOffset); +} + +static inline unsigned int +read_data_length(char *buf) +{ + READ_RSP *rsp = (READ_RSP *)buf; + return (le16_to_cpu(rsp->DataLengthHigh) << 16) + + le16_to_cpu(rsp->DataLength); +} + static int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) { int length, len; unsigned int data_offset, remaining, data_len; struct cifs_readdata *rdata = mid->callback_data; - READ_RSP *rsp = (READ_RSP *)server->smallbuf; - unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length) + 4; + char *buf = server->smallbuf; + unsigned int buflen = get_rfc1002_length(buf) + 4; u64 eof; pgoff_t eof_index; struct page *page, *tpage; - cFYI(1, "%s: mid=%u offset=%llu bytes=%u", __func__, + cFYI(1, "%s: mid=%llu offset=%llu bytes=%u", __func__, mid->mid, rdata->offset, rdata->bytes); /* @@ -1455,10 +1475,9 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) * can if there's not enough data. At this point, we've read down to * the Mid. */ - len = min_t(unsigned int, rfclen, sizeof(*rsp)) - - sizeof(struct smb_hdr) + 1; + len = min_t(unsigned int, buflen, read_rsp_size()) - header_size() + 1; - rdata->iov[0].iov_base = server->smallbuf + sizeof(struct smb_hdr) - 1; + rdata->iov[0].iov_base = buf + header_size() - 1; rdata->iov[0].iov_len = len; length = cifs_readv_from_socket(server, rdata->iov, 1, len); @@ -1467,7 +1486,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) server->total_read += length; /* Was the SMB read successful? */ - rdata->result = map_smb_to_linux_error(&rsp->hdr, false); + rdata->result = map_smb_to_linux_error(buf, false); if (rdata->result != 0) { cFYI(1, "%s: server returned error %d", __func__, rdata->result); @@ -1475,14 +1494,14 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) } /* Is there enough to get to the rest of the READ_RSP header? */ - if (server->total_read < sizeof(READ_RSP)) { + if (server->total_read < read_rsp_size()) { cFYI(1, "%s: server returned short header. got=%u expected=%zu", - __func__, server->total_read, sizeof(READ_RSP)); + __func__, server->total_read, read_rsp_size()); rdata->result = -EIO; return cifs_readv_discard(server, mid); } - data_offset = le16_to_cpu(rsp->DataOffset) + 4; + data_offset = read_data_offset(buf) + 4; if (data_offset < server->total_read) { /* * win2k8 sometimes sends an offset of 0 when the read @@ -1506,7 +1525,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) len = data_offset - server->total_read; if (len > 0) { /* read any junk before data into the rest of smallbuf */ - rdata->iov[0].iov_base = server->smallbuf + server->total_read; + rdata->iov[0].iov_base = buf + server->total_read; rdata->iov[0].iov_len = len; length = cifs_readv_from_socket(server, rdata->iov, 1, len); if (length < 0) @@ -1515,15 +1534,14 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) } /* set up first iov for signature check */ - rdata->iov[0].iov_base = server->smallbuf; + rdata->iov[0].iov_base = buf; rdata->iov[0].iov_len = server->total_read; cFYI(1, "0: iov_base=%p iov_len=%zu", rdata->iov[0].iov_base, rdata->iov[0].iov_len); /* how much data is in the response? */ - data_len = le16_to_cpu(rsp->DataLengthHigh) << 16; - data_len += le16_to_cpu(rsp->DataLength); - if (data_offset + data_len > rfclen) { + data_len = read_data_length(buf); + if (data_offset + data_len > buflen) { /* data_len is corrupt -- discard frame */ rdata->result = -EIO; return cifs_readv_discard(server, mid); @@ -1602,11 +1620,11 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) rdata->bytes = length; - cFYI(1, "total_read=%u rfclen=%u remaining=%u", server->total_read, - rfclen, remaining); + cFYI(1, "total_read=%u buflen=%u remaining=%u", server->total_read, + buflen, remaining); /* discard anything left over */ - if (server->total_read < rfclen) + if (server->total_read < buflen) return cifs_readv_discard(server, mid); dequeue_mid(mid, false); @@ -1647,10 +1665,10 @@ cifs_readv_callback(struct mid_q_entry *mid) struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; - cFYI(1, "%s: mid=%u state=%d result=%d bytes=%u", __func__, - mid->mid, mid->midState, rdata->result, rdata->bytes); + cFYI(1, "%s: mid=%llu state=%d result=%d bytes=%u", __func__, + mid->mid, mid->mid_state, rdata->result, rdata->bytes); - switch (mid->midState) { + switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: /* result already set, check signature */ if (server->sec_mode & @@ -1671,7 +1689,7 @@ cifs_readv_callback(struct mid_q_entry *mid) rdata->result = -EIO; } - queue_work(system_nrt_wq, &rdata->work); + queue_work(cifsiod_wq, &rdata->work); DeleteMidQEntry(mid); cifs_add_credits(server, 1); } @@ -2017,7 +2035,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata) kref_put(&wdata->refcount, cifs_writedata_release); } -static void +void cifs_writev_complete(struct work_struct *work) { struct cifs_writedata *wdata = container_of(work, @@ -2026,7 +2044,9 @@ cifs_writev_complete(struct work_struct *work) int i = 0; if (wdata->result == 0) { + spin_lock(&inode->i_lock); cifs_update_eof(CIFS_I(inode), wdata->offset, wdata->bytes); + spin_unlock(&inode->i_lock); cifs_stats_bytes_written(tlink_tcon(wdata->cfile->tlink), wdata->bytes); } else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN) @@ -2047,7 +2067,7 @@ cifs_writev_complete(struct work_struct *work) } struct cifs_writedata * -cifs_writedata_alloc(unsigned int nr_pages) +cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete) { struct cifs_writedata *wdata; @@ -2061,14 +2081,16 @@ cifs_writedata_alloc(unsigned int nr_pages) wdata = kzalloc(sizeof(*wdata) + sizeof(struct page *) * (nr_pages - 1), GFP_NOFS); if (wdata != NULL) { - INIT_WORK(&wdata->work, cifs_writev_complete); kref_init(&wdata->refcount); + INIT_LIST_HEAD(&wdata->list); + init_completion(&wdata->done); + INIT_WORK(&wdata->work, complete); } return wdata; } /* - * Check the midState and signature on received buffer (if any), and queue the + * Check the mid_state and signature on received buffer (if any), and queue the * workqueue completion task. */ static void @@ -2079,7 +2101,7 @@ cifs_writev_callback(struct mid_q_entry *mid) unsigned int written; WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf; - switch (mid->midState) { + switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: wdata->result = cifs_check_receive(mid, tcon->ses->server, 0); if (wdata->result != 0) @@ -2111,7 +2133,7 @@ cifs_writev_callback(struct mid_q_entry *mid) break; } - queue_work(system_nrt_wq, &wdata->work); + queue_work(cifsiod_wq, &wdata->work); DeleteMidQEntry(mid); cifs_add_credits(tcon->ses->server, 1); } @@ -2124,7 +2146,6 @@ cifs_async_writev(struct cifs_writedata *wdata) WRITE_REQ *smb = NULL; int wct; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); - struct inode *inode = wdata->cfile->dentry->d_inode; struct kvec *iov = NULL; if (tcon->ses->capabilities & CAP_LARGE_FILES) { @@ -2148,8 +2169,8 @@ cifs_async_writev(struct cifs_writedata *wdata) goto async_writev_out; } - smb->hdr.Pid = cpu_to_le16((__u16)wdata->cfile->pid); - smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->cfile->pid >> 16)); + smb->hdr.Pid = cpu_to_le16((__u16)wdata->pid); + smb->hdr.PidHigh = cpu_to_le16((__u16)(wdata->pid >> 16)); smb->AndXCommand = 0xFF; /* none */ smb->Fid = wdata->cfile->netfid; @@ -2167,15 +2188,13 @@ cifs_async_writev(struct cifs_writedata *wdata) iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4 + 1; iov[0].iov_base = smb; - /* marshal up the pages into iov array */ - wdata->bytes = 0; - for (i = 0; i < wdata->nr_pages; i++) { - iov[i + 1].iov_len = min(inode->i_size - - page_offset(wdata->pages[i]), - (loff_t)PAGE_CACHE_SIZE); - iov[i + 1].iov_base = kmap(wdata->pages[i]); - wdata->bytes += iov[i + 1].iov_len; - } + /* + * This function should marshal up the page array into the kvec + * array, reserving [0] for the header. It should kmap the pages + * and set the iov_len properly for each one. It may also set + * wdata->bytes too. + */ + wdata->marshal_iov(iov, wdata); cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes); @@ -2420,8 +2439,7 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon, (struct smb_hdr *) pSMB, &bytes_returned); cifs_small_buf_release(pSMB); } else { - rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *)pSMB, - timeout); + rc = SendReceiveNoRsp(xid, tcon->ses, (char *)pSMB, timeout); /* SMB buffer freed by function above */ } cifs_stats_inc(&tcon->num_locks); @@ -2588,7 +2606,7 @@ CIFSSMBClose(const int xid, struct cifs_tcon *tcon, int smb_file_id) pSMB->FileID = (__u16) smb_file_id; pSMB->LastWriteTime = 0xFFFFFFFF; pSMB->ByteCount = 0; - rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); cifs_stats_inc(&tcon->num_closes); if (rc) { if (rc != -EINTR) { @@ -2617,7 +2635,7 @@ CIFSSMBFlush(const int xid, struct cifs_tcon *tcon, int smb_file_id) pSMB->FileID = (__u16) smb_file_id; pSMB->ByteCount = 0; - rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); cifs_stats_inc(&tcon->num_flushes); if (rc) cERROR(1, "Send error in Flush = %d", rc); @@ -4625,7 +4643,7 @@ CIFSFindClose(const int xid, struct cifs_tcon *tcon, pSMB->FileID = searchHandle; pSMB->ByteCount = 0; - rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); if (rc) cERROR(1, "Send error in FindClose = %d", rc); @@ -5646,7 +5664,7 @@ CIFSSMBSetFileSize(const int xid, struct cifs_tcon *tcon, __u64 size, pSMB->Reserved4 = 0; inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); - rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); if (rc) { cFYI(1, "Send error in SetFileInfo (SetFileSize) = %d", rc); } @@ -5715,7 +5733,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifs_tcon *tcon, inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); memcpy(data_offset, data, sizeof(FILE_BASIC_INFO)); - rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); if (rc) cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc); @@ -5774,7 +5792,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifs_tcon *tcon, inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); *data_offset = delete_file ? 1 : 0; - rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); if (rc) cFYI(1, "Send error in SetFileDisposition = %d", rc); @@ -6006,7 +6024,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifs_tcon *tcon, cifs_fill_unix_set_info(data_offset, args); - rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); if (rc) cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5560e1d5e54..302a15c505a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -40,6 +40,8 @@ #include <linux/module.h> #include <keys/user-type.h> #include <net/ipv6.h> +#include <linux/parser.h> + #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" @@ -63,6 +65,193 @@ extern mempool_t *cifs_req_poolp; #define TLINK_ERROR_EXPIRE (1 * HZ) #define TLINK_IDLE_EXPIRE (600 * HZ) +enum { + + /* Mount options that take no arguments */ + Opt_user_xattr, Opt_nouser_xattr, + Opt_forceuid, Opt_noforceuid, + Opt_noblocksend, Opt_noautotune, + Opt_hard, Opt_soft, Opt_perm, Opt_noperm, + Opt_mapchars, Opt_nomapchars, Opt_sfu, + Opt_nosfu, Opt_nodfs, Opt_posixpaths, + Opt_noposixpaths, Opt_nounix, + Opt_nocase, + Opt_brl, Opt_nobrl, + Opt_forcemandatorylock, Opt_setuids, + Opt_nosetuids, Opt_dynperm, Opt_nodynperm, + Opt_nohard, Opt_nosoft, + Opt_nointr, Opt_intr, + Opt_nostrictsync, Opt_strictsync, + Opt_serverino, Opt_noserverino, + Opt_rwpidforward, Opt_cifsacl, Opt_nocifsacl, + Opt_acl, Opt_noacl, Opt_locallease, + Opt_sign, Opt_seal, Opt_direct, + Opt_strictcache, Opt_noac, + Opt_fsc, Opt_mfsymlinks, + Opt_multiuser, Opt_sloppy, + + /* Mount options which take numeric value */ + Opt_backupuid, Opt_backupgid, Opt_uid, + Opt_cruid, Opt_gid, Opt_file_mode, + Opt_dirmode, Opt_port, + Opt_rsize, Opt_wsize, Opt_actimeo, + + /* Mount options which take string value */ + Opt_user, Opt_pass, Opt_ip, + Opt_unc, Opt_domain, + Opt_srcaddr, Opt_prefixpath, + Opt_iocharset, Opt_sockopt, + Opt_netbiosname, Opt_servern, + Opt_ver, Opt_sec, + + /* Mount options to be ignored */ + Opt_ignore, + + /* Options which could be blank */ + Opt_blank_pass, + + Opt_err +}; + +static const match_table_t cifs_mount_option_tokens = { + + { Opt_user_xattr, "user_xattr" }, + { Opt_nouser_xattr, "nouser_xattr" }, + { Opt_forceuid, "forceuid" }, + { Opt_noforceuid, "noforceuid" }, + { Opt_noblocksend, "noblocksend" }, + { Opt_noautotune, "noautotune" }, + { Opt_hard, "hard" }, + { Opt_soft, "soft" }, + { Opt_perm, "perm" }, + { Opt_noperm, "noperm" }, + { Opt_mapchars, "mapchars" }, + { Opt_nomapchars, "nomapchars" }, + { Opt_sfu, "sfu" }, + { Opt_nosfu, "nosfu" }, + { Opt_nodfs, "nodfs" }, + { Opt_posixpaths, "posixpaths" }, + { Opt_noposixpaths, "noposixpaths" }, + { Opt_nounix, "nounix" }, + { Opt_nounix, "nolinux" }, + { Opt_nocase, "nocase" }, + { Opt_nocase, "ignorecase" }, + { Opt_brl, "brl" }, + { Opt_nobrl, "nobrl" }, + { Opt_nobrl, "nolock" }, + { Opt_forcemandatorylock, "forcemandatorylock" }, + { Opt_forcemandatorylock, "forcemand" }, + { Opt_setuids, "setuids" }, + { Opt_nosetuids, "nosetuids" }, + { Opt_dynperm, "dynperm" }, + { Opt_nodynperm, "nodynperm" }, + { Opt_nohard, "nohard" }, + { Opt_nosoft, "nosoft" }, + { Opt_nointr, "nointr" }, + { Opt_intr, "intr" }, + { Opt_nostrictsync, "nostrictsync" }, + { Opt_strictsync, "strictsync" }, + { Opt_serverino, "serverino" }, + { Opt_noserverino, "noserverino" }, + { Opt_rwpidforward, "rwpidforward" }, + { Opt_cifsacl, "cifsacl" }, + { Opt_nocifsacl, "nocifsacl" }, + { Opt_acl, "acl" }, + { Opt_noacl, "noacl" }, + { Opt_locallease, "locallease" }, + { Opt_sign, "sign" }, + { Opt_seal, "seal" }, + { Opt_direct, "direct" }, + { Opt_direct, "forceddirectio" }, + { Opt_strictcache, "strictcache" }, + { Opt_noac, "noac" }, + { Opt_fsc, "fsc" }, + { Opt_mfsymlinks, "mfsymlinks" }, + { Opt_multiuser, "multiuser" }, + { Opt_sloppy, "sloppy" }, + + { Opt_backupuid, "backupuid=%s" }, + { Opt_backupgid, "backupgid=%s" }, + { Opt_uid, "uid=%s" }, + { Opt_cruid, "cruid=%s" }, + { Opt_gid, "gid=%s" }, + { Opt_file_mode, "file_mode=%s" }, + { Opt_dirmode, "dirmode=%s" }, + { Opt_dirmode, "dir_mode=%s" }, + { Opt_port, "port=%s" }, + { Opt_rsize, "rsize=%s" }, + { Opt_wsize, "wsize=%s" }, + { Opt_actimeo, "actimeo=%s" }, + + { Opt_user, "user=%s" }, + { Opt_user, "username=%s" }, + { Opt_blank_pass, "pass=" }, + { Opt_pass, "pass=%s" }, + { Opt_pass, "password=%s" }, + { Opt_ip, "ip=%s" }, + { Opt_ip, "addr=%s" }, + { Opt_unc, "unc=%s" }, + { Opt_unc, "target=%s" }, + { Opt_unc, "path=%s" }, + { Opt_domain, "dom=%s" }, + { Opt_domain, "domain=%s" }, + { Opt_domain, "workgroup=%s" }, + { Opt_srcaddr, "srcaddr=%s" }, + { Opt_prefixpath, "prefixpath=%s" }, + { Opt_iocharset, "iocharset=%s" }, + { Opt_sockopt, "sockopt=%s" }, + { Opt_netbiosname, "netbiosname=%s" }, + { Opt_servern, "servern=%s" }, + { Opt_ver, "ver=%s" }, + { Opt_ver, "vers=%s" }, + { Opt_ver, "version=%s" }, + { Opt_sec, "sec=%s" }, + + { Opt_ignore, "cred" }, + { Opt_ignore, "credentials" }, + { Opt_ignore, "guest" }, + { Opt_ignore, "rw" }, + { Opt_ignore, "ro" }, + { Opt_ignore, "suid" }, + { Opt_ignore, "nosuid" }, + { Opt_ignore, "exec" }, + { Opt_ignore, "noexec" }, + { Opt_ignore, "nodev" }, + { Opt_ignore, "noauto" }, + { Opt_ignore, "dev" }, + { Opt_ignore, "mand" }, + { Opt_ignore, "nomand" }, + { Opt_ignore, "_netdev" }, + + { Opt_err, NULL } +}; + +enum { + Opt_sec_krb5, Opt_sec_krb5i, Opt_sec_krb5p, + Opt_sec_ntlmsspi, Opt_sec_ntlmssp, + Opt_ntlm, Opt_sec_ntlmi, Opt_sec_ntlmv2i, + Opt_sec_nontlm, Opt_sec_lanman, + Opt_sec_none, + + Opt_sec_err +}; + +static const match_table_t cifs_secflavor_tokens = { + { Opt_sec_krb5, "krb5" }, + { Opt_sec_krb5i, "krb5i" }, + { Opt_sec_krb5p, "krb5p" }, + { Opt_sec_ntlmsspi, "ntlmsspi" }, + { Opt_sec_ntlmssp, "ntlmssp" }, + { Opt_ntlm, "ntlm" }, + { Opt_sec_ntlmi, "ntlmi" }, + { Opt_sec_ntlmv2i, "ntlmv2i" }, + { Opt_sec_nontlm, "nontlm" }, + { Opt_sec_lanman, "lanman" }, + { Opt_sec_none, "none" }, + + { Opt_sec_err, NULL } +}; + static int ip_connect(struct TCP_Server_Info *server); static int generic_ip_connect(struct TCP_Server_Info *server); static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); @@ -143,8 +332,8 @@ cifs_reconnect(struct TCP_Server_Info *server) spin_lock(&GlobalMid_Lock); list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - if (mid_entry->midState == MID_REQUEST_SUBMITTED) - mid_entry->midState = MID_RETRY_NEEDED; + if (mid_entry->mid_state == MID_REQUEST_SUBMITTED) + mid_entry->mid_state = MID_RETRY_NEEDED; list_move(&mid_entry->qhead, &retry_list); } spin_unlock(&GlobalMid_Lock); @@ -183,8 +372,9 @@ cifs_reconnect(struct TCP_Server_Info *server) -EINVAL = invalid transact2 */ -static int check2ndT2(struct smb_hdr *pSMB) +static int check2ndT2(char *buf) { + struct smb_hdr *pSMB = (struct smb_hdr *)buf; struct smb_t2_rsp *pSMBt; int remaining; __u16 total_data_size, data_in_this_rsp; @@ -224,10 +414,10 @@ static int check2ndT2(struct smb_hdr *pSMB) return remaining; } -static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) +static int coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) { - struct smb_t2_rsp *pSMBs = (struct smb_t2_rsp *)psecond; - struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB; + struct smb_t2_rsp *pSMBs = (struct smb_t2_rsp *)second_buf; + struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)target_hdr; char *data_area_of_tgt; char *data_area_of_src; int remaining; @@ -280,23 +470,23 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) put_unaligned_le16(total_in_tgt, &pSMBt->t2_rsp.DataCount); /* fix up the BCC */ - byte_count = get_bcc(pTargetSMB); + byte_count = get_bcc(target_hdr); byte_count += total_in_src; /* is the result too big for the field? */ if (byte_count > USHRT_MAX) { cFYI(1, "coalesced BCC too large (%u)", byte_count); return -EPROTO; } - put_bcc(byte_count, pTargetSMB); + put_bcc(byte_count, target_hdr); - byte_count = be32_to_cpu(pTargetSMB->smb_buf_length); + byte_count = be32_to_cpu(target_hdr->smb_buf_length); byte_count += total_in_src; /* don't allow buffer to overflow */ if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { cFYI(1, "coalesced BCC exceeds buffer size (%u)", byte_count); return -ENOBUFS; } - pTargetSMB->smb_buf_length = cpu_to_be32(byte_count); + target_hdr->smb_buf_length = cpu_to_be32(byte_count); /* copy second buffer into end of first buffer */ memcpy(data_area_of_tgt, data_area_of_src, total_in_src); @@ -334,7 +524,7 @@ cifs_echo_request(struct work_struct *work) server->hostname); requeue_echo: - queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL); + queue_delayed_work(cifsiod_wq, &server->echo, SMB_ECHO_INTERVAL); } static bool @@ -350,7 +540,7 @@ allocate_buffers(struct TCP_Server_Info *server) } } else if (server->large_buf) { /* we are reusing a dirty large buf, clear its start */ - memset(server->bigbuf, 0, sizeof(struct smb_hdr)); + memset(server->bigbuf, 0, header_size()); } if (!server->smallbuf) { @@ -364,7 +554,7 @@ allocate_buffers(struct TCP_Server_Info *server) /* beginning of smb buffer is cleared in our buf_get */ } else { /* if existing small buf clear beginning */ - memset(server->smallbuf, 0, sizeof(struct smb_hdr)); + memset(server->smallbuf, 0, header_size()); } return true; @@ -566,15 +756,16 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type) } static struct mid_q_entry * -find_mid(struct TCP_Server_Info *server, struct smb_hdr *buf) +find_mid(struct TCP_Server_Info *server, char *buffer) { + struct smb_hdr *buf = (struct smb_hdr *)buffer; struct mid_q_entry *mid; spin_lock(&GlobalMid_Lock); list_for_each_entry(mid, &server->pending_mid_q, qhead) { if (mid->mid == buf->Mid && - mid->midState == MID_REQUEST_SUBMITTED && - mid->command == buf->Command) { + mid->mid_state == MID_REQUEST_SUBMITTED && + le16_to_cpu(mid->command) == buf->Command) { spin_unlock(&GlobalMid_Lock); return mid; } @@ -591,16 +782,16 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed) #endif spin_lock(&GlobalMid_Lock); if (!malformed) - mid->midState = MID_RESPONSE_RECEIVED; + mid->mid_state = MID_RESPONSE_RECEIVED; else - mid->midState = MID_RESPONSE_MALFORMED; + mid->mid_state = MID_RESPONSE_MALFORMED; list_del_init(&mid->qhead); spin_unlock(&GlobalMid_Lock); } static void handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server, - struct smb_hdr *buf, int malformed) + char *buf, int malformed) { if (malformed == 0 && check2ndT2(buf) > 0) { mid->multiRsp = true; @@ -620,13 +811,13 @@ handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server, } else { /* Have first buffer */ mid->resp_buf = buf; - mid->largeBuf = true; + mid->large_buf = true; server->bigbuf = NULL; } return; } mid->resp_buf = buf; - mid->largeBuf = server->large_buf; + mid->large_buf = server->large_buf; /* Was previous buf put in mpx struct for multi-rsp? */ if (!mid->multiRsp) { /* smb buffer will be freed by user thread */ @@ -682,8 +873,8 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) spin_lock(&GlobalMid_Lock); list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - cFYI(1, "Clearing mid 0x%x", mid_entry->mid); - mid_entry->midState = MID_SHUTDOWN; + cFYI(1, "Clearing mid 0x%llx", mid_entry->mid); + mid_entry->mid_state = MID_SHUTDOWN; list_move(&mid_entry->qhead, &dispose_list); } spin_unlock(&GlobalMid_Lock); @@ -691,7 +882,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) /* now walk dispose list and issue callbacks */ list_for_each_safe(tmp, tmp2, &dispose_list) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - cFYI(1, "Callback mid 0x%x", mid_entry->mid); + cFYI(1, "Callback mid 0x%llx", mid_entry->mid); list_del_init(&mid_entry->qhead); mid_entry->callback(mid_entry); } @@ -731,11 +922,10 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) { int length; char *buf = server->smallbuf; - struct smb_hdr *smb_buffer = (struct smb_hdr *)buf; - unsigned int pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); + unsigned int pdu_length = get_rfc1002_length(buf); /* make sure this will fit in a large buffer */ - if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { + if (pdu_length > CIFSMaxBufSize + max_header_size() - 4) { cERROR(1, "SMB response too long (%u bytes)", pdu_length); cifs_reconnect(server); @@ -746,20 +936,18 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) /* switch to large buffer if too big for a small one */ if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { server->large_buf = true; - memcpy(server->bigbuf, server->smallbuf, server->total_read); + memcpy(server->bigbuf, buf, server->total_read); buf = server->bigbuf; - smb_buffer = (struct smb_hdr *)buf; } /* now read the rest */ - length = cifs_read_from_socket(server, - buf + sizeof(struct smb_hdr) - 1, - pdu_length - sizeof(struct smb_hdr) + 1 + 4); + length = cifs_read_from_socket(server, buf + header_size() - 1, + pdu_length - header_size() + 1 + 4); if (length < 0) return length; server->total_read += length; - dump_smb(smb_buffer, server->total_read); + dump_smb(buf, server->total_read); /* * We know that we received enough to get to the MID as we @@ -770,7 +958,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) * 48 bytes is enough to display the header and a little bit * into the payload for debugging purposes. */ - length = checkSMB(smb_buffer, smb_buffer->Mid, server->total_read); + length = checkSMB(buf, server->total_read); if (length != 0) cifs_dump_mem("Bad SMB: ", buf, min_t(unsigned int, server->total_read, 48)); @@ -778,7 +966,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) if (!mid) return length; - handle_mid(mid, server, smb_buffer, length); + handle_mid(mid, server, buf, length); return 0; } @@ -789,7 +977,6 @@ cifs_demultiplex_thread(void *p) struct TCP_Server_Info *server = p; unsigned int pdu_length; char *buf = NULL; - struct smb_hdr *smb_buffer = NULL; struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; @@ -810,7 +997,6 @@ cifs_demultiplex_thread(void *p) continue; server->large_buf = false; - smb_buffer = (struct smb_hdr *)server->smallbuf; buf = server->smallbuf; pdu_length = 4; /* enough to get RFC1001 header */ @@ -823,14 +1009,14 @@ cifs_demultiplex_thread(void *p) * The right amount was read from socket - 4 bytes, * so we can now interpret the length field. */ - pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); + pdu_length = get_rfc1002_length(buf); cFYI(1, "RFC1002 header 0x%x", pdu_length); if (!is_smb_response(server, buf[0])) continue; /* make sure we have enough to get to the MID */ - if (pdu_length < sizeof(struct smb_hdr) - 1 - 4) { + if (pdu_length < header_size() - 1 - 4) { cERROR(1, "SMB response too short (%u bytes)", pdu_length); cifs_reconnect(server); @@ -840,12 +1026,12 @@ cifs_demultiplex_thread(void *p) /* read down to the MID */ length = cifs_read_from_socket(server, buf + 4, - sizeof(struct smb_hdr) - 1 - 4); + header_size() - 1 - 4); if (length < 0) continue; server->total_read += length; - mid_entry = find_mid(server, smb_buffer); + mid_entry = find_mid(server, buf); if (!mid_entry || !mid_entry->receive) length = standard_receive3(server, mid_entry); @@ -855,22 +1041,19 @@ cifs_demultiplex_thread(void *p) if (length < 0) continue; - if (server->large_buf) { + if (server->large_buf) buf = server->bigbuf; - smb_buffer = (struct smb_hdr *)buf; - } server->lstrp = jiffies; if (mid_entry != NULL) { if (!mid_entry->multiRsp || mid_entry->multiEnd) mid_entry->callback(mid_entry); - } else if (!is_valid_oplock_break(smb_buffer, server)) { + } else if (!is_valid_oplock_break(buf, server)) { cERROR(1, "No task to wake, unknown frame received! " "NumMids %d", atomic_read(&midCount)); - cifs_dump_mem("Received Data is: ", buf, - sizeof(struct smb_hdr)); + cifs_dump_mem("Received Data is: ", buf, header_size()); #ifdef CONFIG_CIFS_DEBUG2 - cifs_dump_detail(smb_buffer); + cifs_dump_detail(buf); cifs_dump_mids(server); #endif /* CIFS_DEBUG2 */ @@ -926,23 +1109,95 @@ extract_hostname(const char *unc) return dst; } +static int get_option_ul(substring_t args[], unsigned long *option) +{ + int rc; + char *string; + + string = match_strdup(args); + if (string == NULL) + return -ENOMEM; + rc = kstrtoul(string, 10, option); + kfree(string); + + return rc; +} + + +static int cifs_parse_security_flavors(char *value, + struct smb_vol *vol) +{ + + substring_t args[MAX_OPT_ARGS]; + + switch (match_token(value, cifs_secflavor_tokens, args)) { + case Opt_sec_krb5: + vol->secFlg |= CIFSSEC_MAY_KRB5; + break; + case Opt_sec_krb5i: + vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MUST_SIGN; + break; + case Opt_sec_krb5p: + /* vol->secFlg |= CIFSSEC_MUST_SEAL | CIFSSEC_MAY_KRB5; */ + cERROR(1, "Krb5 cifs privacy not supported"); + break; + case Opt_sec_ntlmssp: + vol->secFlg |= CIFSSEC_MAY_NTLMSSP; + break; + case Opt_sec_ntlmsspi: + vol->secFlg |= CIFSSEC_MAY_NTLMSSP | CIFSSEC_MUST_SIGN; + break; + case Opt_ntlm: + /* ntlm is default so can be turned off too */ + vol->secFlg |= CIFSSEC_MAY_NTLM; + break; + case Opt_sec_ntlmi: + vol->secFlg |= CIFSSEC_MAY_NTLM | CIFSSEC_MUST_SIGN; + break; + case Opt_sec_nontlm: + vol->secFlg |= CIFSSEC_MAY_NTLMV2; + break; + case Opt_sec_ntlmv2i: + vol->secFlg |= CIFSSEC_MAY_NTLMV2 | CIFSSEC_MUST_SIGN; + break; +#ifdef CONFIG_CIFS_WEAK_PW_HASH + case Opt_sec_lanman: + vol->secFlg |= CIFSSEC_MAY_LANMAN; + break; +#endif + case Opt_sec_none: + vol->nullauth = 1; + break; + default: + cERROR(1, "bad security option: %s", value); + return 1; + } + + return 0; +} + static int cifs_parse_mount_options(const char *mountdata, const char *devname, struct smb_vol *vol) { - char *value, *data, *end; + char *data, *end; char *mountdata_copy = NULL, *options; - int err; unsigned int temp_len, i, j; char separator[2]; short int override_uid = -1; short int override_gid = -1; bool uid_specified = false; bool gid_specified = false; + bool sloppy = false; + char *invalid = NULL; char *nodename = utsname()->nodename; + char *string = NULL; + char *tmp_end, *value; + char delim; separator[0] = ','; separator[1] = 0; + delim = separator[0]; /* * does not have to be perfect mapping since field is @@ -981,6 +1236,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, options = mountdata_copy; end = options + strlen(options); + if (strncmp(options, "sep=", 4) == 0) { if (options[4] != 0) { separator[0] = options[4]; @@ -993,609 +1249,652 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->backupgid_specified = false; /* no backup intent for a group */ while ((data = strsep(&options, separator)) != NULL) { + substring_t args[MAX_OPT_ARGS]; + unsigned long option; + int token; + if (!*data) continue; - if ((value = strchr(data, '=')) != NULL) - *value++ = '\0'; - /* Have to parse this before we parse for "user" */ - if (strnicmp(data, "user_xattr", 10) == 0) { + token = match_token(data, cifs_mount_option_tokens, args); + + switch (token) { + + /* Ingnore the following */ + case Opt_ignore: + break; + + /* Boolean values */ + case Opt_user_xattr: vol->no_xattr = 0; - } else if (strnicmp(data, "nouser_xattr", 12) == 0) { + break; + case Opt_nouser_xattr: vol->no_xattr = 1; - } else if (strnicmp(data, "user", 4) == 0) { - if (!value) { - printk(KERN_WARNING - "CIFS: invalid or missing username\n"); - goto cifs_parse_mount_err; - } else if (!*value) { - /* null user, ie anonymous, authentication */ - vol->nullauth = 1; - } - if (strnlen(value, MAX_USERNAME_SIZE) < - MAX_USERNAME_SIZE) { - vol->username = kstrdup(value, GFP_KERNEL); - if (!vol->username) { - printk(KERN_WARNING "CIFS: no memory " - "for username\n"); - goto cifs_parse_mount_err; - } - } else { - printk(KERN_WARNING "CIFS: username too long\n"); - goto cifs_parse_mount_err; - } - } else if (strnicmp(data, "pass", 4) == 0) { - if (!value) { - vol->password = NULL; - continue; - } else if (value[0] == 0) { - /* check if string begins with double comma - since that would mean the password really - does start with a comma, and would not - indicate an empty string */ - if (value[1] != separator[0]) { - vol->password = NULL; - continue; - } - } - temp_len = strlen(value); - /* removed password length check, NTLM passwords - can be arbitrarily long */ - - /* if comma in password, the string will be - prematurely null terminated. Commas in password are - specified across the cifs mount interface by a double - comma ie ,, and a comma used as in other cases ie ',' - as a parameter delimiter/separator is single and due - to the strsep above is temporarily zeroed. */ - - /* NB: password legally can have multiple commas and - the only illegal character in a password is null */ - - if ((value[temp_len] == 0) && - (value + temp_len < end) && - (value[temp_len+1] == separator[0])) { - /* reinsert comma */ - value[temp_len] = separator[0]; - temp_len += 2; /* move after second comma */ - while (value[temp_len] != 0) { - if (value[temp_len] == separator[0]) { - if (value[temp_len+1] == - separator[0]) { - /* skip second comma */ - temp_len++; - } else { - /* single comma indicating start - of next parm */ - break; - } - } - temp_len++; - } - if (value[temp_len] == 0) { - options = NULL; - } else { - value[temp_len] = 0; - /* point option to start of next parm */ - options = value + temp_len + 1; - } - /* go from value to value + temp_len condensing - double commas to singles. Note that this ends up - allocating a few bytes too many, which is ok */ - vol->password = kzalloc(temp_len, GFP_KERNEL); - if (vol->password == NULL) { - printk(KERN_WARNING "CIFS: no memory " - "for password\n"); - goto cifs_parse_mount_err; - } - for (i = 0, j = 0; i < temp_len; i++, j++) { - vol->password[j] = value[i]; - if (value[i] == separator[0] - && value[i+1] == separator[0]) { - /* skip second comma */ - i++; - } - } - vol->password[j] = 0; - } else { - vol->password = kzalloc(temp_len+1, GFP_KERNEL); - if (vol->password == NULL) { - printk(KERN_WARNING "CIFS: no memory " - "for password\n"); - goto cifs_parse_mount_err; - } - strcpy(vol->password, value); - } - } else if (!strnicmp(data, "ip", 2) || - !strnicmp(data, "addr", 4)) { - if (!value || !*value) { - vol->UNCip = NULL; - } else if (strnlen(value, INET6_ADDRSTRLEN) < - INET6_ADDRSTRLEN) { - vol->UNCip = kstrdup(value, GFP_KERNEL); - if (!vol->UNCip) { - printk(KERN_WARNING "CIFS: no memory " - "for UNC IP\n"); - goto cifs_parse_mount_err; - } - } else { - printk(KERN_WARNING "CIFS: ip address " - "too long\n"); - goto cifs_parse_mount_err; - } - } else if (strnicmp(data, "sec", 3) == 0) { - if (!value || !*value) { - cERROR(1, "no security value specified"); - continue; - } else if (strnicmp(value, "krb5i", 5) == 0) { - vol->secFlg |= CIFSSEC_MAY_KRB5 | - CIFSSEC_MUST_SIGN; - } else if (strnicmp(value, "krb5p", 5) == 0) { - /* vol->secFlg |= CIFSSEC_MUST_SEAL | - CIFSSEC_MAY_KRB5; */ - cERROR(1, "Krb5 cifs privacy not supported"); - goto cifs_parse_mount_err; - } else if (strnicmp(value, "krb5", 4) == 0) { - vol->secFlg |= CIFSSEC_MAY_KRB5; - } else if (strnicmp(value, "ntlmsspi", 8) == 0) { - vol->secFlg |= CIFSSEC_MAY_NTLMSSP | - CIFSSEC_MUST_SIGN; - } else if (strnicmp(value, "ntlmssp", 7) == 0) { - vol->secFlg |= CIFSSEC_MAY_NTLMSSP; - } else if (strnicmp(value, "ntlmv2i", 7) == 0) { - vol->secFlg |= CIFSSEC_MAY_NTLMV2 | - CIFSSEC_MUST_SIGN; - } else if (strnicmp(value, "ntlmv2", 6) == 0) { - vol->secFlg |= CIFSSEC_MAY_NTLMV2; - } else if (strnicmp(value, "ntlmi", 5) == 0) { - vol->secFlg |= CIFSSEC_MAY_NTLM | - CIFSSEC_MUST_SIGN; - } else if (strnicmp(value, "ntlm", 4) == 0) { - /* ntlm is default so can be turned off too */ - vol->secFlg |= CIFSSEC_MAY_NTLM; - } else if (strnicmp(value, "nontlm", 6) == 0) { - /* BB is there a better way to do this? */ - vol->secFlg |= CIFSSEC_MAY_NTLMV2; -#ifdef CONFIG_CIFS_WEAK_PW_HASH - } else if (strnicmp(value, "lanman", 6) == 0) { - vol->secFlg |= CIFSSEC_MAY_LANMAN; -#endif - } else if (strnicmp(value, "none", 4) == 0) { - vol->nullauth = 1; - } else { - cERROR(1, "bad security option: %s", value); - goto cifs_parse_mount_err; - } - } else if (strnicmp(data, "vers", 3) == 0) { - if (!value || !*value) { - cERROR(1, "no protocol version specified" - " after vers= mount option"); - } else if ((strnicmp(value, "cifs", 4) == 0) || - (strnicmp(value, "1", 1) == 0)) { - /* this is the default */ - continue; - } - } else if ((strnicmp(data, "unc", 3) == 0) - || (strnicmp(data, "target", 6) == 0) - || (strnicmp(data, "path", 4) == 0)) { - if (!value || !*value) { - printk(KERN_WARNING "CIFS: invalid path to " - "network resource\n"); - goto cifs_parse_mount_err; - } - if ((temp_len = strnlen(value, 300)) < 300) { - vol->UNC = kmalloc(temp_len+1, GFP_KERNEL); - if (vol->UNC == NULL) - goto cifs_parse_mount_err; - strcpy(vol->UNC, value); - if (strncmp(vol->UNC, "//", 2) == 0) { - vol->UNC[0] = '\\'; - vol->UNC[1] = '\\'; - } else if (strncmp(vol->UNC, "\\\\", 2) != 0) { - printk(KERN_WARNING - "CIFS: UNC Path does not begin " - "with // or \\\\ \n"); - goto cifs_parse_mount_err; - } - } else { - printk(KERN_WARNING "CIFS: UNC name too long\n"); - goto cifs_parse_mount_err; - } - } else if ((strnicmp(data, "domain", 3) == 0) - || (strnicmp(data, "workgroup", 5) == 0)) { - if (!value || !*value) { - printk(KERN_WARNING "CIFS: invalid domain name\n"); - goto cifs_parse_mount_err; - } - /* BB are there cases in which a comma can be valid in - a domain name and need special handling? */ - if (strnlen(value, 256) < 256) { - vol->domainname = kstrdup(value, GFP_KERNEL); - if (!vol->domainname) { - printk(KERN_WARNING "CIFS: no memory " - "for domainname\n"); - goto cifs_parse_mount_err; - } - cFYI(1, "Domain name set"); - } else { - printk(KERN_WARNING "CIFS: domain name too " - "long\n"); - goto cifs_parse_mount_err; - } - } else if (strnicmp(data, "srcaddr", 7) == 0) { - vol->srcaddr.ss_family = AF_UNSPEC; - - if (!value || !*value) { - printk(KERN_WARNING "CIFS: srcaddr value" - " not specified.\n"); - goto cifs_parse_mount_err; - } - i = cifs_convert_address((struct sockaddr *)&vol->srcaddr, - value, strlen(value)); - if (i == 0) { - printk(KERN_WARNING "CIFS: Could not parse" - " srcaddr: %s\n", - value); - goto cifs_parse_mount_err; - } - } else if (strnicmp(data, "prefixpath", 10) == 0) { - if (!value || !*value) { - printk(KERN_WARNING - "CIFS: invalid path prefix\n"); - goto cifs_parse_mount_err; - } - if ((temp_len = strnlen(value, 1024)) < 1024) { - if (value[0] != '/') - temp_len++; /* missing leading slash */ - vol->prepath = kmalloc(temp_len+1, GFP_KERNEL); - if (vol->prepath == NULL) - goto cifs_parse_mount_err; - if (value[0] != '/') { - vol->prepath[0] = '/'; - strcpy(vol->prepath+1, value); - } else - strcpy(vol->prepath, value); - cFYI(1, "prefix path %s", vol->prepath); - } else { - printk(KERN_WARNING "CIFS: prefix too long\n"); - goto cifs_parse_mount_err; - } - } else if (strnicmp(data, "iocharset", 9) == 0) { - if (!value || !*value) { - printk(KERN_WARNING "CIFS: invalid iocharset " - "specified\n"); - goto cifs_parse_mount_err; - } - if (strnlen(value, 65) < 65) { - if (strnicmp(value, "default", 7)) { - vol->iocharset = kstrdup(value, - GFP_KERNEL); - - if (!vol->iocharset) { - printk(KERN_WARNING "CIFS: no " - "memory for" - "charset\n"); - goto cifs_parse_mount_err; - } - } - /* if iocharset not set then load_nls_default - is used by caller */ - cFYI(1, "iocharset set to %s", value); - } else { - printk(KERN_WARNING "CIFS: iocharset name " - "too long.\n"); - goto cifs_parse_mount_err; - } - } else if (!strnicmp(data, "uid", 3) && value && *value) { - vol->linux_uid = simple_strtoul(value, &value, 0); - uid_specified = true; - } else if (!strnicmp(data, "cruid", 5) && value && *value) { - vol->cred_uid = simple_strtoul(value, &value, 0); - } else if (!strnicmp(data, "forceuid", 8)) { + break; + case Opt_forceuid: override_uid = 1; - } else if (!strnicmp(data, "noforceuid", 10)) { + break; + case Opt_noforceuid: override_uid = 0; - } else if (!strnicmp(data, "gid", 3) && value && *value) { - vol->linux_gid = simple_strtoul(value, &value, 0); - gid_specified = true; - } else if (!strnicmp(data, "forcegid", 8)) { - override_gid = 1; - } else if (!strnicmp(data, "noforcegid", 10)) { - override_gid = 0; - } else if (strnicmp(data, "file_mode", 4) == 0) { - if (value && *value) { - vol->file_mode = - simple_strtoul(value, &value, 0); - } - } else if (strnicmp(data, "dir_mode", 4) == 0) { - if (value && *value) { - vol->dir_mode = - simple_strtoul(value, &value, 0); - } - } else if (strnicmp(data, "dirmode", 4) == 0) { - if (value && *value) { - vol->dir_mode = - simple_strtoul(value, &value, 0); - } - } else if (strnicmp(data, "port", 4) == 0) { - if (value && *value) { - vol->port = - simple_strtoul(value, &value, 0); - } - } else if (strnicmp(data, "rsize", 5) == 0) { - if (value && *value) { - vol->rsize = - simple_strtoul(value, &value, 0); - } - } else if (strnicmp(data, "wsize", 5) == 0) { - if (value && *value) { - vol->wsize = - simple_strtoul(value, &value, 0); - } - } else if (strnicmp(data, "sockopt", 5) == 0) { - if (!value || !*value) { - cERROR(1, "no socket option specified"); - continue; - } else if (strnicmp(value, "TCP_NODELAY", 11) == 0) { - vol->sockopt_tcp_nodelay = 1; - } - } else if (strnicmp(data, "netbiosname", 4) == 0) { - if (!value || !*value || (*value == ' ')) { - cFYI(1, "invalid (empty) netbiosname"); - } else { - memset(vol->source_rfc1001_name, 0x20, - RFC1001_NAME_LEN); - /* - * FIXME: are there cases in which a comma can - * be valid in workstation netbios name (and - * need special handling)? - */ - for (i = 0; i < RFC1001_NAME_LEN; i++) { - /* don't ucase netbiosname for user */ - if (value[i] == 0) - break; - vol->source_rfc1001_name[i] = value[i]; - } - /* The string has 16th byte zero still from - set at top of the function */ - if (i == RFC1001_NAME_LEN && value[i] != 0) - printk(KERN_WARNING "CIFS: netbiosname" - " longer than 15 truncated.\n"); - } - } else if (strnicmp(data, "servern", 7) == 0) { - /* servernetbiosname specified override *SMBSERVER */ - if (!value || !*value || (*value == ' ')) { - cFYI(1, "empty server netbiosname specified"); - } else { - /* last byte, type, is 0x20 for servr type */ - memset(vol->target_rfc1001_name, 0x20, - RFC1001_NAME_LEN_WITH_NULL); - - for (i = 0; i < 15; i++) { - /* BB are there cases in which a comma can be - valid in this workstation netbios name - (and need special handling)? */ - - /* user or mount helper must uppercase - the netbiosname */ - if (value[i] == 0) - break; - else - vol->target_rfc1001_name[i] = - value[i]; - } - /* The string has 16th byte zero still from - set at top of the function */ - if (i == RFC1001_NAME_LEN && value[i] != 0) - printk(KERN_WARNING "CIFS: server net" - "biosname longer than 15 truncated.\n"); - } - } else if (strnicmp(data, "actimeo", 7) == 0) { - if (value && *value) { - vol->actimeo = HZ * simple_strtoul(value, - &value, 0); - if (vol->actimeo > CIFS_MAX_ACTIMEO) { - cERROR(1, "CIFS: attribute cache" - "timeout too large"); - goto cifs_parse_mount_err; - } - } - } else if (strnicmp(data, "credentials", 4) == 0) { - /* ignore */ - } else if (strnicmp(data, "version", 3) == 0) { - /* ignore */ - } else if (strnicmp(data, "guest", 5) == 0) { - /* ignore */ - } else if (strnicmp(data, "rw", 2) == 0 && strlen(data) == 2) { - /* ignore */ - } else if (strnicmp(data, "ro", 2) == 0) { - /* ignore */ - } else if (strnicmp(data, "noblocksend", 11) == 0) { + break; + case Opt_noblocksend: vol->noblocksnd = 1; - } else if (strnicmp(data, "noautotune", 10) == 0) { + break; + case Opt_noautotune: vol->noautotune = 1; - } else if ((strnicmp(data, "suid", 4) == 0) || - (strnicmp(data, "nosuid", 6) == 0) || - (strnicmp(data, "exec", 4) == 0) || - (strnicmp(data, "noexec", 6) == 0) || - (strnicmp(data, "nodev", 5) == 0) || - (strnicmp(data, "noauto", 6) == 0) || - (strnicmp(data, "dev", 3) == 0)) { - /* The mount tool or mount.cifs helper (if present) - uses these opts to set flags, and the flags are read - by the kernel vfs layer before we get here (ie - before read super) so there is no point trying to - parse these options again and set anything and it - is ok to just ignore them */ - continue; - } else if (strnicmp(data, "hard", 4) == 0) { + break; + case Opt_hard: vol->retry = 1; - } else if (strnicmp(data, "soft", 4) == 0) { + break; + case Opt_soft: vol->retry = 0; - } else if (strnicmp(data, "perm", 4) == 0) { + break; + case Opt_perm: vol->noperm = 0; - } else if (strnicmp(data, "noperm", 6) == 0) { + break; + case Opt_noperm: vol->noperm = 1; - } else if (strnicmp(data, "mapchars", 8) == 0) { + break; + case Opt_mapchars: vol->remap = 1; - } else if (strnicmp(data, "nomapchars", 10) == 0) { + break; + case Opt_nomapchars: vol->remap = 0; - } else if (strnicmp(data, "sfu", 3) == 0) { + break; + case Opt_sfu: vol->sfu_emul = 1; - } else if (strnicmp(data, "nosfu", 5) == 0) { + break; + case Opt_nosfu: vol->sfu_emul = 0; - } else if (strnicmp(data, "nodfs", 5) == 0) { + break; + case Opt_nodfs: vol->nodfs = 1; - } else if (strnicmp(data, "posixpaths", 10) == 0) { + break; + case Opt_posixpaths: vol->posix_paths = 1; - } else if (strnicmp(data, "noposixpaths", 12) == 0) { + break; + case Opt_noposixpaths: vol->posix_paths = 0; - } else if (strnicmp(data, "nounix", 6) == 0) { - vol->no_linux_ext = 1; - } else if (strnicmp(data, "nolinux", 7) == 0) { + break; + case Opt_nounix: vol->no_linux_ext = 1; - } else if ((strnicmp(data, "nocase", 6) == 0) || - (strnicmp(data, "ignorecase", 10) == 0)) { + break; + case Opt_nocase: vol->nocase = 1; - } else if (strnicmp(data, "mand", 4) == 0) { - /* ignore */ - } else if (strnicmp(data, "nomand", 6) == 0) { - /* ignore */ - } else if (strnicmp(data, "_netdev", 7) == 0) { - /* ignore */ - } else if (strnicmp(data, "brl", 3) == 0) { + break; + case Opt_brl: vol->nobrl = 0; - } else if ((strnicmp(data, "nobrl", 5) == 0) || - (strnicmp(data, "nolock", 6) == 0)) { + break; + case Opt_nobrl: vol->nobrl = 1; - /* turn off mandatory locking in mode - if remote locking is turned off since the - local vfs will do advisory */ + /* + * turn off mandatory locking in mode + * if remote locking is turned off since the + * local vfs will do advisory + */ if (vol->file_mode == (S_IALLUGO & ~(S_ISUID | S_IXGRP))) vol->file_mode = S_IALLUGO; - } else if (strnicmp(data, "forcemandatorylock", 9) == 0) { - /* will take the shorter form "forcemand" as well */ - /* This mount option will force use of mandatory - (DOS/Windows style) byte range locks, instead of - using posix advisory byte range locks, even if the - Unix extensions are available and posix locks would - be supported otherwise. If Unix extensions are not - negotiated this has no effect since mandatory locks - would be used (mandatory locks is all that those - those servers support) */ + break; + case Opt_forcemandatorylock: vol->mand_lock = 1; - } else if (strnicmp(data, "setuids", 7) == 0) { + break; + case Opt_setuids: vol->setuids = 1; - } else if (strnicmp(data, "nosetuids", 9) == 0) { + break; + case Opt_nosetuids: vol->setuids = 0; - } else if (strnicmp(data, "dynperm", 7) == 0) { + break; + case Opt_dynperm: vol->dynperm = true; - } else if (strnicmp(data, "nodynperm", 9) == 0) { + break; + case Opt_nodynperm: vol->dynperm = false; - } else if (strnicmp(data, "nohard", 6) == 0) { + break; + case Opt_nohard: vol->retry = 0; - } else if (strnicmp(data, "nosoft", 6) == 0) { + break; + case Opt_nosoft: vol->retry = 1; - } else if (strnicmp(data, "nointr", 6) == 0) { + break; + case Opt_nointr: vol->intr = 0; - } else if (strnicmp(data, "intr", 4) == 0) { + break; + case Opt_intr: vol->intr = 1; - } else if (strnicmp(data, "nostrictsync", 12) == 0) { + break; + case Opt_nostrictsync: vol->nostrictsync = 1; - } else if (strnicmp(data, "strictsync", 10) == 0) { + break; + case Opt_strictsync: vol->nostrictsync = 0; - } else if (strnicmp(data, "serverino", 7) == 0) { + break; + case Opt_serverino: vol->server_ino = 1; - } else if (strnicmp(data, "noserverino", 9) == 0) { + break; + case Opt_noserverino: vol->server_ino = 0; - } else if (strnicmp(data, "rwpidforward", 12) == 0) { + break; + case Opt_rwpidforward: vol->rwpidforward = 1; - } else if (strnicmp(data, "cifsacl", 7) == 0) { + break; + case Opt_cifsacl: vol->cifs_acl = 1; - } else if (strnicmp(data, "nocifsacl", 9) == 0) { + break; + case Opt_nocifsacl: vol->cifs_acl = 0; - } else if (strnicmp(data, "acl", 3) == 0) { + break; + case Opt_acl: vol->no_psx_acl = 0; - } else if (strnicmp(data, "noacl", 5) == 0) { + break; + case Opt_noacl: vol->no_psx_acl = 1; - } else if (strnicmp(data, "locallease", 6) == 0) { + break; + case Opt_locallease: vol->local_lease = 1; - } else if (strnicmp(data, "sign", 4) == 0) { + break; + case Opt_sign: vol->secFlg |= CIFSSEC_MUST_SIGN; - } else if (strnicmp(data, "seal", 4) == 0) { + break; + case Opt_seal: /* we do not do the following in secFlags because seal - is a per tree connection (mount) not a per socket - or per-smb connection option in the protocol */ - /* vol->secFlg |= CIFSSEC_MUST_SEAL; */ + * is a per tree connection (mount) not a per socket + * or per-smb connection option in the protocol + * vol->secFlg |= CIFSSEC_MUST_SEAL; + */ vol->seal = 1; - } else if (strnicmp(data, "direct", 6) == 0) { - vol->direct_io = 1; - } else if (strnicmp(data, "forcedirectio", 13) == 0) { + break; + case Opt_direct: vol->direct_io = 1; - } else if (strnicmp(data, "strictcache", 11) == 0) { + break; + case Opt_strictcache: vol->strict_io = 1; - } else if (strnicmp(data, "noac", 4) == 0) { + break; + case Opt_noac: printk(KERN_WARNING "CIFS: Mount option noac not " "supported. Instead set " "/proc/fs/cifs/LookupCacheEnabled to 0\n"); - } else if (strnicmp(data, "fsc", 3) == 0) { + break; + case Opt_fsc: #ifndef CONFIG_CIFS_FSCACHE cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE " "kernel config option set"); goto cifs_parse_mount_err; #endif vol->fsc = true; - } else if (strnicmp(data, "mfsymlinks", 10) == 0) { + break; + case Opt_mfsymlinks: vol->mfsymlinks = true; - } else if (strnicmp(data, "multiuser", 8) == 0) { + break; + case Opt_multiuser: vol->multiuser = true; - } else if (!strnicmp(data, "backupuid", 9) && value && *value) { - err = kstrtouint(value, 0, &vol->backupuid); - if (err < 0) { + break; + case Opt_sloppy: + sloppy = true; + break; + + /* Numeric Values */ + case Opt_backupuid: + if (get_option_ul(args, &option)) { cERROR(1, "%s: Invalid backupuid value", __func__); goto cifs_parse_mount_err; } + vol->backupuid = option; vol->backupuid_specified = true; - } else if (!strnicmp(data, "backupgid", 9) && value && *value) { - err = kstrtouint(value, 0, &vol->backupgid); - if (err < 0) { + break; + case Opt_backupgid: + if (get_option_ul(args, &option)) { cERROR(1, "%s: Invalid backupgid value", __func__); goto cifs_parse_mount_err; } + vol->backupgid = option; vol->backupgid_specified = true; - } else - printk(KERN_WARNING "CIFS: Unknown mount option %s\n", - data); - } - if (vol->UNC == NULL) { - if (devname == NULL) { - printk(KERN_WARNING "CIFS: Missing UNC name for mount " - "target\n"); - goto cifs_parse_mount_err; - } - if ((temp_len = strnlen(devname, 300)) < 300) { - vol->UNC = kmalloc(temp_len+1, GFP_KERNEL); - if (vol->UNC == NULL) + break; + case Opt_uid: + if (get_option_ul(args, &option)) { + cERROR(1, "%s: Invalid uid value", + __func__); + goto cifs_parse_mount_err; + } + vol->linux_uid = option; + uid_specified = true; + break; + case Opt_cruid: + if (get_option_ul(args, &option)) { + cERROR(1, "%s: Invalid cruid value", + __func__); + goto cifs_parse_mount_err; + } + vol->cred_uid = option; + break; + case Opt_gid: + if (get_option_ul(args, &option)) { + cERROR(1, "%s: Invalid gid value", + __func__); + goto cifs_parse_mount_err; + } + vol->linux_gid = option; + gid_specified = true; + break; + case Opt_file_mode: + if (get_option_ul(args, &option)) { + cERROR(1, "%s: Invalid file_mode value", + __func__); + goto cifs_parse_mount_err; + } + vol->file_mode = option; + break; + case Opt_dirmode: + if (get_option_ul(args, &option)) { + cERROR(1, "%s: Invalid dir_mode value", + __func__); + goto cifs_parse_mount_err; + } + vol->dir_mode = option; + break; + case Opt_port: + if (get_option_ul(args, &option)) { + cERROR(1, "%s: Invalid port value", + __func__); + goto cifs_parse_mount_err; + } + vol->port = option; + break; + case Opt_rsize: + if (get_option_ul(args, &option)) { + cERROR(1, "%s: Invalid rsize value", + __func__); + goto cifs_parse_mount_err; + } + vol->rsize = option; + break; + case Opt_wsize: + if (get_option_ul(args, &option)) { + cERROR(1, "%s: Invalid wsize value", + __func__); + goto cifs_parse_mount_err; + } + vol->wsize = option; + break; + case Opt_actimeo: + if (get_option_ul(args, &option)) { + cERROR(1, "%s: Invalid actimeo value", + __func__); + goto cifs_parse_mount_err; + } + vol->actimeo = HZ * option; + if (vol->actimeo > CIFS_MAX_ACTIMEO) { + cERROR(1, "CIFS: attribute cache" + "timeout too large"); + goto cifs_parse_mount_err; + } + break; + + /* String Arguments */ + + case Opt_user: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (!*string) { + /* null user, ie. anonymous authentication */ + vol->nullauth = 1; + } else if (strnlen(string, MAX_USERNAME_SIZE) > + MAX_USERNAME_SIZE) { + printk(KERN_WARNING "CIFS: username too long\n"); + goto cifs_parse_mount_err; + } + vol->username = kstrdup(string, GFP_KERNEL); + if (!vol->username) { + printk(KERN_WARNING "CIFS: no memory " + "for username\n"); + goto cifs_parse_mount_err; + } + break; + case Opt_blank_pass: + vol->password = NULL; + break; + case Opt_pass: + /* passwords have to be handled differently + * to allow the character used for deliminator + * to be passed within them + */ + + /* Obtain the value string */ + value = strchr(data, '='); + if (value != NULL) + *value++ = '\0'; + + /* Set tmp_end to end of the string */ + tmp_end = (char *) value + strlen(value); + + /* Check if following character is the deliminator + * If yes, we have encountered a double deliminator + * reset the NULL character to the deliminator + */ + if (tmp_end < end && tmp_end[1] == delim) + tmp_end[0] = delim; + + /* Keep iterating until we get to a single deliminator + * OR the end + */ + while ((tmp_end = strchr(tmp_end, delim)) != NULL && + (tmp_end[1] == delim)) { + tmp_end = (char *) &tmp_end[2]; + } + + /* Reset var options to point to next element */ + if (tmp_end) { + tmp_end[0] = '\0'; + options = (char *) &tmp_end[1]; + } else + /* Reached the end of the mount option string */ + options = end; + + /* Now build new password string */ + temp_len = strlen(value); + vol->password = kzalloc(temp_len+1, GFP_KERNEL); + if (vol->password == NULL) { + printk(KERN_WARNING "CIFS: no memory " + "for password\n"); goto cifs_parse_mount_err; - strcpy(vol->UNC, devname); - if (strncmp(vol->UNC, "//", 2) == 0) { + } + + for (i = 0, j = 0; i < temp_len; i++, j++) { + vol->password[j] = value[i]; + if ((value[i] == delim) && + value[i+1] == delim) + /* skip the second deliminator */ + i++; + } + vol->password[j] = '\0'; + break; + case Opt_ip: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (!*string) { + vol->UNCip = NULL; + } else if (strnlen(string, INET6_ADDRSTRLEN) > + INET6_ADDRSTRLEN) { + printk(KERN_WARNING "CIFS: ip address " + "too long\n"); + goto cifs_parse_mount_err; + } + vol->UNCip = kstrdup(string, GFP_KERNEL); + if (!vol->UNCip) { + printk(KERN_WARNING "CIFS: no memory " + "for UNC IP\n"); + goto cifs_parse_mount_err; + } + break; + case Opt_unc: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (!*string) { + printk(KERN_WARNING "CIFS: invalid path to " + "network resource\n"); + goto cifs_parse_mount_err; + } + + temp_len = strnlen(string, 300); + if (temp_len == 300) { + printk(KERN_WARNING "CIFS: UNC name too long\n"); + goto cifs_parse_mount_err; + } + + if (strncmp(string, "//", 2) == 0) { vol->UNC[0] = '\\'; vol->UNC[1] = '\\'; - } else if (strncmp(vol->UNC, "\\\\", 2) != 0) { + } else if (strncmp(string, "\\\\", 2) != 0) { printk(KERN_WARNING "CIFS: UNC Path does not " - "begin with // or \\\\ \n"); + "begin with // or \\\\\n"); goto cifs_parse_mount_err; } - value = strpbrk(vol->UNC+2, "/\\"); - if (value) - *value = '\\'; - } else { - printk(KERN_WARNING "CIFS: UNC name too long\n"); + + vol->UNC = kmalloc(temp_len+1, GFP_KERNEL); + if (vol->UNC == NULL) { + printk(KERN_WARNING "CIFS: no memory " + "for UNC\n"); + goto cifs_parse_mount_err; + } + strcpy(vol->UNC, string); + break; + case Opt_domain: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (!*string) { + printk(KERN_WARNING "CIFS: invalid domain" + " name\n"); + goto cifs_parse_mount_err; + } else if (strnlen(string, 256) == 256) { + printk(KERN_WARNING "CIFS: domain name too" + " long\n"); + goto cifs_parse_mount_err; + } + + vol->domainname = kstrdup(string, GFP_KERNEL); + if (!vol->domainname) { + printk(KERN_WARNING "CIFS: no memory " + "for domainname\n"); + goto cifs_parse_mount_err; + } + cFYI(1, "Domain name set"); + break; + case Opt_srcaddr: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (!*string) { + printk(KERN_WARNING "CIFS: srcaddr value not" + " specified\n"); + goto cifs_parse_mount_err; + } else if (!cifs_convert_address( + (struct sockaddr *)&vol->srcaddr, + string, strlen(string))) { + printk(KERN_WARNING "CIFS: Could not parse" + " srcaddr: %s\n", string); + goto cifs_parse_mount_err; + } + break; + case Opt_prefixpath: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (!*string) { + printk(KERN_WARNING "CIFS: Invalid path" + " prefix\n"); + goto cifs_parse_mount_err; + } + temp_len = strnlen(string, 1024); + if (string[0] != '/') + temp_len++; /* missing leading slash */ + if (temp_len > 1024) { + printk(KERN_WARNING "CIFS: prefix too long\n"); + goto cifs_parse_mount_err; + } + + vol->prepath = kmalloc(temp_len+1, GFP_KERNEL); + if (vol->prepath == NULL) { + printk(KERN_WARNING "CIFS: no memory " + "for path prefix\n"); + goto cifs_parse_mount_err; + } + + if (string[0] != '/') { + vol->prepath[0] = '/'; + strcpy(vol->prepath+1, string); + } else + strcpy(vol->prepath, string); + + break; + case Opt_iocharset: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (!*string) { + printk(KERN_WARNING "CIFS: Invalid iocharset" + " specified\n"); + goto cifs_parse_mount_err; + } else if (strnlen(string, 1024) >= 65) { + printk(KERN_WARNING "CIFS: iocharset name " + "too long.\n"); + goto cifs_parse_mount_err; + } + + if (strnicmp(string, "default", 7) != 0) { + vol->iocharset = kstrdup(string, + GFP_KERNEL); + if (!vol->iocharset) { + printk(KERN_WARNING "CIFS: no memory" + "for charset\n"); + goto cifs_parse_mount_err; + } + } + /* if iocharset not set then load_nls_default + * is used by caller + */ + cFYI(1, "iocharset set to %s", string); + break; + case Opt_sockopt: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (!*string) { + printk(KERN_WARNING "CIFS: No socket option" + " specified\n"); + goto cifs_parse_mount_err; + } + if (strnicmp(string, "TCP_NODELAY", 11) == 0) + vol->sockopt_tcp_nodelay = 1; + break; + case Opt_netbiosname: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (!*string) { + printk(KERN_WARNING "CIFS: Invalid (empty)" + " netbiosname\n"); + break; + } + + memset(vol->source_rfc1001_name, 0x20, + RFC1001_NAME_LEN); + /* + * FIXME: are there cases in which a comma can + * be valid in workstation netbios name (and + * need special handling)? + */ + for (i = 0; i < RFC1001_NAME_LEN; i++) { + /* don't ucase netbiosname for user */ + if (string[i] == 0) + break; + vol->source_rfc1001_name[i] = string[i]; + } + /* The string has 16th byte zero still from + * set at top of the function + */ + if (i == RFC1001_NAME_LEN && string[i] != 0) + printk(KERN_WARNING "CIFS: netbiosname" + " longer than 15 truncated.\n"); + + break; + case Opt_servern: + /* servernetbiosname specified override *SMBSERVER */ + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (!*string) { + printk(KERN_WARNING "CIFS: Empty server" + " netbiosname specified\n"); + break; + } + /* last byte, type, is 0x20 for servr type */ + memset(vol->target_rfc1001_name, 0x20, + RFC1001_NAME_LEN_WITH_NULL); + + /* BB are there cases in which a comma can be + valid in this workstation netbios name + (and need special handling)? */ + + /* user or mount helper must uppercase the + netbios name */ + for (i = 0; i < 15; i++) { + if (string[i] == 0) + break; + vol->target_rfc1001_name[i] = string[i]; + } + /* The string has 16th byte zero still from + set at top of the function */ + if (i == RFC1001_NAME_LEN && string[i] != 0) + printk(KERN_WARNING "CIFS: server net" + "biosname longer than 15 truncated.\n"); + break; + case Opt_ver: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (!*string) { + cERROR(1, "no protocol version specified" + " after vers= mount option"); + goto cifs_parse_mount_err; + } + + if (strnicmp(string, "cifs", 4) == 0 || + strnicmp(string, "1", 1) == 0) { + /* This is the default */ + break; + } + /* For all other value, error */ + printk(KERN_WARNING "CIFS: Invalid version" + " specified\n"); goto cifs_parse_mount_err; + case Opt_sec: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + + if (!*string) { + printk(KERN_WARNING "CIFS: no security flavor" + " specified\n"); + break; + } + + if (cifs_parse_security_flavors(string, vol) != 0) + goto cifs_parse_mount_err; + break; + default: + /* + * An option we don't recognize. Save it off for later + * if we haven't already found one + */ + if (!invalid) + invalid = data; + break; } + /* Free up any allocated string */ + kfree(string); + string = NULL; + } + + if (!sloppy && invalid) { + printk(KERN_ERR "CIFS: Unknown mount option \"%s\"\n", invalid); + goto cifs_parse_mount_err; } #ifndef CONFIG_KEYS @@ -1625,7 +1924,10 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, kfree(mountdata_copy); return 0; +out_nomem: + printk(KERN_WARNING "Could not allocate temporary buffer\n"); cifs_parse_mount_err: + kfree(string); kfree(mountdata_copy); return 1; } @@ -1977,7 +2279,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) cifs_fscache_get_client_cookie(tcp_ses); /* queue echo request delayed work */ - queue_delayed_work(system_nrt_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL); + queue_delayed_work(cifsiod_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL); return tcp_ses; @@ -3543,7 +3845,7 @@ remote_path_check: tlink_rb_insert(&cifs_sb->tlink_tree, tlink); spin_unlock(&cifs_sb->tlink_tree_lock); - queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, + queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks, TLINK_IDLE_EXPIRE); mount_fail_check: @@ -4097,6 +4399,6 @@ cifs_prune_tlinks(struct work_struct *work) } spin_unlock(&cifs_sb->tlink_tree_lock); - queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, + queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks, TLINK_IDLE_EXPIRE); } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 159fcc56dc2..460d87b7cda 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1399,7 +1399,10 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) return rc; } -/* update the file size (if needed) after a write */ +/* + * update the file size (if needed) after a write. Should be called with + * the inode->i_lock held + */ void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, unsigned int bytes_written) @@ -1471,7 +1474,9 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, __u32 pid, return rc; } } else { + spin_lock(&dentry->d_inode->i_lock); cifs_update_eof(cifsi, *poffset, bytes_written); + spin_unlock(&dentry->d_inode->i_lock); *poffset += bytes_written; } } @@ -1648,6 +1653,27 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) return rc; } +/* + * Marshal up the iov array, reserving the first one for the header. Also, + * set wdata->bytes. + */ +static void +cifs_writepages_marshal_iov(struct kvec *iov, struct cifs_writedata *wdata) +{ + int i; + struct inode *inode = wdata->cfile->dentry->d_inode; + loff_t size = i_size_read(inode); + + /* marshal up the pages into iov array */ + wdata->bytes = 0; + for (i = 0; i < wdata->nr_pages; i++) { + iov[i + 1].iov_len = min(size - page_offset(wdata->pages[i]), + (loff_t)PAGE_CACHE_SIZE); + iov[i + 1].iov_base = kmap(wdata->pages[i]); + wdata->bytes += iov[i + 1].iov_len; + } +} + static int cifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1684,7 +1710,8 @@ retry: tofind = min((cifs_sb->wsize / PAGE_CACHE_SIZE) - 1, end - index) + 1; - wdata = cifs_writedata_alloc((unsigned int)tofind); + wdata = cifs_writedata_alloc((unsigned int)tofind, + cifs_writev_complete); if (!wdata) { rc = -ENOMEM; break; @@ -1791,6 +1818,7 @@ retry: wdata->sync_mode = wbc->sync_mode; wdata->nr_pages = nr_pages; wdata->offset = page_offset(wdata->pages[0]); + wdata->marshal_iov = cifs_writepages_marshal_iov; do { if (wdata->cfile != NULL) @@ -1802,6 +1830,7 @@ retry: rc = -EBADF; break; } + wdata->pid = wdata->cfile->pid; rc = cifs_async_writev(wdata); } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN); @@ -2043,7 +2072,7 @@ cifs_write_allocate_pages(struct page **pages, unsigned long num_pages) unsigned long i; for (i = 0; i < num_pages; i++) { - pages[i] = alloc_page(__GFP_HIGHMEM); + pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); if (!pages[i]) { /* * save number of pages we have already allocated and @@ -2051,15 +2080,14 @@ cifs_write_allocate_pages(struct page **pages, unsigned long num_pages) */ num_pages = i; rc = -ENOMEM; - goto error; + break; } } - return rc; - -error: - for (i = 0; i < num_pages; i++) - put_page(pages[i]); + if (rc) { + for (i = 0; i < num_pages; i++) + put_page(pages[i]); + } return rc; } @@ -2070,9 +2098,7 @@ size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len) size_t clen; clen = min_t(const size_t, len, wsize); - num_pages = clen / PAGE_CACHE_SIZE; - if (clen % PAGE_CACHE_SIZE) - num_pages++; + num_pages = DIV_ROUND_UP(clen, PAGE_SIZE); if (cur_len) *cur_len = clen; @@ -2080,24 +2106,79 @@ size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len) return num_pages; } +static void +cifs_uncached_marshal_iov(struct kvec *iov, struct cifs_writedata *wdata) +{ + int i; + size_t bytes = wdata->bytes; + + /* marshal up the pages into iov array */ + for (i = 0; i < wdata->nr_pages; i++) { + iov[i + 1].iov_len = min_t(size_t, bytes, PAGE_SIZE); + iov[i + 1].iov_base = kmap(wdata->pages[i]); + bytes -= iov[i + 1].iov_len; + } +} + +static void +cifs_uncached_writev_complete(struct work_struct *work) +{ + int i; + struct cifs_writedata *wdata = container_of(work, + struct cifs_writedata, work); + struct inode *inode = wdata->cfile->dentry->d_inode; + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + spin_lock(&inode->i_lock); + cifs_update_eof(cifsi, wdata->offset, wdata->bytes); + if (cifsi->server_eof > inode->i_size) + i_size_write(inode, cifsi->server_eof); + spin_unlock(&inode->i_lock); + + complete(&wdata->done); + + if (wdata->result != -EAGAIN) { + for (i = 0; i < wdata->nr_pages; i++) + put_page(wdata->pages[i]); + } + + kref_put(&wdata->refcount, cifs_writedata_release); +} + +/* attempt to send write to server, retry on any -EAGAIN errors */ +static int +cifs_uncached_retry_writev(struct cifs_writedata *wdata) +{ + int rc; + + do { + if (wdata->cfile->invalidHandle) { + rc = cifs_reopen_file(wdata->cfile, false); + if (rc != 0) + continue; + } + rc = cifs_async_writev(wdata); + } while (rc == -EAGAIN); + + return rc; +} + static ssize_t cifs_iovec_write(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *poffset) { - unsigned int written; - unsigned long num_pages, npages, i; + unsigned long nr_pages, i; size_t copied, len, cur_len; ssize_t total_written = 0; - struct kvec *to_send; - struct page **pages; + loff_t offset = *poffset; struct iov_iter it; - struct inode *inode; struct cifsFileInfo *open_file; - struct cifs_tcon *pTcon; + struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; - struct cifs_io_parms io_parms; - int xid, rc; - __u32 pid; + struct cifs_writedata *wdata, *tmp; + struct list_head wdata_list; + int rc; + pid_t pid; len = iov_length(iov, nr_segs); if (!len) @@ -2107,103 +2188,103 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, if (rc) return rc; + INIT_LIST_HEAD(&wdata_list); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - num_pages = get_numpages(cifs_sb->wsize, len, &cur_len); - - pages = kmalloc(sizeof(struct pages *)*num_pages, GFP_KERNEL); - if (!pages) - return -ENOMEM; - - to_send = kmalloc(sizeof(struct kvec)*(num_pages + 1), GFP_KERNEL); - if (!to_send) { - kfree(pages); - return -ENOMEM; - } - - rc = cifs_write_allocate_pages(pages, num_pages); - if (rc) { - kfree(pages); - kfree(to_send); - return rc; - } - - xid = GetXid(); open_file = file->private_data; + tcon = tlink_tcon(open_file->tlink); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid; - pTcon = tlink_tcon(open_file->tlink); - inode = file->f_path.dentry->d_inode; - iov_iter_init(&it, iov, nr_segs, len, 0); - npages = num_pages; - do { - size_t save_len = cur_len; - for (i = 0; i < npages; i++) { - copied = min_t(const size_t, cur_len, PAGE_CACHE_SIZE); - copied = iov_iter_copy_from_user(pages[i], &it, 0, - copied); + size_t save_len; + + nr_pages = get_numpages(cifs_sb->wsize, len, &cur_len); + wdata = cifs_writedata_alloc(nr_pages, + cifs_uncached_writev_complete); + if (!wdata) { + rc = -ENOMEM; + break; + } + + rc = cifs_write_allocate_pages(wdata->pages, nr_pages); + if (rc) { + kfree(wdata); + break; + } + + save_len = cur_len; + for (i = 0; i < nr_pages; i++) { + copied = min_t(const size_t, cur_len, PAGE_SIZE); + copied = iov_iter_copy_from_user(wdata->pages[i], &it, + 0, copied); cur_len -= copied; iov_iter_advance(&it, copied); - to_send[i+1].iov_base = kmap(pages[i]); - to_send[i+1].iov_len = copied; } - cur_len = save_len - cur_len; - do { - if (open_file->invalidHandle) { - rc = cifs_reopen_file(open_file, false); - if (rc != 0) - break; - } - io_parms.netfid = open_file->netfid; - io_parms.pid = pid; - io_parms.tcon = pTcon; - io_parms.offset = *poffset; - io_parms.length = cur_len; - rc = CIFSSMBWrite2(xid, &io_parms, &written, to_send, - npages, 0); - } while (rc == -EAGAIN); - - for (i = 0; i < npages; i++) - kunmap(pages[i]); - - if (written) { - len -= written; - total_written += written; - cifs_update_eof(CIFS_I(inode), *poffset, written); - *poffset += written; - } else if (rc < 0) { - if (!total_written) - total_written = rc; + wdata->sync_mode = WB_SYNC_ALL; + wdata->nr_pages = nr_pages; + wdata->offset = (__u64)offset; + wdata->cfile = cifsFileInfo_get(open_file); + wdata->pid = pid; + wdata->bytes = cur_len; + wdata->marshal_iov = cifs_uncached_marshal_iov; + rc = cifs_uncached_retry_writev(wdata); + if (rc) { + kref_put(&wdata->refcount, cifs_writedata_release); break; } - /* get length and number of kvecs of the next write */ - npages = get_numpages(cifs_sb->wsize, len, &cur_len); + list_add_tail(&wdata->list, &wdata_list); + offset += cur_len; + len -= cur_len; } while (len > 0); - if (total_written > 0) { - spin_lock(&inode->i_lock); - if (*poffset > inode->i_size) - i_size_write(inode, *poffset); - spin_unlock(&inode->i_lock); + /* + * If at least one write was successfully sent, then discard any rc + * value from the later writes. If the other write succeeds, then + * we'll end up returning whatever was written. If it fails, then + * we'll get a new rc value from that. + */ + if (!list_empty(&wdata_list)) + rc = 0; + + /* + * Wait for and collect replies for any successful sends in order of + * increasing offset. Once an error is hit or we get a fatal signal + * while waiting, then return without waiting for any more replies. + */ +restart_loop: + list_for_each_entry_safe(wdata, tmp, &wdata_list, list) { + if (!rc) { + /* FIXME: freezable too? */ + rc = wait_for_completion_killable(&wdata->done); + if (rc) + rc = -EINTR; + else if (wdata->result) + rc = wdata->result; + else + total_written += wdata->bytes; + + /* resend call if it's a retryable error */ + if (rc == -EAGAIN) { + rc = cifs_uncached_retry_writev(wdata); + goto restart_loop; + } + } + list_del_init(&wdata->list); + kref_put(&wdata->refcount, cifs_writedata_release); } - cifs_stats_bytes_written(pTcon, total_written); - mark_inode_dirty_sync(inode); + if (total_written > 0) + *poffset += total_written; - for (i = 0; i < num_pages; i++) - put_page(pages[i]); - kfree(to_send); - kfree(pages); - FreeXid(xid); - return total_written; + cifs_stats_bytes_written(tcon, total_written); + return total_written ? total_written : (ssize_t)rc; } ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index c273c12de98..c29d1aa2c54 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -213,55 +213,62 @@ cifs_small_buf_release(void *buf_to_free) } /* - Find a free multiplex id (SMB mid). Otherwise there could be - mid collisions which might cause problems, demultiplexing the - wrong response to this request. Multiplex ids could collide if - one of a series requests takes much longer than the others, or - if a very large number of long lived requests (byte range - locks or FindNotify requests) are pending. No more than - 64K-1 requests can be outstanding at one time. If no - mids are available, return zero. A future optimization - could make the combination of mids and uid the key we use - to demultiplex on (rather than mid alone). - In addition to the above check, the cifs demultiplex - code already used the command code as a secondary - check of the frame and if signing is negotiated the - response would be discarded if the mid were the same - but the signature was wrong. Since the mid is not put in the - pending queue until later (when it is about to be dispatched) - we do have to limit the number of outstanding requests - to somewhat less than 64K-1 although it is hard to imagine - so many threads being in the vfs at one time. -*/ -__u16 GetNextMid(struct TCP_Server_Info *server) + * Find a free multiplex id (SMB mid). Otherwise there could be + * mid collisions which might cause problems, demultiplexing the + * wrong response to this request. Multiplex ids could collide if + * one of a series requests takes much longer than the others, or + * if a very large number of long lived requests (byte range + * locks or FindNotify requests) are pending. No more than + * 64K-1 requests can be outstanding at one time. If no + * mids are available, return zero. A future optimization + * could make the combination of mids and uid the key we use + * to demultiplex on (rather than mid alone). + * In addition to the above check, the cifs demultiplex + * code already used the command code as a secondary + * check of the frame and if signing is negotiated the + * response would be discarded if the mid were the same + * but the signature was wrong. Since the mid is not put in the + * pending queue until later (when it is about to be dispatched) + * we do have to limit the number of outstanding requests + * to somewhat less than 64K-1 although it is hard to imagine + * so many threads being in the vfs at one time. + */ +__u64 GetNextMid(struct TCP_Server_Info *server) { - __u16 mid = 0; - __u16 last_mid; + __u64 mid = 0; + __u16 last_mid, cur_mid; bool collision; spin_lock(&GlobalMid_Lock); - last_mid = server->CurrentMid; /* we do not want to loop forever */ - server->CurrentMid++; - /* This nested loop looks more expensive than it is. - In practice the list of pending requests is short, - fewer than 50, and the mids are likely to be unique - on the first pass through the loop unless some request - takes longer than the 64 thousand requests before it - (and it would also have to have been a request that - did not time out) */ - while (server->CurrentMid != last_mid) { + + /* mid is 16 bit only for CIFS/SMB */ + cur_mid = (__u16)((server->CurrentMid) & 0xffff); + /* we do not want to loop forever */ + last_mid = cur_mid; + cur_mid++; + + /* + * This nested loop looks more expensive than it is. + * In practice the list of pending requests is short, + * fewer than 50, and the mids are likely to be unique + * on the first pass through the loop unless some request + * takes longer than the 64 thousand requests before it + * (and it would also have to have been a request that + * did not time out). + */ + while (cur_mid != last_mid) { struct mid_q_entry *mid_entry; unsigned int num_mids; collision = false; - if (server->CurrentMid == 0) - server->CurrentMid++; + if (cur_mid == 0) + cur_mid++; num_mids = 0; list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) { ++num_mids; - if (mid_entry->mid == server->CurrentMid && - mid_entry->midState == MID_REQUEST_SUBMITTED) { + if (mid_entry->mid == cur_mid && + mid_entry->mid_state == MID_REQUEST_SUBMITTED) { /* This mid is in use, try a different one */ collision = true; break; @@ -282,10 +289,11 @@ __u16 GetNextMid(struct TCP_Server_Info *server) server->tcpStatus = CifsNeedReconnect; if (!collision) { - mid = server->CurrentMid; + mid = (__u64)cur_mid; + server->CurrentMid = mid; break; } - server->CurrentMid++; + cur_mid++; } spin_unlock(&GlobalMid_Lock); return mid; @@ -420,8 +428,10 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid) } int -checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int total_read) +checkSMB(char *buf, unsigned int total_read) { + struct smb_hdr *smb = (struct smb_hdr *)buf; + __u16 mid = smb->Mid; __u32 rfclen = be32_to_cpu(smb->smb_buf_length); __u32 clc_len; /* calculated length */ cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", @@ -502,8 +512,9 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int total_read) } bool -is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) +is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) { + struct smb_hdr *buf = (struct smb_hdr *)buffer; struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf; struct list_head *tmp, *tmp1, *tmp2; struct cifs_ses *ses; @@ -584,7 +595,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) cifs_set_oplock_level(pCifsInode, pSMB->OplockLevel ? OPLOCK_READ : 0); - queue_work(system_nrt_wq, + queue_work(cifsiod_wq, &netfile->oplock_break); netfile->oplock_break_cancelled = false; @@ -604,16 +615,15 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) } void -dump_smb(struct smb_hdr *smb_buf, int smb_buf_length) +dump_smb(void *buf, int smb_buf_length) { int i, j; char debug_line[17]; - unsigned char *buffer; + unsigned char *buffer = buf; if (traceSMB == 0) return; - buffer = (unsigned char *) smb_buf; for (i = 0, j = 0; i < smb_buf_length; i++, j++) { if (i % 8 == 0) { /* have reached the beginning of line */ diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 73e47e84b61..dd23a321bdd 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -836,8 +836,9 @@ ntstatus_to_dos(__u32 ntstatus, __u8 *eclass, __u16 *ecode) } int -map_smb_to_linux_error(struct smb_hdr *smb, bool logErr) +map_smb_to_linux_error(char *buf, bool logErr) { + struct smb_hdr *smb = (struct smb_hdr *)buf; unsigned int i; int rc = -EIO; /* if transport error smb error may not be set */ __u8 smberrclass; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 310918b6fcb..0961336513d 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -60,8 +60,8 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) memset(temp, 0, sizeof(struct mid_q_entry)); temp->mid = smb_buffer->Mid; /* always LE */ temp->pid = current->pid; - temp->command = smb_buffer->Command; - cFYI(1, "For smb_command %d", temp->command); + temp->command = cpu_to_le16(smb_buffer->Command); + cFYI(1, "For smb_command %d", smb_buffer->Command); /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ /* when mid allocated can be before when sent */ temp->when_alloc = jiffies; @@ -75,7 +75,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) } atomic_inc(&midCount); - temp->midState = MID_REQUEST_ALLOCATED; + temp->mid_state = MID_REQUEST_ALLOCATED; return temp; } @@ -85,9 +85,9 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) #ifdef CONFIG_CIFS_STATS2 unsigned long now; #endif - midEntry->midState = MID_FREE; + midEntry->mid_state = MID_FREE; atomic_dec(&midCount); - if (midEntry->largeBuf) + if (midEntry->large_buf) cifs_buf_release(midEntry->resp_buf); else cifs_small_buf_release(midEntry->resp_buf); @@ -97,8 +97,8 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) something is wrong, unless it is quite a slow link or server */ if ((now - midEntry->when_alloc) > HZ) { if ((cifsFYI & CIFS_TIMER) && - (midEntry->command != SMB_COM_LOCKING_ANDX)) { - printk(KERN_DEBUG " CIFS slow rsp: cmd %d mid %d", + (midEntry->command != cpu_to_le16(SMB_COM_LOCKING_ANDX))) { + printk(KERN_DEBUG " CIFS slow rsp: cmd %d mid %llu", midEntry->command, midEntry->mid); printk(" A: 0x%lx S: 0x%lx R: 0x%lx\n", now - midEntry->when_alloc, @@ -126,11 +126,11 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) int rc = 0; int i = 0; struct msghdr smb_msg; - struct smb_hdr *smb_buffer = iov[0].iov_base; + __be32 *buf_len = (__be32 *)(iov[0].iov_base); unsigned int len = iov[0].iov_len; unsigned int total_len; int first_vec = 0; - unsigned int smb_buf_length = be32_to_cpu(smb_buffer->smb_buf_length); + unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base); struct socket *ssocket = server->ssocket; if (ssocket == NULL) @@ -150,7 +150,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) total_len += iov[i].iov_len; cFYI(1, "Sending smb: total_len %d", total_len); - dump_smb(smb_buffer, len); + dump_smb(iov[0].iov_base, len); i = 0; while (total_len) { @@ -158,24 +158,24 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) n_vec - first_vec, total_len); if ((rc == -ENOSPC) || (rc == -EAGAIN)) { i++; - /* if blocking send we try 3 times, since each can block - for 5 seconds. For nonblocking we have to try more - but wait increasing amounts of time allowing time for - socket to clear. The overall time we wait in either - case to send on the socket is about 15 seconds. - Similarly we wait for 15 seconds for - a response from the server in SendReceive[2] - for the server to send a response back for - most types of requests (except SMB Write - past end of file which can be slow, and - blocking lock operations). NFS waits slightly longer - than CIFS, but this can make it take longer for - nonresponsive servers to be detected and 15 seconds - is more than enough time for modern networks to - send a packet. In most cases if we fail to send - after the retries we will kill the socket and - reconnect which may clear the network problem. - */ + /* + * If blocking send we try 3 times, since each can block + * for 5 seconds. For nonblocking we have to try more + * but wait increasing amounts of time allowing time for + * socket to clear. The overall time we wait in either + * case to send on the socket is about 15 seconds. + * Similarly we wait for 15 seconds for a response from + * the server in SendReceive[2] for the server to send + * a response back for most types of requests (except + * SMB Write past end of file which can be slow, and + * blocking lock operations). NFS waits slightly longer + * than CIFS, but this can make it take longer for + * nonresponsive servers to be detected and 15 seconds + * is more than enough time for modern networks to + * send a packet. In most cases if we fail to send + * after the retries we will kill the socket and + * reconnect which may clear the network problem. + */ if ((i >= 14) || (!server->noblocksnd && (i > 2))) { cERROR(1, "sends on sock %p stuck for 15 seconds", ssocket); @@ -235,9 +235,8 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) else rc = 0; - /* Don't want to modify the buffer as a - side effect of this call. */ - smb_buffer->smb_buf_length = cpu_to_be32(smb_buf_length); + /* Don't want to modify the buffer as a side effect of this call. */ + *buf_len = cpu_to_be32(smb_buf_length); return rc; } @@ -342,13 +341,40 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) int error; error = wait_event_freezekillable(server->response_q, - midQ->midState != MID_REQUEST_SUBMITTED); + midQ->mid_state != MID_REQUEST_SUBMITTED); if (error < 0) return -ERESTARTSYS; return 0; } +static int +cifs_setup_async_request(struct TCP_Server_Info *server, struct kvec *iov, + unsigned int nvec, struct mid_q_entry **ret_mid) +{ + int rc; + struct smb_hdr *hdr = (struct smb_hdr *)iov[0].iov_base; + struct mid_q_entry *mid; + + /* enable signing if server requires it */ + if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + + mid = AllocMidQEntry(hdr, server); + if (mid == NULL) + return -ENOMEM; + + /* put it on the pending_mid_q */ + spin_lock(&GlobalMid_Lock); + list_add_tail(&mid->qhead, &server->pending_mid_q); + spin_unlock(&GlobalMid_Lock); + + rc = cifs_sign_smb2(iov, nvec, server, &mid->sequence_number); + if (rc) + delete_mid(mid); + *ret_mid = mid; + return rc; +} /* * Send a SMB request and set the callback function in the mid to handle @@ -361,40 +387,24 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, { int rc; struct mid_q_entry *mid; - struct smb_hdr *hdr = (struct smb_hdr *)iov[0].iov_base; rc = wait_for_free_request(server, ignore_pend ? CIFS_ASYNC_OP : 0); if (rc) return rc; - /* enable signing if server requires it */ - if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; - mutex_lock(&server->srv_mutex); - mid = AllocMidQEntry(hdr, server); - if (mid == NULL) { + rc = cifs_setup_async_request(server, iov, nvec, &mid); + if (rc) { mutex_unlock(&server->srv_mutex); cifs_add_credits(server, 1); wake_up(&server->request_q); - return -ENOMEM; - } - - /* put it on the pending_mid_q */ - spin_lock(&GlobalMid_Lock); - list_add_tail(&mid->qhead, &server->pending_mid_q); - spin_unlock(&GlobalMid_Lock); - - rc = cifs_sign_smb2(iov, nvec, server, &mid->sequence_number); - if (rc) { - mutex_unlock(&server->srv_mutex); - goto out_err; + return rc; } mid->receive = receive; mid->callback = callback; mid->callback_data = cbdata; - mid->midState = MID_REQUEST_SUBMITTED; + mid->mid_state = MID_REQUEST_SUBMITTED; cifs_in_send_inc(server); rc = smb_sendv(server, iov, nvec); @@ -424,14 +434,14 @@ out_err: */ int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses, - struct smb_hdr *in_buf, int flags) + char *in_buf, int flags) { int rc; struct kvec iov[1]; int resp_buf_type; - iov[0].iov_base = (char *)in_buf; - iov[0].iov_len = be32_to_cpu(in_buf->smb_buf_length) + 4; + iov[0].iov_base = in_buf; + iov[0].iov_len = get_rfc1002_length(in_buf) + 4; flags |= CIFS_NO_RESP; rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags); cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc); @@ -444,11 +454,11 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) { int rc = 0; - cFYI(1, "%s: cmd=%d mid=%d state=%d", __func__, mid->command, - mid->mid, mid->midState); + cFYI(1, "%s: cmd=%d mid=%llu state=%d", __func__, + le16_to_cpu(mid->command), mid->mid, mid->mid_state); spin_lock(&GlobalMid_Lock); - switch (mid->midState) { + switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: spin_unlock(&GlobalMid_Lock); return rc; @@ -463,8 +473,8 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) break; default: list_del_init(&mid->qhead); - cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__, - mid->mid, mid->midState); + cERROR(1, "%s: invalid mid state mid=%llu state=%d", __func__, + mid->mid, mid->mid_state); rc = -EIO; } spin_unlock(&GlobalMid_Lock); @@ -514,7 +524,7 @@ int cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error) { - unsigned int len = be32_to_cpu(mid->resp_buf->smb_buf_length) + 4; + unsigned int len = get_rfc1002_length(mid->resp_buf) + 4; dump_smb(mid->resp_buf, min_t(u32, 92, len)); @@ -534,6 +544,24 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, return map_smb_to_linux_error(mid->resp_buf, log_error); } +static int +cifs_setup_request(struct cifs_ses *ses, struct kvec *iov, + unsigned int nvec, struct mid_q_entry **ret_mid) +{ + int rc; + struct smb_hdr *hdr = (struct smb_hdr *)iov[0].iov_base; + struct mid_q_entry *mid; + + rc = allocate_mid(ses, hdr, &mid); + if (rc) + return rc; + rc = cifs_sign_smb2(iov, nvec, ses->server, &mid->sequence_number); + if (rc) + delete_mid(mid); + *ret_mid = mid; + return rc; +} + int SendReceive2(const unsigned int xid, struct cifs_ses *ses, struct kvec *iov, int n_vec, int *pRespBufType /* ret */, @@ -542,55 +570,53 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, int rc = 0; int long_op; struct mid_q_entry *midQ; - struct smb_hdr *in_buf = iov[0].iov_base; + char *buf = iov[0].iov_base; long_op = flags & CIFS_TIMEOUT_MASK; *pRespBufType = CIFS_NO_BUFFER; /* no response buf yet */ if ((ses == NULL) || (ses->server == NULL)) { - cifs_small_buf_release(in_buf); + cifs_small_buf_release(buf); cERROR(1, "Null session"); return -EIO; } if (ses->server->tcpStatus == CifsExiting) { - cifs_small_buf_release(in_buf); + cifs_small_buf_release(buf); return -ENOENT; } - /* Ensure that we do not send more than 50 overlapping requests - to the same server. We may make this configurable later or - use ses->maxReq */ + /* + * Ensure that we do not send more than 50 overlapping requests + * to the same server. We may make this configurable later or + * use ses->maxReq. + */ rc = wait_for_free_request(ses->server, long_op); if (rc) { - cifs_small_buf_release(in_buf); + cifs_small_buf_release(buf); return rc; } - /* make sure that we sign in the same order that we send on this socket - and avoid races inside tcp sendmsg code that could cause corruption - of smb data */ + /* + * Make sure that we sign in the same order that we send on this socket + * and avoid races inside tcp sendmsg code that could cause corruption + * of smb data. + */ mutex_lock(&ses->server->srv_mutex); - rc = allocate_mid(ses, in_buf, &midQ); + rc = cifs_setup_request(ses, iov, n_vec, &midQ); if (rc) { mutex_unlock(&ses->server->srv_mutex); - cifs_small_buf_release(in_buf); + cifs_small_buf_release(buf); /* Update # of requests on wire to server */ cifs_add_credits(ses->server, 1); return rc; } - rc = cifs_sign_smb2(iov, n_vec, ses->server, &midQ->sequence_number); - if (rc) { - mutex_unlock(&ses->server->srv_mutex); - cifs_small_buf_release(in_buf); - goto out; - } - midQ->midState = MID_REQUEST_SUBMITTED; + midQ->mid_state = MID_REQUEST_SUBMITTED; cifs_in_send_inc(ses->server); rc = smb_sendv(ses->server, iov, n_vec); cifs_in_send_dec(ses->server); @@ -599,30 +625,30 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, mutex_unlock(&ses->server->srv_mutex); if (rc < 0) { - cifs_small_buf_release(in_buf); + cifs_small_buf_release(buf); goto out; } if (long_op == CIFS_ASYNC_OP) { - cifs_small_buf_release(in_buf); + cifs_small_buf_release(buf); goto out; } rc = wait_for_response(ses->server, midQ); if (rc != 0) { - send_nt_cancel(ses->server, in_buf, midQ); + send_nt_cancel(ses->server, (struct smb_hdr *)buf, midQ); spin_lock(&GlobalMid_Lock); - if (midQ->midState == MID_REQUEST_SUBMITTED) { + if (midQ->mid_state == MID_REQUEST_SUBMITTED) { midQ->callback = DeleteMidQEntry; spin_unlock(&GlobalMid_Lock); - cifs_small_buf_release(in_buf); + cifs_small_buf_release(buf); cifs_add_credits(ses->server, 1); return rc; } spin_unlock(&GlobalMid_Lock); } - cifs_small_buf_release(in_buf); + cifs_small_buf_release(buf); rc = cifs_sync_mid_result(midQ, ses->server); if (rc != 0) { @@ -630,15 +656,16 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, return rc; } - if (!midQ->resp_buf || midQ->midState != MID_RESPONSE_RECEIVED) { + if (!midQ->resp_buf || midQ->mid_state != MID_RESPONSE_RECEIVED) { rc = -EIO; cFYI(1, "Bad MID state?"); goto out; } - iov[0].iov_base = (char *)midQ->resp_buf; - iov[0].iov_len = be32_to_cpu(midQ->resp_buf->smb_buf_length) + 4; - if (midQ->largeBuf) + buf = (char *)midQ->resp_buf; + iov[0].iov_base = buf; + iov[0].iov_len = get_rfc1002_length(buf) + 4; + if (midQ->large_buf) *pRespBufType = CIFS_LARGE_BUFFER; else *pRespBufType = CIFS_SMALL_BUFFER; @@ -710,7 +737,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, goto out; } - midQ->midState = MID_REQUEST_SUBMITTED; + midQ->mid_state = MID_REQUEST_SUBMITTED; cifs_in_send_inc(ses->server); rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); @@ -728,7 +755,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, if (rc != 0) { send_nt_cancel(ses->server, in_buf, midQ); spin_lock(&GlobalMid_Lock); - if (midQ->midState == MID_REQUEST_SUBMITTED) { + if (midQ->mid_state == MID_REQUEST_SUBMITTED) { /* no longer considered to be "in-flight" */ midQ->callback = DeleteMidQEntry; spin_unlock(&GlobalMid_Lock); @@ -745,13 +772,13 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, } if (!midQ->resp_buf || !out_buf || - midQ->midState != MID_RESPONSE_RECEIVED) { + midQ->mid_state != MID_RESPONSE_RECEIVED) { rc = -EIO; cERROR(1, "Bad MID state?"); goto out; } - *pbytes_returned = be32_to_cpu(midQ->resp_buf->smb_buf_length); + *pbytes_returned = get_rfc1002_length(midQ->resp_buf); memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4); rc = cifs_check_receive(midQ, ses->server, 0); out: @@ -844,7 +871,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, return rc; } - midQ->midState = MID_REQUEST_SUBMITTED; + midQ->mid_state = MID_REQUEST_SUBMITTED; cifs_in_send_inc(ses->server); rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); cifs_in_send_dec(ses->server); @@ -858,13 +885,13 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, /* Wait for a reply - allow signals to interrupt. */ rc = wait_event_interruptible(ses->server->response_q, - (!(midQ->midState == MID_REQUEST_SUBMITTED)) || + (!(midQ->mid_state == MID_REQUEST_SUBMITTED)) || ((ses->server->tcpStatus != CifsGood) && (ses->server->tcpStatus != CifsNew))); /* Were we interrupted by a signal ? */ if ((rc == -ERESTARTSYS) && - (midQ->midState == MID_REQUEST_SUBMITTED) && + (midQ->mid_state == MID_REQUEST_SUBMITTED) && ((ses->server->tcpStatus == CifsGood) || (ses->server->tcpStatus == CifsNew))) { @@ -894,7 +921,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, if (rc) { send_nt_cancel(ses->server, in_buf, midQ); spin_lock(&GlobalMid_Lock); - if (midQ->midState == MID_REQUEST_SUBMITTED) { + if (midQ->mid_state == MID_REQUEST_SUBMITTED) { /* no longer considered to be "in-flight" */ midQ->callback = DeleteMidQEntry; spin_unlock(&GlobalMid_Lock); @@ -912,13 +939,13 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, return rc; /* rcvd frame is ok */ - if (out_buf == NULL || midQ->midState != MID_RESPONSE_RECEIVED) { + if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_RECEIVED) { rc = -EIO; cERROR(1, "Bad MID state?"); goto out; } - *pbytes_returned = be32_to_cpu(midQ->resp_buf->smb_buf_length); + *pbytes_returned = get_rfc1002_length(midQ->resp_buf); memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4); rc = cifs_check_receive(midQ, ses->server, 0); out: diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 05156c17b55..2870597b5c9 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -21,7 +21,6 @@ #include <linux/vfs.h> #include <linux/slab.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <linux/fs.h> diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 8f616e0e252..761d5b31b18 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -38,7 +38,6 @@ #include <linux/mutex.h> #include <linux/device.h> #include <asm/io.h> -#include <asm/system.h> #include <asm/poll.h> #include <asm/uaccess.h> diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 9727e0c5257..0c68fd31fbf 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -14,7 +14,6 @@ * improvements to the Coda project. Contact Peter Braam <coda@cs.cmu.edu>. */ -#include <asm/system.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/types.h> diff --git a/fs/compat.c b/fs/compat.c index 14483a715bb..f2944ace7a7 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1170,10 +1170,9 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, } asmlinkage ssize_t -compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_low, u32 pos_high) +compat_sys_preadv64(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen, loff_t pos) { - loff_t pos = ((loff_t)pos_high << 32) | pos_low; struct file *file; int fput_needed; ssize_t ret; @@ -1190,6 +1189,14 @@ compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, return ret; } +asmlinkage ssize_t +compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_low, u32 pos_high) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + return compat_sys_preadv64(fd, vec, vlen, pos); +} + static size_t compat_writev(struct file *file, const struct compat_iovec __user *vec, unsigned long vlen, loff_t *pos) @@ -1229,10 +1236,9 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, } asmlinkage ssize_t -compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_low, u32 pos_high) +compat_sys_pwritev64(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen, loff_t pos) { - loff_t pos = ((loff_t)pos_high << 32) | pos_low; struct file *file; int fput_needed; ssize_t ret; @@ -1249,6 +1255,14 @@ compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, return ret; } +asmlinkage ssize_t +compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_low, u32 pos_high) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + return compat_sys_pwritev64(fd, vec, vlen, pos); +} + asmlinkage long compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32, unsigned int nr_segs, unsigned int flags) diff --git a/fs/dcache.c b/fs/dcache.c index e9a07b2a094..b60ddc41d78 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2404,6 +2404,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) if (d_ancestor(alias, dentry)) { /* Check for loops */ actual = ERR_PTR(-ELOOP); + spin_unlock(&inode->i_lock); } else if (IS_ROOT(alias)) { /* Is this an anonymous mountpoint that we * could splice into our tree? */ @@ -2413,7 +2414,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) goto found; } else { /* Nope, but we must(!) avoid directory - * aliasing */ + * aliasing. This drops inode->i_lock */ actual = __d_unalias(inode, dentry, alias); } write_sequnlock(&rename_lock); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 629e9ed99d0..739b0985b39 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -34,7 +34,6 @@ #include <linux/mutex.h> #include <linux/anon_inodes.h> #include <asm/uaccess.h> -#include <asm/system.h> #include <asm/io.h> #include <asm/mman.h> #include <linux/atomic.h> diff --git a/fs/exec.c b/fs/exec.c index 644f6c4eb60..b1fd2025e59 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -59,6 +59,7 @@ #include <asm/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> +#include <asm/exec.h> #include <trace/events/task.h> #include "internal.h" @@ -1027,10 +1028,10 @@ static void flush_old_files(struct files_struct * files) fdt = files_fdtable(files); if (i >= fdt->max_fds) break; - set = fdt->close_on_exec->fds_bits[j]; + set = fdt->close_on_exec[j]; if (!set) continue; - fdt->close_on_exec->fds_bits[j] = 0; + fdt->close_on_exec[j] = 0; spin_unlock(&files->file_lock); for ( ; set ; i++,set >>= 1) { if (set & 1) { @@ -2067,8 +2068,8 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) fd_install(0, rp); spin_lock(&cf->file_lock); fdt = files_fdtable(cf); - FD_SET(0, fdt->open_fds); - FD_CLR(0, fdt->close_on_exec); + __set_open_fd(0, fdt); + __clear_close_on_exec(0, fdt); spin_unlock(&cf->file_lock); /* and disallow core files too */ diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 7f2b590a36b..735ca06430a 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -389,7 +389,7 @@ static int exofs_sync_fs(struct super_block *sb, int wait) ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); memset(fscb, 0, ios->length); fscb->s_nextid = cpu_to_le64(sbi->s_nextid); - fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles); + fscb->s_numfiles = cpu_to_le64(sbi->s_numfiles); fscb->s_magic = cpu_to_le16(sb->s_magic); fscb->s_newfs = 0; fscb->s_version = EXOFS_FSCB_VER; @@ -529,7 +529,8 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, struct osd_dev_info *odi) { odi->systemid_len = le32_to_cpu(dt_dev->systemid_len); - memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len); + if (likely(odi->systemid_len)) + memcpy(odi->systemid, dt_dev->systemid, OSD_SYSTEMID_LEN); odi->osdname_len = le32_to_cpu(dt_dev->osdname_len); odi->osdname = dt_dev->osdname; @@ -565,7 +566,7 @@ int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, aoded = kzalloc(sizeof(*aoded), GFP_KERNEL); if (unlikely(!aoded)) { - EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", + EXOFS_ERR("ERROR: failed allocating Device array[%d]\n", numdevs); return -ENOMEM; } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 75ad433c669..0b2b4db5bdc 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -1,5 +1,636 @@ +/* + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ #include <linux/fs.h> #include <linux/ext2_fs.h> +#include <linux/blockgroup_lock.h> +#include <linux/percpu_counter.h> +#include <linux/rbtree.h> + +/* XXX Here for now... not interested in restructing headers JUST now */ + +/* data type for block offset of block group */ +typedef int ext2_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long ext2_fsblk_t; + +#define E2FSBLK "%lu" + +struct ext2_reserve_window { + ext2_fsblk_t _rsv_start; /* First byte reserved */ + ext2_fsblk_t _rsv_end; /* Last byte reserved or 0 */ +}; + +struct ext2_reserve_window_node { + struct rb_node rsv_node; + __u32 rsv_goal_size; + __u32 rsv_alloc_hit; + struct ext2_reserve_window rsv_window; +}; + +struct ext2_block_alloc_info { + /* information about reservation window */ + struct ext2_reserve_window_node rsv_window_node; + /* + * was i_next_alloc_block in ext2_inode_info + * is the logical (file-relative) number of the + * most-recently-allocated block in this file. + * We use this for detecting linearly ascending allocation requests. + */ + __u32 last_alloc_logical_block; + /* + * Was i_next_alloc_goal in ext2_inode_info + * is the *physical* companion to i_next_alloc_block. + * it the the physical block number of the block which was most-recentl + * allocated to this file. This give us the goal (target) for the next + * allocation when we detect linearly ascending requests. + */ + ext2_fsblk_t last_alloc_physical_block; +}; + +#define rsv_start rsv_window._rsv_start +#define rsv_end rsv_window._rsv_end + +/* + * second extended-fs super-block data in memory + */ +struct ext2_sb_info { + unsigned long s_frag_size; /* Size of a fragment in bytes */ + unsigned long s_frags_per_block;/* Number of fragments per block */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_frags_per_group;/* Number of fragments in a group */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + unsigned long s_groups_count; /* Number of groups in the fs */ + unsigned long s_overhead_last; /* Last calculated overhead */ + unsigned long s_blocks_last; /* Last seen block count */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext2_super_block * s_es; /* Pointer to the super block in the buffer */ + struct buffer_head ** s_group_desc; + unsigned long s_mount_opt; + unsigned long s_sb_block; + uid_t s_resuid; + gid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + unsigned long s_dir_count; + u8 *s_debts; + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct blockgroup_lock *s_blockgroup_lock; + /* root of the per fs reservation window tree */ + spinlock_t s_rsv_window_lock; + struct rb_root s_rsv_window_root; + struct ext2_reserve_window_node s_rsv_window_head; + /* + * s_lock protects against concurrent modifications of s_mount_state, + * s_blocks_last, s_overhead_last and the content of superblock's + * buffer pointed to by sbi->s_es. + * + * Note: It is used in ext2_show_options() to provide a consistent view + * of the mount options. + */ + spinlock_t s_lock; +}; + +static inline spinlock_t * +sb_bgl_lock(struct ext2_sb_info *sbi, unsigned int block_group) +{ + return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); +} + +/* + * Define EXT2FS_DEBUG to produce debug messages + */ +#undef EXT2FS_DEBUG + +/* + * Define EXT2_RESERVATION to reserve data blocks for expanding files + */ +#define EXT2_DEFAULT_RESERVE_BLOCKS 8 +/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */ +#define EXT2_MAX_RESERVE_BLOCKS 1027 +#define EXT2_RESERVE_WINDOW_NOT_ALLOCATED 0 +/* + * The second extended file system version + */ +#define EXT2FS_DATE "95/08/09" +#define EXT2FS_VERSION "0.5b" + +/* + * Debug code + */ +#ifdef EXT2FS_DEBUG +# define ext2_debug(f, a...) { \ + printk ("EXT2-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk (f, ## a); \ + } +#else +# define ext2_debug(f, a...) /**/ +#endif + +/* + * Special inode numbers + */ +#define EXT2_BAD_INO 1 /* Bad blocks inode */ +#define EXT2_ROOT_INO 2 /* Root inode */ +#define EXT2_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT2_UNDEL_DIR_INO 6 /* Undelete directory inode */ + +/* First non-reserved inode for old ext2 filesystems */ +#define EXT2_GOOD_OLD_FIRST_INO 11 + +static inline struct ext2_sb_info *EXT2_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT2_MIN_BLOCK_SIZE 1024 +#define EXT2_MAX_BLOCK_SIZE 4096 +#define EXT2_MIN_BLOCK_LOG_SIZE 10 +#define EXT2_BLOCK_SIZE(s) ((s)->s_blocksize) +#define EXT2_ADDR_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (__u32)) +#define EXT2_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +#define EXT2_ADDR_PER_BLOCK_BITS(s) (EXT2_SB(s)->s_addr_per_block_bits) +#define EXT2_INODE_SIZE(s) (EXT2_SB(s)->s_inode_size) +#define EXT2_FIRST_INO(s) (EXT2_SB(s)->s_first_ino) + +/* + * Macro-instructions used to manage fragments + */ +#define EXT2_MIN_FRAG_SIZE 1024 +#define EXT2_MAX_FRAG_SIZE 4096 +#define EXT2_MIN_FRAG_LOG_SIZE 10 +#define EXT2_FRAG_SIZE(s) (EXT2_SB(s)->s_frag_size) +#define EXT2_FRAGS_PER_BLOCK(s) (EXT2_SB(s)->s_frags_per_block) + +/* + * Structure of a blocks group descriptor + */ +struct ext2_group_desc +{ + __le32 bg_block_bitmap; /* Blocks bitmap block */ + __le32 bg_inode_bitmap; /* Inodes bitmap block */ + __le32 bg_inode_table; /* Inodes table block */ + __le16 bg_free_blocks_count; /* Free blocks count */ + __le16 bg_free_inodes_count; /* Free inodes count */ + __le16 bg_used_dirs_count; /* Directories count */ + __le16 bg_pad; + __le32 bg_reserved[3]; +}; + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT2_BLOCKS_PER_GROUP(s) (EXT2_SB(s)->s_blocks_per_group) +#define EXT2_DESC_PER_BLOCK(s) (EXT2_SB(s)->s_desc_per_block) +#define EXT2_INODES_PER_GROUP(s) (EXT2_SB(s)->s_inodes_per_group) +#define EXT2_DESC_PER_BLOCK_BITS(s) (EXT2_SB(s)->s_desc_per_block_bits) + +/* + * Constants relative to the data blocks + */ +#define EXT2_NDIR_BLOCKS 12 +#define EXT2_IND_BLOCK EXT2_NDIR_BLOCKS +#define EXT2_DIND_BLOCK (EXT2_IND_BLOCK + 1) +#define EXT2_TIND_BLOCK (EXT2_DIND_BLOCK + 1) +#define EXT2_N_BLOCKS (EXT2_TIND_BLOCK + 1) + +/* + * Inode flags (GETFLAGS/SETFLAGS) + */ +#define EXT2_SECRM_FL FS_SECRM_FL /* Secure deletion */ +#define EXT2_UNRM_FL FS_UNRM_FL /* Undelete */ +#define EXT2_COMPR_FL FS_COMPR_FL /* Compress file */ +#define EXT2_SYNC_FL FS_SYNC_FL /* Synchronous updates */ +#define EXT2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */ +#define EXT2_APPEND_FL FS_APPEND_FL /* writes to file may only append */ +#define EXT2_NODUMP_FL FS_NODUMP_FL /* do not dump file */ +#define EXT2_NOATIME_FL FS_NOATIME_FL /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT2_DIRTY_FL FS_DIRTY_FL +#define EXT2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */ +#define EXT2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */ +#define EXT2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */ +/* End compression flags --- maybe not all used */ +#define EXT2_BTREE_FL FS_BTREE_FL /* btree format dir */ +#define EXT2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */ +#define EXT2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */ +#define EXT2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */ +#define EXT2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */ +#define EXT2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */ +#define EXT2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/ +#define EXT2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */ + +#define EXT2_FL_USER_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */ +#define EXT2_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */ + +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT2_FL_INHERITED (EXT2_SECRM_FL | EXT2_UNRM_FL | EXT2_COMPR_FL |\ + EXT2_SYNC_FL | EXT2_NODUMP_FL |\ + EXT2_NOATIME_FL | EXT2_COMPRBLK_FL |\ + EXT2_NOCOMP_FL | EXT2_JOURNAL_DATA_FL |\ + EXT2_NOTAIL_FL | EXT2_DIRSYNC_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT2_REG_FLMASK (~(EXT2_DIRSYNC_FL | EXT2_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT2_OTHER_FLMASK (EXT2_NODUMP_FL | EXT2_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext2_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT2_REG_FLMASK; + else + return flags & EXT2_OTHER_FLMASK; +} + +/* + * ioctl commands + */ +#define EXT2_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT2_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT2_IOC_GETVERSION FS_IOC_GETVERSION +#define EXT2_IOC_SETVERSION FS_IOC_SETVERSION +#define EXT2_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT2_IOC_SETRSVSZ _IOW('f', 6, long) + +/* + * ioctl commands in 32 bit emulation + */ +#define EXT2_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT2_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT2_IOC32_GETVERSION FS_IOC32_GETVERSION +#define EXT2_IOC32_SETVERSION FS_IOC32_SETVERSION + +/* + * Structure of an inode on the disk + */ +struct ext2_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Creation time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_reserved1; + } linux1; + struct { + __le32 h_i_translator; + } hurd1; + struct { + __le32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT2_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl; /* File ACL */ + __le32 i_dir_acl; /* Directory ACL */ + __le32 i_faddr; /* Fragment address */ + union { + struct { + __u8 l_i_frag; /* Fragment number */ + __u8 l_i_fsize; /* Fragment size */ + __u16 i_pad1; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; + } linux2; + struct { + __u8 h_i_frag; /* Fragment number */ + __u8 h_i_fsize; /* Fragment size */ + __le16 h_i_mode_high; + __le16 h_i_uid_high; + __le16 h_i_gid_high; + __le32 h_i_author; + } hurd2; + struct { + __u8 m_i_frag; /* Fragment number */ + __u8 m_i_fsize; /* Fragment size */ + __u16 m_pad1; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ +}; + +#define i_size_high i_dir_acl + +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_frag osd2.linux2.l_i_frag +#define i_fsize osd2.linux2.l_i_fsize +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_reserved2 osd2.linux2.l_i_reserved2 + +/* + * File system states + */ +#define EXT2_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT2_ERROR_FS 0x0002 /* Errors detected */ + +/* + * Mount flags + */ +#define EXT2_MOUNT_CHECK 0x000001 /* Do mount-time checks */ +#define EXT2_MOUNT_OLDALLOC 0x000002 /* Don't use the new Orlov allocator */ +#define EXT2_MOUNT_GRPID 0x000004 /* Create files with directory's group */ +#define EXT2_MOUNT_DEBUG 0x000008 /* Some debugging messages */ +#define EXT2_MOUNT_ERRORS_CONT 0x000010 /* Continue on errors */ +#define EXT2_MOUNT_ERRORS_RO 0x000020 /* Remount fs ro on errors */ +#define EXT2_MOUNT_ERRORS_PANIC 0x000040 /* Panic on errors */ +#define EXT2_MOUNT_MINIX_DF 0x000080 /* Mimics the Minix statfs */ +#define EXT2_MOUNT_NOBH 0x000100 /* No buffer_heads */ +#define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */ +#define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */ +#define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */ +#define EXT2_MOUNT_XIP 0x010000 /* Execute in place */ +#define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */ +#define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */ +#define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */ + + +#define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt +#define set_opt(o, opt) o |= EXT2_MOUNT_##opt +#define test_opt(sb, opt) (EXT2_SB(sb)->s_mount_opt & \ + EXT2_MOUNT_##opt) +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT2_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT2_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT2_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT2_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT2_ERRORS_PANIC 3 /* Panic */ +#define EXT2_ERRORS_DEFAULT EXT2_ERRORS_CONTINUE + +/* + * Structure of the super block + */ +struct ext2_super_block { + __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count; /* Blocks count */ + __le32 s_r_blocks_count; /* Reserved blocks count */ + __le32 s_free_blocks_count; /* Free blocks count */ + __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_frag_size; /* Fragment size */ + __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_frags_per_group; /* # Fragments per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ + __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ + __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ + __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT2_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ + __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ + __u8 s_uuid[16]; /* 128-bit uuid for volume */ + char s_volume_name[16]; /* volume name */ + char s_last_mounted[64]; /* directory where last mounted */ + __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT2_COMPAT_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __u16 s_padding1; + /* + * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set. + */ + __u8 s_journal_uuid[16]; /* uuid of journal superblock */ + __u32 s_journal_inum; /* inode number of journal file */ + __u32 s_journal_dev; /* device number of journal file */ + __u32 s_last_orphan; /* start of list of inodes to delete */ + __u32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_reserved_char_pad; + __u16 s_reserved_word_pad; + __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __u32 s_reserved[190]; /* Padding to the end of the block */ +}; + +/* + * Codes for operating systems + */ +#define EXT2_OS_LINUX 0 +#define EXT2_OS_HURD 1 +#define EXT2_OS_MASIX 2 +#define EXT2_OS_FREEBSD 3 +#define EXT2_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT2_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT2_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT2_CURRENT_REV EXT2_GOOD_OLD_REV +#define EXT2_MAX_SUPP_REV EXT2_DYNAMIC_REV + +#define EXT2_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT2_HAS_COMPAT_FEATURE(sb,mask) \ + ( EXT2_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) +#define EXT2_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ( EXT2_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) +#define EXT2_HAS_INCOMPAT_FEATURE(sb,mask) \ + ( EXT2_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) +#define EXT2_SET_COMPAT_FEATURE(sb,mask) \ + EXT2_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) +#define EXT2_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT2_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) +#define EXT2_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT2_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) +#define EXT2_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT2_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) +#define EXT2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT2_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) +#define EXT2_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT2_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + +#define EXT2_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT2_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT2_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT2_FEATURE_COMPAT_RESIZE_INO 0x0010 +#define EXT2_FEATURE_COMPAT_DIR_INDEX 0x0020 +#define EXT2_FEATURE_COMPAT_ANY 0xffffffff + +#define EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT2_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT2_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT2_FEATURE_RO_COMPAT_ANY 0xffffffff + +#define EXT2_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT2_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 +#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 +#define EXT2_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT2_FEATURE_INCOMPAT_ANY 0xffffffff + +#define EXT2_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR +#define EXT2_FEATURE_INCOMPAT_SUPP (EXT2_FEATURE_INCOMPAT_FILETYPE| \ + EXT2_FEATURE_INCOMPAT_META_BG) +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT2_FEATURE_RO_COMPAT_BTREE_DIR) +#define EXT2_FEATURE_RO_COMPAT_UNSUPPORTED ~EXT2_FEATURE_RO_COMPAT_SUPP +#define EXT2_FEATURE_INCOMPAT_UNSUPPORTED ~EXT2_FEATURE_INCOMPAT_SUPP + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT2_DEF_RESUID 0 +#define EXT2_DEF_RESGID 0 + +/* + * Default mount options + */ +#define EXT2_DEFM_DEBUG 0x0001 +#define EXT2_DEFM_BSDGROUPS 0x0002 +#define EXT2_DEFM_XATTR_USER 0x0004 +#define EXT2_DEFM_ACL 0x0008 +#define EXT2_DEFM_UID16 0x0010 + /* Not used by ext2, but reserved for use by ext3 */ +#define EXT3_DEFM_JMODE 0x0060 +#define EXT3_DEFM_JMODE_DATA 0x0020 +#define EXT3_DEFM_JMODE_ORDERED 0x0040 +#define EXT3_DEFM_JMODE_WBACK 0x0060 + +/* + * Structure of a directory entry + */ + +struct ext2_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[]; /* File name, up to EXT2_NAME_LEN */ +}; + +/* + * The new version of the directory entry. Since EXT2 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext2_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[]; /* File name, up to EXT2_NAME_LEN */ +}; + +/* + * Ext2 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +enum { + EXT2_FT_UNKNOWN = 0, + EXT2_FT_REG_FILE = 1, + EXT2_FT_DIR = 2, + EXT2_FT_CHRDEV = 3, + EXT2_FT_BLKDEV = 4, + EXT2_FT_FIFO = 5, + EXT2_FT_SOCK = 6, + EXT2_FT_SYMLINK = 7, + EXT2_FT_MAX +}; + +/* + * EXT2_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT2_DIR_PAD 4 +#define EXT2_DIR_ROUND (EXT2_DIR_PAD - 1) +#define EXT2_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT2_DIR_ROUND) & \ + ~EXT2_DIR_ROUND) +#define EXT2_MAX_REC_LEN ((1<<16)-1) + +static inline void verify_offsets(void) +{ +#define A(x,y) BUILD_BUG_ON(x != offsetof(struct ext2_super_block, y)); + A(EXT2_SB_MAGIC_OFFSET, s_magic); + A(EXT2_SB_BLOCKS_OFFSET, s_blocks_count); + A(EXT2_SB_BSIZE_OFFSET, s_log_block_size); +#undef A +} /* * ext2 mount options diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index be7a8d02c9a..cfedb2cb0d8 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -3,10 +3,7 @@ * Handler for storing security labels as extended attributes. */ -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/ext2_fs.h> +#include "ext2.h" #include <linux/security.h> #include "xattr.h" diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 2989467d359..7e192574c00 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -5,10 +5,7 @@ * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/string.h> -#include <linux/capability.h> -#include <linux/fs.h> -#include <linux/ext2_fs.h> +#include "ext2.h" #include "xattr.h" static size_t diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c index 322a56b2dfb..1c3312858fc 100644 --- a/fs/ext2/xip.c +++ b/fs/ext2/xip.c @@ -9,8 +9,6 @@ #include <linux/fs.h> #include <linux/genhd.h> #include <linux/buffer_head.h> -#include <linux/ext2_fs_sb.h> -#include <linux/ext2_fs.h> #include <linux/blkdev.h> #include "ext2.h" #include "xip.h" diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 3091f62e55b..c76832c8d19 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -4,13 +4,7 @@ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ -#include <linux/init.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/capability.h> -#include <linux/fs.h> -#include <linux/ext3_jbd.h> -#include <linux/ext3_fs.h> +#include "ext3.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index a2038928f9a..baac1b129fb 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -11,17 +11,9 @@ * David S. Miller (davem@caip.rutgers.edu), 1995 */ -#include <linux/time.h> -#include <linux/capability.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> #include <linux/quotaops.h> -#include <linux/buffer_head.h> #include <linux/blkdev.h> -#include <trace/events/ext3.h> +#include "ext3.h" /* * balloc.c contains the blocks allocation and deallocation routines @@ -1743,8 +1735,11 @@ allocated: *errp = 0; brelse(bitmap_bh); - dquot_free_block(inode, *count-num); - *count = num; + + if (num < *count) { + dquot_free_block(inode, *count-num); + *count = num; + } trace_ext3_allocate_blocks(inode, goal, num, (unsigned long long)ret_block); @@ -1970,7 +1965,7 @@ static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, sbi = EXT3_SB(sb); /* Walk through the whole group */ - while (start < max) { + while (start <= max) { start = bitmap_search_next_usable_block(start, bitmap_bh, max); if (start < 0) break; @@ -1980,7 +1975,7 @@ static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, * Allocate contiguous free extents by setting bits in the * block bitmap */ - while (next < max + while (next <= max && claim_block(sb_bgl_lock(sbi, group), next, bitmap_bh)) { next++; @@ -2091,73 +2086,74 @@ err_out: */ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) { - ext3_grpblk_t last_block, first_block, free_blocks; - unsigned long first_group, last_group; - unsigned long group, ngroups; + ext3_grpblk_t last_block, first_block; + unsigned long group, first_group, last_group; struct ext3_group_desc *gdp; struct ext3_super_block *es = EXT3_SB(sb)->s_es; - uint64_t start, len, minlen, trimmed; + uint64_t start, minlen, end, trimmed = 0; + ext3_fsblk_t first_data_blk = + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count); int ret = 0; - start = (range->start >> sb->s_blocksize_bits) + - le32_to_cpu(es->s_first_data_block); - len = range->len >> sb->s_blocksize_bits; + start = range->start >> sb->s_blocksize_bits; + end = start + (range->len >> sb->s_blocksize_bits) - 1; minlen = range->minlen >> sb->s_blocksize_bits; - trimmed = 0; - if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb))) + if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)) || + unlikely(start >= max_blks)) return -EINVAL; - if (start >= max_blks) - return -EINVAL; - if (start + len > max_blks) - len = max_blks - start; + if (end >= max_blks) + end = max_blks - 1; + if (end <= first_data_blk) + goto out; + if (start < first_data_blk) + start = first_data_blk; - ngroups = EXT3_SB(sb)->s_groups_count; smp_rmb(); /* Determine first and last group to examine based on start and len */ ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start, &first_group, &first_block); - ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len), + ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) end, &last_group, &last_block); - last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; - last_block = EXT3_BLOCKS_PER_GROUP(sb); - if (first_group > last_group) - return -EINVAL; + /* end now represents the last block to discard in this group */ + end = EXT3_BLOCKS_PER_GROUP(sb) - 1; for (group = first_group; group <= last_group; group++) { gdp = ext3_get_group_desc(sb, group, NULL); if (!gdp) break; - free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); - if (free_blocks < minlen) - continue; - /* * For all the groups except the last one, last block will - * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to - * change it for the last group in which case first_block + - * len < EXT3_BLOCKS_PER_GROUP(sb). + * always be EXT3_BLOCKS_PER_GROUP(sb)-1, so we only need to + * change it for the last group, note that last_block is + * already computed earlier by ext3_get_group_no_and_offset() */ - if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb)) - last_block = first_block + len; - len -= last_block - first_block; + if (group == last_group) + end = last_block; - ret = ext3_trim_all_free(sb, group, first_block, - last_block, minlen); - if (ret < 0) - break; + if (le16_to_cpu(gdp->bg_free_blocks_count) >= minlen) { + ret = ext3_trim_all_free(sb, group, first_block, + end, minlen); + if (ret < 0) + break; + trimmed += ret; + } - trimmed += ret; + /* + * For every group except the first one, we are sure + * that the first block to discard will be block #0. + */ first_block = 0; } - if (ret >= 0) + if (ret > 0) ret = 0; - range->len = trimmed * sb->s_blocksize; +out: + range->len = trimmed * sb->s_blocksize; return ret; } diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c index 6afc39d8025..909d13e2656 100644 --- a/fs/ext3/bitmap.c +++ b/fs/ext3/bitmap.c @@ -7,9 +7,7 @@ * Universite Pierre et Marie Curie (Paris VI) */ -#include <linux/buffer_head.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> +#include "ext3.h" #ifdef EXT3FS_DEBUG diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 34f0a072b93..cc761ad8fa5 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -21,12 +21,7 @@ * */ -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> -#include <linux/buffer_head.h> -#include <linux/slab.h> -#include <linux/rbtree.h> +#include "ext3.h" static unsigned char ext3_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h new file mode 100644 index 00000000000..b6515fd7e56 --- /dev/null +++ b/fs/ext3/ext3.h @@ -0,0 +1,1322 @@ +/* + * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 + * + * Copyright 1998--1999 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include <linux/fs.h> +#include <linux/jbd.h> +#include <linux/magic.h> +#include <linux/bug.h> +#include <linux/blockgroup_lock.h> + +/* + * The second extended filesystem constants/structures + */ + +/* + * Define EXT3FS_DEBUG to produce debug messages + */ +#undef EXT3FS_DEBUG + +/* + * Define EXT3_RESERVATION to reserve data blocks for expanding files + */ +#define EXT3_DEFAULT_RESERVE_BLOCKS 8 +/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */ +#define EXT3_MAX_RESERVE_BLOCKS 1027 +#define EXT3_RESERVE_WINDOW_NOT_ALLOCATED 0 + +/* + * Debug code + */ +#ifdef EXT3FS_DEBUG +#define ext3_debug(f, a...) \ + do { \ + printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk (KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext3_debug(f, a...) do {} while (0) +#endif + +/* + * Special inodes numbers + */ +#define EXT3_BAD_INO 1 /* Bad blocks inode */ +#define EXT3_ROOT_INO 2 /* Root inode */ +#define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT3_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext3 filesystems */ +#define EXT3_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT3_LINK_MAX 32000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT3_MIN_BLOCK_SIZE 1024 +#define EXT3_MAX_BLOCK_SIZE 65536 +#define EXT3_MIN_BLOCK_LOG_SIZE 10 +#define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize) +#define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) +#define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +#define EXT3_ADDR_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_addr_per_block_bits) +#define EXT3_INODE_SIZE(s) (EXT3_SB(s)->s_inode_size) +#define EXT3_FIRST_INO(s) (EXT3_SB(s)->s_first_ino) + +/* + * Macro-instructions used to manage fragments + */ +#define EXT3_MIN_FRAG_SIZE 1024 +#define EXT3_MAX_FRAG_SIZE 4096 +#define EXT3_MIN_FRAG_LOG_SIZE 10 +#define EXT3_FRAG_SIZE(s) (EXT3_SB(s)->s_frag_size) +#define EXT3_FRAGS_PER_BLOCK(s) (EXT3_SB(s)->s_frags_per_block) + +/* + * Structure of a blocks group descriptor + */ +struct ext3_group_desc +{ + __le32 bg_block_bitmap; /* Blocks bitmap block */ + __le32 bg_inode_bitmap; /* Inodes bitmap block */ + __le32 bg_inode_table; /* Inodes table block */ + __le16 bg_free_blocks_count; /* Free blocks count */ + __le16 bg_free_inodes_count; /* Free inodes count */ + __le16 bg_used_dirs_count; /* Directories count */ + __u16 bg_pad; + __le32 bg_reserved[3]; +}; + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT3_BLOCKS_PER_GROUP(s) (EXT3_SB(s)->s_blocks_per_group) +#define EXT3_DESC_PER_BLOCK(s) (EXT3_SB(s)->s_desc_per_block) +#define EXT3_INODES_PER_GROUP(s) (EXT3_SB(s)->s_inodes_per_group) +#define EXT3_DESC_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_desc_per_block_bits) + +/* + * Constants relative to the data blocks + */ +#define EXT3_NDIR_BLOCKS 12 +#define EXT3_IND_BLOCK EXT3_NDIR_BLOCKS +#define EXT3_DIND_BLOCK (EXT3_IND_BLOCK + 1) +#define EXT3_TIND_BLOCK (EXT3_DIND_BLOCK + 1) +#define EXT3_N_BLOCKS (EXT3_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT3_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT3_UNRM_FL 0x00000002 /* Undelete */ +#define EXT3_COMPR_FL 0x00000004 /* Compress file */ +#define EXT3_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT3_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT3_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT3_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT3_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT3_DIRTY_FL 0x00000100 +#define EXT3_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT3_NOCOMPR_FL 0x00000400 /* Don't compress */ +#define EXT3_ECOMPR_FL 0x00000800 /* Compression error */ +/* End compression flags --- maybe not all used */ +#define EXT3_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + +#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ +#define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\ + EXT3_SYNC_FL | EXT3_NODUMP_FL |\ + EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\ + EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\ + EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT3_REG_FLMASK (~(EXT3_DIRSYNC_FL | EXT3_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT3_OTHER_FLMASK (EXT3_NODUMP_FL | EXT3_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT3_REG_FLMASK; + else + return flags & EXT3_OTHER_FLMASK; +} + +/* Used to pass group descriptor data when online resize is done */ +struct ext3_new_group_input { + __u32 group; /* Group number for this data */ + __u32 block_bitmap; /* Absolute block number of block bitmap */ + __u32 inode_bitmap; /* Absolute block number of inode bitmap */ + __u32 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +/* The struct ext3_new_group_input in kernel space, with free_blocks_count */ +struct ext3_new_group_data { + __u32 group; + __u32 block_bitmap; + __u32 inode_bitmap; + __u32 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 unused; + __u32 free_blocks_count; +}; + + +/* + * ioctl commands + */ +#define EXT3_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT3_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT3_IOC_GETVERSION _IOR('f', 3, long) +#define EXT3_IOC_SETVERSION _IOW('f', 4, long) +#define EXT3_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT3_IOC_GROUP_ADD _IOW('f', 8,struct ext3_new_group_input) +#define EXT3_IOC_GETVERSION_OLD FS_IOC_GETVERSION +#define EXT3_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#ifdef CONFIG_JBD_DEBUG +#define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) +#endif +#define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) + +/* + * ioctl commands in 32 bit emulation + */ +#define EXT3_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT3_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT3_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT3_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT3_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT3_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT3_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#ifdef CONFIG_JBD_DEBUG +#define EXT3_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) +#endif +#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION + + +/* + * Mount options + */ +struct ext3_mount_options { + unsigned long s_mount_opt; + uid_t s_resuid; + gid_t s_resgid; + unsigned long s_commit_interval; +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; +#endif +}; + +/* + * Structure of an inode on the disk + */ +struct ext3_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Creation time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __u32 l_i_reserved1; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT3_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl; /* File ACL */ + __le32 i_dir_acl; /* Directory ACL */ + __le32 i_faddr; /* Fragment address */ + union { + struct { + __u8 l_i_frag; /* Fragment number */ + __u8 l_i_fsize; /* Fragment size */ + __u16 i_pad1; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; + } linux2; + struct { + __u8 h_i_frag; /* Fragment number */ + __u8 h_i_fsize; /* Fragment size */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __u8 m_i_frag; /* Fragment number */ + __u8 m_i_fsize; /* Fragment size */ + __u16 m_pad1; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_pad1; +}; + +#define i_size_high i_dir_acl + +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_frag osd2.linux2.l_i_frag +#define i_fsize osd2.linux2.l_i_fsize +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_reserved2 osd2.linux2.l_i_reserved2 + +/* + * File system states + */ +#define EXT3_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT3_ERROR_FS 0x0002 /* Errors detected */ +#define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Misc. filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags + */ +#define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */ +/* EXT3_MOUNT_OLDALLOC was there */ +#define EXT3_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT3_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT3_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT3_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT3_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT3_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT3_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#define EXT3_MOUNT_ABORT 0x00200 /* Fatal error detected */ +#define EXT3_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT3_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT3_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT3_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT3_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT3_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT3_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ +#define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */ +#define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ +#define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write + * error in ordered mode */ + +/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ +#ifndef _LINUX_EXT2_FS_H +#define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt +#define set_opt(o, opt) o |= EXT3_MOUNT_##opt +#define test_opt(sb, opt) (EXT3_SB(sb)->s_mount_opt & \ + EXT3_MOUNT_##opt) +#else +#define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD +#define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT +#define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS +#endif + +#define ext3_set_bit __set_bit_le +#define ext3_set_bit_atomic ext2_set_bit_atomic +#define ext3_clear_bit __clear_bit_le +#define ext3_clear_bit_atomic ext2_clear_bit_atomic +#define ext3_test_bit test_bit_le +#define ext3_find_next_zero_bit find_next_zero_bit_le + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT3_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT3_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT3_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT3_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT3_ERRORS_PANIC 3 /* Panic */ +#define EXT3_ERRORS_DEFAULT EXT3_ERRORS_CONTINUE + +/* + * Structure of the super block + */ +struct ext3_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count; /* Blocks count */ + __le32 s_r_blocks_count; /* Reserved blocks count */ + __le32 s_free_blocks_count; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_frag_size; /* Fragment size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_frags_per_group; /* # Fragments per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT3_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64]; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set. + */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_reserved_char_pad; + __u16 s_reserved_word_pad; + __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_reserved_char_pad2; + __le16 s_reserved_pad; + __u32 s_reserved[162]; /* Padding to the end of the block */ +}; + +/* data type for block offset of block group */ +typedef int ext3_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long ext3_fsblk_t; + +#define E3FSBLK "%lu" + +struct ext3_reserve_window { + ext3_fsblk_t _rsv_start; /* First byte reserved */ + ext3_fsblk_t _rsv_end; /* Last byte reserved or 0 */ +}; + +struct ext3_reserve_window_node { + struct rb_node rsv_node; + __u32 rsv_goal_size; + __u32 rsv_alloc_hit; + struct ext3_reserve_window rsv_window; +}; + +struct ext3_block_alloc_info { + /* information about reservation window */ + struct ext3_reserve_window_node rsv_window_node; + /* + * was i_next_alloc_block in ext3_inode_info + * is the logical (file-relative) number of the + * most-recently-allocated block in this file. + * We use this for detecting linearly ascending allocation requests. + */ + __u32 last_alloc_logical_block; + /* + * Was i_next_alloc_goal in ext3_inode_info + * is the *physical* companion to i_next_alloc_block. + * it the physical block number of the block which was most-recentl + * allocated to this file. This give us the goal (target) for the next + * allocation when we detect linearly ascending requests. + */ + ext3_fsblk_t last_alloc_physical_block; +}; + +#define rsv_start rsv_window._rsv_start +#define rsv_end rsv_window._rsv_end + +/* + * third extended file system inode data in memory + */ +struct ext3_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_flags; +#ifdef EXT3_FRAGMENTS + __u32 i_faddr; + __u8 i_frag_no; + __u8 i_frag_size; +#endif + ext3_fsblk_t i_file_acl; + __u32 i_dir_acl; + __u32 i_dtime; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is ued for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + __u32 i_block_group; + unsigned long i_state_flags; /* Dynamic state flags for ext3 */ + + /* block reservation info */ + struct ext3_block_alloc_info *i_block_alloc_info; + + __u32 i_dir_start_lookup; +#ifdef CONFIG_EXT3_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext3_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext3_get_block (growth) and ext3_truncate (shrinkth). + */ + loff_t i_disksize; + + /* on-disk additional length */ + __u16 i_extra_isize; + + /* + * truncate_mutex is for serialising ext3_truncate() against + * ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext3 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have truncate_mutex. + */ + struct mutex truncate_mutex; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + atomic_t i_sync_tid; + atomic_t i_datasync_tid; + + struct inode vfs_inode; +}; + +/* + * third extended-fs super-block data in memory + */ +struct ext3_sb_info { + unsigned long s_frag_size; /* Size of a fragment in bytes */ + unsigned long s_frags_per_block;/* Number of fragments per block */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_frags_per_group;/* Number of fragments in a group */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + unsigned long s_groups_count; /* Number of groups in the fs */ + unsigned long s_overhead_last; /* Last calculated overhead */ + unsigned long s_blocks_last; /* Last seen block count */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext3_super_block * s_es; /* Pointer to the super block in the buffer */ + struct buffer_head ** s_group_desc; + unsigned long s_mount_opt; + ext3_fsblk_t s_sb_block; + uid_t s_resuid; + gid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct blockgroup_lock *s_blockgroup_lock; + + /* root of the per fs reservation window tree */ + spinlock_t s_rsv_window_lock; + struct rb_root s_rsv_window_root; + struct ext3_reserve_window_node s_rsv_window_head; + + /* Journaling */ + struct inode * s_journal_inode; + struct journal_s * s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + struct mutex s_resize_lock; + unsigned long s_commit_interval; + struct block_device *journal_bdev; +#ifdef CONFIG_QUOTA + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ +#endif +}; + +static inline spinlock_t * +sb_bgl_lock(struct ext3_sb_info *sbi, unsigned int block_group) +{ + return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); +} + +static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext3_inode_info *EXT3_I(struct inode *inode) +{ + return container_of(inode, struct ext3_inode_info, vfs_inode); +} + +static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT3_ROOT_INO || + ino == EXT3_JOURNAL_INO || + ino == EXT3_RESIZE_INO || + (ino >= EXT3_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)); +} + +/* + * Inode dynamic state flags + */ +enum { + EXT3_STATE_JDATA, /* journaled data exists */ + EXT3_STATE_NEW, /* inode is newly created */ + EXT3_STATE_XATTR, /* has in-inode xattrs */ + EXT3_STATE_FLUSH_ON_CLOSE, /* flush dirty pages on close */ +}; + +static inline int ext3_test_inode_state(struct inode *inode, int bit) +{ + return test_bit(bit, &EXT3_I(inode)->i_state_flags); +} + +static inline void ext3_set_inode_state(struct inode *inode, int bit) +{ + set_bit(bit, &EXT3_I(inode)->i_state_flags); +} + +static inline void ext3_clear_inode_state(struct inode *inode, int bit) +{ + clear_bit(bit, &EXT3_I(inode)->i_state_flags); +} + +#define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT3_OS_LINUX 0 +#define EXT3_OS_HURD 1 +#define EXT3_OS_MASIX 2 +#define EXT3_OS_FREEBSD 3 +#define EXT3_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT3_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT3_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT3_CURRENT_REV EXT3_GOOD_OLD_REV +#define EXT3_MAX_SUPP_REV EXT3_DYNAMIC_REV + +#define EXT3_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT3_HAS_COMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) +#define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) +#define EXT3_HAS_INCOMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) +#define EXT3_SET_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) +#define EXT3_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) +#define EXT3_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) +#define EXT3_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) +#define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) +#define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + +#define EXT3_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT3_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT3_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT3_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT3_FEATURE_COMPAT_DIR_INDEX 0x0020 + +#define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 + +#define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 + +#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER| \ + EXT3_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT3_DEF_RESUID 0 +#define EXT3_DEF_RESGID 0 + +/* + * Default mount options + */ +#define EXT3_DEFM_DEBUG 0x0001 +#define EXT3_DEFM_BSDGROUPS 0x0002 +#define EXT3_DEFM_XATTR_USER 0x0004 +#define EXT3_DEFM_ACL 0x0008 +#define EXT3_DEFM_UID16 0x0010 +#define EXT3_DEFM_JMODE 0x0060 +#define EXT3_DEFM_JMODE_DATA 0x0020 +#define EXT3_DEFM_JMODE_ORDERED 0x0040 +#define EXT3_DEFM_JMODE_WBACK 0x0060 + +/* + * Structure of a directory entry + */ +#define EXT3_NAME_LEN 255 + +struct ext3_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT3_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT3 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext3_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT3_NAME_LEN]; /* File name */ +}; + +/* + * Ext3 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define EXT3_FT_UNKNOWN 0 +#define EXT3_FT_REG_FILE 1 +#define EXT3_FT_DIR 2 +#define EXT3_FT_CHRDEV 3 +#define EXT3_FT_BLKDEV 4 +#define EXT3_FT_FIFO 5 +#define EXT3_FT_SOCK 6 +#define EXT3_FT_SYMLINK 7 + +#define EXT3_FT_MAX 8 + +/* + * EXT3_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT3_DIR_PAD 4 +#define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) +#define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ + ~EXT3_DIR_ROUND) +#define EXT3_MAX_REC_LEN ((1<<16)-1) + +/* + * Tests against MAX_REC_LEN etc were put in place for 64k block + * sizes; if that is not possible on this arch, we can skip + * those tests and speed things up. + */ +static inline unsigned ext3_rec_len_from_disk(__le16 dlen) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_CACHE_SIZE >= 65536) + if (len == EXT3_MAX_REC_LEN) + return 1 << 16; +#endif + return len; +} + +static inline __le16 ext3_rec_len_to_disk(unsigned len) +{ +#if (PAGE_CACHE_SIZE >= 65536) + if (len == (1 << 16)) + return cpu_to_le16(EXT3_MAX_REC_LEN); + else if (len > (1 << 16)) + BUG(); +#endif + return cpu_to_le16(len); +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ + EXT3_FEATURE_COMPAT_DIR_INDEX) && \ + (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) +#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) +#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + +#define EXT3_HTREE_EOF 0x7fffffff + +/* + * Control parameters used by ext3_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext3_iloc +{ + struct buffer_head *bh; + unsigned long offset; + unsigned long block_group; +}; + +static inline struct ext3_inode *ext3_raw_inode(struct ext3_iloc *iloc) +{ + return (struct ext3_inode *) (iloc->bh->b_data + iloc->offset); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. + */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext3_fsblk_t +ext3_group_first_block_no(struct super_block *sb, unsigned long group_no) +{ + return group_no * (ext3_fsblk_t)EXT3_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR -75000 + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in <linux/kernel.h> but none of the + * ext3 source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* balloc.c */ +extern int ext3_bg_has_super(struct super_block *sb, int group); +extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); +extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int *errp); +extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, unsigned long *count, int *errp); +extern void ext3_free_blocks (handle_t *handle, struct inode *inode, + ext3_fsblk_t block, unsigned long count); +extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb, + ext3_fsblk_t block, unsigned long count, + unsigned long *pdquot_freed_blocks); +extern ext3_fsblk_t ext3_count_free_blocks (struct super_block *); +extern void ext3_check_blocks_bitmap (struct super_block *); +extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh); +extern int ext3_should_retry_alloc(struct super_block *sb, int *retries); +extern void ext3_init_block_alloc_info(struct inode *); +extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv); +extern int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range); + +/* dir.c */ +extern int ext3_check_dir_entry(const char *, struct inode *, + struct ext3_dir_entry_2 *, + struct buffer_head *, unsigned long); +extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext3_dir_entry_2 *dirent); +extern void ext3_htree_free_dir_info(struct dir_private_info *p); + +/* fsync.c */ +extern int ext3_sync_file(struct file *, loff_t, loff_t, int); + +/* hash.c */ +extern int ext3fs_dirhash(const char *name, int len, struct + dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode * ext3_new_inode (handle_t *, struct inode *, + const struct qstr *, umode_t); +extern void ext3_free_inode (handle_t *, struct inode *); +extern struct inode * ext3_orphan_get (struct super_block *, unsigned long); +extern unsigned long ext3_count_free_inodes (struct super_block *); +extern unsigned long ext3_count_dirs (struct super_block *); +extern void ext3_check_inodes_bitmap (struct super_block *); +extern unsigned long ext3_count_free (struct buffer_head *, unsigned); + + +/* inode.c */ +int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext3_fsblk_t blocknr); +struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); +struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); +int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, + sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, + int create); + +extern struct inode *ext3_iget(struct super_block *, unsigned long); +extern int ext3_write_inode (struct inode *, struct writeback_control *); +extern int ext3_setattr (struct dentry *, struct iattr *); +extern void ext3_evict_inode (struct inode *); +extern int ext3_sync_inode (handle_t *, struct inode *); +extern void ext3_discard_reservation (struct inode *); +extern void ext3_dirty_inode(struct inode *, int); +extern int ext3_change_inode_journal_flag(struct inode *, int); +extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *); +extern int ext3_can_truncate(struct inode *inode); +extern void ext3_truncate(struct inode *inode); +extern void ext3_set_inode_flags(struct inode *); +extern void ext3_get_inode_flags(struct ext3_inode_info *); +extern void ext3_set_aops(struct inode *inode); +extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); + +/* ioctl.c */ +extern long ext3_ioctl(struct file *, unsigned int, unsigned long); +extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* namei.c */ +extern int ext3_orphan_add(handle_t *, struct inode *); +extern int ext3_orphan_del(handle_t *, struct inode *); +extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); + +/* resize.c */ +extern int ext3_group_add(struct super_block *sb, + struct ext3_new_group_data *input); +extern int ext3_group_extend(struct super_block *sb, + struct ext3_super_block *es, + ext3_fsblk_t n_blocks_count); + +/* super.c */ +extern __printf(3, 4) +void ext3_error(struct super_block *, const char *, const char *, ...); +extern void __ext3_std_error (struct super_block *, const char *, int); +extern __printf(3, 4) +void ext3_abort(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ext3_warning(struct super_block *, const char *, const char *, ...); +extern __printf(3, 4) +void ext3_msg(struct super_block *, const char *, const char *, ...); +extern void ext3_update_dynamic_rev (struct super_block *sb); + +#define ext3_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext3_std_error((sb), __func__, (errno)); \ +} while (0) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext3_dir_operations; + +/* file.c */ +extern const struct inode_operations ext3_file_inode_operations; +extern const struct file_operations ext3_file_operations; + +/* namei.c */ +extern const struct inode_operations ext3_dir_inode_operations; +extern const struct inode_operations ext3_special_inode_operations; + +/* symlink.c */ +extern const struct inode_operations ext3_symlink_inode_operations; +extern const struct inode_operations ext3_fast_symlink_inode_operations; + +#define EXT3_JOURNAL(inode) (EXT3_SB((inode)->i_sb)->s_journal) + +/* Define the number of blocks we need to account to a transaction to + * modify one block of data. + * + * We may have to touch one inode, one bitmap buffer, up to three + * indirection blocks, the group and superblock summaries, and the data + * block to complete the transaction. */ + +#define EXT3_SINGLEDATA_TRANS_BLOCKS 8U + +/* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. */ + +#define EXT3_XATTR_TRANS_BLOCKS 6U + +/* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + +#define EXT3_DATA_TRANS_BLOCKS(sb) (EXT3_SINGLEDATA_TRANS_BLOCKS + \ + EXT3_XATTR_TRANS_BLOCKS - 2 + \ + EXT3_MAXQUOTAS_TRANS_BLOCKS(sb)) + +/* Delete operations potentially hit one directory's namespace plus an + * entire inode, plus arbitrary amounts of bitmap/indirection data. Be + * generous. We can grow the delete transaction later if necessary. */ + +#define EXT3_DELETE_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) + 64) + +/* Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * write(2) and truncate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. */ + +#define EXT3_MAX_TRANS_DATA 64U + +/* We break up a large truncate or write transaction once the handle's + * buffer credits gets this low, we need either to extend the + * transaction or to start a new one. Reserve enough space here for + * inode, bitmap, superblock, group and indirection updates for at least + * one block, plus two quota updates. Quota allocations are not + * needed. */ + +#define EXT3_RESERVE_TRANS_BLOCKS 12U + +#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 + +#ifdef CONFIG_QUOTA +/* Amount of blocks needed for quota update - we know that the structure was + * allocated so we need to update only inode+data */ +#define EXT3_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0) +/* Amount of blocks needed for quota insert/delete - we do some block writes + * but inode, sb and group updates are done only once */ +#define EXT3_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ + (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0) +#define EXT3_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ + (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0) +#else +#define EXT3_QUOTA_TRANS_BLOCKS(sb) 0 +#define EXT3_QUOTA_INIT_BLOCKS(sb) 0 +#define EXT3_QUOTA_DEL_BLOCKS(sb) 0 +#endif +#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb)) +#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb)) +#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb)) + +int +ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext3_iloc *iloc); + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int ext3_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext3_iloc *iloc); + +int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode); + +/* + * Wrapper functions with which ext3 calls into JBD. The intent here is + * to allow these to be turned into appropriate stubs so ext3 can control + * ext2 filesystems, so ext2+ext3 systems only nee one fs. This work hasn't + * been done yet. + */ + +static inline void ext3_journal_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + journal_release_buffer(handle, bh); +} + +void ext3_journal_abort_handle(const char *caller, const char *err_fn, + struct buffer_head *bh, handle_t *handle, int err); + +int __ext3_journal_get_undo_access(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext3_journal_get_write_access(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext3_journal_forget(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext3_journal_revoke(const char *where, handle_t *handle, + unsigned long blocknr, struct buffer_head *bh); + +int __ext3_journal_get_create_access(const char *where, + handle_t *handle, struct buffer_head *bh); + +int __ext3_journal_dirty_metadata(const char *where, + handle_t *handle, struct buffer_head *bh); + +#define ext3_journal_get_undo_access(handle, bh) \ + __ext3_journal_get_undo_access(__func__, (handle), (bh)) +#define ext3_journal_get_write_access(handle, bh) \ + __ext3_journal_get_write_access(__func__, (handle), (bh)) +#define ext3_journal_revoke(handle, blocknr, bh) \ + __ext3_journal_revoke(__func__, (handle), (blocknr), (bh)) +#define ext3_journal_get_create_access(handle, bh) \ + __ext3_journal_get_create_access(__func__, (handle), (bh)) +#define ext3_journal_dirty_metadata(handle, bh) \ + __ext3_journal_dirty_metadata(__func__, (handle), (bh)) +#define ext3_journal_forget(handle, bh) \ + __ext3_journal_forget(__func__, (handle), (bh)) + +int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh); + +handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks); +int __ext3_journal_stop(const char *where, handle_t *handle); + +static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks) +{ + return ext3_journal_start_sb(inode->i_sb, nblocks); +} + +#define ext3_journal_stop(handle) \ + __ext3_journal_stop(__func__, (handle)) + +static inline handle_t *ext3_journal_current_handle(void) +{ + return journal_current_handle(); +} + +static inline int ext3_journal_extend(handle_t *handle, int nblocks) +{ + return journal_extend(handle, nblocks); +} + +static inline int ext3_journal_restart(handle_t *handle, int nblocks) +{ + return journal_restart(handle, nblocks); +} + +static inline int ext3_journal_blocks_per_page(struct inode *inode) +{ + return journal_blocks_per_page(inode); +} + +static inline int ext3_journal_force_commit(journal_t *journal) +{ + return journal_force_commit(journal); +} + +/* super.c */ +int ext3_force_commit(struct super_block *sb); + +static inline int ext3_should_journal_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 1; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA) + return 1; + if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) + return 1; + return 0; +} + +static inline int ext3_should_order_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) + return 0; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA) + return 1; + return 0; +} + +static inline int ext3_should_writeback_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) + return 0; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) + return 1; + return 0; +} + +#include <trace/events/ext3.h> diff --git a/fs/ext3/ext3_jbd.c b/fs/ext3/ext3_jbd.c index d401f148d74..785a3261a26 100644 --- a/fs/ext3/ext3_jbd.c +++ b/fs/ext3/ext3_jbd.c @@ -2,7 +2,7 @@ * Interface between ext3 and JBD */ -#include <linux/ext3_jbd.h> +#include "ext3.h" int __ext3_journal_get_undo_access(const char *where, handle_t *handle, struct buffer_head *bh) diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 724df69847d..25cb413277e 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -18,12 +18,8 @@ * (jj@sunsite.ms.mff.cuni.cz) */ -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd.h> #include <linux/quotaops.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> +#include "ext3.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 1860ed35632..d4dff278cbd 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -22,15 +22,9 @@ * we can depend on generic_block_fdatasync() to sync the data blocks. */ -#include <linux/time.h> #include <linux/blkdev.h> -#include <linux/fs.h> -#include <linux/sched.h> #include <linux/writeback.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> -#include <trace/events/ext3.h> +#include "ext3.h" /* * akpm: A new design for ext3_sync_file(). diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c index 7d215b4d4f2..d10231ddcf8 100644 --- a/fs/ext3/hash.c +++ b/fs/ext3/hash.c @@ -9,9 +9,7 @@ * License. */ -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> +#include "ext3.h" #include <linux/cryptohash.h> #define DELTA 0x9E3779B9 diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 1cde2843801..e3c39e4cec1 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -12,21 +12,10 @@ * David S. Miller (davem@caip.rutgers.edu), 1995 */ -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> -#include <linux/stat.h> -#include <linux/string.h> #include <linux/quotaops.h> -#include <linux/buffer_head.h> #include <linux/random.h> -#include <linux/bitops.h> -#include <trace/events/ext3.h> - -#include <asm/byteorder.h> +#include "ext3.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2d0afeca0b4..10d7812f602 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -22,22 +22,12 @@ * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 */ -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/ext3_jbd.h> -#include <linux/jbd.h> #include <linux/highuid.h> -#include <linux/pagemap.h> #include <linux/quotaops.h> -#include <linux/string.h> -#include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/mpage.h> -#include <linux/uio.h> -#include <linux/bio.h> -#include <linux/fiemap.h> #include <linux/namei.h> -#include <trace/events/ext3.h> +#include "ext3.h" #include "xattr.h" #include "acl.h" @@ -756,6 +746,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, struct ext3_block_alloc_info *block_i; ext3_fsblk_t current_block; struct ext3_inode_info *ei = EXT3_I(inode); + struct timespec now; block_i = ei->i_block_alloc_info; /* @@ -795,9 +786,11 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, } /* We are done with atomic stuff, now do the rest of housekeeping */ - - inode->i_ctime = CURRENT_TIME_SEC; - ext3_mark_inode_dirty(handle, inode); + now = CURRENT_TIME_SEC; + if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) { + inode->i_ctime = now; + ext3_mark_inode_dirty(handle, inode); + } /* ext3_mark_inode_dirty already updated i_sync_tid */ atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 4af574ce4a4..677a5c27dc6 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -7,15 +7,10 @@ * Universite Pierre et Marie Curie (Paris VI) */ -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/capability.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> #include <linux/mount.h> -#include <linux/time.h> #include <linux/compat.h> #include <asm/uaccess.h> +#include "ext3.h" long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index e8e211795e9..d7940b24cf6 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -24,20 +24,8 @@ * Theodore Ts'o, 2002 */ -#include <linux/fs.h> -#include <linux/pagemap.h> -#include <linux/jbd.h> -#include <linux/time.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> -#include <linux/fcntl.h> -#include <linux/stat.h> -#include <linux/string.h> #include <linux/quotaops.h> -#include <linux/buffer_head.h> -#include <linux/bio.h> -#include <trace/events/ext3.h> - +#include "ext3.h" #include "namei.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 7916e4ce166..0f814f3450d 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -11,10 +11,7 @@ #define EXT3FS_DEBUG -#include <linux/ext3_jbd.h> - -#include <linux/errno.h> -#include <linux/slab.h> +#include "ext3.h" #define outside(b, first, last) ((b) < (first) || (b) >= (last)) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index e0b45b93327..cf0b5921cf0 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -17,22 +17,12 @@ */ #include <linux/module.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> -#include <linux/ext3_jbd.h> -#include <linux/slab.h> -#include <linux/init.h> #include <linux/blkdev.h> #include <linux/parser.h> -#include <linux/buffer_head.h> #include <linux/exportfs.h> -#include <linux/vfs.h> +#include <linux/statfs.h> #include <linux/random.h> #include <linux/mount.h> -#include <linux/namei.h> #include <linux/quotaops.h> #include <linux/seq_file.h> #include <linux/log2.h> @@ -40,13 +30,13 @@ #include <asm/uaccess.h> +#define CREATE_TRACE_POINTS + +#include "ext3.h" #include "xattr.h" #include "acl.h" #include "namei.h" -#define CREATE_TRACE_POINTS -#include <trace/events/ext3.h> - #ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA #else diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c index 7c489820777..6b01c3eab1f 100644 --- a/fs/ext3/symlink.c +++ b/fs/ext3/symlink.c @@ -17,10 +17,8 @@ * ext3 symlink handling code */ -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/ext3_fs.h> #include <linux/namei.h> +#include "ext3.h" #include "xattr.h" static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd) diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index d565759d82e..d22ebb7a4f5 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -50,14 +50,9 @@ * by the buffer lock. */ -#include <linux/init.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include <linux/ext3_jbd.h> -#include <linux/ext3_fs.h> +#include "ext3.h" #include <linux/mbcache.h> #include <linux/quotaops.h> -#include <linux/rwsem.h> #include "xattr.h" #include "acl.h" diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index ea26f2acab9..3387664ad70 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -3,12 +3,8 @@ * Handler for storing security labels as extended attributes. */ -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/ext3_jbd.h> -#include <linux/ext3_fs.h> #include <linux/security.h> +#include "ext3.h" #include "xattr.h" static size_t diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index 2526a8829de..d75727cc67f 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -5,11 +5,7 @@ * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/string.h> -#include <linux/capability.h> -#include <linux/fs.h> -#include <linux/ext3_jbd.h> -#include <linux/ext3_fs.h> +#include "ext3.h" #include "xattr.h" static size_t diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index b32e473a1e3..5612af3567e 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -5,10 +5,7 @@ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/ext3_jbd.h> -#include <linux/ext3_fs.h> +#include "ext3.h" #include "xattr.h" static size_t diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f9e2cd8cf71..4bbd07a6fa1 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -336,10 +336,10 @@ err_out: * Return buffer_head on success or NULL in case of failure. */ struct buffer_head * -ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) { struct ext4_group_desc *desc; - struct buffer_head *bh = NULL; + struct buffer_head *bh; ext4_fsblk_t bitmap_blk; desc = ext4_get_group_desc(sb, block_group, NULL); @@ -348,9 +348,9 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) bitmap_blk = ext4_block_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { - ext4_error(sb, "Cannot read block bitmap - " - "block_group = %u, block_bitmap = %llu", - block_group, bitmap_blk); + ext4_error(sb, "Cannot get buffer for block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, bitmap_blk); return NULL; } @@ -382,25 +382,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) return bh; } /* - * submit the buffer_head for read. We can - * safely mark the bitmap as uptodate now. - * We do it here so the bitmap uptodate bit - * get set with buffer lock held. + * submit the buffer_head for reading */ + set_buffer_new(bh); trace_ext4_read_block_bitmap_load(sb, block_group); - set_bitmap_uptodate(bh); - if (bh_submit_read(bh) < 0) { - put_bh(bh); + bh->b_end_io = ext4_end_bitmap_read; + get_bh(bh); + submit_bh(READ, bh); + return bh; +} + +/* Returns 0 on success, 1 on error */ +int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, + struct buffer_head *bh) +{ + struct ext4_group_desc *desc; + + if (!buffer_new(bh)) + return 0; + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return 1; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { ext4_error(sb, "Cannot read block bitmap - " - "block_group = %u, block_bitmap = %llu", - block_group, bitmap_blk); - return NULL; + "block_group = %u, block_bitmap = %llu", + block_group, (unsigned long long) bh->b_blocknr); + return 1; } + clear_buffer_new(bh); + /* Panic or remount fs read-only if block bitmap is invalid */ ext4_valid_block_bitmap(sb, desc, block_group, bh); - /* - * file system mounted not to panic on error, - * continue with corrupt bitmap - */ + return 0; +} + +struct buffer_head * +ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +{ + struct buffer_head *bh; + + bh = ext4_read_block_bitmap_nowait(sb, block_group); + if (ext4_wait_block_bitmap(sb, block_group, bh)) { + put_bh(bh); + return NULL; + } return bh; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 164c56092e5..b8678620264 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -32,24 +32,8 @@ static unsigned char ext4_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; -static int ext4_readdir(struct file *, void *, filldir_t); static int ext4_dx_readdir(struct file *filp, void *dirent, filldir_t filldir); -static int ext4_release_dir(struct inode *inode, - struct file *filp); - -const struct file_operations ext4_dir_operations = { - .llseek = ext4_llseek, - .read = generic_read_dir, - .readdir = ext4_readdir, /* we take BKL. needed?*/ - .unlocked_ioctl = ext4_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ext4_compat_ioctl, -#endif - .fsync = ext4_sync_file, - .release = ext4_release_dir, -}; - static unsigned char get_dtype(struct super_block *sb, int filetype) { @@ -60,6 +44,26 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) return (ext4_filetype_table[filetype]); } +/** + * Check if the given dir-inode refers to an htree-indexed directory + * (or a directory which chould potentially get coverted to use htree + * indexing). + * + * Return 1 if it is a dx dir, 0 if not + */ +static int is_dx_dir(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX) && + ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || + ((inode->i_size >> sb->s_blocksize_bits) == 1))) + return 1; + + return 0; +} + /* * Return 0 if the directory entry is OK, and 1 if there is a problem * @@ -91,17 +95,17 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, return 0; if (filp) - ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0, + ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u(%u), " "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset%bh->b_size), + error_msg, (unsigned) (offset % bh->b_size), offset, le32_to_cpu(de->inode), rlen, de->name_len); else - ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0, + ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u(%u), " "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset%bh->b_size), + error_msg, (unsigned) (offset % bh->b_size), offset, le32_to_cpu(de->inode), rlen, de->name_len); @@ -115,18 +119,13 @@ static int ext4_readdir(struct file *filp, unsigned int offset; int i, stored; struct ext4_dir_entry_2 *de; - struct super_block *sb; int err; struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; int ret = 0; int dir_has_error = 0; - sb = inode->i_sb; - - if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_COMPAT_DIR_INDEX) && - ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || - ((inode->i_size >> sb->s_blocksize_bits) == 1))) { + if (is_dx_dir(inode)) { err = ext4_dx_readdir(filp, dirent, filldir); if (err != ERR_BAD_DX_DIR) { ret = err; @@ -254,22 +253,134 @@ out: return ret; } +static inline int is_32bit_api(void) +{ +#ifdef CONFIG_COMPAT + return is_compat_task(); +#else + return (BITS_PER_LONG == 32); +#endif +} + /* * These functions convert from the major/minor hash to an f_pos - * value. + * value for dx directories + * + * Upper layer (for example NFS) should specify FMODE_32BITHASH or + * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted + * directly on both 32-bit and 64-bit nodes, under such case, neither + * FMODE_32BITHASH nor FMODE_64BITHASH is specified. + */ +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return major >> 1; + else + return ((__u64)(major >> 1) << 32) | (__u64)minor; +} + +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return (pos << 1) & 0xffffffff; + else + return ((pos >> 32) << 1) & 0xffffffff; +} + +static inline __u32 pos2min_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return 0; + else + return pos & 0xffffffff; +} + +/* + * Return 32- or 64-bit end-of-file for dx directories + */ +static inline loff_t ext4_get_htree_eof(struct file *filp) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return EXT4_HTREE_EOF_32BIT; + else + return EXT4_HTREE_EOF_64BIT; +} + + +/* + * ext4_dir_llseek() based on generic_file_llseek() to handle both + * non-htree and htree directories, where the "offset" is in terms + * of the filename hash value instead of the byte offset. * - * Currently we only use major hash numer. This is unfortunate, but - * on 32-bit machines, the same VFS interface is used for lseek and - * llseek, so if we use the 64 bit offset, then the 32-bit versions of - * lseek/telldir/seekdir will blow out spectacularly, and from within - * the ext2 low-level routine, we don't know if we're being called by - * a 64-bit version of the system call or the 32-bit version of the - * system call. Worse yet, NFSv2 only allows for a 32-bit readdir - * cookie. Sigh. + * NOTE: offsets obtained *before* ext4_set_inode_flag(dir, EXT4_INODE_INDEX) + * will be invalid once the directory was converted into a dx directory */ -#define hash2pos(major, minor) (major >> 1) -#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) -#define pos2min_hash(pos) (0) +loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + loff_t ret = -EINVAL; + int dx_dir = is_dx_dir(inode); + + mutex_lock(&inode->i_mutex); + + /* NOTE: relative offsets with dx directories might not work + * as expected, as it is difficult to figure out the + * correct offset between dx hashes */ + + switch (origin) { + case SEEK_END: + if (unlikely(offset > 0)) + goto out_err; /* not supported for directories */ + + /* so only negative offsets are left, does that have a + * meaning for directories at all? */ + if (dx_dir) + offset += ext4_get_htree_eof(file); + else + offset += inode->i_size; + break; + case SEEK_CUR: + /* + * Here we special-case the lseek(fd, 0, SEEK_CUR) + * position-querying operation. Avoid rewriting the "same" + * f_pos value back to the file because a concurrent read(), + * write() or lseek() might have altered it + */ + if (offset == 0) { + offset = file->f_pos; + goto out_ok; + } + + offset += file->f_pos; + break; + } + + if (unlikely(offset < 0)) + goto out_err; + + if (!dx_dir) { + if (offset > inode->i_sb->s_maxbytes) + goto out_err; + } else if (offset > ext4_get_htree_eof(file)) + goto out_err; + + /* Special lock needed here? */ + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + +out_ok: + ret = offset; +out_err: + mutex_unlock(&inode->i_mutex); + + return ret; +} /* * This structure holds the nodes of the red-black tree used to store @@ -330,15 +441,16 @@ static void free_rb_tree_fname(struct rb_root *root) } -static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos) +static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, + loff_t pos) { struct dir_private_info *p; p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); if (!p) return NULL; - p->curr_hash = pos2maj_hash(pos); - p->curr_minor_hash = pos2min_hash(pos); + p->curr_hash = pos2maj_hash(filp, pos); + p->curr_minor_hash = pos2min_hash(filp, pos); return p; } @@ -425,11 +537,12 @@ static int call_filldir(struct file *filp, void *dirent, sb = inode->i_sb; if (!fname) { - printk(KERN_ERR "EXT4-fs: call_filldir: called with " - "null fname?!?\n"); + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " + "called with null fname?!?", __func__, __LINE__, + inode->i_ino, current->comm); return 0; } - curr_pos = hash2pos(fname->hash, fname->minor_hash); + curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); while (fname) { error = filldir(dirent, fname->name, fname->name_len, curr_pos, @@ -454,13 +567,13 @@ static int ext4_dx_readdir(struct file *filp, int ret; if (!info) { - info = ext4_htree_create_dir_info(filp->f_pos); + info = ext4_htree_create_dir_info(filp, filp->f_pos); if (!info) return -ENOMEM; filp->private_data = info; } - if (filp->f_pos == EXT4_HTREE_EOF) + if (filp->f_pos == ext4_get_htree_eof(filp)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ @@ -468,8 +581,8 @@ static int ext4_dx_readdir(struct file *filp, free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; - info->curr_hash = pos2maj_hash(filp->f_pos); - info->curr_minor_hash = pos2min_hash(filp->f_pos); + info->curr_hash = pos2maj_hash(filp, filp->f_pos); + info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); } /* @@ -501,7 +614,7 @@ static int ext4_dx_readdir(struct file *filp, if (ret < 0) return ret; if (ret == 0) { - filp->f_pos = EXT4_HTREE_EOF; + filp->f_pos = ext4_get_htree_eof(filp); break; } info->curr_node = rb_first(&info->root); @@ -521,7 +634,7 @@ static int ext4_dx_readdir(struct file *filp, info->curr_minor_hash = fname->minor_hash; } else { if (info->next_hash == ~0) { - filp->f_pos = EXT4_HTREE_EOF; + filp->f_pos = ext4_get_htree_eof(filp); break; } info->curr_hash = info->next_hash; @@ -540,3 +653,15 @@ static int ext4_release_dir(struct inode *inode, struct file *filp) return 0; } + +const struct file_operations ext4_dir_operations = { + .llseek = ext4_dir_llseek, + .read = generic_read_dir, + .readdir = ext4_readdir, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .fsync = ext4_sync_file, + .release = ext4_release_dir, +}; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 513004fc3d8..ab2594a30f8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -53,7 +53,7 @@ printk(KERN_DEBUG f, ## a); \ } while (0) #else -#define ext4_debug(f, a...) do {} while (0) +#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif #define EXT4_ERROR_INODE(inode, fmt, a...) \ @@ -184,6 +184,8 @@ struct mpage_da_data { #define EXT4_IO_END_UNWRITTEN 0x0001 #define EXT4_IO_END_ERROR 0x0002 #define EXT4_IO_END_QUEUED 0x0004 +#define EXT4_IO_END_DIRECT 0x0008 +#define EXT4_IO_END_IN_FSYNC 0x0010 struct ext4_io_page { struct page *p_page; @@ -192,18 +194,25 @@ struct ext4_io_page { #define MAX_IO_PAGES 128 +/* + * For converting uninitialized extents on a work queue. + * + * 'page' is only used from the writepage() path; 'pages' is only used for + * buffered writes; they are used to keep page references until conversion + * takes place. For AIO/DIO, neither field is filled in. + */ typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ struct inode *inode; /* file being written to */ unsigned int flag; /* unwritten or not */ - struct page *page; /* page struct for buffer write */ + struct page *page; /* for writepage() path */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ struct work_struct work; /* data work queue */ struct kiocb *iocb; /* iocb struct for AIO */ int result; /* error value for AIO */ - int num_io_pages; - struct ext4_io_page *pages[MAX_IO_PAGES]; + int num_io_pages; /* for writepages() */ + struct ext4_io_page *pages[MAX_IO_PAGES]; /* for writepages() */ } ext4_io_end_t; struct ext4_io_submit { @@ -923,6 +932,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ #define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ #define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_ERRORS_MASK 0x00070 #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ @@ -941,7 +951,6 @@ struct ext4_inode_info { #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ -#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ @@ -1142,6 +1151,7 @@ struct ext4_sb_info { unsigned int s_mount_opt; unsigned int s_mount_opt2; unsigned int s_mount_flags; + unsigned int s_def_mount_opt; ext4_fsblk_t s_sb_block; uid_t s_resuid; gid_t s_resgid; @@ -1420,8 +1430,9 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ -#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x2000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ +#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x8000 /* data in inode */ #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ @@ -1612,7 +1623,11 @@ struct dx_hash_info u32 *seed; }; -#define EXT4_HTREE_EOF 0x7fffffff + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + /* * Control parameters used by ext4_htree_next_block @@ -1794,8 +1809,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); -struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, - ext4_group_t block_group); + +extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, + ext4_group_t block_group); +extern int ext4_wait_block_bitmap(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh); +extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); extern void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t group, @@ -1841,6 +1862,7 @@ extern void ext4_check_inodes_bitmap(struct super_block *); extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier); +extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); /* mballoc.c */ extern long ext4_mb_stats; diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index a52db3a69a3..0f58b86e3a0 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -47,9 +47,9 @@ */ #define EXT_DEBUG__ #ifdef EXT_DEBUG -#define ext_debug(a...) printk(a) +#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) #else -#define ext_debug(a...) +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 5802fa1dab1..83b20fcf940 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -104,6 +104,78 @@ #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) +/** + * struct ext4_journal_cb_entry - Base structure for callback information. + * + * This struct is a 'seed' structure for a using with your own callback + * structs. If you are using callbacks you must allocate one of these + * or another struct of your own definition which has this struct + * as it's first element and pass it to ext4_journal_callback_add(). + */ +struct ext4_journal_cb_entry { + /* list information for other callbacks attached to the same handle */ + struct list_head jce_list; + + /* Function to call with this callback structure */ + void (*jce_func)(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int error); + + /* user data goes here */ +}; + +/** + * ext4_journal_callback_add: add a function to call after transaction commit + * @handle: active journal transaction handle to register callback on + * @func: callback function to call after the transaction has committed: + * @sb: superblock of current filesystem for transaction + * @jce: returned journal callback data + * @rc: journal state at commit (0 = transaction committed properly) + * @jce: journal callback data (internal and function private data struct) + * + * The registered function will be called in the context of the journal thread + * after the transaction for which the handle was created has completed. + * + * No locks are held when the callback function is called, so it is safe to + * call blocking functions from within the callback, but the callback should + * not block or run for too long, or the filesystem will be blocked waiting for + * the next transaction to commit. No journaling functions can be used, or + * there is a risk of deadlock. + * + * There is no guaranteed calling order of multiple registered callbacks on + * the same transaction. + */ +static inline void ext4_journal_callback_add(handle_t *handle, + void (*func)(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int rc), + struct ext4_journal_cb_entry *jce) +{ + struct ext4_sb_info *sbi = + EXT4_SB(handle->h_transaction->t_journal->j_private); + + /* Add the jce to transaction's private list */ + jce->jce_func = func; + spin_lock(&sbi->s_md_lock); + list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); + spin_unlock(&sbi->s_md_lock); +} + +/** + * ext4_journal_callback_del: delete a registered callback + * @handle: active journal transaction handle on which callback was registered + * @jce: registered journal callback entry to unregister + */ +static inline void ext4_journal_callback_del(handle_t *handle, + struct ext4_journal_cb_entry *jce) +{ + struct ext4_sb_info *sbi = + EXT4_SB(handle->h_transaction->t_journal->j_private); + + spin_lock(&sbi->s_md_lock); + list_del_init(&jce->jce_list); + spin_unlock(&sbi->s_md_lock); +} + int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, @@ -261,43 +333,45 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle, /* super.c */ int ext4_force_commit(struct super_block *sb); -static inline int ext4_should_journal_data(struct inode *inode) +/* + * Ext4 inode journal modes + */ +#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ +#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ +#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ + +static inline int ext4_inode_journal_mode(struct inode *inode) { if (EXT4_JOURNAL(inode) == NULL) - return 0; - if (!S_ISREG(inode->i_mode)) - return 1; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - return 1; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 1; - return 0; + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* We do not support data journalling with delayed allocation */ + if (!S_ISREG(inode->i_mode) || + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC)) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + else + BUG(); +} + +static inline int ext4_should_journal_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; } static inline int ext4_should_order_data(struct inode *inode) { - if (EXT4_JOURNAL(inode) == NULL) - return 0; - if (!S_ISREG(inode->i_mode)) - return 0; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 0; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - return 1; - return 0; + return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; } static inline int ext4_should_writeback_data(struct inode *inode) { - if (EXT4_JOURNAL(inode) == NULL) - return 1; - if (!S_ISREG(inode->i_mode)) - return 0; - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) - return 0; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) - return 1; - return 0; + return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; } /* diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 74f23c292e1..1421938e679 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -44,6 +44,14 @@ #include <trace/events/ext4.h> +/* + * used by extent splitting. + */ +#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ + due to ENOSPC */ +#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ +#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ + static int ext4_split_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, @@ -51,6 +59,13 @@ static int ext4_split_extent(handle_t *handle, int split_flag, int flags); +static int ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, + int flags); + static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) @@ -300,6 +315,8 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); + if (len == 0) + return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -2308,7 +2325,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, struct ext4_extent *ex; /* the header must be checked already in ext4_ext_remove_space() */ - ext_debug("truncate since %u in leaf\n", start); + ext_debug("truncate since %u in leaf to %u\n", start, end); if (!path[depth].p_hdr) path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); eh = path[depth].p_hdr; @@ -2343,14 +2360,17 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ext_debug(" border %u:%u\n", a, b); /* If this extent is beyond the end of the hole, skip it */ - if (end <= ex_ee_block) { + if (end < ex_ee_block) { ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); continue; } else if (b != ex_ee_block + ex_ee_len - 1) { - EXT4_ERROR_INODE(inode," bad truncate %u:%u\n", - start, end); + EXT4_ERROR_INODE(inode, + "can not handle truncate %u:%u " + "on extent %u:%u", + start, end, ex_ee_block, + ex_ee_block + ex_ee_len - 1); err = -EIO; goto out; } else if (a != ex_ee_block) { @@ -2482,7 +2502,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) +static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); @@ -2491,7 +2512,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) handle_t *handle; int i, err; - ext_debug("truncate since %u\n", start); + ext_debug("truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ handle = ext4_journal_start(inode, depth + 1); @@ -2504,6 +2525,61 @@ again: trace_ext4_ext_remove_space(inode, start, depth); /* + * Check if we are removing extents inside the extent tree. If that + * is the case, we are going to punch a hole inside the extent tree + * so we have to check whether we need to split the extent covering + * the last block to remove so we can easily remove the part of it + * in ext4_ext_rm_leaf(). + */ + if (end < EXT_MAX_BLOCKS - 1) { + struct ext4_extent *ex; + ext4_lblk_t ee_block; + + /* find extent for this block */ + path = ext4_ext_find_extent(inode, end, NULL); + if (IS_ERR(path)) { + ext4_journal_stop(handle); + return PTR_ERR(path); + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) + goto cont; + + ee_block = le32_to_cpu(ex->ee_block); + + /* + * See if the last block is inside the extent, if so split + * the extent at 'end' block so we can easily remove the + * tail of the first part of the split extent in + * ext4_ext_rm_leaf(). + */ + if (end >= ee_block && + end < ee_block + ext4_ext_get_actual_len(ex) - 1) { + int split_flag = 0; + + if (ext4_ext_is_uninitialized(ex)) + split_flag = EXT4_EXT_MARK_UNINIT1 | + EXT4_EXT_MARK_UNINIT2; + + /* + * Split the extent in two so that 'end' is the last + * block in the first new extent + */ + err = ext4_split_extent_at(handle, inode, path, + end + 1, split_flag, + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_PUNCH_OUT_EXT); + + if (err < 0) + goto out; + } + ext4_ext_drop_refs(path); + kfree(path); + } +cont: + + /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. */ @@ -2515,6 +2591,7 @@ again: } path[0].p_depth = depth; path[0].p_hdr = ext_inode_hdr(inode); + if (ext4_ext_check(inode, path[0].p_hdr, depth)) { err = -EIO; goto out; @@ -2526,7 +2603,7 @@ again: /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, &partial_cluster, start, - EXT_MAX_BLOCKS - 1); + end); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -2651,17 +2728,17 @@ void ext4_ext_init(struct super_block *sb) if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) - printk(KERN_INFO "EXT4-fs: file extents enabled"); + printk(KERN_INFO "EXT4-fs: file extents enabled" #ifdef AGGRESSIVE_TEST - printk(", aggressive tests"); + ", aggressive tests" #endif #ifdef CHECK_BINSEARCH - printk(", check binsearch"); + ", check binsearch" #endif #ifdef EXTENTS_STATS - printk(", stats"); + ", stats" #endif - printk("\n"); + "\n"); #endif #ifdef EXTENTS_STATS spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); @@ -2709,14 +2786,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) } /* - * used by extent splitting. - */ -#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ - due to ENOSPC */ -#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ -#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ - -/* * ext4_split_extent_at() splits an extent at given block. * * @handle: the journal handle @@ -3224,11 +3293,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, depth = ext_depth(inode); eh = path[depth].p_hdr; - if (unlikely(!eh->eh_entries)) { - EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and " - "EOFBLOCKS_FL set"); - return -EIO; - } + /* + * We're going to remove EOFBLOCKS_FL entirely in future so we + * do not care for this case anymore. Simply remove the flag + * if there are no extents. + */ + if (unlikely(!eh->eh_entries)) + goto out; last_ex = EXT_LAST_EXTENT(eh); /* * We should clear the EOFBLOCKS_FL flag if we are writing the @@ -3252,6 +3323,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, for (i = depth-1; i >= 0; i--) if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) return 0; +out: ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); return ext4_mark_inode_dirty(handle, inode); } @@ -3710,8 +3782,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, int free_on_err = 0, err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; - unsigned int punched_out = 0; - unsigned int result = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; ext4_lblk_t cluster_offset; @@ -3721,8 +3791,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* check in cache */ - if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && - ext4_ext_in_cache(inode, map->m_lblk, &newex)) { + if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((sbi->s_cluster_ratio > 1) && ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) @@ -3790,113 +3859,25 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* if found extent covers block, simply return it */ if (in_range(map->m_lblk, ee_block, ee_len)) { - struct ext4_map_blocks punch_map; - ext4_fsblk_t partial_cluster = 0; - newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (map->m_lblk - ee_block); ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); - if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) { - /* - * Do not put uninitialized extent - * in the cache - */ - if (!ext4_ext_is_uninitialized(ex)) { - ext4_ext_put_in_cache(inode, ee_block, - ee_len, ee_start); - goto out; - } - ret = ext4_ext_handle_uninitialized_extents( - handle, inode, map, path, flags, - allocated, newblock); - return ret; - } - - /* - * Punch out the map length, but only to the - * end of the extent - */ - punched_out = allocated < map->m_len ? - allocated : map->m_len; - /* - * Sense extents need to be converted to - * uninitialized, they must fit in an - * uninitialized extent + * Do not put uninitialized extent + * in the cache */ - if (punched_out > EXT_UNINIT_MAX_LEN) - punched_out = EXT_UNINIT_MAX_LEN; - - punch_map.m_lblk = map->m_lblk; - punch_map.m_pblk = newblock; - punch_map.m_len = punched_out; - punch_map.m_flags = 0; - - /* Check to see if the extent needs to be split */ - if (punch_map.m_len != ee_len || - punch_map.m_lblk != ee_block) { - - ret = ext4_split_extent(handle, inode, - path, &punch_map, 0, - EXT4_GET_BLOCKS_PUNCH_OUT_EXT | - EXT4_GET_BLOCKS_PRE_IO); - - if (ret < 0) { - err = ret; - goto out2; - } - /* - * find extent for the block at - * the start of the hole - */ - ext4_ext_drop_refs(path); - kfree(path); - - path = ext4_ext_find_extent(inode, - map->m_lblk, NULL); - if (IS_ERR(path)) { - err = PTR_ERR(path); - path = NULL; - goto out2; - } - - depth = ext_depth(inode); - ex = path[depth].p_ext; - ee_len = ext4_ext_get_actual_len(ex); - ee_block = le32_to_cpu(ex->ee_block); - ee_start = ext4_ext_pblock(ex); - - } - - ext4_ext_mark_uninitialized(ex); - - ext4_ext_invalidate_cache(inode); - - err = ext4_ext_rm_leaf(handle, inode, path, - &partial_cluster, map->m_lblk, - map->m_lblk + punched_out); - - if (!err && path->p_hdr->eh_entries == 0) { - /* - * Punch hole freed all of this sub tree, - * so we need to correct eh_depth - */ - err = ext4_ext_get_access(handle, inode, path); - if (err == 0) { - ext_inode_hdr(inode)->eh_depth = 0; - ext_inode_hdr(inode)->eh_max = - cpu_to_le16(ext4_ext_space_root( - inode, 0)); - - err = ext4_ext_dirty( - handle, inode, path); - } + if (!ext4_ext_is_uninitialized(ex)) { + ext4_ext_put_in_cache(inode, ee_block, + ee_len, ee_start); + goto out; } - - goto out2; + ret = ext4_ext_handle_uninitialized_extents( + handle, inode, map, path, flags, + allocated, newblock); + return ret; } } @@ -4165,13 +4146,11 @@ out2: ext4_ext_drop_refs(path); kfree(path); } - result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? - punched_out : allocated; trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, - newblock, map->m_len, err ? err : result); + newblock, map->m_len, err ? err : allocated); - return err ? err : result; + return err ? err : allocated; } void ext4_ext_truncate(struct inode *inode) @@ -4228,7 +4207,7 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_ext_remove_space(inode, last_block); + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); /* In a multi-transaction truncate, we only make the final * transaction synchronous. @@ -4436,10 +4415,11 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, EXT4_GET_BLOCKS_IO_CONVERT_EXT); if (ret <= 0) { WARN_ON(ret <= 0); - printk(KERN_ERR "%s: ext4_ext_map_blocks " - "returned error inode#%lu, block=%u, " - "max_blocks=%u", __func__, - inode->i_ino, map.m_lblk, map.m_len); + ext4_msg(inode->i_sb, KERN_ERR, + "%s:%d: inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + __func__, __LINE__, inode->i_ino, map.m_lblk, + map.m_len, ret); } ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); @@ -4705,14 +4685,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file->f_path.dentry->d_inode; struct super_block *sb = inode->i_sb; - struct ext4_ext_cache cache_ex; - ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks; + ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; - struct ext4_map_blocks map; handle_t *handle; loff_t first_page, last_page, page_len; loff_t first_page_offset, last_page_offset; - int ret, credits, blocks_released, err = 0; + int credits, err = 0; /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) @@ -4728,10 +4706,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) offset; } - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; last_page = (offset + length) >> PAGE_CACHE_SHIFT; @@ -4810,7 +4784,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) } } - /* * If i_size is contained in the last page, we need to * unmap and zero the partial page after i_size @@ -4830,73 +4803,22 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) } } + first_block = (offset + sb->s_blocksize - 1) >> + EXT4_BLOCK_SIZE_BITS(sb); + stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + /* If there are no blocks to remove, return now */ - if (first_block >= last_block) + if (first_block >= stop_block) goto out; down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_invalidate_cache(inode); ext4_discard_preallocations(inode); - /* - * Loop over all the blocks and identify blocks - * that need to be punched out - */ - iblock = first_block; - blocks_released = 0; - while (iblock < last_block) { - max_blocks = last_block - iblock; - num_blocks = 1; - memset(&map, 0, sizeof(map)); - map.m_lblk = iblock; - map.m_len = max_blocks; - ret = ext4_ext_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_PUNCH_OUT_EXT); - - if (ret > 0) { - blocks_released += ret; - num_blocks = ret; - } else if (ret == 0) { - /* - * If map blocks could not find the block, - * then it is in a hole. If the hole was - * not already cached, then map blocks should - * put it in the cache. So we can get the hole - * out of the cache - */ - memset(&cache_ex, 0, sizeof(cache_ex)); - if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) && - !cache_ex.ec_start) { - - /* The hole is cached */ - num_blocks = cache_ex.ec_block + - cache_ex.ec_len - iblock; - - } else { - /* The block could not be identified */ - err = -EIO; - break; - } - } else { - /* Map blocks error */ - err = ret; - break; - } - - if (num_blocks == 0) { - /* This condition should never happen */ - ext_debug("Block lookup failed"); - err = -EIO; - break; - } - - iblock += num_blocks; - } + err = ext4_ext_remove_space(inode, first_block, stop_block - 1); - if (blocks_released > 0) { - ext4_ext_invalidate_cache(inode); - ext4_discard_preallocations(inode); - } + ext4_ext_invalidate_cache(inode); + ext4_discard_preallocations(inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 00a2cb753ef..bb6c7d81131 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -89,6 +89,7 @@ int ext4_flush_completed_IO(struct inode *inode) io = list_entry(ei->i_completed_io_list.next, ext4_io_end_t, list); list_del_init(&io->list); + io->flag |= EXT4_IO_END_IN_FSYNC; /* * Calling ext4_end_io_nolock() to convert completed * IO to written. @@ -108,6 +109,7 @@ int ext4_flush_completed_IO(struct inode *inode) if (ret < 0) ret2 = ret; spin_lock_irqsave(&ei->i_completed_io_lock, flags); + io->flag &= ~EXT4_IO_END_IN_FSYNC; } spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); return (ret2 < 0) ? ret2 : 0; diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index ac8f168c8ab..fa8e4911d35 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -200,8 +200,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) return -1; } hash = hash & ~1; - if (hash == (EXT4_HTREE_EOF << 1)) - hash = (EXT4_HTREE_EOF-1) << 1; + if (hash == (EXT4_HTREE_EOF_32BIT << 1)) + hash = (EXT4_HTREE_EOF_32BIT - 1) << 1; hinfo->hash = hash; hinfo->minor_hash = minor_hash; return 0; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 25d8c9781ad..409c2ee7750 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -92,6 +92,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, return EXT4_INODES_PER_GROUP(sb); } +void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + set_bitmap_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + /* * Read the inode allocation bitmap for a given block_group, reading * into the specified slot in the superblock's bitmap cache. @@ -147,18 +157,18 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) return bh; } /* - * submit the buffer_head for read. We can - * safely mark the bitmap as uptodate now. - * We do it here so the bitmap uptodate bit - * get set with buffer lock held. + * submit the buffer_head for reading */ trace_ext4_load_inode_bitmap(sb, block_group); - set_bitmap_uptodate(bh); - if (bh_submit_read(bh) < 0) { + bh->b_end_io = ext4_end_bitmap_read; + get_bh(bh); + submit_bh(READ, bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { put_bh(bh); ext4_error(sb, "Cannot read inode bitmap - " - "block_group = %u, inode_bitmap = %llu", - block_group, bitmap_blk); + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); return NULL; } return bh; @@ -194,19 +204,20 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; - if (atomic_read(&inode->i_count) > 1) { - printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", - atomic_read(&inode->i_count)); + if (!sb) { + printk(KERN_ERR "EXT4-fs: %s:%d: inode on " + "nonexistent device\n", __func__, __LINE__); return; } - if (inode->i_nlink) { - printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n", - inode->i_nlink); + if (atomic_read(&inode->i_count) > 1) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", + __func__, __LINE__, inode->i_ino, + atomic_read(&inode->i_count)); return; } - if (!sb) { - printk(KERN_ERR "ext4_free_inode: inode on " - "nonexistent device\n"); + if (inode->i_nlink) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n", + __func__, __LINE__, inode->i_ino, inode->i_nlink); return; } sbi = EXT4_SB(sb); @@ -593,94 +604,6 @@ static int find_group_other(struct super_block *sb, struct inode *parent, } /* - * claim the inode from the inode bitmap. If the group - * is uninit we need to take the groups's ext4_group_lock - * and clear the uninit flag. The inode bitmap update - * and group desc uninit flag clear should be done - * after holding ext4_group_lock so that ext4_read_inode_bitmap - * doesn't race with the ext4_claim_inode - */ -static int ext4_claim_inode(struct super_block *sb, - struct buffer_head *inode_bitmap_bh, - unsigned long ino, ext4_group_t group, umode_t mode) -{ - int free = 0, retval = 0, count; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_group_info *grp = ext4_get_group_info(sb, group); - struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); - - /* - * We have to be sure that new inode allocation does not race with - * inode table initialization, because otherwise we may end up - * allocating and writing new inode right before sb_issue_zeroout - * takes place and overwriting our new inode with zeroes. So we - * take alloc_sem to prevent it. - */ - down_read(&grp->alloc_sem); - ext4_lock_group(sb, group); - if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) { - /* not a free inode */ - retval = 1; - goto err_ret; - } - ino++; - if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || - ino > EXT4_INODES_PER_GROUP(sb)) { - ext4_unlock_group(sb, group); - up_read(&grp->alloc_sem); - ext4_error(sb, "reserved inode or inode > inodes count - " - "block_group = %u, inode=%lu", group, - ino + group * EXT4_INODES_PER_GROUP(sb)); - return 1; - } - /* If we didn't allocate from within the initialized part of the inode - * table then we need to initialize up to this inode. */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { - - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); - /* When marking the block group with - * ~EXT4_BG_INODE_UNINIT we don't want to depend - * on the value of bg_itable_unused even though - * mke2fs could have initialized the same for us. - * Instead we calculated the value below - */ - - free = 0; - } else { - free = EXT4_INODES_PER_GROUP(sb) - - ext4_itable_unused_count(sb, gdp); - } - - /* - * Check the relative inode number against the last used - * relative inode number in this group. if it is greater - * we need to update the bg_itable_unused count - * - */ - if (ino > free) - ext4_itable_unused_set(sb, gdp, - (EXT4_INODES_PER_GROUP(sb) - ino)); - } - count = ext4_free_inodes_count(sb, gdp) - 1; - ext4_free_inodes_set(sb, gdp, count); - if (S_ISDIR(mode)) { - count = ext4_used_dirs_count(sb, gdp) + 1; - ext4_used_dirs_set(sb, gdp, count); - if (sbi->s_log_groups_per_flex) { - ext4_group_t f = ext4_flex_group(sbi, group); - - atomic_inc(&sbi->s_flex_groups[f].used_dirs); - } - } - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); -err_ret: - ext4_unlock_group(sb, group); - up_read(&grp->alloc_sem); - return retval; -} - -/* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both * free space and a low directory-to-inode ratio; if that fails, then of @@ -741,6 +664,11 @@ got_group: if (ret2 == -1) goto out; + /* + * Normally we will only go through one pass of this loop, + * unless we get unlucky and it turns out the group we selected + * had its last inode grabbed by someone else. + */ for (i = 0; i < ngroups; i++, ino = 0) { err = -EIO; @@ -757,51 +685,24 @@ repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) inode_bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); - - if (ino < EXT4_INODES_PER_GROUP(sb)) { - - BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, - inode_bitmap_bh); - if (err) - goto fail; - - BUFFER_TRACE(group_desc_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, - group_desc_bh); - if (err) - goto fail; - if (!ext4_claim_inode(sb, inode_bitmap_bh, - ino, group, mode)) { - /* we won it */ - BUFFER_TRACE(inode_bitmap_bh, - "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, - NULL, - inode_bitmap_bh); - if (err) - goto fail; - /* zero bit is inode number 1*/ - ino++; - goto got; - } - /* we lost it */ - ext4_handle_release_buffer(handle, inode_bitmap_bh); - ext4_handle_release_buffer(handle, group_desc_bh); - - if (++ino < EXT4_INODES_PER_GROUP(sb)) - goto repeat_in_this_group; + if (ino >= EXT4_INODES_PER_GROUP(sb)) { + if (++group == ngroups) + group = 0; + continue; } - - /* - * This case is possible in concurrent environment. It is very - * rare. We cannot repeat the find_group_xxx() call because - * that will simply return the same blockgroup, because the - * group descriptor metadata has not yet been updated. - * So we just go onto the next blockgroup. - */ - if (++group == ngroups) - group = 0; + if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { + ext4_error(sb, "reserved inode found cleared - " + "inode=%lu", ino + 1); + continue; + } + ext4_lock_group(sb, group); + ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); + ext4_unlock_group(sb, group); + ino++; /* the inode bitmap is zero-based */ + if (!ret2) + goto got; /* we grabbed the inode! */ + if (ino < EXT4_INODES_PER_GROUP(sb)) + goto repeat_in_this_group; } err = -ENOSPC; goto out; @@ -838,6 +739,59 @@ got: if (err) goto fail; } + + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode_bitmap_bh); + if (err) + goto fail; + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, group_desc_bh); + if (err) + goto fail; + + /* Update the relevant bg descriptor fields */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + int free; + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + + down_read(&grp->alloc_sem); /* protect vs itable lazyinit */ + ext4_lock_group(sb, group); /* while we modify the bg desc */ + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + free = 0; + } + /* + * Check the relative inode number against the last used + * relative inode number in this group. if it is greater + * we need to update the bg_itable_unused count + */ + if (ino > free) + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - ino)); + up_read(&grp->alloc_sem); + } + ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); + if (S_ISDIR(mode)) { + ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, group); + + atomic_inc(&sbi->s_flex_groups[f].used_dirs); + } + } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + ext4_unlock_group(sb, group); + } + + BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); + if (err) + goto fail; + BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); if (err) @@ -1101,7 +1055,7 @@ unsigned long ext4_count_dirs(struct super_block * sb) * where it is called from on active part of filesystem is ext4lazyinit * thread, so we do not need any special locks, however we have to prevent * inode allocation from the current group, so we take alloc_sem lock, to - * block ext4_claim_inode until we are finished. + * block ext4_new_inode() until we are finished. */ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier) @@ -1149,9 +1103,9 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, sbi->s_inodes_per_block); if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { - ext4_error(sb, "Something is wrong with group %u\n" - "Used itable blocks: %d" - "itable unused count: %u\n", + ext4_error(sb, "Something is wrong with group %u: " + "used itable blocks: %d; " + "itable unused count: %u", group, used_blks, ext4_itable_unused_count(sb, gdp)); ret = 1; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index feaa82fe629..c77b0bd2c71 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -272,7 +272,7 @@ void ext4_da_update_reserve_space(struct inode *inode, trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " - "with only %d reserved data blocks\n", + "with only %d reserved data blocks", __func__, inode->i_ino, used, ei->i_reserved_data_blocks); WARN_ON(1); @@ -1165,7 +1165,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free) */ ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " "ino %lu, to_free %d with only %d reserved " - "data blocks\n", inode->i_ino, to_free, + "data blocks", inode->i_ino, to_free, ei->i_reserved_data_blocks); WARN_ON(1); to_free = ei->i_reserved_data_blocks; @@ -1428,20 +1428,22 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - printk(KERN_CRIT "Total free blocks count %lld\n", + struct super_block *sb = inode->i_sb; + + ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", EXT4_C2B(EXT4_SB(inode->i_sb), ext4_count_free_clusters(inode->i_sb))); - printk(KERN_CRIT "Free/Dirty block details\n"); - printk(KERN_CRIT "free_blocks=%lld\n", + ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); + ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(inode->i_sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); - printk(KERN_CRIT "dirty_blocks=%lld\n", + ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(inode->i_sb), percpu_counter_sum(&sbi->s_dirtyclusters_counter))); - printk(KERN_CRIT "Block reservation details\n"); - printk(KERN_CRIT "i_reserved_data_blocks=%u\n", - EXT4_I(inode)->i_reserved_data_blocks); - printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", + ext4_msg(sb, KERN_CRIT, "Block reservation details"); + ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", + EXT4_I(inode)->i_reserved_data_blocks); + ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", EXT4_I(inode)->i_reserved_meta_blocks); return; } @@ -2482,13 +2484,14 @@ static int ext4_da_write_end(struct file *file, int write_mode = (int)(unsigned long)fsdata; if (write_mode == FALL_BACK_TO_NONDELALLOC) { - if (ext4_should_order_data(inode)) { + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: return ext4_ordered_write_end(file, mapping, pos, len, copied, page, fsdata); - } else if (ext4_should_writeback_data(inode)) { + case EXT4_INODE_WRITEBACK_DATA_MODE: return ext4_writeback_write_end(file, mapping, pos, len, copied, page, fsdata); - } else { + default: BUG(); } } @@ -2763,7 +2766,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, goto out; ext_debug("ext4_end_io_dio(): io_end 0x%p " - "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", + "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", iocb->private, io_end->inode->i_ino, iocb, offset, size); @@ -2795,9 +2798,6 @@ out: /* queue the work to convert unwritten extents to written */ queue_work(wq, &io_end->work); - - /* XXX: probably should move into the real I/O completion handler */ - inode_dio_done(inode); } static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) @@ -2811,8 +2811,9 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) goto out; if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { - printk("sb umounted, discard end_io request for inode %lu\n", - io_end->inode->i_ino); + ext4_msg(io_end->inode->i_sb, KERN_INFO, + "sb umounted, discard end_io request for inode %lu", + io_end->inode->i_ino); ext4_free_io_end(io_end); goto out; } @@ -2921,9 +2922,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, iocb->private = NULL; EXT4_I(inode)->cur_aio_dio = NULL; if (!is_sync_kiocb(iocb)) { - iocb->private = ext4_init_io_end(inode, GFP_NOFS); - if (!iocb->private) + ext4_io_end_t *io_end = + ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) return -ENOMEM; + io_end->flag |= EXT4_IO_END_DIRECT; + iocb->private = io_end; /* * we save the io structure for current async * direct IO, so that later ext4_map_blocks() @@ -2940,7 +2944,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, ext4_get_block_write, ext4_end_io_dio, NULL, - DIO_LOCKING | DIO_SKIP_HOLES); + DIO_LOCKING); if (iocb->private) EXT4_I(inode)->cur_aio_dio = NULL; /* @@ -3086,18 +3090,25 @@ static const struct address_space_operations ext4_da_aops = { void ext4_set_aops(struct inode *inode) { - if (ext4_should_order_data(inode) && - test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else if (ext4_should_order_data(inode)) - inode->i_mapping->a_ops = &ext4_ordered_aops; - else if (ext4_should_writeback_data(inode) && - test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else if (ext4_should_writeback_data(inode)) - inode->i_mapping->a_ops = &ext4_writeback_aops; - else + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_ordered_aops; + break; + case EXT4_INODE_WRITEBACK_DATA_MODE: + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_writeback_aops; + break; + case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; + break; + default: + BUG(); + } } @@ -3329,16 +3340,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file->f_path.dentry->d_inode; if (!S_ISREG(inode->i_mode)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { /* TODO: Add support for non extent hole punching */ - return -ENOTSUPP; + return -EOPNOTSUPP; } if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { /* TODO: Add support for bigalloc file systems */ - return -ENOTSUPP; + return -EOPNOTSUPP; } return ext4_ext_punch_hole(file, offset, length); @@ -3924,10 +3935,8 @@ static int ext4_do_update_inode(handle_t *handle, ext4_update_dynamic_rev(sb); EXT4_SET_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_LARGE_FILE); - sb->s_dirt = 1; ext4_handle_sync(handle); - err = ext4_handle_dirty_metadata(handle, NULL, - EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_super(handle, sb); } } raw_inode->i_generation = cpu_to_le32(inode->i_generation); @@ -4152,11 +4161,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(inode)) { + if (attr->ia_size != i_size_read(inode)) truncate_setsize(inode, attr->ia_size); - ext4_truncate(inode); - } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) - ext4_truncate(inode); + ext4_truncate(inode); } if (!rc) { @@ -4314,7 +4321,7 @@ int ext4_mark_iloc_dirty(handle_t *handle, { int err = 0; - if (test_opt(inode->i_sb, I_VERSION)) + if (IS_I_VERSION(inode)) inode_inc_iversion(inode); /* the do_update_inode consumes one bh->b_count */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cb990b21c69..99ab428bcfa 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -21,6 +21,7 @@ * mballoc.c contains the multiblocks allocation routines */ +#include "ext4_jbd2.h" #include "mballoc.h" #include <linux/debugfs.h> #include <linux/slab.h> @@ -339,7 +340,7 @@ */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; -static struct kmem_cache *ext4_free_ext_cachep; +static struct kmem_cache *ext4_free_data_cachep; /* We create slab caches for groupinfo data structures based on the * superblock block size. There will be one per mounted filesystem for @@ -357,7 +358,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int rc); static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { @@ -425,7 +427,7 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; - BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(max == NULL); if (order > e4b->bd_blkbits + 1) { @@ -436,10 +438,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) /* at order 0 we see each particular block */ if (order == 0) { *max = 1 << (e4b->bd_blkbits + 3); - return EXT4_MB_BITMAP(e4b); + return e4b->bd_bitmap; } - bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; + bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; return bb; @@ -588,7 +590,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, for (j = 0; j < (1 << order); j++) { k = (i * (1 << order)) + j; MB_CHECK_ASSERT( - !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); + !mb_test_bit(k, e4b->bd_bitmap)); } count++; } @@ -782,7 +784,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) int groups_per_page; int err = 0; int i; - ext4_group_t first_group; + ext4_group_t first_group, group; int first_block; struct super_block *sb; struct buffer_head *bhs; @@ -806,24 +808,23 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* allocate buffer_heads to read bitmaps */ if (groups_per_page > 1) { - err = -ENOMEM; i = sizeof(struct buffer_head *) * groups_per_page; bh = kzalloc(i, GFP_NOFS); - if (bh == NULL) + if (bh == NULL) { + err = -ENOMEM; goto out; + } } else bh = &bhs; first_group = page->index * blocks_per_page / 2; /* read all groups the page covers into the cache */ - for (i = 0; i < groups_per_page; i++) { - struct ext4_group_desc *desc; - - if (first_group + i >= ngroups) + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (group >= ngroups) break; - grinfo = ext4_get_group_info(sb, first_group + i); + grinfo = ext4_get_group_info(sb, group); /* * If page is uptodate then we came here after online resize * which added some new uninitialized group info structs, so @@ -834,69 +835,21 @@ static int ext4_mb_init_cache(struct page *page, char *incore) bh[i] = NULL; continue; } - - err = -EIO; - desc = ext4_get_group_desc(sb, first_group + i, NULL); - if (desc == NULL) - goto out; - - err = -ENOMEM; - bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc)); - if (bh[i] == NULL) + if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) { + err = -ENOMEM; goto out; - - if (bitmap_uptodate(bh[i])) - continue; - - lock_buffer(bh[i]); - if (bitmap_uptodate(bh[i])) { - unlock_buffer(bh[i]); - continue; - } - ext4_lock_group(sb, first_group + i); - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - ext4_init_block_bitmap(sb, bh[i], - first_group + i, desc); - set_bitmap_uptodate(bh[i]); - set_buffer_uptodate(bh[i]); - ext4_unlock_group(sb, first_group + i); - unlock_buffer(bh[i]); - continue; } - ext4_unlock_group(sb, first_group + i); - if (buffer_uptodate(bh[i])) { - /* - * if not uninit if bh is uptodate, - * bitmap is also uptodate - */ - set_bitmap_uptodate(bh[i]); - unlock_buffer(bh[i]); - continue; - } - get_bh(bh[i]); - /* - * submit the buffer_head for read. We can - * safely mark the bitmap as uptodate now. - * We do it here so the bitmap uptodate bit - * get set with buffer lock held. - */ - set_bitmap_uptodate(bh[i]); - bh[i]->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh[i]); - mb_debug(1, "read bitmap for group %u\n", first_group + i); + mb_debug(1, "read bitmap for group %u\n", group); } /* wait for I/O completion */ - for (i = 0; i < groups_per_page; i++) - if (bh[i]) - wait_on_buffer(bh[i]); - - err = -EIO; - for (i = 0; i < groups_per_page; i++) - if (bh[i] && !buffer_uptodate(bh[i])) + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) { + err = -EIO; goto out; + } + } - err = 0; first_block = page->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { int group; @@ -1250,10 +1203,10 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) int order = 1; void *bb; - BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); - bb = EXT4_MB_BUDDY(e4b); + bb = e4b->bd_buddy; while (order <= e4b->bd_blkbits + 1) { block = block >> 1; if (!mb_test_bit(block, bb)) { @@ -1323,9 +1276,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, /* let's maintain fragments counter */ if (first != 0) - block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); + block = !mb_test_bit(first - 1, e4b->bd_bitmap); if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) - max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); + max = !mb_test_bit(first + count, e4b->bd_bitmap); if (block && max) e4b->bd_info->bb_fragments--; else if (!block && !max) @@ -1336,7 +1289,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, block = first++; order = 0; - if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { + if (!mb_test_bit(block, e4b->bd_bitmap)) { ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); @@ -1347,7 +1300,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, "freeing already freed block " "(bit %u)", block); } - mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); + mb_clear_bit(block, e4b->bd_bitmap); e4b->bd_info->bb_counters[order]++; /* start of the buddy */ @@ -1429,7 +1382,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, break; next = (block + 1) * (1 << order); - if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) + if (mb_test_bit(next, e4b->bd_bitmap)) break; order = mb_find_order_for_block(e4b, next); @@ -1466,9 +1419,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) /* let's maintain fragments counter */ if (start != 0) - mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); + mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) - max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); + max = !mb_test_bit(start + len, e4b->bd_bitmap); if (mlen && max) e4b->bd_info->bb_fragments++; else if (!mlen && !max) @@ -1511,7 +1464,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); - ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); return ret; @@ -1810,7 +1763,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; - void *bitmap = EXT4_MB_BITMAP(e4b); + void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; int i; int free; @@ -1870,7 +1823,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - void *bitmap = EXT4_MB_BITMAP(e4b); + void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; ext4_fsblk_t first_group_block; ext4_fsblk_t a; @@ -2224,7 +2177,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, EXT4_DESC_PER_BLOCK_BITS(sb); meta_group_info = kmalloc(metalen, GFP_KERNEL); if (meta_group_info == NULL) { - ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " + ext4_msg(sb, KERN_ERR, "can't allocate mem " "for a buddy group"); goto exit_meta_group_info; } @@ -2238,7 +2191,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); if (meta_group_info[i] == NULL) { - ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); + ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); goto exit_group_info; } memset(meta_group_info[i], 0, kmem_cache_size(cachep)); @@ -2522,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_fops, sb); - if (sbi->s_journal) - sbi->s_journal->j_commit_callback = release_blocks_on_commit; - return 0; out_free_locality_groups: @@ -2637,58 +2587,55 @@ static inline int ext4_issue_discard(struct super_block *sb, * This function is called by the jbd2 layer once the commit has finished, * so we know we can free the blocks that were released with that commit. */ -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int rc) { - struct super_block *sb = journal->j_private; + struct ext4_free_data *entry = (struct ext4_free_data *)jce; struct ext4_buddy e4b; struct ext4_group_info *db; int err, count = 0, count2 = 0; - struct ext4_free_data *entry; - struct list_head *l, *ltmp; - list_for_each_safe(l, ltmp, &txn->t_private_list) { - entry = list_entry(l, struct ext4_free_data, list); + mb_debug(1, "gonna free %u blocks in group %u (0x%p):", + entry->efd_count, entry->efd_group, entry); - mb_debug(1, "gonna free %u blocks in group %u (0x%p):", - entry->count, entry->group, entry); + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, entry->efd_count); - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, entry->group, - entry->start_cluster, entry->count); + err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); + /* we expect to find existing buddy because it's pinned */ + BUG_ON(err != 0); - err = ext4_mb_load_buddy(sb, entry->group, &e4b); - /* we expect to find existing buddy because it's pinned */ - BUG_ON(err != 0); - db = e4b.bd_info; - /* there are blocks to put in buddy to make them really free */ - count += entry->count; - count2++; - ext4_lock_group(sb, entry->group); - /* Take it out of per group rb tree */ - rb_erase(&entry->node, &(db->bb_free_root)); - mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count); + db = e4b.bd_info; + /* there are blocks to put in buddy to make them really free */ + count += entry->efd_count; + count2++; + ext4_lock_group(sb, entry->efd_group); + /* Take it out of per group rb tree */ + rb_erase(&entry->efd_node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); - /* - * Clear the trimmed flag for the group so that the next - * ext4_trim_fs can trim it. - * If the volume is mounted with -o discard, online discard - * is supported and the free blocks will be trimmed online. - */ - if (!test_opt(sb, DISCARD)) - EXT4_MB_GRP_CLEAR_TRIMMED(db); + /* + * Clear the trimmed flag for the group so that the next + * ext4_trim_fs can trim it. + * If the volume is mounted with -o discard, online discard + * is supported and the free blocks will be trimmed online. + */ + if (!test_opt(sb, DISCARD)) + EXT4_MB_GRP_CLEAR_TRIMMED(db); - if (!db->bb_free_root.rb_node) { - /* No more items in the per group rb tree - * balance refcounts from ext4_mb_free_metadata() - */ - page_cache_release(e4b.bd_buddy_page); - page_cache_release(e4b.bd_bitmap_page); - } - ext4_unlock_group(sb, entry->group); - kmem_cache_free(ext4_free_ext_cachep, entry); - ext4_mb_unload_buddy(&e4b); + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); } + ext4_unlock_group(sb, entry->efd_group); + kmem_cache_free(ext4_free_data_cachep, entry); + ext4_mb_unload_buddy(&e4b); mb_debug(1, "freed %u blocks in %u structures\n", count, count2); } @@ -2741,9 +2688,9 @@ int __init ext4_init_mballoc(void) return -ENOMEM; } - ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data, - SLAB_RECLAIM_ACCOUNT); - if (ext4_free_ext_cachep == NULL) { + ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, + SLAB_RECLAIM_ACCOUNT); + if (ext4_free_data_cachep == NULL) { kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); return -ENOMEM; @@ -2761,7 +2708,7 @@ void ext4_exit_mballoc(void) rcu_barrier(); kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); - kmem_cache_destroy(ext4_free_ext_cachep); + kmem_cache_destroy(ext4_free_data_cachep); ext4_groupinfo_destroy_slabs(); ext4_remove_debugfs_entry(); } @@ -2815,7 +2762,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_data_block_valid(sbi, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " - "fs metadata\n", block, block+len); + "fs metadata", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and repeat the block allocation * We leak some of the blocks here. @@ -2911,7 +2858,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits, max; ext4_lblk_t end; - loff_t size, orig_size, start_off; + loff_t size, start_off; + loff_t orig_size __maybe_unused; ext4_lblk_t start; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *pa; @@ -3321,8 +3269,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, n = rb_first(&(grp->bb_free_root)); while (n) { - entry = rb_entry(n, struct ext4_free_data, node); - ext4_set_bits(bitmap, entry->start_cluster, entry->count); + entry = rb_entry(n, struct ext4_free_data, efd_node); + ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); n = rb_next(n); } return; @@ -3916,11 +3864,11 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) return; - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:" + ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:" " Allocation context details:"); - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d", + ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d", ac->ac_status, ac->ac_flags); - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, " + ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " "goal %lu/%lu/%lu@%lu, " "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, @@ -3936,9 +3884,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found", + ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found", ac->ac_ex_scanned, ac->ac_found); - ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: "); + ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); @@ -4428,9 +4376,9 @@ out: static int can_merge(struct ext4_free_data *entry1, struct ext4_free_data *entry2) { - if ((entry1->t_tid == entry2->t_tid) && - (entry1->group == entry2->group) && - ((entry1->start_cluster + entry1->count) == entry2->start_cluster)) + if ((entry1->efd_tid == entry2->efd_tid) && + (entry1->efd_group == entry2->efd_group) && + ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster)) return 1; return 0; } @@ -4452,8 +4400,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); - new_node = &new_entry->node; - cluster = new_entry->start_cluster; + new_node = &new_entry->efd_node; + cluster = new_entry->efd_start_cluster; if (!*n) { /* first free block exent. We need to @@ -4466,10 +4414,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, } while (*n) { parent = *n; - entry = rb_entry(parent, struct ext4_free_data, node); - if (cluster < entry->start_cluster) + entry = rb_entry(parent, struct ext4_free_data, efd_node); + if (cluster < entry->efd_start_cluster) n = &(*n)->rb_left; - else if (cluster >= (entry->start_cluster + entry->count)) + else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) n = &(*n)->rb_right; else { ext4_grp_locked_error(sb, group, 0, @@ -4486,34 +4434,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, /* Now try to see the extent can be merged to left and right */ node = rb_prev(new_node); if (node) { - entry = rb_entry(node, struct ext4_free_data, node); + entry = rb_entry(node, struct ext4_free_data, efd_node); if (can_merge(entry, new_entry)) { - new_entry->start_cluster = entry->start_cluster; - new_entry->count += entry->count; + new_entry->efd_start_cluster = entry->efd_start_cluster; + new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - spin_lock(&sbi->s_md_lock); - list_del(&entry->list); - spin_unlock(&sbi->s_md_lock); - kmem_cache_free(ext4_free_ext_cachep, entry); + ext4_journal_callback_del(handle, &entry->efd_jce); + kmem_cache_free(ext4_free_data_cachep, entry); } } node = rb_next(new_node); if (node) { - entry = rb_entry(node, struct ext4_free_data, node); + entry = rb_entry(node, struct ext4_free_data, efd_node); if (can_merge(new_entry, entry)) { - new_entry->count += entry->count; + new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - spin_lock(&sbi->s_md_lock); - list_del(&entry->list); - spin_unlock(&sbi->s_md_lock); - kmem_cache_free(ext4_free_ext_cachep, entry); + ext4_journal_callback_del(handle, &entry->efd_jce); + kmem_cache_free(ext4_free_data_cachep, entry); } } /* Add the extent to transaction's private list */ - spin_lock(&sbi->s_md_lock); - list_add(&new_entry->list, &handle->h_transaction->t_private_list); - spin_unlock(&sbi->s_md_lock); + ext4_journal_callback_add(handle, ext4_free_data_callback, + &new_entry->efd_jce); return 0; } @@ -4691,15 +4634,15 @@ do_more: * blocks being freed are metadata. these blocks shouldn't * be used until this transaction is committed */ - new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); if (!new_entry) { err = -ENOMEM; goto error_return; } - new_entry->start_cluster = bit; - new_entry->group = block_group; - new_entry->count = count_clusters; - new_entry->t_tid = handle->h_transaction->t_tid; + new_entry->efd_start_cluster = bit; + new_entry->efd_group = block_group; + new_entry->efd_count = count_clusters; + new_entry->efd_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); @@ -4971,11 +4914,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, start = (e4b.bd_info->bb_first_free > start) ? e4b.bd_info->bb_first_free : start; - while (start < max) { - start = mb_find_next_zero_bit(bitmap, max, start); - if (start >= max) + while (start <= max) { + start = mb_find_next_zero_bit(bitmap, max + 1, start); + if (start > max) break; - next = mb_find_next_bit(bitmap, max, start); + next = mb_find_next_bit(bitmap, max + 1, start); if ((next - start) >= minblocks) { ext4_trim_extent(sb, start, @@ -5027,37 +4970,36 @@ out: int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { struct ext4_group_info *grp; - ext4_group_t first_group, last_group; - ext4_group_t group, ngroups = ext4_get_groups_count(sb); + ext4_group_t group, first_group, last_group; ext4_grpblk_t cnt = 0, first_cluster, last_cluster; - uint64_t start, len, minlen, trimmed = 0; + uint64_t start, end, minlen, trimmed = 0; ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); int ret = 0; start = range->start >> sb->s_blocksize_bits; - len = range->len >> sb->s_blocksize_bits; + end = start + (range->len >> sb->s_blocksize_bits) - 1; minlen = range->minlen >> sb->s_blocksize_bits; - if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb))) + if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) || + unlikely(start >= max_blks)) return -EINVAL; - if (start + len <= first_data_blk) + if (end >= max_blks) + end = max_blks - 1; + if (end <= first_data_blk) goto out; - if (start < first_data_blk) { - len -= first_data_blk - start; + if (start < first_data_blk) start = first_data_blk; - } - /* Determine first and last group to examine based on start and len */ + /* Determine first and last group to examine based on start and end */ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, &first_group, &first_cluster); - ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, &last_group, &last_cluster); - last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; - last_cluster = EXT4_CLUSTERS_PER_GROUP(sb); - if (first_group > last_group) - return -EINVAL; + /* end now represents the last cluster to discard in this group */ + end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; for (group = first_group; group <= last_group; group++) { grp = ext4_get_group_info(sb, group); @@ -5069,31 +5011,35 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) } /* - * For all the groups except the last one, last block will - * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to - * change it for the last group in which case start + - * len < EXT4_BLOCKS_PER_GROUP(sb). + * For all the groups except the last one, last cluster will + * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to + * change it for the last group, note that last_cluster is + * already computed earlier by ext4_get_group_no_and_offset() */ - if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb)) - last_cluster = first_cluster + len; - len -= last_cluster - first_cluster; + if (group == last_group) + end = last_cluster; if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, - last_cluster, minlen); + end, minlen); if (cnt < 0) { ret = cnt; break; } + trimmed += cnt; } - trimmed += cnt; + + /* + * For every group except the first one, we are sure + * that the first cluster to discard will be cluster #0. + */ first_cluster = 0; } - range->len = trimmed * sb->s_blocksize; if (!ret) atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); out: + range->len = trimmed * sb->s_blocksize; return ret; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 47705f3285e..c070618c21c 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -96,21 +96,23 @@ extern u8 mb_enable_debug; struct ext4_free_data { - /* this links the free block information from group_info */ - struct rb_node node; + /* MUST be the first member */ + struct ext4_journal_cb_entry efd_jce; + + /* ext4_free_data private data starts from here */ - /* this links the free block information from ext4_sb_info */ - struct list_head list; + /* this links the free block information from group_info */ + struct rb_node efd_node; /* group which free block extent belongs */ - ext4_group_t group; + ext4_group_t efd_group; /* free block extent */ - ext4_grpblk_t start_cluster; - ext4_grpblk_t count; + ext4_grpblk_t efd_start_cluster; + ext4_grpblk_t efd_count; /* transaction which freed this extent */ - tid_t t_tid; + tid_t efd_tid; }; struct ext4_prealloc_space { @@ -210,8 +212,6 @@ struct ext4_buddy { __u16 bd_blkbits; ext4_group_t bd_group; }; -#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) -#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index e7d6bb0acfa..f39f80f8f2c 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -471,7 +471,7 @@ int ext4_ext_migrate(struct inode *inode) tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, S_IFREG, NULL, goal, owner); if (IS_ERR(tmp_inode)) { - retval = PTR_ERR(inode); + retval = PTR_ERR(tmp_inode); ext4_journal_stop(handle); return retval; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 7ea4ba4eff2..ed6548d8916 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -257,8 +257,8 @@ int ext4_multi_mount_protect(struct super_block *sb, * If check_interval in MMP block is larger, use that instead of * update_interval from the superblock. */ - if (mmp->mmp_check_interval > mmp_check_interval) - mmp_check_interval = mmp->mmp_check_interval; + if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval) + mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval); seq = le32_to_cpu(mmp->mmp_seq); if (seq == EXT4_MMP_SEQ_CLEAN) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2043f482375..349d7b3671c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -468,7 +468,7 @@ fail2: fail: if (*err == ERR_BAD_DX_DIR) ext4_warning(dir->i_sb, - "Corrupt dir inode %ld, running e2fsck is " + "Corrupt dir inode %lu, running e2fsck is " "recommended.", dir->i_ino); return NULL; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 47585189651..dcdeef169a6 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -110,6 +110,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io) if (io->iocb) aio_complete(io->iocb, io->result, 0); + if (io->flag & EXT4_IO_END_DIRECT) + inode_dio_done(inode); /* Wake up anyone waiting on unwritten extent conversion */ if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) wake_up_all(ext4_ioend_wq(io->inode)); @@ -127,12 +129,18 @@ static void ext4_end_io_work(struct work_struct *work) unsigned long flags; spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (io->flag & EXT4_IO_END_IN_FSYNC) + goto requeue; if (list_empty(&io->list)) { spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); goto free; } if (!mutex_trylock(&inode->i_mutex)) { + bool was_queued; +requeue: + was_queued = !!(io->flag & EXT4_IO_END_QUEUED); + io->flag |= EXT4_IO_END_QUEUED; spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); /* * Requeue the work instead of waiting so that the work @@ -145,9 +153,8 @@ static void ext4_end_io_work(struct work_struct *work) * yield the cpu if it sees an end_io request that has already * been requeued. */ - if (io->flag & EXT4_IO_END_QUEUED) + if (was_queued) yield(); - io->flag |= EXT4_IO_END_QUEUED; return; } list_del_init(&io->list); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index f9d948f0eb8..59fa0be2725 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1163,8 +1163,11 @@ static void ext4_update_super(struct super_block *sb, do_div(reserved_blocks, 100); ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); + ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks); le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * flex_gd->count); + le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * + flex_gd->count); /* * We need to protect s_groups_count against other CPUs seeing @@ -1465,6 +1468,7 @@ static int ext4_group_extend_no_check(struct super_block *sb, } ext4_blocks_count_set(es, o_blocks_count + add); + ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ @@ -1512,16 +1516,17 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, o_blocks_count = ext4_blocks_count(es); if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", - o_blocks_count, n_blocks_count); + ext4_msg(sb, KERN_DEBUG, + "extending last group from %llu to %llu blocks", + o_blocks_count, n_blocks_count); if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) return 0; if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { - printk(KERN_ERR "EXT4-fs: filesystem on %s:" - " too large to resize to %llu blocks safely\n", - sb->s_id, n_blocks_count); + ext4_msg(sb, KERN_ERR, + "filesystem too large to resize to %llu blocks safely", + n_blocks_count); if (sizeof(sector_t) < 8) ext4_warning(sb, "CONFIG_LBDAF not enabled"); return -EINVAL; @@ -1582,7 +1587,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) ext4_fsblk_t o_blocks_count; ext4_group_t o_group; ext4_group_t n_group; - ext4_grpblk_t offset; + ext4_grpblk_t offset, add; unsigned long n_desc_blocks; unsigned long o_desc_blocks; unsigned long desc_blocks; @@ -1591,8 +1596,8 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) o_blocks_count = ext4_blocks_count(es); if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu " - "upto %llu blocks\n", o_blocks_count, n_blocks_count); + ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu " + "to %llu blocks", o_blocks_count, n_blocks_count); if (n_blocks_count < o_blocks_count) { /* On-line shrinking not supported */ @@ -1605,7 +1610,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) return 0; ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); - ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset); + ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / EXT4_DESC_PER_BLOCK(sb); @@ -1634,10 +1639,12 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) } brelse(bh); - if (offset != 0) { - /* extend the last group */ - ext4_grpblk_t add; - add = EXT4_BLOCKS_PER_GROUP(sb) - offset; + /* extend the last group */ + if (n_group == o_group) + add = n_blocks_count - o_blocks_count; + else + add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1); + if (add > 0) { err = ext4_group_extend_no_check(sb, o_blocks_count, add); if (err) goto out; @@ -1674,7 +1681,7 @@ out: iput(resize_inode); if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu " - "upto %llu blocks\n", o_blocks_count, n_blocks_count); + ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu " + "upto %llu blocks", o_blocks_count, n_blocks_count); return err; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 933900909ed..ceebaf853be 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -62,6 +62,7 @@ static struct ext4_features *ext4_feat; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); +static int ext4_show_options(struct seq_file *seq, struct dentry *root); static int ext4_commit_super(struct super_block *sb, int sync); static void ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); @@ -375,7 +376,7 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line, if (is_handle_aborted(handle)) return; - printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n", + printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", caller, line, errstr, err_fn); jbd2_journal_abort_handle(handle); @@ -431,6 +432,22 @@ static int block_device_ejected(struct super_block *sb) return bdi->dev == NULL; } +static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int error = is_journal_aborted(journal); + struct ext4_journal_cb_entry *jce, *tmp; + + spin_lock(&sbi->s_md_lock); + list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { + list_del_init(&jce->jce_list); + spin_unlock(&sbi->s_md_lock); + jce->jce_func(sb, jce, error); + spin_lock(&sbi->s_md_lock); + } + spin_unlock(&sbi->s_md_lock); +} /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. @@ -498,11 +515,16 @@ void ext4_error_inode(struct inode *inode, const char *function, va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", - inode->i_sb->s_id, function, line, inode->i_ino); if (block) - printk(KERN_CONT "block %llu: ", block); - printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " + "inode #%lu: block %llu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + block, current->comm, &vaf); + else + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " + "inode #%lu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, &vaf); va_end(args); ext4_handle_error(inode->i_sb); @@ -524,15 +546,21 @@ void ext4_error_file(struct file *file, const char *function, path = d_path(&(file->f_path), pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; - printk(KERN_CRIT - "EXT4-fs error (device %s): %s:%d: inode #%lu: ", - inode->i_sb->s_id, function, line, inode->i_ino); - if (block) - printk(KERN_CONT "block %llu: ", block); va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf); + if (block) + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "block %llu: comm %s: path %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + block, current->comm, path, &vaf); + else + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "comm %s: path %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, path, &vaf); va_end(args); ext4_handle_error(inode->i_sb); @@ -808,9 +836,6 @@ static void ext4_put_super(struct super_block *sb) destroy_workqueue(sbi->dio_unwritten_wq); lock_super(sb); - if (sb->s_dirt) - ext4_commit_super(sb, 1); - if (sbi->s_journal) { err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -827,9 +852,12 @@ static void ext4_put_super(struct super_block *sb) if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); - ext4_commit_super(sb, 1); } + if (sb->s_dirt || !(sb->s_flags & MS_RDONLY)) + ext4_commit_super(sb, 1); + if (sbi->s_proc) { + remove_proc_entry("options", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } kobject_del(&sbi->s_kobj); @@ -990,180 +1018,6 @@ void ext4_clear_inode(struct inode *inode) } } -static inline void ext4_show_quota_options(struct seq_file *seq, - struct super_block *sb) -{ -#if defined(CONFIG_QUOTA) - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_jquota_fmt) { - char *fmtname = ""; - - switch (sbi->s_jquota_fmt) { - case QFMT_VFS_OLD: - fmtname = "vfsold"; - break; - case QFMT_VFS_V0: - fmtname = "vfsv0"; - break; - case QFMT_VFS_V1: - fmtname = "vfsv1"; - break; - } - seq_printf(seq, ",jqfmt=%s", fmtname); - } - - if (sbi->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); - - if (sbi->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - - if (test_opt(sb, USRQUOTA)) - seq_puts(seq, ",usrquota"); - - if (test_opt(sb, GRPQUOTA)) - seq_puts(seq, ",grpquota"); -#endif -} - -/* - * Show an option if - * - it's set to a non-default value OR - * - if the per-sb default is different from the global default - */ -static int ext4_show_options(struct seq_file *seq, struct dentry *root) -{ - int def_errors; - unsigned long def_mount_opts; - struct super_block *sb = root->d_sb; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - - def_mount_opts = le32_to_cpu(es->s_default_mount_opts); - def_errors = le16_to_cpu(es->s_errors); - - if (sbi->s_sb_block != 1) - seq_printf(seq, ",sb=%llu", sbi->s_sb_block); - if (test_opt(sb, MINIX_DF)) - seq_puts(seq, ",minixdf"); - if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS)) - seq_puts(seq, ",grpid"); - if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS)) - seq_puts(seq, ",nogrpid"); - if (sbi->s_resuid != EXT4_DEF_RESUID || - le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) { - seq_printf(seq, ",resuid=%u", sbi->s_resuid); - } - if (sbi->s_resgid != EXT4_DEF_RESGID || - le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) { - seq_printf(seq, ",resgid=%u", sbi->s_resgid); - } - if (test_opt(sb, ERRORS_RO)) { - if (def_errors == EXT4_ERRORS_PANIC || - def_errors == EXT4_ERRORS_CONTINUE) { - seq_puts(seq, ",errors=remount-ro"); - } - } - if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) - seq_puts(seq, ",errors=continue"); - if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) - seq_puts(seq, ",errors=panic"); - if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16)) - seq_puts(seq, ",nouid32"); - if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG)) - seq_puts(seq, ",debug"); -#ifdef CONFIG_EXT4_FS_XATTR - if (test_opt(sb, XATTR_USER)) - seq_puts(seq, ",user_xattr"); - if (!test_opt(sb, XATTR_USER)) - seq_puts(seq, ",nouser_xattr"); -#endif -#ifdef CONFIG_EXT4_FS_POSIX_ACL - if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) - seq_puts(seq, ",acl"); - if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) - seq_puts(seq, ",noacl"); -#endif - if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { - seq_printf(seq, ",commit=%u", - (unsigned) (sbi->s_commit_interval / HZ)); - } - if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) { - seq_printf(seq, ",min_batch_time=%u", - (unsigned) sbi->s_min_batch_time); - } - if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { - seq_printf(seq, ",max_batch_time=%u", - (unsigned) sbi->s_max_batch_time); - } - - /* - * We're changing the default of barrier mount option, so - * let's always display its mount state so it's clear what its - * status is. - */ - seq_puts(seq, ",barrier="); - seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) - seq_puts(seq, ",journal_async_commit"); - else if (test_opt(sb, JOURNAL_CHECKSUM)) - seq_puts(seq, ",journal_checksum"); - if (test_opt(sb, I_VERSION)) - seq_puts(seq, ",i_version"); - if (!test_opt(sb, DELALLOC) && - !(def_mount_opts & EXT4_DEFM_NODELALLOC)) - seq_puts(seq, ",nodelalloc"); - - if (!test_opt(sb, MBLK_IO_SUBMIT)) - seq_puts(seq, ",nomblk_io_submit"); - if (sbi->s_stripe) - seq_printf(seq, ",stripe=%lu", sbi->s_stripe); - /* - * journal mode get enabled in different ways - * So just print the value even if we didn't specify it - */ - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - seq_puts(seq, ",data=journal"); - else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - seq_puts(seq, ",data=ordered"); - else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) - seq_puts(seq, ",data=writeback"); - - if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) - seq_printf(seq, ",inode_readahead_blks=%u", - sbi->s_inode_readahead_blks); - - if (test_opt(sb, DATA_ERR_ABORT)) - seq_puts(seq, ",data_err=abort"); - - if (test_opt(sb, NO_AUTO_DA_ALLOC)) - seq_puts(seq, ",noauto_da_alloc"); - - if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD)) - seq_puts(seq, ",discard"); - - if (test_opt(sb, NOLOAD)) - seq_puts(seq, ",norecovery"); - - if (test_opt(sb, DIOREAD_NOLOCK)) - seq_puts(seq, ",dioread_nolock"); - - if (test_opt(sb, BLOCK_VALIDITY) && - !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) - seq_puts(seq, ",block_validity"); - - if (!test_opt(sb, INIT_INODE_TABLE)) - seq_puts(seq, ",noinit_itable"); - else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) - seq_printf(seq, ",init_itable=%u", - (unsigned) sbi->s_li_wait_mult); - - ext4_show_quota_options(seq, sb); - - return 0; -} - static struct inode *ext4_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { @@ -1316,18 +1170,17 @@ static const struct export_operations ext4_export_ops = { enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_nouid32, Opt_debug, Opt_removed, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, + Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, - Opt_journal_update, Opt_journal_dev, - Opt_journal_checksum, Opt_journal_async_commit, + Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, - Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, + Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, @@ -1350,20 +1203,19 @@ static const match_table_t tokens = { {Opt_err_ro, "errors=remount-ro"}, {Opt_nouid32, "nouid32"}, {Opt_debug, "debug"}, - {Opt_oldalloc, "oldalloc"}, - {Opt_orlov, "orlov"}, + {Opt_removed, "oldalloc"}, + {Opt_removed, "orlov"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, - {Opt_noload, "noload"}, {Opt_noload, "norecovery"}, - {Opt_nobh, "nobh"}, - {Opt_bh, "bh"}, + {Opt_noload, "noload"}, + {Opt_removed, "nobh"}, + {Opt_removed, "bh"}, {Opt_commit, "commit=%u"}, {Opt_min_batch_time, "min_batch_time=%u"}, {Opt_max_batch_time, "max_batch_time=%u"}, - {Opt_journal_update, "journal=update"}, {Opt_journal_dev, "journal_dev=%u"}, {Opt_journal_checksum, "journal_checksum"}, {Opt_journal_async_commit, "journal_async_commit"}, @@ -1389,7 +1241,6 @@ static const match_table_t tokens = { {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, {Opt_stripe, "stripe=%u"}, - {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, {Opt_mblk_io_submit, "mblk_io_submit"}, @@ -1408,6 +1259,11 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable=%u"}, {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, + {Opt_removed, "check=none"}, /* mount option from ext2/3 */ + {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ + {Opt_removed, "reservation"}, /* mount option from ext2/3 */ + {Opt_removed, "noreservation"}, /* mount option from ext2/3 */ + {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */ {Opt_err, NULL}, }; @@ -1496,420 +1352,273 @@ static int clear_qf_name(struct super_block *sb, int qtype) } #endif -static int parse_options(char *options, struct super_block *sb, - unsigned long *journal_devnum, - unsigned int *journal_ioprio, - ext4_fsblk_t *n_blocks_count, int is_remount) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - char *p; - substring_t args[MAX_OPT_ARGS]; - int data_opt = 0; - int option; +#define MOPT_SET 0x0001 +#define MOPT_CLEAR 0x0002 +#define MOPT_NOSUPPORT 0x0004 +#define MOPT_EXPLICIT 0x0008 +#define MOPT_CLEAR_ERR 0x0010 +#define MOPT_GTE0 0x0020 #ifdef CONFIG_QUOTA - int qfmt; +#define MOPT_Q 0 +#define MOPT_QFMT 0x0040 +#else +#define MOPT_Q MOPT_NOSUPPORT +#define MOPT_QFMT MOPT_NOSUPPORT #endif - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; - - /* - * Initialize args struct so we know whether arg was - * found; some options take optional arguments. - */ - args[0].to = args[0].from = NULL; - token = match_token(p, tokens, args); - switch (token) { - case Opt_bsd_df: - ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - clear_opt(sb, MINIX_DF); - break; - case Opt_minix_df: - ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - set_opt(sb, MINIX_DF); - - break; - case Opt_grpid: - ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - set_opt(sb, GRPID); - - break; - case Opt_nogrpid: - ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); - clear_opt(sb, GRPID); - - break; - case Opt_resuid: - if (match_int(&args[0], &option)) - return 0; - sbi->s_resuid = option; - break; - case Opt_resgid: - if (match_int(&args[0], &option)) - return 0; - sbi->s_resgid = option; - break; - case Opt_sb: - /* handled by get_sb_block() instead of here */ - /* *sb_block = match_int(&args[0]); */ - break; - case Opt_err_panic: - clear_opt(sb, ERRORS_CONT); - clear_opt(sb, ERRORS_RO); - set_opt(sb, ERRORS_PANIC); - break; - case Opt_err_ro: - clear_opt(sb, ERRORS_CONT); - clear_opt(sb, ERRORS_PANIC); - set_opt(sb, ERRORS_RO); - break; - case Opt_err_cont: - clear_opt(sb, ERRORS_RO); - clear_opt(sb, ERRORS_PANIC); - set_opt(sb, ERRORS_CONT); - break; - case Opt_nouid32: - set_opt(sb, NO_UID32); - break; - case Opt_debug: - set_opt(sb, DEBUG); - break; - case Opt_oldalloc: - ext4_msg(sb, KERN_WARNING, - "Ignoring deprecated oldalloc option"); - break; - case Opt_orlov: - ext4_msg(sb, KERN_WARNING, - "Ignoring deprecated orlov option"); - break; +#define MOPT_DATAJ 0x0080 + +static const struct mount_opts { + int token; + int mount_opt; + int flags; +} ext4_mount_opts[] = { + {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, + {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, + {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, + {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, + {Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET}, + {Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR}, + {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, + {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, + {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET}, + {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR}, + {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, + {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, + {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT}, + {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT}, + {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET}, + {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | + EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET}, + {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET}, + {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET}, + {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR}, + {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, + {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, + {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, + {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, + {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, + {Opt_commit, 0, MOPT_GTE0}, + {Opt_max_batch_time, 0, MOPT_GTE0}, + {Opt_min_batch_time, 0, MOPT_GTE0}, + {Opt_inode_readahead_blks, 0, MOPT_GTE0}, + {Opt_init_itable, 0, MOPT_GTE0}, + {Opt_stripe, 0, MOPT_GTE0}, + {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, + {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, + {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ}, #ifdef CONFIG_EXT4_FS_XATTR - case Opt_user_xattr: - set_opt(sb, XATTR_USER); - break; - case Opt_nouser_xattr: - clear_opt(sb, XATTR_USER); - break; + {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, + {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, #else - case Opt_user_xattr: - case Opt_nouser_xattr: - ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported"); - break; + {Opt_user_xattr, 0, MOPT_NOSUPPORT}, + {Opt_nouser_xattr, 0, MOPT_NOSUPPORT}, #endif #ifdef CONFIG_EXT4_FS_POSIX_ACL - case Opt_acl: - set_opt(sb, POSIX_ACL); - break; - case Opt_noacl: - clear_opt(sb, POSIX_ACL); - break; + {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, + {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, #else - case Opt_acl: - case Opt_noacl: - ext4_msg(sb, KERN_ERR, "(no)acl options not supported"); - break; + {Opt_acl, 0, MOPT_NOSUPPORT}, + {Opt_noacl, 0, MOPT_NOSUPPORT}, #endif - case Opt_journal_update: - /* @@@ FIXME */ - /* Eventually we will want to be able to create - a journal file here. For now, only allow the - user to specify an existing inode to be the - journal file. */ - if (is_remount) { - ext4_msg(sb, KERN_ERR, - "Cannot specify journal on remount"); - return 0; - } - set_opt(sb, UPDATE_JOURNAL); - break; - case Opt_journal_dev: - if (is_remount) { + {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, + {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, + {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, + {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, + MOPT_SET | MOPT_Q}, + {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, + MOPT_SET | MOPT_Q}, + {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | + EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q}, + {Opt_usrjquota, 0, MOPT_Q}, + {Opt_grpjquota, 0, MOPT_Q}, + {Opt_offusrjquota, 0, MOPT_Q}, + {Opt_offgrpjquota, 0, MOPT_Q}, + {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, + {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, + {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, + {Opt_err, 0, 0} +}; + +static int handle_mount_opt(struct super_block *sb, char *opt, int token, + substring_t *args, unsigned long *journal_devnum, + unsigned int *journal_ioprio, int is_remount) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + const struct mount_opts *m; + int arg = 0; + + if (args->from && match_int(args, &arg)) + return -1; + switch (token) { + case Opt_noacl: + case Opt_nouser_xattr: + ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5"); + break; + case Opt_sb: + return 1; /* handled by get_sb_block() */ + case Opt_removed: + ext4_msg(sb, KERN_WARNING, + "Ignoring removed %s option", opt); + return 1; + case Opt_resuid: + sbi->s_resuid = arg; + return 1; + case Opt_resgid: + sbi->s_resgid = arg; + return 1; + case Opt_abort: + sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; + return 1; + case Opt_i_version: + sb->s_flags |= MS_I_VERSION; + return 1; + case Opt_journal_dev: + if (is_remount) { + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); + return -1; + } + *journal_devnum = arg; + return 1; + case Opt_journal_ioprio: + if (arg < 0 || arg > 7) + return -1; + *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); + return 1; + } + + for (m = ext4_mount_opts; m->token != Opt_err; m++) { + if (token != m->token) + continue; + if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) + return -1; + if (m->flags & MOPT_EXPLICIT) + set_opt2(sb, EXPLICIT_DELALLOC); + if (m->flags & MOPT_CLEAR_ERR) + clear_opt(sb, ERRORS_MASK); + if (token == Opt_noquota && sb_any_quota_loaded(sb)) { + ext4_msg(sb, KERN_ERR, "Cannot change quota " + "options when quota turned on"); + return -1; + } + + if (m->flags & MOPT_NOSUPPORT) { + ext4_msg(sb, KERN_ERR, "%s option not supported", opt); + } else if (token == Opt_commit) { + if (arg == 0) + arg = JBD2_DEFAULT_MAX_COMMIT_AGE; + sbi->s_commit_interval = HZ * arg; + } else if (token == Opt_max_batch_time) { + if (arg == 0) + arg = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_max_batch_time = arg; + } else if (token == Opt_min_batch_time) { + sbi->s_min_batch_time = arg; + } else if (token == Opt_inode_readahead_blks) { + if (arg > (1 << 30)) + return -1; + if (arg && !is_power_of_2(arg)) { ext4_msg(sb, KERN_ERR, - "Cannot specify journal on remount"); - return 0; + "EXT4-fs: inode_readahead_blks" + " must be a power of 2"); + return -1; } - if (match_int(&args[0], &option)) - return 0; - *journal_devnum = option; - break; - case Opt_journal_checksum: - set_opt(sb, JOURNAL_CHECKSUM); - break; - case Opt_journal_async_commit: - set_opt(sb, JOURNAL_ASYNC_COMMIT); - set_opt(sb, JOURNAL_CHECKSUM); - break; - case Opt_noload: - set_opt(sb, NOLOAD); - break; - case Opt_commit: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - if (option == 0) - option = JBD2_DEFAULT_MAX_COMMIT_AGE; - sbi->s_commit_interval = HZ * option; - break; - case Opt_max_batch_time: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - if (option == 0) - option = EXT4_DEF_MAX_BATCH_TIME; - sbi->s_max_batch_time = option; - break; - case Opt_min_batch_time: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - sbi->s_min_batch_time = option; - break; - case Opt_data_journal: - data_opt = EXT4_MOUNT_JOURNAL_DATA; - goto datacheck; - case Opt_data_ordered: - data_opt = EXT4_MOUNT_ORDERED_DATA; - goto datacheck; - case Opt_data_writeback: - data_opt = EXT4_MOUNT_WRITEBACK_DATA; - datacheck: + sbi->s_inode_readahead_blks = arg; + } else if (token == Opt_init_itable) { + set_opt(sb, INIT_INODE_TABLE); + if (!args->from) + arg = EXT4_DEF_LI_WAIT_MULT; + sbi->s_li_wait_mult = arg; + } else if (token == Opt_stripe) { + sbi->s_stripe = arg; + } else if (m->flags & MOPT_DATAJ) { if (is_remount) { if (!sbi->s_journal) ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); - else if (test_opt(sb, DATA_FLAGS) != data_opt) { + else if (test_opt(sb, DATA_FLAGS) != + m->mount_opt) { ext4_msg(sb, KERN_ERR, - "Cannot change data mode on remount"); - return 0; + "Cannot change data mode on remount"); + return -1; } } else { clear_opt(sb, DATA_FLAGS); - sbi->s_mount_opt |= data_opt; + sbi->s_mount_opt |= m->mount_opt; } - break; - case Opt_data_err_abort: - set_opt(sb, DATA_ERR_ABORT); - break; - case Opt_data_err_ignore: - clear_opt(sb, DATA_ERR_ABORT); - break; #ifdef CONFIG_QUOTA - case Opt_usrjquota: + } else if (token == Opt_usrjquota) { if (!set_qf_name(sb, USRQUOTA, &args[0])) - return 0; - break; - case Opt_grpjquota: + return -1; + } else if (token == Opt_grpjquota) { if (!set_qf_name(sb, GRPQUOTA, &args[0])) - return 0; - break; - case Opt_offusrjquota: + return -1; + } else if (token == Opt_offusrjquota) { if (!clear_qf_name(sb, USRQUOTA)) - return 0; - break; - case Opt_offgrpjquota: + return -1; + } else if (token == Opt_offgrpjquota) { if (!clear_qf_name(sb, GRPQUOTA)) - return 0; - break; - - case Opt_jqfmt_vfsold: - qfmt = QFMT_VFS_OLD; - goto set_qf_format; - case Opt_jqfmt_vfsv0: - qfmt = QFMT_VFS_V0; - goto set_qf_format; - case Opt_jqfmt_vfsv1: - qfmt = QFMT_VFS_V1; -set_qf_format: + return -1; + } else if (m->flags & MOPT_QFMT) { if (sb_any_quota_loaded(sb) && - sbi->s_jquota_fmt != qfmt) { - ext4_msg(sb, KERN_ERR, "Cannot change " - "journaled quota options when " - "quota turned on"); - return 0; - } - sbi->s_jquota_fmt = qfmt; - break; - case Opt_quota: - case Opt_usrquota: - set_opt(sb, QUOTA); - set_opt(sb, USRQUOTA); - break; - case Opt_grpquota: - set_opt(sb, QUOTA); - set_opt(sb, GRPQUOTA); - break; - case Opt_noquota: - if (sb_any_quota_loaded(sb)) { - ext4_msg(sb, KERN_ERR, "Cannot change quota " - "options when quota turned on"); - return 0; + sbi->s_jquota_fmt != m->mount_opt) { + ext4_msg(sb, KERN_ERR, "Cannot " + "change journaled quota options " + "when quota turned on"); + return -1; } - clear_opt(sb, QUOTA); - clear_opt(sb, USRQUOTA); - clear_opt(sb, GRPQUOTA); - break; -#else - case Opt_quota: - case Opt_usrquota: - case Opt_grpquota: - ext4_msg(sb, KERN_ERR, - "quota options not supported"); - break; - case Opt_usrjquota: - case Opt_grpjquota: - case Opt_offusrjquota: - case Opt_offgrpjquota: - case Opt_jqfmt_vfsold: - case Opt_jqfmt_vfsv0: - case Opt_jqfmt_vfsv1: - ext4_msg(sb, KERN_ERR, - "journaled quota options not supported"); - break; - case Opt_noquota: - break; + sbi->s_jquota_fmt = m->mount_opt; #endif - case Opt_abort: - sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; - break; - case Opt_nobarrier: - clear_opt(sb, BARRIER); - break; - case Opt_barrier: - if (args[0].from) { - if (match_int(&args[0], &option)) - return 0; - } else - option = 1; /* No argument, default to 1 */ - if (option) - set_opt(sb, BARRIER); - else - clear_opt(sb, BARRIER); - break; - case Opt_ignore: - break; - case Opt_resize: - if (!is_remount) { - ext4_msg(sb, KERN_ERR, - "resize option only available " - "for remount"); - return 0; - } - if (match_int(&args[0], &option) != 0) - return 0; - *n_blocks_count = option; - break; - case Opt_nobh: - ext4_msg(sb, KERN_WARNING, - "Ignoring deprecated nobh option"); - break; - case Opt_bh: - ext4_msg(sb, KERN_WARNING, - "Ignoring deprecated bh option"); - break; - case Opt_i_version: - set_opt(sb, I_VERSION); - sb->s_flags |= MS_I_VERSION; - break; - case Opt_nodelalloc: - clear_opt(sb, DELALLOC); - clear_opt2(sb, EXPLICIT_DELALLOC); - break; - case Opt_mblk_io_submit: - set_opt(sb, MBLK_IO_SUBMIT); - break; - case Opt_nomblk_io_submit: - clear_opt(sb, MBLK_IO_SUBMIT); - break; - case Opt_stripe: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - sbi->s_stripe = option; - break; - case Opt_delalloc: - set_opt(sb, DELALLOC); - set_opt2(sb, EXPLICIT_DELALLOC); - break; - case Opt_block_validity: - set_opt(sb, BLOCK_VALIDITY); - break; - case Opt_noblock_validity: - clear_opt(sb, BLOCK_VALIDITY); - break; - case Opt_inode_readahead_blks: - if (match_int(&args[0], &option)) - return 0; - if (option < 0 || option > (1 << 30)) - return 0; - if (option && !is_power_of_2(option)) { - ext4_msg(sb, KERN_ERR, - "EXT4-fs: inode_readahead_blks" - " must be a power of 2"); - return 0; + } else { + if (!args->from) + arg = 1; + if (m->flags & MOPT_CLEAR) + arg = !arg; + else if (unlikely(!(m->flags & MOPT_SET))) { + ext4_msg(sb, KERN_WARNING, + "buggy handling of option %s", opt); + WARN_ON(1); + return -1; } - sbi->s_inode_readahead_blks = option; - break; - case Opt_journal_ioprio: - if (match_int(&args[0], &option)) - return 0; - if (option < 0 || option > 7) - break; - *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, - option); - break; - case Opt_noauto_da_alloc: - set_opt(sb, NO_AUTO_DA_ALLOC); - break; - case Opt_auto_da_alloc: - if (args[0].from) { - if (match_int(&args[0], &option)) - return 0; - } else - option = 1; /* No argument, default to 1 */ - if (option) - clear_opt(sb, NO_AUTO_DA_ALLOC); + if (arg != 0) + sbi->s_mount_opt |= m->mount_opt; else - set_opt(sb,NO_AUTO_DA_ALLOC); - break; - case Opt_discard: - set_opt(sb, DISCARD); - break; - case Opt_nodiscard: - clear_opt(sb, DISCARD); - break; - case Opt_dioread_nolock: - set_opt(sb, DIOREAD_NOLOCK); - break; - case Opt_dioread_lock: - clear_opt(sb, DIOREAD_NOLOCK); - break; - case Opt_init_itable: - set_opt(sb, INIT_INODE_TABLE); - if (args[0].from) { - if (match_int(&args[0], &option)) - return 0; - } else - option = EXT4_DEF_LI_WAIT_MULT; - if (option < 0) - return 0; - sbi->s_li_wait_mult = option; - break; - case Opt_noinit_itable: - clear_opt(sb, INIT_INODE_TABLE); - break; - default: - ext4_msg(sb, KERN_ERR, - "Unrecognized mount option \"%s\" " - "or missing value", p); - return 0; + sbi->s_mount_opt &= ~m->mount_opt; } + return 1; + } + ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " + "or missing value", opt); + return -1; +} + +static int parse_options(char *options, struct super_block *sb, + unsigned long *journal_devnum, + unsigned int *journal_ioprio, + int is_remount) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + char *p; + substring_t args[MAX_OPT_ARGS]; + int token; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + if (!*p) + continue; + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = 0; + token = match_token(p, tokens, args); + if (handle_mount_opt(sb, p, token, args, journal_devnum, + journal_ioprio, is_remount) < 0) + return 0; } #ifdef CONFIG_QUOTA if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { @@ -1942,6 +1651,160 @@ set_qf_format: return 1; } +static inline void ext4_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#if defined(CONFIG_QUOTA) + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); + + if (test_opt(sb, USRQUOTA)) + seq_puts(seq, ",usrquota"); + + if (test_opt(sb, GRPQUOTA)) + seq_puts(seq, ",grpquota"); +#endif +} + +static const char *token2str(int token) +{ + static const struct match_token *t; + + for (t = tokens; t->token != Opt_err; t++) + if (t->token == token && !strchr(t->pattern, '=')) + break; + return t->pattern; +} + +/* + * Show an option if + * - it's set to a non-default value OR + * - if the per-sb default is different from the global default + */ +static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, + int nodefs) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt; + const struct mount_opts *m; + char sep = nodefs ? '\n' : ','; + +#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) +#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) + + if (sbi->s_sb_block != 1) + SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); + + for (m = ext4_mount_opts; m->token != Opt_err; m++) { + int want_set = m->flags & MOPT_SET; + if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || + (m->flags & MOPT_CLEAR_ERR)) + continue; + if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) + continue; /* skip if same as the default */ + if ((want_set && + (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || + (!want_set && (sbi->s_mount_opt & m->mount_opt))) + continue; /* select Opt_noFoo vs Opt_Foo */ + SEQ_OPTS_PRINT("%s", token2str(m->token)); + } + + if (nodefs || sbi->s_resuid != EXT4_DEF_RESUID || + le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) + SEQ_OPTS_PRINT("resuid=%u", sbi->s_resuid); + if (nodefs || sbi->s_resgid != EXT4_DEF_RESGID || + le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) + SEQ_OPTS_PRINT("resgid=%u", sbi->s_resgid); + def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); + if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) + SEQ_OPTS_PUTS("errors=remount-ro"); + if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) + SEQ_OPTS_PUTS("errors=continue"); + if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) + SEQ_OPTS_PUTS("errors=panic"); + if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) + SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); + if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) + SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); + if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) + SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); + if (sb->s_flags & MS_I_VERSION) + SEQ_OPTS_PUTS("i_version"); + if (nodefs || sbi->s_stripe) + SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); + if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + SEQ_OPTS_PUTS("data=journal"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + SEQ_OPTS_PUTS("data=ordered"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + SEQ_OPTS_PUTS("data=writeback"); + } + if (nodefs || + sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) + SEQ_OPTS_PRINT("inode_readahead_blks=%u", + sbi->s_inode_readahead_blks); + + if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && + (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) + SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); + + ext4_show_quota_options(seq, sb); + return 0; +} + +static int ext4_show_options(struct seq_file *seq, struct dentry *root) +{ + return _ext4_show_options(seq, root->d_sb, 0); +} + +static int options_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + int rc; + + seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw"); + rc = _ext4_show_options(seq, sb, 1); + seq_puts(seq, "\n"); + return rc; +} + +static int options_open_fs(struct inode *inode, struct file *file) +{ + return single_open(file, options_seq_show, PDE(inode)->data); +} + +static const struct file_operations ext4_seq_options_fops = { + .owner = THIS_MODULE, + .open = options_open_fs, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { @@ -2945,7 +2808,7 @@ static int ext4_run_lazyinit_thread(void) ext4_clear_request_list(); kfree(ext4_li_info); ext4_li_info = NULL; - printk(KERN_CRIT "EXT4: error %d creating inode table " + printk(KERN_CRIT "EXT4-fs: error %d creating inode table " "initialization thread\n", err); return err; @@ -3183,11 +3046,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sb, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) set_opt(sb, DEBUG); - if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { - ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", - "2.6.38"); + if (def_mount_opts & EXT4_DEFM_BSDGROUPS) set_opt(sb, GRPID); - } if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ @@ -3240,13 +3100,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, - &journal_devnum, &journal_ioprio, NULL, 0)) { + &journal_devnum, &journal_ioprio, 0)) { ext4_msg(sb, KERN_WARNING, "failed to parse options in superblock: %s", sbi->s_es->s_mount_opts); } + sbi->s_def_mount_opt = sbi->s_mount_opt; if (!parse_options((char *) data, sb, &journal_devnum, - &journal_ioprio, NULL, 0)) + &journal_ioprio, 0)) goto failed_mount; if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { @@ -3416,7 +3277,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #else es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif - sb->s_dirt = 1; } /* Handle clustersize */ @@ -3540,6 +3400,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); + if (sbi->s_proc) + proc_create_data("options", S_IRUGO, sbi->s_proc, + &ext4_seq_options_fops, sb); + bgl_lock_init(sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { @@ -3694,6 +3558,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; + /* * The journal may have updated the bg summary counts, so we * need to update the global counters. @@ -3861,6 +3727,7 @@ failed_mount2: ext4_kvfree(sbi->s_group_desc); failed_mount: if (sbi->s_proc) { + remove_proc_entry("options", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); } #ifdef CONFIG_QUOTA @@ -4090,15 +3957,6 @@ static int ext4_load_journal(struct super_block *sb, if (!(journal->j_flags & JBD2_BARRIER)) ext4_msg(sb, KERN_INFO, "barriers disabled"); - if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { - err = jbd2_journal_update_format(journal); - if (err) { - ext4_msg(sb, KERN_ERR, "error updating journal"); - jbd2_journal_destroy(journal); - return err; - } - } - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) err = jbd2_journal_wipe(journal, !really_read_only); if (!err) { @@ -4385,7 +4243,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) { struct ext4_super_block *es; struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_fsblk_t n_blocks_count = 0; unsigned long old_sb_flags; struct ext4_mount_options old_opts; int enable_quota = 0; @@ -4418,8 +4275,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) /* * Allow the "check" option to be passed as a remount option. */ - if (!parse_options(data, sb, NULL, &journal_ioprio, - &n_blocks_count, 1)) { + if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { err = -EINVAL; goto restore_opts; } @@ -4437,8 +4293,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); } - if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || - n_blocks_count > ext4_blocks_count(es)) { + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; goto restore_opts; @@ -4513,8 +4368,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal) ext4_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); - if ((err = ext4_group_extend(sb, es, n_blocks_count))) - goto restore_opts; if (!ext4_setup_super(sb, es, 0)) sb->s_flags &= ~MS_RDONLY; if (EXT4_HAS_INCOMPAT_FEATURE(sb, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 93a00d89a22..e88748e55c0 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -82,8 +82,8 @@ printk("\n"); \ } while (0) #else -# define ea_idebug(f...) -# define ea_bdebug(f...) +# define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif static void ext4_xattr_cache_insert(struct buffer_head *); @@ -158,13 +158,10 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) static inline int ext4_xattr_check_block(struct buffer_head *bh) { - int error; - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) return -EIO; - error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); - return error; + return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); } static inline int @@ -220,7 +217,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, error = -ENODATA; if (!EXT4_I(inode)->i_file_acl) goto cleanup; - ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); + ea_idebug(inode, "reading block %llu", + (unsigned long long)EXT4_I(inode)->i_file_acl); bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); if (!bh) goto cleanup; @@ -363,7 +361,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) error = 0; if (!EXT4_I(inode)->i_file_acl) goto cleanup; - ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); + ea_idebug(inode, "reading block %llu", + (unsigned long long)EXT4_I(inode)->i_file_acl); bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); error = -EIO; if (!bh) @@ -487,18 +486,19 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + unlock_buffer(bh); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); + if (ce) + mb_cache_entry_release(ce); + unlock_buffer(bh); error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); - if (ce) - mb_cache_entry_release(ce); } - unlock_buffer(bh); out: ext4_std_error(inode->i_sb, error); return; @@ -834,7 +834,8 @@ inserted: if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); - ea_idebug(inode, "creating block %d", block); + ea_idebug(inode, "creating block %llu", + (unsigned long long)block); new_bh = sb_getblk(sb, block); if (!new_bh) { diff --git a/fs/fcntl.c b/fs/fcntl.c index 22764c7c838..75e7c1f3a08 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -32,20 +32,20 @@ void set_close_on_exec(unsigned int fd, int flag) spin_lock(&files->file_lock); fdt = files_fdtable(files); if (flag) - FD_SET(fd, fdt->close_on_exec); + __set_close_on_exec(fd, fdt); else - FD_CLR(fd, fdt->close_on_exec); + __clear_close_on_exec(fd, fdt); spin_unlock(&files->file_lock); } -static int get_close_on_exec(unsigned int fd) +static bool get_close_on_exec(unsigned int fd) { struct files_struct *files = current->files; struct fdtable *fdt; - int res; + bool res; rcu_read_lock(); fdt = files_fdtable(files); - res = FD_ISSET(fd, fdt->close_on_exec); + res = close_on_exec(fd, fdt); rcu_read_unlock(); return res; } @@ -90,15 +90,15 @@ SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) err = -EBUSY; fdt = files_fdtable(files); tofree = fdt->fd[newfd]; - if (!tofree && FD_ISSET(newfd, fdt->open_fds)) + if (!tofree && fd_is_open(newfd, fdt)) goto out_unlock; get_file(file); rcu_assign_pointer(fdt->fd[newfd], file); - FD_SET(newfd, fdt->open_fds); + __set_open_fd(newfd, fdt); if (flags & O_CLOEXEC) - FD_SET(newfd, fdt->close_on_exec); + __set_close_on_exec(newfd, fdt); else - FD_CLR(newfd, fdt->close_on_exec); + __clear_close_on_exec(newfd, fdt); spin_unlock(&files->file_lock); if (tofree) diff --git a/fs/file.c b/fs/file.c index 3c426de7203..ba3f6053025 100644 --- a/fs/file.c +++ b/fs/file.c @@ -40,7 +40,7 @@ int sysctl_nr_open_max = 1024 * 1024; /* raised later */ */ static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); -static void *alloc_fdmem(unsigned int size) +static void *alloc_fdmem(size_t size) { /* * Very large allocations can stress page reclaim, so fall back to @@ -142,7 +142,7 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) static struct fdtable * alloc_fdtable(unsigned int nr) { struct fdtable *fdt; - char *data; + void *data; /* * Figure out how many fds we actually want to support in this fdtable. @@ -172,14 +172,15 @@ static struct fdtable * alloc_fdtable(unsigned int nr) data = alloc_fdmem(nr * sizeof(struct file *)); if (!data) goto out_fdt; - fdt->fd = (struct file **)data; - data = alloc_fdmem(max_t(unsigned int, + fdt->fd = data; + + data = alloc_fdmem(max_t(size_t, 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES)); if (!data) goto out_arr; - fdt->open_fds = (fd_set *)data; + fdt->open_fds = data; data += nr / BITS_PER_BYTE; - fdt->close_on_exec = (fd_set *)data; + fdt->close_on_exec = data; fdt->next = NULL; return fdt; @@ -275,11 +276,11 @@ static int count_open_files(struct fdtable *fdt) int i; /* Find the last open fd */ - for (i = size/(8*sizeof(long)); i > 0; ) { - if (fdt->open_fds->fds_bits[--i]) + for (i = size / BITS_PER_LONG; i > 0; ) { + if (fdt->open_fds[--i]) break; } - i = (i+1) * 8 * sizeof(long); + i = (i + 1) * BITS_PER_LONG; return i; } @@ -306,8 +307,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; - new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; - new_fdt->open_fds = (fd_set *)&newf->open_fds_init; + new_fdt->close_on_exec = newf->close_on_exec_init; + new_fdt->open_fds = newf->open_fds_init; new_fdt->fd = &newf->fd_array[0]; new_fdt->next = NULL; @@ -350,10 +351,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) old_fds = old_fdt->fd; new_fds = new_fdt->fd; - memcpy(new_fdt->open_fds->fds_bits, - old_fdt->open_fds->fds_bits, open_files/8); - memcpy(new_fdt->close_on_exec->fds_bits, - old_fdt->close_on_exec->fds_bits, open_files/8); + memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8); + memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8); for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; @@ -366,7 +365,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) * is partway through open(). So make sure that this * fd is available to the new process. */ - FD_CLR(open_files - i, new_fdt->open_fds); + __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); } @@ -379,11 +378,11 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) memset(new_fds, 0, size); if (new_fdt->max_fds > open_files) { - int left = (new_fdt->max_fds-open_files)/8; - int start = open_files / (8 * sizeof(unsigned long)); + int left = (new_fdt->max_fds - open_files) / 8; + int start = open_files / BITS_PER_LONG; - memset(&new_fdt->open_fds->fds_bits[start], 0, left); - memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); + memset(&new_fdt->open_fds[start], 0, left); + memset(&new_fdt->close_on_exec[start], 0, left); } rcu_assign_pointer(newf->fdt, new_fdt); @@ -419,8 +418,8 @@ struct files_struct init_files = { .fdtab = { .max_fds = NR_OPEN_DEFAULT, .fd = &init_files.fd_array[0], - .close_on_exec = (fd_set *)&init_files.close_on_exec_init, - .open_fds = (fd_set *)&init_files.open_fds_init, + .close_on_exec = init_files.close_on_exec_init, + .open_fds = init_files.open_fds_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), }; @@ -443,8 +442,7 @@ repeat: fd = files->next_fd; if (fd < fdt->max_fds) - fd = find_next_zero_bit(fdt->open_fds->fds_bits, - fdt->max_fds, fd); + fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); error = expand_files(files, fd); if (error < 0) @@ -460,11 +458,11 @@ repeat: if (start <= files->next_fd) files->next_fd = fd + 1; - FD_SET(fd, fdt->open_fds); + __set_open_fd(fd, fdt); if (flags & O_CLOEXEC) - FD_SET(fd, fdt->close_on_exec); + __set_close_on_exec(fd, fdt); else - FD_CLR(fd, fdt->close_on_exec); + __clear_close_on_exec(fd, fdt); error = fd; #if 1 /* Sanity check */ diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 236972b752f..539f36cf3e4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -256,7 +256,8 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) } /* - * Move expired dirty inodes from @delaying_queue to @dispatch_queue. + * Move expired (dirtied after work->older_than_this) dirty inodes from + * @delaying_queue to @dispatch_queue. */ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, @@ -1148,23 +1149,6 @@ out_unlock_inode: } EXPORT_SYMBOL(__mark_inode_dirty); -/* - * Write out a superblock's list of dirty inodes. A wait will be performed - * upon no inodes, all inodes or the final one, depending upon sync_mode. - * - * If older_than_this is non-NULL, then only write out inodes which - * had their first dirtying at a time earlier than *older_than_this. - * - * If `bdi' is non-zero then we're being asked to writeback a specific queue. - * This function assumes that the blockdev superblock's inodes are backed by - * a variety of queues, so all inodes are searched. For other superblocks, - * assume that all inodes are backed by the same queue. - * - * The inodes to be written are parked on bdi->b_io. They are moved back onto - * bdi->b_dirty as they are selected for writing. This way, none can be missed - * on the writer throttling path, and we get decent balancing between many - * throttled threads: we don't want them all piling up on inode_sync_wait. - */ static void wait_sb_inodes(struct super_block *sb) { struct inode *inode, *old_inode = NULL; @@ -1364,8 +1348,6 @@ int write_inode_now(struct inode *inode, int sync) ret = writeback_single_inode(inode, wb, &wbc); spin_unlock(&inode->i_lock); spin_unlock(&wb->list_lock); - if (sync) - inode_sync_wait(inode); return ret; } EXPORT_SYMBOL(write_inode_now); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 76834587a8a..a3d2c9ee8d6 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -18,7 +18,6 @@ #include <linux/mount.h> #include <linux/fs.h> #include <linux/gfs2_ondisk.h> -#include <linux/ext2_fs.h> #include <linux/falloc.h> #include <linux/swap.h> #include <linux/crc32.h> diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 3cbfa93cd78..1fe731337f0 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -67,7 +67,8 @@ extern int access_file(char *path, int r, int w, int x); extern int open_file(char *path, int r, int w, int append); extern void *open_dir(char *path, int *err_out); extern char *read_dir(void *stream, unsigned long long *pos, - unsigned long long *ino_out, int *len_out); + unsigned long long *ino_out, int *len_out, + unsigned int *type_out); extern void close_file(void *stream); extern int replace_file(int oldfd, int fd); extern void close_dir(void *stream); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 588d45885a6..07c516bfea7 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -283,6 +283,7 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) char *name; unsigned long long next, ino; int error, len; + unsigned int type; name = dentry_name(file->f_path.dentry); if (name == NULL) @@ -292,9 +293,9 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) if (dir == NULL) return -error; next = file->f_pos; - while ((name = read_dir(dir, &next, &ino, &len)) != NULL) { + while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) { error = (*filldir)(ent, name, len, file->f_pos, - ino, DT_UNKNOWN); + ino, type); if (error) break; file->f_pos = next; } diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index dd7bc38a382..a74ad0d371c 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -98,7 +98,8 @@ void *open_dir(char *path, int *err_out) } char *read_dir(void *stream, unsigned long long *pos, - unsigned long long *ino_out, int *len_out) + unsigned long long *ino_out, int *len_out, + unsigned int *type_out) { DIR *dir = stream; struct dirent *ent; @@ -109,6 +110,7 @@ char *read_dir(void *stream, unsigned long long *pos, return NULL; *len_out = strlen(ent->d_name); *ino_out = ent->d_ino; + *type_out = ent->d_type; *pos = telldir(dir); return ent->d_name; } diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index d49d202903f..c78841ee81c 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -88,14 +88,13 @@ static inline void __buffer_relink_io(struct journal_head *jh) * whole transaction. * * Requires j_list_lock - * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ static int __try_to_free_cp_buf(struct journal_head *jh) { int ret = 0; struct buffer_head *bh = jh2bh(jh); - if (jh->b_jlist == BJ_None && !buffer_locked(bh) && + if (jh->b_transaction == NULL && !buffer_locked(bh) && !buffer_dirty(bh) && !buffer_write_io_error(bh)) { /* * Get our reference so that bh cannot be freed before @@ -104,11 +103,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh) get_bh(bh); JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __jbd2_journal_remove_checkpoint(jh) + 1; - jbd_unlock_bh_state(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); - } else { - jbd_unlock_bh_state(bh); } return ret; } @@ -180,21 +176,6 @@ void __jbd2_log_wait_for_space(journal_t *journal) } /* - * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. - * The caller must restart a list walk. Wait for someone else to run - * jbd_unlock_bh_state(). - */ -static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) - __releases(journal->j_list_lock) -{ - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_lock_bh_state(bh); - jbd_unlock_bh_state(bh); - put_bh(bh); -} - -/* * Clean up transaction's list of buffers submitted for io. * We wait for any pending IO to complete and remove any clean * buffers. Note that we take the buffers in the opposite ordering @@ -222,15 +203,9 @@ restart: while (!released && transaction->t_checkpoint_io_list) { jh = transaction->t_checkpoint_io_list; bh = jh2bh(jh); - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - spin_lock(&journal->j_list_lock); - goto restart; - } get_bh(bh); if (buffer_locked(bh)) { spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); @@ -246,7 +221,6 @@ restart: * it has been written out and so we can drop it from the list */ released = __jbd2_journal_remove_checkpoint(jh); - jbd_unlock_bh_state(bh); __brelse(bh); } @@ -266,7 +240,6 @@ __flush_batch(journal_t *journal, int *batch_count) for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = journal->j_chkpt_bhs[i]; - clear_buffer_jwrite(bh); BUFFER_TRACE(bh, "brelse"); __brelse(bh); } @@ -281,7 +254,6 @@ __flush_batch(journal_t *journal, int *batch_count) * be written out. * * Called with j_list_lock held and drops it if 1 is returned - * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ static int __process_buffer(journal_t *journal, struct journal_head *jh, int *batch_count, transaction_t *transaction) @@ -292,7 +264,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); @@ -304,7 +275,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, transaction->t_chp_stats.cs_forced_to_close++; spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); if (unlikely(journal->j_flags & JBD2_UNMOUNT)) /* * The journal thread is dead; so starting and @@ -323,11 +293,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, if (unlikely(buffer_write_io_error(bh))) ret = -EIO; get_bh(bh); - J_ASSERT_JH(jh, !buffer_jbddirty(bh)); BUFFER_TRACE(bh, "remove from checkpoint"); __jbd2_journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); __brelse(bh); } else { /* @@ -340,10 +308,8 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, BUFFER_TRACE(bh, "queue"); get_bh(bh); J_ASSERT_BH(bh, !buffer_jwrite(bh)); - set_buffer_jwrite(bh); journal->j_chkpt_bhs[*batch_count] = bh; __buffer_relink_io(jh); - jbd_unlock_bh_state(bh); transaction->t_chp_stats.cs_written++; (*batch_count)++; if (*batch_count == JBD2_NR_BATCH) { @@ -407,15 +373,7 @@ restart: int retry = 0, err; while (!retry && transaction->t_checkpoint_list) { - struct buffer_head *bh; - jh = transaction->t_checkpoint_list; - bh = jh2bh(jh); - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - retry = 1; - break; - } retry = __process_buffer(journal, jh, &batch_count, transaction); if (retry < 0 && !result) @@ -478,79 +436,28 @@ out: int jbd2_cleanup_journal_tail(journal_t *journal) { - transaction_t * transaction; tid_t first_tid; - unsigned long blocknr, freed; + unsigned long blocknr; if (is_journal_aborted(journal)) return 1; - /* OK, work out the oldest transaction remaining in the log, and - * the log block it starts at. - * - * If the log is now empty, we need to work out which is the - * next transaction ID we will write, and where it will - * start. */ - - write_lock(&journal->j_state_lock); - spin_lock(&journal->j_list_lock); - transaction = journal->j_checkpoint_transactions; - if (transaction) { - first_tid = transaction->t_tid; - blocknr = transaction->t_log_start; - } else if ((transaction = journal->j_committing_transaction) != NULL) { - first_tid = transaction->t_tid; - blocknr = transaction->t_log_start; - } else if ((transaction = journal->j_running_transaction) != NULL) { - first_tid = transaction->t_tid; - blocknr = journal->j_head; - } else { - first_tid = journal->j_transaction_sequence; - blocknr = journal->j_head; - } - spin_unlock(&journal->j_list_lock); - J_ASSERT(blocknr != 0); - - /* If the oldest pinned transaction is at the tail of the log - already then there's not much we can do right now. */ - if (journal->j_tail_sequence == first_tid) { - write_unlock(&journal->j_state_lock); + if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr)) return 1; - } - - /* OK, update the superblock to recover the freed space. - * Physical blocks come first: have we wrapped beyond the end of - * the log? */ - freed = blocknr - journal->j_tail; - if (blocknr < journal->j_tail) - freed = freed + journal->j_last - journal->j_first; - - trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed); - jbd_debug(1, - "Cleaning journal tail from %d to %d (offset %lu), " - "freeing %lu\n", - journal->j_tail_sequence, first_tid, blocknr, freed); - - journal->j_free += freed; - journal->j_tail_sequence = first_tid; - journal->j_tail = blocknr; - write_unlock(&journal->j_state_lock); + J_ASSERT(blocknr != 0); /* - * If there is an external journal, we need to make sure that - * any data blocks that were recently written out --- perhaps - * by jbd2_log_do_checkpoint() --- are flushed out before we - * drop the transactions from the external journal. It's - * unlikely this will be necessary, especially with a - * appropriately sized journal, but we need this to guarantee - * correctness. Fortunately jbd2_cleanup_journal_tail() - * doesn't get called all that often. + * We need to make sure that any blocks that were recently written out + * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before + * we drop the transactions from the journal. It's unlikely this will + * be necessary, especially with an appropriately sized journal, but we + * need this to guarantee correctness. Fortunately + * jbd2_cleanup_journal_tail() doesn't get called all that often. */ - if ((journal->j_fs_dev != journal->j_dev) && - (journal->j_flags & JBD2_BARRIER)) + if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); - if (!(journal->j_flags & JBD2_ABORT)) - jbd2_journal_update_superblock(journal, 1); + + __jbd2_update_log_tail(journal, first_tid, blocknr); return 0; } @@ -582,15 +489,12 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) do { jh = next_jh; next_jh = jh->b_cpnext; - /* Use trylock because of the ranking */ - if (jbd_trylock_bh_state(jh2bh(jh))) { - ret = __try_to_free_cp_buf(jh); - if (ret) { - freed++; - if (ret == 2) { - *released = 1; - return freed; - } + ret = __try_to_free_cp_buf(jh); + if (ret) { + freed++; + if (ret == 2) { + *released = 1; + return freed; } } /* @@ -673,9 +577,7 @@ out: * The function can free jh and bh. * * This function is called with j_list_lock held. - * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ - int __jbd2_journal_remove_checkpoint(struct journal_head *jh) { struct transaction_chp_stats_s *stats; @@ -722,7 +624,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) transaction->t_tid, stats); __jbd2_journal_drop_transaction(journal, transaction); - kfree(transaction); + jbd2_journal_free_transaction(transaction); /* Just in case anybody was waiting for more transactions to be checkpointed... */ @@ -797,5 +699,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); + trace_jbd2_drop_transaction(journal, transaction); + jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index c067a8cae63..806525a7269 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -28,7 +28,6 @@ #include <linux/blkdev.h> #include <linux/bitops.h> #include <trace/events/jbd2.h> -#include <asm/system.h> /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -331,6 +330,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; struct blk_plug plug; + /* Tail of the journal */ + unsigned long first_block; + tid_t first_tid; + int update_tail; /* * First job: lock down the current transaction and wait for @@ -340,7 +343,18 @@ void jbd2_journal_commit_transaction(journal_t *journal) /* Do we need to erase the effects of a prior jbd2_journal_flush? */ if (journal->j_flags & JBD2_FLUSHED) { jbd_debug(3, "super block updated\n"); - jbd2_journal_update_superblock(journal, 1); + mutex_lock(&journal->j_checkpoint_mutex); + /* + * We hold j_checkpoint_mutex so tail cannot change under us. + * We don't need any special data guarantees for writing sb + * since journal is empty and it is ok for write to be + * flushed only with transaction commit. + */ + jbd2_journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, + WRITE_SYNC); + mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd_debug(3, "superblock not updated\n"); } @@ -677,10 +691,30 @@ start_journal_io: err = 0; } + /* + * Get current oldest transaction in the log before we issue flush + * to the filesystem device. After the flush we can be sure that + * blocks of all older transactions are checkpointed to persistent + * storage and we will be safe to update journal start in the + * superblock with the numbers we get here. + */ + update_tail = + jbd2_journal_get_log_tail(journal, &first_tid, &first_block); + write_lock(&journal->j_state_lock); + if (update_tail) { + long freed = first_block - journal->j_tail; + + if (first_block < journal->j_tail) + freed += journal->j_last - journal->j_first; + /* Update tail only if we free significant amount of space */ + if (freed < journal->j_maxlen / 4) + update_tail = 0; + } J_ASSERT(commit_transaction->t_state == T_COMMIT); commit_transaction->t_state = T_COMMIT_DFLUSH; write_unlock(&journal->j_state_lock); + /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue @@ -831,6 +865,14 @@ wait_for_iobuf: if (err) jbd2_journal_abort(journal, err); + /* + * Now disk caches for filesystem device are flushed so we are safe to + * erase checkpointed transactions from the log by updating journal + * superblock. + */ + if (update_tail) + jbd2_update_log_tail(journal, first_tid, first_block); + /* End of a transaction! Finally, we can do checkpoint processing: any buffers committed as a result of this transaction can be removed from any checkpoint list it was on @@ -1048,7 +1090,7 @@ restart_loop: jbd_debug(1, "JBD2: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); if (to_free) - kfree(commit_transaction); + jbd2_journal_free_transaction(commit_transaction); wake_up(&journal->j_wait_done_commit); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 839377e3d62..1afb701622b 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -50,7 +50,6 @@ #include <asm/uaccess.h> #include <asm/page.h> -#include <asm/system.h> EXPORT_SYMBOL(jbd2_journal_extend); EXPORT_SYMBOL(jbd2_journal_stop); @@ -71,7 +70,6 @@ EXPORT_SYMBOL(jbd2_journal_revoke); EXPORT_SYMBOL(jbd2_journal_init_dev); EXPORT_SYMBOL(jbd2_journal_init_inode); -EXPORT_SYMBOL(jbd2_journal_update_format); EXPORT_SYMBOL(jbd2_journal_check_used_features); EXPORT_SYMBOL(jbd2_journal_check_available_features); EXPORT_SYMBOL(jbd2_journal_set_features); @@ -96,7 +94,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); EXPORT_SYMBOL(jbd2_inode_cache); -static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); static int jbd2_journal_create_slab(size_t slab_size); @@ -746,6 +743,98 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) return jbd2_journal_add_journal_head(bh); } +/* + * Return tid of the oldest transaction in the journal and block in the journal + * where the transaction starts. + * + * If the journal is now empty, return which will be the next transaction ID + * we will write and where will that transaction start. + * + * The return value is 0 if journal tail cannot be pushed any further, 1 if + * it can. + */ +int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, + unsigned long *block) +{ + transaction_t *transaction; + int ret; + + read_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + transaction = journal->j_checkpoint_transactions; + if (transaction) { + *tid = transaction->t_tid; + *block = transaction->t_log_start; + } else if ((transaction = journal->j_committing_transaction) != NULL) { + *tid = transaction->t_tid; + *block = transaction->t_log_start; + } else if ((transaction = journal->j_running_transaction) != NULL) { + *tid = transaction->t_tid; + *block = journal->j_head; + } else { + *tid = journal->j_transaction_sequence; + *block = journal->j_head; + } + ret = tid_gt(*tid, journal->j_tail_sequence); + spin_unlock(&journal->j_list_lock); + read_unlock(&journal->j_state_lock); + + return ret; +} + +/* + * Update information in journal structure and in on disk journal superblock + * about log tail. This function does not check whether information passed in + * really pushes log tail further. It's responsibility of the caller to make + * sure provided log tail information is valid (e.g. by holding + * j_checkpoint_mutex all the time between computing log tail and calling this + * function as is the case with jbd2_cleanup_journal_tail()). + * + * Requires j_checkpoint_mutex + */ +void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) +{ + unsigned long freed; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + + /* + * We cannot afford for write to remain in drive's caches since as + * soon as we update j_tail, next transaction can start reusing journal + * space and if we lose sb update during power failure we'd replay + * old transaction with possibly newly overwritten data. + */ + jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); + write_lock(&journal->j_state_lock); + freed = block - journal->j_tail; + if (block < journal->j_tail) + freed += journal->j_last - journal->j_first; + + trace_jbd2_update_log_tail(journal, tid, block, freed); + jbd_debug(1, + "Cleaning journal tail from %d to %d (offset %lu), " + "freeing %lu\n", + journal->j_tail_sequence, tid, block, freed); + + journal->j_free += freed; + journal->j_tail_sequence = tid; + journal->j_tail = block; + write_unlock(&journal->j_state_lock); +} + +/* + * This is a variaon of __jbd2_update_log_tail which checks for validity of + * provided log tail and locks j_checkpoint_mutex. So it is safe against races + * with other threads updating log tail. + */ +void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) +{ + mutex_lock(&journal->j_checkpoint_mutex); + if (tid_gt(tid, journal->j_tail_sequence)) + __jbd2_update_log_tail(journal, tid, block); + mutex_unlock(&journal->j_checkpoint_mutex); +} + struct jbd2_stats_proc_session { journal_t *journal; struct transaction_stats_s *stats; @@ -1114,40 +1203,45 @@ static int journal_reset(journal_t *journal) journal->j_max_transaction_buffers = journal->j_maxlen / 4; - /* Add the dynamic fields and write it to disk. */ - jbd2_journal_update_superblock(journal, 1); - return jbd2_journal_start_thread(journal); -} - -/** - * void jbd2_journal_update_superblock() - Update journal sb on disk. - * @journal: The journal to update. - * @wait: Set to '0' if you don't want to wait for IO completion. - * - * Update a journal's dynamic superblock fields and write it to disk, - * optionally waiting for the IO to complete. - */ -void jbd2_journal_update_superblock(journal_t *journal, int wait) -{ - journal_superblock_t *sb = journal->j_superblock; - struct buffer_head *bh = journal->j_sb_buffer; - /* * As a special case, if the on-disk copy is already marked as needing - * no recovery (s_start == 0) and there are no outstanding transactions - * in the filesystem, then we can safely defer the superblock update - * until the next commit by setting JBD2_FLUSHED. This avoids + * no recovery (s_start == 0), then we can safely defer the superblock + * update until the next commit by setting JBD2_FLUSHED. This avoids * attempting a write to a potential-readonly device. */ - if (sb->s_start == 0 && journal->j_tail_sequence == - journal->j_transaction_sequence) { + if (sb->s_start == 0) { jbd_debug(1, "JBD2: Skipping superblock update on recovered sb " "(start %ld, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); - goto out; + journal->j_flags |= JBD2_FLUSHED; + } else { + /* Lock here to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + /* + * Update log tail information. We use WRITE_FUA since new + * transaction will start reusing journal space and so we + * must make sure information about current log tail is on + * disk before that. + */ + jbd2_journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, + WRITE_FUA); + mutex_unlock(&journal->j_checkpoint_mutex); } + return jbd2_journal_start_thread(journal); +} +static void jbd2_write_superblock(journal_t *journal, int write_op) +{ + struct buffer_head *bh = journal->j_sb_buffer; + int ret; + + trace_jbd2_write_superblock(journal, write_op); + if (!(journal->j_flags & JBD2_BARRIER)) + write_op &= ~(REQ_FUA | REQ_FLUSH); + lock_buffer(bh); if (buffer_write_io_error(bh)) { /* * Oh, dear. A previous attempt to write the journal @@ -1163,48 +1257,106 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) clear_buffer_write_io_error(bh); set_buffer_uptodate(bh); } + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + ret = submit_bh(write_op, bh); + wait_on_buffer(bh); + if (buffer_write_io_error(bh)) { + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + ret = -EIO; + } + if (ret) { + printk(KERN_ERR "JBD2: Error %d detected when updating " + "journal superblock for %s.\n", ret, + journal->j_devname); + } +} + +/** + * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk. + * @journal: The journal to update. + * @tail_tid: TID of the new transaction at the tail of the log + * @tail_block: The first block of the transaction at the tail of the log + * @write_op: With which operation should we write the journal sb + * + * Update a journal's superblock information about log tail and write it to + * disk, waiting for the IO to complete. + */ +void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, + unsigned long tail_block, int write_op) +{ + journal_superblock_t *sb = journal->j_superblock; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", + tail_block, tail_tid); + + sb->s_sequence = cpu_to_be32(tail_tid); + sb->s_start = cpu_to_be32(tail_block); + + jbd2_write_superblock(journal, write_op); + + /* Log is no longer empty */ + write_lock(&journal->j_state_lock); + WARN_ON(!sb->s_sequence); + journal->j_flags &= ~JBD2_FLUSHED; + write_unlock(&journal->j_state_lock); +} + +/** + * jbd2_mark_journal_empty() - Mark on disk journal as empty. + * @journal: The journal to update. + * + * Update a journal's dynamic superblock fields to show that journal is empty. + * Write updated superblock to disk waiting for IO to complete. + */ +static void jbd2_mark_journal_empty(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); read_lock(&journal->j_state_lock); - jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, journal->j_errno); + jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n", + journal->j_tail_sequence); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); - sb->s_start = cpu_to_be32(journal->j_tail); - sb->s_errno = cpu_to_be32(journal->j_errno); + sb->s_start = cpu_to_be32(0); read_unlock(&journal->j_state_lock); - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - if (wait) { - sync_dirty_buffer(bh); - if (buffer_write_io_error(bh)) { - printk(KERN_ERR "JBD2: I/O error detected " - "when updating journal superblock for %s.\n", - journal->j_devname); - clear_buffer_write_io_error(bh); - set_buffer_uptodate(bh); - } - } else - write_dirty_buffer(bh, WRITE); - -out: - /* If we have just flushed the log (by marking s_start==0), then - * any future commit will have to be careful to update the - * superblock again to re-record the true start of the log. */ + jbd2_write_superblock(journal, WRITE_FUA); + /* Log is no longer empty */ write_lock(&journal->j_state_lock); - if (sb->s_start) - journal->j_flags &= ~JBD2_FLUSHED; - else - journal->j_flags |= JBD2_FLUSHED; + journal->j_flags |= JBD2_FLUSHED; write_unlock(&journal->j_state_lock); } + +/** + * jbd2_journal_update_sb_errno() - Update error in the journal. + * @journal: The journal to update. + * + * Update a journal's errno. Write updated superblock to disk waiting for IO + * to complete. + */ +static void jbd2_journal_update_sb_errno(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + + read_lock(&journal->j_state_lock); + jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", + journal->j_errno); + sb->s_errno = cpu_to_be32(journal->j_errno); + read_unlock(&journal->j_state_lock); + + jbd2_write_superblock(journal, WRITE_SYNC); +} + /* * Read the superblock for a given journal, performing initial * validation of the format. */ - static int journal_get_superblock(journal_t *journal) { struct buffer_head *bh; @@ -1398,14 +1550,11 @@ int jbd2_journal_destroy(journal_t *journal) if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { - /* We can now mark the journal as empty. */ - journal->j_tail = 0; - journal->j_tail_sequence = - ++journal->j_transaction_sequence; - jbd2_journal_update_superblock(journal, 1); - } else { + mutex_lock(&journal->j_checkpoint_mutex); + jbd2_mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + } else err = -EIO; - } brelse(journal->j_sb_buffer); } @@ -1552,61 +1701,6 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, EXPORT_SYMBOL(jbd2_journal_clear_features); /** - * int jbd2_journal_update_format () - Update on-disk journal structure. - * @journal: Journal to act on. - * - * Given an initialised but unloaded journal struct, poke about in the - * on-disk structure to update it to the most recent supported version. - */ -int jbd2_journal_update_format (journal_t *journal) -{ - journal_superblock_t *sb; - int err; - - err = journal_get_superblock(journal); - if (err) - return err; - - sb = journal->j_superblock; - - switch (be32_to_cpu(sb->s_header.h_blocktype)) { - case JBD2_SUPERBLOCK_V2: - return 0; - case JBD2_SUPERBLOCK_V1: - return journal_convert_superblock_v1(journal, sb); - default: - break; - } - return -EINVAL; -} - -static int journal_convert_superblock_v1(journal_t *journal, - journal_superblock_t *sb) -{ - int offset, blocksize; - struct buffer_head *bh; - - printk(KERN_WARNING - "JBD2: Converting superblock from version 1 to 2.\n"); - - /* Pre-initialise new fields to zero */ - offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); - blocksize = be32_to_cpu(sb->s_blocksize); - memset(&sb->s_feature_compat, 0, blocksize-offset); - - sb->s_nr_users = cpu_to_be32(1); - sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2); - journal->j_format_version = 2; - - bh = journal->j_sb_buffer; - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - sync_dirty_buffer(bh); - return 0; -} - - -/** * int jbd2_journal_flush () - Flush journal * @journal: Journal to act on. * @@ -1619,7 +1713,6 @@ int jbd2_journal_flush(journal_t *journal) { int err = 0; transaction_t *transaction = NULL; - unsigned long old_tail; write_lock(&journal->j_state_lock); @@ -1654,6 +1747,7 @@ int jbd2_journal_flush(journal_t *journal) if (is_journal_aborted(journal)) return -EIO; + mutex_lock(&journal->j_checkpoint_mutex); jbd2_cleanup_journal_tail(journal); /* Finally, mark the journal as really needing no recovery. @@ -1661,14 +1755,9 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ + jbd2_mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); - old_tail = journal->j_tail; - journal->j_tail = 0; - write_unlock(&journal->j_state_lock); - jbd2_journal_update_superblock(journal, 1); - write_lock(&journal->j_state_lock); - journal->j_tail = old_tail; - J_ASSERT(!journal->j_running_transaction); J_ASSERT(!journal->j_committing_transaction); J_ASSERT(!journal->j_checkpoint_transactions); @@ -1708,8 +1797,12 @@ int jbd2_journal_wipe(journal_t *journal, int write) write ? "Clearing" : "Ignoring"); err = jbd2_journal_skip_recovery(journal); - if (write) - jbd2_journal_update_superblock(journal, 1); + if (write) { + /* Lock to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + jbd2_mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + } no_recovery: return err; @@ -1759,7 +1852,7 @@ static void __journal_abort_soft (journal_t *journal, int errno) __jbd2_journal_abort_hard(journal); if (errno) - jbd2_journal_update_superblock(journal, 1); + jbd2_journal_update_sb_errno(journal); } /** @@ -2017,7 +2110,7 @@ static struct kmem_cache *jbd2_journal_head_cache; static atomic_t nr_journal_heads = ATOMIC_INIT(0); #endif -static int journal_init_jbd2_journal_head_cache(void) +static int jbd2_journal_init_journal_head_cache(void) { int retval; @@ -2035,7 +2128,7 @@ static int journal_init_jbd2_journal_head_cache(void) return retval; } -static void jbd2_journal_destroy_jbd2_journal_head_cache(void) +static void jbd2_journal_destroy_journal_head_cache(void) { if (jbd2_journal_head_cache) { kmem_cache_destroy(jbd2_journal_head_cache); @@ -2323,7 +2416,7 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void) struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; -static int __init journal_init_handle_cache(void) +static int __init jbd2_journal_init_handle_cache(void) { jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); if (jbd2_handle_cache == NULL) { @@ -2358,17 +2451,20 @@ static int __init journal_init_caches(void) ret = jbd2_journal_init_revoke_caches(); if (ret == 0) - ret = journal_init_jbd2_journal_head_cache(); + ret = jbd2_journal_init_journal_head_cache(); + if (ret == 0) + ret = jbd2_journal_init_handle_cache(); if (ret == 0) - ret = journal_init_handle_cache(); + ret = jbd2_journal_init_transaction_cache(); return ret; } static void jbd2_journal_destroy_caches(void) { jbd2_journal_destroy_revoke_caches(); - jbd2_journal_destroy_jbd2_journal_head_cache(); + jbd2_journal_destroy_journal_head_cache(); jbd2_journal_destroy_handle_cache(); + jbd2_journal_destroy_transaction_cache(); jbd2_journal_destroy_slabs(); } diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index da6d7baf139..c1a03354a22 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -21,6 +21,7 @@ #include <linux/jbd2.h> #include <linux/errno.h> #include <linux/crc32.h> +#include <linux/blkdev.h> #endif /* @@ -265,7 +266,9 @@ int jbd2_journal_recover(journal_t *journal) err2 = sync_blockdev(journal->j_fs_dev); if (!err) err = err2; - + /* Make sure all replayed data is on permanent storage */ + if (journal->j_flags & JBD2_BARRIER) + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); return err; } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 30b2867d6cc..6973705d6a3 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -208,17 +208,13 @@ int __init jbd2_journal_init_revoke_caches(void) J_ASSERT(!jbd2_revoke_record_cache); J_ASSERT(!jbd2_revoke_table_cache); - jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", - sizeof(struct jbd2_revoke_record_s), - 0, - SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, - NULL); + jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s, + SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY); if (!jbd2_revoke_record_cache) goto record_cache_failure; - jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", - sizeof(struct jbd2_revoke_table_s), - 0, SLAB_TEMPORARY, NULL); + jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s, + SLAB_TEMPORARY); if (!jbd2_revoke_table_cache) goto table_cache_failure; return 0; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e5aba56e1fd..ddcd3549c6c 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -33,6 +33,35 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); static void __jbd2_journal_unfile_buffer(struct journal_head *jh); +static struct kmem_cache *transaction_cache; +int __init jbd2_journal_init_transaction_cache(void) +{ + J_ASSERT(!transaction_cache); + transaction_cache = kmem_cache_create("jbd2_transaction_s", + sizeof(transaction_t), + 0, + SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, + NULL); + if (transaction_cache) + return 0; + return -ENOMEM; +} + +void jbd2_journal_destroy_transaction_cache(void) +{ + if (transaction_cache) { + kmem_cache_destroy(transaction_cache); + transaction_cache = NULL; + } +} + +void jbd2_journal_free_transaction(transaction_t *transaction) +{ + if (unlikely(ZERO_OR_NULL_PTR(transaction))) + return; + kmem_cache_free(transaction_cache, transaction); +} + /* * jbd2_get_transaction: obtain a new transaction_t object. * @@ -133,7 +162,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle, alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask); + new_transaction = kmem_cache_alloc(transaction_cache, + gfp_mask | __GFP_ZERO); if (!new_transaction) { /* * If __GFP_FS is not present, then we may be @@ -162,7 +192,7 @@ repeat: if (is_journal_aborted(journal) || (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { read_unlock(&journal->j_state_lock); - kfree(new_transaction); + jbd2_journal_free_transaction(new_transaction); return -EROFS; } @@ -284,7 +314,7 @@ repeat: read_unlock(&journal->j_state_lock); lock_map_acquire(&handle->h_lockdep_map); - kfree(new_transaction); + jbd2_journal_free_transaction(new_transaction); return 0; } @@ -1549,9 +1579,9 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) * of these pointers, it could go bad. Generally the caller needs to re-read * the pointer from the transaction_t. * - * Called under j_list_lock. The journal may not be locked. + * Called under j_list_lock. */ -void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) +static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) { struct journal_head **list = NULL; transaction_t *transaction; @@ -1646,10 +1676,8 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) spin_lock(&journal->j_list_lock); if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { /* written-back checkpointed metadata buffer */ - if (jh->b_jlist == BJ_None) { - JBUFFER_TRACE(jh, "remove from checkpoint list"); - __jbd2_journal_remove_checkpoint(jh); - } + JBUFFER_TRACE(jh, "remove from checkpoint list"); + __jbd2_journal_remove_checkpoint(jh); } spin_unlock(&journal->j_list_lock); out: @@ -1949,6 +1977,8 @@ zap_buffer_unlocked: clear_buffer_mapped(bh); clear_buffer_req(bh); clear_buffer_new(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); bh->b_bdev = NULL; return may_free; } diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 926d02068a1..922f146e423 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -9,6 +9,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/slab.h> #include <linux/fs.h> diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 404111b016c..2b60ce1996a 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -10,6 +10,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/jffs2.h> #include <linux/mtd/mtd.h> @@ -42,12 +44,13 @@ int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c) tsk = kthread_run(jffs2_garbage_collect_thread, c, "jffs2_gcd_mtd%d", c->mtd->index); if (IS_ERR(tsk)) { - printk(KERN_WARNING "fork failed for JFFS2 garbage collect thread: %ld\n", -PTR_ERR(tsk)); + pr_warn("fork failed for JFFS2 garbage collect thread: %ld\n", + -PTR_ERR(tsk)); complete(&c->gc_thread_exit); ret = PTR_ERR(tsk); } else { /* Wait for it... */ - D1(printk(KERN_DEBUG "JFFS2: Garbage collect thread is pid %d\n", tsk->pid)); + jffs2_dbg(1, "Garbage collect thread is pid %d\n", tsk->pid); wait_for_completion(&c->gc_thread_start); ret = tsk->pid; } @@ -60,7 +63,7 @@ void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c) int wait = 0; spin_lock(&c->erase_completion_lock); if (c->gc_task) { - D1(printk(KERN_DEBUG "jffs2: Killing GC task %d\n", c->gc_task->pid)); + jffs2_dbg(1, "Killing GC task %d\n", c->gc_task->pid); send_sig(SIGKILL, c->gc_task, 1); wait = 1; } @@ -90,7 +93,7 @@ static int jffs2_garbage_collect_thread(void *_c) if (!jffs2_thread_should_wake(c)) { set_current_state (TASK_INTERRUPTIBLE); spin_unlock(&c->erase_completion_lock); - D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread sleeping...\n")); + jffs2_dbg(1, "%s(): sleeping...\n", __func__); schedule(); } else spin_unlock(&c->erase_completion_lock); @@ -109,7 +112,7 @@ static int jffs2_garbage_collect_thread(void *_c) schedule_timeout_interruptible(msecs_to_jiffies(50)); if (kthread_should_stop()) { - D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): kthread_stop() called.\n")); + jffs2_dbg(1, "%s(): kthread_stop() called\n", __func__); goto die; } @@ -126,28 +129,32 @@ static int jffs2_garbage_collect_thread(void *_c) switch(signr) { case SIGSTOP: - D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): SIGSTOP received.\n")); + jffs2_dbg(1, "%s(): SIGSTOP received\n", + __func__); set_current_state(TASK_STOPPED); schedule(); break; case SIGKILL: - D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): SIGKILL received.\n")); + jffs2_dbg(1, "%s(): SIGKILL received\n", + __func__); goto die; case SIGHUP: - D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): SIGHUP received.\n")); + jffs2_dbg(1, "%s(): SIGHUP received\n", + __func__); break; default: - D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): signal %ld received\n", signr)); + jffs2_dbg(1, "%s(): signal %ld received\n", + __func__, signr); } } /* We don't want SIGHUP to interrupt us. STOP and KILL are OK though. */ disallow_signal(SIGHUP); - D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): pass\n")); + jffs2_dbg(1, "%s(): pass\n", __func__); if (jffs2_garbage_collect_pass(c) == -ENOSPC) { - printk(KERN_NOTICE "No space for garbage collection. Aborting GC thread\n"); + pr_notice("No space for garbage collection. Aborting GC thread\n"); goto die; } } diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index 3005ec4520a..a3750f902ad 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -10,6 +10,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/sched.h> #include <linux/slab.h> @@ -307,8 +309,8 @@ static void jffs2_calc_trigger_levels(struct jffs2_sb_info *c) trying to GC to make more space. It'll be a fruitless task */ c->nospc_dirty_size = c->sector_size + (c->flash_size / 100); - dbg_fsbuild("JFFS2 trigger levels (size %d KiB, block size %d KiB, %d blocks)\n", - c->flash_size / 1024, c->sector_size / 1024, c->nr_blocks); + dbg_fsbuild("trigger levels (size %d KiB, block size %d KiB, %d blocks)\n", + c->flash_size / 1024, c->sector_size / 1024, c->nr_blocks); dbg_fsbuild("Blocks required to allow deletion: %d (%d KiB)\n", c->resv_blocks_deletion, c->resv_blocks_deletion*c->sector_size/1024); dbg_fsbuild("Blocks required to allow writes: %d (%d KiB)\n", diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c index 96ed3c9ec3f..4849a4c9a0e 100644 --- a/fs/jffs2/compr.c +++ b/fs/jffs2/compr.c @@ -12,6 +12,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include "compr.h" static DEFINE_SPINLOCK(jffs2_compressor_list_lock); @@ -79,7 +81,7 @@ static int jffs2_selected_compress(u8 compr, unsigned char *data_in, output_buf = kmalloc(*cdatalen, GFP_KERNEL); if (!output_buf) { - printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n"); + pr_warn("No memory for compressor allocation. Compression failed.\n"); return ret; } orig_slen = *datalen; @@ -188,7 +190,8 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, tmp_buf = kmalloc(orig_slen, GFP_KERNEL); spin_lock(&jffs2_compressor_list_lock); if (!tmp_buf) { - printk(KERN_WARNING "JFFS2: No memory for compressor allocation. (%d bytes)\n", orig_slen); + pr_warn("No memory for compressor allocation. (%d bytes)\n", + orig_slen); continue; } else { @@ -235,7 +238,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, cpage_out, datalen, cdatalen); break; default: - printk(KERN_ERR "JFFS2: unknown compression mode.\n"); + pr_err("unknown compression mode\n"); } if (ret == JFFS2_COMPR_NONE) { @@ -277,7 +280,8 @@ int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, ret = this->decompress(cdata_in, data_out, cdatalen, datalen); spin_lock(&jffs2_compressor_list_lock); if (ret) { - printk(KERN_WARNING "Decompressor \"%s\" returned %d\n", this->name, ret); + pr_warn("Decompressor \"%s\" returned %d\n", + this->name, ret); } else { this->stat_decompr_blocks++; @@ -287,7 +291,7 @@ int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, return ret; } } - printk(KERN_WARNING "JFFS2 compression type 0x%02x not available.\n", comprtype); + pr_warn("compression type 0x%02x not available\n", comprtype); spin_unlock(&jffs2_compressor_list_lock); return -EIO; } @@ -299,7 +303,7 @@ int jffs2_register_compressor(struct jffs2_compressor *comp) struct jffs2_compressor *this; if (!comp->name) { - printk(KERN_WARNING "NULL compressor name at registering JFFS2 compressor. Failed.\n"); + pr_warn("NULL compressor name at registering JFFS2 compressor. Failed.\n"); return -1; } comp->compr_buf_size=0; @@ -309,7 +313,7 @@ int jffs2_register_compressor(struct jffs2_compressor *comp) comp->stat_compr_new_size=0; comp->stat_compr_blocks=0; comp->stat_decompr_blocks=0; - D1(printk(KERN_DEBUG "Registering JFFS2 compressor \"%s\"\n", comp->name)); + jffs2_dbg(1, "Registering JFFS2 compressor \"%s\"\n", comp->name); spin_lock(&jffs2_compressor_list_lock); @@ -332,15 +336,15 @@ out: int jffs2_unregister_compressor(struct jffs2_compressor *comp) { - D2(struct jffs2_compressor *this;) + D2(struct jffs2_compressor *this); - D1(printk(KERN_DEBUG "Unregistering JFFS2 compressor \"%s\"\n", comp->name)); + jffs2_dbg(1, "Unregistering JFFS2 compressor \"%s\"\n", comp->name); spin_lock(&jffs2_compressor_list_lock); if (comp->usecount) { spin_unlock(&jffs2_compressor_list_lock); - printk(KERN_WARNING "JFFS2: Compressor module is in use. Unregister failed.\n"); + pr_warn("Compressor module is in use. Unregister failed.\n"); return -1; } list_del(&comp->list); @@ -377,17 +381,17 @@ int __init jffs2_compressors_init(void) /* Setting default compression mode */ #ifdef CONFIG_JFFS2_CMODE_NONE jffs2_compression_mode = JFFS2_COMPR_MODE_NONE; - D1(printk(KERN_INFO "JFFS2: default compression mode: none\n");) + jffs2_dbg(1, "default compression mode: none\n"); #else #ifdef CONFIG_JFFS2_CMODE_SIZE jffs2_compression_mode = JFFS2_COMPR_MODE_SIZE; - D1(printk(KERN_INFO "JFFS2: default compression mode: size\n");) + jffs2_dbg(1, "default compression mode: size\n"); #else #ifdef CONFIG_JFFS2_CMODE_FAVOURLZO jffs2_compression_mode = JFFS2_COMPR_MODE_FAVOURLZO; - D1(printk(KERN_INFO "JFFS2: default compression mode: favourlzo\n");) + jffs2_dbg(1, "default compression mode: favourlzo\n"); #else - D1(printk(KERN_INFO "JFFS2: default compression mode: priority\n");) + jffs2_dbg(1, "default compression mode: priority\n"); #endif #endif #endif diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c index af186ee674d..c553bd6506d 100644 --- a/fs/jffs2/compr_lzo.c +++ b/fs/jffs2/compr_lzo.c @@ -33,7 +33,6 @@ static int __init alloc_workspace(void) lzo_compress_buf = vmalloc(lzo1x_worst_compress(PAGE_SIZE)); if (!lzo_mem || !lzo_compress_buf) { - printk(KERN_WARNING "Failed to allocate lzo deflate workspace\n"); free_workspace(); return -ENOMEM; } diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c index 9e7cec808c4..92e0644bf86 100644 --- a/fs/jffs2/compr_rubin.c +++ b/fs/jffs2/compr_rubin.c @@ -10,6 +10,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/string.h> #include <linux/types.h> #include <linux/jffs2.h> diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c index 5a001020c54..0b9a1e44e83 100644 --- a/fs/jffs2/compr_zlib.c +++ b/fs/jffs2/compr_zlib.c @@ -14,6 +14,8 @@ #error "The userspace support got too messy and was removed. Update your mkfs.jffs2" #endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/zlib.h> #include <linux/zutil.h> @@ -42,18 +44,18 @@ static int __init alloc_workspaces(void) { def_strm.workspace = vmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL)); - if (!def_strm.workspace) { - printk(KERN_WARNING "Failed to allocate %d bytes for deflate workspace\n", zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL)); + if (!def_strm.workspace) return -ENOMEM; - } - D1(printk(KERN_DEBUG "Allocated %d bytes for deflate workspace\n", zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL))); + + jffs2_dbg(1, "Allocated %d bytes for deflate workspace\n", + zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL)); inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); if (!inf_strm.workspace) { - printk(KERN_WARNING "Failed to allocate %d bytes for inflate workspace\n", zlib_inflate_workspacesize()); vfree(def_strm.workspace); return -ENOMEM; } - D1(printk(KERN_DEBUG "Allocated %d bytes for inflate workspace\n", zlib_inflate_workspacesize())); + jffs2_dbg(1, "Allocated %d bytes for inflate workspace\n", + zlib_inflate_workspacesize()); return 0; } @@ -79,7 +81,7 @@ static int jffs2_zlib_compress(unsigned char *data_in, mutex_lock(&deflate_mutex); if (Z_OK != zlib_deflateInit(&def_strm, 3)) { - printk(KERN_WARNING "deflateInit failed\n"); + pr_warn("deflateInit failed\n"); mutex_unlock(&deflate_mutex); return -1; } @@ -93,13 +95,14 @@ static int jffs2_zlib_compress(unsigned char *data_in, while (def_strm.total_out < *dstlen - STREAM_END_SPACE && def_strm.total_in < *sourcelen) { def_strm.avail_out = *dstlen - (def_strm.total_out + STREAM_END_SPACE); def_strm.avail_in = min((unsigned)(*sourcelen-def_strm.total_in), def_strm.avail_out); - D1(printk(KERN_DEBUG "calling deflate with avail_in %d, avail_out %d\n", - def_strm.avail_in, def_strm.avail_out)); + jffs2_dbg(1, "calling deflate with avail_in %d, avail_out %d\n", + def_strm.avail_in, def_strm.avail_out); ret = zlib_deflate(&def_strm, Z_PARTIAL_FLUSH); - D1(printk(KERN_DEBUG "deflate returned with avail_in %d, avail_out %d, total_in %ld, total_out %ld\n", - def_strm.avail_in, def_strm.avail_out, def_strm.total_in, def_strm.total_out)); + jffs2_dbg(1, "deflate returned with avail_in %d, avail_out %d, total_in %ld, total_out %ld\n", + def_strm.avail_in, def_strm.avail_out, + def_strm.total_in, def_strm.total_out); if (ret != Z_OK) { - D1(printk(KERN_DEBUG "deflate in loop returned %d\n", ret)); + jffs2_dbg(1, "deflate in loop returned %d\n", ret); zlib_deflateEnd(&def_strm); mutex_unlock(&deflate_mutex); return -1; @@ -111,20 +114,20 @@ static int jffs2_zlib_compress(unsigned char *data_in, zlib_deflateEnd(&def_strm); if (ret != Z_STREAM_END) { - D1(printk(KERN_DEBUG "final deflate returned %d\n", ret)); + jffs2_dbg(1, "final deflate returned %d\n", ret); ret = -1; goto out; } if (def_strm.total_out >= def_strm.total_in) { - D1(printk(KERN_DEBUG "zlib compressed %ld bytes into %ld; failing\n", - def_strm.total_in, def_strm.total_out)); + jffs2_dbg(1, "zlib compressed %ld bytes into %ld; failing\n", + def_strm.total_in, def_strm.total_out); ret = -1; goto out; } - D1(printk(KERN_DEBUG "zlib compressed %ld bytes into %ld\n", - def_strm.total_in, def_strm.total_out)); + jffs2_dbg(1, "zlib compressed %ld bytes into %ld\n", + def_strm.total_in, def_strm.total_out); *dstlen = def_strm.total_out; *sourcelen = def_strm.total_in; @@ -157,18 +160,18 @@ static int jffs2_zlib_decompress(unsigned char *data_in, ((data_in[0] & 0x0f) == Z_DEFLATED) && !(((data_in[0]<<8) + data_in[1]) % 31)) { - D2(printk(KERN_DEBUG "inflate skipping adler32\n")); + jffs2_dbg(2, "inflate skipping adler32\n"); wbits = -((data_in[0] >> 4) + 8); inf_strm.next_in += 2; inf_strm.avail_in -= 2; } else { /* Let this remain D1 for now -- it should never happen */ - D1(printk(KERN_DEBUG "inflate not skipping adler32\n")); + jffs2_dbg(1, "inflate not skipping adler32\n"); } if (Z_OK != zlib_inflateInit2(&inf_strm, wbits)) { - printk(KERN_WARNING "inflateInit failed\n"); + pr_warn("inflateInit failed\n"); mutex_unlock(&inflate_mutex); return 1; } @@ -176,7 +179,7 @@ static int jffs2_zlib_decompress(unsigned char *data_in, while((ret = zlib_inflate(&inf_strm, Z_FINISH)) == Z_OK) ; if (ret != Z_STREAM_END) { - printk(KERN_NOTICE "inflate returned %d\n", ret); + pr_notice("inflate returned %d\n", ret); } zlib_inflateEnd(&inf_strm); mutex_unlock(&inflate_mutex); diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c index e0b76c87a91..1090eb64b90 100644 --- a/fs/jffs2/debug.c +++ b/fs/jffs2/debug.c @@ -10,6 +10,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/types.h> #include <linux/pagemap.h> @@ -261,12 +263,15 @@ void __jffs2_dbg_superblock_counts(struct jffs2_sb_info *c) bad += c->sector_size; } -#define check(sz) \ - if (sz != c->sz##_size) { \ - printk(KERN_WARNING #sz "_size mismatch counted 0x%x, c->" #sz "_size 0x%x\n", \ - sz, c->sz##_size); \ - dump = 1; \ - } +#define check(sz) \ +do { \ + if (sz != c->sz##_size) { \ + pr_warn("%s_size mismatch counted 0x%x, c->%s_size 0x%x\n", \ + #sz, sz, #sz, c->sz##_size); \ + dump = 1; \ + } \ +} while (0) + check(free); check(dirty); check(used); @@ -274,11 +279,12 @@ void __jffs2_dbg_superblock_counts(struct jffs2_sb_info *c) check(unchecked); check(bad); check(erasing); + #undef check if (nr_counted != c->nr_blocks) { - printk(KERN_WARNING "%s counted only 0x%x blocks of 0x%x. Where are the others?\n", - __func__, nr_counted, c->nr_blocks); + pr_warn("%s counted only 0x%x blocks of 0x%x. Where are the others?\n", + __func__, nr_counted, c->nr_blocks); dump = 1; } diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h index c4f8eef5ca6..4fd9be4cbc9 100644 --- a/fs/jffs2/debug.h +++ b/fs/jffs2/debug.h @@ -51,6 +51,7 @@ * superseded by nicer dbg_xxx() macros... */ #if CONFIG_JFFS2_FS_DEBUG > 0 +#define DEBUG #define D1(x) x #else #define D1(x) @@ -62,50 +63,33 @@ #define D2(x) #endif +#define jffs2_dbg(level, fmt, ...) \ +do { \ + if (CONFIG_JFFS2_FS_DEBUG >= level) \ + pr_debug(fmt, ##__VA_ARGS__); \ +} while (0) + /* The prefixes of JFFS2 messages */ +#define JFFS2_DBG KERN_DEBUG #define JFFS2_DBG_PREFIX "[JFFS2 DBG]" -#define JFFS2_ERR_PREFIX "JFFS2 error:" -#define JFFS2_WARN_PREFIX "JFFS2 warning:" -#define JFFS2_NOTICE_PREFIX "JFFS2 notice:" - -#define JFFS2_ERR KERN_ERR -#define JFFS2_WARN KERN_WARNING -#define JFFS2_NOT KERN_NOTICE -#define JFFS2_DBG KERN_DEBUG - #define JFFS2_DBG_MSG_PREFIX JFFS2_DBG JFFS2_DBG_PREFIX -#define JFFS2_ERR_MSG_PREFIX JFFS2_ERR JFFS2_ERR_PREFIX -#define JFFS2_WARN_MSG_PREFIX JFFS2_WARN JFFS2_WARN_PREFIX -#define JFFS2_NOTICE_MSG_PREFIX JFFS2_NOT JFFS2_NOTICE_PREFIX /* JFFS2 message macros */ -#define JFFS2_ERROR(fmt, ...) \ - do { \ - printk(JFFS2_ERR_MSG_PREFIX \ - " (%d) %s: " fmt, task_pid_nr(current), \ - __func__ , ##__VA_ARGS__); \ - } while(0) +#define JFFS2_ERROR(fmt, ...) \ + pr_err("error: (%d) %s: " fmt, \ + task_pid_nr(current), __func__, ##__VA_ARGS__) #define JFFS2_WARNING(fmt, ...) \ - do { \ - printk(JFFS2_WARN_MSG_PREFIX \ - " (%d) %s: " fmt, task_pid_nr(current), \ - __func__ , ##__VA_ARGS__); \ - } while(0) + pr_warn("warning: (%d) %s: " fmt, \ + task_pid_nr(current), __func__, ##__VA_ARGS__) #define JFFS2_NOTICE(fmt, ...) \ - do { \ - printk(JFFS2_NOTICE_MSG_PREFIX \ - " (%d) %s: " fmt, task_pid_nr(current), \ - __func__ , ##__VA_ARGS__); \ - } while(0) + pr_notice("notice: (%d) %s: " fmt, \ + task_pid_nr(current), __func__, ##__VA_ARGS__) #define JFFS2_DEBUG(fmt, ...) \ - do { \ - printk(JFFS2_DBG_MSG_PREFIX \ - " (%d) %s: " fmt, task_pid_nr(current), \ - __func__ , ##__VA_ARGS__); \ - } while(0) + printk(KERN_DEBUG "[JFFS2 DBG] (%d) %s: " fmt, \ + task_pid_nr(current), __func__, ##__VA_ARGS__) /* * We split our debugging messages on several parts, depending on the JFFS2 diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 973ac5822bd..b56018896d5 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -10,6 +10,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/slab.h> #include <linux/fs.h> @@ -79,7 +81,7 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, uint32_t ino = 0; struct inode *inode = NULL; - D1(printk(KERN_DEBUG "jffs2_lookup()\n")); + jffs2_dbg(1, "jffs2_lookup()\n"); if (target->d_name.len > JFFS2_MAX_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -103,7 +105,7 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, if (ino) { inode = jffs2_iget(dir_i->i_sb, ino); if (IS_ERR(inode)) - printk(KERN_WARNING "iget() failed for ino #%u\n", ino); + pr_warn("iget() failed for ino #%u\n", ino); } return d_splice_alias(inode, target); @@ -119,21 +121,22 @@ static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) struct jffs2_full_dirent *fd; unsigned long offset, curofs; - D1(printk(KERN_DEBUG "jffs2_readdir() for dir_i #%lu\n", filp->f_path.dentry->d_inode->i_ino)); + jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", + filp->f_path.dentry->d_inode->i_ino); f = JFFS2_INODE_INFO(inode); offset = filp->f_pos; if (offset == 0) { - D1(printk(KERN_DEBUG "Dirent 0: \".\", ino #%lu\n", inode->i_ino)); + jffs2_dbg(1, "Dirent 0: \".\", ino #%lu\n", inode->i_ino); if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) goto out; offset++; } if (offset == 1) { unsigned long pino = parent_ino(filp->f_path.dentry); - D1(printk(KERN_DEBUG "Dirent 1: \"..\", ino #%lu\n", pino)); + jffs2_dbg(1, "Dirent 1: \"..\", ino #%lu\n", pino); if (filldir(dirent, "..", 2, 1, pino, DT_DIR) < 0) goto out; offset++; @@ -146,16 +149,18 @@ static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) curofs++; /* First loop: curofs = 2; offset = 2 */ if (curofs < offset) { - D2(printk(KERN_DEBUG "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n", - fd->name, fd->ino, fd->type, curofs, offset)); + jffs2_dbg(2, "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n", + fd->name, fd->ino, fd->type, curofs, offset); continue; } if (!fd->ino) { - D2(printk(KERN_DEBUG "Skipping deletion dirent \"%s\"\n", fd->name)); + jffs2_dbg(2, "Skipping deletion dirent \"%s\"\n", + fd->name); offset++; continue; } - D2(printk(KERN_DEBUG "Dirent %ld: \"%s\", ino #%u, type %d\n", offset, fd->name, fd->ino, fd->type)); + jffs2_dbg(2, "Dirent %ld: \"%s\", ino #%u, type %d\n", + offset, fd->name, fd->ino, fd->type); if (filldir(dirent, fd->name, strlen(fd->name), offset, fd->ino, fd->type) < 0) break; offset++; @@ -184,12 +189,12 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, c = JFFS2_SB_INFO(dir_i->i_sb); - D1(printk(KERN_DEBUG "jffs2_create()\n")); + jffs2_dbg(1, "%s()\n", __func__); inode = jffs2_new_inode(dir_i, mode, ri); if (IS_ERR(inode)) { - D1(printk(KERN_DEBUG "jffs2_new_inode() failed\n")); + jffs2_dbg(1, "jffs2_new_inode() failed\n"); jffs2_free_raw_inode(ri); return PTR_ERR(inode); } @@ -217,9 +222,9 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, jffs2_free_raw_inode(ri); - D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n", - inode->i_ino, inode->i_mode, inode->i_nlink, - f->inocache->pino_nlink, inode->i_mapping->nrpages)); + jffs2_dbg(1, "%s(): Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n", + __func__, inode->i_ino, inode->i_mode, inode->i_nlink, + f->inocache->pino_nlink, inode->i_mapping->nrpages); d_instantiate(dentry, inode); unlock_new_inode(inode); @@ -362,14 +367,15 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char /* We use f->target field to store the target path. */ f->target = kmemdup(target, targetlen + 1, GFP_KERNEL); if (!f->target) { - printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); + pr_warn("Can't allocate %d bytes of memory\n", targetlen + 1); mutex_unlock(&f->sem); jffs2_complete_reservation(c); ret = -ENOMEM; goto fail; } - D1(printk(KERN_DEBUG "jffs2_symlink: symlink's target '%s' cached\n", (char *)f->target)); + jffs2_dbg(1, "%s(): symlink's target '%s' cached\n", + __func__, (char *)f->target); /* No data here. Only a metadata node, which will be obsoleted by the first data write @@ -856,7 +862,8 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, f->inocache->pino_nlink++; mutex_unlock(&f->sem); - printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret); + pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n", + __func__, ret); /* Might as well let the VFS know */ d_instantiate(new_dentry, old_dentry->d_inode); ihold(old_dentry->d_inode); diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index eafb8d37a6f..4a6cf289be2 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -10,6 +10,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/slab.h> #include <linux/mtd/mtd.h> @@ -46,11 +48,12 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, #else /* Linux */ struct erase_info *instr; - D1(printk(KERN_DEBUG "jffs2_erase_block(): erase block %#08x (range %#08x-%#08x)\n", - jeb->offset, jeb->offset, jeb->offset + c->sector_size)); + jffs2_dbg(1, "%s(): erase block %#08x (range %#08x-%#08x)\n", + __func__, + jeb->offset, jeb->offset, jeb->offset + c->sector_size); instr = kmalloc(sizeof(struct erase_info) + sizeof(struct erase_priv_struct), GFP_KERNEL); if (!instr) { - printk(KERN_WARNING "kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n"); + pr_warn("kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n"); mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); list_move(&jeb->list, &c->erase_pending_list); @@ -69,7 +72,6 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, instr->len = c->sector_size; instr->callback = jffs2_erase_callback; instr->priv = (unsigned long)(&instr[1]); - instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN; ((struct erase_priv_struct *)instr->priv)->jeb = jeb; ((struct erase_priv_struct *)instr->priv)->c = c; @@ -84,7 +86,8 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, if (ret == -ENOMEM || ret == -EAGAIN) { /* Erase failed immediately. Refile it on the list */ - D1(printk(KERN_DEBUG "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", jeb->offset, ret)); + jffs2_dbg(1, "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", + jeb->offset, ret); mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); list_move(&jeb->list, &c->erase_pending_list); @@ -97,9 +100,11 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, } if (ret == -EROFS) - printk(KERN_WARNING "Erase at 0x%08x failed immediately: -EROFS. Is the sector locked?\n", jeb->offset); + pr_warn("Erase at 0x%08x failed immediately: -EROFS. Is the sector locked?\n", + jeb->offset); else - printk(KERN_WARNING "Erase at 0x%08x failed immediately: errno %d\n", jeb->offset, ret); + pr_warn("Erase at 0x%08x failed immediately: errno %d\n", + jeb->offset, ret); jffs2_erase_failed(c, jeb, bad_offset); } @@ -125,13 +130,14 @@ int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) work_done++; if (!--count) { - D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n")); + jffs2_dbg(1, "Count reached. jffs2_erase_pending_blocks leaving\n"); goto done; } } else if (!list_empty(&c->erase_pending_list)) { jeb = list_entry(c->erase_pending_list.next, struct jffs2_eraseblock, list); - D1(printk(KERN_DEBUG "Starting erase of pending block 0x%08x\n", jeb->offset)); + jffs2_dbg(1, "Starting erase of pending block 0x%08x\n", + jeb->offset); list_del(&jeb->list); c->erasing_size += c->sector_size; c->wasted_size -= jeb->wasted_size; @@ -159,13 +165,13 @@ int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) spin_unlock(&c->erase_completion_lock); mutex_unlock(&c->erase_free_sem); done: - D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n")); + jffs2_dbg(1, "jffs2_erase_pending_blocks completed\n"); return work_done; } static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) { - D1(printk(KERN_DEBUG "Erase completed successfully at 0x%08x\n", jeb->offset)); + jffs2_dbg(1, "Erase completed successfully at 0x%08x\n", jeb->offset); mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); list_move_tail(&jeb->list, &c->erase_complete_list); @@ -214,7 +220,7 @@ static void jffs2_erase_callback(struct erase_info *instr) struct erase_priv_struct *priv = (void *)instr->priv; if(instr->state != MTD_ERASE_DONE) { - printk(KERN_WARNING "Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", + pr_warn("Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", (unsigned long long)instr->addr, instr->state); jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr); } else { @@ -269,8 +275,8 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c, return; } - D1(printk(KERN_DEBUG "Removed nodes in range 0x%08x-0x%08x from ino #%u\n", - jeb->offset, jeb->offset + c->sector_size, ic->ino)); + jffs2_dbg(1, "Removed nodes in range 0x%08x-0x%08x from ino #%u\n", + jeb->offset, jeb->offset + c->sector_size, ic->ino); D2({ int i=0; @@ -281,7 +287,7 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c, printk(KERN_DEBUG); while(this) { - printk(KERN_CONT "0x%08x(%d)->", + pr_cont("0x%08x(%d)->", ref_offset(this), ref_flags(this)); if (++i == 5) { printk(KERN_DEBUG); @@ -289,7 +295,7 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c, } this = this->next_in_ino; } - printk(KERN_CONT "\n"); + pr_cont("\n"); }); switch (ic->class) { @@ -310,7 +316,8 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c, void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) { struct jffs2_raw_node_ref *block, *ref; - D1(printk(KERN_DEBUG "Freeing all node refs for eraseblock offset 0x%08x\n", jeb->offset)); + jffs2_dbg(1, "Freeing all node refs for eraseblock offset 0x%08x\n", + jeb->offset); block = ref = jeb->first_node; @@ -342,12 +349,13 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl &ebuf, NULL); if (ret != -EOPNOTSUPP) { if (ret) { - D1(printk(KERN_DEBUG "MTD point failed %d\n", ret)); + jffs2_dbg(1, "MTD point failed %d\n", ret); goto do_flash_read; } if (retlen < c->sector_size) { /* Don't muck about if it won't let us point to the whole erase sector */ - D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", retlen)); + jffs2_dbg(1, "MTD point returned len too short: 0x%zx\n", + retlen); mtd_unpoint(c->mtd, jeb->offset, retlen); goto do_flash_read; } @@ -359,8 +367,10 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl } while(--retlen); mtd_unpoint(c->mtd, jeb->offset, c->sector_size); if (retlen) { - printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08tx\n", - *wordebuf, jeb->offset + c->sector_size-retlen*sizeof(*wordebuf)); + pr_warn("Newly-erased block contained word 0x%lx at offset 0x%08tx\n", + *wordebuf, + jeb->offset + + c->sector_size-retlen * sizeof(*wordebuf)); return -EIO; } return 0; @@ -368,11 +378,12 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl do_flash_read: ebuf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!ebuf) { - printk(KERN_WARNING "Failed to allocate page buffer for verifying erase at 0x%08x. Refiling\n", jeb->offset); + pr_warn("Failed to allocate page buffer for verifying erase at 0x%08x. Refiling\n", + jeb->offset); return -EAGAIN; } - D1(printk(KERN_DEBUG "Verifying erase at 0x%08x\n", jeb->offset)); + jffs2_dbg(1, "Verifying erase at 0x%08x\n", jeb->offset); for (ofs = jeb->offset; ofs < jeb->offset + c->sector_size; ) { uint32_t readlen = min((uint32_t)PAGE_SIZE, jeb->offset + c->sector_size - ofs); @@ -382,12 +393,14 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl ret = mtd_read(c->mtd, ofs, readlen, &retlen, ebuf); if (ret) { - printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret); + pr_warn("Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", + ofs, ret); ret = -EIO; goto fail; } if (retlen != readlen) { - printk(KERN_WARNING "Short read from newly-erased block at 0x%08x. Wanted %d, got %zd\n", ofs, readlen, retlen); + pr_warn("Short read from newly-erased block at 0x%08x. Wanted %d, got %zd\n", + ofs, readlen, retlen); ret = -EIO; goto fail; } @@ -396,7 +409,8 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl unsigned long *datum = ebuf + i; if (*datum + 1) { *bad_offset += i; - printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08x\n", *datum, *bad_offset); + pr_warn("Newly-erased block contained word 0x%lx at offset 0x%08x\n", + *datum, *bad_offset); ret = -EIO; goto fail; } @@ -422,7 +436,7 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb } /* Write the erase complete marker */ - D1(printk(KERN_DEBUG "Writing erased marker to block at 0x%08x\n", jeb->offset)); + jffs2_dbg(1, "Writing erased marker to block at 0x%08x\n", jeb->offset); bad_offset = jeb->offset; /* Cleanmarker in oob area or no cleanmarker at all ? */ @@ -451,10 +465,10 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb if (ret || retlen != sizeof(marker)) { if (ret) - printk(KERN_WARNING "Write clean marker to block at 0x%08x failed: %d\n", + pr_warn("Write clean marker to block at 0x%08x failed: %d\n", jeb->offset, ret); else - printk(KERN_WARNING "Short write to newly-erased block at 0x%08x: Wanted %zd, got %zd\n", + pr_warn("Short write to newly-erased block at 0x%08x: Wanted %zd, got %zd\n", jeb->offset, sizeof(marker), retlen); goto filebad; diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 61e6723535b..db3889ba881 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -10,6 +10,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/fs.h> #include <linux/time.h> @@ -85,7 +87,8 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) unsigned char *pg_buf; int ret; - D2(printk(KERN_DEBUG "jffs2_do_readpage_nolock(): ino #%lu, page at offset 0x%lx\n", inode->i_ino, pg->index << PAGE_CACHE_SHIFT)); + jffs2_dbg(2, "%s(): ino #%lu, page at offset 0x%lx\n", + __func__, inode->i_ino, pg->index << PAGE_CACHE_SHIFT); BUG_ON(!PageLocked(pg)); @@ -105,7 +108,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) flush_dcache_page(pg); kunmap(pg); - D2(printk(KERN_DEBUG "readpage finished\n")); + jffs2_dbg(2, "readpage finished\n"); return ret; } @@ -144,7 +147,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, return -ENOMEM; *pagep = pg; - D1(printk(KERN_DEBUG "jffs2_write_begin()\n")); + jffs2_dbg(1, "%s()\n", __func__); if (pageofs > inode->i_size) { /* Make new hole frag from old EOF to new page */ @@ -153,8 +156,8 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, struct jffs2_full_dnode *fn; uint32_t alloc_len; - D1(printk(KERN_DEBUG "Writing new hole frag 0x%x-0x%x between current EOF and new page\n", - (unsigned int)inode->i_size, pageofs)); + jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n", + (unsigned int)inode->i_size, pageofs); ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); @@ -198,7 +201,8 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, f->metadata = NULL; } if (ret) { - D1(printk(KERN_DEBUG "Eep. add_full_dnode_to_inode() failed in write_begin, returned %d\n", ret)); + jffs2_dbg(1, "Eep. add_full_dnode_to_inode() failed in write_begin, returned %d\n", + ret); jffs2_mark_node_obsolete(c, fn->raw); jffs2_free_full_dnode(fn); jffs2_complete_reservation(c); @@ -222,7 +226,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, if (ret) goto out_page; } - D1(printk(KERN_DEBUG "end write_begin(). pg->flags %lx\n", pg->flags)); + jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags); return ret; out_page: @@ -248,8 +252,9 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, int ret = 0; uint32_t writtenlen = 0; - D1(printk(KERN_DEBUG "jffs2_write_end(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n", - inode->i_ino, pg->index << PAGE_CACHE_SHIFT, start, end, pg->flags)); + jffs2_dbg(1, "%s(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n", + __func__, inode->i_ino, pg->index << PAGE_CACHE_SHIFT, + start, end, pg->flags); /* We need to avoid deadlock with page_cache_read() in jffs2_garbage_collect_pass(). So the page must be @@ -268,7 +273,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, ri = jffs2_alloc_raw_inode(); if (!ri) { - D1(printk(KERN_DEBUG "jffs2_write_end(): Allocation of raw inode failed\n")); + jffs2_dbg(1, "%s(): Allocation of raw inode failed\n", + __func__); unlock_page(pg); page_cache_release(pg); return -ENOMEM; @@ -315,13 +321,14 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, /* generic_file_write has written more to the page cache than we've actually written to the medium. Mark the page !Uptodate so that it gets reread */ - D1(printk(KERN_DEBUG "jffs2_write_end(): Not all bytes written. Marking page !uptodate\n")); + jffs2_dbg(1, "%s(): Not all bytes written. Marking page !uptodate\n", + __func__); SetPageError(pg); ClearPageUptodate(pg); } - D1(printk(KERN_DEBUG "jffs2_write_end() returning %d\n", - writtenlen > 0 ? writtenlen : ret)); + jffs2_dbg(1, "%s() returning %d\n", + __func__, writtenlen > 0 ? writtenlen : ret); unlock_page(pg); page_cache_release(pg); return writtenlen > 0 ? writtenlen : ret; diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index c0d5c9d770d..bb6f993ebca 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -10,6 +10,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/capability.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -39,7 +41,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) int ret; int alloc_type = ALLOC_NORMAL; - D1(printk(KERN_DEBUG "jffs2_setattr(): ino #%lu\n", inode->i_ino)); + jffs2_dbg(1, "%s(): ino #%lu\n", __func__, inode->i_ino); /* Special cases - we don't want more than one data node for these types on the medium at any time. So setattr @@ -50,7 +52,8 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) /* For these, we don't actually need to read the old node */ mdatalen = jffs2_encode_dev(&dev, inode->i_rdev); mdata = (char *)&dev; - D1(printk(KERN_DEBUG "jffs2_setattr(): Writing %d bytes of kdev_t\n", mdatalen)); + jffs2_dbg(1, "%s(): Writing %d bytes of kdev_t\n", + __func__, mdatalen); } else if (S_ISLNK(inode->i_mode)) { mutex_lock(&f->sem); mdatalen = f->metadata->size; @@ -66,7 +69,8 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) return ret; } mutex_unlock(&f->sem); - D1(printk(KERN_DEBUG "jffs2_setattr(): Writing %d bytes of symlink target\n", mdatalen)); + jffs2_dbg(1, "%s(): Writing %d bytes of symlink target\n", + __func__, mdatalen); } ri = jffs2_alloc_raw_inode(); @@ -233,7 +237,8 @@ void jffs2_evict_inode (struct inode *inode) struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); - D1(printk(KERN_DEBUG "jffs2_evict_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode)); + jffs2_dbg(1, "%s(): ino #%lu mode %o\n", + __func__, inode->i_ino, inode->i_mode); truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); jffs2_do_clear_inode(c, f); @@ -249,7 +254,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) dev_t rdev = 0; int ret; - D1(printk(KERN_DEBUG "jffs2_iget(): ino == %lu\n", ino)); + jffs2_dbg(1, "%s(): ino == %lu\n", __func__, ino); inode = iget_locked(sb, ino); if (!inode) @@ -317,14 +322,16 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) /* Read the device numbers from the media */ if (f->metadata->size != sizeof(jdev.old_id) && f->metadata->size != sizeof(jdev.new_id)) { - printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size); + pr_notice("Device node has strange size %d\n", + f->metadata->size); goto error_io; } - D1(printk(KERN_DEBUG "Reading device numbers from flash\n")); + jffs2_dbg(1, "Reading device numbers from flash\n"); ret = jffs2_read_dnode(c, f, f->metadata, (char *)&jdev, 0, f->metadata->size); if (ret < 0) { /* Eep */ - printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino); + pr_notice("Read device numbers for inode %lu failed\n", + (unsigned long)inode->i_ino); goto error; } if (f->metadata->size == sizeof(jdev.old_id)) @@ -339,12 +346,13 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) break; default: - printk(KERN_WARNING "jffs2_read_inode(): Bogus imode %o for ino %lu\n", inode->i_mode, (unsigned long)inode->i_ino); + pr_warn("%s(): Bogus i_mode %o for ino %lu\n", + __func__, inode->i_mode, (unsigned long)inode->i_ino); } mutex_unlock(&f->sem); - D1(printk(KERN_DEBUG "jffs2_read_inode() returning\n")); + jffs2_dbg(1, "jffs2_read_inode() returning\n"); unlock_new_inode(inode); return inode; @@ -362,11 +370,13 @@ void jffs2_dirty_inode(struct inode *inode, int flags) struct iattr iattr; if (!(inode->i_state & I_DIRTY_DATASYNC)) { - D2(printk(KERN_DEBUG "jffs2_dirty_inode() not calling setattr() for ino #%lu\n", inode->i_ino)); + jffs2_dbg(2, "%s(): not calling setattr() for ino #%lu\n", + __func__, inode->i_ino); return; } - D1(printk(KERN_DEBUG "jffs2_dirty_inode() calling setattr() for ino #%lu\n", inode->i_ino)); + jffs2_dbg(1, "%s(): calling setattr() for ino #%lu\n", + __func__, inode->i_ino); iattr.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_MTIME|ATTR_CTIME; iattr.ia_mode = inode->i_mode; @@ -414,7 +424,8 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r struct jffs2_inode_info *f; int ret; - D1(printk(KERN_DEBUG "jffs2_new_inode(): dir_i %ld, mode 0x%x\n", dir_i->i_ino, mode)); + jffs2_dbg(1, "%s(): dir_i %ld, mode 0x%x\n", + __func__, dir_i->i_ino, mode); c = JFFS2_SB_INFO(sb); @@ -504,11 +515,11 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) #ifndef CONFIG_JFFS2_FS_WRITEBUFFER if (c->mtd->type == MTD_NANDFLASH) { - printk(KERN_ERR "jffs2: Cannot operate on NAND flash unless jffs2 NAND support is compiled in.\n"); + pr_err("Cannot operate on NAND flash unless jffs2 NAND support is compiled in\n"); return -EINVAL; } if (c->mtd->type == MTD_DATAFLASH) { - printk(KERN_ERR "jffs2: Cannot operate on DataFlash unless jffs2 DataFlash support is compiled in.\n"); + pr_err("Cannot operate on DataFlash unless jffs2 DataFlash support is compiled in\n"); return -EINVAL; } #endif @@ -522,12 +533,13 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) */ if ((c->sector_size * blocks) != c->flash_size) { c->flash_size = c->sector_size * blocks; - printk(KERN_INFO "jffs2: Flash size not aligned to erasesize, reducing to %dKiB\n", + pr_info("Flash size not aligned to erasesize, reducing to %dKiB\n", c->flash_size / 1024); } if (c->flash_size < 5*c->sector_size) { - printk(KERN_ERR "jffs2: Too few erase blocks (%d)\n", c->flash_size / c->sector_size); + pr_err("Too few erase blocks (%d)\n", + c->flash_size / c->sector_size); return -EINVAL; } @@ -550,17 +562,17 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) if ((ret = jffs2_do_mount_fs(c))) goto out_inohash; - D1(printk(KERN_DEBUG "jffs2_do_fill_super(): Getting root inode\n")); + jffs2_dbg(1, "%s(): Getting root inode\n", __func__); root_i = jffs2_iget(sb, 1); if (IS_ERR(root_i)) { - D1(printk(KERN_WARNING "get root inode failed\n")); + jffs2_dbg(1, "get root inode failed\n"); ret = PTR_ERR(root_i); goto out_root; } ret = -ENOMEM; - D1(printk(KERN_DEBUG "jffs2_do_fill_super(): d_alloc_root()\n")); + jffs2_dbg(1, "%s(): d_make_root()\n", __func__); sb->s_root = d_make_root(root_i); if (!sb->s_root) goto out_root; @@ -618,20 +630,21 @@ struct jffs2_inode_info *jffs2_gc_fetch_inode(struct jffs2_sb_info *c, */ inode = ilookup(OFNI_BS_2SFFJ(c), inum); if (!inode) { - D1(printk(KERN_DEBUG "ilookup() failed for ino #%u; inode is probably deleted.\n", - inum)); + jffs2_dbg(1, "ilookup() failed for ino #%u; inode is probably deleted.\n", + inum); spin_lock(&c->inocache_lock); ic = jffs2_get_ino_cache(c, inum); if (!ic) { - D1(printk(KERN_DEBUG "Inode cache for ino #%u is gone.\n", inum)); + jffs2_dbg(1, "Inode cache for ino #%u is gone\n", + inum); spin_unlock(&c->inocache_lock); return NULL; } if (ic->state != INO_STATE_CHECKEDABSENT) { /* Wait for progress. Don't just loop */ - D1(printk(KERN_DEBUG "Waiting for ino #%u in state %d\n", - ic->ino, ic->state)); + jffs2_dbg(1, "Waiting for ino #%u in state %d\n", + ic->ino, ic->state); sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); } else { spin_unlock(&c->inocache_lock); @@ -649,8 +662,8 @@ struct jffs2_inode_info *jffs2_gc_fetch_inode(struct jffs2_sb_info *c, return ERR_CAST(inode); } if (is_bad_inode(inode)) { - printk(KERN_NOTICE "Eep. read_inode() failed for ino #%u. unlinked %d\n", - inum, unlinked); + pr_notice("Eep. read_inode() failed for ino #%u. unlinked %d\n", + inum, unlinked); /* NB. This will happen again. We need to do something appropriate here. */ iput(inode); return ERR_PTR(-EIO); diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 31dce611337..ad271c70aa2 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -10,6 +10,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/mtd/mtd.h> #include <linux/slab.h> @@ -51,44 +53,44 @@ static struct jffs2_eraseblock *jffs2_find_gc_block(struct jffs2_sb_info *c) number of free blocks is low. */ again: if (!list_empty(&c->bad_used_list) && c->nr_free_blocks > c->resv_blocks_gcbad) { - D1(printk(KERN_DEBUG "Picking block from bad_used_list to GC next\n")); + jffs2_dbg(1, "Picking block from bad_used_list to GC next\n"); nextlist = &c->bad_used_list; } else if (n < 50 && !list_empty(&c->erasable_list)) { /* Note that most of them will have gone directly to be erased. So don't favour the erasable_list _too_ much. */ - D1(printk(KERN_DEBUG "Picking block from erasable_list to GC next\n")); + jffs2_dbg(1, "Picking block from erasable_list to GC next\n"); nextlist = &c->erasable_list; } else if (n < 110 && !list_empty(&c->very_dirty_list)) { /* Most of the time, pick one off the very_dirty list */ - D1(printk(KERN_DEBUG "Picking block from very_dirty_list to GC next\n")); + jffs2_dbg(1, "Picking block from very_dirty_list to GC next\n"); nextlist = &c->very_dirty_list; } else if (n < 126 && !list_empty(&c->dirty_list)) { - D1(printk(KERN_DEBUG "Picking block from dirty_list to GC next\n")); + jffs2_dbg(1, "Picking block from dirty_list to GC next\n"); nextlist = &c->dirty_list; } else if (!list_empty(&c->clean_list)) { - D1(printk(KERN_DEBUG "Picking block from clean_list to GC next\n")); + jffs2_dbg(1, "Picking block from clean_list to GC next\n"); nextlist = &c->clean_list; } else if (!list_empty(&c->dirty_list)) { - D1(printk(KERN_DEBUG "Picking block from dirty_list to GC next (clean_list was empty)\n")); + jffs2_dbg(1, "Picking block from dirty_list to GC next (clean_list was empty)\n"); nextlist = &c->dirty_list; } else if (!list_empty(&c->very_dirty_list)) { - D1(printk(KERN_DEBUG "Picking block from very_dirty_list to GC next (clean_list and dirty_list were empty)\n")); + jffs2_dbg(1, "Picking block from very_dirty_list to GC next (clean_list and dirty_list were empty)\n"); nextlist = &c->very_dirty_list; } else if (!list_empty(&c->erasable_list)) { - D1(printk(KERN_DEBUG "Picking block from erasable_list to GC next (clean_list and {very_,}dirty_list were empty)\n")); + jffs2_dbg(1, "Picking block from erasable_list to GC next (clean_list and {very_,}dirty_list were empty)\n"); nextlist = &c->erasable_list; } else if (!list_empty(&c->erasable_pending_wbuf_list)) { /* There are blocks are wating for the wbuf sync */ - D1(printk(KERN_DEBUG "Synching wbuf in order to reuse erasable_pending_wbuf_list blocks\n")); + jffs2_dbg(1, "Synching wbuf in order to reuse erasable_pending_wbuf_list blocks\n"); spin_unlock(&c->erase_completion_lock); jffs2_flush_wbuf_pad(c); spin_lock(&c->erase_completion_lock); goto again; } else { /* Eep. All were empty */ - D1(printk(KERN_NOTICE "jffs2: No clean, dirty _or_ erasable blocks to GC from! Where are they all?\n")); + jffs2_dbg(1, "No clean, dirty _or_ erasable blocks to GC from! Where are they all?\n"); return NULL; } @@ -97,13 +99,15 @@ again: c->gcblock = ret; ret->gc_node = ret->first_node; if (!ret->gc_node) { - printk(KERN_WARNING "Eep. ret->gc_node for block at 0x%08x is NULL\n", ret->offset); + pr_warn("Eep. ret->gc_node for block at 0x%08x is NULL\n", + ret->offset); BUG(); } /* Have we accidentally picked a clean block with wasted space ? */ if (ret->wasted_size) { - D1(printk(KERN_DEBUG "Converting wasted_size %08x to dirty_size\n", ret->wasted_size)); + jffs2_dbg(1, "Converting wasted_size %08x to dirty_size\n", + ret->wasted_size); ret->dirty_size += ret->wasted_size; c->wasted_size -= ret->wasted_size; c->dirty_size += ret->wasted_size; @@ -140,8 +144,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) /* checked_ino is protected by the alloc_sem */ if (c->checked_ino > c->highest_ino && xattr) { - printk(KERN_CRIT "Checked all inodes but still 0x%x bytes of unchecked space?\n", - c->unchecked_size); + pr_crit("Checked all inodes but still 0x%x bytes of unchecked space?\n", + c->unchecked_size); jffs2_dbg_dump_block_lists_nolock(c); spin_unlock(&c->erase_completion_lock); mutex_unlock(&c->alloc_sem); @@ -163,8 +167,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) } if (!ic->pino_nlink) { - D1(printk(KERN_DEBUG "Skipping check of ino #%d with nlink/pino zero\n", - ic->ino)); + jffs2_dbg(1, "Skipping check of ino #%d with nlink/pino zero\n", + ic->ino); spin_unlock(&c->inocache_lock); jffs2_xattr_delete_inode(c, ic); continue; @@ -172,13 +176,15 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) switch(ic->state) { case INO_STATE_CHECKEDABSENT: case INO_STATE_PRESENT: - D1(printk(KERN_DEBUG "Skipping ino #%u already checked\n", ic->ino)); + jffs2_dbg(1, "Skipping ino #%u already checked\n", + ic->ino); spin_unlock(&c->inocache_lock); continue; case INO_STATE_GC: case INO_STATE_CHECKING: - printk(KERN_WARNING "Inode #%u is in state %d during CRC check phase!\n", ic->ino, ic->state); + pr_warn("Inode #%u is in state %d during CRC check phase!\n", + ic->ino, ic->state); spin_unlock(&c->inocache_lock); BUG(); @@ -186,7 +192,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) /* We need to wait for it to finish, lest we move on and trigger the BUG() above while we haven't yet finished checking all its nodes */ - D1(printk(KERN_DEBUG "Waiting for ino #%u to finish reading\n", ic->ino)); + jffs2_dbg(1, "Waiting for ino #%u to finish reading\n", + ic->ino); /* We need to come back again for the _same_ inode. We've made no progress in this case, but that should be OK */ c->checked_ino--; @@ -204,11 +211,13 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) ic->state = INO_STATE_CHECKING; spin_unlock(&c->inocache_lock); - D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() triggering inode scan of ino#%u\n", ic->ino)); + jffs2_dbg(1, "%s(): triggering inode scan of ino#%u\n", + __func__, ic->ino); ret = jffs2_do_crccheck_inode(c, ic); if (ret) - printk(KERN_WARNING "Returned error for crccheck of ino #%u. Expect badness...\n", ic->ino); + pr_warn("Returned error for crccheck of ino #%u. Expect badness...\n", + ic->ino); jffs2_set_inocache_state(c, ic, INO_STATE_CHECKEDABSENT); mutex_unlock(&c->alloc_sem); @@ -220,11 +229,11 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) !list_empty(&c->erase_pending_list)) { spin_unlock(&c->erase_completion_lock); mutex_unlock(&c->alloc_sem); - D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n")); + jffs2_dbg(1, "%s(): erasing pending blocks\n", __func__); if (jffs2_erase_pending_blocks(c, 1)) return 0; - D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n")); + jffs2_dbg(1, "No progress from erasing block; doing GC anyway\n"); spin_lock(&c->erase_completion_lock); mutex_lock(&c->alloc_sem); } @@ -242,13 +251,14 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) mutex_unlock(&c->alloc_sem); return -EAGAIN; } - D1(printk(KERN_NOTICE "jffs2: Couldn't find erase block to garbage collect!\n")); + jffs2_dbg(1, "Couldn't find erase block to garbage collect!\n"); spin_unlock(&c->erase_completion_lock); mutex_unlock(&c->alloc_sem); return -EIO; } - D1(printk(KERN_DEBUG "GC from block %08x, used_size %08x, dirty_size %08x, free_size %08x\n", jeb->offset, jeb->used_size, jeb->dirty_size, jeb->free_size)); + jffs2_dbg(1, "GC from block %08x, used_size %08x, dirty_size %08x, free_size %08x\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->free_size); D1(if (c->nextblock) printk(KERN_DEBUG "Nextblock at %08x, used_size %08x, dirty_size %08x, wasted_size %08x, free_size %08x\n", c->nextblock->offset, c->nextblock->used_size, c->nextblock->dirty_size, c->nextblock->wasted_size, c->nextblock->free_size)); @@ -261,12 +271,14 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) gcblock_dirty = jeb->dirty_size; while(ref_obsolete(raw)) { - D1(printk(KERN_DEBUG "Node at 0x%08x is obsolete... skipping\n", ref_offset(raw))); + jffs2_dbg(1, "Node at 0x%08x is obsolete... skipping\n", + ref_offset(raw)); raw = ref_next(raw); if (unlikely(!raw)) { - printk(KERN_WARNING "eep. End of raw list while still supposedly nodes to GC\n"); - printk(KERN_WARNING "erase block at 0x%08x. free_size 0x%08x, dirty_size 0x%08x, used_size 0x%08x\n", - jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size); + pr_warn("eep. End of raw list while still supposedly nodes to GC\n"); + pr_warn("erase block at 0x%08x. free_size 0x%08x, dirty_size 0x%08x, used_size 0x%08x\n", + jeb->offset, jeb->free_size, + jeb->dirty_size, jeb->used_size); jeb->gc_node = raw; spin_unlock(&c->erase_completion_lock); mutex_unlock(&c->alloc_sem); @@ -275,7 +287,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) } jeb->gc_node = raw; - D1(printk(KERN_DEBUG "Going to garbage collect node at 0x%08x\n", ref_offset(raw))); + jffs2_dbg(1, "Going to garbage collect node at 0x%08x\n", + ref_offset(raw)); if (!raw->next_in_ino) { /* Inode-less node. Clean marker, snapshot or something like that */ @@ -316,7 +329,9 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) spin_unlock(&c->erase_completion_lock); - D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass collecting from block @0x%08x. Node @0x%08x(%d), ino #%u\n", jeb->offset, ref_offset(raw), ref_flags(raw), ic->ino)); + jffs2_dbg(1, "%s(): collecting from block @0x%08x. Node @0x%08x(%d), ino #%u\n", + __func__, jeb->offset, ref_offset(raw), ref_flags(raw), + ic->ino); /* Three possibilities: 1. Inode is already in-core. We must iget it and do proper @@ -336,8 +351,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) if (ref_flags(raw) == REF_PRISTINE) ic->state = INO_STATE_GC; else { - D1(printk(KERN_DEBUG "Ino #%u is absent but node not REF_PRISTINE. Reading.\n", - ic->ino)); + jffs2_dbg(1, "Ino #%u is absent but node not REF_PRISTINE. Reading.\n", + ic->ino); } break; @@ -353,8 +368,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) we're holding the alloc_sem, no other garbage collection can happen. */ - printk(KERN_CRIT "Inode #%u already in state %d in jffs2_garbage_collect_pass()!\n", - ic->ino, ic->state); + pr_crit("Inode #%u already in state %d in jffs2_garbage_collect_pass()!\n", + ic->ino, ic->state); mutex_unlock(&c->alloc_sem); spin_unlock(&c->inocache_lock); BUG(); @@ -367,8 +382,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) drop the alloc_sem before sleeping. */ mutex_unlock(&c->alloc_sem); - D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() waiting for ino #%u in state %d\n", - ic->ino, ic->state)); + jffs2_dbg(1, "%s(): waiting for ino #%u in state %d\n", + __func__, ic->ino, ic->state); sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); /* And because we dropped the alloc_sem we must start again from the beginning. Ponder chance of livelock here -- we're returning success @@ -433,7 +448,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) test_gcnode: if (jeb->dirty_size == gcblock_dirty && !ref_obsolete(jeb->gc_node)) { /* Eep. This really should never happen. GC is broken */ - printk(KERN_ERR "Error garbage collecting node at %08x!\n", ref_offset(jeb->gc_node)); + pr_err("Error garbage collecting node at %08x!\n", + ref_offset(jeb->gc_node)); ret = -ENOSPC; } release_sem: @@ -445,7 +461,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) eraseit: if (c->gcblock && !c->gcblock->used_size) { - D1(printk(KERN_DEBUG "Block at 0x%08x completely obsoleted by GC. Moving to erase_pending_list\n", c->gcblock->offset)); + jffs2_dbg(1, "Block at 0x%08x completely obsoleted by GC. Moving to erase_pending_list\n", + c->gcblock->offset); /* We're GC'ing an empty block? */ list_add_tail(&c->gcblock->list, &c->erase_pending_list); c->gcblock = NULL; @@ -475,12 +492,12 @@ static int jffs2_garbage_collect_live(struct jffs2_sb_info *c, struct jffs2_era if (c->gcblock != jeb) { spin_unlock(&c->erase_completion_lock); - D1(printk(KERN_DEBUG "GC block is no longer gcblock. Restart\n")); + jffs2_dbg(1, "GC block is no longer gcblock. Restart\n"); goto upnout; } if (ref_obsolete(raw)) { spin_unlock(&c->erase_completion_lock); - D1(printk(KERN_DEBUG "node to be GC'd was obsoleted in the meantime.\n")); + jffs2_dbg(1, "node to be GC'd was obsoleted in the meantime.\n"); /* They'll call again */ goto upnout; } @@ -536,10 +553,10 @@ static int jffs2_garbage_collect_live(struct jffs2_sb_info *c, struct jffs2_era } else if (fd) { ret = jffs2_garbage_collect_deletion_dirent(c, jeb, f, fd); } else { - printk(KERN_WARNING "Raw node at 0x%08x wasn't in node lists for ino #%u\n", - ref_offset(raw), f->inocache->ino); + pr_warn("Raw node at 0x%08x wasn't in node lists for ino #%u\n", + ref_offset(raw), f->inocache->ino); if (ref_obsolete(raw)) { - printk(KERN_WARNING "But it's obsolete so we don't mind too much\n"); + pr_warn("But it's obsolete so we don't mind too much\n"); } else { jffs2_dbg_dump_node(c, ref_offset(raw)); BUG(); @@ -562,7 +579,8 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c, uint32_t crc, rawlen; int retried = 0; - D1(printk(KERN_DEBUG "Going to GC REF_PRISTINE node at 0x%08x\n", ref_offset(raw))); + jffs2_dbg(1, "Going to GC REF_PRISTINE node at 0x%08x\n", + ref_offset(raw)); alloclen = rawlen = ref_totlen(c, c->gcblock, raw); @@ -595,8 +613,8 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c, crc = crc32(0, node, sizeof(struct jffs2_unknown_node)-4); if (je32_to_cpu(node->u.hdr_crc) != crc) { - printk(KERN_WARNING "Header CRC failed on REF_PRISTINE node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", - ref_offset(raw), je32_to_cpu(node->u.hdr_crc), crc); + pr_warn("Header CRC failed on REF_PRISTINE node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), je32_to_cpu(node->u.hdr_crc), crc); goto bail; } @@ -604,16 +622,18 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c, case JFFS2_NODETYPE_INODE: crc = crc32(0, node, sizeof(node->i)-8); if (je32_to_cpu(node->i.node_crc) != crc) { - printk(KERN_WARNING "Node CRC failed on REF_PRISTINE data node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", - ref_offset(raw), je32_to_cpu(node->i.node_crc), crc); + pr_warn("Node CRC failed on REF_PRISTINE data node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), je32_to_cpu(node->i.node_crc), + crc); goto bail; } if (je32_to_cpu(node->i.dsize)) { crc = crc32(0, node->i.data, je32_to_cpu(node->i.csize)); if (je32_to_cpu(node->i.data_crc) != crc) { - printk(KERN_WARNING "Data CRC failed on REF_PRISTINE data node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", - ref_offset(raw), je32_to_cpu(node->i.data_crc), crc); + pr_warn("Data CRC failed on REF_PRISTINE data node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), + je32_to_cpu(node->i.data_crc), crc); goto bail; } } @@ -622,21 +642,24 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c, case JFFS2_NODETYPE_DIRENT: crc = crc32(0, node, sizeof(node->d)-8); if (je32_to_cpu(node->d.node_crc) != crc) { - printk(KERN_WARNING "Node CRC failed on REF_PRISTINE dirent node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", - ref_offset(raw), je32_to_cpu(node->d.node_crc), crc); + pr_warn("Node CRC failed on REF_PRISTINE dirent node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), + je32_to_cpu(node->d.node_crc), crc); goto bail; } if (strnlen(node->d.name, node->d.nsize) != node->d.nsize) { - printk(KERN_WARNING "Name in dirent node at 0x%08x contains zeroes\n", ref_offset(raw)); + pr_warn("Name in dirent node at 0x%08x contains zeroes\n", + ref_offset(raw)); goto bail; } if (node->d.nsize) { crc = crc32(0, node->d.name, node->d.nsize); if (je32_to_cpu(node->d.name_crc) != crc) { - printk(KERN_WARNING "Name CRC failed on REF_PRISTINE dirent node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", - ref_offset(raw), je32_to_cpu(node->d.name_crc), crc); + pr_warn("Name CRC failed on REF_PRISTINE dirent node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + ref_offset(raw), + je32_to_cpu(node->d.name_crc), crc); goto bail; } } @@ -644,8 +667,8 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c, default: /* If it's inode-less, we don't _know_ what it is. Just copy it intact */ if (ic) { - printk(KERN_WARNING "Unknown node type for REF_PRISTINE node at 0x%08x: 0x%04x\n", - ref_offset(raw), je16_to_cpu(node->u.nodetype)); + pr_warn("Unknown node type for REF_PRISTINE node at 0x%08x: 0x%04x\n", + ref_offset(raw), je16_to_cpu(node->u.nodetype)); goto bail; } } @@ -657,12 +680,13 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c, ret = jffs2_flash_write(c, phys_ofs, rawlen, &retlen, (char *)node); if (ret || (retlen != rawlen)) { - printk(KERN_NOTICE "Write of %d bytes at 0x%08x failed. returned %d, retlen %zd\n", - rawlen, phys_ofs, ret, retlen); + pr_notice("Write of %d bytes at 0x%08x failed. returned %d, retlen %zd\n", + rawlen, phys_ofs, ret, retlen); if (retlen) { jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, rawlen, NULL); } else { - printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", phys_ofs); + pr_notice("Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", + phys_ofs); } if (!retried) { /* Try to reallocate space and retry */ @@ -671,7 +695,7 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c, retried = 1; - D1(printk(KERN_DEBUG "Retrying failed write of REF_PRISTINE node.\n")); + jffs2_dbg(1, "Retrying failed write of REF_PRISTINE node.\n"); jffs2_dbg_acct_sanity_check(c,jeb); jffs2_dbg_acct_paranoia_check(c, jeb); @@ -681,14 +705,16 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c, it is only an upper estimation */ if (!ret) { - D1(printk(KERN_DEBUG "Allocated space at 0x%08x to retry failed write.\n", phys_ofs)); + jffs2_dbg(1, "Allocated space at 0x%08x to retry failed write.\n", + phys_ofs); jffs2_dbg_acct_sanity_check(c,jeb); jffs2_dbg_acct_paranoia_check(c, jeb); goto retry; } - D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret)); + jffs2_dbg(1, "Failed to allocate space to retry failed write: %d!\n", + ret); } if (!ret) @@ -698,7 +724,8 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c, jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, rawlen, ic); jffs2_mark_node_obsolete(c, raw); - D1(printk(KERN_DEBUG "WHEEE! GC REF_PRISTINE node at 0x%08x succeeded\n", ref_offset(raw))); + jffs2_dbg(1, "WHEEE! GC REF_PRISTINE node at 0x%08x succeeded\n", + ref_offset(raw)); out_node: kfree(node); @@ -725,29 +752,32 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_ /* For these, we don't actually need to read the old node */ mdatalen = jffs2_encode_dev(&dev, JFFS2_F_I_RDEV(f)); mdata = (char *)&dev; - D1(printk(KERN_DEBUG "jffs2_garbage_collect_metadata(): Writing %d bytes of kdev_t\n", mdatalen)); + jffs2_dbg(1, "%s(): Writing %d bytes of kdev_t\n", + __func__, mdatalen); } else if (S_ISLNK(JFFS2_F_I_MODE(f))) { mdatalen = fn->size; mdata = kmalloc(fn->size, GFP_KERNEL); if (!mdata) { - printk(KERN_WARNING "kmalloc of mdata failed in jffs2_garbage_collect_metadata()\n"); + pr_warn("kmalloc of mdata failed in jffs2_garbage_collect_metadata()\n"); return -ENOMEM; } ret = jffs2_read_dnode(c, f, fn, mdata, 0, mdatalen); if (ret) { - printk(KERN_WARNING "read of old metadata failed in jffs2_garbage_collect_metadata(): %d\n", ret); + pr_warn("read of old metadata failed in jffs2_garbage_collect_metadata(): %d\n", + ret); kfree(mdata); return ret; } - D1(printk(KERN_DEBUG "jffs2_garbage_collect_metadata(): Writing %d bites of symlink target\n", mdatalen)); + jffs2_dbg(1, "%s(): Writing %d bites of symlink target\n", + __func__, mdatalen); } ret = jffs2_reserve_space_gc(c, sizeof(ri) + mdatalen, &alloclen, JFFS2_SUMMARY_INODE_SIZE); if (ret) { - printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_metadata failed: %d\n", - sizeof(ri)+ mdatalen, ret); + pr_warn("jffs2_reserve_space_gc of %zd bytes for garbage_collect_metadata failed: %d\n", + sizeof(ri) + mdatalen, ret); goto out; } @@ -784,7 +814,7 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_ new_fn = jffs2_write_dnode(c, f, &ri, mdata, mdatalen, ALLOC_GC); if (IS_ERR(new_fn)) { - printk(KERN_WARNING "Error writing new dnode: %ld\n", PTR_ERR(new_fn)); + pr_warn("Error writing new dnode: %ld\n", PTR_ERR(new_fn)); ret = PTR_ERR(new_fn); goto out; } @@ -827,14 +857,15 @@ static int jffs2_garbage_collect_dirent(struct jffs2_sb_info *c, struct jffs2_er ret = jffs2_reserve_space_gc(c, sizeof(rd)+rd.nsize, &alloclen, JFFS2_SUMMARY_DIRENT_SIZE(rd.nsize)); if (ret) { - printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_dirent failed: %d\n", - sizeof(rd)+rd.nsize, ret); + pr_warn("jffs2_reserve_space_gc of %zd bytes for garbage_collect_dirent failed: %d\n", + sizeof(rd)+rd.nsize, ret); return ret; } new_fd = jffs2_write_dirent(c, f, &rd, fd->name, rd.nsize, ALLOC_GC); if (IS_ERR(new_fd)) { - printk(KERN_WARNING "jffs2_write_dirent in garbage_collect_dirent failed: %ld\n", PTR_ERR(new_fd)); + pr_warn("jffs2_write_dirent in garbage_collect_dirent failed: %ld\n", + PTR_ERR(new_fd)); return PTR_ERR(new_fd); } jffs2_add_fd_to_list(c, new_fd, &f->dents); @@ -887,19 +918,22 @@ static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct if (SECTOR_ADDR(raw->flash_offset) == SECTOR_ADDR(fd->raw->flash_offset)) continue; - D1(printk(KERN_DEBUG "Check potential deletion dirent at %08x\n", ref_offset(raw))); + jffs2_dbg(1, "Check potential deletion dirent at %08x\n", + ref_offset(raw)); /* This is an obsolete node belonging to the same directory, and it's of the right length. We need to take a closer look...*/ ret = jffs2_flash_read(c, ref_offset(raw), rawlen, &retlen, (char *)rd); if (ret) { - printk(KERN_WARNING "jffs2_g_c_deletion_dirent(): Read error (%d) reading obsolete node at %08x\n", ret, ref_offset(raw)); + pr_warn("%s(): Read error (%d) reading obsolete node at %08x\n", + __func__, ret, ref_offset(raw)); /* If we can't read it, we don't need to continue to obsolete it. Continue */ continue; } if (retlen != rawlen) { - printk(KERN_WARNING "jffs2_g_c_deletion_dirent(): Short read (%zd not %u) reading header from obsolete node at %08x\n", - retlen, rawlen, ref_offset(raw)); + pr_warn("%s(): Short read (%zd not %u) reading header from obsolete node at %08x\n", + __func__, retlen, rawlen, + ref_offset(raw)); continue; } @@ -923,8 +957,9 @@ static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct a new deletion dirent to replace it */ mutex_unlock(&c->erase_free_sem); - D1(printk(KERN_DEBUG "Deletion dirent at %08x still obsoletes real dirent \"%s\" at %08x for ino #%u\n", - ref_offset(fd->raw), fd->name, ref_offset(raw), je32_to_cpu(rd->ino))); + jffs2_dbg(1, "Deletion dirent at %08x still obsoletes real dirent \"%s\" at %08x for ino #%u\n", + ref_offset(fd->raw), fd->name, + ref_offset(raw), je32_to_cpu(rd->ino)); kfree(rd); return jffs2_garbage_collect_dirent(c, jeb, f, fd); @@ -947,7 +982,8 @@ static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct fdp = &(*fdp)->next; } if (!found) { - printk(KERN_WARNING "Deletion dirent \"%s\" not found in list for ino #%u\n", fd->name, f->inocache->ino); + pr_warn("Deletion dirent \"%s\" not found in list for ino #%u\n", + fd->name, f->inocache->ino); } jffs2_mark_node_obsolete(c, fd->raw); jffs2_free_full_dirent(fd); @@ -964,8 +1000,8 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras uint32_t alloclen, ilen; int ret; - D1(printk(KERN_DEBUG "Writing replacement hole node for ino #%u from offset 0x%x to 0x%x\n", - f->inocache->ino, start, end)); + jffs2_dbg(1, "Writing replacement hole node for ino #%u from offset 0x%x to 0x%x\n", + f->inocache->ino, start, end); memset(&ri, 0, sizeof(ri)); @@ -976,35 +1012,37 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras write it out again with the _same_ version as before */ ret = jffs2_flash_read(c, ref_offset(fn->raw), sizeof(ri), &readlen, (char *)&ri); if (readlen != sizeof(ri) || ret) { - printk(KERN_WARNING "Node read failed in jffs2_garbage_collect_hole. Ret %d, retlen %zd. Data will be lost by writing new hole node\n", ret, readlen); + pr_warn("Node read failed in jffs2_garbage_collect_hole. Ret %d, retlen %zd. Data will be lost by writing new hole node\n", + ret, readlen); goto fill; } if (je16_to_cpu(ri.nodetype) != JFFS2_NODETYPE_INODE) { - printk(KERN_WARNING "jffs2_garbage_collect_hole: Node at 0x%08x had node type 0x%04x instead of JFFS2_NODETYPE_INODE(0x%04x)\n", - ref_offset(fn->raw), - je16_to_cpu(ri.nodetype), JFFS2_NODETYPE_INODE); + pr_warn("%s(): Node at 0x%08x had node type 0x%04x instead of JFFS2_NODETYPE_INODE(0x%04x)\n", + __func__, ref_offset(fn->raw), + je16_to_cpu(ri.nodetype), JFFS2_NODETYPE_INODE); return -EIO; } if (je32_to_cpu(ri.totlen) != sizeof(ri)) { - printk(KERN_WARNING "jffs2_garbage_collect_hole: Node at 0x%08x had totlen 0x%x instead of expected 0x%zx\n", - ref_offset(fn->raw), - je32_to_cpu(ri.totlen), sizeof(ri)); + pr_warn("%s(): Node at 0x%08x had totlen 0x%x instead of expected 0x%zx\n", + __func__, ref_offset(fn->raw), + je32_to_cpu(ri.totlen), sizeof(ri)); return -EIO; } crc = crc32(0, &ri, sizeof(ri)-8); if (crc != je32_to_cpu(ri.node_crc)) { - printk(KERN_WARNING "jffs2_garbage_collect_hole: Node at 0x%08x had CRC 0x%08x which doesn't match calculated CRC 0x%08x\n", - ref_offset(fn->raw), - je32_to_cpu(ri.node_crc), crc); + pr_warn("%s: Node at 0x%08x had CRC 0x%08x which doesn't match calculated CRC 0x%08x\n", + __func__, ref_offset(fn->raw), + je32_to_cpu(ri.node_crc), crc); /* FIXME: We could possibly deal with this by writing new holes for each frag */ - printk(KERN_WARNING "Data in the range 0x%08x to 0x%08x of inode #%u will be lost\n", - start, end, f->inocache->ino); + pr_warn("Data in the range 0x%08x to 0x%08x of inode #%u will be lost\n", + start, end, f->inocache->ino); goto fill; } if (ri.compr != JFFS2_COMPR_ZERO) { - printk(KERN_WARNING "jffs2_garbage_collect_hole: Node 0x%08x wasn't a hole node!\n", ref_offset(fn->raw)); - printk(KERN_WARNING "Data in the range 0x%08x to 0x%08x of inode #%u will be lost\n", - start, end, f->inocache->ino); + pr_warn("%s(): Node 0x%08x wasn't a hole node!\n", + __func__, ref_offset(fn->raw)); + pr_warn("Data in the range 0x%08x to 0x%08x of inode #%u will be lost\n", + start, end, f->inocache->ino); goto fill; } } else { @@ -1043,14 +1081,14 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras ret = jffs2_reserve_space_gc(c, sizeof(ri), &alloclen, JFFS2_SUMMARY_INODE_SIZE); if (ret) { - printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_hole failed: %d\n", - sizeof(ri), ret); + pr_warn("jffs2_reserve_space_gc of %zd bytes for garbage_collect_hole failed: %d\n", + sizeof(ri), ret); return ret; } new_fn = jffs2_write_dnode(c, f, &ri, NULL, 0, ALLOC_GC); if (IS_ERR(new_fn)) { - printk(KERN_WARNING "Error writing new hole node: %ld\n", PTR_ERR(new_fn)); + pr_warn("Error writing new hole node: %ld\n", PTR_ERR(new_fn)); return PTR_ERR(new_fn); } if (je32_to_cpu(ri.version) == f->highest_version) { @@ -1070,9 +1108,9 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras * above.) */ D1(if(unlikely(fn->frags <= 1)) { - printk(KERN_WARNING "jffs2_garbage_collect_hole: Replacing fn with %d frag(s) but new ver %d != highest_version %d of ino #%d\n", - fn->frags, je32_to_cpu(ri.version), f->highest_version, - je32_to_cpu(ri.ino)); + pr_warn("%s(): Replacing fn with %d frag(s) but new ver %d != highest_version %d of ino #%d\n", + __func__, fn->frags, je32_to_cpu(ri.version), + f->highest_version, je32_to_cpu(ri.ino)); }); /* This is a partially-overlapped hole node. Mark it REF_NORMAL not REF_PRISTINE */ @@ -1089,11 +1127,11 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras } } if (fn->frags) { - printk(KERN_WARNING "jffs2_garbage_collect_hole: Old node still has frags!\n"); + pr_warn("%s(): Old node still has frags!\n", __func__); BUG(); } if (!new_fn->frags) { - printk(KERN_WARNING "jffs2_garbage_collect_hole: New node has no frags!\n"); + pr_warn("%s(): New node has no frags!\n", __func__); BUG(); } @@ -1117,8 +1155,8 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era memset(&ri, 0, sizeof(ri)); - D1(printk(KERN_DEBUG "Writing replacement dnode for ino #%u from offset 0x%x to 0x%x\n", - f->inocache->ino, start, end)); + jffs2_dbg(1, "Writing replacement dnode for ino #%u from offset 0x%x to 0x%x\n", + f->inocache->ino, start, end); orig_end = end; orig_start = start; @@ -1149,15 +1187,15 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era /* If the previous frag doesn't even reach the beginning, there's excessive fragmentation. Just merge. */ if (frag->ofs > min) { - D1(printk(KERN_DEBUG "Expanding down to cover partial frag (0x%x-0x%x)\n", - frag->ofs, frag->ofs+frag->size)); + jffs2_dbg(1, "Expanding down to cover partial frag (0x%x-0x%x)\n", + frag->ofs, frag->ofs+frag->size); start = frag->ofs; continue; } /* OK. This frag holds the first byte of the page. */ if (!frag->node || !frag->node->raw) { - D1(printk(KERN_DEBUG "First frag in page is hole (0x%x-0x%x). Not expanding down.\n", - frag->ofs, frag->ofs+frag->size)); + jffs2_dbg(1, "First frag in page is hole (0x%x-0x%x). Not expanding down.\n", + frag->ofs, frag->ofs+frag->size); break; } else { @@ -1171,19 +1209,25 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era jeb = &c->blocks[raw->flash_offset / c->sector_size]; if (jeb == c->gcblock) { - D1(printk(KERN_DEBUG "Expanding down to cover frag (0x%x-0x%x) in gcblock at %08x\n", - frag->ofs, frag->ofs+frag->size, ref_offset(raw))); + jffs2_dbg(1, "Expanding down to cover frag (0x%x-0x%x) in gcblock at %08x\n", + frag->ofs, + frag->ofs + frag->size, + ref_offset(raw)); start = frag->ofs; break; } if (!ISDIRTY(jeb->dirty_size + jeb->wasted_size)) { - D1(printk(KERN_DEBUG "Not expanding down to cover frag (0x%x-0x%x) in clean block %08x\n", - frag->ofs, frag->ofs+frag->size, jeb->offset)); + jffs2_dbg(1, "Not expanding down to cover frag (0x%x-0x%x) in clean block %08x\n", + frag->ofs, + frag->ofs + frag->size, + jeb->offset); break; } - D1(printk(KERN_DEBUG "Expanding down to cover frag (0x%x-0x%x) in dirty block %08x\n", - frag->ofs, frag->ofs+frag->size, jeb->offset)); + jffs2_dbg(1, "Expanding down to cover frag (0x%x-0x%x) in dirty block %08x\n", + frag->ofs, + frag->ofs + frag->size, + jeb->offset); start = frag->ofs; break; } @@ -1199,15 +1243,15 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era /* If the previous frag doesn't even reach the beginning, there's lots of fragmentation. Just merge. */ if (frag->ofs+frag->size < max) { - D1(printk(KERN_DEBUG "Expanding up to cover partial frag (0x%x-0x%x)\n", - frag->ofs, frag->ofs+frag->size)); + jffs2_dbg(1, "Expanding up to cover partial frag (0x%x-0x%x)\n", + frag->ofs, frag->ofs+frag->size); end = frag->ofs + frag->size; continue; } if (!frag->node || !frag->node->raw) { - D1(printk(KERN_DEBUG "Last frag in page is hole (0x%x-0x%x). Not expanding up.\n", - frag->ofs, frag->ofs+frag->size)); + jffs2_dbg(1, "Last frag in page is hole (0x%x-0x%x). Not expanding up.\n", + frag->ofs, frag->ofs+frag->size); break; } else { @@ -1221,25 +1265,31 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era jeb = &c->blocks[raw->flash_offset / c->sector_size]; if (jeb == c->gcblock) { - D1(printk(KERN_DEBUG "Expanding up to cover frag (0x%x-0x%x) in gcblock at %08x\n", - frag->ofs, frag->ofs+frag->size, ref_offset(raw))); + jffs2_dbg(1, "Expanding up to cover frag (0x%x-0x%x) in gcblock at %08x\n", + frag->ofs, + frag->ofs + frag->size, + ref_offset(raw)); end = frag->ofs + frag->size; break; } if (!ISDIRTY(jeb->dirty_size + jeb->wasted_size)) { - D1(printk(KERN_DEBUG "Not expanding up to cover frag (0x%x-0x%x) in clean block %08x\n", - frag->ofs, frag->ofs+frag->size, jeb->offset)); + jffs2_dbg(1, "Not expanding up to cover frag (0x%x-0x%x) in clean block %08x\n", + frag->ofs, + frag->ofs + frag->size, + jeb->offset); break; } - D1(printk(KERN_DEBUG "Expanding up to cover frag (0x%x-0x%x) in dirty block %08x\n", - frag->ofs, frag->ofs+frag->size, jeb->offset)); + jffs2_dbg(1, "Expanding up to cover frag (0x%x-0x%x) in dirty block %08x\n", + frag->ofs, + frag->ofs + frag->size, + jeb->offset); end = frag->ofs + frag->size; break; } } - D1(printk(KERN_DEBUG "Expanded dnode to write from (0x%x-0x%x) to (0x%x-0x%x)\n", - orig_start, orig_end, start, end)); + jffs2_dbg(1, "Expanded dnode to write from (0x%x-0x%x) to (0x%x-0x%x)\n", + orig_start, orig_end, start, end); D1(BUG_ON(end > frag_last(&f->fragtree)->ofs + frag_last(&f->fragtree)->size)); BUG_ON(end < orig_end); @@ -1256,7 +1306,8 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era pg_ptr = jffs2_gc_fetch_page(c, f, start, &pg); if (IS_ERR(pg_ptr)) { - printk(KERN_WARNING "read_cache_page() returned error: %ld\n", PTR_ERR(pg_ptr)); + pr_warn("read_cache_page() returned error: %ld\n", + PTR_ERR(pg_ptr)); return PTR_ERR(pg_ptr); } @@ -1270,8 +1321,8 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era &alloclen, JFFS2_SUMMARY_INODE_SIZE); if (ret) { - printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_dnode failed: %d\n", - sizeof(ri)+ JFFS2_MIN_DATA_LEN, ret); + pr_warn("jffs2_reserve_space_gc of %zd bytes for garbage_collect_dnode failed: %d\n", + sizeof(ri) + JFFS2_MIN_DATA_LEN, ret); break; } cdatalen = min_t(uint32_t, alloclen - sizeof(ri), end - offset); @@ -1308,7 +1359,8 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era jffs2_free_comprbuf(comprbuf, writebuf); if (IS_ERR(new_fn)) { - printk(KERN_WARNING "Error writing new dnode: %ld\n", PTR_ERR(new_fn)); + pr_warn("Error writing new dnode: %ld\n", + PTR_ERR(new_fn)); ret = PTR_ERR(new_fn); break; } diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c index c082868910f..4f47aa24b55 100644 --- a/fs/jffs2/malloc.c +++ b/fs/jffs2/malloc.c @@ -9,6 +9,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c index 5e03233c236..975a1f562c1 100644 --- a/fs/jffs2/nodelist.c +++ b/fs/jffs2/nodelist.c @@ -9,6 +9,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/sched.h> #include <linux/fs.h> @@ -687,8 +689,8 @@ int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb if (!size) return 0; if (unlikely(size > jeb->free_size)) { - printk(KERN_CRIT "Dirty space 0x%x larger then free_size 0x%x (wasted 0x%x)\n", - size, jeb->free_size, jeb->wasted_size); + pr_crit("Dirty space 0x%x larger then free_size 0x%x (wasted 0x%x)\n", + size, jeb->free_size, jeb->wasted_size); BUG(); } /* REF_EMPTY_NODE is !obsolete, so that works OK */ @@ -726,8 +728,10 @@ static inline uint32_t __ref_totlen(struct jffs2_sb_info *c, /* Last node in block. Use free_space */ if (unlikely(ref != jeb->last_node)) { - printk(KERN_CRIT "ref %p @0x%08x is not jeb->last_node (%p @0x%08x)\n", - ref, ref_offset(ref), jeb->last_node, jeb->last_node?ref_offset(jeb->last_node):0); + pr_crit("ref %p @0x%08x is not jeb->last_node (%p @0x%08x)\n", + ref, ref_offset(ref), jeb->last_node, + jeb->last_node ? + ref_offset(jeb->last_node) : 0); BUG(); } ref_end = jeb->offset + c->sector_size - jeb->free_size; @@ -747,16 +751,20 @@ uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c, struct jffs2_eraseblock *je if (!jeb) jeb = &c->blocks[ref->flash_offset / c->sector_size]; - printk(KERN_CRIT "Totlen for ref at %p (0x%08x-0x%08x) miscalculated as 0x%x instead of %x\n", - ref, ref_offset(ref), ref_offset(ref)+ref->__totlen, - ret, ref->__totlen); + pr_crit("Totlen for ref at %p (0x%08x-0x%08x) miscalculated as 0x%x instead of %x\n", + ref, ref_offset(ref), ref_offset(ref) + ref->__totlen, + ret, ref->__totlen); if (ref_next(ref)) { - printk(KERN_CRIT "next %p (0x%08x-0x%08x)\n", ref_next(ref), ref_offset(ref_next(ref)), - ref_offset(ref_next(ref))+ref->__totlen); + pr_crit("next %p (0x%08x-0x%08x)\n", + ref_next(ref), ref_offset(ref_next(ref)), + ref_offset(ref_next(ref)) + ref->__totlen); } else - printk(KERN_CRIT "No next ref. jeb->last_node is %p\n", jeb->last_node); + pr_crit("No next ref. jeb->last_node is %p\n", + jeb->last_node); - printk(KERN_CRIT "jeb->wasted_size %x, dirty_size %x, used_size %x, free_size %x\n", jeb->wasted_size, jeb->dirty_size, jeb->used_size, jeb->free_size); + pr_crit("jeb->wasted_size %x, dirty_size %x, used_size %x, free_size %x\n", + jeb->wasted_size, jeb->dirty_size, jeb->used_size, + jeb->free_size); #if defined(JFFS2_DBG_DUMPS) || defined(JFFS2_DBG_PARANOIA_CHECKS) __jffs2_dbg_dump_node_refs_nolock(c, jeb); diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index 694aa5b0350..6784d1e7a7e 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -9,6 +9,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/mtd/mtd.h> #include <linux/compiler.h> @@ -46,10 +48,10 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, /* align it */ minsize = PAD(minsize); - D1(printk(KERN_DEBUG "jffs2_reserve_space(): Requested 0x%x bytes\n", minsize)); + jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize); mutex_lock(&c->alloc_sem); - D1(printk(KERN_DEBUG "jffs2_reserve_space(): alloc sem got\n")); + jffs2_dbg(1, "%s(): alloc sem got\n", __func__); spin_lock(&c->erase_completion_lock); @@ -73,11 +75,13 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, dirty = c->dirty_size + c->erasing_size - c->nr_erasing_blocks * c->sector_size + c->unchecked_size; if (dirty < c->nospc_dirty_size) { if (prio == ALLOC_DELETION && c->nr_free_blocks + c->nr_erasing_blocks >= c->resv_blocks_deletion) { - D1(printk(KERN_NOTICE "jffs2_reserve_space(): Low on dirty space to GC, but it's a deletion. Allowing...\n")); + jffs2_dbg(1, "%s(): Low on dirty space to GC, but it's a deletion. Allowing...\n", + __func__); break; } - D1(printk(KERN_DEBUG "dirty size 0x%08x + unchecked_size 0x%08x < nospc_dirty_size 0x%08x, returning -ENOSPC\n", - dirty, c->unchecked_size, c->sector_size)); + jffs2_dbg(1, "dirty size 0x%08x + unchecked_size 0x%08x < nospc_dirty_size 0x%08x, returning -ENOSPC\n", + dirty, c->unchecked_size, + c->sector_size); spin_unlock(&c->erase_completion_lock); mutex_unlock(&c->alloc_sem); @@ -96,12 +100,13 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, avail = c->free_size + c->dirty_size + c->erasing_size + c->unchecked_size; if ( (avail / c->sector_size) <= blocksneeded) { if (prio == ALLOC_DELETION && c->nr_free_blocks + c->nr_erasing_blocks >= c->resv_blocks_deletion) { - D1(printk(KERN_NOTICE "jffs2_reserve_space(): Low on possibly available space, but it's a deletion. Allowing...\n")); + jffs2_dbg(1, "%s(): Low on possibly available space, but it's a deletion. Allowing...\n", + __func__); break; } - D1(printk(KERN_DEBUG "max. available size 0x%08x < blocksneeded * sector_size 0x%08x, returning -ENOSPC\n", - avail, blocksneeded * c->sector_size)); + jffs2_dbg(1, "max. available size 0x%08x < blocksneeded * sector_size 0x%08x, returning -ENOSPC\n", + avail, blocksneeded * c->sector_size); spin_unlock(&c->erase_completion_lock); mutex_unlock(&c->alloc_sem); return -ENOSPC; @@ -109,9 +114,14 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, mutex_unlock(&c->alloc_sem); - D1(printk(KERN_DEBUG "Triggering GC pass. nr_free_blocks %d, nr_erasing_blocks %d, free_size 0x%08x, dirty_size 0x%08x, wasted_size 0x%08x, used_size 0x%08x, erasing_size 0x%08x, bad_size 0x%08x (total 0x%08x of 0x%08x)\n", - c->nr_free_blocks, c->nr_erasing_blocks, c->free_size, c->dirty_size, c->wasted_size, c->used_size, c->erasing_size, c->bad_size, - c->free_size + c->dirty_size + c->wasted_size + c->used_size + c->erasing_size + c->bad_size, c->flash_size)); + jffs2_dbg(1, "Triggering GC pass. nr_free_blocks %d, nr_erasing_blocks %d, free_size 0x%08x, dirty_size 0x%08x, wasted_size 0x%08x, used_size 0x%08x, erasing_size 0x%08x, bad_size 0x%08x (total 0x%08x of 0x%08x)\n", + c->nr_free_blocks, c->nr_erasing_blocks, + c->free_size, c->dirty_size, c->wasted_size, + c->used_size, c->erasing_size, c->bad_size, + c->free_size + c->dirty_size + + c->wasted_size + c->used_size + + c->erasing_size + c->bad_size, + c->flash_size); spin_unlock(&c->erase_completion_lock); ret = jffs2_garbage_collect_pass(c); @@ -124,7 +134,8 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, DECLARE_WAITQUEUE(wait, current); set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(&c->erase_wait, &wait); - D1(printk(KERN_DEBUG "%s waiting for erase to complete\n", __func__)); + jffs2_dbg(1, "%s waiting for erase to complete\n", + __func__); spin_unlock(&c->erase_completion_lock); schedule(); @@ -144,7 +155,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, ret = jffs2_do_reserve_space(c, minsize, len, sumsize); if (ret) { - D1(printk(KERN_DEBUG "jffs2_reserve_space: ret is %d\n", ret)); + jffs2_dbg(1, "%s(): ret is %d\n", __func__, ret); } } spin_unlock(&c->erase_completion_lock); @@ -161,13 +172,14 @@ int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, int ret = -EAGAIN; minsize = PAD(minsize); - D1(printk(KERN_DEBUG "jffs2_reserve_space_gc(): Requested 0x%x bytes\n", minsize)); + jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize); spin_lock(&c->erase_completion_lock); while(ret == -EAGAIN) { ret = jffs2_do_reserve_space(c, minsize, len, sumsize); if (ret) { - D1(printk(KERN_DEBUG "jffs2_reserve_space_gc: looping, ret is %d\n", ret)); + jffs2_dbg(1, "%s(): looping, ret is %d\n", + __func__, ret); } } spin_unlock(&c->erase_completion_lock); @@ -184,8 +196,8 @@ static void jffs2_close_nextblock(struct jffs2_sb_info *c, struct jffs2_eraseblo { if (c->nextblock == NULL) { - D1(printk(KERN_DEBUG "jffs2_close_nextblock: Erase block at 0x%08x has already been placed in a list\n", - jeb->offset)); + jffs2_dbg(1, "%s(): Erase block at 0x%08x has already been placed in a list\n", + __func__, jeb->offset); return; } /* Check, if we have a dirty block now, or if it was dirty already */ @@ -195,17 +207,20 @@ static void jffs2_close_nextblock(struct jffs2_sb_info *c, struct jffs2_eraseblo jeb->dirty_size += jeb->wasted_size; jeb->wasted_size = 0; if (VERYDIRTY(c, jeb->dirty_size)) { - D1(printk(KERN_DEBUG "Adding full erase block at 0x%08x to very_dirty_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", - jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size)); + jffs2_dbg(1, "Adding full erase block at 0x%08x to very_dirty_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, + jeb->used_size); list_add_tail(&jeb->list, &c->very_dirty_list); } else { - D1(printk(KERN_DEBUG "Adding full erase block at 0x%08x to dirty_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", - jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size)); + jffs2_dbg(1, "Adding full erase block at 0x%08x to dirty_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, + jeb->used_size); list_add_tail(&jeb->list, &c->dirty_list); } } else { - D1(printk(KERN_DEBUG "Adding full erase block at 0x%08x to clean_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", - jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size)); + jffs2_dbg(1, "Adding full erase block at 0x%08x to clean_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, + jeb->used_size); list_add_tail(&jeb->list, &c->clean_list); } c->nextblock = NULL; @@ -230,13 +245,14 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c) list_move_tail(&ejeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; jffs2_garbage_collect_trigger(c); - D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", - ejeb->offset)); + jffs2_dbg(1, "%s(): Triggering erase of erasable block at 0x%08x\n", + __func__, ejeb->offset); } if (!c->nr_erasing_blocks && !list_empty(&c->erasable_pending_wbuf_list)) { - D1(printk(KERN_DEBUG "jffs2_find_nextblock: Flushing write buffer\n")); + jffs2_dbg(1, "%s(): Flushing write buffer\n", + __func__); /* c->nextblock is NULL, no update to c->nextblock allowed */ spin_unlock(&c->erase_completion_lock); jffs2_flush_wbuf_pad(c); @@ -248,9 +264,11 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c) if (!c->nr_erasing_blocks) { /* Ouch. We're in GC, or we wouldn't have got here. And there's no space left. At all. */ - printk(KERN_CRIT "Argh. No free space left for GC. nr_erasing_blocks is %d. nr_free_blocks is %d. (erasableempty: %s, erasingempty: %s, erasependingempty: %s)\n", - c->nr_erasing_blocks, c->nr_free_blocks, list_empty(&c->erasable_list)?"yes":"no", - list_empty(&c->erasing_list)?"yes":"no", list_empty(&c->erase_pending_list)?"yes":"no"); + pr_crit("Argh. No free space left for GC. nr_erasing_blocks is %d. nr_free_blocks is %d. (erasableempty: %s, erasingempty: %s, erasependingempty: %s)\n", + c->nr_erasing_blocks, c->nr_free_blocks, + list_empty(&c->erasable_list) ? "yes" : "no", + list_empty(&c->erasing_list) ? "yes" : "no", + list_empty(&c->erase_pending_list) ? "yes" : "no"); return -ENOSPC; } @@ -278,7 +296,8 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c) c->wbuf_ofs = 0xffffffff; #endif - D1(printk(KERN_DEBUG "jffs2_find_nextblock(): new nextblock = 0x%08x\n", c->nextblock->offset)); + jffs2_dbg(1, "%s(): new nextblock = 0x%08x\n", + __func__, c->nextblock->offset); return 0; } @@ -345,7 +364,8 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, if (jffs2_wbuf_dirty(c)) { spin_unlock(&c->erase_completion_lock); - D1(printk(KERN_DEBUG "jffs2_do_reserve_space: Flushing write buffer\n")); + jffs2_dbg(1, "%s(): Flushing write buffer\n", + __func__); jffs2_flush_wbuf_pad(c); spin_lock(&c->erase_completion_lock); jeb = c->nextblock; @@ -387,7 +407,8 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, jeb = c->nextblock; if (jeb->free_size != c->sector_size - c->cleanmarker_size) { - printk(KERN_WARNING "Eep. Block 0x%08x taken from free_list had free_size of 0x%08x!!\n", jeb->offset, jeb->free_size); + pr_warn("Eep. Block 0x%08x taken from free_list had free_size of 0x%08x!!\n", + jeb->offset, jeb->free_size); goto restart; } } @@ -408,8 +429,9 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, spin_lock(&c->erase_completion_lock); } - D1(printk(KERN_DEBUG "jffs2_do_reserve_space(): Giving 0x%x bytes at 0x%x\n", - *len, jeb->offset + (c->sector_size - jeb->free_size))); + jffs2_dbg(1, "%s(): Giving 0x%x bytes at 0x%x\n", + __func__, + *len, jeb->offset + (c->sector_size - jeb->free_size)); return 0; } @@ -434,20 +456,22 @@ struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c, jeb = &c->blocks[ofs / c->sector_size]; - D1(printk(KERN_DEBUG "jffs2_add_physical_node_ref(): Node at 0x%x(%d), size 0x%x\n", - ofs & ~3, ofs & 3, len)); + jffs2_dbg(1, "%s(): Node at 0x%x(%d), size 0x%x\n", + __func__, ofs & ~3, ofs & 3, len); #if 1 /* Allow non-obsolete nodes only to be added at the end of c->nextblock, if c->nextblock is set. Note that wbuf.c will file obsolete nodes even after refiling c->nextblock */ if ((c->nextblock || ((ofs & 3) != REF_OBSOLETE)) && (jeb != c->nextblock || (ofs & ~3) != jeb->offset + (c->sector_size - jeb->free_size))) { - printk(KERN_WARNING "argh. node added in wrong place at 0x%08x(%d)\n", ofs & ~3, ofs & 3); + pr_warn("argh. node added in wrong place at 0x%08x(%d)\n", + ofs & ~3, ofs & 3); if (c->nextblock) - printk(KERN_WARNING "nextblock 0x%08x", c->nextblock->offset); + pr_warn("nextblock 0x%08x", c->nextblock->offset); else - printk(KERN_WARNING "No nextblock"); - printk(", expected at %08x\n", jeb->offset + (c->sector_size - jeb->free_size)); + pr_warn("No nextblock"); + pr_cont(", expected at %08x\n", + jeb->offset + (c->sector_size - jeb->free_size)); return ERR_PTR(-EINVAL); } #endif @@ -457,8 +481,9 @@ struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c, if (!jeb->free_size && !jeb->dirty_size && !ISDIRTY(jeb->wasted_size)) { /* If it lives on the dirty_list, jffs2_reserve_space will put it there */ - D1(printk(KERN_DEBUG "Adding full erase block at 0x%08x to clean_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", - jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size)); + jffs2_dbg(1, "Adding full erase block at 0x%08x to clean_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, + jeb->used_size); if (jffs2_wbuf_dirty(c)) { /* Flush the last write in the block if it's outstanding */ spin_unlock(&c->erase_completion_lock); @@ -480,7 +505,7 @@ struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c, void jffs2_complete_reservation(struct jffs2_sb_info *c) { - D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n")); + jffs2_dbg(1, "jffs2_complete_reservation()\n"); spin_lock(&c->erase_completion_lock); jffs2_garbage_collect_trigger(c); spin_unlock(&c->erase_completion_lock); @@ -493,7 +518,7 @@ static inline int on_list(struct list_head *obj, struct list_head *head) list_for_each(this, head) { if (this == obj) { - D1(printk("%p is on list at %p\n", obj, head)); + jffs2_dbg(1, "%p is on list at %p\n", obj, head); return 1; } @@ -511,16 +536,18 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref uint32_t freed_len; if(unlikely(!ref)) { - printk(KERN_NOTICE "EEEEEK. jffs2_mark_node_obsolete called with NULL node\n"); + pr_notice("EEEEEK. jffs2_mark_node_obsolete called with NULL node\n"); return; } if (ref_obsolete(ref)) { - D1(printk(KERN_DEBUG "jffs2_mark_node_obsolete called with already obsolete node at 0x%08x\n", ref_offset(ref))); + jffs2_dbg(1, "%s(): called with already obsolete node at 0x%08x\n", + __func__, ref_offset(ref)); return; } blocknr = ref->flash_offset / c->sector_size; if (blocknr >= c->nr_blocks) { - printk(KERN_NOTICE "raw node at 0x%08x is off the end of device!\n", ref->flash_offset); + pr_notice("raw node at 0x%08x is off the end of device!\n", + ref->flash_offset); BUG(); } jeb = &c->blocks[blocknr]; @@ -542,27 +569,31 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref if (ref_flags(ref) == REF_UNCHECKED) { D1(if (unlikely(jeb->unchecked_size < freed_len)) { - printk(KERN_NOTICE "raw unchecked node of size 0x%08x freed from erase block %d at 0x%08x, but unchecked_size was already 0x%08x\n", - freed_len, blocknr, ref->flash_offset, jeb->used_size); + pr_notice("raw unchecked node of size 0x%08x freed from erase block %d at 0x%08x, but unchecked_size was already 0x%08x\n", + freed_len, blocknr, + ref->flash_offset, jeb->used_size); BUG(); }) - D1(printk(KERN_DEBUG "Obsoleting previously unchecked node at 0x%08x of len %x: ", ref_offset(ref), freed_len)); + jffs2_dbg(1, "Obsoleting previously unchecked node at 0x%08x of len %x\n", + ref_offset(ref), freed_len); jeb->unchecked_size -= freed_len; c->unchecked_size -= freed_len; } else { D1(if (unlikely(jeb->used_size < freed_len)) { - printk(KERN_NOTICE "raw node of size 0x%08x freed from erase block %d at 0x%08x, but used_size was already 0x%08x\n", - freed_len, blocknr, ref->flash_offset, jeb->used_size); + pr_notice("raw node of size 0x%08x freed from erase block %d at 0x%08x, but used_size was already 0x%08x\n", + freed_len, blocknr, + ref->flash_offset, jeb->used_size); BUG(); }) - D1(printk(KERN_DEBUG "Obsoleting node at 0x%08x of len %#x: ", ref_offset(ref), freed_len)); + jffs2_dbg(1, "Obsoleting node at 0x%08x of len %#x: ", + ref_offset(ref), freed_len); jeb->used_size -= freed_len; c->used_size -= freed_len; } // Take care, that wasted size is taken into concern if ((jeb->dirty_size || ISDIRTY(jeb->wasted_size + freed_len)) && jeb != c->nextblock) { - D1(printk("Dirtying\n")); + jffs2_dbg(1, "Dirtying\n"); addedsize = freed_len; jeb->dirty_size += freed_len; c->dirty_size += freed_len; @@ -570,12 +601,12 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref /* Convert wasted space to dirty, if not a bad block */ if (jeb->wasted_size) { if (on_list(&jeb->list, &c->bad_used_list)) { - D1(printk(KERN_DEBUG "Leaving block at %08x on the bad_used_list\n", - jeb->offset)); + jffs2_dbg(1, "Leaving block at %08x on the bad_used_list\n", + jeb->offset); addedsize = 0; /* To fool the refiling code later */ } else { - D1(printk(KERN_DEBUG "Converting %d bytes of wasted space to dirty in block at %08x\n", - jeb->wasted_size, jeb->offset)); + jffs2_dbg(1, "Converting %d bytes of wasted space to dirty in block at %08x\n", + jeb->wasted_size, jeb->offset); addedsize += jeb->wasted_size; jeb->dirty_size += jeb->wasted_size; c->dirty_size += jeb->wasted_size; @@ -584,7 +615,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref } } } else { - D1(printk("Wasting\n")); + jffs2_dbg(1, "Wasting\n"); addedsize = 0; jeb->wasted_size += freed_len; c->wasted_size += freed_len; @@ -606,50 +637,57 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref } if (jeb == c->nextblock) { - D2(printk(KERN_DEBUG "Not moving nextblock 0x%08x to dirty/erase_pending list\n", jeb->offset)); + jffs2_dbg(2, "Not moving nextblock 0x%08x to dirty/erase_pending list\n", + jeb->offset); } else if (!jeb->used_size && !jeb->unchecked_size) { if (jeb == c->gcblock) { - D1(printk(KERN_DEBUG "gcblock at 0x%08x completely dirtied. Clearing gcblock...\n", jeb->offset)); + jffs2_dbg(1, "gcblock at 0x%08x completely dirtied. Clearing gcblock...\n", + jeb->offset); c->gcblock = NULL; } else { - D1(printk(KERN_DEBUG "Eraseblock at 0x%08x completely dirtied. Removing from (dirty?) list...\n", jeb->offset)); + jffs2_dbg(1, "Eraseblock at 0x%08x completely dirtied. Removing from (dirty?) list...\n", + jeb->offset); list_del(&jeb->list); } if (jffs2_wbuf_dirty(c)) { - D1(printk(KERN_DEBUG "...and adding to erasable_pending_wbuf_list\n")); + jffs2_dbg(1, "...and adding to erasable_pending_wbuf_list\n"); list_add_tail(&jeb->list, &c->erasable_pending_wbuf_list); } else { if (jiffies & 127) { /* Most of the time, we just erase it immediately. Otherwise we spend ages scanning it on mount, etc. */ - D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); + jffs2_dbg(1, "...and adding to erase_pending_list\n"); list_add_tail(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; jffs2_garbage_collect_trigger(c); } else { /* Sometimes, however, we leave it elsewhere so it doesn't get immediately reused, and we spread the load a bit. */ - D1(printk(KERN_DEBUG "...and adding to erasable_list\n")); + jffs2_dbg(1, "...and adding to erasable_list\n"); list_add_tail(&jeb->list, &c->erasable_list); } } - D1(printk(KERN_DEBUG "Done OK\n")); + jffs2_dbg(1, "Done OK\n"); } else if (jeb == c->gcblock) { - D2(printk(KERN_DEBUG "Not moving gcblock 0x%08x to dirty_list\n", jeb->offset)); + jffs2_dbg(2, "Not moving gcblock 0x%08x to dirty_list\n", + jeb->offset); } else if (ISDIRTY(jeb->dirty_size) && !ISDIRTY(jeb->dirty_size - addedsize)) { - D1(printk(KERN_DEBUG "Eraseblock at 0x%08x is freshly dirtied. Removing from clean list...\n", jeb->offset)); + jffs2_dbg(1, "Eraseblock at 0x%08x is freshly dirtied. Removing from clean list...\n", + jeb->offset); list_del(&jeb->list); - D1(printk(KERN_DEBUG "...and adding to dirty_list\n")); + jffs2_dbg(1, "...and adding to dirty_list\n"); list_add_tail(&jeb->list, &c->dirty_list); } else if (VERYDIRTY(c, jeb->dirty_size) && !VERYDIRTY(c, jeb->dirty_size - addedsize)) { - D1(printk(KERN_DEBUG "Eraseblock at 0x%08x is now very dirty. Removing from dirty list...\n", jeb->offset)); + jffs2_dbg(1, "Eraseblock at 0x%08x is now very dirty. Removing from dirty list...\n", + jeb->offset); list_del(&jeb->list); - D1(printk(KERN_DEBUG "...and adding to very_dirty_list\n")); + jffs2_dbg(1, "...and adding to very_dirty_list\n"); list_add_tail(&jeb->list, &c->very_dirty_list); } else { - D1(printk(KERN_DEBUG "Eraseblock at 0x%08x not moved anywhere. (free 0x%08x, dirty 0x%08x, used 0x%08x)\n", - jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size)); + jffs2_dbg(1, "Eraseblock at 0x%08x not moved anywhere. (free 0x%08x, dirty 0x%08x, used 0x%08x)\n", + jeb->offset, jeb->free_size, jeb->dirty_size, + jeb->used_size); } spin_unlock(&c->erase_completion_lock); @@ -665,33 +703,40 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref the block hasn't _already_ been erased, and that 'ref' itself hasn't been freed yet by jffs2_free_jeb_node_refs() in erase.c. Which is nice. */ - D1(printk(KERN_DEBUG "obliterating obsoleted node at 0x%08x\n", ref_offset(ref))); + jffs2_dbg(1, "obliterating obsoleted node at 0x%08x\n", + ref_offset(ref)); ret = jffs2_flash_read(c, ref_offset(ref), sizeof(n), &retlen, (char *)&n); if (ret) { - printk(KERN_WARNING "Read error reading from obsoleted node at 0x%08x: %d\n", ref_offset(ref), ret); + pr_warn("Read error reading from obsoleted node at 0x%08x: %d\n", + ref_offset(ref), ret); goto out_erase_sem; } if (retlen != sizeof(n)) { - printk(KERN_WARNING "Short read from obsoleted node at 0x%08x: %zd\n", ref_offset(ref), retlen); + pr_warn("Short read from obsoleted node at 0x%08x: %zd\n", + ref_offset(ref), retlen); goto out_erase_sem; } if (PAD(je32_to_cpu(n.totlen)) != PAD(freed_len)) { - printk(KERN_WARNING "Node totlen on flash (0x%08x) != totlen from node ref (0x%08x)\n", je32_to_cpu(n.totlen), freed_len); + pr_warn("Node totlen on flash (0x%08x) != totlen from node ref (0x%08x)\n", + je32_to_cpu(n.totlen), freed_len); goto out_erase_sem; } if (!(je16_to_cpu(n.nodetype) & JFFS2_NODE_ACCURATE)) { - D1(printk(KERN_DEBUG "Node at 0x%08x was already marked obsolete (nodetype 0x%04x)\n", ref_offset(ref), je16_to_cpu(n.nodetype))); + jffs2_dbg(1, "Node at 0x%08x was already marked obsolete (nodetype 0x%04x)\n", + ref_offset(ref), je16_to_cpu(n.nodetype)); goto out_erase_sem; } /* XXX FIXME: This is ugly now */ n.nodetype = cpu_to_je16(je16_to_cpu(n.nodetype) & ~JFFS2_NODE_ACCURATE); ret = jffs2_flash_write(c, ref_offset(ref), sizeof(n), &retlen, (char *)&n); if (ret) { - printk(KERN_WARNING "Write error in obliterating obsoleted node at 0x%08x: %d\n", ref_offset(ref), ret); + pr_warn("Write error in obliterating obsoleted node at 0x%08x: %d\n", + ref_offset(ref), ret); goto out_erase_sem; } if (retlen != sizeof(n)) { - printk(KERN_WARNING "Short write in obliterating obsoleted node at 0x%08x: %zd\n", ref_offset(ref), retlen); + pr_warn("Short write in obliterating obsoleted node at 0x%08x: %zd\n", + ref_offset(ref), retlen); goto out_erase_sem; } @@ -751,8 +796,8 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c) return 1; if (c->unchecked_size) { - D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n", - c->unchecked_size, c->checked_ino)); + jffs2_dbg(1, "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n", + c->unchecked_size, c->checked_ino); return 1; } @@ -780,8 +825,9 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c) } } - D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): nr_free_blocks %d, nr_erasing_blocks %d, dirty_size 0x%x, vdirty_blocks %d: %s\n", - c->nr_free_blocks, c->nr_erasing_blocks, c->dirty_size, nr_very_dirty, ret?"yes":"no")); + jffs2_dbg(1, "%s(): nr_free_blocks %d, nr_erasing_blocks %d, dirty_size 0x%x, vdirty_blocks %d: %s\n", + __func__, c->nr_free_blocks, c->nr_erasing_blocks, + c->dirty_size, nr_very_dirty, ret ? "yes" : "no"); return ret; } diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index ab65ee3ec85..1cd3aec9d9a 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -76,7 +76,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f) #define jffs2_write_nand_cleanmarker(c,jeb) (-EIO) #define jffs2_flash_write(c, ofs, len, retlen, buf) jffs2_flash_direct_write(c, ofs, len, retlen, buf) -#define jffs2_flash_read(c, ofs, len, retlen, buf) ((c)->mtd->read((c)->mtd, ofs, len, retlen, buf)) +#define jffs2_flash_read(c, ofs, len, retlen, buf) (mtd_read((c)->mtd, ofs, len, retlen, buf)) #define jffs2_flush_wbuf_pad(c) ({ do{} while(0); (void)(c), 0; }) #define jffs2_flush_wbuf_gc(c, i) ({ do{} while(0); (void)(c), (void) i, 0; }) #define jffs2_write_nand_badblock(c,jeb,bad_offset) (1) @@ -108,8 +108,6 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f) #define jffs2_cleanmarker_oob(c) (c->mtd->type == MTD_NANDFLASH) -#define jffs2_flash_write_oob(c, ofs, len, retlen, buf) ((c)->mtd->write_oob((c)->mtd, ofs, len, retlen, buf)) -#define jffs2_flash_read_oob(c, ofs, len, retlen, buf) ((c)->mtd->read_oob((c)->mtd, ofs, len, retlen, buf)) #define jffs2_wbuf_dirty(c) (!!(c)->wbuf_len) /* wbuf.c */ diff --git a/fs/jffs2/read.c b/fs/jffs2/read.c index 3f39be1b045..0b042b1fc82 100644 --- a/fs/jffs2/read.c +++ b/fs/jffs2/read.c @@ -9,6 +9,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/slab.h> #include <linux/crc32.h> @@ -36,24 +38,25 @@ int jffs2_read_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, ret = jffs2_flash_read(c, ref_offset(fd->raw), sizeof(*ri), &readlen, (char *)ri); if (ret) { jffs2_free_raw_inode(ri); - printk(KERN_WARNING "Error reading node from 0x%08x: %d\n", ref_offset(fd->raw), ret); + pr_warn("Error reading node from 0x%08x: %d\n", + ref_offset(fd->raw), ret); return ret; } if (readlen != sizeof(*ri)) { jffs2_free_raw_inode(ri); - printk(KERN_WARNING "Short read from 0x%08x: wanted 0x%zx bytes, got 0x%zx\n", - ref_offset(fd->raw), sizeof(*ri), readlen); + pr_warn("Short read from 0x%08x: wanted 0x%zx bytes, got 0x%zx\n", + ref_offset(fd->raw), sizeof(*ri), readlen); return -EIO; } crc = crc32(0, ri, sizeof(*ri)-8); - D1(printk(KERN_DEBUG "Node read from %08x: node_crc %08x, calculated CRC %08x. dsize %x, csize %x, offset %x, buf %p\n", + jffs2_dbg(1, "Node read from %08x: node_crc %08x, calculated CRC %08x. dsize %x, csize %x, offset %x, buf %p\n", ref_offset(fd->raw), je32_to_cpu(ri->node_crc), crc, je32_to_cpu(ri->dsize), je32_to_cpu(ri->csize), - je32_to_cpu(ri->offset), buf)); + je32_to_cpu(ri->offset), buf); if (crc != je32_to_cpu(ri->node_crc)) { - printk(KERN_WARNING "Node CRC %08x != calculated CRC %08x for node at %08x\n", - je32_to_cpu(ri->node_crc), crc, ref_offset(fd->raw)); + pr_warn("Node CRC %08x != calculated CRC %08x for node at %08x\n", + je32_to_cpu(ri->node_crc), crc, ref_offset(fd->raw)); ret = -EIO; goto out_ri; } @@ -66,8 +69,8 @@ int jffs2_read_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, } D1(if(ofs + len > je32_to_cpu(ri->dsize)) { - printk(KERN_WARNING "jffs2_read_dnode() asked for %d bytes at %d from %d-byte node\n", - len, ofs, je32_to_cpu(ri->dsize)); + pr_warn("jffs2_read_dnode() asked for %d bytes at %d from %d-byte node\n", + len, ofs, je32_to_cpu(ri->dsize)); ret = -EINVAL; goto out_ri; }); @@ -107,8 +110,8 @@ int jffs2_read_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, decomprbuf = readbuf; } - D2(printk(KERN_DEBUG "Read %d bytes to %p\n", je32_to_cpu(ri->csize), - readbuf)); + jffs2_dbg(2, "Read %d bytes to %p\n", je32_to_cpu(ri->csize), + readbuf); ret = jffs2_flash_read(c, (ref_offset(fd->raw)) + sizeof(*ri), je32_to_cpu(ri->csize), &readlen, readbuf); @@ -119,18 +122,19 @@ int jffs2_read_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, crc = crc32(0, readbuf, je32_to_cpu(ri->csize)); if (crc != je32_to_cpu(ri->data_crc)) { - printk(KERN_WARNING "Data CRC %08x != calculated CRC %08x for node at %08x\n", - je32_to_cpu(ri->data_crc), crc, ref_offset(fd->raw)); + pr_warn("Data CRC %08x != calculated CRC %08x for node at %08x\n", + je32_to_cpu(ri->data_crc), crc, ref_offset(fd->raw)); ret = -EIO; goto out_decomprbuf; } - D2(printk(KERN_DEBUG "Data CRC matches calculated CRC %08x\n", crc)); + jffs2_dbg(2, "Data CRC matches calculated CRC %08x\n", crc); if (ri->compr != JFFS2_COMPR_NONE) { - D2(printk(KERN_DEBUG "Decompress %d bytes from %p to %d bytes at %p\n", - je32_to_cpu(ri->csize), readbuf, je32_to_cpu(ri->dsize), decomprbuf)); + jffs2_dbg(2, "Decompress %d bytes from %p to %d bytes at %p\n", + je32_to_cpu(ri->csize), readbuf, + je32_to_cpu(ri->dsize), decomprbuf); ret = jffs2_decompress(c, f, ri->compr | (ri->usercompr << 8), readbuf, decomprbuf, je32_to_cpu(ri->csize), je32_to_cpu(ri->dsize)); if (ret) { - printk(KERN_WARNING "Error: jffs2_decompress returned %d\n", ret); + pr_warn("Error: jffs2_decompress returned %d\n", ret); goto out_decomprbuf; } } @@ -157,8 +161,8 @@ int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_node_frag *frag; int ret; - D1(printk(KERN_DEBUG "jffs2_read_inode_range: ino #%u, range 0x%08x-0x%08x\n", - f->inocache->ino, offset, offset+len)); + jffs2_dbg(1, "%s(): ino #%u, range 0x%08x-0x%08x\n", + __func__, f->inocache->ino, offset, offset + len); frag = jffs2_lookup_node_frag(&f->fragtree, offset); @@ -168,22 +172,27 @@ int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, * (or perhaps is before it, if we've been asked to read off the * end of the file). */ while(offset < end) { - D2(printk(KERN_DEBUG "jffs2_read_inode_range: offset %d, end %d\n", offset, end)); + jffs2_dbg(2, "%s(): offset %d, end %d\n", + __func__, offset, end); if (unlikely(!frag || frag->ofs > offset || frag->ofs + frag->size <= offset)) { uint32_t holesize = end - offset; if (frag && frag->ofs > offset) { - D1(printk(KERN_NOTICE "Eep. Hole in ino #%u fraglist. frag->ofs = 0x%08x, offset = 0x%08x\n", f->inocache->ino, frag->ofs, offset)); + jffs2_dbg(1, "Eep. Hole in ino #%u fraglist. frag->ofs = 0x%08x, offset = 0x%08x\n", + f->inocache->ino, frag->ofs, offset); holesize = min(holesize, frag->ofs - offset); } - D1(printk(KERN_DEBUG "Filling non-frag hole from %d-%d\n", offset, offset+holesize)); + jffs2_dbg(1, "Filling non-frag hole from %d-%d\n", + offset, offset + holesize); memset(buf, 0, holesize); buf += holesize; offset += holesize; continue; } else if (unlikely(!frag->node)) { uint32_t holeend = min(end, frag->ofs + frag->size); - D1(printk(KERN_DEBUG "Filling frag hole from %d-%d (frag 0x%x 0x%x)\n", offset, holeend, frag->ofs, frag->ofs + frag->size)); + jffs2_dbg(1, "Filling frag hole from %d-%d (frag 0x%x 0x%x)\n", + offset, holeend, frag->ofs, + frag->ofs + frag->size); memset(buf, 0, holeend - offset); buf += holeend - offset; offset = holeend; @@ -195,20 +204,23 @@ int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, fragofs = offset - frag->ofs; readlen = min(frag->size - fragofs, end - offset); - D1(printk(KERN_DEBUG "Reading %d-%d from node at 0x%08x (%d)\n", - frag->ofs+fragofs, frag->ofs+fragofs+readlen, - ref_offset(frag->node->raw), ref_flags(frag->node->raw))); + jffs2_dbg(1, "Reading %d-%d from node at 0x%08x (%d)\n", + frag->ofs+fragofs, + frag->ofs + fragofs+readlen, + ref_offset(frag->node->raw), + ref_flags(frag->node->raw)); ret = jffs2_read_dnode(c, f, frag->node, buf, fragofs + frag->ofs - frag->node->ofs, readlen); - D2(printk(KERN_DEBUG "node read done\n")); + jffs2_dbg(2, "node read done\n"); if (ret) { - D1(printk(KERN_DEBUG"jffs2_read_inode_range error %d\n",ret)); + jffs2_dbg(1, "%s(): error %d\n", + __func__, ret); memset(buf, 0, readlen); return ret; } buf += readlen; offset += readlen; frag = frag_next(frag); - D2(printk(KERN_DEBUG "node read was OK. Looping\n")); + jffs2_dbg(2, "node read was OK. Looping\n"); } } return 0; diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 3093ac4fb24..dc0437e8476 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -9,6 +9,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/sched.h> #include <linux/slab.h> diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index f99464833bb..7654e87b042 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -9,6 +9,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/sched.h> #include <linux/slab.h> @@ -22,15 +24,15 @@ #define DEFAULT_EMPTY_SCAN_SIZE 256 -#define noisy_printk(noise, args...) do { \ - if (*(noise)) { \ - printk(KERN_NOTICE args); \ - (*(noise))--; \ - if (!(*(noise))) { \ - printk(KERN_NOTICE "Further such events for this erase block will not be printed\n"); \ - } \ - } \ -} while(0) +#define noisy_printk(noise, fmt, ...) \ +do { \ + if (*(noise)) { \ + pr_notice(fmt, ##__VA_ARGS__); \ + (*(noise))--; \ + if (!(*(noise))) \ + pr_notice("Further such events for this erase block will not be printed\n"); \ + } \ +} while (0) static uint32_t pseudo_random; @@ -96,18 +98,17 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) #ifndef __ECOS size_t pointlen, try_size; - if (c->mtd->point) { - ret = mtd_point(c->mtd, 0, c->mtd->size, &pointlen, - (void **)&flashbuf, NULL); - if (!ret && pointlen < c->mtd->size) { - /* Don't muck about if it won't let us point to the whole flash */ - D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", pointlen)); - mtd_unpoint(c->mtd, 0, pointlen); - flashbuf = NULL; - } - if (ret && ret != -EOPNOTSUPP) - D1(printk(KERN_DEBUG "MTD point failed %d\n", ret)); + ret = mtd_point(c->mtd, 0, c->mtd->size, &pointlen, + (void **)&flashbuf, NULL); + if (!ret && pointlen < c->mtd->size) { + /* Don't muck about if it won't let us point to the whole flash */ + jffs2_dbg(1, "MTD point returned len too short: 0x%zx\n", + pointlen); + mtd_unpoint(c->mtd, 0, pointlen); + flashbuf = NULL; } + if (ret && ret != -EOPNOTSUPP) + jffs2_dbg(1, "MTD point failed %d\n", ret); #endif if (!flashbuf) { /* For NAND it's quicker to read a whole eraseblock at a time, @@ -117,15 +118,15 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) else try_size = PAGE_SIZE; - D1(printk(KERN_DEBUG "Trying to allocate readbuf of %zu " - "bytes\n", try_size)); + jffs2_dbg(1, "Trying to allocate readbuf of %zu " + "bytes\n", try_size); flashbuf = mtd_kmalloc_up_to(c->mtd, &try_size); if (!flashbuf) return -ENOMEM; - D1(printk(KERN_DEBUG "Allocated readbuf of %zu bytes\n", - try_size)); + jffs2_dbg(1, "Allocated readbuf of %zu bytes\n", + try_size); buf_size = (uint32_t)try_size; } @@ -178,7 +179,8 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) c->nr_free_blocks++; } else { /* Dirt */ - D1(printk(KERN_DEBUG "Adding all-dirty block at 0x%08x to erase_pending_list\n", jeb->offset)); + jffs2_dbg(1, "Adding all-dirty block at 0x%08x to erase_pending_list\n", + jeb->offset); list_add(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; } @@ -205,7 +207,8 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) } /* update collected summary information for the current nextblock */ jffs2_sum_move_collected(c, s); - D1(printk(KERN_DEBUG "jffs2_scan_medium(): new nextblock = 0x%08x\n", jeb->offset)); + jffs2_dbg(1, "%s(): new nextblock = 0x%08x\n", + __func__, jeb->offset); c->nextblock = jeb; } else { ret = file_dirty(c, jeb); @@ -217,20 +220,21 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) case BLK_STATE_ALLDIRTY: /* Nothing valid - not even a clean marker. Needs erasing. */ /* For now we just put it on the erasing list. We'll start the erases later */ - D1(printk(KERN_NOTICE "JFFS2: Erase block at 0x%08x is not formatted. It will be erased\n", jeb->offset)); + jffs2_dbg(1, "Erase block at 0x%08x is not formatted. It will be erased\n", + jeb->offset); list_add(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; break; case BLK_STATE_BADBLOCK: - D1(printk(KERN_NOTICE "JFFS2: Block at 0x%08x is bad\n", jeb->offset)); + jffs2_dbg(1, "Block at 0x%08x is bad\n", jeb->offset); list_add(&jeb->list, &c->bad_list); c->bad_size += c->sector_size; c->free_size -= c->sector_size; bad_blocks++; break; default: - printk(KERN_WARNING "jffs2_scan_medium(): unknown block state\n"); + pr_warn("%s(): unknown block state\n", __func__); BUG(); } } @@ -250,16 +254,17 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) uint32_t skip = c->nextblock->free_size % c->wbuf_pagesize; - D1(printk(KERN_DEBUG "jffs2_scan_medium(): Skipping %d bytes in nextblock to ensure page alignment\n", - skip)); + jffs2_dbg(1, "%s(): Skipping %d bytes in nextblock to ensure page alignment\n", + __func__, skip); jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); jffs2_scan_dirty_space(c, c->nextblock, skip); } #endif if (c->nr_erasing_blocks) { if ( !c->used_size && ((c->nr_free_blocks+empty_blocks+bad_blocks)!= c->nr_blocks || bad_blocks == c->nr_blocks) ) { - printk(KERN_NOTICE "Cowardly refusing to erase blocks on filesystem with no valid JFFS2 nodes\n"); - printk(KERN_NOTICE "empty_blocks %d, bad_blocks %d, c->nr_blocks %d\n",empty_blocks,bad_blocks,c->nr_blocks); + pr_notice("Cowardly refusing to erase blocks on filesystem with no valid JFFS2 nodes\n"); + pr_notice("empty_blocks %d, bad_blocks %d, c->nr_blocks %d\n", + empty_blocks, bad_blocks, c->nr_blocks); ret = -EIO; goto out; } @@ -287,11 +292,13 @@ static int jffs2_fill_scan_buf(struct jffs2_sb_info *c, void *buf, ret = jffs2_flash_read(c, ofs, len, &retlen, buf); if (ret) { - D1(printk(KERN_WARNING "mtd->read(0x%x bytes from 0x%x) returned %d\n", len, ofs, ret)); + jffs2_dbg(1, "mtd->read(0x%x bytes from 0x%x) returned %d\n", + len, ofs, ret); return ret; } if (retlen < len) { - D1(printk(KERN_WARNING "Read at 0x%x gave only 0x%zx bytes\n", ofs, retlen)); + jffs2_dbg(1, "Read at 0x%x gave only 0x%zx bytes\n", + ofs, retlen); return -EIO; } return 0; @@ -368,7 +375,7 @@ static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_erasebloc if (jffs2_sum_active()) jffs2_sum_add_xattr_mem(s, rx, ofs - jeb->offset); - dbg_xattr("scaning xdatum at %#08x (xid=%u, version=%u)\n", + dbg_xattr("scanning xdatum at %#08x (xid=%u, version=%u)\n", ofs, xd->xid, xd->version); return 0; } @@ -449,7 +456,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo ofs = jeb->offset; prevofs = jeb->offset - 1; - D1(printk(KERN_DEBUG "jffs2_scan_eraseblock(): Scanning block at 0x%x\n", ofs)); + jffs2_dbg(1, "%s(): Scanning block at 0x%x\n", __func__, ofs); #ifdef CONFIG_JFFS2_FS_WRITEBUFFER if (jffs2_cleanmarker_oob(c)) { @@ -459,7 +466,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo return BLK_STATE_BADBLOCK; ret = jffs2_check_nand_cleanmarker(c, jeb); - D2(printk(KERN_NOTICE "jffs_check_nand_cleanmarker returned %d\n",ret)); + jffs2_dbg(2, "jffs_check_nand_cleanmarker returned %d\n", ret); /* Even if it's not found, we still scan to see if the block is empty. We use this information @@ -561,7 +568,8 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo if (jffs2_cleanmarker_oob(c)) { /* scan oob, take care of cleanmarker */ int ret = jffs2_check_oob_empty(c, jeb, cleanmarkerfound); - D2(printk(KERN_NOTICE "jffs2_check_oob_empty returned %d\n",ret)); + jffs2_dbg(2, "jffs2_check_oob_empty returned %d\n", + ret); switch (ret) { case 0: return cleanmarkerfound ? BLK_STATE_CLEANMARKER : BLK_STATE_ALLFF; case 1: return BLK_STATE_ALLDIRTY; @@ -569,15 +577,16 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo } } #endif - D1(printk(KERN_DEBUG "Block at 0x%08x is empty (erased)\n", jeb->offset)); + jffs2_dbg(1, "Block at 0x%08x is empty (erased)\n", + jeb->offset); if (c->cleanmarker_size == 0) return BLK_STATE_CLEANMARKER; /* don't bother with re-erase */ else return BLK_STATE_ALLFF; /* OK to erase if all blocks are like this */ } if (ofs) { - D1(printk(KERN_DEBUG "Free space at %08x ends at %08x\n", jeb->offset, - jeb->offset + ofs)); + jffs2_dbg(1, "Free space at %08x ends at %08x\n", jeb->offset, + jeb->offset + ofs); if ((err = jffs2_prealloc_raw_node_refs(c, jeb, 1))) return err; if ((err = jffs2_scan_dirty_space(c, jeb, ofs))) @@ -604,12 +613,13 @@ scan_more: cond_resched(); if (ofs & 3) { - printk(KERN_WARNING "Eep. ofs 0x%08x not word-aligned!\n", ofs); + pr_warn("Eep. ofs 0x%08x not word-aligned!\n", ofs); ofs = PAD(ofs); continue; } if (ofs == prevofs) { - printk(KERN_WARNING "ofs 0x%08x has already been seen. Skipping\n", ofs); + pr_warn("ofs 0x%08x has already been seen. Skipping\n", + ofs); if ((err = jffs2_scan_dirty_space(c, jeb, 4))) return err; ofs += 4; @@ -618,8 +628,10 @@ scan_more: prevofs = ofs; if (jeb->offset + c->sector_size < ofs + sizeof(*node)) { - D1(printk(KERN_DEBUG "Fewer than %zd bytes left to end of block. (%x+%x<%x+%zx) Not reading\n", sizeof(struct jffs2_unknown_node), - jeb->offset, c->sector_size, ofs, sizeof(*node))); + jffs2_dbg(1, "Fewer than %zd bytes left to end of block. (%x+%x<%x+%zx) Not reading\n", + sizeof(struct jffs2_unknown_node), + jeb->offset, c->sector_size, ofs, + sizeof(*node)); if ((err = jffs2_scan_dirty_space(c, jeb, (jeb->offset + c->sector_size)-ofs))) return err; break; @@ -627,8 +639,9 @@ scan_more: if (buf_ofs + buf_len < ofs + sizeof(*node)) { buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); - D1(printk(KERN_DEBUG "Fewer than %zd bytes (node header) left to end of buf. Reading 0x%x at 0x%08x\n", - sizeof(struct jffs2_unknown_node), buf_len, ofs)); + jffs2_dbg(1, "Fewer than %zd bytes (node header) left to end of buf. Reading 0x%x at 0x%08x\n", + sizeof(struct jffs2_unknown_node), + buf_len, ofs); err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); if (err) return err; @@ -645,13 +658,13 @@ scan_more: ofs += 4; scan_end = min_t(uint32_t, EMPTY_SCAN_SIZE(c->sector_size)/8, buf_len); - D1(printk(KERN_DEBUG "Found empty flash at 0x%08x\n", ofs)); + jffs2_dbg(1, "Found empty flash at 0x%08x\n", ofs); more_empty: inbuf_ofs = ofs - buf_ofs; while (inbuf_ofs < scan_end) { if (unlikely(*(uint32_t *)(&buf[inbuf_ofs]) != 0xffffffff)) { - printk(KERN_WARNING "Empty flash at 0x%08x ends at 0x%08x\n", - empty_start, ofs); + pr_warn("Empty flash at 0x%08x ends at 0x%08x\n", + empty_start, ofs); if ((err = jffs2_scan_dirty_space(c, jeb, ofs-empty_start))) return err; goto scan_more; @@ -661,13 +674,15 @@ scan_more: ofs += 4; } /* Ran off end. */ - D1(printk(KERN_DEBUG "Empty flash to end of buffer at 0x%08x\n", ofs)); + jffs2_dbg(1, "Empty flash to end of buffer at 0x%08x\n", + ofs); /* If we're only checking the beginning of a block with a cleanmarker, bail now */ if (buf_ofs == jeb->offset && jeb->used_size == PAD(c->cleanmarker_size) && c->cleanmarker_size && !jeb->dirty_size && !ref_next(jeb->first_node)) { - D1(printk(KERN_DEBUG "%d bytes at start of block seems clean... assuming all clean\n", EMPTY_SCAN_SIZE(c->sector_size))); + jffs2_dbg(1, "%d bytes at start of block seems clean... assuming all clean\n", + EMPTY_SCAN_SIZE(c->sector_size)); return BLK_STATE_CLEANMARKER; } if (!buf_size && (scan_end != buf_len)) {/* XIP/point case */ @@ -680,13 +695,14 @@ scan_more: if (!buf_len) { /* No more to read. Break out of main loop without marking this range of empty space as dirty (because it's not) */ - D1(printk(KERN_DEBUG "Empty flash at %08x runs to end of block. Treating as free_space\n", - empty_start)); + jffs2_dbg(1, "Empty flash at %08x runs to end of block. Treating as free_space\n", + empty_start); break; } /* point never reaches here */ scan_end = buf_len; - D1(printk(KERN_DEBUG "Reading another 0x%x at 0x%08x\n", buf_len, ofs)); + jffs2_dbg(1, "Reading another 0x%x at 0x%08x\n", + buf_len, ofs); err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); if (err) return err; @@ -695,22 +711,23 @@ scan_more: } if (ofs == jeb->offset && je16_to_cpu(node->magic) == KSAMTIB_CIGAM_2SFFJ) { - printk(KERN_WARNING "Magic bitmask is backwards at offset 0x%08x. Wrong endian filesystem?\n", ofs); + pr_warn("Magic bitmask is backwards at offset 0x%08x. Wrong endian filesystem?\n", + ofs); if ((err = jffs2_scan_dirty_space(c, jeb, 4))) return err; ofs += 4; continue; } if (je16_to_cpu(node->magic) == JFFS2_DIRTY_BITMASK) { - D1(printk(KERN_DEBUG "Dirty bitmask at 0x%08x\n", ofs)); + jffs2_dbg(1, "Dirty bitmask at 0x%08x\n", ofs); if ((err = jffs2_scan_dirty_space(c, jeb, 4))) return err; ofs += 4; continue; } if (je16_to_cpu(node->magic) == JFFS2_OLD_MAGIC_BITMASK) { - printk(KERN_WARNING "Old JFFS2 bitmask found at 0x%08x\n", ofs); - printk(KERN_WARNING "You cannot use older JFFS2 filesystems with newer kernels\n"); + pr_warn("Old JFFS2 bitmask found at 0x%08x\n", ofs); + pr_warn("You cannot use older JFFS2 filesystems with newer kernels\n"); if ((err = jffs2_scan_dirty_space(c, jeb, 4))) return err; ofs += 4; @@ -718,7 +735,8 @@ scan_more: } if (je16_to_cpu(node->magic) != JFFS2_MAGIC_BITMASK) { /* OK. We're out of possibilities. Whinge and move on */ - noisy_printk(&noise, "jffs2_scan_eraseblock(): Magic bitmask 0x%04x not found at 0x%08x: 0x%04x instead\n", + noisy_printk(&noise, "%s(): Magic bitmask 0x%04x not found at 0x%08x: 0x%04x instead\n", + __func__, JFFS2_MAGIC_BITMASK, ofs, je16_to_cpu(node->magic)); if ((err = jffs2_scan_dirty_space(c, jeb, 4))) @@ -733,7 +751,8 @@ scan_more: hdr_crc = crc32(0, &crcnode, sizeof(crcnode)-4); if (hdr_crc != je32_to_cpu(node->hdr_crc)) { - noisy_printk(&noise, "jffs2_scan_eraseblock(): Node at 0x%08x {0x%04x, 0x%04x, 0x%08x) has invalid CRC 0x%08x (calculated 0x%08x)\n", + noisy_printk(&noise, "%s(): Node at 0x%08x {0x%04x, 0x%04x, 0x%08x) has invalid CRC 0x%08x (calculated 0x%08x)\n", + __func__, ofs, je16_to_cpu(node->magic), je16_to_cpu(node->nodetype), je32_to_cpu(node->totlen), @@ -747,9 +766,9 @@ scan_more: if (ofs + je32_to_cpu(node->totlen) > jeb->offset + c->sector_size) { /* Eep. Node goes over the end of the erase block. */ - printk(KERN_WARNING "Node at 0x%08x with length 0x%08x would run over the end of the erase block\n", - ofs, je32_to_cpu(node->totlen)); - printk(KERN_WARNING "Perhaps the file system was created with the wrong erase size?\n"); + pr_warn("Node at 0x%08x with length 0x%08x would run over the end of the erase block\n", + ofs, je32_to_cpu(node->totlen)); + pr_warn("Perhaps the file system was created with the wrong erase size?\n"); if ((err = jffs2_scan_dirty_space(c, jeb, 4))) return err; ofs += 4; @@ -758,7 +777,8 @@ scan_more: if (!(je16_to_cpu(node->nodetype) & JFFS2_NODE_ACCURATE)) { /* Wheee. This is an obsoleted node */ - D2(printk(KERN_DEBUG "Node at 0x%08x is obsolete. Skipping\n", ofs)); + jffs2_dbg(2, "Node at 0x%08x is obsolete. Skipping\n", + ofs); if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen))))) return err; ofs += PAD(je32_to_cpu(node->totlen)); @@ -769,8 +789,9 @@ scan_more: case JFFS2_NODETYPE_INODE: if (buf_ofs + buf_len < ofs + sizeof(struct jffs2_raw_inode)) { buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); - D1(printk(KERN_DEBUG "Fewer than %zd bytes (inode node) left to end of buf. Reading 0x%x at 0x%08x\n", - sizeof(struct jffs2_raw_inode), buf_len, ofs)); + jffs2_dbg(1, "Fewer than %zd bytes (inode node) left to end of buf. Reading 0x%x at 0x%08x\n", + sizeof(struct jffs2_raw_inode), + buf_len, ofs); err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); if (err) return err; @@ -785,8 +806,9 @@ scan_more: case JFFS2_NODETYPE_DIRENT: if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) { buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); - D1(printk(KERN_DEBUG "Fewer than %d bytes (dirent node) left to end of buf. Reading 0x%x at 0x%08x\n", - je32_to_cpu(node->totlen), buf_len, ofs)); + jffs2_dbg(1, "Fewer than %d bytes (dirent node) left to end of buf. Reading 0x%x at 0x%08x\n", + je32_to_cpu(node->totlen), buf_len, + ofs); err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); if (err) return err; @@ -802,9 +824,9 @@ scan_more: case JFFS2_NODETYPE_XATTR: if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) { buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); - D1(printk(KERN_DEBUG "Fewer than %d bytes (xattr node)" - " left to end of buf. Reading 0x%x at 0x%08x\n", - je32_to_cpu(node->totlen), buf_len, ofs)); + jffs2_dbg(1, "Fewer than %d bytes (xattr node) left to end of buf. Reading 0x%x at 0x%08x\n", + je32_to_cpu(node->totlen), buf_len, + ofs); err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); if (err) return err; @@ -819,9 +841,9 @@ scan_more: case JFFS2_NODETYPE_XREF: if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) { buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs); - D1(printk(KERN_DEBUG "Fewer than %d bytes (xref node)" - " left to end of buf. Reading 0x%x at 0x%08x\n", - je32_to_cpu(node->totlen), buf_len, ofs)); + jffs2_dbg(1, "Fewer than %d bytes (xref node) left to end of buf. Reading 0x%x at 0x%08x\n", + je32_to_cpu(node->totlen), buf_len, + ofs); err = jffs2_fill_scan_buf(c, buf, ofs, buf_len); if (err) return err; @@ -836,15 +858,17 @@ scan_more: #endif /* CONFIG_JFFS2_FS_XATTR */ case JFFS2_NODETYPE_CLEANMARKER: - D1(printk(KERN_DEBUG "CLEANMARKER node found at 0x%08x\n", ofs)); + jffs2_dbg(1, "CLEANMARKER node found at 0x%08x\n", ofs); if (je32_to_cpu(node->totlen) != c->cleanmarker_size) { - printk(KERN_NOTICE "CLEANMARKER node found at 0x%08x has totlen 0x%x != normal 0x%x\n", - ofs, je32_to_cpu(node->totlen), c->cleanmarker_size); + pr_notice("CLEANMARKER node found at 0x%08x has totlen 0x%x != normal 0x%x\n", + ofs, je32_to_cpu(node->totlen), + c->cleanmarker_size); if ((err = jffs2_scan_dirty_space(c, jeb, PAD(sizeof(struct jffs2_unknown_node))))) return err; ofs += PAD(sizeof(struct jffs2_unknown_node)); } else if (jeb->first_node) { - printk(KERN_NOTICE "CLEANMARKER node found at 0x%08x, not first node in block (0x%08x)\n", ofs, jeb->offset); + pr_notice("CLEANMARKER node found at 0x%08x, not first node in block (0x%08x)\n", + ofs, jeb->offset); if ((err = jffs2_scan_dirty_space(c, jeb, PAD(sizeof(struct jffs2_unknown_node))))) return err; ofs += PAD(sizeof(struct jffs2_unknown_node)); @@ -866,7 +890,8 @@ scan_more: default: switch (je16_to_cpu(node->nodetype) & JFFS2_COMPAT_MASK) { case JFFS2_FEATURE_ROCOMPAT: - printk(KERN_NOTICE "Read-only compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs); + pr_notice("Read-only compatible feature node (0x%04x) found at offset 0x%08x\n", + je16_to_cpu(node->nodetype), ofs); c->flags |= JFFS2_SB_FLAG_RO; if (!(jffs2_is_readonly(c))) return -EROFS; @@ -876,18 +901,21 @@ scan_more: break; case JFFS2_FEATURE_INCOMPAT: - printk(KERN_NOTICE "Incompatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs); + pr_notice("Incompatible feature node (0x%04x) found at offset 0x%08x\n", + je16_to_cpu(node->nodetype), ofs); return -EINVAL; case JFFS2_FEATURE_RWCOMPAT_DELETE: - D1(printk(KERN_NOTICE "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs)); + jffs2_dbg(1, "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", + je16_to_cpu(node->nodetype), ofs); if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen))))) return err; ofs += PAD(je32_to_cpu(node->totlen)); break; case JFFS2_FEATURE_RWCOMPAT_COPY: { - D1(printk(KERN_NOTICE "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs)); + jffs2_dbg(1, "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", + je16_to_cpu(node->nodetype), ofs); jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(node->totlen)), NULL); @@ -908,8 +936,9 @@ scan_more: } } - D1(printk(KERN_DEBUG "Block at 0x%08x: free 0x%08x, dirty 0x%08x, unchecked 0x%08x, used 0x%08x, wasted 0x%08x\n", - jeb->offset,jeb->free_size, jeb->dirty_size, jeb->unchecked_size, jeb->used_size, jeb->wasted_size)); + jffs2_dbg(1, "Block at 0x%08x: free 0x%08x, dirty 0x%08x, unchecked 0x%08x, used 0x%08x, wasted 0x%08x\n", + jeb->offset, jeb->free_size, jeb->dirty_size, + jeb->unchecked_size, jeb->used_size, jeb->wasted_size); /* mark_node_obsolete can add to wasted !! */ if (jeb->wasted_size) { @@ -935,7 +964,7 @@ struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uin ic = jffs2_alloc_inode_cache(); if (!ic) { - printk(KERN_NOTICE "jffs2_scan_make_inode_cache(): allocation of inode cache failed\n"); + pr_notice("%s(): allocation of inode cache failed\n", __func__); return NULL; } memset(ic, 0, sizeof(*ic)); @@ -954,7 +983,7 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc struct jffs2_inode_cache *ic; uint32_t crc, ino = je32_to_cpu(ri->ino); - D1(printk(KERN_DEBUG "jffs2_scan_inode_node(): Node at 0x%08x\n", ofs)); + jffs2_dbg(1, "%s(): Node at 0x%08x\n", __func__, ofs); /* We do very little here now. Just check the ino# to which we should attribute this node; we can do all the CRC checking etc. later. There's a tradeoff here -- @@ -968,9 +997,8 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc /* Check the node CRC in any case. */ crc = crc32(0, ri, sizeof(*ri)-8); if (crc != je32_to_cpu(ri->node_crc)) { - printk(KERN_NOTICE "jffs2_scan_inode_node(): CRC failed on " - "node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", - ofs, je32_to_cpu(ri->node_crc), crc); + pr_notice("%s(): CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + __func__, ofs, je32_to_cpu(ri->node_crc), crc); /* * We believe totlen because the CRC on the node * _header_ was OK, just the node itself failed. @@ -989,10 +1017,10 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc /* Wheee. It worked */ jffs2_link_node_ref(c, jeb, ofs | REF_UNCHECKED, PAD(je32_to_cpu(ri->totlen)), ic); - D1(printk(KERN_DEBUG "Node is ino #%u, version %d. Range 0x%x-0x%x\n", + jffs2_dbg(1, "Node is ino #%u, version %d. Range 0x%x-0x%x\n", je32_to_cpu(ri->ino), je32_to_cpu(ri->version), je32_to_cpu(ri->offset), - je32_to_cpu(ri->offset)+je32_to_cpu(ri->dsize))); + je32_to_cpu(ri->offset)+je32_to_cpu(ri->dsize)); pseudo_random += je32_to_cpu(ri->version); @@ -1012,15 +1040,15 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo uint32_t crc; int err; - D1(printk(KERN_DEBUG "jffs2_scan_dirent_node(): Node at 0x%08x\n", ofs)); + jffs2_dbg(1, "%s(): Node at 0x%08x\n", __func__, ofs); /* We don't get here unless the node is still valid, so we don't have to mask in the ACCURATE bit any more. */ crc = crc32(0, rd, sizeof(*rd)-8); if (crc != je32_to_cpu(rd->node_crc)) { - printk(KERN_NOTICE "jffs2_scan_dirent_node(): Node CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", - ofs, je32_to_cpu(rd->node_crc), crc); + pr_notice("%s(): Node CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + __func__, ofs, je32_to_cpu(rd->node_crc), crc); /* We believe totlen because the CRC on the node _header_ was OK, just the node itself failed. */ if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rd->totlen))))) return err; @@ -1032,7 +1060,7 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo /* Should never happen. Did. (OLPC trac #4184)*/ checkedlen = strnlen(rd->name, rd->nsize); if (checkedlen < rd->nsize) { - printk(KERN_ERR "Dirent at %08x has zeroes in name. Truncating to %d chars\n", + pr_err("Dirent at %08x has zeroes in name. Truncating to %d chars\n", ofs, checkedlen); } fd = jffs2_alloc_full_dirent(checkedlen+1); @@ -1044,9 +1072,10 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo crc = crc32(0, fd->name, rd->nsize); if (crc != je32_to_cpu(rd->name_crc)) { - printk(KERN_NOTICE "jffs2_scan_dirent_node(): Name CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", - ofs, je32_to_cpu(rd->name_crc), crc); - D1(printk(KERN_NOTICE "Name for which CRC failed is (now) '%s', ino #%d\n", fd->name, je32_to_cpu(rd->ino))); + pr_notice("%s(): Name CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", + __func__, ofs, je32_to_cpu(rd->name_crc), crc); + jffs2_dbg(1, "Name for which CRC failed is (now) '%s', ino #%d\n", + fd->name, je32_to_cpu(rd->ino)); jffs2_free_full_dirent(fd); /* FIXME: Why do we believe totlen? */ /* We believe totlen because the CRC on the node _header_ was OK, just the name failed. */ diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index 0f20208df60..aca97f35b29 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -23,8 +23,8 @@ #include "nodelist.h" /* ---- Initial Security Label(s) Attachment callback --- */ -int jffs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) +static int jffs2_initxattrs(struct inode *inode, + const struct xattr *xattr_array, void *fs_info) { const struct xattr *xattr; int err = 0; diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c index e537fb0e018..c522d098bb4 100644 --- a/fs/jffs2/summary.c +++ b/fs/jffs2/summary.c @@ -11,6 +11,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/slab.h> #include <linux/mtd/mtd.h> @@ -442,13 +444,16 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras /* This should never happen, but https://dev.laptop.org/ticket/4184 */ checkedlen = strnlen(spd->name, spd->nsize); if (!checkedlen) { - printk(KERN_ERR "Dirent at %08x has zero at start of name. Aborting mount.\n", - jeb->offset + je32_to_cpu(spd->offset)); + pr_err("Dirent at %08x has zero at start of name. Aborting mount.\n", + jeb->offset + + je32_to_cpu(spd->offset)); return -EIO; } if (checkedlen < spd->nsize) { - printk(KERN_ERR "Dirent at %08x has zeroes in name. Truncating to %d chars\n", - jeb->offset + je32_to_cpu(spd->offset), checkedlen); + pr_err("Dirent at %08x has zeroes in name. Truncating to %d chars\n", + jeb->offset + + je32_to_cpu(spd->offset), + checkedlen); } @@ -808,8 +813,7 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock sum_ofs = jeb->offset + c->sector_size - jeb->free_size; - dbg_summary("JFFS2: writing out data to flash to pos : 0x%08x\n", - sum_ofs); + dbg_summary("writing out data to flash to pos : 0x%08x\n", sum_ofs); ret = jffs2_flash_writev(c, vecs, 2, sum_ofs, &retlen, 0); diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index f2d96b5e64f..f9916f312bd 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -9,6 +9,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> @@ -69,7 +71,7 @@ static void jffs2_write_super(struct super_block *sb) sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) { - D1(printk(KERN_DEBUG "jffs2_write_super()\n")); + jffs2_dbg(1, "%s()\n", __func__); jffs2_flush_wbuf_gc(c, 0); } @@ -214,8 +216,8 @@ static int jffs2_parse_options(struct jffs2_sb_info *c, char *data) JFFS2_COMPR_MODE_FORCEZLIB; #endif else { - printk(KERN_ERR "JFFS2 Error: unknown compressor \"%s\"", - name); + pr_err("Error: unknown compressor \"%s\"\n", + name); kfree(name); return -EINVAL; } @@ -223,8 +225,8 @@ static int jffs2_parse_options(struct jffs2_sb_info *c, char *data) c->mount_opts.override_compr = true; break; default: - printk(KERN_ERR "JFFS2 Error: unrecognized mount option '%s' or missing value\n", - p); + pr_err("Error: unrecognized mount option '%s' or missing value\n", + p); return -EINVAL; } } @@ -266,9 +268,9 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) struct jffs2_sb_info *c; int ret; - D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():" + jffs2_dbg(1, "jffs2_get_sb_mtd():" " New superblock for device %d (\"%s\")\n", - sb->s_mtd->index, sb->s_mtd->name)); + sb->s_mtd->index, sb->s_mtd->name); c = kzalloc(sizeof(*c), GFP_KERNEL); if (!c) @@ -315,7 +317,7 @@ static void jffs2_put_super (struct super_block *sb) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); - D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n")); + jffs2_dbg(2, "%s()\n", __func__); if (sb->s_dirt) jffs2_write_super(sb); @@ -336,7 +338,7 @@ static void jffs2_put_super (struct super_block *sb) kfree(c->inocache_list); jffs2_clear_xattr_subsystem(c); mtd_sync(c->mtd); - D1(printk(KERN_DEBUG "jffs2_put_super returning\n")); + jffs2_dbg(1, "%s(): returning\n", __func__); } static void jffs2_kill_sb(struct super_block *sb) @@ -371,7 +373,7 @@ static int __init init_jffs2_fs(void) BUILD_BUG_ON(sizeof(struct jffs2_raw_inode) != 68); BUILD_BUG_ON(sizeof(struct jffs2_raw_summary) != 32); - printk(KERN_INFO "JFFS2 version 2.2." + pr_info("version 2.2." #ifdef CONFIG_JFFS2_FS_WRITEBUFFER " (NAND)" #endif @@ -386,22 +388,22 @@ static int __init init_jffs2_fs(void) SLAB_MEM_SPREAD), jffs2_i_init_once); if (!jffs2_inode_cachep) { - printk(KERN_ERR "JFFS2 error: Failed to initialise inode cache\n"); + pr_err("error: Failed to initialise inode cache\n"); return -ENOMEM; } ret = jffs2_compressors_init(); if (ret) { - printk(KERN_ERR "JFFS2 error: Failed to initialise compressors\n"); + pr_err("error: Failed to initialise compressors\n"); goto out; } ret = jffs2_create_slab_caches(); if (ret) { - printk(KERN_ERR "JFFS2 error: Failed to initialise slab caches\n"); + pr_err("error: Failed to initialise slab caches\n"); goto out_compressors; } ret = register_filesystem(&jffs2_fs_type); if (ret) { - printk(KERN_ERR "JFFS2 error: Failed to register filesystem\n"); + pr_err("error: Failed to register filesystem\n"); goto out_slab; } return 0; diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index e3035afb181..6e563332bb2 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -9,6 +9,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/fs.h> #include <linux/namei.h> @@ -47,10 +49,11 @@ static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd) */ if (!p) { - printk(KERN_ERR "jffs2_follow_link(): can't find symlink target\n"); + pr_err("%s(): can't find symlink target\n", __func__); p = ERR_PTR(-EIO); } - D1(printk(KERN_DEBUG "jffs2_follow_link(): target path is '%s'\n", (char *) f->target)); + jffs2_dbg(1, "%s(): target path is '%s'\n", + __func__, (char *)f->target); nd_set_link(nd, p); diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 30e8f47e8a2..74d9be19df3 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -11,6 +11,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/slab.h> #include <linux/mtd/mtd.h> @@ -91,7 +93,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino) new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) { - D1(printk(KERN_DEBUG "No memory to allocate inodirty. Fallback to all considered dirty\n")); + jffs2_dbg(1, "No memory to allocate inodirty. Fallback to all considered dirty\n"); jffs2_clear_wbuf_ino_list(c); c->wbuf_inodes = &inodirty_nomem; return; @@ -113,19 +115,20 @@ static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c) list_for_each_safe(this, next, &c->erasable_pending_wbuf_list) { struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); - D1(printk(KERN_DEBUG "Removing eraseblock at 0x%08x from erasable_pending_wbuf_list...\n", jeb->offset)); + jffs2_dbg(1, "Removing eraseblock at 0x%08x from erasable_pending_wbuf_list...\n", + jeb->offset); list_del(this); if ((jiffies + (n++)) & 127) { /* Most of the time, we just erase it immediately. Otherwise we spend ages scanning it on mount, etc. */ - D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); + jffs2_dbg(1, "...and adding to erase_pending_list\n"); list_add_tail(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; jffs2_garbage_collect_trigger(c); } else { /* Sometimes, however, we leave it elsewhere so it doesn't get immediately reused, and we spread the load a bit. */ - D1(printk(KERN_DEBUG "...and adding to erasable_list\n")); + jffs2_dbg(1, "...and adding to erasable_list\n"); list_add_tail(&jeb->list, &c->erasable_list); } } @@ -136,7 +139,7 @@ static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c) static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, int allow_empty) { - D1(printk("About to refile bad block at %08x\n", jeb->offset)); + jffs2_dbg(1, "About to refile bad block at %08x\n", jeb->offset); /* File the existing block on the bad_used_list.... */ if (c->nextblock == jeb) @@ -144,12 +147,14 @@ static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock else /* Not sure this should ever happen... need more coffee */ list_del(&jeb->list); if (jeb->first_node) { - D1(printk("Refiling block at %08x to bad_used_list\n", jeb->offset)); + jffs2_dbg(1, "Refiling block at %08x to bad_used_list\n", + jeb->offset); list_add(&jeb->list, &c->bad_used_list); } else { BUG_ON(allow_empty == REFILE_NOTEMPTY); /* It has to have had some nodes or we couldn't be here */ - D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset)); + jffs2_dbg(1, "Refiling block at %08x to erase_pending_list\n", + jeb->offset); list_add(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; jffs2_garbage_collect_trigger(c); @@ -230,10 +235,12 @@ static int jffs2_verify_write(struct jffs2_sb_info *c, unsigned char *buf, ret = mtd_read(c->mtd, ofs, c->wbuf_pagesize, &retlen, c->wbuf_verify); if (ret && ret != -EUCLEAN && ret != -EBADMSG) { - printk(KERN_WARNING "jffs2_verify_write(): Read back of page at %08x failed: %d\n", c->wbuf_ofs, ret); + pr_warn("%s(): Read back of page at %08x failed: %d\n", + __func__, c->wbuf_ofs, ret); return ret; } else if (retlen != c->wbuf_pagesize) { - printk(KERN_WARNING "jffs2_verify_write(): Read back of page at %08x gave short read: %zd not %d.\n", ofs, retlen, c->wbuf_pagesize); + pr_warn("%s(): Read back of page at %08x gave short read: %zd not %d\n", + __func__, ofs, retlen, c->wbuf_pagesize); return -EIO; } if (!memcmp(buf, c->wbuf_verify, c->wbuf_pagesize)) @@ -246,12 +253,12 @@ static int jffs2_verify_write(struct jffs2_sb_info *c, unsigned char *buf, else eccstr = "OK or unused"; - printk(KERN_WARNING "Write verify error (ECC %s) at %08x. Wrote:\n", - eccstr, c->wbuf_ofs); + pr_warn("Write verify error (ECC %s) at %08x. Wrote:\n", + eccstr, c->wbuf_ofs); print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, c->wbuf, c->wbuf_pagesize, 0); - printk(KERN_WARNING "Read back:\n"); + pr_warn("Read back:\n"); print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, c->wbuf_verify, c->wbuf_pagesize, 0); @@ -308,7 +315,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) if (!first_raw) { /* All nodes were obsolete. Nothing to recover. */ - D1(printk(KERN_DEBUG "No non-obsolete nodes to be recovered. Just filing block bad\n")); + jffs2_dbg(1, "No non-obsolete nodes to be recovered. Just filing block bad\n"); c->wbuf_len = 0; return; } @@ -331,7 +338,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) buf = kmalloc(end - start, GFP_KERNEL); if (!buf) { - printk(KERN_CRIT "Malloc failure in wbuf recovery. Data loss ensues.\n"); + pr_crit("Malloc failure in wbuf recovery. Data loss ensues.\n"); goto read_failed; } @@ -346,7 +353,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) ret = 0; if (ret || retlen != c->wbuf_ofs - start) { - printk(KERN_CRIT "Old data are already lost in wbuf recovery. Data loss ensues.\n"); + pr_crit("Old data are already lost in wbuf recovery. Data loss ensues.\n"); kfree(buf); buf = NULL; @@ -380,7 +387,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) /* ... and get an allocation of space from a shiny new block instead */ ret = jffs2_reserve_space_gc(c, end-start, &len, JFFS2_SUMMARY_NOSUM_SIZE); if (ret) { - printk(KERN_WARNING "Failed to allocate space for wbuf recovery. Data loss ensues.\n"); + pr_warn("Failed to allocate space for wbuf recovery. Data loss ensues.\n"); kfree(buf); return; } @@ -390,7 +397,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, nr_refile); if (ret) { - printk(KERN_WARNING "Failed to allocate node refs for wbuf recovery. Data loss ensues.\n"); + pr_warn("Failed to allocate node refs for wbuf recovery. Data loss ensues.\n"); kfree(buf); return; } @@ -406,13 +413,13 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) unsigned char *rewrite_buf = buf?:c->wbuf; uint32_t towrite = (end-start) - ((end-start)%c->wbuf_pagesize); - D1(printk(KERN_DEBUG "Write 0x%x bytes at 0x%08x in wbuf recover\n", - towrite, ofs)); + jffs2_dbg(1, "Write 0x%x bytes at 0x%08x in wbuf recover\n", + towrite, ofs); #ifdef BREAKMEHEADER static int breakme; if (breakme++ == 20) { - printk(KERN_NOTICE "Faking write error at 0x%08x\n", ofs); + pr_notice("Faking write error at 0x%08x\n", ofs); breakme = 0; mtd_write(c->mtd, ofs, towrite, &retlen, brokenbuf); ret = -EIO; @@ -423,7 +430,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) if (ret || retlen != towrite || jffs2_verify_write(c, rewrite_buf, ofs)) { /* Argh. We tried. Really we did. */ - printk(KERN_CRIT "Recovery of wbuf failed due to a second write error\n"); + pr_crit("Recovery of wbuf failed due to a second write error\n"); kfree(buf); if (retlen) @@ -431,7 +438,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) return; } - printk(KERN_NOTICE "Recovery of wbuf succeeded to %08x\n", ofs); + pr_notice("Recovery of wbuf succeeded to %08x\n", ofs); c->wbuf_len = (end - start) - towrite; c->wbuf_ofs = ofs + towrite; @@ -459,8 +466,8 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) struct jffs2_raw_node_ref **adjust_ref = NULL; struct jffs2_inode_info *f = NULL; - D1(printk(KERN_DEBUG "Refiling block of %08x at %08x(%d) to %08x\n", - rawlen, ref_offset(raw), ref_flags(raw), ofs)); + jffs2_dbg(1, "Refiling block of %08x at %08x(%d) to %08x\n", + rawlen, ref_offset(raw), ref_flags(raw), ofs); ic = jffs2_raw_ref_to_ic(raw); @@ -540,7 +547,8 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) /* Fix up the original jeb now it's on the bad_list */ if (first_raw == jeb->first_node) { - D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); + jffs2_dbg(1, "Failing block at %08x is now empty. Moving to erase_pending_list\n", + jeb->offset); list_move(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; jffs2_garbage_collect_trigger(c); @@ -554,7 +562,8 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) spin_unlock(&c->erase_completion_lock); - D1(printk(KERN_DEBUG "wbuf recovery completed OK. wbuf_ofs 0x%08x, len 0x%x\n", c->wbuf_ofs, c->wbuf_len)); + jffs2_dbg(1, "wbuf recovery completed OK. wbuf_ofs 0x%08x, len 0x%x\n", + c->wbuf_ofs, c->wbuf_len); } @@ -579,7 +588,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) return 0; if (!mutex_is_locked(&c->alloc_sem)) { - printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n"); + pr_crit("jffs2_flush_wbuf() called with alloc_sem not locked!\n"); BUG(); } @@ -617,7 +626,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) #ifdef BREAKME static int breakme; if (breakme++ == 20) { - printk(KERN_NOTICE "Faking write error at 0x%08x\n", c->wbuf_ofs); + pr_notice("Faking write error at 0x%08x\n", c->wbuf_ofs); breakme = 0; mtd_write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, brokenbuf); @@ -629,11 +638,11 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) &retlen, c->wbuf); if (ret) { - printk(KERN_WARNING "jffs2_flush_wbuf(): Write failed with %d\n", ret); + pr_warn("jffs2_flush_wbuf(): Write failed with %d\n", ret); goto wfail; } else if (retlen != c->wbuf_pagesize) { - printk(KERN_WARNING "jffs2_flush_wbuf(): Write was short: %zd instead of %d\n", - retlen, c->wbuf_pagesize); + pr_warn("jffs2_flush_wbuf(): Write was short: %zd instead of %d\n", + retlen, c->wbuf_pagesize); ret = -EIO; goto wfail; } else if ((ret = jffs2_verify_write(c, c->wbuf, c->wbuf_ofs))) { @@ -647,17 +656,18 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) if (pad) { uint32_t waste = c->wbuf_pagesize - c->wbuf_len; - D1(printk(KERN_DEBUG "jffs2_flush_wbuf() adjusting free_size of %sblock at %08x\n", - (wbuf_jeb==c->nextblock)?"next":"", wbuf_jeb->offset)); + jffs2_dbg(1, "jffs2_flush_wbuf() adjusting free_size of %sblock at %08x\n", + (wbuf_jeb == c->nextblock) ? "next" : "", + wbuf_jeb->offset); /* wbuf_pagesize - wbuf_len is the amount of space that's to be padded. If there is less free space in the block than that, something screwed up */ if (wbuf_jeb->free_size < waste) { - printk(KERN_CRIT "jffs2_flush_wbuf(): Accounting error. wbuf at 0x%08x has 0x%03x bytes, 0x%03x left.\n", - c->wbuf_ofs, c->wbuf_len, waste); - printk(KERN_CRIT "jffs2_flush_wbuf(): But free_size for block at 0x%08x is only 0x%08x\n", - wbuf_jeb->offset, wbuf_jeb->free_size); + pr_crit("jffs2_flush_wbuf(): Accounting error. wbuf at 0x%08x has 0x%03x bytes, 0x%03x left.\n", + c->wbuf_ofs, c->wbuf_len, waste); + pr_crit("jffs2_flush_wbuf(): But free_size for block at 0x%08x is only 0x%08x\n", + wbuf_jeb->offset, wbuf_jeb->free_size); BUG(); } @@ -694,14 +704,14 @@ int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino) uint32_t old_wbuf_len; int ret = 0; - D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() called for ino #%u...\n", ino)); + jffs2_dbg(1, "jffs2_flush_wbuf_gc() called for ino #%u...\n", ino); if (!c->wbuf) return 0; mutex_lock(&c->alloc_sem); if (!jffs2_wbuf_pending_for_ino(c, ino)) { - D1(printk(KERN_DEBUG "Ino #%d not pending in wbuf. Returning\n", ino)); + jffs2_dbg(1, "Ino #%d not pending in wbuf. Returning\n", ino); mutex_unlock(&c->alloc_sem); return 0; } @@ -711,7 +721,8 @@ int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino) if (c->unchecked_size) { /* GC won't make any progress for a while */ - D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() padding. Not finished checking\n")); + jffs2_dbg(1, "%s(): padding. Not finished checking\n", + __func__); down_write(&c->wbuf_sem); ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING); /* retry flushing wbuf in case jffs2_wbuf_recover @@ -724,7 +735,7 @@ int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino) mutex_unlock(&c->alloc_sem); - D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() calls gc pass\n")); + jffs2_dbg(1, "%s(): calls gc pass\n", __func__); ret = jffs2_garbage_collect_pass(c); if (ret) { @@ -742,7 +753,7 @@ int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino) mutex_lock(&c->alloc_sem); } - D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() ends...\n")); + jffs2_dbg(1, "%s(): ends...\n", __func__); mutex_unlock(&c->alloc_sem); return ret; @@ -811,9 +822,8 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, if (SECTOR_ADDR(to) != SECTOR_ADDR(c->wbuf_ofs)) { /* It's a write to a new block */ if (c->wbuf_len) { - D1(printk(KERN_DEBUG "jffs2_flash_writev() to 0x%lx " - "causes flush of wbuf at 0x%08x\n", - (unsigned long)to, c->wbuf_ofs)); + jffs2_dbg(1, "%s(): to 0x%lx causes flush of wbuf at 0x%08x\n", + __func__, (unsigned long)to, c->wbuf_ofs); ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT); if (ret) goto outerr; @@ -825,11 +835,11 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, if (to != PAD(c->wbuf_ofs + c->wbuf_len)) { /* We're not writing immediately after the writebuffer. Bad. */ - printk(KERN_CRIT "jffs2_flash_writev(): Non-contiguous write " - "to %08lx\n", (unsigned long)to); + pr_crit("%s(): Non-contiguous write to %08lx\n", + __func__, (unsigned long)to); if (c->wbuf_len) - printk(KERN_CRIT "wbuf was previously %08x-%08x\n", - c->wbuf_ofs, c->wbuf_ofs+c->wbuf_len); + pr_crit("wbuf was previously %08x-%08x\n", + c->wbuf_ofs, c->wbuf_ofs + c->wbuf_len); BUG(); } @@ -957,8 +967,8 @@ int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *re if ( (ret == -EBADMSG || ret == -EUCLEAN) && (*retlen == len) ) { if (ret == -EBADMSG) - printk(KERN_WARNING "mtd->read(0x%zx bytes from 0x%llx)" - " returned ECC error\n", len, ofs); + pr_warn("mtd->read(0x%zx bytes from 0x%llx) returned ECC error\n", + len, ofs); /* * We have the raw data without ECC correction in the buffer, * maybe we are lucky and all data or parts are correct. We @@ -1034,9 +1044,8 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, ret = mtd_read_oob(c->mtd, jeb->offset, &ops); if (ret || ops.oobretlen != ops.ooblen) { - printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" - " bytes, read %zd bytes, error %d\n", - jeb->offset, ops.ooblen, ops.oobretlen, ret); + pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n", + jeb->offset, ops.ooblen, ops.oobretlen, ret); if (!ret) ret = -EIO; return ret; @@ -1048,8 +1057,8 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, continue; if (ops.oobbuf[i] != 0xFF) { - D2(printk(KERN_DEBUG "Found %02x at %x in OOB for " - "%08x\n", ops.oobbuf[i], i, jeb->offset)); + jffs2_dbg(2, "Found %02x at %x in OOB for " + "%08x\n", ops.oobbuf[i], i, jeb->offset); return 1; } } @@ -1077,9 +1086,8 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, ret = mtd_read_oob(c->mtd, jeb->offset, &ops); if (ret || ops.oobretlen != ops.ooblen) { - printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" - " bytes, read %zd bytes, error %d\n", - jeb->offset, ops.ooblen, ops.oobretlen, ret); + pr_err("cannot read OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n", + jeb->offset, ops.ooblen, ops.oobretlen, ret); if (!ret) ret = -EIO; return ret; @@ -1103,9 +1111,8 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, ret = mtd_write_oob(c->mtd, jeb->offset, &ops); if (ret || ops.oobretlen != ops.ooblen) { - printk(KERN_ERR "cannot write OOB for EB at %08x, requested %zd" - " bytes, read %zd bytes, error %d\n", - jeb->offset, ops.ooblen, ops.oobretlen, ret); + pr_err("cannot write OOB for EB at %08x, requested %zd bytes, read %zd bytes, error %d\n", + jeb->offset, ops.ooblen, ops.oobretlen, ret); if (!ret) ret = -EIO; return ret; @@ -1130,11 +1137,12 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock * if( ++jeb->bad_count < MAX_ERASE_FAILURES) return 0; - printk(KERN_WARNING "JFFS2: marking eraseblock at %08x\n as bad", bad_offset); + pr_warn("marking eraseblock at %08x as bad\n", bad_offset); ret = mtd_block_markbad(c->mtd, bad_offset); if (ret) { - D1(printk(KERN_WARNING "jffs2_write_nand_badblock(): Write failed for block at %08x: error %d\n", jeb->offset, ret)); + jffs2_dbg(1, "%s(): Write failed for block at %08x: error %d\n", + __func__, jeb->offset, ret); return ret; } return 1; @@ -1151,11 +1159,11 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c) c->cleanmarker_size = 0; if (!oinfo || oinfo->oobavail == 0) { - printk(KERN_ERR "inconsistent device description\n"); + pr_err("inconsistent device description\n"); return -EINVAL; } - D1(printk(KERN_DEBUG "JFFS2 using OOB on NAND\n")); + jffs2_dbg(1, "using OOB on NAND\n"); c->oobavail = oinfo->oobavail; @@ -1222,7 +1230,7 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) { if ((c->flash_size % c->sector_size) != 0) { c->flash_size = (c->flash_size / c->sector_size) * c->sector_size; - printk(KERN_WARNING "JFFS2 flash size adjusted to %dKiB\n", c->flash_size); + pr_warn("flash size adjusted to %dKiB\n", c->flash_size); }; c->wbuf_ofs = 0xFFFFFFFF; @@ -1239,7 +1247,8 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) { } #endif - printk(KERN_INFO "JFFS2 write-buffering enabled buffer (%d) erasesize (%d)\n", c->wbuf_pagesize, c->sector_size); + pr_info("write-buffering enabled buffer (%d) erasesize (%d)\n", + c->wbuf_pagesize, c->sector_size); return 0; } @@ -1297,7 +1306,8 @@ int jffs2_ubivol_setup(struct jffs2_sb_info *c) { if (!c->wbuf) return -ENOMEM; - printk(KERN_INFO "JFFS2 write-buffering enabled buffer (%d) erasesize (%d)\n", c->wbuf_pagesize, c->sector_size); + pr_info("write-buffering enabled buffer (%d) erasesize (%d)\n", + c->wbuf_pagesize, c->sector_size); return 0; } diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c index 30d175b6d29..b634de4c810 100644 --- a/fs/jffs2/write.c +++ b/fs/jffs2/write.c @@ -9,6 +9,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/fs.h> #include <linux/crc32.h> @@ -36,7 +38,7 @@ int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, f->inocache->state = INO_STATE_PRESENT; jffs2_add_ino_cache(c, f->inocache); - D1(printk(KERN_DEBUG "jffs2_do_new_inode(): Assigned ino# %d\n", f->inocache->ino)); + jffs2_dbg(1, "%s(): Assigned ino# %d\n", __func__, f->inocache->ino); ri->ino = cpu_to_je32(f->inocache->ino); ri->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); @@ -68,7 +70,7 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2 unsigned long cnt = 2; D1(if(je32_to_cpu(ri->hdr_crc) != crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)) { - printk(KERN_CRIT "Eep. CRC not correct in jffs2_write_dnode()\n"); + pr_crit("Eep. CRC not correct in jffs2_write_dnode()\n"); BUG(); } ); @@ -78,7 +80,9 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2 vecs[1].iov_len = datalen; if (je32_to_cpu(ri->totlen) != sizeof(*ri) + datalen) { - printk(KERN_WARNING "jffs2_write_dnode: ri->totlen (0x%08x) != sizeof(*ri) (0x%08zx) + datalen (0x%08x)\n", je32_to_cpu(ri->totlen), sizeof(*ri), datalen); + pr_warn("%s(): ri->totlen (0x%08x) != sizeof(*ri) (0x%08zx) + datalen (0x%08x)\n", + __func__, je32_to_cpu(ri->totlen), + sizeof(*ri), datalen); } fn = jffs2_alloc_full_dnode(); @@ -95,9 +99,9 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2 if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(ri->version) < f->highest_version)) { BUG_ON(!retried); - D1(printk(KERN_DEBUG "jffs2_write_dnode : dnode_version %d, " - "highest version %d -> updating dnode\n", - je32_to_cpu(ri->version), f->highest_version)); + jffs2_dbg(1, "%s(): dnode_version %d, highest version %d -> updating dnode\n", + __func__, + je32_to_cpu(ri->version), f->highest_version); ri->version = cpu_to_je32(++f->highest_version); ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); } @@ -106,8 +110,8 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2 (alloc_mode==ALLOC_GC)?0:f->inocache->ino); if (ret || (retlen != sizeof(*ri) + datalen)) { - printk(KERN_NOTICE "Write of %zd bytes at 0x%08x failed. returned %d, retlen %zd\n", - sizeof(*ri)+datalen, flash_ofs, ret, retlen); + pr_notice("Write of %zd bytes at 0x%08x failed. returned %d, retlen %zd\n", + sizeof(*ri) + datalen, flash_ofs, ret, retlen); /* Mark the space as dirtied */ if (retlen) { @@ -118,7 +122,8 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2 this node */ jffs2_add_physical_node_ref(c, flash_ofs | REF_OBSOLETE, PAD(sizeof(*ri)+datalen), NULL); } else { - printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", flash_ofs); + pr_notice("Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", + flash_ofs); } if (!retried && alloc_mode != ALLOC_NORETRY) { /* Try to reallocate space and retry */ @@ -127,7 +132,7 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2 retried = 1; - D1(printk(KERN_DEBUG "Retrying failed write.\n")); + jffs2_dbg(1, "Retrying failed write.\n"); jffs2_dbg_acct_sanity_check(c,jeb); jffs2_dbg_acct_paranoia_check(c, jeb); @@ -147,14 +152,16 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2 if (!ret) { flash_ofs = write_ofs(c); - D1(printk(KERN_DEBUG "Allocated space at 0x%08x to retry failed write.\n", flash_ofs)); + jffs2_dbg(1, "Allocated space at 0x%08x to retry failed write.\n", + flash_ofs); jffs2_dbg_acct_sanity_check(c,jeb); jffs2_dbg_acct_paranoia_check(c, jeb); goto retry; } - D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret)); + jffs2_dbg(1, "Failed to allocate space to retry failed write: %d!\n", + ret); } /* Release the full_dnode which is now useless, and return */ jffs2_free_full_dnode(fn); @@ -183,10 +190,10 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2 fn->size = je32_to_cpu(ri->dsize); fn->frags = 0; - D1(printk(KERN_DEBUG "jffs2_write_dnode wrote node at 0x%08x(%d) with dsize 0x%x, csize 0x%x, node_crc 0x%08x, data_crc 0x%08x, totlen 0x%08x\n", + jffs2_dbg(1, "jffs2_write_dnode wrote node at 0x%08x(%d) with dsize 0x%x, csize 0x%x, node_crc 0x%08x, data_crc 0x%08x, totlen 0x%08x\n", flash_ofs & ~3, flash_ofs & 3, je32_to_cpu(ri->dsize), je32_to_cpu(ri->csize), je32_to_cpu(ri->node_crc), - je32_to_cpu(ri->data_crc), je32_to_cpu(ri->totlen))); + je32_to_cpu(ri->data_crc), je32_to_cpu(ri->totlen)); if (retried) { jffs2_dbg_acct_sanity_check(c,NULL); @@ -206,22 +213,23 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff int retried = 0; int ret; - D1(printk(KERN_DEBUG "jffs2_write_dirent(ino #%u, name at *0x%p \"%s\"->ino #%u, name_crc 0x%08x)\n", + jffs2_dbg(1, "%s(ino #%u, name at *0x%p \"%s\"->ino #%u, name_crc 0x%08x)\n", + __func__, je32_to_cpu(rd->pino), name, name, je32_to_cpu(rd->ino), - je32_to_cpu(rd->name_crc))); + je32_to_cpu(rd->name_crc)); D1(if(je32_to_cpu(rd->hdr_crc) != crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)) { - printk(KERN_CRIT "Eep. CRC not correct in jffs2_write_dirent()\n"); + pr_crit("Eep. CRC not correct in jffs2_write_dirent()\n"); BUG(); }); if (strnlen(name, namelen) != namelen) { /* This should never happen, but seems to have done on at least one occasion: https://dev.laptop.org/ticket/4184 */ - printk(KERN_CRIT "Error in jffs2_write_dirent() -- name contains zero bytes!\n"); - printk(KERN_CRIT "Directory inode #%u, name at *0x%p \"%s\"->ino #%u, name_crc 0x%08x\n", - je32_to_cpu(rd->pino), name, name, je32_to_cpu(rd->ino), - je32_to_cpu(rd->name_crc)); + pr_crit("Error in jffs2_write_dirent() -- name contains zero bytes!\n"); + pr_crit("Directory inode #%u, name at *0x%p \"%s\"->ino #%u, name_crc 0x%08x\n", + je32_to_cpu(rd->pino), name, name, je32_to_cpu(rd->ino), + je32_to_cpu(rd->name_crc)); WARN_ON(1); return ERR_PTR(-EIO); } @@ -249,9 +257,9 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(rd->version) < f->highest_version)) { BUG_ON(!retried); - D1(printk(KERN_DEBUG "jffs2_write_dirent : dirent_version %d, " - "highest version %d -> updating dirent\n", - je32_to_cpu(rd->version), f->highest_version)); + jffs2_dbg(1, "%s(): dirent_version %d, highest version %d -> updating dirent\n", + __func__, + je32_to_cpu(rd->version), f->highest_version); rd->version = cpu_to_je32(++f->highest_version); fd->version = je32_to_cpu(rd->version); rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); @@ -260,13 +268,14 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff ret = jffs2_flash_writev(c, vecs, 2, flash_ofs, &retlen, (alloc_mode==ALLOC_GC)?0:je32_to_cpu(rd->pino)); if (ret || (retlen != sizeof(*rd) + namelen)) { - printk(KERN_NOTICE "Write of %zd bytes at 0x%08x failed. returned %d, retlen %zd\n", - sizeof(*rd)+namelen, flash_ofs, ret, retlen); + pr_notice("Write of %zd bytes at 0x%08x failed. returned %d, retlen %zd\n", + sizeof(*rd) + namelen, flash_ofs, ret, retlen); /* Mark the space as dirtied */ if (retlen) { jffs2_add_physical_node_ref(c, flash_ofs | REF_OBSOLETE, PAD(sizeof(*rd)+namelen), NULL); } else { - printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", flash_ofs); + pr_notice("Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", + flash_ofs); } if (!retried) { /* Try to reallocate space and retry */ @@ -275,7 +284,7 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff retried = 1; - D1(printk(KERN_DEBUG "Retrying failed write.\n")); + jffs2_dbg(1, "Retrying failed write.\n"); jffs2_dbg_acct_sanity_check(c,jeb); jffs2_dbg_acct_paranoia_check(c, jeb); @@ -295,12 +304,14 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff if (!ret) { flash_ofs = write_ofs(c); - D1(printk(KERN_DEBUG "Allocated space at 0x%08x to retry failed write.\n", flash_ofs)); + jffs2_dbg(1, "Allocated space at 0x%08x to retry failed write\n", + flash_ofs); jffs2_dbg_acct_sanity_check(c,jeb); jffs2_dbg_acct_paranoia_check(c, jeb); goto retry; } - D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret)); + jffs2_dbg(1, "Failed to allocate space to retry failed write: %d!\n", + ret); } /* Release the full_dnode which is now useless, and return */ jffs2_free_full_dirent(fd); @@ -333,8 +344,8 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, int ret = 0; uint32_t writtenlen = 0; - D1(printk(KERN_DEBUG "jffs2_write_inode_range(): Ino #%u, ofs 0x%x, len 0x%x\n", - f->inocache->ino, offset, writelen)); + jffs2_dbg(1, "%s(): Ino #%u, ofs 0x%x, len 0x%x\n", + __func__, f->inocache->ino, offset, writelen); while(writelen) { struct jffs2_full_dnode *fn; @@ -345,12 +356,13 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, int retried = 0; retry: - D2(printk(KERN_DEBUG "jffs2_commit_write() loop: 0x%x to write to 0x%x\n", writelen, offset)); + jffs2_dbg(2, "jffs2_commit_write() loop: 0x%x to write to 0x%x\n", + writelen, offset); ret = jffs2_reserve_space(c, sizeof(*ri) + JFFS2_MIN_DATA_LEN, &alloclen, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); if (ret) { - D1(printk(KERN_DEBUG "jffs2_reserve_space returned %d\n", ret)); + jffs2_dbg(1, "jffs2_reserve_space returned %d\n", ret); break; } mutex_lock(&f->sem); @@ -386,7 +398,7 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, if (!retried) { /* Write error to be retried */ retried = 1; - D1(printk(KERN_DEBUG "Retrying node write in jffs2_write_inode_range()\n")); + jffs2_dbg(1, "Retrying node write in jffs2_write_inode_range()\n"); goto retry; } break; @@ -399,7 +411,8 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, } if (ret) { /* Eep */ - D1(printk(KERN_DEBUG "Eep. add_full_dnode_to_inode() failed in commit_write, returned %d\n", ret)); + jffs2_dbg(1, "Eep. add_full_dnode_to_inode() failed in commit_write, returned %d\n", + ret); jffs2_mark_node_obsolete(c, fn->raw); jffs2_free_full_dnode(fn); @@ -410,11 +423,11 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, mutex_unlock(&f->sem); jffs2_complete_reservation(c); if (!datalen) { - printk(KERN_WARNING "Eep. We didn't actually write any data in jffs2_write_inode_range()\n"); + pr_warn("Eep. We didn't actually write any data in jffs2_write_inode_range()\n"); ret = -EIO; break; } - D1(printk(KERN_DEBUG "increasing writtenlen by %d\n", datalen)); + jffs2_dbg(1, "increasing writtenlen by %d\n", datalen); writtenlen += datalen; offset += datalen; writelen -= datalen; @@ -439,7 +452,7 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, */ ret = jffs2_reserve_space(c, sizeof(*ri), &alloclen, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); - D1(printk(KERN_DEBUG "jffs2_do_create(): reserved 0x%x bytes\n", alloclen)); + jffs2_dbg(1, "%s(): reserved 0x%x bytes\n", __func__, alloclen); if (ret) return ret; @@ -450,11 +463,11 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, fn = jffs2_write_dnode(c, f, ri, NULL, 0, ALLOC_NORMAL); - D1(printk(KERN_DEBUG "jffs2_do_create created file with mode 0x%x\n", - jemode_to_cpu(ri->mode))); + jffs2_dbg(1, "jffs2_do_create created file with mode 0x%x\n", + jemode_to_cpu(ri->mode)); if (IS_ERR(fn)) { - D1(printk(KERN_DEBUG "jffs2_write_dnode() failed\n")); + jffs2_dbg(1, "jffs2_write_dnode() failed\n"); /* Eeek. Wave bye bye */ mutex_unlock(&f->sem); jffs2_complete_reservation(c); @@ -480,7 +493,7 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, if (ret) { /* Eep. */ - D1(printk(KERN_DEBUG "jffs2_reserve_space() for dirent failed\n")); + jffs2_dbg(1, "jffs2_reserve_space() for dirent failed\n"); return ret; } @@ -597,8 +610,8 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, !memcmp(fd->name, name, namelen) && !fd->name[namelen]) { - D1(printk(KERN_DEBUG "Marking old dirent node (ino #%u) @%08x obsolete\n", - fd->ino, ref_offset(fd->raw))); + jffs2_dbg(1, "Marking old dirent node (ino #%u) @%08x obsolete\n", + fd->ino, ref_offset(fd->raw)); jffs2_mark_node_obsolete(c, fd->raw); /* We don't want to remove it from the list immediately, because that screws up getdents()/seek() semantics even @@ -627,11 +640,13 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, dead_f->dents = fd->next; if (fd->ino) { - printk(KERN_WARNING "Deleting inode #%u with active dentry \"%s\"->ino #%u\n", - dead_f->inocache->ino, fd->name, fd->ino); + pr_warn("Deleting inode #%u with active dentry \"%s\"->ino #%u\n", + dead_f->inocache->ino, + fd->name, fd->ino); } else { - D1(printk(KERN_DEBUG "Removing deletion dirent for \"%s\" from dir ino #%u\n", - fd->name, dead_f->inocache->ino)); + jffs2_dbg(1, "Removing deletion dirent for \"%s\" from dir ino #%u\n", + fd->name, + dead_f->inocache->ino); } if (fd->raw) jffs2_mark_node_obsolete(c, fd->raw); diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 3e93cdd1900..b55b803eddc 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -9,6 +9,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/slab.h> #include <linux/fs.h> diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 2774e1013b3..f49b9afc443 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -496,7 +496,7 @@ static int param_set_##name(const char *val, struct kernel_param *kp) \ __typeof__(type) num = which_strtol(val, &endp, 0); \ if (endp == val || *endp || num < (min) || num > (max)) \ return -EINVAL; \ - *((int *) kp->arg) = num; \ + *((type *) kp->arg) = num; \ return 0; \ } diff --git a/fs/namei.c b/fs/namei.c index e615ff37e27..1898198abc3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1054,53 +1054,65 @@ static void follow_dotdot(struct nameidata *nd) } /* - * Allocate a dentry with name and parent, and perform a parent - * directory ->lookup on it. Returns the new dentry, or ERR_PTR - * on error. parent->d_inode->i_mutex must be held. d_lookup must - * have verified that no child exists while under i_mutex. + * This looks up the name in dcache, possibly revalidates the old dentry and + * allocates a new one if not found or not valid. In the need_lookup argument + * returns whether i_op->lookup is necessary. + * + * dir->d_inode->i_mutex must be held */ -static struct dentry *d_alloc_and_lookup(struct dentry *parent, - struct qstr *name, struct nameidata *nd) +static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, + struct nameidata *nd, bool *need_lookup) { - struct inode *inode = parent->d_inode; struct dentry *dentry; - struct dentry *old; + int error; - /* Don't create child dentry for a dead directory. */ - if (unlikely(IS_DEADDIR(inode))) - return ERR_PTR(-ENOENT); + *need_lookup = false; + dentry = d_lookup(dir, name); + if (dentry) { + if (d_need_lookup(dentry)) { + *need_lookup = true; + } else if (dentry->d_flags & DCACHE_OP_REVALIDATE) { + error = d_revalidate(dentry, nd); + if (unlikely(error <= 0)) { + if (error < 0) { + dput(dentry); + return ERR_PTR(error); + } else if (!d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; + } + } + } + } - dentry = d_alloc(parent, name); - if (unlikely(!dentry)) - return ERR_PTR(-ENOMEM); + if (!dentry) { + dentry = d_alloc(dir, name); + if (unlikely(!dentry)) + return ERR_PTR(-ENOMEM); - old = inode->i_op->lookup(inode, dentry, nd); - if (unlikely(old)) { - dput(dentry); - dentry = old; + *need_lookup = true; } return dentry; } /* - * We already have a dentry, but require a lookup to be performed on the parent - * directory to fill in d_inode. Returns the new dentry, or ERR_PTR on error. - * parent->d_inode->i_mutex must be held. d_lookup must have verified that no - * child exists while under i_mutex. + * Call i_op->lookup on the dentry. The dentry must be negative but may be + * hashed if it was pouplated with DCACHE_NEED_LOOKUP. + * + * dir->d_inode->i_mutex must be held */ -static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentry, - struct nameidata *nd) +static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) { - struct inode *inode = parent->d_inode; struct dentry *old; /* Don't create child dentry for a dead directory. */ - if (unlikely(IS_DEADDIR(inode))) { + if (unlikely(IS_DEADDIR(dir))) { dput(dentry); return ERR_PTR(-ENOENT); } - old = inode->i_op->lookup(inode, dentry, nd); + old = dir->i_op->lookup(dir, dentry, nd); if (unlikely(old)) { dput(dentry); dentry = old; @@ -1108,6 +1120,19 @@ static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentr return dentry; } +static struct dentry *__lookup_hash(struct qstr *name, + struct dentry *base, struct nameidata *nd) +{ + bool need_lookup; + struct dentry *dentry; + + dentry = lookup_dcache(name, base, nd, &need_lookup); + if (!need_lookup) + return dentry; + + return lookup_real(base->d_inode, dentry, nd); +} + /* * It's more convoluted than I'd like it to be, but... it's still fairly * small and for now I'd prefer to have fast path as straight as possible. @@ -1139,6 +1164,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, return -ECHILD; nd->seq = seq; + if (unlikely(d_need_lookup(dentry))) + goto unlazy; if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { status = d_revalidate(dentry, nd); if (unlikely(status <= 0)) { @@ -1147,8 +1174,6 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, goto unlazy; } } - if (unlikely(d_need_lookup(dentry))) - goto unlazy; path->mnt = mnt; path->dentry = dentry; if (unlikely(!__follow_mount_rcu(nd, path, inode))) @@ -1163,38 +1188,14 @@ unlazy: dentry = __d_lookup(parent, name); } - if (dentry && unlikely(d_need_lookup(dentry))) { + if (unlikely(!dentry)) + goto need_lookup; + + if (unlikely(d_need_lookup(dentry))) { dput(dentry); - dentry = NULL; - } -retry: - if (unlikely(!dentry)) { - struct inode *dir = parent->d_inode; - BUG_ON(nd->inode != dir); - - mutex_lock(&dir->i_mutex); - dentry = d_lookup(parent, name); - if (likely(!dentry)) { - dentry = d_alloc_and_lookup(parent, name, nd); - if (IS_ERR(dentry)) { - mutex_unlock(&dir->i_mutex); - return PTR_ERR(dentry); - } - /* known good */ - need_reval = 0; - status = 1; - } else if (unlikely(d_need_lookup(dentry))) { - dentry = d_inode_lookup(parent, dentry, nd); - if (IS_ERR(dentry)) { - mutex_unlock(&dir->i_mutex); - return PTR_ERR(dentry); - } - /* known good */ - need_reval = 0; - status = 1; - } - mutex_unlock(&dir->i_mutex); + goto need_lookup; } + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) status = d_revalidate(dentry, nd); if (unlikely(status <= 0)) { @@ -1204,12 +1205,10 @@ retry: } if (!d_invalidate(dentry)) { dput(dentry); - dentry = NULL; - need_reval = 1; - goto retry; + goto need_lookup; } } - +done: path->mnt = mnt; path->dentry = dentry; err = follow_managed(path, nd->flags); @@ -1221,6 +1220,16 @@ retry: nd->flags |= LOOKUP_JUMPED; *inode = path->dentry->d_inode; return 0; + +need_lookup: + BUG_ON(nd->inode != parent->d_inode); + + mutex_lock(&parent->d_inode->i_mutex); + dentry = __lookup_hash(name, parent, nd); + mutex_unlock(&parent->d_inode->i_mutex); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + goto done; } static inline int may_lookup(struct nameidata *nd) @@ -1846,59 +1855,6 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, return err; } -static struct dentry *__lookup_hash(struct qstr *name, - struct dentry *base, struct nameidata *nd) -{ - struct inode *inode = base->d_inode; - struct dentry *dentry; - int err; - - err = inode_permission(inode, MAY_EXEC); - if (err) - return ERR_PTR(err); - - /* - * Don't bother with __d_lookup: callers are for creat as - * well as unlink, so a lot of the time it would cost - * a double lookup. - */ - dentry = d_lookup(base, name); - - if (dentry && d_need_lookup(dentry)) { - /* - * __lookup_hash is called with the parent dir's i_mutex already - * held, so we are good to go here. - */ - dentry = d_inode_lookup(base, dentry, nd); - if (IS_ERR(dentry)) - return dentry; - } - - if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) { - int status = d_revalidate(dentry, nd); - if (unlikely(status <= 0)) { - /* - * The dentry failed validation. - * If d_revalidate returned 0 attempt to invalidate - * the dentry otherwise d_revalidate is asking us - * to return a fail status. - */ - if (status < 0) { - dput(dentry); - return ERR_PTR(status); - } else if (!d_invalidate(dentry)) { - dput(dentry); - dentry = NULL; - } - } - } - - if (!dentry) - dentry = d_alloc_and_lookup(base, name, nd); - - return dentry; -} - /* * Restricted form of lookup. Doesn't follow links, single-component only, * needs parent already locked. Doesn't follow mounts. @@ -1924,6 +1880,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { struct qstr this; unsigned int c; + int err; WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); @@ -1948,6 +1905,10 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) return ERR_PTR(err); } + err = inode_permission(base->d_inode, MAY_EXEC); + if (err) + return ERR_PTR(err); + return __lookup_hash(&this, base, NULL); } @@ -2749,7 +2710,7 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) /* * The dentry_unhash() helper will try to drop the dentry early: we - * should have a usage count of 2 if we're the only user of this + * should have a usage count of 1 if we're the only user of this * dentry, and if that is true (possibly after pruning the dcache), * then we drop the dentry now. * diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 64a326418aa..3ff5fcc1528 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -7,7 +7,6 @@ */ #include <asm/uaccess.h> -#include <asm/system.h> #include <linux/time.h> #include <linux/kernel.h> diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 49df0e7f837..87484fb8d17 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -11,7 +11,6 @@ #include <linux/module.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/byteorder.h> diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index e5d71b27a5b..be20a7e171a 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -19,7 +19,6 @@ #include <linux/memcontrol.h> #include <asm/uaccess.h> -#include <asm/system.h> #include "ncp_fs.h" diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 4a108a0a2a6..da7b5e4ff9e 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -43,7 +43,6 @@ #include <linux/nsproxy.h> #include <linux/pid_namespace.h> -#include <asm/system.h> #include "nfs4_fs.h" #include "callback.h" diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 9c7f66ac6cc..481be7f7bdd 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -51,7 +51,6 @@ #include <linux/nfs_page.h> #include <linux/sunrpc/clnt.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <linux/atomic.h> diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 4fdaaa63cf1..aa9b709fd32 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -30,7 +30,6 @@ #include <linux/swap.h> #include <asm/uaccess.h> -#include <asm/system.h> #include "delegation.h" #include "internal.h" diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 801d6d83078..4ca6f5c8038 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -32,7 +32,6 @@ #include <linux/namei.h> #include <linux/security.h> -#include <asm/system.h> #include <asm/uaccess.h> #include "nfs4_fs.h" diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 7bb4d13c1cd..e8bbfa5b350 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -41,7 +41,6 @@ #include <linux/freezer.h> #include <linux/crc32.h> -#include <asm/system.h> #include <asm/uaccess.h> #include "nfs4_fs.h" diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 634c0bcb4fd..5acfd9ea8a3 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -793,7 +793,6 @@ filelayout_clear_request_commit(struct nfs_page *req) if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags)) goto out; if (list_is_singular(&req->wb_list)) { - struct inode *inode = req->wb_context->dentry->d_inode; struct pnfs_layout_segment *lseg; /* From here we can find the bucket, but for the moment, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e809d2305eb..f82bde005a8 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -270,7 +270,7 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc case 0: return 0; case -NFS4ERR_OPENMODE: - if (nfs_have_delegation(inode, FMODE_READ)) { + if (inode && nfs_have_delegation(inode, FMODE_READ)) { nfs_inode_return_delegation(inode); exception->retry = 1; return 0; @@ -282,10 +282,9 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - if (state != NULL) - nfs_remove_bad_delegation(state->inode); if (state == NULL) break; + nfs_remove_bad_delegation(state->inode); nfs4_schedule_stateid_recovery(server, state); goto wait_on_recovery; case -NFS4ERR_EXPIRED: @@ -2290,11 +2289,12 @@ static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, switch (err) { case 0: case -NFS4ERR_WRONGSEC: - break; + goto out; default: err = nfs4_handle_exception(server, err, &exception); } } while (exception.retry); +out: return err; } @@ -3712,7 +3712,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu if (acl_len > buflen) goto out_free; _copy_from_pages(buf, pages, res.acl_data_offset, - res.acl_len); + acl_len); } ret = acl_len; out_free: @@ -3824,8 +3824,9 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - if (state != NULL) - nfs_remove_bad_delegation(state->inode); + if (state == NULL) + break; + nfs_remove_bad_delegation(state->inode); case -NFS4ERR_OPENMODE: if (state == NULL) break; @@ -6111,21 +6112,22 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) return; switch (task->tk_status) { /* Just ignore these failures */ - case NFS4ERR_DELEG_REVOKED: /* layout was recalled */ - case NFS4ERR_BADIOMODE: /* no IOMODE_RW layout for range */ - case NFS4ERR_BADLAYOUT: /* no layout */ - case NFS4ERR_GRACE: /* loca_recalim always false */ + case -NFS4ERR_DELEG_REVOKED: /* layout was recalled */ + case -NFS4ERR_BADIOMODE: /* no IOMODE_RW layout for range */ + case -NFS4ERR_BADLAYOUT: /* no layout */ + case -NFS4ERR_GRACE: /* loca_recalim always false */ task->tk_status = 0; - } - - if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - return; - } - - if (task->tk_status == 0) + break; + case 0: nfs_post_op_update_inode_force_wcc(data->args.inode, data->res.fattr); + break; + default: + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + } } static void nfs4_layoutcommit_release(void *calldata) @@ -6229,11 +6231,12 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, case 0: case -NFS4ERR_WRONGSEC: case -NFS4ERR_NOTSUPP: - break; + goto out; default: err = nfs4_handle_exception(server, err, &exception); } } while (exception.retry); +out: return err; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index cc1f758a7ee..9a0e8ef4a40 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -20,7 +20,6 @@ #include <linux/nfs_page.h> #include <linux/module.h> -#include <asm/system.h> #include "pnfs.h" #include "nfs4_fs.h" diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ccc4cdb1efe..37412f706b3 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -55,7 +55,6 @@ #include <linux/nsproxy.h> #include <linux/rcupdate.h> -#include <asm/system.h> #include <asm/uaccess.h> #include "nfs4_fs.h" diff --git a/fs/nfsd/current_stateid.h b/fs/nfsd/current_stateid.h new file mode 100644 index 00000000000..4123551208d --- /dev/null +++ b/fs/nfsd/current_stateid.h @@ -0,0 +1,28 @@ +#ifndef _NFSD4_CURRENT_STATE_H +#define _NFSD4_CURRENT_STATE_H + +#include "state.h" +#include "xdr4.h" + +extern void clear_current_stateid(struct nfsd4_compound_state *cstate); +/* + * functions to set current state id + */ +extern void nfsd4_set_opendowngradestateid(struct nfsd4_compound_state *cstate, struct nfsd4_open_downgrade *); +extern void nfsd4_set_openstateid(struct nfsd4_compound_state *, struct nfsd4_open *); +extern void nfsd4_set_lockstateid(struct nfsd4_compound_state *, struct nfsd4_lock *); +extern void nfsd4_set_closestateid(struct nfsd4_compound_state *, struct nfsd4_close *); + +/* + * functions to consume current state id + */ +extern void nfsd4_get_opendowngradestateid(struct nfsd4_compound_state *cstate, struct nfsd4_open_downgrade *); +extern void nfsd4_get_delegreturnstateid(struct nfsd4_compound_state *, struct nfsd4_delegreturn *); +extern void nfsd4_get_freestateid(struct nfsd4_compound_state *, struct nfsd4_free_stateid *); +extern void nfsd4_get_setattrstateid(struct nfsd4_compound_state *, struct nfsd4_setattr *); +extern void nfsd4_get_closestateid(struct nfsd4_compound_state *, struct nfsd4_close *); +extern void nfsd4_get_lockustateid(struct nfsd4_compound_state *, struct nfsd4_locku *); +extern void nfsd4_get_readstateid(struct nfsd4_compound_state *, struct nfsd4_read *); +extern void nfsd4_get_writestateid(struct nfsd4_compound_state *, struct nfsd4_write *); + +#endif /* _NFSD4_CURRENT_STATE_H */ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index cf8a6bd062f..8e9689abbc0 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -87,7 +87,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) struct svc_expkey key; struct svc_expkey *ek = NULL; - if (mlen < 1 || mesg[mlen-1] != '\n') + if (mesg[mlen - 1] != '\n') return -EINVAL; mesg[mlen-1] = 0; diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h new file mode 100644 index 00000000000..12e0cff435b --- /dev/null +++ b/fs/nfsd/netns.h @@ -0,0 +1,34 @@ +/* + * per net namespace data structures for nfsd + * + * Copyright (C) 2012, Jeff Layton <jlayton@redhat.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __NFSD_NETNS_H__ +#define __NFSD_NETNS_H__ + +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +struct cld_net; + +struct nfsd_net { + struct cld_net *cld_net; +}; + +extern int nfsd_net_id; +#endif /* __NFSD_NETNS_H__ */ diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 0e262f32ac4..c8e9f637153 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -645,7 +645,6 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c .timeout = &timeparms, .program = &cb_program, .version = 0, - .authflavor = clp->cl_flavor, .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), }; struct rpc_clnt *client; @@ -656,6 +655,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c args.client_name = clp->cl_principal; args.prognumber = conn->cb_prog, args.protocol = XPRT_TRANSPORT_TCP; + args.authflavor = clp->cl_flavor; clp->cl_cb_ident = conn->cb_ident; } else { if (!conn->cb_xprt) @@ -665,6 +665,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c args.bc_xprt = conn->cb_xprt; args.prognumber = clp->cl_cb_session->se_cb_prog; args.protocol = XPRT_TRANSPORT_BC_TCP; + args.authflavor = RPC_AUTH_UNIX; } /* Create RPC client */ client = rpc_create(&args); @@ -754,9 +755,9 @@ static void do_probe_callback(struct nfs4_client *clp) */ void nfsd4_probe_callback(struct nfs4_client *clp) { - /* XXX: atomicity? Also, should we be using cl_cb_flags? */ + /* XXX: atomicity? Also, should we be using cl_flags? */ clp->cl_cb_state = NFSD4_CB_UNKNOWN; - set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); + set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); do_probe_callback(clp); } @@ -915,7 +916,7 @@ void nfsd4_destroy_callback_queue(void) /* must be called under the state lock */ void nfsd4_shutdown_callback(struct nfs4_client *clp) { - set_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags); + set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags); /* * Note this won't actually result in a null callback; * instead, nfsd4_do_callback_rpc() will detect the killed @@ -966,15 +967,15 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) svc_xprt_put(clp->cl_cb_conn.cb_xprt); clp->cl_cb_conn.cb_xprt = NULL; } - if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags)) + if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) return; spin_lock(&clp->cl_lock); /* * Only serialized callback code is allowed to clear these * flags; main nfsd code can only set them: */ - BUG_ON(!clp->cl_cb_flags); - clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); + BUG_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)); + clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn)); c = __nfsd4_find_backchannel(clp); if (c) { @@ -986,7 +987,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) err = setup_callback_client(clp, &conn, ses); if (err) { - warn_no_callback_path(clp, err); + nfsd4_mark_cb_down(clp, err); return; } /* Yay, the callback channel's back! Restart any callbacks: */ @@ -1000,7 +1001,7 @@ void nfsd4_do_callback_rpc(struct work_struct *w) struct nfs4_client *clp = cb->cb_clp; struct rpc_clnt *clnt; - if (clp->cl_cb_flags) + if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) nfsd4_process_cb_update(cb); clnt = clp->cl_cb_client; diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 94096273cd6..322d11ce06a 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -41,6 +41,14 @@ #include "nfsd.h" /* + * Turn off idmapping when using AUTH_SYS. + */ +static bool nfs4_disable_idmapping = true; +module_param(nfs4_disable_idmapping, bool, 0644); +MODULE_PARM_DESC(nfs4_disable_idmapping, + "Turn off server's NFSv4 idmapping when using 'sec=sys'"); + +/* * Cache entry */ @@ -561,28 +569,65 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) return ret; } +static bool +numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id) +{ + int ret; + char buf[11]; + + if (namelen + 1 > sizeof(buf)) + /* too long to represent a 32-bit id: */ + return false; + /* Just to make sure it's null-terminated: */ + memcpy(buf, name, namelen); + buf[namelen] = '\0'; + ret = kstrtouint(name, 10, id); + return ret == 0; +} + +static __be32 +do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id) +{ + if (nfs4_disable_idmapping && rqstp->rq_flavor < RPC_AUTH_GSS) + if (numeric_name_to_id(rqstp, type, name, namelen, id)) + return 0; + /* + * otherwise, fall through and try idmapping, for + * backwards compatibility with clients sending names: + */ + return idmap_name_to_id(rqstp, type, name, namelen, id); +} + +static int +do_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) +{ + if (nfs4_disable_idmapping && rqstp->rq_flavor < RPC_AUTH_GSS) + return sprintf(name, "%u", id); + return idmap_id_to_name(rqstp, type, id, name); +} + __be32 nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, __u32 *id) { - return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id); + return do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id); } __be32 nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, __u32 *id) { - return idmap_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, id); + return do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, id); } int nfsd_map_uid_to_name(struct svc_rqst *rqstp, __u32 id, char *name) { - return idmap_id_to_name(rqstp, IDMAP_TYPE_USER, id, name); + return do_id_to_name(rqstp, IDMAP_TYPE_USER, id, name); } int nfsd_map_gid_to_name(struct svc_rqst *rqstp, __u32 id, char *name) { - return idmap_id_to_name(rqstp, IDMAP_TYPE_GROUP, id, name); + return do_id_to_name(rqstp, IDMAP_TYPE_GROUP, id, name); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 896da74ec56..2ed14dfd00a 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -39,6 +39,7 @@ #include "cache.h" #include "xdr4.h" #include "vfs.h" +#include "current_stateid.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -192,10 +193,13 @@ static __be32 nfsd_check_obj_isreg(struct svc_fh *fh) static __be32 do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { - struct svc_fh resfh; + struct svc_fh *resfh; __be32 status; - fh_init(&resfh, NFS4_FHSIZE); + resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); + if (!resfh) + return nfserr_jukebox; + fh_init(resfh, NFS4_FHSIZE); open->op_truncate = 0; if (open->op_create) { @@ -220,7 +224,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o */ status = do_nfsd_create(rqstp, current_fh, open->op_fname.data, open->op_fname.len, &open->op_iattr, - &resfh, open->op_createmode, + resfh, open->op_createmode, (u32 *)open->op_verf.data, &open->op_truncate, &open->op_created); @@ -234,30 +238,29 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o FATTR4_WORD1_TIME_MODIFY); } else { status = nfsd_lookup(rqstp, current_fh, - open->op_fname.data, open->op_fname.len, &resfh); + open->op_fname.data, open->op_fname.len, resfh); fh_unlock(current_fh); if (status) goto out; - status = nfsd_check_obj_isreg(&resfh); + status = nfsd_check_obj_isreg(resfh); } if (status) goto out; if (is_create_with_attrs(open) && open->op_acl != NULL) - do_set_nfs4_acl(rqstp, &resfh, open->op_acl, open->op_bmval); - - set_change_info(&open->op_cinfo, current_fh); - fh_dup2(current_fh, &resfh); + do_set_nfs4_acl(rqstp, resfh, open->op_acl, open->op_bmval); /* set reply cache */ fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, - &resfh.fh_handle); + &resfh->fh_handle); if (!open->op_created) - status = do_open_permission(rqstp, current_fh, open, + status = do_open_permission(rqstp, resfh, open, NFSD_MAY_NOP); - + set_change_info(&open->op_cinfo, current_fh); + fh_dup2(current_fh, resfh); out: - fh_put(&resfh); + fh_put(resfh); + kfree(resfh); return status; } @@ -310,16 +313,14 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) return nfserr_inval; - /* We don't yet support WANT bits: */ - open->op_share_access &= NFS4_SHARE_ACCESS_MASK; - open->op_created = 0; /* * RFC5661 18.51.3 * Before RECLAIM_COMPLETE done, server should deny new lock */ if (nfsd4_has_session(cstate) && - !cstate->session->se_client->cl_firststate && + !test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, + &cstate->session->se_client->cl_flags) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) return nfserr_grace; @@ -452,6 +453,10 @@ nfsd4_restorefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_restorefh; fh_dup2(&cstate->current_fh, &cstate->save_fh); + if (HAS_STATE_ID(cstate, SAVED_STATE_ID_FLAG)) { + memcpy(&cstate->current_stateid, &cstate->save_stateid, sizeof(stateid_t)); + SET_STATE_ID(cstate, CURRENT_STATE_ID_FLAG); + } return nfs_ok; } @@ -463,6 +468,10 @@ nfsd4_savefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_nofilehandle; fh_dup2(&cstate->save_fh, &cstate->current_fh); + if (HAS_STATE_ID(cstate, CURRENT_STATE_ID_FLAG)) { + memcpy(&cstate->save_stateid, &cstate->current_stateid, sizeof(stateid_t)); + SET_STATE_ID(cstate, SAVED_STATE_ID_FLAG); + } return nfs_ok; } @@ -481,14 +490,20 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &access->ac_supported); } +static void gen_boot_verifier(nfs4_verifier *verifier) +{ + __be32 verf[2]; + + verf[0] = (__be32)nfssvc_boot.tv_sec; + verf[1] = (__be32)nfssvc_boot.tv_usec; + memcpy(verifier->data, verf, sizeof(verifier->data)); +} + static __be32 nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_commit *commit) { - u32 *p = (u32 *)commit->co_verf.data; - *p++ = nfssvc_boot.tv_sec; - *p++ = nfssvc_boot.tv_usec; - + gen_boot_verifier(&commit->co_verf); return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, commit->co_count); } @@ -865,7 +880,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { stateid_t *stateid = &write->wr_stateid; struct file *filp = NULL; - u32 *p; __be32 status = nfs_ok; unsigned long cnt; @@ -887,9 +901,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cnt = write->wr_buflen; write->wr_how_written = write->wr_stable_how; - p = (u32 *)write->wr_verifier.data; - *p++ = nfssvc_boot.tv_sec; - *p++ = nfssvc_boot.tv_usec; + gen_boot_verifier(&write->wr_verifier); status = nfsd_write(rqstp, &cstate->current_fh, filp, write->wr_offset, rqstp->rq_vec, write->wr_vlen, @@ -1000,6 +1012,8 @@ static inline void nfsd4_increment_op_stats(u32 opnum) typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, void *); typedef u32(*nfsd4op_rsize)(struct svc_rqst *, struct nfsd4_op *op); +typedef void(*stateid_setter)(struct nfsd4_compound_state *, void *); +typedef void(*stateid_getter)(struct nfsd4_compound_state *, void *); enum nfsd4_op_flags { ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ @@ -1025,6 +1039,10 @@ enum nfsd4_op_flags { * the v4.0 case). */ OP_CACHEME = 1 << 6, + /* + * These are ops which clear current state id. + */ + OP_CLEAR_STATEID = 1 << 7, }; struct nfsd4_operation { @@ -1033,11 +1051,15 @@ struct nfsd4_operation { char *op_name; /* Try to get response size before operation */ nfsd4op_rsize op_rsize_bop; + stateid_setter op_get_currentstateid; + stateid_getter op_set_currentstateid; }; static struct nfsd4_operation nfsd4_ops[]; +#ifdef NFSD_DEBUG static const char *nfsd4_op_name(unsigned opnum); +#endif /* * Enforce NFSv4.1 COMPOUND ordering rules: @@ -1215,13 +1237,23 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, if (op->status) goto encode_op; - if (opdesc->op_func) + if (opdesc->op_func) { + if (opdesc->op_get_currentstateid) + opdesc->op_get_currentstateid(cstate, &op->u); op->status = opdesc->op_func(rqstp, cstate, &op->u); - else + } else BUG_ON(op->status == nfs_ok); - if (!op->status && need_wrongsec_check(rqstp)) - op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp); + if (!op->status) { + if (opdesc->op_set_currentstateid) + opdesc->op_set_currentstateid(cstate, &op->u); + + if (opdesc->op_flags & OP_CLEAR_STATEID) + clear_current_stateid(cstate); + + if (need_wrongsec_check(rqstp)) + op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp); + } encode_op: /* Only from SEQUENCE */ @@ -1413,6 +1445,8 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_CLOSE", .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, + .op_get_currentstateid = (stateid_getter)nfsd4_get_closestateid, + .op_set_currentstateid = (stateid_setter)nfsd4_set_closestateid, }, [OP_COMMIT] = { .op_func = (nfsd4op_func)nfsd4_commit, @@ -1422,7 +1456,7 @@ static struct nfsd4_operation nfsd4_ops[] = { }, [OP_CREATE] = { .op_func = (nfsd4op_func)nfsd4_create, - .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME | OP_CLEAR_STATEID, .op_name = "OP_CREATE", .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_rsize, }, @@ -1431,6 +1465,7 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_DELEGRETURN", .op_rsize_bop = nfsd4_only_status_rsize, + .op_get_currentstateid = (stateid_getter)nfsd4_get_delegreturnstateid, }, [OP_GETATTR] = { .op_func = (nfsd4op_func)nfsd4_getattr, @@ -1453,6 +1488,7 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_LOCK", .op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize, + .op_set_currentstateid = (stateid_setter)nfsd4_set_lockstateid, }, [OP_LOCKT] = { .op_func = (nfsd4op_func)nfsd4_lockt, @@ -1463,15 +1499,16 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_LOCKU", .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, + .op_get_currentstateid = (stateid_getter)nfsd4_get_lockustateid, }, [OP_LOOKUP] = { .op_func = (nfsd4op_func)nfsd4_lookup, - .op_flags = OP_HANDLES_WRONGSEC, + .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID, .op_name = "OP_LOOKUP", }, [OP_LOOKUPP] = { .op_func = (nfsd4op_func)nfsd4_lookupp, - .op_flags = OP_HANDLES_WRONGSEC, + .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID, .op_name = "OP_LOOKUPP", }, [OP_NVERIFY] = { @@ -1483,6 +1520,7 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING, .op_name = "OP_OPEN", .op_rsize_bop = (nfsd4op_rsize)nfsd4_open_rsize, + .op_set_currentstateid = (stateid_setter)nfsd4_set_openstateid, }, [OP_OPEN_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_open_confirm, @@ -1495,25 +1533,30 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_OPEN_DOWNGRADE", .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, + .op_get_currentstateid = (stateid_getter)nfsd4_get_opendowngradestateid, + .op_set_currentstateid = (stateid_setter)nfsd4_set_opendowngradestateid, }, [OP_PUTFH] = { .op_func = (nfsd4op_func)nfsd4_putfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING + | OP_CLEAR_STATEID, .op_name = "OP_PUTFH", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_PUTPUBFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING + | OP_CLEAR_STATEID, .op_name = "OP_PUTPUBFH", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_PUTROOTFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING + | OP_CLEAR_STATEID, .op_name = "OP_PUTROOTFH", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, @@ -1522,6 +1565,7 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_READ", .op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize, + .op_get_currentstateid = (stateid_getter)nfsd4_get_readstateid, }, [OP_READDIR] = { .op_func = (nfsd4op_func)nfsd4_readdir, @@ -1576,6 +1620,7 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_SETATTR", .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_rsize_bop = (nfsd4op_rsize)nfsd4_setattr_rsize, + .op_get_currentstateid = (stateid_getter)nfsd4_get_setattrstateid, }, [OP_SETCLIENTID] = { .op_func = (nfsd4op_func)nfsd4_setclientid, @@ -1600,6 +1645,7 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_WRITE", .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, + .op_get_currentstateid = (stateid_getter)nfsd4_get_writestateid, }, [OP_RELEASE_LOCKOWNER] = { .op_func = (nfsd4op_func)nfsd4_release_lockowner, @@ -1674,12 +1720,14 @@ static struct nfsd4_operation nfsd4_ops[] = { }, }; +#ifdef NFSD_DEBUG static const char *nfsd4_op_name(unsigned opnum) { if (opnum < ARRAY_SIZE(nfsd4_ops)) return nfsd4_ops[opnum].op_name; return "unknown_operation"; } +#endif #define nfsd4_voidres nfsd4_voidargs struct nfsd4_voidargs { int dummy; }; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 0b3e875d1ab..4767429264a 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2004 The Regents of the University of Michigan. +* Copyright (c) 2012 Jeff Layton <jlayton@redhat.com> * All rights reserved. * * Andy Adamson <andros@citi.umich.edu> @@ -36,16 +37,34 @@ #include <linux/namei.h> #include <linux/crypto.h> #include <linux/sched.h> +#include <linux/fs.h> +#include <linux/module.h> +#include <net/net_namespace.h> +#include <linux/sunrpc/rpc_pipe_fs.h> +#include <linux/sunrpc/clnt.h> +#include <linux/nfsd/cld.h> #include "nfsd.h" #include "state.h" #include "vfs.h" +#include "netns.h" #define NFSDDBG_FACILITY NFSDDBG_PROC +/* Declarations */ +struct nfsd4_client_tracking_ops { + int (*init)(struct net *); + void (*exit)(struct net *); + void (*create)(struct nfs4_client *); + void (*remove)(struct nfs4_client *); + int (*check)(struct nfs4_client *); + void (*grace_done)(struct net *, time_t); +}; + /* Globals */ static struct file *rec_file; static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; +static struct nfsd4_client_tracking_ops *client_tracking_ops; static int nfs4_save_creds(const struct cred **original_creds) @@ -117,7 +136,8 @@ out_no_tfm: return status; } -void nfsd4_create_clid_dir(struct nfs4_client *clp) +static void +nfsd4_create_clid_dir(struct nfs4_client *clp) { const struct cred *original_cred; char *dname = clp->cl_recdir; @@ -126,9 +146,8 @@ void nfsd4_create_clid_dir(struct nfs4_client *clp) dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); - if (clp->cl_firststate) + if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; - clp->cl_firststate = 1; if (!rec_file) return; status = nfs4_save_creds(&original_cred); @@ -265,19 +284,19 @@ out_unlock: return status; } -void +static void nfsd4_remove_clid_dir(struct nfs4_client *clp) { const struct cred *original_cred; int status; - if (!rec_file || !clp->cl_firststate) + if (!rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; status = mnt_want_write_file(rec_file); if (status) goto out; - clp->cl_firststate = 0; + clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); status = nfs4_save_creds(&original_cred); if (status < 0) @@ -292,7 +311,6 @@ out: if (status) printk("NFSD: Failed to remove expired client state directory" " %.*s\n", HEXDIR_LEN, clp->cl_recdir); - return; } static int @@ -311,8 +329,9 @@ purge_old(struct dentry *parent, struct dentry *child) return 0; } -void -nfsd4_recdir_purge_old(void) { +static void +nfsd4_recdir_purge_old(struct net *net, time_t boot_time) +{ int status; if (!rec_file) @@ -343,7 +362,7 @@ load_recdir(struct dentry *parent, struct dentry *child) return 0; } -int +static int nfsd4_recdir_load(void) { int status; @@ -361,8 +380,8 @@ nfsd4_recdir_load(void) { * Hold reference to the recovery directory. */ -void -nfsd4_init_recdir() +static int +nfsd4_init_recdir(void) { const struct cred *original_cred; int status; @@ -377,20 +396,44 @@ nfsd4_init_recdir() printk("NFSD: Unable to change credentials to find recovery" " directory: error %d\n", status); - return; + return status; } rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0); if (IS_ERR(rec_file)) { printk("NFSD: unable to find recovery directory %s\n", user_recovery_dirname); + status = PTR_ERR(rec_file); rec_file = NULL; } nfs4_reset_creds(original_cred); + return status; } -void +static int +nfsd4_load_reboot_recovery_data(struct net *net) +{ + int status; + + /* XXX: The legacy code won't work in a container */ + if (net != &init_net) { + WARN(1, KERN_ERR "NFSD: attempt to initialize legacy client " + "tracking in a container!\n"); + return -EINVAL; + } + + nfs4_lock_state(); + status = nfsd4_init_recdir(); + if (!status) + status = nfsd4_recdir_load(); + nfs4_unlock_state(); + if (status) + printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n"); + return status; +} + +static void nfsd4_shutdown_recdir(void) { if (!rec_file) @@ -399,6 +442,13 @@ nfsd4_shutdown_recdir(void) rec_file = NULL; } +static void +nfsd4_legacy_tracking_exit(struct net *net) +{ + nfs4_release_reclaim(); + nfsd4_shutdown_recdir(); +} + /* * Change the NFSv4 recovery directory to recdir. */ @@ -425,3 +475,572 @@ nfs4_recoverydir(void) { return user_recovery_dirname; } + +static int +nfsd4_check_legacy_client(struct nfs4_client *clp) +{ + /* did we already find that this client is stable? */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return 0; + + /* look for it in the reclaim hashtable otherwise */ + if (nfsd4_find_reclaim_client(clp)) { + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + return 0; + } + + return -ENOENT; +} + +static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { + .init = nfsd4_load_reboot_recovery_data, + .exit = nfsd4_legacy_tracking_exit, + .create = nfsd4_create_clid_dir, + .remove = nfsd4_remove_clid_dir, + .check = nfsd4_check_legacy_client, + .grace_done = nfsd4_recdir_purge_old, +}; + +/* Globals */ +#define NFSD_PIPE_DIR "nfsd" +#define NFSD_CLD_PIPE "cld" + +/* per-net-ns structure for holding cld upcall info */ +struct cld_net { + struct rpc_pipe *cn_pipe; + spinlock_t cn_lock; + struct list_head cn_list; + unsigned int cn_xid; +}; + +struct cld_upcall { + struct list_head cu_list; + struct cld_net *cu_net; + struct task_struct *cu_task; + struct cld_msg cu_msg; +}; + +static int +__cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) +{ + int ret; + struct rpc_pipe_msg msg; + + memset(&msg, 0, sizeof(msg)); + msg.data = cmsg; + msg.len = sizeof(*cmsg); + + /* + * Set task state before we queue the upcall. That prevents + * wake_up_process in the downcall from racing with schedule. + */ + set_current_state(TASK_UNINTERRUPTIBLE); + ret = rpc_queue_upcall(pipe, &msg); + if (ret < 0) { + set_current_state(TASK_RUNNING); + goto out; + } + + schedule(); + set_current_state(TASK_RUNNING); + + if (msg.errno < 0) + ret = msg.errno; +out: + return ret; +} + +static int +cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) +{ + int ret; + + /* + * -EAGAIN occurs when pipe is closed and reopened while there are + * upcalls queued. + */ + do { + ret = __cld_pipe_upcall(pipe, cmsg); + } while (ret == -EAGAIN); + + return ret; +} + +static ssize_t +cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) +{ + struct cld_upcall *tmp, *cup; + struct cld_msg *cmsg = (struct cld_msg *)src; + uint32_t xid; + struct nfsd_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info, + nfsd_net_id); + struct cld_net *cn = nn->cld_net; + + if (mlen != sizeof(*cmsg)) { + dprintk("%s: got %lu bytes, expected %lu\n", __func__, mlen, + sizeof(*cmsg)); + return -EINVAL; + } + + /* copy just the xid so we can try to find that */ + if (copy_from_user(&xid, &cmsg->cm_xid, sizeof(xid)) != 0) { + dprintk("%s: error when copying xid from userspace", __func__); + return -EFAULT; + } + + /* walk the list and find corresponding xid */ + cup = NULL; + spin_lock(&cn->cn_lock); + list_for_each_entry(tmp, &cn->cn_list, cu_list) { + if (get_unaligned(&tmp->cu_msg.cm_xid) == xid) { + cup = tmp; + list_del_init(&cup->cu_list); + break; + } + } + spin_unlock(&cn->cn_lock); + + /* couldn't find upcall? */ + if (!cup) { + dprintk("%s: couldn't find upcall -- xid=%u\n", __func__, xid); + return -EINVAL; + } + + if (copy_from_user(&cup->cu_msg, src, mlen) != 0) + return -EFAULT; + + wake_up_process(cup->cu_task); + return mlen; +} + +static void +cld_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct cld_msg *cmsg = msg->data; + struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, + cu_msg); + + /* errno >= 0 means we got a downcall */ + if (msg->errno >= 0) + return; + + wake_up_process(cup->cu_task); +} + +static const struct rpc_pipe_ops cld_upcall_ops = { + .upcall = rpc_pipe_generic_upcall, + .downcall = cld_pipe_downcall, + .destroy_msg = cld_pipe_destroy_msg, +}; + +static struct dentry * +nfsd4_cld_register_sb(struct super_block *sb, struct rpc_pipe *pipe) +{ + struct dentry *dir, *dentry; + + dir = rpc_d_lookup_sb(sb, NFSD_PIPE_DIR); + if (dir == NULL) + return ERR_PTR(-ENOENT); + dentry = rpc_mkpipe_dentry(dir, NFSD_CLD_PIPE, NULL, pipe); + dput(dir); + return dentry; +} + +static void +nfsd4_cld_unregister_sb(struct rpc_pipe *pipe) +{ + if (pipe->dentry) + rpc_unlink(pipe->dentry); +} + +static struct dentry * +nfsd4_cld_register_net(struct net *net, struct rpc_pipe *pipe) +{ + struct super_block *sb; + struct dentry *dentry; + + sb = rpc_get_sb_net(net); + if (!sb) + return NULL; + dentry = nfsd4_cld_register_sb(sb, pipe); + rpc_put_sb_net(net); + return dentry; +} + +static void +nfsd4_cld_unregister_net(struct net *net, struct rpc_pipe *pipe) +{ + struct super_block *sb; + + sb = rpc_get_sb_net(net); + if (sb) { + nfsd4_cld_unregister_sb(pipe); + rpc_put_sb_net(net); + } +} + +/* Initialize rpc_pipefs pipe for communication with client tracking daemon */ +static int +nfsd4_init_cld_pipe(struct net *net) +{ + int ret; + struct dentry *dentry; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct cld_net *cn; + + if (nn->cld_net) + return 0; + + cn = kzalloc(sizeof(*cn), GFP_KERNEL); + if (!cn) { + ret = -ENOMEM; + goto err; + } + + cn->cn_pipe = rpc_mkpipe_data(&cld_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); + if (IS_ERR(cn->cn_pipe)) { + ret = PTR_ERR(cn->cn_pipe); + goto err; + } + spin_lock_init(&cn->cn_lock); + INIT_LIST_HEAD(&cn->cn_list); + + dentry = nfsd4_cld_register_net(net, cn->cn_pipe); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto err_destroy_data; + } + + cn->cn_pipe->dentry = dentry; + nn->cld_net = cn; + return 0; + +err_destroy_data: + rpc_destroy_pipe_data(cn->cn_pipe); +err: + kfree(cn); + printk(KERN_ERR "NFSD: unable to create nfsdcld upcall pipe (%d)\n", + ret); + return ret; +} + +static void +nfsd4_remove_cld_pipe(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + + nfsd4_cld_unregister_net(net, cn->cn_pipe); + rpc_destroy_pipe_data(cn->cn_pipe); + kfree(nn->cld_net); + nn->cld_net = NULL; +} + +static struct cld_upcall * +alloc_cld_upcall(struct cld_net *cn) +{ + struct cld_upcall *new, *tmp; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return new; + + /* FIXME: hard cap on number in flight? */ +restart_search: + spin_lock(&cn->cn_lock); + list_for_each_entry(tmp, &cn->cn_list, cu_list) { + if (tmp->cu_msg.cm_xid == cn->cn_xid) { + cn->cn_xid++; + spin_unlock(&cn->cn_lock); + goto restart_search; + } + } + new->cu_task = current; + new->cu_msg.cm_vers = CLD_UPCALL_VERSION; + put_unaligned(cn->cn_xid++, &new->cu_msg.cm_xid); + new->cu_net = cn; + list_add(&new->cu_list, &cn->cn_list); + spin_unlock(&cn->cn_lock); + + dprintk("%s: allocated xid %u\n", __func__, new->cu_msg.cm_xid); + + return new; +} + +static void +free_cld_upcall(struct cld_upcall *victim) +{ + struct cld_net *cn = victim->cu_net; + + spin_lock(&cn->cn_lock); + list_del(&victim->cu_list); + spin_unlock(&cn->cn_lock); + kfree(victim); +} + +/* Ask daemon to create a new record */ +static void +nfsd4_cld_create(struct nfs4_client *clp) +{ + int ret; + struct cld_upcall *cup; + /* FIXME: determine net from clp */ + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + + /* Don't upcall if it's already stored */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + + cup = alloc_cld_upcall(cn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cup->cu_msg.cm_cmd = Cld_Create; + cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + clp->cl_name.len); + + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + if (!ret) { + ret = cup->cu_msg.cm_status; + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + + free_cld_upcall(cup); +out_err: + if (ret) + printk(KERN_ERR "NFSD: Unable to create client " + "record on stable storage: %d\n", ret); +} + +/* Ask daemon to create a new record */ +static void +nfsd4_cld_remove(struct nfs4_client *clp) +{ + int ret; + struct cld_upcall *cup; + /* FIXME: determine net from clp */ + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + + /* Don't upcall if it's already removed */ + if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + + cup = alloc_cld_upcall(cn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cup->cu_msg.cm_cmd = Cld_Remove; + cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + clp->cl_name.len); + + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + if (!ret) { + ret = cup->cu_msg.cm_status; + clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + + free_cld_upcall(cup); +out_err: + if (ret) + printk(KERN_ERR "NFSD: Unable to remove client " + "record from stable storage: %d\n", ret); +} + +/* Check for presence of a record, and update its timestamp */ +static int +nfsd4_cld_check(struct nfs4_client *clp) +{ + int ret; + struct cld_upcall *cup; + /* FIXME: determine net from clp */ + struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + + /* Don't upcall if one was already stored during this grace pd */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return 0; + + cup = alloc_cld_upcall(cn); + if (!cup) { + printk(KERN_ERR "NFSD: Unable to check client record on " + "stable storage: %d\n", -ENOMEM); + return -ENOMEM; + } + + cup->cu_msg.cm_cmd = Cld_Check; + cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + clp->cl_name.len); + + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + if (!ret) { + ret = cup->cu_msg.cm_status; + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + + free_cld_upcall(cup); + return ret; +} + +static void +nfsd4_cld_grace_done(struct net *net, time_t boot_time) +{ + int ret; + struct cld_upcall *cup; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + + cup = alloc_cld_upcall(cn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cup->cu_msg.cm_cmd = Cld_GraceDone; + cup->cu_msg.cm_u.cm_gracetime = (int64_t)boot_time; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + if (!ret) + ret = cup->cu_msg.cm_status; + + free_cld_upcall(cup); +out_err: + if (ret) + printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret); +} + +static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { + .init = nfsd4_init_cld_pipe, + .exit = nfsd4_remove_cld_pipe, + .create = nfsd4_cld_create, + .remove = nfsd4_cld_remove, + .check = nfsd4_cld_check, + .grace_done = nfsd4_cld_grace_done, +}; + +int +nfsd4_client_tracking_init(struct net *net) +{ + int status; + struct path path; + + if (!client_tracking_ops) { + client_tracking_ops = &nfsd4_cld_tracking_ops; + status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path); + if (!status) { + if (S_ISDIR(path.dentry->d_inode->i_mode)) + client_tracking_ops = + &nfsd4_legacy_tracking_ops; + path_put(&path); + } + } + + status = client_tracking_ops->init(net); + if (status) { + printk(KERN_WARNING "NFSD: Unable to initialize client " + "recovery tracking! (%d)\n", status); + client_tracking_ops = NULL; + } + return status; +} + +void +nfsd4_client_tracking_exit(struct net *net) +{ + if (client_tracking_ops) { + client_tracking_ops->exit(net); + client_tracking_ops = NULL; + } +} + +void +nfsd4_client_record_create(struct nfs4_client *clp) +{ + if (client_tracking_ops) + client_tracking_ops->create(clp); +} + +void +nfsd4_client_record_remove(struct nfs4_client *clp) +{ + if (client_tracking_ops) + client_tracking_ops->remove(clp); +} + +int +nfsd4_client_record_check(struct nfs4_client *clp) +{ + if (client_tracking_ops) + return client_tracking_ops->check(clp); + + return -EOPNOTSUPP; +} + +void +nfsd4_record_grace_done(struct net *net, time_t boot_time) +{ + if (client_tracking_ops) + client_tracking_ops->grace_done(net, boot_time); +} + +static int +rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr) +{ + struct super_block *sb = ptr; + struct net *net = sb->s_fs_info; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + struct dentry *dentry; + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return 0; + + if (!cn) { + module_put(THIS_MODULE); + return 0; + } + + switch (event) { + case RPC_PIPEFS_MOUNT: + dentry = nfsd4_cld_register_sb(sb, cn->cn_pipe); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + break; + } + cn->cn_pipe->dentry = dentry; + break; + case RPC_PIPEFS_UMOUNT: + if (cn->cn_pipe->dentry) + nfsd4_cld_unregister_sb(cn->cn_pipe); + break; + default: + ret = -ENOTSUPP; + break; + } + module_put(THIS_MODULE); + return ret; +} + +struct notifier_block nfsd4_cld_block = { + .notifier_call = rpc_pipefs_event, +}; + +int +register_cld_notifier(void) +{ + return rpc_pipefs_notifier_register(&nfsd4_cld_block); +} + +void +unregister_cld_notifier(void) +{ + rpc_pipefs_notifier_unregister(&nfsd4_cld_block); +} diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c5cddd65942..1841f8bf845 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -58,11 +58,15 @@ static const stateid_t one_stateid = { static const stateid_t zero_stateid = { /* all fields zero */ }; +static const stateid_t currentstateid = { + .si_generation = 1, +}; static u64 current_sessionid = 1; #define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t))) #define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t))) +#define CURRENT_STATEID(stateid) (!memcmp((stateid), ¤tstateid, sizeof(stateid_t))) /* forward declarations */ static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); @@ -91,6 +95,19 @@ nfs4_lock_state(void) mutex_lock(&client_mutex); } +static void free_session(struct kref *); + +/* Must be called under the client_lock */ +static void nfsd4_put_session_locked(struct nfsd4_session *ses) +{ + kref_put(&ses->se_ref, free_session); +} + +static void nfsd4_get_session(struct nfsd4_session *ses) +{ + kref_get(&ses->se_ref); +} + void nfs4_unlock_state(void) { @@ -605,12 +622,20 @@ hash_sessionid(struct nfs4_sessionid *sessionid) return sid->sequence % SESSION_HASH_SIZE; } +#ifdef NFSD_DEBUG static inline void dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) { u32 *ptr = (u32 *)(&sessionid->data[0]); dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]); } +#else +static inline void +dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) +{ +} +#endif + static void gen_sessionid(struct nfsd4_session *ses) @@ -832,11 +857,12 @@ static void nfsd4_del_conns(struct nfsd4_session *s) spin_unlock(&clp->cl_lock); } -void free_session(struct kref *kref) +static void free_session(struct kref *kref) { struct nfsd4_session *ses; int mem; + BUG_ON(!spin_is_locked(&client_lock)); ses = container_of(kref, struct nfsd4_session, se_ref); nfsd4_del_conns(ses); spin_lock(&nfsd_drc_lock); @@ -847,6 +873,13 @@ void free_session(struct kref *kref) kfree(ses); } +void nfsd4_put_session(struct nfsd4_session *ses) +{ + spin_lock(&client_lock); + nfsd4_put_session_locked(ses); + spin_unlock(&client_lock); +} + static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses) { struct nfsd4_session *new; @@ -894,7 +927,9 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n status = nfsd4_new_conn_from_crses(rqstp, new); /* whoops: benny points out, status is ignored! (err, or bogus) */ if (status) { + spin_lock(&client_lock); free_session(&new->se_ref); + spin_unlock(&client_lock); return NULL; } if (cses->flags & SESSION4_BACK_CHAN) { @@ -1006,12 +1041,13 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) static inline void free_client(struct nfs4_client *clp) { + BUG_ON(!spin_is_locked(&client_lock)); while (!list_empty(&clp->cl_sessions)) { struct nfsd4_session *ses; ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, se_perclnt); list_del(&ses->se_perclnt); - nfsd4_put_session(ses); + nfsd4_put_session_locked(ses); } if (clp->cl_cred.cr_group_info) put_group_info(clp->cl_cred.cr_group_info); @@ -1138,12 +1174,12 @@ static void gen_clid(struct nfs4_client *clp) static void gen_confirm(struct nfs4_client *clp) { + __be32 verf[2]; static u32 i; - u32 *p; - p = (u32 *)clp->cl_confirm.data; - *p++ = get_seconds(); - *p++ = i++; + verf[0] = (__be32)get_seconds(); + verf[1] = (__be32)i++; + memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data)); } static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t) @@ -1180,7 +1216,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, if (princ) { clp->cl_principal = kstrdup(princ, GFP_KERNEL); if (clp->cl_principal == NULL) { + spin_lock(&client_lock); free_client(clp); + spin_unlock(&client_lock); return NULL; } } @@ -1347,6 +1385,7 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) slot->sl_opcnt = resp->opcnt; slot->sl_status = resp->cstate.status; + slot->sl_flags |= NFSD4_SLOT_INITIALIZED; if (nfsd4_not_cached(resp)) { slot->sl_datalen = 0; return; @@ -1374,15 +1413,12 @@ nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args, struct nfsd4_op *op; struct nfsd4_slot *slot = resp->cstate.slot; - dprintk("--> %s resp->opcnt %d cachethis %u \n", __func__, - resp->opcnt, resp->cstate.slot->sl_cachethis); - /* Encode the replayed sequence operation */ op = &args->ops[resp->opcnt - 1]; nfsd4_encode_operation(resp, op); /* Return nfserr_retry_uncached_rep in next operation. */ - if (args->opcnt > 1 && slot->sl_cachethis == 0) { + if (args->opcnt > 1 && !(slot->sl_flags & NFSD4_SLOT_CACHETHIS)) { op = &args->ops[resp->opcnt++]; op->status = nfserr_retry_uncached_rep; nfsd4_encode_operation(resp, op); @@ -1575,16 +1611,11 @@ check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse) else return nfserr_seq_misordered; } - /* Normal */ + /* Note unsigned 32-bit arithmetic handles wraparound: */ if (likely(seqid == slot_seqid + 1)) return nfs_ok; - /* Replay */ if (seqid == slot_seqid) return nfserr_replay_cache; - /* Wraparound */ - if (seqid == 1 && (slot_seqid + 1) == 0) - return nfs_ok; - /* Misordered replay or misordered new request */ return nfserr_seq_misordered; } @@ -1815,9 +1846,10 @@ nfsd4_destroy_session(struct svc_rqst *r, nfsd4_probe_callback_sync(ses->se_client); nfs4_unlock_state(); + spin_lock(&client_lock); nfsd4_del_conns(ses); - - nfsd4_put_session(ses); + nfsd4_put_session_locked(ses); + spin_unlock(&client_lock); status = nfs_ok; out: dprintk("%s returns %d\n", __func__, ntohl(status)); @@ -1921,8 +1953,12 @@ nfsd4_sequence(struct svc_rqst *rqstp, * sr_highest_slotid and the sr_target_slot id to maxslots */ seq->maxslots = session->se_fchannel.maxreqs; - status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_inuse); + status = check_slot_seqid(seq->seqid, slot->sl_seqid, + slot->sl_flags & NFSD4_SLOT_INUSE); if (status == nfserr_replay_cache) { + status = nfserr_seq_misordered; + if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED)) + goto out; cstate->slot = slot; cstate->session = session; /* Return the cached reply status and set cstate->status @@ -1938,9 +1974,12 @@ nfsd4_sequence(struct svc_rqst *rqstp, conn = NULL; /* Success! bump slot seqid */ - slot->sl_inuse = true; slot->sl_seqid = seq->seqid; - slot->sl_cachethis = seq->cachethis; + slot->sl_flags |= NFSD4_SLOT_INUSE; + if (seq->cachethis) + slot->sl_flags |= NFSD4_SLOT_CACHETHIS; + else + slot->sl_flags &= ~NFSD4_SLOT_CACHETHIS; cstate->slot = slot; cstate->session = session; @@ -2030,7 +2069,8 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta nfs4_lock_state(); status = nfserr_complete_already; - if (cstate->session->se_client->cl_firststate) + if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, + &cstate->session->se_client->cl_flags)) goto out; status = nfserr_stale_clientid; @@ -2045,7 +2085,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta goto out; status = nfs_ok; - nfsd4_create_clid_dir(cstate->session->se_client); + nfsd4_client_record_create(cstate->session->se_client); out: nfs4_unlock_state(); return status; @@ -2240,7 +2280,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, conf = find_confirmed_client_by_str(unconf->cl_recdir, hash); if (conf) { - nfsd4_remove_clid_dir(conf); + nfsd4_client_record_remove(conf); expire_client(conf); } move_to_confirmed(unconf); @@ -2633,8 +2673,6 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) static int share_access_to_flags(u32 share_access) { - share_access &= ~NFS4_SHARE_WANT_MASK; - return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; } @@ -2776,10 +2814,9 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c static void -nfs4_set_claim_prev(struct nfsd4_open *open) +nfs4_set_claim_prev(struct nfsd4_open *open, bool has_session) { open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; - open->op_openowner->oo_owner.so_client->cl_firststate = 1; } /* Should we give out recallable state?: */ @@ -2855,6 +2892,27 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) return 0; } +static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) +{ + open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; + if (status == -EAGAIN) + open->op_why_no_deleg = WND4_CONTENTION; + else { + open->op_why_no_deleg = WND4_RESOURCE; + switch (open->op_deleg_want) { + case NFS4_SHARE_WANT_READ_DELEG: + case NFS4_SHARE_WANT_WRITE_DELEG: + case NFS4_SHARE_WANT_ANY_DELEG: + break; + case NFS4_SHARE_WANT_CANCEL: + open->op_why_no_deleg = WND4_CANCELLED; + break; + case NFS4_SHARE_WANT_NO_DELEG: + BUG(); /* not supposed to get here */ + } + } +} + /* * Attempt to hand out a delegation. */ @@ -2864,7 +2922,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_ struct nfs4_delegation *dp; struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); int cb_up; - int status, flag = 0; + int status = 0, flag = 0; cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); flag = NFS4_OPEN_DELEGATE_NONE; @@ -2905,11 +2963,16 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_ dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", STATEID_VAL(&dp->dl_stid.sc_stateid)); out: - if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS - && flag == NFS4_OPEN_DELEGATE_NONE - && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) - dprintk("NFSD: WARNING: refusing delegation reclaim\n"); open->op_delegate_type = flag; + if (flag == NFS4_OPEN_DELEGATE_NONE) { + if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && + open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) + dprintk("NFSD: WARNING: refusing delegation reclaim\n"); + + /* 4.1 client asking for a delegation? */ + if (open->op_deleg_want) + nfsd4_open_deleg_none_ext(open, status); + } return; out_free: nfs4_put_delegation(dp); @@ -2918,6 +2981,24 @@ out_no_deleg: goto out; } +static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, + struct nfs4_delegation *dp) +{ + if (open->op_deleg_want == NFS4_SHARE_WANT_READ_DELEG && + dp->dl_type == NFS4_OPEN_DELEGATE_WRITE) { + open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; + open->op_why_no_deleg = WND4_NOT_SUPP_DOWNGRADE; + } else if (open->op_deleg_want == NFS4_SHARE_WANT_WRITE_DELEG && + dp->dl_type == NFS4_OPEN_DELEGATE_WRITE) { + open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; + open->op_why_no_deleg = WND4_NOT_SUPP_UPGRADE; + } + /* Otherwise the client must be confused wanting a delegation + * it already has, therefore we don't return + * NFS4_OPEN_DELEGATE_NONE_EXT and reason. + */ +} + /* * called with nfs4_lock_state() held. */ @@ -2979,24 +3060,36 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf update_stateid(&stp->st_stid.sc_stateid); memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); - if (nfsd4_has_session(&resp->cstate)) + if (nfsd4_has_session(&resp->cstate)) { open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) { + open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; + open->op_why_no_deleg = WND4_NOT_WANTED; + goto nodeleg; + } + } + /* * Attempt to hand out a delegation. No error return, because the * OPEN succeeds even if we fail. */ nfs4_open_delegation(current_fh, open, stp); - +nodeleg: status = nfs_ok; dprintk("%s: stateid=" STATEID_FMT "\n", __func__, STATEID_VAL(&stp->st_stid.sc_stateid)); out: + /* 4.1 client trying to upgrade/downgrade delegation? */ + if (open->op_delegate_type == NFS4_OPEN_DELEGATE_NONE && dp && + open->op_deleg_want) + nfsd4_deleg_xgrade_none_ext(open, dp); + if (fp) put_nfs4_file(fp); if (status == 0 && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) - nfs4_set_claim_prev(open); + nfs4_set_claim_prev(open, nfsd4_has_session(&resp->cstate)); /* * To finish the open response, we just need to set the rflags. */ @@ -3066,7 +3159,7 @@ static void nfsd4_end_grace(void) { dprintk("NFSD: end of grace period\n"); - nfsd4_recdir_purge_old(); + nfsd4_record_grace_done(&init_net, boot_time); locks_end_grace(&nfsd4_manager); /* * Now that every NFSv4 client has had the chance to recover and @@ -3115,7 +3208,7 @@ nfs4_laundromat(void) clp = list_entry(pos, struct nfs4_client, cl_lru); dprintk("NFSD: purging unused client (clientid %08x)\n", clp->cl_clientid.cl_id); - nfsd4_remove_clid_dir(clp); + nfsd4_client_record_remove(clp); expire_client(clp); } spin_lock(&recall_lock); @@ -3400,7 +3493,14 @@ __be32 nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_test_stateid *test_stateid) { - /* real work is done during encoding */ + struct nfsd4_test_stateid_id *stateid; + struct nfs4_client *cl = cstate->session->se_client; + + nfs4_lock_state(); + list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list) + stateid->ts_id_status = nfs4_validate_stateid(cl, &stateid->ts_id_stateid); + nfs4_unlock_state(); + return nfs_ok; } @@ -3539,7 +3639,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid)); - nfsd4_create_clid_dir(oo->oo_owner.so_client); + nfsd4_client_record_create(oo->oo_owner.so_client); status = nfs_ok; out: if (!cstate->replay_owner) @@ -3596,7 +3696,9 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, cstate->current_fh.fh_dentry->d_name.name); /* We don't yet support WANT bits: */ - od->od_share_access &= NFS4_SHARE_ACCESS_MASK; + if (od->od_deleg_want) + dprintk("NFSD: %s: od_deleg_want=0x%x ignored\n", __func__, + od->od_deleg_want); nfs4_lock_state(); status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, @@ -4353,7 +4455,9 @@ nfs4_has_reclaimed_state(const char *name, bool use_exchange_id) struct nfs4_client *clp; clp = find_confirmed_client_by_str(name, strhashval); - return clp ? 1 : 0; + if (!clp) + return 0; + return test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } /* @@ -4377,7 +4481,7 @@ nfs4_client_to_reclaim(const char *name) return 1; } -static void +void nfs4_release_reclaim(void) { struct nfs4_client_reclaim *crp = NULL; @@ -4397,19 +4501,12 @@ nfs4_release_reclaim(void) /* * called from OPEN, CLAIM_PREVIOUS with a new clientid. */ -static struct nfs4_client_reclaim * -nfs4_find_reclaim_client(clientid_t *clid) +struct nfs4_client_reclaim * +nfsd4_find_reclaim_client(struct nfs4_client *clp) { unsigned int strhashval; - struct nfs4_client *clp; struct nfs4_client_reclaim *crp = NULL; - - /* find clientid in conf_id_hashtbl */ - clp = find_confirmed_client(clid); - if (clp == NULL) - return NULL; - dprintk("NFSD: nfs4_find_reclaim_client for %.*s with recdir %s\n", clp->cl_name.len, clp->cl_name.data, clp->cl_recdir); @@ -4430,7 +4527,14 @@ nfs4_find_reclaim_client(clientid_t *clid) __be32 nfs4_check_open_reclaim(clientid_t *clid) { - return nfs4_find_reclaim_client(clid) ? nfs_ok : nfserr_reclaim_bad; + struct nfs4_client *clp; + + /* find clientid in conf_id_hashtbl */ + clp = find_confirmed_client(clid); + if (clp == NULL) + return nfserr_reclaim_bad; + + return nfsd4_client_record_check(clp) ? nfserr_reclaim_bad : nfs_ok; } #ifdef CONFIG_NFSD_FAULT_INJECTION @@ -4442,7 +4546,7 @@ void nfsd_forget_clients(u64 num) nfs4_lock_state(); list_for_each_entry_safe(clp, next, &client_lru, cl_lru) { - nfsd4_remove_clid_dir(clp); + nfsd4_client_record_remove(clp); expire_client(clp); if (++count == num) break; @@ -4577,19 +4681,6 @@ nfs4_state_init(void) reclaim_str_hashtbl_size = 0; } -static void -nfsd4_load_reboot_recovery_data(void) -{ - int status; - - nfs4_lock_state(); - nfsd4_init_recdir(); - status = nfsd4_recdir_load(); - nfs4_unlock_state(); - if (status) - printk("NFSD: Failure reading reboot recovery data\n"); -} - /* * Since the lifetime of a delegation isn't limited to that of an open, a * client may quite reasonably hang on to a delegation as long as it has @@ -4613,21 +4704,34 @@ set_max_delegations(void) /* initialization to perform when the nfsd service is started: */ -static int -__nfs4_state_start(void) +int +nfs4_state_start(void) { int ret; + /* + * FIXME: For now, we hang most of the pernet global stuff off of + * init_net until nfsd is fully containerized. Eventually, we'll + * need to pass a net pointer into this function, take a reference + * to that instead and then do most of the rest of this on a per-net + * basis. + */ + get_net(&init_net); + nfsd4_client_tracking_init(&init_net); boot_time = get_seconds(); locks_start_grace(&nfsd4_manager); printk(KERN_INFO "NFSD: starting %ld-second grace period\n", nfsd4_grace); ret = set_callback_cred(); - if (ret) - return -ENOMEM; + if (ret) { + ret = -ENOMEM; + goto out_recovery; + } laundry_wq = create_singlethread_workqueue("nfsd4"); - if (laundry_wq == NULL) - return -ENOMEM; + if (laundry_wq == NULL) { + ret = -ENOMEM; + goto out_recovery; + } ret = nfsd4_create_callback_queue(); if (ret) goto out_free_laundry; @@ -4636,16 +4740,12 @@ __nfs4_state_start(void) return 0; out_free_laundry: destroy_workqueue(laundry_wq); +out_recovery: + nfsd4_client_tracking_exit(&init_net); + put_net(&init_net); return ret; } -int -nfs4_state_start(void) -{ - nfsd4_load_reboot_recovery_data(); - return __nfs4_state_start(); -} - static void __nfs4_state_shutdown(void) { @@ -4676,7 +4776,8 @@ __nfs4_state_shutdown(void) unhash_delegation(dp); } - nfsd4_shutdown_recdir(); + nfsd4_client_tracking_exit(&init_net); + put_net(&init_net); } void @@ -4686,8 +4787,108 @@ nfs4_state_shutdown(void) destroy_workqueue(laundry_wq); locks_end_grace(&nfsd4_manager); nfs4_lock_state(); - nfs4_release_reclaim(); __nfs4_state_shutdown(); nfs4_unlock_state(); nfsd4_destroy_callback_queue(); } + +static void +get_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid) +{ + if (HAS_STATE_ID(cstate, CURRENT_STATE_ID_FLAG) && CURRENT_STATEID(stateid)) + memcpy(stateid, &cstate->current_stateid, sizeof(stateid_t)); +} + +static void +put_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid) +{ + if (cstate->minorversion) { + memcpy(&cstate->current_stateid, stateid, sizeof(stateid_t)); + SET_STATE_ID(cstate, CURRENT_STATE_ID_FLAG); + } +} + +void +clear_current_stateid(struct nfsd4_compound_state *cstate) +{ + CLEAR_STATE_ID(cstate, CURRENT_STATE_ID_FLAG); +} + +/* + * functions to set current state id + */ +void +nfsd4_set_opendowngradestateid(struct nfsd4_compound_state *cstate, struct nfsd4_open_downgrade *odp) +{ + put_stateid(cstate, &odp->od_stateid); +} + +void +nfsd4_set_openstateid(struct nfsd4_compound_state *cstate, struct nfsd4_open *open) +{ + put_stateid(cstate, &open->op_stateid); +} + +void +nfsd4_set_closestateid(struct nfsd4_compound_state *cstate, struct nfsd4_close *close) +{ + put_stateid(cstate, &close->cl_stateid); +} + +void +nfsd4_set_lockstateid(struct nfsd4_compound_state *cstate, struct nfsd4_lock *lock) +{ + put_stateid(cstate, &lock->lk_resp_stateid); +} + +/* + * functions to consume current state id + */ + +void +nfsd4_get_opendowngradestateid(struct nfsd4_compound_state *cstate, struct nfsd4_open_downgrade *odp) +{ + get_stateid(cstate, &odp->od_stateid); +} + +void +nfsd4_get_delegreturnstateid(struct nfsd4_compound_state *cstate, struct nfsd4_delegreturn *drp) +{ + get_stateid(cstate, &drp->dr_stateid); +} + +void +nfsd4_get_freestateid(struct nfsd4_compound_state *cstate, struct nfsd4_free_stateid *fsp) +{ + get_stateid(cstate, &fsp->fr_stateid); +} + +void +nfsd4_get_setattrstateid(struct nfsd4_compound_state *cstate, struct nfsd4_setattr *setattr) +{ + get_stateid(cstate, &setattr->sa_stateid); +} + +void +nfsd4_get_closestateid(struct nfsd4_compound_state *cstate, struct nfsd4_close *close) +{ + get_stateid(cstate, &close->cl_stateid); +} + +void +nfsd4_get_lockustateid(struct nfsd4_compound_state *cstate, struct nfsd4_locku *locku) +{ + get_stateid(cstate, &locku->lu_stateid); +} + +void +nfsd4_get_readstateid(struct nfsd4_compound_state *cstate, struct nfsd4_read *read) +{ + get_stateid(cstate, &read->rd_stateid); +} + +void +nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, struct nfsd4_write *write) +{ + get_stateid(cstate, &write->wr_stateid); +} diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 0ec5a1b9700..bcd8904ab1e 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -133,22 +133,6 @@ xdr_error: \ } \ } while (0) -static void save_buf(struct nfsd4_compoundargs *argp, struct nfsd4_saved_compoundargs *savep) -{ - savep->p = argp->p; - savep->end = argp->end; - savep->pagelen = argp->pagelen; - savep->pagelist = argp->pagelist; -} - -static void restore_buf(struct nfsd4_compoundargs *argp, struct nfsd4_saved_compoundargs *savep) -{ - argp->p = savep->p; - argp->end = savep->end; - argp->pagelen = savep->pagelen; - argp->pagelist = savep->pagelist; -} - static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) { /* We want more bytes than seem to be available. @@ -638,14 +622,18 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup DECODE_TAIL; } -static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x) +static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *share_access, u32 *deleg_want, u32 *deleg_when) { __be32 *p; u32 w; READ_BUF(4); READ32(w); - *x = w; + *share_access = w & NFS4_SHARE_ACCESS_MASK; + *deleg_want = w & NFS4_SHARE_WANT_MASK; + if (deleg_when) + *deleg_when = w & NFS4_SHARE_WHEN_MASK; + switch (w & NFS4_SHARE_ACCESS_MASK) { case NFS4_SHARE_ACCESS_READ: case NFS4_SHARE_ACCESS_WRITE: @@ -673,6 +661,9 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x) w &= ~NFS4_SHARE_WANT_MASK; if (!w) return nfs_ok; + + if (!deleg_when) /* open_downgrade */ + return nfserr_inval; switch (w) { case NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL: case NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED: @@ -719,6 +710,7 @@ static __be32 nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) { DECODE_HEAD; + u32 dummy; memset(open->op_bmval, 0, sizeof(open->op_bmval)); open->op_iattr.ia_valid = 0; @@ -727,7 +719,9 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) /* seqid, share_access, share_deny, clientid, ownerlen */ READ_BUF(4); READ32(open->op_seqid); - status = nfsd4_decode_share_access(argp, &open->op_share_access); + /* decode, yet ignore deleg_when until supported */ + status = nfsd4_decode_share_access(argp, &open->op_share_access, + &open->op_deleg_want, &dummy); if (status) goto xdr_error; status = nfsd4_decode_share_deny(argp, &open->op_share_deny); @@ -755,14 +749,14 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) goto out; break; case NFS4_CREATE_EXCLUSIVE: - READ_BUF(8); - COPYMEM(open->op_verf.data, 8); + READ_BUF(NFS4_VERIFIER_SIZE); + COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); break; case NFS4_CREATE_EXCLUSIVE4_1: if (argp->minorversion < 1) goto xdr_error; - READ_BUF(8); - COPYMEM(open->op_verf.data, 8); + READ_BUF(NFS4_VERIFIER_SIZE); + COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl); if (status) @@ -848,7 +842,8 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d return status; READ_BUF(4); READ32(open_down->od_seqid); - status = nfsd4_decode_share_access(argp, &open_down->od_share_access); + status = nfsd4_decode_share_access(argp, &open_down->od_share_access, + &open_down->od_deleg_want, NULL); if (status) return status; status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny); @@ -994,8 +989,8 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient { DECODE_HEAD; - READ_BUF(8); - COPYMEM(setclientid->se_verf.data, 8); + READ_BUF(NFS4_VERIFIER_SIZE); + COPYMEM(setclientid->se_verf.data, NFS4_VERIFIER_SIZE); status = nfsd4_decode_opaque(argp, &setclientid->se_name); if (status) @@ -1020,9 +1015,9 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s { DECODE_HEAD; - READ_BUF(8 + sizeof(nfs4_verifier)); + READ_BUF(8 + NFS4_VERIFIER_SIZE); COPYMEM(&scd_c->sc_clientid, 8); - COPYMEM(&scd_c->sc_confirm, sizeof(nfs4_verifier)); + COPYMEM(&scd_c->sc_confirm, NFS4_VERIFIER_SIZE); DECODE_TAIL; } @@ -1385,26 +1380,29 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid) { - unsigned int nbytes; - stateid_t si; int i; - __be32 *p; - __be32 status; + __be32 *p, status; + struct nfsd4_test_stateid_id *stateid; READ_BUF(4); test_stateid->ts_num_ids = ntohl(*p++); - nbytes = test_stateid->ts_num_ids * sizeof(stateid_t); - if (nbytes > (u32)((char *)argp->end - (char *)argp->p)) - goto xdr_error; - - test_stateid->ts_saved_args = argp; - save_buf(argp, &test_stateid->ts_savedp); + INIT_LIST_HEAD(&test_stateid->ts_stateid_list); for (i = 0; i < test_stateid->ts_num_ids; i++) { - status = nfsd4_decode_stateid(argp, &si); + stateid = kmalloc(sizeof(struct nfsd4_test_stateid_id), GFP_KERNEL); + if (!stateid) { + status = PTR_ERR(stateid); + goto out; + } + + defer_free(argp, kfree, stateid); + INIT_LIST_HEAD(&stateid->ts_id_list); + list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list); + + status = nfsd4_decode_stateid(argp, &stateid->ts_id_stateid); if (status) - return status; + goto out; } status = 0; @@ -2661,8 +2659,8 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ __be32 *p; if (!nfserr) { - RESERVE_SPACE(8); - WRITEMEM(commit->co_verf.data, 8); + RESERVE_SPACE(NFS4_VERIFIER_SIZE); + WRITEMEM(commit->co_verf.data, NFS4_VERIFIER_SIZE); ADJUST_ARGS(); } return nfserr; @@ -2851,6 +2849,20 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op WRITE32(0); /* XXX: is NULL principal ok? */ ADJUST_ARGS(); break; + case NFS4_OPEN_DELEGATE_NONE_EXT: /* 4.1 */ + switch (open->op_why_no_deleg) { + case WND4_CONTENTION: + case WND4_RESOURCE: + RESERVE_SPACE(8); + WRITE32(open->op_why_no_deleg); + WRITE32(0); /* deleg signaling not supported yet */ + break; + default: + RESERVE_SPACE(4); + WRITE32(open->op_why_no_deleg); + } + ADJUST_ARGS(); + break; default: BUG(); } @@ -3008,7 +3020,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 if (resp->xbuf->page_len) return nfserr_resource; - RESERVE_SPACE(8); /* verifier */ + RESERVE_SPACE(NFS4_VERIFIER_SIZE); savep = p; /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */ @@ -3209,9 +3221,9 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n __be32 *p; if (!nfserr) { - RESERVE_SPACE(8 + sizeof(nfs4_verifier)); + RESERVE_SPACE(8 + NFS4_VERIFIER_SIZE); WRITEMEM(&scd->se_clientid, 8); - WRITEMEM(&scd->se_confirm, sizeof(nfs4_verifier)); + WRITEMEM(&scd->se_confirm, NFS4_VERIFIER_SIZE); ADJUST_ARGS(); } else if (nfserr == nfserr_clid_inuse) { @@ -3232,7 +3244,7 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w RESERVE_SPACE(16); WRITE32(write->wr_bytes_written); WRITE32(write->wr_how_written); - WRITEMEM(write->wr_verifier.data, 8); + WRITEMEM(write->wr_verifier.data, NFS4_VERIFIER_SIZE); ADJUST_ARGS(); } return nfserr; @@ -3391,30 +3403,17 @@ __be32 nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_test_stateid *test_stateid) { - struct nfsd4_compoundargs *argp; - struct nfs4_client *cl = resp->cstate.session->se_client; - stateid_t si; + struct nfsd4_test_stateid_id *stateid, *next; __be32 *p; - int i; - int valid; - - restore_buf(test_stateid->ts_saved_args, &test_stateid->ts_savedp); - argp = test_stateid->ts_saved_args; - RESERVE_SPACE(4); + RESERVE_SPACE(4 + (4 * test_stateid->ts_num_ids)); *p++ = htonl(test_stateid->ts_num_ids); - resp->p = p; - nfs4_lock_state(); - for (i = 0; i < test_stateid->ts_num_ids; i++) { - nfsd4_decode_stateid(argp, &si); - valid = nfs4_validate_stateid(cl, &si); - RESERVE_SPACE(4); - *p++ = htonl(valid); - resp->p = p; + list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) { + *p++ = htonl(stateid->ts_id_status); } - nfs4_unlock_state(); + ADJUST_ARGS(); return nfserr; } @@ -3532,7 +3531,7 @@ int nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad) if (length > session->se_fchannel.maxresp_sz) return nfserr_rep_too_big; - if (slot->sl_cachethis == 1 && + if ((slot->sl_flags & NFSD4_SLOT_CACHETHIS) && length > session->se_fchannel.maxresp_cached) return nfserr_rep_too_big_to_cache; @@ -3656,8 +3655,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo if (nfsd4_has_session(cs)) { if (cs->status != nfserr_replay_cache) { nfsd4_store_cache_entry(resp); - dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__); - cs->slot->sl_inuse = false; + cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE; } /* Renew the clientid on success and on replay */ release_session_client(cs->session); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 64c24af8d7e..2c53be6d357 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -13,12 +13,14 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/gss_api.h> #include <linux/sunrpc/gss_krb5_enctypes.h> +#include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/module.h> #include "idmap.h" #include "nfsd.h" #include "cache.h" #include "fault_inject.h" +#include "netns.h" /* * We have a single directory with several nodes in it. @@ -1124,14 +1126,26 @@ static int create_proc_exports_entry(void) } #endif +int nfsd_net_id; +static struct pernet_operations nfsd_net_ops = { + .id = &nfsd_net_id, + .size = sizeof(struct nfsd_net), +}; + static int __init init_nfsd(void) { int retval; printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); - retval = nfsd4_init_slabs(); + retval = register_cld_notifier(); if (retval) return retval; + retval = register_pernet_subsys(&nfsd_net_ops); + if (retval < 0) + goto out_unregister_notifier; + retval = nfsd4_init_slabs(); + if (retval) + goto out_unregister_pernet; nfs4_state_init(); retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ if (retval) @@ -1169,6 +1183,10 @@ out_free_stat: nfsd_fault_inject_cleanup(); out_free_slabs: nfsd4_free_slabs(); +out_unregister_pernet: + unregister_pernet_subsys(&nfsd_net_ops); +out_unregister_notifier: + unregister_cld_notifier(); return retval; } @@ -1184,6 +1202,8 @@ static void __exit exit_nfsd(void) nfsd4_free_slabs(); nfsd_fault_inject_cleanup(); unregister_filesystem(&nfsd_fs_type); + unregister_pernet_subsys(&nfsd_net_ops); + unregister_cld_notifier(); } MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 1d1e8589b4c..1671429ffa6 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -364,12 +364,17 @@ static inline u32 nfsd_suppattrs2(u32 minorversion) NFSD_WRITEABLE_ATTRS_WORD2 extern int nfsd4_is_junction(struct dentry *dentry); -#else +extern int register_cld_notifier(void); +extern void unregister_cld_notifier(void); +#else /* CONFIG_NFSD_V4 */ static inline int nfsd4_is_junction(struct dentry *dentry) { return 0; } +#define register_cld_notifier() 0 +#define unregister_cld_notifier() do { } while(0) + #endif /* CONFIG_NFSD_V4 */ #endif /* LINUX_NFSD_NFSD_H */ diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index fce472f5f39..28dfad39f0c 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -307,33 +307,37 @@ static void set_max_drc(void) dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem); } -int nfsd_create_serv(void) +static int nfsd_get_default_max_blksize(void) { - int err = 0; + struct sysinfo i; + unsigned long long target; + unsigned long ret; + + si_meminfo(&i); + target = (i.totalram - i.totalhigh) << PAGE_SHIFT; + /* + * Aim for 1/4096 of memory per thread This gives 1MB on 4Gig + * machines, but only uses 32K on 128M machines. Bottom out at + * 8K on 32M and smaller. Of course, this is only a default. + */ + target >>= 12; + + ret = NFSSVC_MAXBLKSIZE; + while (ret > target && ret >= 8*1024*2) + ret /= 2; + return ret; +} +int nfsd_create_serv(void) +{ WARN_ON(!mutex_is_locked(&nfsd_mutex)); if (nfsd_serv) { svc_get(nfsd_serv); return 0; } - if (nfsd_max_blksize == 0) { - /* choose a suitable default */ - struct sysinfo i; - si_meminfo(&i); - /* Aim for 1/4096 of memory per thread - * This gives 1MB on 4Gig machines - * But only uses 32K on 128M machines. - * Bottom out at 8K on 32M and smaller. - * Of course, this is only a default. - */ - nfsd_max_blksize = NFSSVC_MAXBLKSIZE; - i.totalram <<= PAGE_SHIFT - 12; - while (nfsd_max_blksize > i.totalram && - nfsd_max_blksize >= 8*1024*2) - nfsd_max_blksize /= 2; - } + if (nfsd_max_blksize == 0) + nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(); - nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) @@ -341,7 +345,7 @@ int nfsd_create_serv(void) set_max_drc(); do_gettimeofday(&nfssvc_boot); /* record boot time */ - return err; + return 0; } int nfsd_nrpools(void) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index ffb5df1db94..89ab137d379 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -128,12 +128,14 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) (NFSD_CACHE_SIZE_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE) struct nfsd4_slot { - bool sl_inuse; - bool sl_cachethis; - u16 sl_opcnt; u32 sl_seqid; __be32 sl_status; u32 sl_datalen; + u16 sl_opcnt; +#define NFSD4_SLOT_INUSE (1 << 0) +#define NFSD4_SLOT_CACHETHIS (1 << 1) +#define NFSD4_SLOT_INITIALIZED (1 << 2) + u8 sl_flags; char sl_data[]; }; @@ -196,18 +198,7 @@ struct nfsd4_session { struct nfsd4_slot *se_slots[]; /* forward channel slots */ }; -static inline void -nfsd4_put_session(struct nfsd4_session *ses) -{ - extern void free_session(struct kref *kref); - kref_put(&ses->se_ref, free_session); -} - -static inline void -nfsd4_get_session(struct nfsd4_session *ses) -{ - kref_get(&ses->se_ref); -} +extern void nfsd4_put_session(struct nfsd4_session *ses); /* formatted contents of nfs4_sessionid */ struct nfsd4_sessionid { @@ -245,14 +236,17 @@ struct nfs4_client { struct svc_cred cl_cred; /* setclientid principal */ clientid_t cl_clientid; /* generated by server */ nfs4_verifier cl_confirm; /* generated by server */ - u32 cl_firststate; /* recovery dir creation */ u32 cl_minorversion; /* for v4.0 and v4.1 callbacks: */ struct nfs4_cb_conn cl_cb_conn; -#define NFSD4_CLIENT_CB_UPDATE 1 -#define NFSD4_CLIENT_KILL 2 - unsigned long cl_cb_flags; +#define NFSD4_CLIENT_CB_UPDATE (0) +#define NFSD4_CLIENT_CB_KILL (1) +#define NFSD4_CLIENT_STABLE (2) /* client on stable storage */ +#define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */ +#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \ + 1 << NFSD4_CLIENT_CB_KILL) + unsigned long cl_flags; struct rpc_clnt *cl_cb_client; u32 cl_cb_ident; #define NFSD4_CB_UP 0 @@ -463,6 +457,8 @@ extern __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, extern void nfs4_lock_state(void); extern void nfs4_unlock_state(void); extern int nfs4_in_grace(void); +extern void nfs4_release_reclaim(void); +extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct nfs4_client *crp); extern __be32 nfs4_check_open_reclaim(clientid_t *clid); extern void nfs4_free_openowner(struct nfs4_openowner *); extern void nfs4_free_lockowner(struct nfs4_lockowner *); @@ -477,16 +473,17 @@ extern void nfsd4_destroy_callback_queue(void); extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfs4_put_delegation(struct nfs4_delegation *dp); extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); -extern void nfsd4_init_recdir(void); -extern int nfsd4_recdir_load(void); -extern void nfsd4_shutdown_recdir(void); extern int nfs4_client_to_reclaim(const char *name); extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); -extern void nfsd4_recdir_purge_old(void); -extern void nfsd4_create_clid_dir(struct nfs4_client *clp); -extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *); extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); +/* nfs4recover operations */ +extern int nfsd4_client_tracking_init(struct net *net); +extern void nfsd4_client_tracking_exit(struct net *net); +extern void nfsd4_client_record_create(struct nfs4_client *clp); +extern void nfsd4_client_record_remove(struct nfs4_client *clp); +extern int nfsd4_client_record_check(struct nfs4_client *clp); +extern void nfsd4_record_grace_done(struct net *net, time_t boot_time); #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index e59f71d0cf7..296d671654d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -737,12 +737,13 @@ static int nfsd_open_break_lease(struct inode *inode, int access) /* * Open an existing file or directory. - * The access argument indicates the type of open (read/write/lock) + * The may_flags argument indicates the type of open (read/write/lock) + * and additional flags. * N.B. After this call fhp needs an fh_put */ __be32 nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, - int access, struct file **filp) + int may_flags, struct file **filp) { struct dentry *dentry; struct inode *inode; @@ -757,7 +758,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, * and (hopefully) checked permission - so allow OWNER_OVERRIDE * in case a chmod has now revoked permission. */ - err = fh_verify(rqstp, fhp, type, access | NFSD_MAY_OWNER_OVERRIDE); + err = fh_verify(rqstp, fhp, type, may_flags | NFSD_MAY_OWNER_OVERRIDE); if (err) goto out; @@ -768,7 +769,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, * or any access when mandatory locking enabled */ err = nfserr_perm; - if (IS_APPEND(inode) && (access & NFSD_MAY_WRITE)) + if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE)) goto out; /* * We must ignore files (but only files) which might have mandatory @@ -781,12 +782,12 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, if (!inode->i_fop) goto out; - host_err = nfsd_open_break_lease(inode, access); + host_err = nfsd_open_break_lease(inode, may_flags); if (host_err) /* NOMEM or WOULDBLOCK */ goto out_nfserr; - if (access & NFSD_MAY_WRITE) { - if (access & NFSD_MAY_READ) + if (may_flags & NFSD_MAY_WRITE) { + if (may_flags & NFSD_MAY_READ) flags = O_RDWR|O_LARGEFILE; else flags = O_WRONLY|O_LARGEFILE; @@ -795,8 +796,15 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, flags, current_cred()); if (IS_ERR(*filp)) host_err = PTR_ERR(*filp); - else - host_err = ima_file_check(*filp, access); + else { + host_err = ima_file_check(*filp, may_flags); + + if (may_flags & NFSD_MAY_64BIT_COOKIE) + (*filp)->f_mode |= FMODE_64BITHASH; + else + (*filp)->f_mode |= FMODE_32BITHASH; + } + out_nfserr: err = nfserrno(host_err); out: @@ -2021,8 +2029,13 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, __be32 err; struct file *file; loff_t offset = *offsetp; + int may_flags = NFSD_MAY_READ; + + /* NFSv2 only supports 32 bit cookies */ + if (rqstp->rq_vers > 2) + may_flags |= NFSD_MAY_64BIT_COOKIE; - err = nfsd_open(rqstp, fhp, S_IFDIR, NFSD_MAY_READ, &file); + err = nfsd_open(rqstp, fhp, S_IFDIR, may_flags, &file); if (err) goto out; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 1dcd238e11a..ec0611b2b73 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -27,6 +27,8 @@ #define NFSD_MAY_BYPASS_GSS 0x400 #define NFSD_MAY_READ_IF_EXEC 0x800 +#define NFSD_MAY_64BIT_COOKIE 0x1000 /* 64 bit readdir cookies for >= NFSv3 */ + #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 2364747ee97..1b3501598ab 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -43,6 +43,13 @@ #define NFSD4_MAX_TAGLEN 128 #define XDR_LEN(n) (((n) + 3) & ~3) +#define CURRENT_STATE_ID_FLAG (1<<0) +#define SAVED_STATE_ID_FLAG (1<<1) + +#define SET_STATE_ID(c, f) ((c)->sid_flags |= (f)) +#define HAS_STATE_ID(c, f) ((c)->sid_flags & (f)) +#define CLEAR_STATE_ID(c, f) ((c)->sid_flags &= ~(f)) + struct nfsd4_compound_state { struct svc_fh current_fh; struct svc_fh save_fh; @@ -54,6 +61,10 @@ struct nfsd4_compound_state { size_t iovlen; u32 minorversion; u32 status; + stateid_t current_stateid; + stateid_t save_stateid; + /* to indicate current and saved state id presents */ + u32 sid_flags; }; static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs) @@ -212,16 +223,19 @@ struct nfsd4_open { struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */ u32 op_delegate_type; /* request - CLAIM_PREV only */ stateid_t op_delegate_stateid; /* request - response */ + u32 op_why_no_deleg; /* response - DELEG_NONE_EXT only */ u32 op_create; /* request */ u32 op_createmode; /* request */ u32 op_bmval[3]; /* request */ struct iattr iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ - nfs4_verifier verf; /* EXCLUSIVE4 */ + nfs4_verifier op_verf __attribute__((aligned(32))); + /* EXCLUSIVE4 */ clientid_t op_clientid; /* request */ struct xdr_netobj op_owner; /* request */ u32 op_seqid; /* request */ u32 op_share_access; /* request */ u32 op_share_deny; /* request */ + u32 op_deleg_want; /* request */ stateid_t op_stateid; /* response */ u32 op_recall; /* recall */ struct nfsd4_change_info op_cinfo; /* response */ @@ -234,7 +248,6 @@ struct nfsd4_open { struct nfs4_acl *op_acl; }; #define op_iattr iattr -#define op_verf verf struct nfsd4_open_confirm { stateid_t oc_req_stateid /* request */; @@ -245,8 +258,9 @@ struct nfsd4_open_confirm { struct nfsd4_open_downgrade { stateid_t od_stateid; u32 od_seqid; - u32 od_share_access; - u32 od_share_deny; + u32 od_share_access; /* request */ + u32 od_deleg_want; /* request */ + u32 od_share_deny; /* request */ }; @@ -343,10 +357,15 @@ struct nfsd4_saved_compoundargs { struct page **pagelist; }; +struct nfsd4_test_stateid_id { + __be32 ts_id_status; + stateid_t ts_id_stateid; + struct list_head ts_id_list; +}; + struct nfsd4_test_stateid { __be32 ts_num_ids; - struct nfsd4_compoundargs *ts_saved_args; - struct nfsd4_saved_compoundargs ts_savedp; + struct list_head ts_stateid_list; }; struct nfsd4_free_stateid { @@ -503,7 +522,8 @@ static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp) static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) { - return !resp->cstate.slot->sl_cachethis || nfsd4_is_solo_sequence(resp); + return !(resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS) + || nfsd4_is_solo_sequence(resp); } #define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index a6fda3c188a..a1a1bfd652c 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -28,8 +28,6 @@ #include "suballoc.h" #include "move_extents.h" -#include <linux/ext2_fs.h> - #define o2info_from_user(a, b) \ copy_from_user(&(a), (b), sizeof(a)) #define o2info_to_user(a, b) \ diff --git a/fs/open.c b/fs/open.c index 77becc04114..5720854156d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -836,7 +836,7 @@ EXPORT_SYMBOL(dentry_open); static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); - __FD_CLR(fd, fdt->open_fds); + __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; } @@ -1080,7 +1080,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd) if (!filp) goto out_unlock; rcu_assign_pointer(fdt->fd[fd], NULL); - FD_CLR(fd, fdt->close_on_exec); + __clear_close_on_exec(fd, fdt); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); retval = filp_close(filp, files); diff --git a/fs/proc/array.c b/fs/proc/array.c index fbb53c24908..f9bd395b347 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -550,7 +550,7 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, ' ', shared); seq_put_decimal_ull(m, ' ', text); seq_put_decimal_ull(m, ' ', 0); - seq_put_decimal_ull(m, ' ', text); + seq_put_decimal_ull(m, ' ', data); seq_put_decimal_ull(m, ' ', 0); seq_putc(m, '\n'); diff --git a/fs/proc/base.c b/fs/proc/base.c index 3b42c1418f3..1c8b280146d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1753,7 +1753,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) fdt = files_fdtable(files); f_flags = file->f_flags & ~O_CLOEXEC; - if (FD_ISSET(fd, fdt->close_on_exec)) + if (close_on_exec(fd, fdt)) f_flags |= O_CLOEXEC; if (path) { diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 8461a7b82fd..205c9228083 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -22,7 +22,6 @@ #include <linux/slab.h> #include <linux/mount.h> -#include <asm/system.h> #include <asm/uaccess.h> #include "internal.h" diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 3551f1f839e..0d9e23a39e4 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -156,15 +156,15 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out; - last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; - for (entry = ns_entries; entry <= last; entry++) { + last = &ns_entries[ARRAY_SIZE(ns_entries)]; + for (entry = ns_entries; entry < last; entry++) { if (strlen((*entry)->name) != len) continue; if (!memcmp(dentry->d_name.name, (*entry)->name, len)) break; } error = ERR_PTR(-ENOENT); - if (entry > last) + if (entry == last) goto out; error = proc_ns_instantiate(dir, dentry, task, *entry); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9694cc28351..2b9a7607cbd 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -781,12 +781,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, int err = 0; pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); - if (pmd_trans_unstable(pmd)) - return 0; - /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); - spin_lock(&walk->mm->page_table_lock); if (pmd_trans_huge_lock(pmd, vma) == 1) { for (; addr != end; addr += PAGE_SIZE) { unsigned long offset; @@ -802,6 +798,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return err; } + if (pmd_trans_unstable(pmd)) + return 0; for (; addr != end; addr += PAGE_SIZE) { /* check to see if we've left 'vma' behind diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index f37c32b9452..50952c9bd06 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -105,26 +105,12 @@ static const struct inode_operations pstore_dir_inode_operations = { .unlink = pstore_unlink, }; -static struct inode *pstore_get_inode(struct super_block *sb, - const struct inode *dir, int mode, dev_t dev) +static struct inode *pstore_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); - if (inode) { inode->i_ino = get_next_ino(); - inode->i_uid = inode->i_gid = 0; - inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - switch (mode & S_IFMT) { - case S_IFREG: - inode->i_fop = &pstore_file_operations; - break; - case S_IFDIR: - inode->i_op = &pstore_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - inc_nlink(inode); - break; - } } return inode; } @@ -216,9 +202,11 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, return rc; rc = -ENOMEM; - inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0); + inode = pstore_get_inode(pstore_sb); if (!inode) goto fail; + inode->i_mode = S_IFREG | 0444; + inode->i_fop = &pstore_file_operations; private = kmalloc(sizeof *private + size, GFP_KERNEL); if (!private) goto fail_alloc; @@ -293,10 +281,12 @@ int pstore_fill_super(struct super_block *sb, void *data, int silent) parse_options(data); - inode = pstore_get_inode(sb, NULL, S_IFDIR | 0755, 0); + inode = pstore_get_inode(sb); if (inode) { - /* override ramfs "dir" options so we catch unlink(2) */ + inode->i_mode = S_IFDIR | 0755; inode->i_op = &pstore_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inc_nlink(inode); } sb->s_root = d_make_root(inode); if (!sb->s_root) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 8b4f12b33f5..d69a1d1d7e1 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1110,6 +1110,13 @@ static void dquot_decr_space(struct dquot *dquot, qsize_t number) clear_bit(DQ_BLKS_B, &dquot->dq_flags); } +struct dquot_warn { + struct super_block *w_sb; + qid_t w_dq_id; + short w_dq_type; + short w_type; +}; + static int warning_issued(struct dquot *dquot, const int warntype) { int flag = (warntype == QUOTA_NL_BHARDWARN || @@ -1125,41 +1132,42 @@ static int warning_issued(struct dquot *dquot, const int warntype) #ifdef CONFIG_PRINT_QUOTA_WARNING static int flag_print_warnings = 1; -static int need_print_warning(struct dquot *dquot) +static int need_print_warning(struct dquot_warn *warn) { if (!flag_print_warnings) return 0; - switch (dquot->dq_type) { + switch (warn->w_dq_type) { case USRQUOTA: - return current_fsuid() == dquot->dq_id; + return current_fsuid() == warn->w_dq_id; case GRPQUOTA: - return in_group_p(dquot->dq_id); + return in_group_p(warn->w_dq_id); } return 0; } /* Print warning to user which exceeded quota */ -static void print_warning(struct dquot *dquot, const int warntype) +static void print_warning(struct dquot_warn *warn) { char *msg = NULL; struct tty_struct *tty; + int warntype = warn->w_type; if (warntype == QUOTA_NL_IHARDBELOW || warntype == QUOTA_NL_ISOFTBELOW || warntype == QUOTA_NL_BHARDBELOW || - warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot)) + warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(warn)) return; tty = get_current_tty(); if (!tty) return; - tty_write_message(tty, dquot->dq_sb->s_id); + tty_write_message(tty, warn->w_sb->s_id); if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN) tty_write_message(tty, ": warning, "); else tty_write_message(tty, ": write failed, "); - tty_write_message(tty, quotatypes[dquot->dq_type]); + tty_write_message(tty, quotatypes[warn->w_dq_type]); switch (warntype) { case QUOTA_NL_IHARDWARN: msg = " file limit reached.\r\n"; @@ -1185,26 +1193,34 @@ static void print_warning(struct dquot *dquot, const int warntype) } #endif +static void prepare_warning(struct dquot_warn *warn, struct dquot *dquot, + int warntype) +{ + if (warning_issued(dquot, warntype)) + return; + warn->w_type = warntype; + warn->w_sb = dquot->dq_sb; + warn->w_dq_id = dquot->dq_id; + warn->w_dq_type = dquot->dq_type; +} + /* * Write warnings to the console and send warning messages over netlink. * - * Note that this function can sleep. + * Note that this function can call into tty and networking code. */ -static void flush_warnings(struct dquot *const *dquots, char *warntype) +static void flush_warnings(struct dquot_warn *warn) { - struct dquot *dq; int i; for (i = 0; i < MAXQUOTAS; i++) { - dq = dquots[i]; - if (dq && warntype[i] != QUOTA_NL_NOWARN && - !warning_issued(dq, warntype[i])) { + if (warn[i].w_type == QUOTA_NL_NOWARN) + continue; #ifdef CONFIG_PRINT_QUOTA_WARNING - print_warning(dq, warntype[i]); + print_warning(&warn[i]); #endif - quota_send_warning(dq->dq_type, dq->dq_id, - dq->dq_sb->s_dev, warntype[i]); - } + quota_send_warning(warn[i].w_dq_type, warn[i].w_dq_id, + warn[i].w_sb->s_dev, warn[i].w_type); } } @@ -1218,11 +1234,11 @@ static int ignore_hardlimit(struct dquot *dquot) } /* needs dq_data_lock */ -static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) +static int check_idq(struct dquot *dquot, qsize_t inodes, + struct dquot_warn *warn) { qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes; - *warntype = QUOTA_NL_NOWARN; if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) return 0; @@ -1230,7 +1246,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) if (dquot->dq_dqb.dqb_ihardlimit && newinodes > dquot->dq_dqb.dqb_ihardlimit && !ignore_hardlimit(dquot)) { - *warntype = QUOTA_NL_IHARDWARN; + prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN); return -EDQUOT; } @@ -1239,14 +1255,14 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) dquot->dq_dqb.dqb_itime && get_seconds() >= dquot->dq_dqb.dqb_itime && !ignore_hardlimit(dquot)) { - *warntype = QUOTA_NL_ISOFTLONGWARN; + prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN); return -EDQUOT; } if (dquot->dq_dqb.dqb_isoftlimit && newinodes > dquot->dq_dqb.dqb_isoftlimit && dquot->dq_dqb.dqb_itime == 0) { - *warntype = QUOTA_NL_ISOFTWARN; + prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN); dquot->dq_dqb.dqb_itime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; } @@ -1255,12 +1271,12 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) } /* needs dq_data_lock */ -static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype) +static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, + struct dquot_warn *warn) { qsize_t tspace; struct super_block *sb = dquot->dq_sb; - *warntype = QUOTA_NL_NOWARN; if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) return 0; @@ -1272,7 +1288,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war tspace > dquot->dq_dqb.dqb_bhardlimit && !ignore_hardlimit(dquot)) { if (!prealloc) - *warntype = QUOTA_NL_BHARDWARN; + prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN); return -EDQUOT; } @@ -1282,7 +1298,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war get_seconds() >= dquot->dq_dqb.dqb_btime && !ignore_hardlimit(dquot)) { if (!prealloc) - *warntype = QUOTA_NL_BSOFTLONGWARN; + prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN); return -EDQUOT; } @@ -1290,7 +1306,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war tspace > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime == 0) { if (!prealloc) { - *warntype = QUOTA_NL_BSOFTWARN; + prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN); dquot->dq_dqb.dqb_btime = get_seconds() + sb_dqopt(sb)->info[dquot->dq_type].dqi_bgrace; } @@ -1543,10 +1559,9 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { int cnt, ret = 0; - char warntype[MAXQUOTAS]; - int warn = flags & DQUOT_SPACE_WARN; + struct dquot_warn warn[MAXQUOTAS]; + struct dquot **dquots = inode->i_dquot; int reserve = flags & DQUOT_SPACE_RESERVE; - int nofail = flags & DQUOT_SPACE_NOFAIL; /* * First test before acquiring mutex - solves deadlocks when we @@ -1559,36 +1574,36 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); for (cnt = 0; cnt < MAXQUOTAS; cnt++) - warntype[cnt] = QUOTA_NL_NOWARN; + warn[cnt].w_type = QUOTA_NL_NOWARN; spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!inode->i_dquot[cnt]) + if (!dquots[cnt]) continue; - ret = check_bdq(inode->i_dquot[cnt], number, !warn, - warntype+cnt); - if (ret && !nofail) { + ret = check_bdq(dquots[cnt], number, + !(flags & DQUOT_SPACE_WARN), &warn[cnt]); + if (ret && !(flags & DQUOT_SPACE_NOFAIL)) { spin_unlock(&dq_data_lock); goto out_flush_warn; } } for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!inode->i_dquot[cnt]) + if (!dquots[cnt]) continue; if (reserve) - dquot_resv_space(inode->i_dquot[cnt], number); + dquot_resv_space(dquots[cnt], number); else - dquot_incr_space(inode->i_dquot[cnt], number); + dquot_incr_space(dquots[cnt], number); } inode_incr_space(inode, number, reserve); spin_unlock(&dq_data_lock); if (reserve) goto out_flush_warn; - mark_all_dquot_dirty(inode->i_dquot); + mark_all_dquot_dirty(dquots); out_flush_warn: - flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + flush_warnings(warn); out: return ret; } @@ -1600,36 +1615,37 @@ EXPORT_SYMBOL(__dquot_alloc_space); int dquot_alloc_inode(const struct inode *inode) { int cnt, ret = 0; - char warntype[MAXQUOTAS]; + struct dquot_warn warn[MAXQUOTAS]; + struct dquot * const *dquots = inode->i_dquot; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ if (!dquot_active(inode)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) - warntype[cnt] = QUOTA_NL_NOWARN; + warn[cnt].w_type = QUOTA_NL_NOWARN; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!inode->i_dquot[cnt]) + if (!dquots[cnt]) continue; - ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt); + ret = check_idq(dquots[cnt], 1, &warn[cnt]); if (ret) goto warn_put_all; } for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!inode->i_dquot[cnt]) + if (!dquots[cnt]) continue; - dquot_incr_inodes(inode->i_dquot[cnt], 1); + dquot_incr_inodes(dquots[cnt], 1); } warn_put_all: spin_unlock(&dq_data_lock); if (ret == 0) - mark_all_dquot_dirty(inode->i_dquot); - flush_warnings(inode->i_dquot, warntype); + mark_all_dquot_dirty(dquots); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + flush_warnings(warn); return ret; } EXPORT_SYMBOL(dquot_alloc_inode); @@ -1669,7 +1685,8 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty); void __dquot_free_space(struct inode *inode, qsize_t number, int flags) { unsigned int cnt; - char warntype[MAXQUOTAS]; + struct dquot_warn warn[MAXQUOTAS]; + struct dquot **dquots = inode->i_dquot; int reserve = flags & DQUOT_SPACE_RESERVE; /* First test before acquiring mutex - solves deadlocks when we @@ -1682,23 +1699,28 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!inode->i_dquot[cnt]) + int wtype; + + warn[cnt].w_type = QUOTA_NL_NOWARN; + if (!dquots[cnt]) continue; - warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); + wtype = info_bdq_free(dquots[cnt], number); + if (wtype != QUOTA_NL_NOWARN) + prepare_warning(&warn[cnt], dquots[cnt], wtype); if (reserve) - dquot_free_reserved_space(inode->i_dquot[cnt], number); + dquot_free_reserved_space(dquots[cnt], number); else - dquot_decr_space(inode->i_dquot[cnt], number); + dquot_decr_space(dquots[cnt], number); } inode_decr_space(inode, number, reserve); spin_unlock(&dq_data_lock); if (reserve) goto out_unlock; - mark_all_dquot_dirty(inode->i_dquot); + mark_all_dquot_dirty(dquots); out_unlock: - flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + flush_warnings(warn); } EXPORT_SYMBOL(__dquot_free_space); @@ -1708,7 +1730,8 @@ EXPORT_SYMBOL(__dquot_free_space); void dquot_free_inode(const struct inode *inode) { unsigned int cnt; - char warntype[MAXQUOTAS]; + struct dquot_warn warn[MAXQUOTAS]; + struct dquot * const *dquots = inode->i_dquot; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ @@ -1718,15 +1741,20 @@ void dquot_free_inode(const struct inode *inode) down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!inode->i_dquot[cnt]) + int wtype; + + warn[cnt].w_type = QUOTA_NL_NOWARN; + if (!dquots[cnt]) continue; - warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1); - dquot_decr_inodes(inode->i_dquot[cnt], 1); + wtype = info_idq_free(dquots[cnt], 1); + if (wtype != QUOTA_NL_NOWARN) + prepare_warning(&warn[cnt], dquots[cnt], wtype); + dquot_decr_inodes(dquots[cnt], 1); } spin_unlock(&dq_data_lock); - mark_all_dquot_dirty(inode->i_dquot); - flush_warnings(inode->i_dquot, warntype); + mark_all_dquot_dirty(dquots); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + flush_warnings(warn); } EXPORT_SYMBOL(dquot_free_inode); @@ -1747,16 +1775,20 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) struct dquot *transfer_from[MAXQUOTAS] = {}; int cnt, ret = 0; char is_valid[MAXQUOTAS] = {}; - char warntype_to[MAXQUOTAS]; - char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; + struct dquot_warn warn_to[MAXQUOTAS]; + struct dquot_warn warn_from_inodes[MAXQUOTAS]; + struct dquot_warn warn_from_space[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) return 0; /* Initialize the arrays */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - warntype_to[cnt] = QUOTA_NL_NOWARN; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + warn_to[cnt].w_type = QUOTA_NL_NOWARN; + warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN; + warn_from_space[cnt].w_type = QUOTA_NL_NOWARN; + } down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1778,10 +1810,10 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) continue; is_valid[cnt] = 1; transfer_from[cnt] = inode->i_dquot[cnt]; - ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt); + ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]); if (ret) goto over_quota; - ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt); + ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]); if (ret) goto over_quota; } @@ -1794,10 +1826,15 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) continue; /* Due to IO error we might not have transfer_from[] structure */ if (transfer_from[cnt]) { - warntype_from_inodes[cnt] = - info_idq_free(transfer_from[cnt], 1); - warntype_from_space[cnt] = - info_bdq_free(transfer_from[cnt], space); + int wtype; + wtype = info_idq_free(transfer_from[cnt], 1); + if (wtype != QUOTA_NL_NOWARN) + prepare_warning(&warn_from_inodes[cnt], + transfer_from[cnt], wtype); + wtype = info_bdq_free(transfer_from[cnt], space); + if (wtype != QUOTA_NL_NOWARN) + prepare_warning(&warn_from_space[cnt], + transfer_from[cnt], wtype); dquot_decr_inodes(transfer_from[cnt], 1); dquot_decr_space(transfer_from[cnt], cur_space); dquot_free_reserved_space(transfer_from[cnt], @@ -1815,9 +1852,9 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) mark_all_dquot_dirty(transfer_from); mark_all_dquot_dirty(transfer_to); - flush_warnings(transfer_to, warntype_to); - flush_warnings(transfer_from, warntype_from_inodes); - flush_warnings(transfer_from, warntype_from_space); + flush_warnings(warn_to); + flush_warnings(warn_from_inodes); + flush_warnings(warn_from_space); /* Pass back references to put */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (is_valid[cnt]) @@ -1826,7 +1863,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) over_quota: spin_unlock(&dq_data_lock); up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - flush_warnings(transfer_to, warntype_to); + flush_warnings(warn_to); return ret; } EXPORT_SYMBOL(__dquot_transfer); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index cf9f4de00a9..b1a08573fe1 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -51,7 +51,6 @@ #include <linux/uaccess.h> #include <linux/slab.h> -#include <asm/system.h> /* gets a struct reiserfs_journal_list * from a list head */ #define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c index 71e2b4d50a0..f86f51f99ac 100644 --- a/fs/romfs/storage.c +++ b/fs/romfs/storage.c @@ -19,7 +19,7 @@ #endif #ifdef CONFIG_ROMFS_ON_MTD -#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__)) +#define ROMFS_MTD_READ(sb, ...) mtd_read((sb)->s_mtd, ##__VA_ARGS__) /* * read data from an romfs image on an MTD device diff --git a/fs/select.c b/fs/select.c index 6fb8943d580..17d33d09fc1 100644 --- a/fs/select.c +++ b/fs/select.c @@ -348,7 +348,7 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds) set = ~(~0UL << (n & (__NFDBITS-1))); n /= __NFDBITS; fdt = files_fdtable(current->files); - open_fds = fdt->open_fds->fds_bits+n; + open_fds = fdt->open_fds + n; max = 0; if (set) { set &= BITS(fds, n); diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index ed0eb2a921f..fb50652e4e1 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -83,7 +83,8 @@ static struct buffer_head *get_block_length(struct super_block *sb, * filesystem), otherwise the length is obtained from the first two bytes of * the metadata block. A bit in the length field indicates if the block * is stored uncompressed in the filesystem (usually because compression - * generated a larger block - this does occasionally happen with zlib). + * generated a larger block - this does occasionally happen with compression + * algorithms). */ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, int length, u64 *next_index, int srclength, int pages) diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c index 9dfe2ce0fb7..b381305c9a4 100644 --- a/fs/squashfs/dir.c +++ b/fs/squashfs/dir.c @@ -64,7 +64,7 @@ static int get_dir_index_using_offset(struct super_block *sb, * is offset by 3 because we invent "." and ".." entries which are * not actually stored in the directory. */ - if (f_pos < 3) + if (f_pos <= 3) return f_pos; f_pos -= 3; @@ -105,7 +105,7 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) struct inode *inode = file->f_dentry->d_inode; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; u64 block = squashfs_i(inode)->start + msblk->directory_table; - int offset = squashfs_i(inode)->offset, length = 0, dir_count, size, + int offset = squashfs_i(inode)->offset, length, dir_count, size, type, err; unsigned int inode_number; struct squashfs_dir_header dirh; @@ -173,8 +173,7 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) dir_count = le32_to_cpu(dirh.count) + 1; - /* dir_count should never be larger than 256 */ - if (dir_count > 256) + if (dir_count > SQUASHFS_DIR_COUNT) goto failed_read; while (dir_count--) { diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c index 0682b38d7e3..abcc58f3c15 100644 --- a/fs/squashfs/namei.c +++ b/fs/squashfs/namei.c @@ -144,7 +144,7 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry, struct squashfs_dir_entry *dire; u64 block = squashfs_i(dir)->start + msblk->directory_table; int offset = squashfs_i(dir)->offset; - int err, length = 0, dir_count, size; + int err, length, dir_count, size; TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset); @@ -177,8 +177,7 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry, dir_count = le32_to_cpu(dirh.count) + 1; - /* dir_count should never be larger than 256 */ - if (dir_count > 256) + if (dir_count > SQUASHFS_DIR_COUNT) goto data_error; while (dir_count--) { diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index e8e14645de9..9e2349d07cb 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -30,11 +30,6 @@ /* size of metadata (inode and directory) blocks */ #define SQUASHFS_METADATA_SIZE 8192 -#define SQUASHFS_METADATA_LOG 13 - -/* default size of data blocks */ -#define SQUASHFS_FILE_SIZE 131072 -#define SQUASHFS_FILE_LOG 17 /* default size of block device I/O */ #ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE @@ -46,12 +41,12 @@ #define SQUASHFS_FILE_MAX_SIZE 1048576 #define SQUASHFS_FILE_MAX_LOG 20 -/* Max number of uids and gids */ -#define SQUASHFS_IDS 65536 - /* Max length of filename (not 255) */ #define SQUASHFS_NAME_LEN 256 +/* Max value for directory header count*/ +#define SQUASHFS_DIR_COUNT 256 + #define SQUASHFS_INVALID_FRAG (0xffffffffU) #define SQUASHFS_INVALID_XATTR (0xffffffffU) #define SQUASHFS_INVALID_BLK (-1LL) @@ -142,9 +137,6 @@ #define SQUASHFS_MKINODE(A, B) ((long long)(((long long) (A)\ << 16) + (B))) -/* Translate between VFS mode and squashfs mode */ -#define SQUASHFS_MODE(A) ((A) & 0xfff) - /* fragment and fragment table defines */ #define SQUASHFS_FRAGMENT_BYTES(A) \ ((A) * sizeof(struct squashfs_fragment_entry)) @@ -215,11 +207,6 @@ /* cached data constants for filesystem */ #define SQUASHFS_CACHED_BLKS 8 -#define SQUASHFS_MAX_FILE_SIZE_LOG 64 - -#define SQUASHFS_MAX_FILE_SIZE (1LL << \ - (SQUASHFS_MAX_FILE_SIZE_LOG - 2)) - /* meta index cache */ #define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int)) #define SQUASHFS_META_ENTRIES 127 diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 970b1167e7c..29cd014ed3a 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -158,10 +158,15 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + /* Check block log for sanity */ msblk->block_log = le16_to_cpu(sblk->block_log); if (msblk->block_log > SQUASHFS_FILE_MAX_LOG) goto failed_mount; + /* Check that block_size and block_log match */ + if (msblk->block_size != (1 << msblk->block_log)) + goto failed_mount; + /* Check the root inode for sanity */ root_inode = le64_to_cpu(sblk->root_inode); if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE) diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 987585bb0a1..1ba2baaf436 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -105,7 +105,6 @@ static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt) } static void udf_bitmap_free_blocks(struct super_block *sb, - struct inode *inode, struct udf_bitmap *bitmap, struct kernel_lb_addr *bloc, uint32_t offset, @@ -172,7 +171,6 @@ error_return: } static int udf_bitmap_prealloc_blocks(struct super_block *sb, - struct inode *inode, struct udf_bitmap *bitmap, uint16_t partition, uint32_t first_block, uint32_t block_count) @@ -223,7 +221,6 @@ out: } static int udf_bitmap_new_block(struct super_block *sb, - struct inode *inode, struct udf_bitmap *bitmap, uint16_t partition, uint32_t goal, int *err) { @@ -349,7 +346,6 @@ error_return: } static void udf_table_free_blocks(struct super_block *sb, - struct inode *inode, struct inode *table, struct kernel_lb_addr *bloc, uint32_t offset, @@ -581,7 +577,6 @@ error_return: } static int udf_table_prealloc_blocks(struct super_block *sb, - struct inode *inode, struct inode *table, uint16_t partition, uint32_t first_block, uint32_t block_count) { @@ -643,7 +638,6 @@ static int udf_table_prealloc_blocks(struct super_block *sb, } static int udf_table_new_block(struct super_block *sb, - struct inode *inode, struct inode *table, uint16_t partition, uint32_t goal, int *err) { @@ -743,18 +737,23 @@ void udf_free_blocks(struct super_block *sb, struct inode *inode, struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { - udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap, + udf_bitmap_free_blocks(sb, map->s_uspace.s_bitmap, bloc, offset, count); } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { - udf_table_free_blocks(sb, inode, map->s_uspace.s_table, + udf_table_free_blocks(sb, map->s_uspace.s_table, bloc, offset, count); } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { - udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap, + udf_bitmap_free_blocks(sb, map->s_fspace.s_bitmap, bloc, offset, count); } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { - udf_table_free_blocks(sb, inode, map->s_fspace.s_table, + udf_table_free_blocks(sb, map->s_fspace.s_table, bloc, offset, count); } + + if (inode) { + inode_sub_bytes(inode, + ((sector_t)count) << sb->s_blocksize_bits); + } } inline int udf_prealloc_blocks(struct super_block *sb, @@ -763,29 +762,34 @@ inline int udf_prealloc_blocks(struct super_block *sb, uint32_t block_count) { struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; + sector_t allocated; if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) - return udf_bitmap_prealloc_blocks(sb, inode, - map->s_uspace.s_bitmap, - partition, first_block, - block_count); + allocated = udf_bitmap_prealloc_blocks(sb, + map->s_uspace.s_bitmap, + partition, first_block, + block_count); else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) - return udf_table_prealloc_blocks(sb, inode, - map->s_uspace.s_table, - partition, first_block, - block_count); + allocated = udf_table_prealloc_blocks(sb, + map->s_uspace.s_table, + partition, first_block, + block_count); else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) - return udf_bitmap_prealloc_blocks(sb, inode, - map->s_fspace.s_bitmap, - partition, first_block, - block_count); + allocated = udf_bitmap_prealloc_blocks(sb, + map->s_fspace.s_bitmap, + partition, first_block, + block_count); else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) - return udf_table_prealloc_blocks(sb, inode, - map->s_fspace.s_table, - partition, first_block, - block_count); + allocated = udf_table_prealloc_blocks(sb, + map->s_fspace.s_table, + partition, first_block, + block_count); else return 0; + + if (inode && allocated > 0) + inode_add_bytes(inode, allocated << sb->s_blocksize_bits); + return allocated; } inline int udf_new_block(struct super_block *sb, @@ -793,25 +797,29 @@ inline int udf_new_block(struct super_block *sb, uint16_t partition, uint32_t goal, int *err) { struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; + int block; if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) - return udf_bitmap_new_block(sb, inode, - map->s_uspace.s_bitmap, - partition, goal, err); + block = udf_bitmap_new_block(sb, + map->s_uspace.s_bitmap, + partition, goal, err); else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) - return udf_table_new_block(sb, inode, - map->s_uspace.s_table, - partition, goal, err); - else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) - return udf_bitmap_new_block(sb, inode, - map->s_fspace.s_bitmap, + block = udf_table_new_block(sb, + map->s_uspace.s_table, partition, goal, err); + else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) + block = udf_bitmap_new_block(sb, + map->s_fspace.s_bitmap, + partition, goal, err); else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) - return udf_table_new_block(sb, inode, - map->s_fspace.s_table, - partition, goal, err); + block = udf_table_new_block(sb, + map->s_fspace.s_table, + partition, goal, err); else { *err = -EIO; return 0; } + if (inode && block) + inode_add_bytes(inode, sb->s_blocksize); + return block; } diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 05ab48195be..7e5aae4bf46 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -116,6 +116,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) iinfo->i_lenEAttr = 0; iinfo->i_lenAlloc = 0; iinfo->i_use = 0; + iinfo->i_checkpoint = 1; if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7699df7b319..7d752800835 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1358,6 +1358,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) iinfo->i_unique = le64_to_cpu(fe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs); + iinfo->i_checkpoint = le32_to_cpu(fe->checkpoint); offset = sizeof(struct fileEntry) + iinfo->i_lenEAttr; } else { inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << @@ -1379,6 +1380,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) iinfo->i_unique = le64_to_cpu(efe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs); + iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint); offset = sizeof(struct extendedFileEntry) + iinfo->i_lenEAttr; } @@ -1495,6 +1497,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) struct buffer_head *bh = NULL; struct fileEntry *fe; struct extendedFileEntry *efe; + uint64_t lb_recorded; uint32_t udfperms; uint16_t icbflags; uint16_t crclen; @@ -1589,13 +1592,18 @@ static int udf_update_inode(struct inode *inode, int do_sync) dsea->minorDeviceIdent = cpu_to_le32(iminor(inode)); } + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + lb_recorded = 0; /* No extents => no blocks! */ + else + lb_recorded = + (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> + (blocksize_bits - 9); + if (iinfo->i_efe == 0) { memcpy(bh->b_data + sizeof(struct fileEntry), iinfo->i_ext.i_data, inode->i_sb->s_blocksize - sizeof(struct fileEntry)); - fe->logicalBlocksRecorded = cpu_to_le64( - (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> - (blocksize_bits - 9)); + fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime); udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime); @@ -1607,6 +1615,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) fe->uniqueID = cpu_to_le64(iinfo->i_unique); fe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); fe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); + fe->checkpoint = cpu_to_le32(iinfo->i_checkpoint); fe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_FE); crclen = sizeof(struct fileEntry); } else { @@ -1615,9 +1624,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry)); efe->objectSize = cpu_to_le64(inode->i_size); - efe->logicalBlocksRecorded = cpu_to_le64( - (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> - (blocksize_bits - 9)); + efe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); if (iinfo->i_crtime.tv_sec > inode->i_atime.tv_sec || (iinfo->i_crtime.tv_sec == inode->i_atime.tv_sec && @@ -1646,6 +1653,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) efe->uniqueID = cpu_to_le64(iinfo->i_unique); efe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); efe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); + efe->checkpoint = cpu_to_le32(iinfo->i_checkpoint); efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE); crclen = sizeof(struct extendedFileEntry); } diff --git a/fs/udf/super.c b/fs/udf/super.c index 85067b4c7e1..ac8a348dcb6 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -950,11 +950,8 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) else bitmap = vzalloc(size); /* TODO: get rid of vzalloc */ - if (bitmap == NULL) { - udf_err(sb, "Unable to allocate space for bitmap and %d buffer_head pointers\n", - nr_groups); + if (bitmap == NULL) return NULL; - } bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1); bitmap->s_nr_groups = nr_groups; diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index d1bd31ea724..bb8309dcd5c 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -23,6 +23,7 @@ struct udf_inode_info { __u64 i_lenExtents; __u32 i_next_alloc_block; __u32 i_next_alloc_goal; + __u32 i_checkpoint; unsigned i_alloc_type : 3; unsigned i_efe : 1; /* extendedFileEntry */ unsigned i_use : 1; /* unallocSpaceEntry */ diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 9094e1d917b..7cdd3953d67 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -26,7 +26,6 @@ */ #include <asm/uaccess.h> -#include <asm/system.h> #include <linux/errno.h> #include <linux/fs.h> diff --git a/fs/ufs/super.c b/fs/ufs/super.c index f636f6b460d..ac8e279eccc 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -73,7 +73,6 @@ #include <stdarg.h> #include <asm/uaccess.h> -#include <asm/system.h> #include <linux/errno.h> #include <linux/fs.h> diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index ce84ffd0264..0f0df2759b0 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -35,6 +35,7 @@ #include "xfs_error.h" #include "xfs_trace.h" +struct workqueue_struct *xfs_alloc_wq; #define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) @@ -68,7 +69,7 @@ xfs_alloc_lookup_eq( * Lookup the first record greater than or equal to [bno, len] * in the btree given by cur. */ -STATIC int /* error */ +int /* error */ xfs_alloc_lookup_ge( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ @@ -2207,7 +2208,7 @@ xfs_alloc_read_agf( * group or loop over the allocation groups to find the result. */ int /* error */ -xfs_alloc_vextent( +__xfs_alloc_vextent( xfs_alloc_arg_t *args) /* allocation argument structure */ { xfs_agblock_t agsize; /* allocation group size */ @@ -2417,6 +2418,37 @@ error0: return error; } +static void +xfs_alloc_vextent_worker( + struct work_struct *work) +{ + struct xfs_alloc_arg *args = container_of(work, + struct xfs_alloc_arg, work); + unsigned long pflags; + + /* we are in a transaction context here */ + current_set_flags_nested(&pflags, PF_FSTRANS); + + args->result = __xfs_alloc_vextent(args); + complete(args->done); + + current_restore_flags_nested(&pflags, PF_FSTRANS); +} + + +int /* error */ +xfs_alloc_vextent( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + DECLARE_COMPLETION_ONSTACK(done); + + args->done = &done; + INIT_WORK(&args->work, xfs_alloc_vextent_worker); + queue_work(xfs_alloc_wq, &args->work); + wait_for_completion(&done); + return args->result; +} + /* * Free an extent. * Just break up the extent address and hand off to xfs_free_ag_extent diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 2f52b924be7..3a7e7d8f8de 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -25,6 +25,8 @@ struct xfs_perag; struct xfs_trans; struct xfs_busy_extent; +extern struct workqueue_struct *xfs_alloc_wq; + /* * Freespace allocation types. Argument to xfs_alloc_[v]extent. */ @@ -119,6 +121,9 @@ typedef struct xfs_alloc_arg { char isfl; /* set if is freelist blocks - !acctg */ char userdata; /* set if this is user data */ xfs_fsblock_t firstblock; /* io first block allocated */ + struct completion *done; + struct work_struct work; + int result; } xfs_alloc_arg_t; /* @@ -243,6 +248,13 @@ xfs_alloc_lookup_le( xfs_extlen_t len, /* length of extent */ int *stat); /* success/failure */ +int /* error */ +xfs_alloc_lookup_ge( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + int /* error */ xfs_alloc_get_rec( struct xfs_btree_cur *cur, /* btree cursor */ diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 08b9ac644c3..65d61b948ea 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -853,6 +853,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) { int newsize, forkoff, retval; + trace_xfs_attr_sf_addname(args); + retval = xfs_attr_shortform_lookup(args); if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { return(retval); @@ -896,6 +898,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) xfs_dabuf_t *bp; int retval, error, committed, forkoff; + trace_xfs_attr_leaf_addname(args); + /* * Read the (only) block in the attribute list in. */ @@ -920,6 +924,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) xfs_da_brelse(args->trans, bp); return(retval); } + + trace_xfs_attr_leaf_replace(args); + args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; @@ -1090,6 +1097,8 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) xfs_dabuf_t *bp; int error, committed, forkoff; + trace_xfs_attr_leaf_removename(args); + /* * Remove the attribute. */ @@ -1223,6 +1232,8 @@ xfs_attr_node_addname(xfs_da_args_t *args) xfs_mount_t *mp; int committed, retval, error; + trace_xfs_attr_node_addname(args); + /* * Fill in bucket of arguments/results/context to carry around. */ @@ -1249,6 +1260,9 @@ restart: } else if (retval == EEXIST) { if (args->flags & ATTR_CREATE) goto out; + + trace_xfs_attr_node_replace(args); + args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; @@ -1480,6 +1494,8 @@ xfs_attr_node_removename(xfs_da_args_t *args) xfs_dabuf_t *bp; int retval, error, committed, forkoff; + trace_xfs_attr_node_removename(args); + /* * Tie a string around our finger to remind us where we are. */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index d25eafd4d28..76d93dc953e 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -235,6 +235,8 @@ xfs_attr_shortform_create(xfs_da_args_t *args) xfs_inode_t *dp; xfs_ifork_t *ifp; + trace_xfs_attr_sf_create(args); + dp = args->dp; ASSERT(dp != NULL); ifp = dp->i_afp; @@ -268,6 +270,8 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) xfs_inode_t *dp; xfs_ifork_t *ifp; + trace_xfs_attr_sf_add(args); + dp = args->dp; mp = dp->i_mount; dp->i_d.di_forkoff = forkoff; @@ -337,6 +341,8 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) xfs_mount_t *mp; xfs_inode_t *dp; + trace_xfs_attr_sf_remove(args); + dp = args->dp; mp = dp->i_mount; base = sizeof(xfs_attr_sf_hdr_t); @@ -405,6 +411,8 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) int i; xfs_ifork_t *ifp; + trace_xfs_attr_sf_lookup(args); + ifp = args->dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; @@ -476,6 +484,8 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) xfs_dabuf_t *bp; xfs_ifork_t *ifp; + trace_xfs_attr_sf_to_leaf(args); + dp = args->dp; ifp = dp->i_afp; sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; @@ -775,6 +785,8 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) char *tmpbuffer; int error, i; + trace_xfs_attr_leaf_to_sf(args); + dp = args->dp; tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP); ASSERT(tmpbuffer != NULL); @@ -848,6 +860,8 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) xfs_dablk_t blkno; int error; + trace_xfs_attr_leaf_to_node(args); + dp = args->dp; bp1 = bp2 = NULL; error = xfs_da_grow_inode(args, &blkno); @@ -911,6 +925,8 @@ xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp) xfs_dabuf_t *bp; int error; + trace_xfs_attr_leaf_create(args); + dp = args->dp; ASSERT(dp != NULL); error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp, @@ -948,6 +964,8 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, xfs_dablk_t blkno; int error; + trace_xfs_attr_leaf_split(state->args); + /* * Allocate space for a new leaf node. */ @@ -977,10 +995,13 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * * Insert the "new" entry in the correct block. */ - if (state->inleaf) + if (state->inleaf) { + trace_xfs_attr_leaf_add_old(state->args); error = xfs_attr_leaf_add(oldblk->bp, state->args); - else + } else { + trace_xfs_attr_leaf_add_new(state->args); error = xfs_attr_leaf_add(newblk->bp, state->args); + } /* * Update last hashval in each block since we added the name. @@ -1001,6 +1022,8 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args) xfs_attr_leaf_map_t *map; int tablesize, entsize, sum, tmp, i; + trace_xfs_attr_leaf_add(args); + leaf = bp->data; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT((args->index >= 0) @@ -1128,8 +1151,6 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval))); /* - * Copy the attribute name and value into the new space. - * * For "remote" attribute values, simply note that we need to * allocate space for the "remote" value. We can't actually * allocate the extents in this transaction, and we can't decide @@ -1265,6 +1286,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); args = state->args; + trace_xfs_attr_leaf_rebalance(args); + /* * Check ordering of blocks, reverse if it makes things simpler. * @@ -1810,6 +1833,8 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, xfs_mount_t *mp; char *tmpbuffer; + trace_xfs_attr_leaf_unbalance(state->args); + /* * Set up environment. */ @@ -1919,6 +1944,8 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) int probe, span; xfs_dahash_t hashval; + trace_xfs_attr_leaf_lookup(args); + leaf = bp->data; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(be16_to_cpu(leaf->hdr.count) @@ -2445,6 +2472,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) char *name; #endif /* DEBUG */ + trace_xfs_attr_leaf_clearflag(args); /* * Set up the operation. */ @@ -2509,6 +2537,8 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) xfs_dabuf_t *bp; int error; + trace_xfs_attr_leaf_setflag(args); + /* * Set up the operation. */ @@ -2565,6 +2595,8 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) char *name1, *name2; #endif /* DEBUG */ + trace_xfs_attr_leaf_flipflags(args); + /* * Read the block containing the "old" attr */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 3548c6f7559..85e7e327bcd 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5124,6 +5124,15 @@ xfs_bunmapi( cur->bc_private.b.flags = 0; } else cur = NULL; + + if (isrt) { + /* + * Synchronize by locking the bitmap inode. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + } + extno = 0; while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && (nexts == 0 || extno < nexts)) { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index df7ffb0affe..5bf3be45f54 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -21,7 +21,6 @@ #include <linux/list.h> #include <linux/types.h> #include <linux/spinlock.h> -#include <asm/system.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/buffer_head.h> diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 77c74257c2a..7f1a6f5b05a 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -108,6 +108,8 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, int error; xfs_trans_t *tp; + trace_xfs_da_node_create(args); + tp = args->trans; error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork); if (error) @@ -140,6 +142,8 @@ xfs_da_split(xfs_da_state_t *state) xfs_dabuf_t *bp; int max, action, error, i; + trace_xfs_da_split(state->args); + /* * Walk back up the tree splitting/inserting/adjusting as necessary. * If we need to insert and there isn't room, split the node, then @@ -178,10 +182,12 @@ xfs_da_split(xfs_da_state_t *state) state->extravalid = 1; if (state->inleaf) { state->extraafter = 0; /* before newblk */ + trace_xfs_attr_leaf_split_before(state->args); error = xfs_attr_leaf_split(state, oldblk, &state->extrablk); } else { state->extraafter = 1; /* after newblk */ + trace_xfs_attr_leaf_split_after(state->args); error = xfs_attr_leaf_split(state, newblk, &state->extrablk); } @@ -300,6 +306,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_mount_t *mp; xfs_dir2_leaf_t *leaf; + trace_xfs_da_root_split(state->args); + /* * Copy the existing (incorrect) block from the root node position * to a free space somewhere. @@ -380,6 +388,8 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, int newcount, error; int useextra; + trace_xfs_da_node_split(state->args); + node = oldblk->bp->data; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); @@ -466,6 +476,8 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, int count, tmp; xfs_trans_t *tp; + trace_xfs_da_node_rebalance(state->args); + node1 = blk1->bp->data; node2 = blk2->bp->data; /* @@ -574,6 +586,8 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, xfs_da_node_entry_t *btree; int tmp; + trace_xfs_da_node_add(state->args); + node = oldblk->bp->data; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); @@ -619,6 +633,8 @@ xfs_da_join(xfs_da_state_t *state) xfs_da_state_blk_t *drop_blk, *save_blk; int action, error; + trace_xfs_da_join(state->args); + action = 0; drop_blk = &state->path.blk[ state->path.active-1 ]; save_blk = &state->altpath.blk[ state->path.active-1 ]; @@ -723,6 +739,8 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) xfs_dabuf_t *bp; int error; + trace_xfs_da_root_join(state->args); + args = state->args; ASSERT(args != NULL); ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); @@ -941,6 +959,8 @@ xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk) xfs_da_node_entry_t *btree; int tmp; + trace_xfs_da_node_remove(state->args); + node = drop_blk->bp->data; ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count)); ASSERT(drop_blk->index >= 0); @@ -984,6 +1004,8 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, int tmp; xfs_trans_t *tp; + trace_xfs_da_node_unbalance(state->args); + drop_node = drop_blk->bp->data; save_node = save_blk->bp->data; ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); @@ -1230,6 +1252,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, /* * Link new block in before existing block. */ + trace_xfs_da_link_before(args); new_info->forw = cpu_to_be32(old_blk->blkno); new_info->back = old_info->back; if (old_info->back) { @@ -1251,6 +1274,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, /* * Link new block in after existing block. */ + trace_xfs_da_link_after(args); new_info->forw = old_info->forw; new_info->back = cpu_to_be32(old_blk->blkno); if (old_info->forw) { @@ -1348,6 +1372,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * Unlink the leaf block from the doubly linked chain of leaves. */ if (be32_to_cpu(save_info->back) == drop_blk->blkno) { + trace_xfs_da_unlink_back(args); save_info->back = drop_info->back; if (drop_info->back) { error = xfs_da_read_buf(args->trans, args->dp, @@ -1365,6 +1390,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, xfs_da_buf_done(bp); } } else { + trace_xfs_da_unlink_forward(args); save_info->forw = drop_info->forw; if (drop_info->forw) { error = xfs_da_read_buf(args->trans, args->dp, @@ -1652,6 +1678,8 @@ xfs_da_grow_inode( int count; int error; + trace_xfs_da_grow_inode(args); + if (args->whichfork == XFS_DATA_FORK) { bno = args->dp->i_mount->m_dirleafblk; count = args->dp->i_mount->m_dirblkfsbs; @@ -1690,6 +1718,8 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, xfs_dir2_leaf_t *dead_leaf2; xfs_dahash_t dead_hash; + trace_xfs_da_swap_lastblock(args); + dead_buf = *dead_bufp; dead_blkno = *dead_blknop; tp = args->trans; @@ -1878,6 +1908,8 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, xfs_trans_t *tp; xfs_mount_t *mp; + trace_xfs_da_shrink_inode(args); + dp = args->dp; w = args->whichfork; tp = args->trans; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 286a051f12c..1ad3a4b8ca4 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -37,9 +37,9 @@ STATIC int xfs_trim_extents( struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_fsblock_t start, - xfs_fsblock_t end, - xfs_fsblock_t minlen, + xfs_daddr_t start, + xfs_daddr_t end, + xfs_daddr_t minlen, __uint64_t *blocks_trimmed) { struct block_device *bdev = mp->m_ddev_targp->bt_bdev; @@ -67,7 +67,7 @@ xfs_trim_extents( /* * Look up the longest btree in the AGF and start with it. */ - error = xfs_alloc_lookup_le(cur, 0, + error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i); if (error) goto out_del_cursor; @@ -77,8 +77,10 @@ xfs_trim_extents( * enough to be worth discarding. */ while (i) { - xfs_agblock_t fbno; - xfs_extlen_t flen; + xfs_agblock_t fbno; + xfs_extlen_t flen; + xfs_daddr_t dbno; + xfs_extlen_t dlen; error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); if (error) @@ -87,9 +89,17 @@ xfs_trim_extents( ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); /* + * use daddr format for all range/len calculations as that is + * the format the range/len variables are supplied in by + * userspace. + */ + dbno = XFS_AGB_TO_DADDR(mp, agno, fbno); + dlen = XFS_FSB_TO_BB(mp, flen); + + /* * Too small? Give up. */ - if (flen < minlen) { + if (dlen < minlen) { trace_xfs_discard_toosmall(mp, agno, fbno, flen); goto out_del_cursor; } @@ -99,8 +109,7 @@ xfs_trim_extents( * supposed to discard skip it. Do not bother to trim * down partially overlapping ranges for now. */ - if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || - XFS_AGB_TO_FSB(mp, agno, fbno) > end) { + if (dbno + dlen < start || dbno > end) { trace_xfs_discard_exclude(mp, agno, fbno, flen); goto next_extent; } @@ -115,10 +124,7 @@ xfs_trim_extents( } trace_xfs_discard_extent(mp, agno, fbno, flen); - error = -blkdev_issue_discard(bdev, - XFS_AGB_TO_DADDR(mp, agno, fbno), - XFS_FSB_TO_BB(mp, flen), - GFP_NOFS, 0); + error = -blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0); if (error) goto out_del_cursor; *blocks_trimmed += flen; @@ -137,6 +143,15 @@ out_put_perag: return error; } +/* + * trim a range of the filesystem. + * + * Note: the parameters passed from userspace are byte ranges into the + * filesystem which does not match to the format we use for filesystem block + * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format + * is a linear address range. Hence we need to use DADDR based conversions and + * comparisons for determining the correct offset and regions to trim. + */ int xfs_ioc_trim( struct xfs_mount *mp, @@ -145,7 +160,7 @@ xfs_ioc_trim( struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; unsigned int granularity = q->limits.discard_granularity; struct fstrim_range range; - xfs_fsblock_t start, end, minlen; + xfs_daddr_t start, end, minlen; xfs_agnumber_t start_agno, end_agno, agno; __uint64_t blocks_trimmed = 0; int error, last_error = 0; @@ -159,22 +174,22 @@ xfs_ioc_trim( /* * Truncating down the len isn't actually quite correct, but using - * XFS_B_TO_FSB would mean we trivially get overflows for values + * BBTOB would mean we trivially get overflows for values * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default * used by the fstrim application. In the end it really doesn't * matter as trimming blocks is an advisory interface. */ - start = XFS_B_TO_FSBT(mp, range.start); - end = start + XFS_B_TO_FSBT(mp, range.len) - 1; - minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); + start = BTOBB(range.start); + end = start + BTOBBT(range.len) - 1; + minlen = BTOBB(max_t(u64, granularity, range.minlen)); - if (start >= mp->m_sb.sb_dblocks) + if (XFS_BB_TO_FSB(mp, start) >= mp->m_sb.sb_dblocks) return -XFS_ERROR(EINVAL); - if (end > mp->m_sb.sb_dblocks - 1) - end = mp->m_sb.sb_dblocks - 1; + if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1) + end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1; - start_agno = XFS_FSB_TO_AGNO(mp, start); - end_agno = XFS_FSB_TO_AGNO(mp, end); + start_agno = xfs_daddr_to_agno(mp, start); + end_agno = xfs_daddr_to_agno(mp, end); for (agno = start_agno; agno <= end_agno; agno++) { error = -xfs_trim_extents(mp, agno, start, end, minlen, diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 4be16a0cbe5..1155208fa83 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1065,7 +1065,7 @@ out: return -ENOMEM; } -void __exit +void xfs_qm_exit(void) { kmem_zone_destroy(xfs_qm_dqtrxzone); diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index a98cb4524e6..bcc6c249b2c 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -289,7 +289,7 @@ xfs_iget_cache_hit( if (lock_flags != 0) xfs_ilock(ip, lock_flags); - xfs_iflags_clear(ip, XFS_ISTALE); + xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); XFS_STATS_INC(xs_ig_found); return 0; @@ -314,6 +314,7 @@ xfs_iget_cache_miss( struct xfs_inode *ip; int error; xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); + int iflags; ip = xfs_inode_alloc(mp, ino); if (!ip) @@ -358,8 +359,11 @@ xfs_iget_cache_miss( * memory barrier that ensures this detection works correctly at lookup * time. */ + iflags = XFS_INEW; + if (flags & XFS_IGET_DONTCACHE) + iflags |= XFS_IDONTCACHE; ip->i_udquot = ip->i_gdquot = NULL; - xfs_iflags_set(ip, XFS_INEW); + xfs_iflags_set(ip, iflags); /* insert the new inode */ spin_lock(&pag->pag_ici_lock); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index f123dbe6d42..7fee3387e1c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -387,10 +387,11 @@ xfs_set_projid(struct xfs_inode *ip, #define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) +#define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */ /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during - * inode lookup. Thi prevents unintended behaviour on the new inode from + * inode lookup. This prevents unintended behaviour on the new inode from * ocurring. */ #define XFS_IRECLAIM_RESET_FLAGS \ @@ -553,6 +554,7 @@ do { \ */ #define XFS_IGET_CREATE 0x1 #define XFS_IGET_UNTRUSTED 0x2 +#define XFS_IGET_DONTCACHE 0x4 int xfs_inotobp(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, struct xfs_dinode **, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index f588320dc4b..91f8ff547ab 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -209,6 +209,7 @@ xfs_open_by_handle( struct file *filp; struct inode *inode; struct dentry *dentry; + fmode_t fmode; if (!capable(CAP_SYS_ADMIN)) return -XFS_ERROR(EPERM); @@ -228,26 +229,21 @@ xfs_open_by_handle( hreq->oflags |= O_LARGEFILE; #endif - /* Put open permission in namei format. */ permflag = hreq->oflags; - if ((permflag+1) & O_ACCMODE) - permflag++; - if (permflag & O_TRUNC) - permflag |= 2; - + fmode = OPEN_FMODE(permflag); if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && - (permflag & FMODE_WRITE) && IS_APPEND(inode)) { + (fmode & FMODE_WRITE) && IS_APPEND(inode)) { error = -XFS_ERROR(EPERM); goto out_dput; } - if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) { + if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) { error = -XFS_ERROR(EACCES); goto out_dput; } /* Can't write directories. */ - if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) { + if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) { error = -XFS_ERROR(EISDIR); goto out_dput; } diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 9720c54bbed..acc2bf264da 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -75,7 +75,8 @@ xfs_bulkstat_one_int( return XFS_ERROR(ENOMEM); error = xfs_iget(mp, NULL, ino, - XFS_IGET_UNTRUSTED, XFS_ILOCK_SHARED, &ip); + (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED), + XFS_ILOCK_SHARED, &ip); if (error) { *stat = BULKSTAT_RV_NOTHING; goto out_free; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 98a9cb5ffd1..6db1fef38bf 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -726,8 +726,9 @@ xfs_log_unmount_write(xfs_mount_t *mp) .lv_iovecp = ®, }; - /* remove inited flag */ + /* remove inited flag, and account for space used */ tic->t_flags = 0; + tic->t_curr_res -= sizeof(magic); error = xlog_write(log, &vec, tic, &lsn, NULL, XLOG_UNMOUNT_TRANS); /* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 7c75c7374d5..8ecad5bad66 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3161,37 +3161,26 @@ xlog_recover_process_iunlinks( */ continue; } + /* + * Unlock the buffer so that it can be acquired in the normal + * course of the transaction to truncate and free each inode. + * Because we are not racing with anyone else here for the AGI + * buffer, we don't even need to hold it locked to read the + * initial unlinked bucket entries out of the buffer. We keep + * buffer reference though, so that it stays pinned in memory + * while we need the buffer. + */ agi = XFS_BUF_TO_AGI(agibp); + xfs_buf_unlock(agibp); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { agino = be32_to_cpu(agi->agi_unlinked[bucket]); while (agino != NULLAGINO) { - /* - * Release the agi buffer so that it can - * be acquired in the normal course of the - * transaction to truncate and free the inode. - */ - xfs_buf_relse(agibp); - agino = xlog_recover_process_one_iunlink(mp, agno, agino, bucket); - - /* - * Reacquire the agibuffer and continue around - * the loop. This should never fail as we know - * the buffer was good earlier on. - */ - error = xfs_read_agi(mp, NULL, agno, &agibp); - ASSERT(error == 0); - agi = XFS_BUF_TO_AGI(agibp); } } - - /* - * Release the buffer for the current agi so we can - * go on to the next one. - */ - xfs_buf_relse(agibp); + xfs_buf_rele(agibp); } mp->m_dmevmask = mp_dmevmask; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 87323f1ded6..ca4f31534a0 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -183,6 +183,7 @@ error_cancel: oblocks = map.br_startoff + map.br_blockcount; } return 0; + error: return error; } @@ -2139,11 +2140,9 @@ xfs_rtfree_extent( xfs_buf_t *sumbp; /* summary file block buffer */ mp = tp->t_mountp; - /* - * Synchronize by locking the bitmap inode. - */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + + ASSERT(mp->m_rbmip->i_itemp != NULL); + ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); #if defined(__KERNEL__) && defined(DEBUG) /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 912442cf0f8..dab9a5f6dfd 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -950,6 +950,22 @@ xfs_fs_evict_inode( xfs_inactive(ip); } +/* + * We do an unlocked check for XFS_IDONTCACHE here because we are already + * serialised against cache hits here via the inode->i_lock and igrab() in + * xfs_iget_cache_hit(). Hence a lookup that might clear this flag will not be + * racing with us, and it avoids needing to grab a spinlock here for every inode + * we drop the final reference on. + */ +STATIC int +xfs_fs_drop_inode( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + + return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE); +} + STATIC void xfs_free_fsname( struct xfs_mount *mp) @@ -1433,6 +1449,7 @@ static const struct super_operations xfs_super_operations = { .destroy_inode = xfs_fs_destroy_inode, .dirty_inode = xfs_fs_dirty_inode, .evict_inode = xfs_fs_evict_inode, + .drop_inode = xfs_fs_drop_inode, .put_super = xfs_fs_put_super, .sync_fs = xfs_fs_sync_fs, .freeze_fs = xfs_fs_freeze, @@ -1606,12 +1623,28 @@ xfs_init_workqueues(void) xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0); if (!xfs_syncd_wq) return -ENOMEM; + + /* + * The allocation workqueue can be used in memory reclaim situations + * (writepage path), and parallelism is only limited by the number of + * AGs in all the filesystems mounted. Hence use the default large + * max_active value for this workqueue. + */ + xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); + if (!xfs_alloc_wq) + goto out_destroy_syncd; + return 0; + +out_destroy_syncd: + destroy_workqueue(xfs_syncd_wq); + return -ENOMEM; } STATIC void xfs_destroy_workqueues(void) { + destroy_workqueue(xfs_alloc_wq); destroy_workqueue(xfs_syncd_wq); } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 75eb54af4d5..06838c42b2a 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -627,16 +627,19 @@ DECLARE_EVENT_CLASS(xfs_namespace_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, dp_ino) + __field(int, namelen) __dynamic_array(char, name, name->len) ), TP_fast_assign( __entry->dev = VFS_I(dp)->i_sb->s_dev; __entry->dp_ino = dp->i_ino; + __entry->namelen = name->len; memcpy(__get_str(name), name->name, name->len); ), - TP_printk("dev %d:%d dp ino 0x%llx name %s", + TP_printk("dev %d:%d dp ino 0x%llx name %.*s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dp_ino, + __entry->namelen, __get_str(name)) ) @@ -658,6 +661,8 @@ TRACE_EVENT(xfs_rename, __field(dev_t, dev) __field(xfs_ino_t, src_dp_ino) __field(xfs_ino_t, target_dp_ino) + __field(int, src_namelen) + __field(int, target_namelen) __dynamic_array(char, src_name, src_name->len) __dynamic_array(char, target_name, target_name->len) ), @@ -665,15 +670,20 @@ TRACE_EVENT(xfs_rename, __entry->dev = VFS_I(src_dp)->i_sb->s_dev; __entry->src_dp_ino = src_dp->i_ino; __entry->target_dp_ino = target_dp->i_ino; + __entry->src_namelen = src_name->len; + __entry->target_namelen = target_name->len; memcpy(__get_str(src_name), src_name->name, src_name->len); - memcpy(__get_str(target_name), target_name->name, target_name->len); + memcpy(__get_str(target_name), target_name->name, + target_name->len); ), TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx" - " src name %s target name %s", + " src name %.*s target name %.*s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->src_dp_ino, __entry->target_dp_ino, + __entry->src_namelen, __get_str(src_name), + __entry->target_namelen, __get_str(target_name)) ) @@ -1408,7 +1418,7 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed); -DECLARE_EVENT_CLASS(xfs_dir2_class, +DECLARE_EVENT_CLASS(xfs_da_class, TP_PROTO(struct xfs_da_args *args), TP_ARGS(args), TP_STRUCT__entry( @@ -1443,7 +1453,7 @@ DECLARE_EVENT_CLASS(xfs_dir2_class, ) #define DEFINE_DIR2_EVENT(name) \ -DEFINE_EVENT(xfs_dir2_class, name, \ +DEFINE_EVENT(xfs_da_class, name, \ TP_PROTO(struct xfs_da_args *args), \ TP_ARGS(args)) DEFINE_DIR2_EVENT(xfs_dir2_sf_addname); @@ -1472,6 +1482,64 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace); DEFINE_DIR2_EVENT(xfs_dir2_node_removename); DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); +#define DEFINE_ATTR_EVENT(name) \ +DEFINE_EVENT(xfs_da_class, name, \ + TP_PROTO(struct xfs_da_args *args), \ + TP_ARGS(args)) +DEFINE_ATTR_EVENT(xfs_attr_sf_add); +DEFINE_ATTR_EVENT(xfs_attr_sf_addname); +DEFINE_ATTR_EVENT(xfs_attr_sf_create); +DEFINE_ATTR_EVENT(xfs_attr_sf_lookup); +DEFINE_ATTR_EVENT(xfs_attr_sf_remove); +DEFINE_ATTR_EVENT(xfs_attr_sf_removename); +DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf); + +DEFINE_ATTR_EVENT(xfs_attr_leaf_add); +DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); +DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); +DEFINE_ATTR_EVENT(xfs_attr_leaf_addname); +DEFINE_ATTR_EVENT(xfs_attr_leaf_create); +DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup); +DEFINE_ATTR_EVENT(xfs_attr_leaf_replace); +DEFINE_ATTR_EVENT(xfs_attr_leaf_removename); +DEFINE_ATTR_EVENT(xfs_attr_leaf_split); +DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before); +DEFINE_ATTR_EVENT(xfs_attr_leaf_split_after); +DEFINE_ATTR_EVENT(xfs_attr_leaf_clearflag); +DEFINE_ATTR_EVENT(xfs_attr_leaf_setflag); +DEFINE_ATTR_EVENT(xfs_attr_leaf_flipflags); +DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf); +DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node); +DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance); +DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance); + +DEFINE_ATTR_EVENT(xfs_attr_node_addname); +DEFINE_ATTR_EVENT(xfs_attr_node_lookup); +DEFINE_ATTR_EVENT(xfs_attr_node_replace); +DEFINE_ATTR_EVENT(xfs_attr_node_removename); + +#define DEFINE_DA_EVENT(name) \ +DEFINE_EVENT(xfs_da_class, name, \ + TP_PROTO(struct xfs_da_args *args), \ + TP_ARGS(args)) +DEFINE_DA_EVENT(xfs_da_split); +DEFINE_DA_EVENT(xfs_da_join); +DEFINE_DA_EVENT(xfs_da_link_before); +DEFINE_DA_EVENT(xfs_da_link_after); +DEFINE_DA_EVENT(xfs_da_unlink_back); +DEFINE_DA_EVENT(xfs_da_unlink_forward); +DEFINE_DA_EVENT(xfs_da_root_split); +DEFINE_DA_EVENT(xfs_da_root_join); +DEFINE_DA_EVENT(xfs_da_node_add); +DEFINE_DA_EVENT(xfs_da_node_create); +DEFINE_DA_EVENT(xfs_da_node_split); +DEFINE_DA_EVENT(xfs_da_node_remove); +DEFINE_DA_EVENT(xfs_da_node_rebalance); +DEFINE_DA_EVENT(xfs_da_node_unbalance); +DEFINE_DA_EVENT(xfs_da_swap_lastblock); +DEFINE_DA_EVENT(xfs_da_grow_inode); +DEFINE_DA_EVENT(xfs_da_shrink_inode); + DECLARE_EVENT_CLASS(xfs_dir2_space_class, TP_PROTO(struct xfs_da_args *args, int idx), TP_ARGS(args, idx), |