diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/btrfs/ctree.c | 134 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 42 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 388 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 116 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 801 | ||||
-rw-r--r-- | fs/btrfs/extent_io.h | 49 | ||||
-rw-r--r-- | fs/btrfs/inode-item.c | 1 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 11 | ||||
-rw-r--r-- | fs/btrfs/reada.c | 10 | ||||
-rw-r--r-- | fs/btrfs/scrub.c | 1367 | ||||
-rw-r--r-- | fs/btrfs/struct-funcs.c | 53 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 2 |
12 files changed, 2014 insertions, 960 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e697afd1815..e801f226d7e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -156,10 +156,23 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) { struct extent_buffer *eb; - rcu_read_lock(); - eb = rcu_dereference(root->node); - extent_buffer_get(eb); - rcu_read_unlock(); + while (1) { + rcu_read_lock(); + eb = rcu_dereference(root->node); + + /* + * RCU really hurts here, we could free up the root node because + * it was cow'ed but we may not get the new root node yet so do + * the inc_not_zero dance and if it doesn't work then + * synchronize_rcu and try again. + */ + if (atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + break; + } + rcu_read_unlock(); + synchronize_rcu(); + } return eb; } @@ -514,7 +527,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, } if (unlock_orig) btrfs_tree_unlock(buf); - free_extent_buffer(buf); + free_extent_buffer_stale(buf); btrfs_mark_buffer_dirty(cow); *cow_ret = cow; return 0; @@ -974,7 +987,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1, 0); /* once for the root ptr */ - free_extent_buffer(mid); + free_extent_buffer_stale(mid); return 0; } if (btrfs_header_nritems(mid) > @@ -1028,7 +1041,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, del_ptr(trans, root, path, level + 1, pslot + 1); root_sub_used(root, right->len); btrfs_free_tree_block(trans, root, right, 0, 1, 0); - free_extent_buffer(right); + free_extent_buffer_stale(right); right = NULL; } else { struct btrfs_disk_key right_key; @@ -1070,7 +1083,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, del_ptr(trans, root, path, level + 1, pslot); root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1, 0); - free_extent_buffer(mid); + free_extent_buffer_stale(mid); mid = NULL; } else { /* update the parent key to reflect our changes */ @@ -1396,7 +1409,8 @@ static noinline int reada_for_balance(struct btrfs_root *root, * if lowest_unlock is 1, level 0 won't be unlocked */ static noinline void unlock_up(struct btrfs_path *path, int level, - int lowest_unlock) + int lowest_unlock, int min_write_lock_level, + int *write_lock_level) { int i; int skip_level = level; @@ -1428,6 +1442,11 @@ static noinline void unlock_up(struct btrfs_path *path, int level, if (i >= lowest_unlock && i > skip_level && path->locks[i]) { btrfs_tree_unlock_rw(t, path->locks[i]); path->locks[i] = 0; + if (write_lock_level && + i > min_write_lock_level && + i <= *write_lock_level) { + *write_lock_level = i - 1; + } } } } @@ -1651,6 +1670,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root /* everything at write_lock_level or lower must be write locked */ int write_lock_level = 0; u8 lowest_level = 0; + int min_write_lock_level; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); @@ -1678,6 +1698,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root if (cow && (p->keep_locks || p->lowest_level)) write_lock_level = BTRFS_MAX_LEVEL; + min_write_lock_level = write_lock_level; + again: /* * we try very hard to do read locks on the root @@ -1809,7 +1831,8 @@ cow_done: goto again; } - unlock_up(p, level, lowest_unlock); + unlock_up(p, level, lowest_unlock, + min_write_lock_level, &write_lock_level); if (level == lowest_level) { if (dec) @@ -1871,7 +1894,8 @@ cow_done: } } if (!p->search_for_split) - unlock_up(p, level, lowest_unlock); + unlock_up(p, level, lowest_unlock, + min_write_lock_level, &write_lock_level); goto done; } } @@ -2320,6 +2344,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *upper = path->nodes[1]; + struct btrfs_map_token token; struct btrfs_disk_key disk_key; int slot; u32 i; @@ -2331,6 +2356,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, u32 data_end; u32 this_item_size; + btrfs_init_map_token(&token); + if (empty) nr = 0; else @@ -2408,8 +2435,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, push_space = BTRFS_LEAF_DATA_SIZE(root); for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(right, i); - push_space -= btrfs_item_size(right, item); - btrfs_set_item_offset(right, item, push_space); + push_space -= btrfs_token_item_size(right, item, &token); + btrfs_set_token_item_offset(right, item, push_space, &token); } left_nritems -= push_items; @@ -2539,6 +2566,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, int ret = 0; u32 this_item_size; u32 old_left_item_size; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); if (empty) nr = min(right_nritems, max_slot); @@ -2599,9 +2629,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, item = btrfs_item_nr(left, i); - ioff = btrfs_item_offset(left, item); - btrfs_set_item_offset(left, item, - ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); + ioff = btrfs_token_item_offset(left, item, &token); + btrfs_set_token_item_offset(left, item, + ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size), + &token); } btrfs_set_header_nritems(left, old_left_nritems + push_items); @@ -2631,8 +2662,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(right, i); - push_space = push_space - btrfs_item_size(right, item); - btrfs_set_item_offset(right, item, push_space); + push_space = push_space - btrfs_token_item_size(right, + item, &token); + btrfs_set_token_item_offset(right, item, push_space, &token); } btrfs_mark_buffer_dirty(left); @@ -2748,6 +2780,9 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, int rt_data_off; int i; struct btrfs_disk_key disk_key; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); @@ -2769,8 +2804,9 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, struct btrfs_item *item = btrfs_item_nr(right, i); u32 ioff; - ioff = btrfs_item_offset(right, item); - btrfs_set_item_offset(right, item, ioff + rt_data_off); + ioff = btrfs_token_item_offset(right, item, &token); + btrfs_set_token_item_offset(right, item, + ioff + rt_data_off, &token); } btrfs_set_header_nritems(l, mid); @@ -3246,6 +3282,9 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, unsigned int old_size; unsigned int size_diff; int i; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; slot = path->slots[0]; @@ -3272,8 +3311,9 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff + size_diff); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff + size_diff, &token); } /* shift the data */ @@ -3342,6 +3382,9 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, unsigned int old_data; unsigned int old_size; int i; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; @@ -3371,8 +3414,9 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff - data_size); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - data_size, &token); } /* shift the data */ @@ -3414,6 +3458,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, unsigned int data_end; struct btrfs_disk_key disk_key; struct btrfs_key found_key; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); for (i = 0; i < nr; i++) { if (total_size + data_size[i] + sizeof(struct btrfs_item) > @@ -3479,8 +3526,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff - total_data); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - total_data, &token); } /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), @@ -3507,9 +3555,10 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(leaf, slot + i); - btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + btrfs_set_token_item_offset(leaf, item, + data_end - data_size[i], &token); data_end -= data_size[i]; - btrfs_set_item_size(leaf, item, data_size[i]); + btrfs_set_token_item_size(leaf, item, data_size[i], &token); } btrfs_set_header_nritems(leaf, nritems + nr); btrfs_mark_buffer_dirty(leaf); @@ -3547,6 +3596,9 @@ void setup_items_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; struct extent_buffer *leaf; int slot; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; slot = path->slots[0]; @@ -3578,8 +3630,9 @@ void setup_items_for_insert(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff - total_data); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - total_data, &token); } /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), @@ -3598,9 +3651,10 @@ void setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(leaf, slot + i); - btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + btrfs_set_token_item_offset(leaf, item, + data_end - data_size[i], &token); data_end -= data_size[i]; - btrfs_set_item_size(leaf, item, data_size[i]); + btrfs_set_token_item_size(leaf, item, data_size[i], &token); } btrfs_set_header_nritems(leaf, nritems + nr); @@ -3740,7 +3794,9 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); + extent_buffer_get(leaf); btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); + free_extent_buffer_stale(leaf); } /* * delete the item at the leaf level in path. If that empties @@ -3757,6 +3813,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, int wret; int i; u32 nritems; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); @@ -3778,8 +3837,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff + dsize); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff + dsize, &token); } memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), @@ -4013,7 +4073,7 @@ find_next_key: path->slots[level] = slot; if (level == path->lowest_level) { ret = 0; - unlock_up(path, level, 1); + unlock_up(path, level, 1, 0, NULL); goto out; } btrfs_set_path_blocking(path); @@ -4024,7 +4084,7 @@ find_next_key: path->locks[level - 1] = BTRFS_READ_LOCK; path->nodes[level - 1] = cur; - unlock_up(path, level, 1); + unlock_up(path, level, 1, 0, NULL); btrfs_clear_path_blocking(path, NULL, 0); } out: @@ -4260,7 +4320,7 @@ again: } ret = 0; done: - unlock_up(path, 0, 1); + unlock_up(path, 0, 1, 0, NULL); path->leave_spinning = old_spinning; if (!old_spinning) btrfs_set_path_blocking(path); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b6ebea5582c..ed2d196f7a8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -48,6 +48,8 @@ struct btrfs_ordered_sum; #define BTRFS_MAGIC "_BHRfS_M" +#define BTRFS_MAX_MIRRORS 2 + #define BTRFS_MAX_LEVEL 8 #define BTRFS_COMPAT_EXTENT_TREE_V0 @@ -138,6 +140,12 @@ struct btrfs_ordered_sum; #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 /* + * the max metadata block size. This limit is somewhat artificial, + * but the memmove costs go through the roof for larger blocks. + */ +#define BTRFS_MAX_METADATA_BLOCKSIZE 65536 + +/* * we can actually store much bigger names, but lets not confuse the rest * of linux */ @@ -461,6 +469,19 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) #define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) +/* + * some patches floated around with a second compression method + * lets save that incompat here for when they do get in + * Note we don't actually support it, we're just reserving the + * number + */ +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4) + +/* + * older kernels tried to do bigger metadata blocks, but the + * code was pretty buggy. Lets not let them try anymore. + */ +#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL @@ -468,6 +489,7 @@ struct btrfs_super_block { (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) /* @@ -1527,6 +1549,17 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) +struct btrfs_map_token { + struct extent_buffer *eb; + char *kaddr; + unsigned long offset; +}; + +static inline void btrfs_init_map_token (struct btrfs_map_token *token) +{ + memset(token, 0, sizeof(*token)); +} + /* some macros to generate set/get funcs for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: @@ -1550,20 +1583,22 @@ struct btrfs_ioctl_defrag_range_args { #ifndef BTRFS_SETGET_FUNCS #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, struct btrfs_map_token *token); \ +void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token);\ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); #endif #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(struct extent_buffer *eb) \ { \ - type *p = page_address(eb->first_page); \ + type *p = page_address(eb->pages[0]); \ u##bits res = le##bits##_to_cpu(p->member); \ return res; \ } \ static inline void btrfs_set_##name(struct extent_buffer *eb, \ u##bits val) \ { \ - type *p = page_address(eb->first_page); \ + type *p = page_address(eb->pages[0]); \ p->member = cpu_to_le##bits(val); \ } @@ -2467,8 +2502,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, - u64 data); + struct btrfs_key *ins, u64 data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref, int for_cow); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fe087847c8e..7b55eee15a5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -333,7 +333,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, 0, &cached_state); - if (extent_buffer_uptodate(io_tree, eb, cached_state) && + if (extent_buffer_uptodate(eb) && btrfs_header_generation(eb) == parent_transid) { ret = 0; goto out; @@ -344,7 +344,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, (unsigned long long)parent_transid, (unsigned long long)btrfs_header_generation(eb)); ret = 1; - clear_extent_buffer_uptodate(io_tree, eb, &cached_state); + clear_extent_buffer_uptodate(eb); out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state, GFP_NOFS); @@ -360,9 +360,11 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, u64 start, u64 parent_transid) { struct extent_io_tree *io_tree; + int failed = 0; int ret; int num_copies = 0; int mirror_num = 0; + int failed_mirror = 0; clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; @@ -370,9 +372,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, ret = read_extent_buffer_pages(io_tree, eb, start, WAIT_COMPLETE, btree_get_extent, mirror_num); - if (!ret && - !verify_parent_transid(io_tree, eb, parent_transid)) - return ret; + if (!ret && !verify_parent_transid(io_tree, eb, parent_transid)) + break; /* * This buffer's crc is fine, but its contents are corrupted, so @@ -380,18 +381,31 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, * any less wrong. */ if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) - return ret; + break; + + if (!failed_mirror) { + failed = 1; + printk(KERN_ERR "failed mirror was %d\n", eb->failed_mirror); + failed_mirror = eb->failed_mirror; + } num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, eb->start, eb->len); if (num_copies == 1) - return ret; + break; mirror_num++; + if (mirror_num == failed_mirror) + mirror_num++; + if (mirror_num > num_copies) - return ret; + break; } - return -EIO; + + if (failed && !ret) + repair_eb_io_failure(root, eb, failed_mirror); + + return ret; } /* @@ -404,59 +418,28 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) struct extent_io_tree *tree; u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 found_start; - unsigned long len; struct extent_buffer *eb; - int ret = -EIO; tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) { - WARN_ON(1); - goto out; - } - if (!page->private) { - WARN_ON(1); - goto out; - } - len = page->private >> 2; - WARN_ON(len == 0); - - eb = alloc_extent_buffer(tree, start, len, page); - if (eb == NULL) { - WARN_ON(1); - ret = -ENOMEM; - goto out; - } - ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, - btrfs_header_generation(eb)); - if (ret) { - btrfs_printk(root->fs_info, KERN_WARNING - "Failed to checksum dirty buffer @ %llu[%lu]\n", - start, len); - goto err; - } - WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)); - - ret = -EIO; + eb = (struct extent_buffer *)page->private; + if (page != eb->pages[0]) + return 0; found_start = btrfs_header_bytenr(eb); if (found_start != start) { WARN_ON(1); - goto err; + return 0; } - if (eb->first_page != page) { + if (eb->pages[0] != page) { WARN_ON(1); - goto err; + return 0; } if (!PageUptodate(page)) { WARN_ON(1); - goto err; + return 0; } csum_tree_block(root, eb, 0); - ret = 0; -err: - free_extent_buffer(eb); -out: - return ret; + return 0; } static int check_tree_block_fsid(struct btrfs_root *root, @@ -545,34 +528,74 @@ static noinline int check_leaf(struct btrfs_root *root, return 0; } +struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree, + struct page *page, int max_walk) +{ + struct extent_buffer *eb; + u64 start = page_offset(page); + u64 target = start; + u64 min_start; + + if (start < max_walk) + min_start = 0; + else + min_start = start - max_walk; + + while (start >= min_start) { + eb = find_extent_buffer(tree, start, 0); + if (eb) { + /* + * we found an extent buffer and it contains our page + * horray! + */ + if (eb->start <= target && + eb->start + eb->len > target) + return eb; + + /* we found an extent buffer that wasn't for us */ + free_extent_buffer(eb); + return NULL; + } + if (start == 0) + break; + start -= PAGE_CACHE_SIZE; + } + return NULL; +} + static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) { struct extent_io_tree *tree; u64 found_start; int found_level; - unsigned long len; struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; int ret = 0; + int reads_done; - tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) - goto out; if (!page->private) goto out; - len = page->private >> 2; - WARN_ON(len == 0); + tree = &BTRFS_I(page->mapping->host)->io_tree; + eb = (struct extent_buffer *)page->private; + + /* the pending IO might have been the only thing that kept this buffer + * in memory. Make sure we have a ref for all this other checks + */ + extent_buffer_get(eb); + + reads_done = atomic_dec_and_test(&eb->io_pages); + if (!reads_done) + goto err; - eb = alloc_extent_buffer(tree, start, len, page); - if (eb == NULL) { + if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { ret = -EIO; - goto out; + goto err; } found_start = btrfs_header_bytenr(eb); - if (found_start != start) { + if (found_start != eb->start) { printk_ratelimited(KERN_INFO "btrfs bad tree block start " "%llu %llu\n", (unsigned long long)found_start, @@ -580,13 +603,6 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, ret = -EIO; goto err; } - if (eb->first_page != page) { - printk(KERN_INFO "btrfs bad first page %lu %lu\n", - eb->first_page->index, page->index); - WARN_ON(1); - ret = -EIO; - goto err; - } if (check_tree_block_fsid(root, eb)) { printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n", (unsigned long long)eb->start); @@ -614,48 +630,31 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, ret = -EIO; } - end = min_t(u64, eb->len, PAGE_CACHE_SIZE); - end = eb->start + end - 1; + if (!ret) + set_extent_buffer_uptodate(eb); err: if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); btree_readahead_hook(root, eb, eb->start, ret); } + if (ret) + clear_extent_buffer_uptodate(eb); free_extent_buffer(eb); out: return ret; } -static int btree_io_failed_hook(struct bio *failed_bio, - struct page *page, u64 start, u64 end, - int mirror_num, struct extent_state *state) +static int btree_io_failed_hook(struct page *page, int failed_mirror) { - struct extent_io_tree *tree; - unsigned long len; struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) - goto out; - if (!page->private) - goto out; - - len = page->private >> 2; - WARN_ON(len == 0); - - eb = alloc_extent_buffer(tree, start, len, page); - if (eb == NULL) - goto out; - - if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { - clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); + eb = (struct extent_buffer *)page->private; + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + eb->failed_mirror = failed_mirror; + if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) btree_readahead_hook(root, eb, eb->start, -EIO); - } - free_extent_buffer(eb); - -out: return -EIO; /* we fixed nothing */ } @@ -868,15 +867,16 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, { int ret; - ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, bio, 1); - if (ret) - return ret; - if (!(rw & REQ_WRITE)) { + /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads */ + ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, + bio, 1); + if (ret) + return ret; return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 0); } @@ -914,34 +914,6 @@ static int btree_migratepage(struct address_space *mapping, } #endif -static int btree_writepage(struct page *page, struct writeback_control *wbc) -{ - struct extent_io_tree *tree; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct extent_buffer *eb; - int was_dirty; - - tree = &BTRFS_I(page->mapping->host)->io_tree; - if (!(current->flags & PF_MEMALLOC)) { - return extent_write_full_page(tree, page, - btree_get_extent, wbc); - } - - redirty_page_for_writepage(wbc, page); - eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE); - WARN_ON(!eb); - - was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - if (!was_dirty) { - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE; - spin_unlock(&root->fs_info->delalloc_lock); - } - free_extent_buffer(eb); - - unlock_page(page); - return 0; -} static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) @@ -961,7 +933,7 @@ static int btree_writepages(struct address_space *mapping, if (num_dirty < thresh) return 0; } - return extent_writepages(tree, mapping, btree_get_extent, wbc); + return btree_write_cache_pages(mapping, wbc); } static int btree_readpage(struct file *file, struct page *page) @@ -973,16 +945,8 @@ static int btree_readpage(struct file *file, struct page *page) static int btree_releasepage(struct page *page, gfp_t gfp_flags) { - struct extent_io_tree *tree; - struct extent_map_tree *map; - int ret; - if (PageWriteback(page) || PageDirty(page)) return 0; - - tree = &BTRFS_I(page->mapping->host)->io_tree; - map = &BTRFS_I(page->mapping->host)->extent_tree; - /* * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing * slab allocation from alloc_extent_state down the callchain where @@ -990,18 +954,7 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) */ gfp_flags &= ~GFP_SLAB_BUG_MASK; - ret = try_release_extent_state(map, tree, page, gfp_flags); - if (!ret) - return 0; - - ret = try_release_extent_buffer(tree, page); - if (ret == 1) { - ClearPagePrivate(page); - set_page_private(page, 0); - page_cache_release(page); - } - - return ret; + return try_release_extent_buffer(page, gfp_flags); } static void btree_invalidatepage(struct page *page, unsigned long offset) @@ -1019,15 +972,28 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) } } +static int btree_set_page_dirty(struct page *page) +{ + struct extent_buffer *eb; + + BUG_ON(!PagePrivate(page)); + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(!atomic_read(&eb->refs)); + btrfs_assert_tree_locked(eb); + return __set_page_dirty_nobuffers(page); +} + static const struct address_space_operations btree_aops = { .readpage = btree_readpage, - .writepage = btree_writepage, .writepages = btree_writepages, .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, #ifdef CONFIG_MIGRATION .migratepage = btree_migratepage, #endif + .set_page_dirty = btree_set_page_dirty, }; int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -1070,7 +1036,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { free_extent_buffer(buf); return -EIO; - } else if (extent_buffer_uptodate(io_tree, buf, NULL)) { + } else if (extent_buffer_uptodate(buf)) { *eb = buf; } else { free_extent_buffer(buf); @@ -1095,20 +1061,20 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, struct extent_buffer *eb; eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, - bytenr, blocksize, NULL); + bytenr, blocksize); return eb; } int btrfs_write_tree_block(struct extent_buffer *buf) { - return filemap_fdatawrite_range(buf->first_page->mapping, buf->start, + return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, buf->start + buf->len - 1); } int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { - return filemap_fdatawait_range(buf->first_page->mapping, + return filemap_fdatawait_range(buf->pages[0]->mapping, buf->start, buf->start + buf->len - 1); } @@ -1123,9 +1089,6 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, return NULL; ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); - - if (ret == 0) - set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); return buf; } @@ -1133,7 +1096,6 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { - struct inode *btree_inode = root->fs_info->btree_inode; if (btrfs_header_generation(buf) == root->fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); @@ -1155,8 +1117,7 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, /* ugh, clear_extent_buffer_dirty needs to lock the page */ btrfs_set_lock_blocking(buf); - clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, - buf); + clear_extent_buffer_dirty(buf); } } @@ -1539,41 +1500,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) return 0; } -static int bio_ready_for_csum(struct bio *bio) -{ - u64 length = 0; - u64 buf_len = 0; - u64 start = 0; - struct page *page; - struct extent_io_tree *io_tree = NULL; - struct bio_vec *bvec; - int i; - int ret; - - bio_for_each_segment(bvec, bio, i) { - page = bvec->bv_page; - if (page->private == EXTENT_PAGE_PRIVATE) { - length += bvec->bv_len; - continue; - } - if (!page->private) { - length += bvec->bv_len; - continue; - } - length = bvec->bv_len; - buf_len = page->private >> 2; - start = page_offset(page) + bvec->bv_offset; - io_tree = &BTRFS_I(page->mapping->host)->io_tree; - } - /* are we fully contained in this bio? */ - if (buf_len <= length) - return 1; - - ret = extent_range_uptodate(io_tree, start + length, - start + buf_len - 1); - return ret; -} - /* * called by the kthread helper functions to finally call the bio end_io * functions. This is where read checksum verification actually happens @@ -1589,17 +1515,6 @@ static void end_workqueue_fn(struct btrfs_work *work) bio = end_io_wq->bio; fs_info = end_io_wq->info; - /* metadata bio reads are special because the whole tree block must - * be checksummed at once. This makes sure the entire block is in - * ram and up to date before trying to verify things. For - * blocksize <= pagesize, it is basically a noop - */ - if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata && - !bio_ready_for_csum(bio)) { - btrfs_queue_worker(&fs_info->endio_meta_workers, - &end_io_wq->work); - return; - } error = end_io_wq->error; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; @@ -2073,6 +1988,7 @@ int open_ctree(struct super_block *sb, RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, fs_info->btree_inode->i_mapping); + BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0; extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; @@ -2171,10 +2087,38 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } + if (btrfs_super_leafsize(disk_super) != + btrfs_super_nodesize(disk_super)) { + printk(KERN_ERR "BTRFS: couldn't mount because metadata " + "blocksizes don't match. node %d leaf %d\n", + btrfs_super_nodesize(disk_super), + btrfs_super_leafsize(disk_super)); + err = -EINVAL; + goto fail_alloc; + } + if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { + printk(KERN_ERR "BTRFS: couldn't mount because metadata " + "blocksize (%d) was too large\n", + btrfs_super_leafsize(disk_super)); + err = -EINVAL; + goto fail_alloc; + } + features = btrfs_super_incompat_flags(disk_super); features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + + /* + * flag our filesystem as having big metadata blocks if + * they are bigger than the page size + */ + if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) { + if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) + printk(KERN_INFO "btrfs flagging fs with big metadata feature\n"); + features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; + } + btrfs_set_super_incompat_flags(disk_super, features); features = btrfs_super_compat_ro_flags(disk_super) & @@ -3196,10 +3140,9 @@ int close_ctree(struct btrfs_root *root) int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) { int ret; - struct inode *btree_inode = buf->first_page->mapping->host; + struct inode *btree_inode = buf->pages[0]->mapping->host; - ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf, - NULL); + ret = extent_buffer_uptodate(buf); if (!ret) return ret; @@ -3210,16 +3153,13 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) int btrfs_set_buffer_uptodate(struct extent_buffer *buf) { - struct inode *btree_inode = buf->first_page->mapping->host; - return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, - buf); + return set_extent_buffer_uptodate(buf); } void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { - struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; u64 transid = btrfs_header_generation(buf); - struct inode *btree_inode = root->fs_info->btree_inode; int was_dirty; btrfs_assert_tree_locked(buf); @@ -3231,8 +3171,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) (unsigned long long)root->fs_info->generation); WARN_ON(1); } - was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, - buf); + was_dirty = set_extent_buffer_dirty(buf); if (!was_dirty) { spin_lock(&root->fs_info->delalloc_lock); root->fs_info->dirty_metadata_bytes += buf->len; @@ -3286,12 +3225,8 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) { - struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; - int ret; - ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); - if (ret == 0) - set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); - return ret; + struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; + return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); } static int btree_lock_page_hook(struct page *page, void *data, @@ -3299,17 +3234,21 @@ static int btree_lock_page_hook(struct page *page, void *data, { struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_buffer *eb; - unsigned long len; - u64 bytenr = page_offset(page); - if (page->private == EXTENT_PAGE_PRIVATE) + /* + * We culled this eb but the page is still hanging out on the mapping, + * carry on. + */ + if (!PagePrivate(page)) goto out; - len = page->private >> 2; - eb = find_extent_buffer(io_tree, bytenr, len); - if (!eb) + eb = (struct extent_buffer *)page->private; + if (!eb) { + WARN_ON(1); + goto out; + } + if (page != eb->pages[0]) goto out; if (!btrfs_try_tree_write_lock(eb)) { @@ -3328,7 +3267,6 @@ static int btree_lock_page_hook(struct page *page, void *data, } btrfs_tree_unlock(eb); - free_extent_buffer(eb); out: if (!trylock_page(page)) { flush_fn(data); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4b3f1eedced..8b304e3537c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5074,10 +5074,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_csums(trans, root, bytenr, num_bytes); if (ret) goto abort; - } else { - invalidate_mapping_pages(info->btree_inode->i_mapping, - bytenr >> PAGE_CACHE_SHIFT, - (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); } ret = update_block_group(trans, root, bytenr, num_bytes, 0); @@ -5321,11 +5317,10 @@ static int get_block_group_index(struct btrfs_block_group_cache *cache) } enum btrfs_loop_type { - LOOP_FIND_IDEAL = 0, - LOOP_CACHING_NOWAIT = 1, - LOOP_CACHING_WAIT = 2, - LOOP_ALLOC_CHUNK = 3, - LOOP_NO_EMPTY_SIZE = 4, + LOOP_CACHING_NOWAIT = 0, + LOOP_CACHING_WAIT = 1, + LOOP_ALLOC_CHUNK = 2, + LOOP_NO_EMPTY_SIZE = 3, }; /* @@ -5339,7 +5334,6 @@ enum btrfs_loop_type { static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *orig_root, u64 num_bytes, u64 empty_size, - u64 search_start, u64 search_end, u64 hint_byte, struct btrfs_key *ins, u64 data) { @@ -5348,6 +5342,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; struct btrfs_block_group_cache *used_block_group; + u64 search_start = 0; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; int done_chunk_alloc = 0; @@ -5361,8 +5356,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, bool failed_alloc = false; bool use_cluster = true; bool have_caching_bg = false; - u64 ideal_cache_percent = 0; - u64 ideal_cache_offset = 0; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -5412,7 +5405,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, empty_cluster = 0; if (search_start == hint_byte) { -ideal_cache: block_group = btrfs_lookup_block_group(root->fs_info, search_start); used_block_group = block_group; @@ -5424,8 +5416,7 @@ ideal_cache: * picked out then we don't care that the block group is cached. */ if (block_group && block_group_bits(block_group, data) && - (block_group->cached != BTRFS_CACHE_NO || - search_start == ideal_cache_offset)) { + block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || block_group->ro) { @@ -5479,45 +5470,13 @@ search: have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { - u64 free_percent; - found_uncached_bg = true; ret = cache_block_group(block_group, trans, - orig_root, 1); - BUG_ON(ret < 0); /* -ENOMEM */ - if (block_group->cached == BTRFS_CACHE_FINISHED) - goto alloc; - - free_percent = btrfs_block_group_used(&block_group->item); - free_percent *= 100; - free_percent = div64_u64(free_percent, - block_group->key.offset); - free_percent = 100 - free_percent; - if (free_percent > ideal_cache_percent && - likely(!block_group->ro)) { - ideal_cache_offset = block_group->key.objectid; - ideal_cache_percent = free_percent; - } - - /* - * The caching workers are limited to 2 threads, so we - * can queue as much work as we care to. - */ - if (loop > LOOP_FIND_IDEAL) { - ret = cache_block_group(block_group, trans, - orig_root, 0); - BUG_ON(ret); /* -ENOMEM */ - } - - /* - * If loop is set for cached only, try the next block - * group. - */ - if (loop == LOOP_FIND_IDEAL) - goto loop; + orig_root, 0); + BUG_ON(ret < 0); + ret = 0; } -alloc: if (unlikely(block_group->ro)) goto loop; @@ -5668,11 +5627,6 @@ unclustered_alloc: } checks: search_start = stripe_align(root, offset); - /* move on to the next group */ - if (search_start + num_bytes >= search_end) { - btrfs_add_free_space(used_block_group, offset, num_bytes); - goto loop; - } /* move on to the next group */ if (search_start + num_bytes > @@ -5723,9 +5677,7 @@ loop: if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) goto search; - /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for - * for them to make caching progress. Also - * determine the best possible bg to cache + /* * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching @@ -5735,45 +5687,7 @@ loop: */ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { index = 0; - if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { - found_uncached_bg = false; - loop++; - if (!ideal_cache_percent) - goto search; - - /* - * 1 of the following 2 things have happened so far - * - * 1) We found an ideal block group for caching that - * is mostly full and will cache quickly, so we might - * as well wait for it. - * - * 2) We searched for cached only and we didn't find - * anything, and we didn't start any caching kthreads - * either, so chances are we will loop through and - * start a couple caching kthreads, and then come back - * around and just wait for them. This will be slower - * because we will have 2 caching kthreads reading at - * the same time when we could have just started one - * and waited for it to get far enough to give us an - * allocation, so go ahead and go to the wait caching - * loop. - */ - loop = LOOP_CACHING_WAIT; - search_start = ideal_cache_offset; - ideal_cache_percent = 0; - goto ideal_cache; - } else if (loop == LOOP_FIND_IDEAL) { - /* - * Didn't find a uncached bg, wait on anything we find - * next. - */ - loop = LOOP_CACHING_WAIT; - goto search; - } - loop++; - if (loop == LOOP_ALLOC_CHUNK) { if (allowed_chunk_alloc) { ret = do_chunk_alloc(trans, root, num_bytes + @@ -5866,12 +5780,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, - u64 data) + struct btrfs_key *ins, u64 data) { bool final_tried = false; int ret; - u64 search_start = 0; data = btrfs_get_alloc_profile(root, data); again: @@ -5891,8 +5803,7 @@ again: WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, - search_start, search_end, hint_byte, - ins, data); + hint_byte, ins, data); if (ret == -ENOSPC) { if (!final_tried) { @@ -6191,6 +6102,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); clean_tree_block(trans, root, buf); + clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); btrfs_set_buffer_uptodate(buf); @@ -6298,7 +6210,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return ERR_CAST(block_rsv); ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, - empty_size, hint, (u64)-1, &ins, 0); + empty_size, hint, &ins, 0); if (ret) { unuse_block_rsv(root->fs_info, block_rsv, blocksize); return ERR_PTR(ret); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4c3ce7a0a7a..0c3ec003f27 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -19,6 +19,7 @@ #include "btrfs_inode.h" #include "volumes.h" #include "check-integrity.h" +#include "locking.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -53,6 +54,7 @@ struct extent_page_data { unsigned int sync_io:1; }; +static noinline void flush_write_bio(void *data); static inline struct btrfs_fs_info * tree_fs_info(struct extent_io_tree *tree) { @@ -1929,6 +1931,26 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, return 0; } +int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, + int mirror_num) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + u64 start = eb->start; + unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); + int ret; + + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, + start, p, mirror_num); + if (ret) + break; + start += PAGE_CACHE_SIZE; + } + + return ret; +} + /* * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record @@ -2275,6 +2297,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) u64 start; u64 end; int whole_page; + int failed_mirror; int ret; if (err) @@ -2321,9 +2344,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err) else clean_io_failure(start, page); } - if (!uptodate) { - int failed_mirror; + + if (!uptodate) failed_mirror = (int)(unsigned long)bio->bi_bdev; + + if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook(page, failed_mirror); + if (!ret && !err && + test_bit(BIO_UPTODATE, &bio->bi_flags)) + uptodate = 1; + } else if (!uptodate) { /* * The generic bio_readpage_error handles errors the * following way: If possible, new read requests are @@ -2337,7 +2367,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err) ret = bio_readpage_error(bio, page, start, end, failed_mirror, NULL); if (ret == 0) { -error_handled: uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) @@ -2345,16 +2374,9 @@ error_handled: uncache_state(&cached); continue; } - if (tree->ops && tree->ops->readpage_io_failed_hook) { - ret = tree->ops->readpage_io_failed_hook( - bio, page, start, end, - failed_mirror, state); - if (ret == 0) - goto error_handled; - } } - if (uptodate) { + if (uptodate && tree->track_uptodate) { set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); } @@ -2507,19 +2529,24 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, return ret; } -void set_page_extent_mapped(struct page *page) +void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) { if (!PagePrivate(page)) { SetPagePrivate(page); page_cache_get(page); - set_page_private(page, EXTENT_PAGE_PRIVATE); + set_page_private(page, (unsigned long)eb); + } else { + WARN_ON(page->private != (unsigned long)eb); } } -static void set_page_extent_head(struct page *page, unsigned long len) +void set_page_extent_mapped(struct page *page) { - WARN_ON(!PagePrivate(page)); - set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, EXTENT_PAGE_PRIVATE); + } } /* @@ -3008,6 +3035,275 @@ done_unlocked: return 0; } +static int eb_wait(void *word) +{ + io_schedule(); + return 0; +} + +static void wait_on_extent_buffer_writeback(struct extent_buffer *eb) +{ + wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, + TASK_UNINTERRUPTIBLE); +} + +static int lock_extent_buffer_for_io(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct extent_page_data *epd) +{ + unsigned long i, num_pages; + int flush = 0; + int ret = 0; + + if (!btrfs_try_tree_write_lock(eb)) { + flush = 1; + flush_write_bio(epd); + btrfs_tree_lock(eb); + } + + if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { + btrfs_tree_unlock(eb); + if (!epd->sync_io) + return 0; + if (!flush) { + flush_write_bio(epd); + flush = 1; + } + while (1) { + wait_on_extent_buffer_writeback(eb); + btrfs_tree_lock(eb); + if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) + break; + btrfs_tree_unlock(eb); + } + } + + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + spin_lock(&fs_info->delalloc_lock); + if (fs_info->dirty_metadata_bytes >= eb->len) + fs_info->dirty_metadata_bytes -= eb->len; + else + WARN_ON(1); + spin_unlock(&fs_info->delalloc_lock); + ret = 1; + } + + btrfs_tree_unlock(eb); + + if (!ret) + return ret; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + + if (!trylock_page(p)) { + if (!flush) { + flush_write_bio(epd); + flush = 1; + } + lock_page(p); + } + } + + return ret; +} + +static void end_extent_buffer_writeback(struct extent_buffer *eb) +{ + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + smp_mb__after_clear_bit(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); +} + +static void end_bio_extent_buffer_writepage(struct bio *bio, int err) +{ + int uptodate = err == 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_buffer *eb; + int done; + + do { + struct page *page = bvec->bv_page; + + bvec--; + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + done = atomic_dec_and_test(&eb->io_pages); + + if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + ClearPageUptodate(page); + SetPageError(page); + } + + end_page_writeback(page); + + if (!done) + continue; + + end_extent_buffer_writeback(eb); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); + +} + +static int write_one_eb(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct writeback_control *wbc, + struct extent_page_data *epd) +{ + struct block_device *bdev = fs_info->fs_devices->latest_bdev; + u64 offset = eb->start; + unsigned long i, num_pages; + int rw = (epd->sync_io ? WRITE_SYNC : WRITE); + int ret; + + clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + atomic_set(&eb->io_pages, num_pages); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + + clear_page_dirty_for_io(p); + set_page_writeback(p); + ret = submit_extent_page(rw, eb->tree, p, offset >> 9, + PAGE_CACHE_SIZE, 0, bdev, &epd->bio, + -1, end_bio_extent_buffer_writepage, + 0, 0, 0); + if (ret) { + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + SetPageError(p); + if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) + end_extent_buffer_writeback(eb); + ret = -EIO; + break; + } + offset += PAGE_CACHE_SIZE; + update_nr_written(p, wbc, 1); + unlock_page(p); + } + + if (unlikely(ret)) { + for (; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + unlock_page(p); + } + } + + return ret; +} + +int btree_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; + struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; + struct extent_buffer *eb, *prev_eb = NULL; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, + }; + int ret = 0; + int done = 0; + int nr_to_write_done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int tag; + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + scanned = 1; + } + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; +retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); + while (!done && !nr_to_write_done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (!PagePrivate(page)) + continue; + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + break; + } + + eb = (struct extent_buffer *)page->private; + if (!eb) { + WARN_ON(1); + continue; + } + + if (eb == prev_eb) + continue; + + if (!atomic_inc_not_zero(&eb->refs)) { + WARN_ON(1); + continue; + } + + prev_eb = eb; + ret = lock_extent_buffer_for_io(eb, fs_info, &epd); + if (!ret) { + free_extent_buffer(eb); + continue; + } + + ret = write_one_eb(eb, fs_info, wbc, &epd); + if (ret) { + done = 1; + free_extent_buffer(eb); + break; + } + free_extent_buffer(eb); + + /* + * the filesystem may choose to bump up nr_to_write. + * We have to make sure to honor the new nr_to_write + * at any time + */ + nr_to_write_done = wbc->nr_to_write <= 0; + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + flush_write_bio(&epd); + return ret; +} + /** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write @@ -3592,26 +3888,7 @@ out: inline struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i) { - struct page *p; - struct address_space *mapping; - - if (i == 0) - return eb->first_page; - i += eb->start >> PAGE_CACHE_SHIFT; - mapping = eb->first_page->mapping; - if (!mapping) - return NULL; - - /* - * extent_buffer_page is only called after pinning the page - * by increasing the reference count. So we know the page must - * be in the radix tree. - */ - rcu_read_lock(); - p = radix_tree_lookup(&mapping->page_tree, i); - rcu_read_unlock(); - - return p; + return eb->pages[i]; } inline unsigned long num_extent_pages(u64 start, u64 len) @@ -3620,6 +3897,19 @@ inline unsigned long num_extent_pages(u64 start, u64 len) (start >> PAGE_CACHE_SHIFT); } +static void __free_extent_buffer(struct extent_buffer *eb) +{ +#if LEAK_DEBUG + unsigned long flags; + spin_lock_irqsave(&leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + if (eb->pages && eb->pages != eb->inline_pages) + kfree(eb->pages); + kmem_cache_free(extent_buffer_cache, eb); +} + static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len, @@ -3635,6 +3925,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return NULL; eb->start = start; eb->len = len; + eb->tree = tree; rwlock_init(&eb->lock); atomic_set(&eb->write_locks, 0); atomic_set(&eb->read_locks, 0); @@ -3651,20 +3942,32 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, list_add(&eb->leak_list, &buffers); spin_unlock_irqrestore(&leak_lock, flags); #endif + spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); + atomic_set(&eb->io_pages, 0); + + if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { + struct page **pages; + int num_pages = (len + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + pages = kzalloc(num_pages, mask); + if (!pages) { + __free_extent_buffer(eb); + return NULL; + } + eb->pages = pages; + } else { + eb->pages = eb->inline_pages; + } return eb; } -static void __free_extent_buffer(struct extent_buffer *eb) +static int extent_buffer_under_io(struct extent_buffer *eb) { -#if LEAK_DEBUG - unsigned long flags; - spin_lock_irqsave(&leak_lock, flags); - list_del(&eb->leak_list); - spin_unlock_irqrestore(&leak_lock, flags); -#endif - kmem_cache_free(extent_buffer_cache, eb); + return (atomic_read(&eb->io_pages) || + test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || + test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } /* @@ -3676,8 +3979,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, unsigned long index; struct page *page; - if (!eb->first_page) - return; + BUG_ON(extent_buffer_under_io(eb)); index = num_extent_pages(eb->start, eb->len); if (start_idx >= index) @@ -3686,8 +3988,34 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, do { index--; page = extent_buffer_page(eb, index); - if (page) + if (page) { + spin_lock(&page->mapping->private_lock); + /* + * We do this since we'll remove the pages after we've + * removed the eb from the radix tree, so we could race + * and have this page now attached to the new eb. So + * only clear page_private if it's still connected to + * this eb. + */ + if (PagePrivate(page) && + page->private == (unsigned long)eb) { + BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(PageDirty(page)); + BUG_ON(PageWriteback(page)); + /* + * We need to make sure we haven't be attached + * to a new eb. + */ + ClearPagePrivate(page); + set_page_private(page, 0); + /* One for the page private */ + page_cache_release(page); + } + spin_unlock(&page->mapping->private_lock); + + /* One for when we alloced the page */ page_cache_release(page); + } } while (index != start_idx); } @@ -3700,9 +4028,50 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) __free_extent_buffer(eb); } +static void check_buffer_tree_ref(struct extent_buffer *eb) +{ + /* the ref bit is tricky. We have to make sure it is set + * if we have the buffer dirty. Otherwise the + * code to free a buffer can end up dropping a dirty + * page + * + * Once the ref bit is set, it won't go away while the + * buffer is dirty or in writeback, and it also won't + * go away while we have the reference count on the + * eb bumped. + * + * We can't just set the ref bit without bumping the + * ref on the eb because free_extent_buffer might + * see the ref bit and try to clear it. If this happens + * free_extent_buffer might end up dropping our original + * ref by mistake and freeing the page before we are able + * to add one more ref. + * + * So bump the ref count first, then set the bit. If someone + * beat us to it, drop the ref we added. + */ + if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + atomic_inc(&eb->refs); + if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + } +} + +static void mark_extent_buffer_accessed(struct extent_buffer *eb) +{ + unsigned long num_pages, i; + + check_buffer_tree_ref(eb); + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + mark_page_accessed(p); + } +} + struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len, - struct page *page0) + u64 start, unsigned long len) { unsigned long num_pages = num_extent_pages(start, len); unsigned long i; @@ -3718,7 +4087,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); - mark_page_accessed(eb->first_page); + mark_extent_buffer_accessed(eb); return eb; } rcu_read_unlock(); @@ -3727,32 +4096,43 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, if (!eb) return NULL; - if (page0) { - eb->first_page = page0; - i = 1; - index++; - page_cache_get(page0); - mark_page_accessed(page0); - set_page_extent_mapped(page0); - set_page_extent_head(page0, len); - uptodate = PageUptodate(page0); - } else { - i = 0; - } - for (; i < num_pages; i++, index++) { + for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, GFP_NOFS); if (!p) { WARN_ON(1); goto free_eb; } - set_page_extent_mapped(p); - mark_page_accessed(p); - if (i == 0) { - eb->first_page = p; - set_page_extent_head(p, len); - } else { - set_page_private(p, EXTENT_PAGE_PRIVATE); + + spin_lock(&mapping->private_lock); + if (PagePrivate(p)) { + /* + * We could have already allocated an eb for this page + * and attached one so lets see if we can get a ref on + * the existing eb, and if we can we know it's good and + * we can just return that one, else we know we can just + * overwrite page->private. + */ + exists = (struct extent_buffer *)p->private; + if (atomic_inc_not_zero(&exists->refs)) { + spin_unlock(&mapping->private_lock); + unlock_page(p); + mark_extent_buffer_accessed(exists); + goto free_eb; + } + + /* + * Do this so attach doesn't complain and we need to + * drop the ref the old guy had. + */ + ClearPagePrivate(p); + WARN_ON(PageDirty(p)); + page_cache_release(p); } + attach_extent_buffer_page(eb, p); + spin_unlock(&mapping->private_lock); + WARN_ON(PageDirty(p)); + mark_page_accessed(p); + eb->pages[i] = p; if (!PageUptodate(p)) uptodate = 0; @@ -3760,12 +4140,10 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, * see below about how we avoid a nasty race with release page * and why we unlock later */ - if (i != 0) - unlock_page(p); } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - +again: ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); if (ret) goto free_eb; @@ -3775,14 +4153,21 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, if (ret == -EEXIST) { exists = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - /* add one reference for the caller */ - atomic_inc(&exists->refs); + if (!atomic_inc_not_zero(&exists->refs)) { + spin_unlock(&tree->buffer_lock); + radix_tree_preload_end(); + exists = NULL; + goto again; + } spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); + mark_extent_buffer_accessed(exists); goto free_eb; } /* add one reference for the tree */ - atomic_inc(&eb->refs); + spin_lock(&eb->refs_lock); + check_buffer_tree_ref(eb); + spin_unlock(&eb->refs_lock); spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); @@ -3795,15 +4180,20 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, * after the extent buffer is in the radix tree so * it doesn't get lost */ - set_page_extent_mapped(eb->first_page); - set_page_extent_head(eb->first_page, eb->len); - if (!page0) - unlock_page(eb->first_page); + SetPageChecked(eb->pages[0]); + for (i = 1; i < num_pages; i++) { + p = extent_buffer_page(eb, i); + ClearPageChecked(p); + unlock_page(p); + } + unlock_page(eb->pages[0]); return eb; free_eb: - if (eb->first_page && !page0) - unlock_page(eb->first_page); + for (i = 0; i < num_pages; i++) { + if (eb->pages[i]) + unlock_page(eb->pages[i]); + } if (!atomic_dec_and_test(&eb->refs)) return exists; @@ -3820,7 +4210,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); - mark_page_accessed(eb->first_page); + mark_extent_buffer_accessed(eb); return eb; } rcu_read_unlock(); @@ -3828,19 +4218,71 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, return NULL; } +static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) +{ + struct extent_buffer *eb = + container_of(head, struct extent_buffer, rcu_head); + + __free_extent_buffer(eb); +} + +/* Expects to have eb->eb_lock already held */ +static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask) +{ + WARN_ON(atomic_read(&eb->refs) == 0); + if (atomic_dec_and_test(&eb->refs)) { + struct extent_io_tree *tree = eb->tree; + + spin_unlock(&eb->refs_lock); + + spin_lock(&tree->buffer_lock); + radix_tree_delete(&tree->buffer, + eb->start >> PAGE_CACHE_SHIFT); + spin_unlock(&tree->buffer_lock); + + /* Should be safe to release our pages at this point */ + btrfs_release_extent_buffer_page(eb, 0); + + call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); + return; + } + spin_unlock(&eb->refs_lock); +} + void free_extent_buffer(struct extent_buffer *eb) { if (!eb) return; - if (!atomic_dec_and_test(&eb->refs)) + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) == 2 && + test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && + !extent_buffer_under_io(eb) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + + /* + * I know this is terrible, but it's temporary until we stop tracking + * the uptodate bits and such for the extent buffers. + */ + release_extent_buffer(eb, GFP_ATOMIC); +} + +void free_extent_buffer_stale(struct extent_buffer *eb) +{ + if (!eb) return; - WARN_ON(1); + spin_lock(&eb->refs_lock); + set_bit(EXTENT_BUFFER_STALE, &eb->bflags); + + if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + release_extent_buffer(eb, GFP_NOFS); } -void clear_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb) +void clear_extent_buffer_dirty(struct extent_buffer *eb) { unsigned long i; unsigned long num_pages; @@ -3856,10 +4298,6 @@ void clear_extent_buffer_dirty(struct extent_io_tree *tree, lock_page(page); WARN_ON(!PagePrivate(page)); - set_page_extent_mapped(page); - if (i == 0) - set_page_extent_head(page, eb->len); - clear_page_dirty_for_io(page); spin_lock_irq(&page->mapping->tree_lock); if (!PageDirty(page)) { @@ -3871,23 +4309,29 @@ void clear_extent_buffer_dirty(struct extent_io_tree *tree, ClearPageError(page); unlock_page(page); } + WARN_ON(atomic_read(&eb->refs) == 0); } -int set_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb) +int set_extent_buffer_dirty(struct extent_buffer *eb) { unsigned long i; unsigned long num_pages; int was_dirty = 0; + check_buffer_tree_ref(eb); + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + WARN_ON(atomic_read(&eb->refs) == 0); + WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + for (i = 0; i < num_pages; i++) - __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); + set_page_dirty(extent_buffer_page(eb, i)); return was_dirty; } -static int __eb_straddles_pages(u64 start, u64 len) +static int range_straddles_pages(u64 start, u64 len) { if (len < PAGE_CACHE_SIZE) return 1; @@ -3898,25 +4342,14 @@ static int __eb_straddles_pages(u64 start, u64 len) return 0; } -static int eb_straddles_pages(struct extent_buffer *eb) -{ - return __eb_straddles_pages(eb->start, eb->len); -} - -int clear_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state **cached_state) +int clear_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; unsigned long num_pages; - num_pages = num_extent_pages(eb->start, eb->len); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - - clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - cached_state, GFP_NOFS); - + num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) @@ -3925,27 +4358,16 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, return 0; } -int set_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb) +int set_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; unsigned long num_pages; + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); - - if (eb_straddles_pages(eb)) { - set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - NULL, GFP_NOFS); - } for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || - ((i == num_pages - 1) && - ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { - check_page_uptodate(tree, page); - continue; - } SetPageUptodate(page); } return 0; @@ -3960,7 +4382,7 @@ int extent_range_uptodate(struct extent_io_tree *tree, int uptodate; unsigned long index; - if (__eb_straddles_pages(start, end - start + 1)) { + if (range_straddles_pages(start, end - start + 1)) { ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); if (ret) @@ -3982,35 +4404,9 @@ int extent_range_uptodate(struct extent_io_tree *tree, return pg_uptodate; } -int extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state *cached_state) +int extent_buffer_uptodate(struct extent_buffer *eb) { - int ret = 0; - unsigned long num_pages; - unsigned long i; - struct page *page; - int pg_uptodate = 1; - - if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - return 1; - - if (eb_straddles_pages(eb)) { - ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, cached_state); - if (ret) - return ret; - } - - num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - if (!PageUptodate(page)) { - pg_uptodate = 0; - break; - } - } - return pg_uptodate; + return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); } int read_extent_buffer_pages(struct extent_io_tree *tree, @@ -4024,21 +4420,14 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, int ret = 0; int locked_pages = 0; int all_uptodate = 1; - int inc_all_pages = 0; unsigned long num_pages; + unsigned long num_reads = 0; struct bio *bio = NULL; unsigned long bio_flags = 0; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; - if (eb_straddles_pages(eb)) { - if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, NULL)) { - return 0; - } - } - if (start) { WARN_ON(start < eb->start); start_i = (start >> PAGE_CACHE_SHIFT) - @@ -4057,8 +4446,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, lock_page(page); } locked_pages++; - if (!PageUptodate(page)) + if (!PageUptodate(page)) { + num_reads++; all_uptodate = 0; + } } if (all_uptodate) { if (start_i == 0) @@ -4066,20 +4457,12 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, goto unlock_exit; } + clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + eb->failed_mirror = 0; + atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); - - WARN_ON(!PagePrivate(page)); - - set_page_extent_mapped(page); - if (i == 0) - set_page_extent_head(page, eb->len); - - if (inc_all_pages) - page_cache_get(page); if (!PageUptodate(page)) { - if (start_i == 0) - inc_all_pages = 1; ClearPageError(page); err = __extent_read_full_page(tree, page, get_extent, &bio, @@ -4107,8 +4490,6 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ret = -EIO; } - if (!ret) - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); return ret; unlock_exit: @@ -4350,15 +4731,20 @@ static void copy_pages(struct page *dst_page, struct page *src_page, { char *dst_kaddr = page_address(dst_page); char *src_kaddr; + int must_memmove = 0; if (dst_page != src_page) { src_kaddr = page_address(src_page); } else { src_kaddr = dst_kaddr; - BUG_ON(areas_overlap(src_off, dst_off, len)); + if (areas_overlap(src_off, dst_off, len)) + must_memmove = 1; } - memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); + if (must_memmove) + memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); + else + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); } void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, @@ -4428,7 +4814,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, "len %lu len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } - if (!areas_overlap(src_offset, dst_offset, len)) { + if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; } @@ -4454,47 +4840,48 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } } -static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) -{ - struct extent_buffer *eb = - container_of(head, struct extent_buffer, rcu_head); - - btrfs_release_extent_buffer(eb); -} - -int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) +int try_release_extent_buffer(struct page *page, gfp_t mask) { - u64 start = page_offset(page); struct extent_buffer *eb; - int ret = 1; - spin_lock(&tree->buffer_lock); - eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - if (!eb) { - spin_unlock(&tree->buffer_lock); - return ret; + /* + * We need to make sure noboody is attaching this page to an eb right + * now. + */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&page->mapping->private_lock); + return 1; } - if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - ret = 0; - goto out; - } + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); /* - * set @eb->refs to 0 if it is already 1, and then release the @eb. - * Or go back. + * This is a little awful but should be ok, we need to make sure that + * the eb doesn't disappear out from under us while we're looking at + * this page. */ - if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { - ret = 0; - goto out; + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + spin_unlock(&eb->refs_lock); + spin_unlock(&page->mapping->private_lock); + return 0; } + spin_unlock(&page->mapping->private_lock); - radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); -out: - spin_unlock(&tree->buffer_lock); + if ((mask & GFP_NOFS) == GFP_NOFS) + mask = GFP_NOFS; - /* at this point we can safely release the extent buffer */ - if (atomic_read(&eb->refs) == 0) - call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); - return ret; + /* + * If tree ref isn't set then we know the ref on this eb is a real ref, + * so just return, this page will likely be freed soon anyway. + */ + if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + spin_unlock(&eb->refs_lock); + return 0; + } + release_extent_buffer(eb, mask); + + return 1; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 3a171c25927..faf10eb57f7 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -35,6 +35,10 @@ #define EXTENT_BUFFER_DIRTY 2 #define EXTENT_BUFFER_CORRUPT 3 #define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ +#define EXTENT_BUFFER_TREE_REF 5 +#define EXTENT_BUFFER_STALE 6 +#define EXTENT_BUFFER_WRITEBACK 7 +#define EXTENT_BUFFER_IOERR 8 /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 @@ -54,6 +58,7 @@ #define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 struct extent_state; +struct btrfs_root; typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, struct bio *bio, int mirror_num, @@ -69,9 +74,7 @@ struct extent_io_ops { size_t size, struct bio *bio, unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); - int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, int failed_mirror, - struct extent_state *state); + int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, u64 start, u64 end, struct extent_state *state); @@ -97,6 +100,7 @@ struct extent_io_tree { struct radix_tree_root buffer; struct address_space *mapping; u64 dirty_bytes; + int track_uptodate; spinlock_t lock; spinlock_t buffer_lock; struct extent_io_ops *ops; @@ -119,16 +123,21 @@ struct extent_state { struct list_head leak_list; }; +#define INLINE_EXTENT_BUFFER_PAGES 16 +#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE) struct extent_buffer { u64 start; unsigned long len; unsigned long map_start; unsigned long map_len; - struct page *first_page; unsigned long bflags; + struct extent_io_tree *tree; + spinlock_t refs_lock; + atomic_t refs; + atomic_t io_pages; + int failed_mirror; struct list_head leak_list; struct rcu_head rcu_head; - atomic_t refs; pid_t lock_owner; /* count of read lock holders on the extent buffer */ @@ -152,6 +161,9 @@ struct extent_buffer { * to unlock */ wait_queue_head_t read_lock_wq; + wait_queue_head_t lock_wq; + struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES]; + struct page **pages; }; static inline void extent_set_compress_type(unsigned long *bio_flags, @@ -178,7 +190,7 @@ void extent_io_tree_init(struct extent_io_tree *tree, int try_release_extent_mapping(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); -int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page); +int try_release_extent_buffer(struct page *page, gfp_t mask); int try_release_extent_state(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); @@ -239,6 +251,8 @@ int extent_writepages(struct extent_io_tree *tree, struct address_space *mapping, get_extent_t *get_extent, struct writeback_control *wbc); +int btree_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc); int extent_readpages(struct extent_io_tree *tree, struct address_space *mapping, struct list_head *pages, unsigned nr_pages, @@ -250,11 +264,11 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len, - struct page *page0); + u64 start, unsigned long len); struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); void free_extent_buffer(struct extent_buffer *eb); +void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_NONE 0 #define WAIT_COMPLETE 1 #define WAIT_PAGE_LOCK 2 @@ -287,18 +301,11 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, void memset_extent_buffer(struct extent_buffer *eb, char c, unsigned long start, unsigned long len); void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); -void clear_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb); -int set_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb); -int set_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb); -int clear_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state **cached_state); -int extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state *cached_state); +void clear_extent_buffer_dirty(struct extent_buffer *eb); +int set_extent_buffer_dirty(struct extent_buffer *eb); +int set_extent_buffer_uptodate(struct extent_buffer *eb); +int clear_extent_buffer_uptodate(struct extent_buffer *eb); +int extent_buffer_uptodate(struct extent_buffer *eb); int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long min_len, char **map, unsigned long *map_start, @@ -319,4 +326,6 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, u64 length, u64 logical, struct page *page, int mirror_num); int end_extent_writepage(struct page *page, int err, u64 start, u64 end); +int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, + int mirror_num); #endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 7ec58bd7c50..a13cf1a96c7 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -19,6 +19,7 @@ #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "print-tree.h" static int find_name_in_backref(struct btrfs_path *path, const char *name, int name_len, struct btrfs_inode_ref **ref_ret) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d6420cca9c8..eb6aec7bbac 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -658,8 +658,7 @@ retry: ret = btrfs_reserve_extent(trans, root, async_extent->compressed_size, async_extent->compressed_size, - 0, alloc_hint, - (u64)-1, &ins, 1); + 0, alloc_hint, &ins, 1); if (ret) btrfs_abort_transaction(trans, root, ret); btrfs_end_transaction(trans, root); @@ -884,7 +883,7 @@ static noinline int cow_file_range(struct inode *inode, cur_alloc_size = disk_num_bytes; ret = btrfs_reserve_extent(trans, root, cur_alloc_size, root->sectorsize, 0, alloc_hint, - (u64)-1, &ins, 1); + &ins, 1); if (ret < 0) { btrfs_abort_transaction(trans, root, ret); goto out_unlock; @@ -5574,7 +5573,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, alloc_hint = get_extent_allocation_hint(inode, start, len); ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, - alloc_hint, (u64)-1, &ins, 1); + alloc_hint, &ins, 1); if (ret) { em = ERR_PTR(ret); goto out; @@ -6939,6 +6938,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_map_tree_init(&ei->extent_tree); extent_io_tree_init(&ei->io_tree, &inode->i_data); extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); + ei->io_tree.track_uptodate = 1; + ei->io_failure_tree.track_uptodate = 1; mutex_init(&ei->log_mutex); mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); @@ -7480,7 +7481,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, - 0, *alloc_hint, (u64)-1, &ins, 1); + 0, *alloc_hint, &ins, 1); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 22db04550f6..dc5d33146fd 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -54,7 +54,6 @@ * than the 2 started one after another. */ -#define MAX_MIRRORS 2 #define MAX_IN_FLIGHT 6 struct reada_extctl { @@ -71,7 +70,7 @@ struct reada_extent { struct list_head extctl; struct kref refcnt; spinlock_t lock; - struct reada_zone *zones[MAX_MIRRORS]; + struct reada_zone *zones[BTRFS_MAX_MIRRORS]; int nzones; struct btrfs_device *scheduled_for; }; @@ -84,7 +83,8 @@ struct reada_zone { spinlock_t lock; int locked; struct btrfs_device *device; - struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */ + struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl + * self */ int ndevs; struct kref refcnt; }; @@ -365,9 +365,9 @@ again: if (ret || !bbio || length < blocksize) goto error; - if (bbio->num_stripes > MAX_MIRRORS) { + if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { printk(KERN_ERR "btrfs readahead: more than %d copies not " - "supported", MAX_MIRRORS); + "supported", BTRFS_MAX_MIRRORS); goto error; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0209d8a9ae3..07e59d97551 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -36,37 +36,30 @@ * Future enhancements: * - In case an unrepairable extent is encountered, track which files are * affected and report them - * - In case of a read error on files with nodatasum, map the file and read - * the extent to trigger a writeback of the good copy * - track and record media errors, throw out bad devices * - add a mode to also read unallocated space */ -struct scrub_bio; -struct scrub_page; +struct scrub_block; struct scrub_dev; -static void scrub_bio_end_io(struct bio *bio, int err); -static void scrub_checksum(struct btrfs_work *work); -static int scrub_checksum_data(struct scrub_dev *sdev, - struct scrub_page *spag, void *buffer); -static int scrub_checksum_tree_block(struct scrub_dev *sdev, - struct scrub_page *spag, u64 logical, - void *buffer); -static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer); -static int scrub_fixup_check(struct scrub_bio *sbio, int ix); -static void scrub_fixup_end_io(struct bio *bio, int err); -static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, - struct page *page); -static void scrub_fixup(struct scrub_bio *sbio, int ix); #define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ #define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ +#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ struct scrub_page { + struct scrub_block *sblock; + struct page *page; + struct block_device *bdev; u64 flags; /* extent flags */ u64 generation; - int mirror_num; - int have_csum; + u64 logical; + u64 physical; + struct { + unsigned int mirror_num:8; + unsigned int have_csum:1; + unsigned int io_error:1; + }; u8 csum[BTRFS_CSUM_SIZE]; }; @@ -77,12 +70,25 @@ struct scrub_bio { int err; u64 logical; u64 physical; - struct scrub_page spag[SCRUB_PAGES_PER_BIO]; - u64 count; + struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; + int page_count; int next_free; struct btrfs_work work; }; +struct scrub_block { + struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; + int page_count; + atomic_t outstanding_pages; + atomic_t ref_count; /* free mem on transition to zero */ + struct scrub_dev *sdev; + struct { + unsigned int header_error:1; + unsigned int checksum_error:1; + unsigned int no_io_error_seen:1; + }; +}; + struct scrub_dev { struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; struct btrfs_device *dev; @@ -96,6 +102,10 @@ struct scrub_dev { struct list_head csum_list; atomic_t cancel_req; int readonly; + int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ + u32 sectorsize; + u32 nodesize; + u32 leafsize; /* * statistics */ @@ -124,6 +134,43 @@ struct scrub_warning { int scratch_bufsize; }; + +static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); +static int scrub_setup_recheck_block(struct scrub_dev *sdev, + struct btrfs_mapping_tree *map_tree, + u64 length, u64 logical, + struct scrub_block *sblock); +static int scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size); +static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, + int is_metadata, int have_csum, + const u8 *csum, u64 generation, + u16 csum_size); +static void scrub_complete_bio_end_io(struct bio *bio, int err); +static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int force_write); +static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int page_num, int force_write); +static int scrub_checksum_data(struct scrub_block *sblock); +static int scrub_checksum_tree_block(struct scrub_block *sblock); +static int scrub_checksum_super(struct scrub_block *sblock); +static void scrub_block_get(struct scrub_block *sblock); +static void scrub_block_put(struct scrub_block *sblock); +static int scrub_add_page_to_bio(struct scrub_dev *sdev, + struct scrub_page *spage); +static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, + u64 physical, u64 flags, u64 gen, int mirror_num, + u8 *csum, int force); +static void scrub_bio_end_io(struct bio *bio, int err); +static void scrub_bio_end_io_worker(struct btrfs_work *work); +static void scrub_block_complete(struct scrub_block *sblock); + + static void scrub_free_csums(struct scrub_dev *sdev) { while (!list_empty(&sdev->csum_list)) { @@ -135,23 +182,6 @@ static void scrub_free_csums(struct scrub_dev *sdev) } } -static void scrub_free_bio(struct bio *bio) -{ - int i; - struct page *last_page = NULL; - - if (!bio) - return; - - for (i = 0; i < bio->bi_vcnt; ++i) { - if (bio->bi_io_vec[i].bv_page == last_page) - continue; - last_page = bio->bi_io_vec[i].bv_page; - __free_page(last_page); - } - bio_put(bio); -} - static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) { int i; @@ -159,13 +189,23 @@ static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) if (!sdev) return; + /* this can happen when scrub is cancelled */ + if (sdev->curr != -1) { + struct scrub_bio *sbio = sdev->bios[sdev->curr]; + + for (i = 0; i < sbio->page_count; i++) { + BUG_ON(!sbio->pagev[i]); + BUG_ON(!sbio->pagev[i]->page); + scrub_block_put(sbio->pagev[i]->sblock); + } + bio_put(sbio->bio); + } + for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { struct scrub_bio *sbio = sdev->bios[i]; if (!sbio) break; - - scrub_free_bio(sbio->bio); kfree(sbio); } @@ -179,11 +219,16 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) struct scrub_dev *sdev; int i; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + int pages_per_bio; + pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, + bio_get_nr_vecs(dev->bdev)); sdev = kzalloc(sizeof(*sdev), GFP_NOFS); if (!sdev) goto nomem; sdev->dev = dev; + sdev->pages_per_bio = pages_per_bio; + sdev->curr = -1; for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { struct scrub_bio *sbio; @@ -194,8 +239,8 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) sbio->index = i; sbio->sdev = sdev; - sbio->count = 0; - sbio->work.func = scrub_checksum; + sbio->page_count = 0; + sbio->work.func = scrub_bio_end_io_worker; if (i != SCRUB_BIOS_PER_DEV-1) sdev->bios[i]->next_free = i + 1; @@ -203,7 +248,9 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) sdev->bios[i]->next_free = -1; } sdev->first_free = 0; - sdev->curr = -1; + sdev->nodesize = dev->dev_root->nodesize; + sdev->leafsize = dev->dev_root->leafsize; + sdev->sectorsize = dev->dev_root->sectorsize; atomic_set(&sdev->in_flight, 0); atomic_set(&sdev->fixup_cnt, 0); atomic_set(&sdev->cancel_req, 0); @@ -294,10 +341,9 @@ err: return 0; } -static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, - int ix) +static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) { - struct btrfs_device *dev = sbio->sdev->dev; + struct btrfs_device *dev = sblock->sdev->dev; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; struct btrfs_path *path; struct btrfs_key found_key; @@ -316,8 +362,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); - swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9; - swarn.logical = sbio->logical + ix * PAGE_SIZE; + BUG_ON(sblock->page_count < 1); + swarn.sector = (sblock->pagev[0].physical) >> 9; + swarn.logical = sblock->pagev[0].logical; swarn.errstr = errstr; swarn.dev = dev; swarn.msg_bufsize = bufsize; @@ -342,7 +389,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, do { ret = tree_backref_for_extent(&ptr, eb, ei, item_size, &ref_root, &ref_level); - printk(KERN_WARNING "%s at logical %llu on dev %s, " + printk(KERN_WARNING + "btrfs: %s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " "%llu\n", errstr, swarn.logical, dev->name, (unsigned long long)swarn.sector, @@ -531,9 +579,9 @@ out: spin_lock(&sdev->stat_lock); ++sdev->stat.uncorrectable_errors; spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR "btrfs: unable to fixup " - "(nodatasum) error at logical %llu\n", - fixup->logical); + printk_ratelimited(KERN_ERR + "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", + (unsigned long long)fixup->logical, sdev->dev->name); } btrfs_free_path(path); @@ -550,91 +598,168 @@ out: } /* - * scrub_recheck_error gets called when either verification of the page - * failed or the bio failed to read, e.g. with EIO. In the latter case, - * recheck_error gets called for every page in the bio, even though only - * one may be bad + * scrub_handle_errored_block gets called when either verification of the + * pages failed or the bio failed to read, e.g. with EIO. In the latter + * case, this function handles all pages in the bio, even though only one + * may be bad. + * The goal of this function is to repair the errored block by using the + * contents of one of the mirrors. */ -static int scrub_recheck_error(struct scrub_bio *sbio, int ix) +static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) { - struct scrub_dev *sdev = sbio->sdev; - u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + struct scrub_dev *sdev = sblock_to_check->sdev; + struct btrfs_fs_info *fs_info; + u64 length; + u64 logical; + u64 generation; + unsigned int failed_mirror_index; + unsigned int is_metadata; + unsigned int have_csum; + u8 *csum; + struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */ + struct scrub_block *sblock_bad; + int ret; + int mirror_index; + int page_num; + int success; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); + DEFAULT_RATELIMIT_BURST); + + BUG_ON(sblock_to_check->page_count < 1); + fs_info = sdev->dev->dev_root->fs_info; + length = sblock_to_check->page_count * PAGE_SIZE; + logical = sblock_to_check->pagev[0].logical; + generation = sblock_to_check->pagev[0].generation; + BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); + failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; + is_metadata = !(sblock_to_check->pagev[0].flags & + BTRFS_EXTENT_FLAG_DATA); + have_csum = sblock_to_check->pagev[0].have_csum; + csum = sblock_to_check->pagev[0].csum; - if (sbio->err) { - if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, - sbio->bio->bi_io_vec[ix].bv_page) == 0) { - if (scrub_fixup_check(sbio, ix) == 0) - return 0; - } - if (__ratelimit(&_rs)) - scrub_print_warning("i/o error", sbio, ix); - } else { - if (__ratelimit(&_rs)) - scrub_print_warning("checksum error", sbio, ix); + /* + * read all mirrors one after the other. This includes to + * re-read the extent or metadata block that failed (that was + * the cause that this fixup code is called) another time, + * page by page this time in order to know which pages + * caused I/O errors and which ones are good (for all mirrors). + * It is the goal to handle the situation when more than one + * mirror contains I/O errors, but the errors do not + * overlap, i.e. the data can be repaired by selecting the + * pages from those mirrors without I/O error on the + * particular pages. One example (with blocks >= 2 * PAGE_SIZE) + * would be that mirror #1 has an I/O error on the first page, + * the second page is good, and mirror #2 has an I/O error on + * the second page, but the first page is good. + * Then the first page of the first mirror can be repaired by + * taking the first page of the second mirror, and the + * second page of the second mirror can be repaired by + * copying the contents of the 2nd page of the 1st mirror. + * One more note: if the pages of one mirror contain I/O + * errors, the checksum cannot be verified. In order to get + * the best data for repairing, the first attempt is to find + * a mirror without I/O errors and with a validated checksum. + * Only if this is not possible, the pages are picked from + * mirrors with I/O errors without considering the checksum. + * If the latter is the case, at the end, the checksum of the + * repaired area is verified in order to correctly maintain + * the statistics. + */ + + sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS * + sizeof(*sblocks_for_recheck), + GFP_NOFS); + if (!sblocks_for_recheck) { + spin_lock(&sdev->stat_lock); + sdev->stat.malloc_errors++; + sdev->stat.read_errors++; + sdev->stat.uncorrectable_errors++; + spin_unlock(&sdev->stat_lock); + goto out; } - spin_lock(&sdev->stat_lock); - ++sdev->stat.read_errors; - spin_unlock(&sdev->stat_lock); + /* setup the context, map the logical blocks and alloc the pages */ + ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, + logical, sblocks_for_recheck); + if (ret) { + spin_lock(&sdev->stat_lock); + sdev->stat.read_errors++; + sdev->stat.uncorrectable_errors++; + spin_unlock(&sdev->stat_lock); + goto out; + } + BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); + sblock_bad = sblocks_for_recheck + failed_mirror_index; - scrub_fixup(sbio, ix); - return 1; -} + /* build and submit the bios for the failed mirror, check checksums */ + ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, + csum, generation, sdev->csum_size); + if (ret) { + spin_lock(&sdev->stat_lock); + sdev->stat.read_errors++; + sdev->stat.uncorrectable_errors++; + spin_unlock(&sdev->stat_lock); + goto out; + } -static int scrub_fixup_check(struct scrub_bio *sbio, int ix) -{ - int ret = 1; - struct page *page; - void *buffer; - u64 flags = sbio->spag[ix].flags; + if (!sblock_bad->header_error && !sblock_bad->checksum_error && + sblock_bad->no_io_error_seen) { + /* + * the error disappeared after reading page by page, or + * the area was part of a huge bio and other parts of the + * bio caused I/O errors, or the block layer merged several + * read requests into one and the error is caused by a + * different bio (usually one of the two latter cases is + * the cause) + */ + spin_lock(&sdev->stat_lock); + sdev->stat.unverified_errors++; + spin_unlock(&sdev->stat_lock); - page = sbio->bio->bi_io_vec[ix].bv_page; - buffer = kmap_atomic(page, KM_USER0); - if (flags & BTRFS_EXTENT_FLAG_DATA) { - ret = scrub_checksum_data(sbio->sdev, - sbio->spag + ix, buffer); - } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ret = scrub_checksum_tree_block(sbio->sdev, - sbio->spag + ix, - sbio->logical + ix * PAGE_SIZE, - buffer); - } else { - WARN_ON(1); + goto out; } - kunmap_atomic(buffer, KM_USER0); - return ret; -} + if (!sblock_bad->no_io_error_seen) { + spin_lock(&sdev->stat_lock); + sdev->stat.read_errors++; + spin_unlock(&sdev->stat_lock); + if (__ratelimit(&_rs)) + scrub_print_warning("i/o error", sblock_to_check); + } else if (sblock_bad->checksum_error) { + spin_lock(&sdev->stat_lock); + sdev->stat.csum_errors++; + spin_unlock(&sdev->stat_lock); + if (__ratelimit(&_rs)) + scrub_print_warning("checksum error", sblock_to_check); + } else if (sblock_bad->header_error) { + spin_lock(&sdev->stat_lock); + sdev->stat.verify_errors++; + spin_unlock(&sdev->stat_lock); + if (__ratelimit(&_rs)) + scrub_print_warning("checksum/header error", + sblock_to_check); + } -static void scrub_fixup_end_io(struct bio *bio, int err) -{ - complete((struct completion *)bio->bi_private); -} + if (sdev->readonly) + goto did_not_correct_error; + + if (!is_metadata && !have_csum) { + struct scrub_fixup_nodatasum *fixup_nodatasum; -static void scrub_fixup(struct scrub_bio *sbio, int ix) -{ - struct scrub_dev *sdev = sbio->sdev; - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct btrfs_bio *bbio = NULL; - struct scrub_fixup_nodatasum *fixup; - u64 logical = sbio->logical + ix * PAGE_SIZE; - u64 length; - int i; - int ret; - DECLARE_COMPLETION_ONSTACK(complete); - - if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && - (sbio->spag[ix].have_csum == 0)) { - fixup = kzalloc(sizeof(*fixup), GFP_NOFS); - if (!fixup) - goto uncorrectable; - fixup->sdev = sdev; - fixup->logical = logical; - fixup->root = fs_info->extent_root; - fixup->mirror_num = sbio->spag[ix].mirror_num; + /* + * !is_metadata and !have_csum, this means that the data + * might not be COW'ed, that it might be modified + * concurrently. The general strategy to work on the + * commit root does not help in the case when COW is not + * used. + */ + fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); + if (!fixup_nodatasum) + goto did_not_correct_error; + fixup_nodatasum->sdev = sdev; + fixup_nodatasum->logical = logical; + fixup_nodatasum->root = fs_info->extent_root; + fixup_nodatasum->mirror_num = failed_mirror_index + 1; /* * increment scrubs_running to prevent cancel requests from * completing as long as a fixup worker is running. we must also @@ -649,235 +774,529 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) atomic_inc(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); atomic_inc(&sdev->fixup_cnt); - fixup->work.func = scrub_fixup_nodatasum; - btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work); - return; + fixup_nodatasum->work.func = scrub_fixup_nodatasum; + btrfs_queue_worker(&fs_info->scrub_workers, + &fixup_nodatasum->work); + goto out; } - length = PAGE_SIZE; - ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, - &bbio, 0); - if (ret || !bbio || length < PAGE_SIZE) { - printk(KERN_ERR - "scrub_fixup: btrfs_map_block failed us for %llu\n", - (unsigned long long)logical); - WARN_ON(1); - kfree(bbio); - return; + /* + * now build and submit the bios for the other mirrors, check + * checksums + */ + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + if (mirror_index == failed_mirror_index) + continue; + + /* build and submit the bios, check checksums */ + ret = scrub_recheck_block(fs_info, + sblocks_for_recheck + mirror_index, + is_metadata, have_csum, csum, + generation, sdev->csum_size); + if (ret) + goto did_not_correct_error; } - if (bbio->num_stripes == 1) - /* there aren't any replicas */ - goto uncorrectable; + /* + * first try to pick the mirror which is completely without I/O + * errors and also does not have a checksum error. + * If one is found, and if a checksum is present, the full block + * that is known to contain an error is rewritten. Afterwards + * the block is known to be corrected. + * If a mirror is found which is completely correct, and no + * checksum is present, only those pages are rewritten that had + * an I/O error in the block to be repaired, since it cannot be + * determined, which copy of the other pages is better (and it + * could happen otherwise that a correct page would be + * overwritten by a bad one). + */ + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + struct scrub_block *sblock_other = sblocks_for_recheck + + mirror_index; + + if (!sblock_other->header_error && + !sblock_other->checksum_error && + sblock_other->no_io_error_seen) { + int force_write = is_metadata || have_csum; + + ret = scrub_repair_block_from_good_copy(sblock_bad, + sblock_other, + force_write); + if (0 == ret) + goto corrected_error; + } + } /* - * first find a good copy + * in case of I/O errors in the area that is supposed to be + * repaired, continue by picking good copies of those pages. + * Select the good pages from mirrors to rewrite bad pages from + * the area to fix. Afterwards verify the checksum of the block + * that is supposed to be repaired. This verification step is + * only done for the purpose of statistic counting and for the + * final scrub report, whether errors remain. + * A perfect algorithm could make use of the checksum and try + * all possible combinations of pages from the different mirrors + * until the checksum verification succeeds. For example, when + * the 2nd page of mirror #1 faces I/O errors, and the 2nd page + * of mirror #2 is readable but the final checksum test fails, + * then the 2nd page of mirror #3 could be tried, whether now + * the final checksum succeedes. But this would be a rare + * exception and is therefore not implemented. At least it is + * avoided that the good copy is overwritten. + * A more useful improvement would be to pick the sectors + * without I/O error based on sector sizes (512 bytes on legacy + * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one + * mirror could be repaired by taking 512 byte of a different + * mirror, even if other 512 byte sectors in the same PAGE_SIZE + * area are unreadable. */ - for (i = 0; i < bbio->num_stripes; ++i) { - if (i + 1 == sbio->spag[ix].mirror_num) - continue; - if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, - bbio->stripes[i].physical >> 9, - sbio->bio->bi_io_vec[ix].bv_page)) { - /* I/O-error, this is not a good copy */ + /* can only fix I/O errors from here on */ + if (sblock_bad->no_io_error_seen) + goto did_not_correct_error; + + success = 1; + for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + struct scrub_page *page_bad = sblock_bad->pagev + page_num; + + if (!page_bad->io_error) continue; + + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + struct scrub_block *sblock_other = sblocks_for_recheck + + mirror_index; + struct scrub_page *page_other = sblock_other->pagev + + page_num; + + if (!page_other->io_error) { + ret = scrub_repair_page_from_good_copy( + sblock_bad, sblock_other, page_num, 0); + if (0 == ret) { + page_bad->io_error = 0; + break; /* succeeded for this page */ + } + } } - if (scrub_fixup_check(sbio, ix) == 0) - break; + if (page_bad->io_error) { + /* did not find a mirror to copy the page from */ + success = 0; + } } - if (i == bbio->num_stripes) - goto uncorrectable; - if (!sdev->readonly) { - /* - * bi_io_vec[ix].bv_page now contains good data, write it back - */ - if (scrub_fixup_io(WRITE, sdev->dev->bdev, - (sbio->physical + ix * PAGE_SIZE) >> 9, - sbio->bio->bi_io_vec[ix].bv_page)) { - /* I/O-error, writeback failed, give up */ - goto uncorrectable; + if (success) { + if (is_metadata || have_csum) { + /* + * need to verify the checksum now that all + * sectors on disk are repaired (the write + * request for data to be repaired is on its way). + * Just be lazy and use scrub_recheck_block() + * which re-reads the data before the checksum + * is verified, but most likely the data comes out + * of the page cache. + */ + ret = scrub_recheck_block(fs_info, sblock_bad, + is_metadata, have_csum, csum, + generation, sdev->csum_size); + if (!ret && !sblock_bad->header_error && + !sblock_bad->checksum_error && + sblock_bad->no_io_error_seen) + goto corrected_error; + else + goto did_not_correct_error; + } else { +corrected_error: + spin_lock(&sdev->stat_lock); + sdev->stat.corrected_errors++; + spin_unlock(&sdev->stat_lock); + printk_ratelimited(KERN_ERR + "btrfs: fixed up error at logical %llu on dev %s\n", + (unsigned long long)logical, sdev->dev->name); } + } else { +did_not_correct_error: + spin_lock(&sdev->stat_lock); + sdev->stat.uncorrectable_errors++; + spin_unlock(&sdev->stat_lock); + printk_ratelimited(KERN_ERR + "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", + (unsigned long long)logical, sdev->dev->name); } - kfree(bbio); - spin_lock(&sdev->stat_lock); - ++sdev->stat.corrected_errors; - spin_unlock(&sdev->stat_lock); +out: + if (sblocks_for_recheck) { + for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; + mirror_index++) { + struct scrub_block *sblock = sblocks_for_recheck + + mirror_index; + int page_index; + + for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; + page_index++) + if (sblock->pagev[page_index].page) + __free_page( + sblock->pagev[page_index].page); + } + kfree(sblocks_for_recheck); + } - printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n", - (unsigned long long)logical); - return; + return 0; +} -uncorrectable: - kfree(bbio); - spin_lock(&sdev->stat_lock); - ++sdev->stat.uncorrectable_errors; - spin_unlock(&sdev->stat_lock); +static int scrub_setup_recheck_block(struct scrub_dev *sdev, + struct btrfs_mapping_tree *map_tree, + u64 length, u64 logical, + struct scrub_block *sblocks_for_recheck) +{ + int page_index; + int mirror_index; + int ret; + + /* + * note: the three members sdev, ref_count and outstanding_pages + * are not used (and not set) in the blocks that are used for + * the recheck procedure + */ + + page_index = 0; + while (length > 0) { + u64 sublen = min_t(u64, length, PAGE_SIZE); + u64 mapped_length = sublen; + struct btrfs_bio *bbio = NULL; + + /* + * with a length of PAGE_SIZE, each returned stripe + * represents one mirror + */ + ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, + &bbio, 0); + if (ret || !bbio || mapped_length < sublen) { + kfree(bbio); + return -EIO; + } + + BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); + for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; + mirror_index++) { + struct scrub_block *sblock; + struct scrub_page *page; + + if (mirror_index >= BTRFS_MAX_MIRRORS) + continue; + + sblock = sblocks_for_recheck + mirror_index; + page = sblock->pagev + page_index; + page->logical = logical; + page->physical = bbio->stripes[mirror_index].physical; + page->bdev = bbio->stripes[mirror_index].dev->bdev; + page->mirror_num = mirror_index + 1; + page->page = alloc_page(GFP_NOFS); + if (!page->page) { + spin_lock(&sdev->stat_lock); + sdev->stat.malloc_errors++; + spin_unlock(&sdev->stat_lock); + return -ENOMEM; + } + sblock->page_count++; + } + kfree(bbio); + length -= sublen; + logical += sublen; + page_index++; + } - printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at " - "logical %llu\n", (unsigned long long)logical); + return 0; } -static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, - struct page *page) +/* + * this function will check the on disk data for checksum errors, header + * errors and read I/O errors. If any I/O errors happen, the exact pages + * which are errored are marked as being bad. The goal is to enable scrub + * to take those pages that are not errored from all the mirrors so that + * the pages that are errored in the just handled mirror can be repaired. + */ +static int scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size) { - struct bio *bio = NULL; - int ret; - DECLARE_COMPLETION_ONSTACK(complete); + int page_num; - bio = bio_alloc(GFP_NOFS, 1); - bio->bi_bdev = bdev; - bio->bi_sector = sector; - bio_add_page(bio, page, PAGE_SIZE, 0); - bio->bi_end_io = scrub_fixup_end_io; - bio->bi_private = &complete; - btrfsic_submit_bio(rw, bio); + sblock->no_io_error_seen = 1; + sblock->header_error = 0; + sblock->checksum_error = 0; - /* this will also unplug the queue */ - wait_for_completion(&complete); + for (page_num = 0; page_num < sblock->page_count; page_num++) { + struct bio *bio; + int ret; + struct scrub_page *page = sblock->pagev + page_num; + DECLARE_COMPLETION_ONSTACK(complete); + + BUG_ON(!page->page); + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_bdev = page->bdev; + bio->bi_sector = page->physical >> 9; + bio->bi_end_io = scrub_complete_bio_end_io; + bio->bi_private = &complete; + + ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); + if (PAGE_SIZE != ret) { + bio_put(bio); + return -EIO; + } + btrfsic_submit_bio(READ, bio); - ret = !test_bit(BIO_UPTODATE, &bio->bi_flags); - bio_put(bio); - return ret; + /* this will also unplug the queue */ + wait_for_completion(&complete); + + page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags); + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + sblock->no_io_error_seen = 0; + bio_put(bio); + } + + if (sblock->no_io_error_seen) + scrub_recheck_block_checksum(fs_info, sblock, is_metadata, + have_csum, csum, generation, + csum_size); + + return 0; } -static void scrub_bio_end_io(struct bio *bio, int err) +static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, + int is_metadata, int have_csum, + const u8 *csum, u64 generation, + u16 csum_size) { - struct scrub_bio *sbio = bio->bi_private; - struct scrub_dev *sdev = sbio->sdev; - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + int page_num; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + struct btrfs_root *root = fs_info->extent_root; + void *mapped_buffer; + + BUG_ON(!sblock->pagev[0].page); + if (is_metadata) { + struct btrfs_header *h; + + mapped_buffer = kmap_atomic(sblock->pagev[0].page, KM_USER0); + h = (struct btrfs_header *)mapped_buffer; + + if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || + generation != le64_to_cpu(h->generation) || + memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || + memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, + BTRFS_UUID_SIZE)) + sblock->header_error = 1; + csum = h->csum; + } else { + if (!have_csum) + return; - sbio->err = err; - sbio->bio = bio; + mapped_buffer = kmap_atomic(sblock->pagev[0].page, KM_USER0); + } - btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); + for (page_num = 0;;) { + if (page_num == 0 && is_metadata) + crc = btrfs_csum_data(root, + ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE, + crc, PAGE_SIZE - BTRFS_CSUM_SIZE); + else + crc = btrfs_csum_data(root, mapped_buffer, crc, + PAGE_SIZE); + + kunmap_atomic(mapped_buffer, KM_USER0); + page_num++; + if (page_num >= sblock->page_count) + break; + BUG_ON(!sblock->pagev[page_num].page); + + mapped_buffer = kmap_atomic(sblock->pagev[page_num].page, + KM_USER0); + } + + btrfs_csum_final(crc, calculated_csum); + if (memcmp(calculated_csum, csum, csum_size)) + sblock->checksum_error = 1; } -static void scrub_checksum(struct btrfs_work *work) +static void scrub_complete_bio_end_io(struct bio *bio, int err) { - struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); - struct scrub_dev *sdev = sbio->sdev; - struct page *page; - void *buffer; - int i; - u64 flags; - u64 logical; - int ret; + complete((struct completion *)bio->bi_private); +} - if (sbio->err) { - ret = 0; - for (i = 0; i < sbio->count; ++i) - ret |= scrub_recheck_error(sbio, i); - if (!ret) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.unverified_errors; - spin_unlock(&sdev->stat_lock); - } +static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int force_write) +{ + int page_num; + int ret = 0; - sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); - sbio->bio->bi_flags |= 1 << BIO_UPTODATE; - sbio->bio->bi_phys_segments = 0; - sbio->bio->bi_idx = 0; + for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + int ret_sub; - for (i = 0; i < sbio->count; i++) { - struct bio_vec *bi; - bi = &sbio->bio->bi_io_vec[i]; - bi->bv_offset = 0; - bi->bv_len = PAGE_SIZE; - } - goto out; + ret_sub = scrub_repair_page_from_good_copy(sblock_bad, + sblock_good, + page_num, + force_write); + if (ret_sub) + ret = ret_sub; } - for (i = 0; i < sbio->count; ++i) { - page = sbio->bio->bi_io_vec[i].bv_page; - buffer = kmap_atomic(page, KM_USER0); - flags = sbio->spag[i].flags; - logical = sbio->logical + i * PAGE_SIZE; - ret = 0; - if (flags & BTRFS_EXTENT_FLAG_DATA) { - ret = scrub_checksum_data(sdev, sbio->spag + i, buffer); - } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ret = scrub_checksum_tree_block(sdev, sbio->spag + i, - logical, buffer); - } else if (flags & BTRFS_EXTENT_FLAG_SUPER) { - BUG_ON(i); - (void)scrub_checksum_super(sbio, buffer); - } else { - WARN_ON(1); - } - kunmap_atomic(buffer, KM_USER0); - if (ret) { - ret = scrub_recheck_error(sbio, i); - if (!ret) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.unverified_errors; - spin_unlock(&sdev->stat_lock); - } + + return ret; +} + +static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int page_num, int force_write) +{ + struct scrub_page *page_bad = sblock_bad->pagev + page_num; + struct scrub_page *page_good = sblock_good->pagev + page_num; + + BUG_ON(sblock_bad->pagev[page_num].page == NULL); + BUG_ON(sblock_good->pagev[page_num].page == NULL); + if (force_write || sblock_bad->header_error || + sblock_bad->checksum_error || page_bad->io_error) { + struct bio *bio; + int ret; + DECLARE_COMPLETION_ONSTACK(complete); + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_bdev = page_bad->bdev; + bio->bi_sector = page_bad->physical >> 9; + bio->bi_end_io = scrub_complete_bio_end_io; + bio->bi_private = &complete; + + ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); + if (PAGE_SIZE != ret) { + bio_put(bio); + return -EIO; } + btrfsic_submit_bio(WRITE, bio); + + /* this will also unplug the queue */ + wait_for_completion(&complete); + bio_put(bio); } -out: - scrub_free_bio(sbio->bio); - sbio->bio = NULL; - spin_lock(&sdev->list_lock); - sbio->next_free = sdev->first_free; - sdev->first_free = sbio->index; - spin_unlock(&sdev->list_lock); - atomic_dec(&sdev->in_flight); - wake_up(&sdev->list_wait); + return 0; } -static int scrub_checksum_data(struct scrub_dev *sdev, - struct scrub_page *spag, void *buffer) +static void scrub_checksum(struct scrub_block *sblock) { + u64 flags; + int ret; + + BUG_ON(sblock->page_count < 1); + flags = sblock->pagev[0].flags; + ret = 0; + if (flags & BTRFS_EXTENT_FLAG_DATA) + ret = scrub_checksum_data(sblock); + else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + ret = scrub_checksum_tree_block(sblock); + else if (flags & BTRFS_EXTENT_FLAG_SUPER) + (void)scrub_checksum_super(sblock); + else + WARN_ON(1); + if (ret) + scrub_handle_errored_block(sblock); +} + +static int scrub_checksum_data(struct scrub_block *sblock) +{ + struct scrub_dev *sdev = sblock->sdev; u8 csum[BTRFS_CSUM_SIZE]; + u8 *on_disk_csum; + struct page *page; + void *buffer; u32 crc = ~(u32)0; int fail = 0; struct btrfs_root *root = sdev->dev->dev_root; + u64 len; + int index; - if (!spag->have_csum) + BUG_ON(sblock->page_count < 1); + if (!sblock->pagev[0].have_csum) return 0; - crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE); + on_disk_csum = sblock->pagev[0].csum; + page = sblock->pagev[0].page; + buffer = kmap_atomic(page, KM_USER0); + + len = sdev->sectorsize; + index = 0; + for (;;) { + u64 l = min_t(u64, len, PAGE_SIZE); + + crc = btrfs_csum_data(root, buffer, crc, l); + kunmap_atomic(buffer, KM_USER0); + len -= l; + if (len == 0) + break; + index++; + BUG_ON(index >= sblock->page_count); + BUG_ON(!sblock->pagev[index].page); + page = sblock->pagev[index].page; + buffer = kmap_atomic(page, KM_USER0); + } + btrfs_csum_final(crc, csum); - if (memcmp(csum, spag->csum, sdev->csum_size)) + if (memcmp(csum, on_disk_csum, sdev->csum_size)) fail = 1; - spin_lock(&sdev->stat_lock); - ++sdev->stat.data_extents_scrubbed; - sdev->stat.data_bytes_scrubbed += PAGE_SIZE; - if (fail) + if (fail) { + spin_lock(&sdev->stat_lock); ++sdev->stat.csum_errors; - spin_unlock(&sdev->stat_lock); + spin_unlock(&sdev->stat_lock); + } return fail; } -static int scrub_checksum_tree_block(struct scrub_dev *sdev, - struct scrub_page *spag, u64 logical, - void *buffer) +static int scrub_checksum_tree_block(struct scrub_block *sblock) { + struct scrub_dev *sdev = sblock->sdev; struct btrfs_header *h; struct btrfs_root *root = sdev->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; - u8 csum[BTRFS_CSUM_SIZE]; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + u8 on_disk_csum[BTRFS_CSUM_SIZE]; + struct page *page; + void *mapped_buffer; + u64 mapped_size; + void *p; u32 crc = ~(u32)0; int fail = 0; int crc_fail = 0; + u64 len; + int index; + + BUG_ON(sblock->page_count < 1); + page = sblock->pagev[0].page; + mapped_buffer = kmap_atomic(page, KM_USER0); + h = (struct btrfs_header *)mapped_buffer; + memcpy(on_disk_csum, h->csum, sdev->csum_size); /* * we don't use the getter functions here, as we * a) don't have an extent buffer and * b) the page is already kmapped */ - h = (struct btrfs_header *)buffer; - if (logical != le64_to_cpu(h->bytenr)) + if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) ++fail; - if (spag->generation != le64_to_cpu(h->generation)) + if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) ++fail; if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -887,51 +1306,99 @@ static int scrub_checksum_tree_block(struct scrub_dev *sdev, BTRFS_UUID_SIZE)) ++fail; - crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, - PAGE_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, csum); - if (memcmp(csum, h->csum, sdev->csum_size)) + BUG_ON(sdev->nodesize != sdev->leafsize); + len = sdev->nodesize - BTRFS_CSUM_SIZE; + mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; + p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; + index = 0; + for (;;) { + u64 l = min_t(u64, len, mapped_size); + + crc = btrfs_csum_data(root, p, crc, l); + kunmap_atomic(mapped_buffer, KM_USER0); + len -= l; + if (len == 0) + break; + index++; + BUG_ON(index >= sblock->page_count); + BUG_ON(!sblock->pagev[index].page); + page = sblock->pagev[index].page; + mapped_buffer = kmap_atomic(page, KM_USER0); + mapped_size = PAGE_SIZE; + p = mapped_buffer; + } + + btrfs_csum_final(crc, calculated_csum); + if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) ++crc_fail; - spin_lock(&sdev->stat_lock); - ++sdev->stat.tree_extents_scrubbed; - sdev->stat.tree_bytes_scrubbed += PAGE_SIZE; - if (crc_fail) - ++sdev->stat.csum_errors; - if (fail) - ++sdev->stat.verify_errors; - spin_unlock(&sdev->stat_lock); + if (crc_fail || fail) { + spin_lock(&sdev->stat_lock); + if (crc_fail) + ++sdev->stat.csum_errors; + if (fail) + ++sdev->stat.verify_errors; + spin_unlock(&sdev->stat_lock); + } return fail || crc_fail; } -static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) +static int scrub_checksum_super(struct scrub_block *sblock) { struct btrfs_super_block *s; - u64 logical; - struct scrub_dev *sdev = sbio->sdev; + struct scrub_dev *sdev = sblock->sdev; struct btrfs_root *root = sdev->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; - u8 csum[BTRFS_CSUM_SIZE]; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + u8 on_disk_csum[BTRFS_CSUM_SIZE]; + struct page *page; + void *mapped_buffer; + u64 mapped_size; + void *p; u32 crc = ~(u32)0; int fail = 0; + u64 len; + int index; - s = (struct btrfs_super_block *)buffer; - logical = sbio->logical; + BUG_ON(sblock->page_count < 1); + page = sblock->pagev[0].page; + mapped_buffer = kmap_atomic(page, KM_USER0); + s = (struct btrfs_super_block *)mapped_buffer; + memcpy(on_disk_csum, s->csum, sdev->csum_size); - if (logical != le64_to_cpu(s->bytenr)) + if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) ++fail; - if (sbio->spag[0].generation != le64_to_cpu(s->generation)) + if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) ++fail; if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) ++fail; - crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, - PAGE_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, csum); - if (memcmp(csum, s->csum, sbio->sdev->csum_size)) + len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; + mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; + p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; + index = 0; + for (;;) { + u64 l = min_t(u64, len, mapped_size); + + crc = btrfs_csum_data(root, p, crc, l); + kunmap_atomic(mapped_buffer, KM_USER0); + len -= l; + if (len == 0) + break; + index++; + BUG_ON(index >= sblock->page_count); + BUG_ON(!sblock->pagev[index].page); + page = sblock->pagev[index].page; + mapped_buffer = kmap_atomic(page, KM_USER0); + mapped_size = PAGE_SIZE; + p = mapped_buffer; + } + + btrfs_csum_final(crc, calculated_csum); + if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) ++fail; if (fail) { @@ -948,6 +1415,23 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) return fail; } +static void scrub_block_get(struct scrub_block *sblock) +{ + atomic_inc(&sblock->ref_count); +} + +static void scrub_block_put(struct scrub_block *sblock) +{ + if (atomic_dec_and_test(&sblock->ref_count)) { + int i; + + for (i = 0; i < sblock->page_count; i++) + if (sblock->pagev[i].page) + __free_page(sblock->pagev[i].page); + kfree(sblock); + } +} + static void scrub_submit(struct scrub_dev *sdev) { struct scrub_bio *sbio; @@ -956,19 +1440,17 @@ static void scrub_submit(struct scrub_dev *sdev) return; sbio = sdev->bios[sdev->curr]; - sbio->err = 0; sdev->curr = -1; atomic_inc(&sdev->in_flight); btrfsic_submit_bio(READ, sbio->bio); } -static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num, - u8 *csum, int force) +static int scrub_add_page_to_bio(struct scrub_dev *sdev, + struct scrub_page *spage) { + struct scrub_block *sblock = spage->sblock; struct scrub_bio *sbio; - struct page *page; int ret; again: @@ -981,7 +1463,7 @@ again: if (sdev->curr != -1) { sdev->first_free = sdev->bios[sdev->curr]->next_free; sdev->bios[sdev->curr]->next_free = -1; - sdev->bios[sdev->curr]->count = 0; + sdev->bios[sdev->curr]->page_count = 0; spin_unlock(&sdev->list_lock); } else { spin_unlock(&sdev->list_lock); @@ -989,53 +1471,200 @@ again: } } sbio = sdev->bios[sdev->curr]; - if (sbio->count == 0) { + if (sbio->page_count == 0) { struct bio *bio; - sbio->physical = physical; - sbio->logical = logical; - bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); - if (!bio) - return -ENOMEM; + sbio->physical = spage->physical; + sbio->logical = spage->logical; + bio = sbio->bio; + if (!bio) { + bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); + if (!bio) + return -ENOMEM; + sbio->bio = bio; + } bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; bio->bi_bdev = sdev->dev->bdev; - bio->bi_sector = sbio->physical >> 9; + bio->bi_sector = spage->physical >> 9; sbio->err = 0; - sbio->bio = bio; - } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || - sbio->logical + sbio->count * PAGE_SIZE != logical) { + } else if (sbio->physical + sbio->page_count * PAGE_SIZE != + spage->physical || + sbio->logical + sbio->page_count * PAGE_SIZE != + spage->logical) { scrub_submit(sdev); goto again; } - sbio->spag[sbio->count].flags = flags; - sbio->spag[sbio->count].generation = gen; - sbio->spag[sbio->count].have_csum = 0; - sbio->spag[sbio->count].mirror_num = mirror_num; - - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0); - if (!ret) { - __free_page(page); + sbio->pagev[sbio->page_count] = spage; + ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); + if (ret != PAGE_SIZE) { + if (sbio->page_count < 1) { + bio_put(sbio->bio); + sbio->bio = NULL; + return -EIO; + } scrub_submit(sdev); goto again; } - if (csum) { - sbio->spag[sbio->count].have_csum = 1; - memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); + scrub_block_get(sblock); /* one for the added page */ + atomic_inc(&sblock->outstanding_pages); + sbio->page_count++; + if (sbio->page_count == sdev->pages_per_bio) + scrub_submit(sdev); + + return 0; +} + +static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, + u64 physical, u64 flags, u64 gen, int mirror_num, + u8 *csum, int force) +{ + struct scrub_block *sblock; + int index; + + sblock = kzalloc(sizeof(*sblock), GFP_NOFS); + if (!sblock) { + spin_lock(&sdev->stat_lock); + sdev->stat.malloc_errors++; + spin_unlock(&sdev->stat_lock); + return -ENOMEM; + } + + /* one ref inside this function, plus one for each page later on */ + atomic_set(&sblock->ref_count, 1); + sblock->sdev = sdev; + sblock->no_io_error_seen = 1; + + for (index = 0; len > 0; index++) { + struct scrub_page *spage = sblock->pagev + index; + u64 l = min_t(u64, len, PAGE_SIZE); + + BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + spage->page = alloc_page(GFP_NOFS); + if (!spage->page) { + spin_lock(&sdev->stat_lock); + sdev->stat.malloc_errors++; + spin_unlock(&sdev->stat_lock); + while (index > 0) { + index--; + __free_page(sblock->pagev[index].page); + } + kfree(sblock); + return -ENOMEM; + } + spage->sblock = sblock; + spage->bdev = sdev->dev->bdev; + spage->flags = flags; + spage->generation = gen; + spage->logical = logical; + spage->physical = physical; + spage->mirror_num = mirror_num; + if (csum) { + spage->have_csum = 1; + memcpy(spage->csum, csum, sdev->csum_size); + } else { + spage->have_csum = 0; + } + sblock->page_count++; + len -= l; + logical += l; + physical += l; + } + + BUG_ON(sblock->page_count == 0); + for (index = 0; index < sblock->page_count; index++) { + struct scrub_page *spage = sblock->pagev + index; + int ret; + + ret = scrub_add_page_to_bio(sdev, spage); + if (ret) { + scrub_block_put(sblock); + return ret; + } } - ++sbio->count; - if (sbio->count == SCRUB_PAGES_PER_BIO || force) + + if (force) scrub_submit(sdev); + /* last one frees, either here or in bio completion for last page */ + scrub_block_put(sblock); return 0; } +static void scrub_bio_end_io(struct bio *bio, int err) +{ + struct scrub_bio *sbio = bio->bi_private; + struct scrub_dev *sdev = sbio->sdev; + struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + + sbio->err = err; + sbio->bio = bio; + + btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); +} + +static void scrub_bio_end_io_worker(struct btrfs_work *work) +{ + struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); + struct scrub_dev *sdev = sbio->sdev; + int i; + + BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); + if (sbio->err) { + for (i = 0; i < sbio->page_count; i++) { + struct scrub_page *spage = sbio->pagev[i]; + + spage->io_error = 1; + spage->sblock->no_io_error_seen = 0; + } + } + + /* now complete the scrub_block items that have all pages completed */ + for (i = 0; i < sbio->page_count; i++) { + struct scrub_page *spage = sbio->pagev[i]; + struct scrub_block *sblock = spage->sblock; + + if (atomic_dec_and_test(&sblock->outstanding_pages)) + scrub_block_complete(sblock); + scrub_block_put(sblock); + } + + if (sbio->err) { + /* what is this good for??? */ + sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); + sbio->bio->bi_flags |= 1 << BIO_UPTODATE; + sbio->bio->bi_phys_segments = 0; + sbio->bio->bi_idx = 0; + + for (i = 0; i < sbio->page_count; i++) { + struct bio_vec *bi; + bi = &sbio->bio->bi_io_vec[i]; + bi->bv_offset = 0; + bi->bv_len = PAGE_SIZE; + } + } + + bio_put(sbio->bio); + sbio->bio = NULL; + spin_lock(&sdev->list_lock); + sbio->next_free = sdev->first_free; + sdev->first_free = sbio->index; + spin_unlock(&sdev->list_lock); + atomic_dec(&sdev->in_flight); + wake_up(&sdev->list_wait); +} + +static void scrub_block_complete(struct scrub_block *sblock) +{ + if (!sblock->no_io_error_seen) + scrub_handle_errored_block(sblock); + else + scrub_checksum(sblock); +} + static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, u8 *csum) { @@ -1043,7 +1672,6 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, int ret = 0; unsigned long i; unsigned long num_sectors; - u32 sectorsize = sdev->dev->dev_root->sectorsize; while (!list_empty(&sdev->csum_list)) { sum = list_first_entry(&sdev->csum_list, @@ -1061,7 +1689,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, if (!sum) return 0; - num_sectors = sum->len / sectorsize; + num_sectors = sum->len / sdev->sectorsize; for (i = 0; i < num_sectors; ++i) { if (sum->sums[i].bytenr == logical) { memcpy(csum, &sum->sums[i].sum, sdev->csum_size); @@ -1082,9 +1710,28 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, { int ret; u8 csum[BTRFS_CSUM_SIZE]; + u32 blocksize; + + if (flags & BTRFS_EXTENT_FLAG_DATA) { + blocksize = sdev->sectorsize; + spin_lock(&sdev->stat_lock); + sdev->stat.data_extents_scrubbed++; + sdev->stat.data_bytes_scrubbed += len; + spin_unlock(&sdev->stat_lock); + } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + BUG_ON(sdev->nodesize != sdev->leafsize); + blocksize = sdev->nodesize; + spin_lock(&sdev->stat_lock); + sdev->stat.tree_extents_scrubbed++; + sdev->stat.tree_bytes_scrubbed += len; + spin_unlock(&sdev->stat_lock); + } else { + blocksize = sdev->sectorsize; + BUG_ON(1); + } while (len) { - u64 l = min_t(u64, len, PAGE_SIZE); + u64 l = min_t(u64, len, blocksize); int have_csum = 0; if (flags & BTRFS_EXTENT_FLAG_DATA) { @@ -1093,8 +1740,8 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, if (have_csum == 0) ++sdev->stat.no_csum; } - ret = scrub_page(sdev, logical, l, physical, flags, gen, - mirror_num, have_csum ? csum : NULL, 0); + ret = scrub_pages(sdev, logical, l, physical, flags, gen, + mirror_num, have_csum ? csum : NULL, 0); if (ret) return ret; len -= l; @@ -1159,6 +1806,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, if (!path) return -ENOMEM; + /* + * work on commit root. The related disk blocks are static as + * long as COW is applied. This means, it is save to rewrite + * them to repair disk errors without any race conditions + */ path->search_commit_root = 1; path->skip_locking = 1; @@ -1512,11 +2164,11 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) + if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) break; - ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr, - BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); + ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, + BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); if (ret) return ret; } @@ -1575,10 +2227,30 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, /* * check some assumptions */ - if (root->sectorsize != PAGE_SIZE || - root->sectorsize != root->leafsize || - root->sectorsize != root->nodesize) { - printk(KERN_ERR "btrfs_scrub: size assumptions fail\n"); + if (root->nodesize != root->leafsize) { + printk(KERN_ERR + "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", + root->nodesize, root->leafsize); + return -EINVAL; + } + + if (root->nodesize > BTRFS_STRIPE_LEN) { + /* + * in this case scrub is unable to calculate the checksum + * the way scrub is implemented. Do not handle this + * situation at all because it won't ever happen. + */ + printk(KERN_ERR + "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", + root->nodesize, BTRFS_STRIPE_LEN); + return -EINVAL; + } + + if (root->sectorsize != PAGE_SIZE) { + /* not supported for data w/o checksums */ + printk(KERN_ERR + "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", + root->sectorsize, (unsigned long long)PAGE_SIZE); return -EINVAL; } @@ -1732,6 +2404,7 @@ int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) return 0; } + int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid) { struct btrfs_fs_info *fs_info = root->fs_info; diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index bc1f6ad1844..c6ffa581241 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -44,8 +44,9 @@ #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \ -u##bits btrfs_##name(struct extent_buffer *eb, \ - type *s) \ +void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token); \ +u##bits btrfs_token_##name(struct extent_buffer *eb, \ + type *s, struct btrfs_map_token *token) \ { \ unsigned long part_offset = (unsigned long)s; \ unsigned long offset = part_offset + offsetof(type, member); \ @@ -54,9 +55,18 @@ u##bits btrfs_##name(struct extent_buffer *eb, \ char *kaddr; \ unsigned long map_start; \ unsigned long map_len; \ + unsigned long mem_len = sizeof(((type *)0)->member); \ u##bits res; \ + if (token && token->kaddr && token->offset <= offset && \ + token->eb == eb && \ + (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \ + kaddr = token->kaddr; \ + p = (type *)(kaddr + part_offset - token->offset); \ + res = le##bits##_to_cpu(p->member); \ + return res; \ + } \ err = map_private_extent_buffer(eb, offset, \ - sizeof(((type *)0)->member), \ + mem_len, \ &kaddr, &map_start, &map_len); \ if (err) { \ __le##bits leres; \ @@ -65,10 +75,15 @@ u##bits btrfs_##name(struct extent_buffer *eb, \ } \ p = (type *)(kaddr + part_offset - map_start); \ res = le##bits##_to_cpu(p->member); \ + if (token) { \ + token->kaddr = kaddr; \ + token->offset = map_start; \ + token->eb = eb; \ + } \ return res; \ } \ -void btrfs_set_##name(struct extent_buffer *eb, \ - type *s, u##bits val) \ +void btrfs_set_token_##name(struct extent_buffer *eb, \ + type *s, u##bits val, struct btrfs_map_token *token) \ { \ unsigned long part_offset = (unsigned long)s; \ unsigned long offset = part_offset + offsetof(type, member); \ @@ -77,8 +92,17 @@ void btrfs_set_##name(struct extent_buffer *eb, \ char *kaddr; \ unsigned long map_start; \ unsigned long map_len; \ + unsigned long mem_len = sizeof(((type *)0)->member); \ + if (token && token->kaddr && token->offset <= offset && \ + token->eb == eb && \ + (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \ + kaddr = token->kaddr; \ + p = (type *)(kaddr + part_offset - token->offset); \ + p->member = cpu_to_le##bits(val); \ + return; \ + } \ err = map_private_extent_buffer(eb, offset, \ - sizeof(((type *)0)->member), \ + mem_len, \ &kaddr, &map_start, &map_len); \ if (err) { \ __le##bits val2; \ @@ -88,7 +112,22 @@ void btrfs_set_##name(struct extent_buffer *eb, \ } \ p = (type *)(kaddr + part_offset - map_start); \ p->member = cpu_to_le##bits(val); \ -} + if (token) { \ + token->kaddr = kaddr; \ + token->offset = map_start; \ + token->eb = eb; \ + } \ +} \ +void btrfs_set_##name(struct extent_buffer *eb, \ + type *s, u##bits val) \ +{ \ + btrfs_set_token_##name(eb, s, val, NULL); \ +} \ +u##bits btrfs_##name(struct extent_buffer *eb, \ + type *s) \ +{ \ + return btrfs_token_##name(eb, s, NULL); \ +} \ #include "ctree.h" diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 57305e88ea8..d64cd6cbdbb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4434,7 +4434,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) * to silence the warning eg. on PowerPC 64. */ if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) - SetPageUptodate(sb->first_page); + SetPageUptodate(sb->pages[0]); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); |