diff options
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 252 |
1 files changed, 178 insertions, 74 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f351422938e..d26767246d2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -488,6 +488,27 @@ get_active_stripe(struct r5conf *conf, sector_t sector, return sh; } +/* Determine if 'data_offset' or 'new_data_offset' should be used + * in this stripe_head. + */ +static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) +{ + sector_t progress = conf->reshape_progress; + /* Need a memory barrier to make sure we see the value + * of conf->generation, or ->data_offset that was set before + * reshape_progress was updated. + */ + smp_rmb(); + if (progress == MaxSector) + return 0; + if (sh->generation == conf->generation - 1) + return 0; + /* We are in a reshape, and this is a new-generation stripe, + * so use new_data_offset. + */ + return 1; +} + static void raid5_end_read_request(struct bio *bi, int error); static void @@ -518,6 +539,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) replace_only = 1; } else continue; + if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) + rw |= REQ_SYNC; bi = &sh->dev[i].req; rbi = &sh->dev[i].rreq; /* For writing to replacement */ @@ -603,7 +626,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) __func__, (unsigned long long)sh->sector, bi->bi_rw, i); atomic_inc(&sh->count); - bi->bi_sector = sh->sector + rdev->data_offset; + if (use_new_offset(conf, sh)) + bi->bi_sector = (sh->sector + + rdev->new_data_offset); + else + bi->bi_sector = (sh->sector + + rdev->data_offset); bi->bi_flags = 1 << BIO_UPTODATE; bi->bi_idx = 0; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; @@ -627,7 +655,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) __func__, (unsigned long long)sh->sector, rbi->bi_rw, i); atomic_inc(&sh->count); - rbi->bi_sector = sh->sector + rrdev->data_offset; + if (use_new_offset(conf, sh)) + rbi->bi_sector = (sh->sector + + rrdev->new_data_offset); + else + rbi->bi_sector = (sh->sector + + rrdev->data_offset); rbi->bi_flags = 1 << BIO_UPTODATE; rbi->bi_idx = 0; rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; @@ -1114,6 +1147,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) dev->sector + STRIPE_SECTORS) { if (wbi->bi_rw & REQ_FUA) set_bit(R5_WantFUA, &dev->flags); + if (wbi->bi_rw & REQ_SYNC) + set_bit(R5_SyncIO, &dev->flags); tx = async_copy_data(1, wbi, dev->page, dev->sector, tx); wbi = r5_next_bio(wbi, dev->sector); @@ -1131,13 +1166,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref) int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; int i; - bool fua = false; + bool fua = false, sync = false; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - for (i = disks; i--; ) + for (i = disks; i--; ) { fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); + sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); + } for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -1146,6 +1183,8 @@ static void ops_complete_reconstruct(void *stripe_head_ref) set_bit(R5_UPTODATE, &dev->flags); if (fua) set_bit(R5_WantFUA, &dev->flags); + if (sync) + set_bit(R5_SyncIO, &dev->flags); } } @@ -1648,7 +1687,7 @@ static void raid5_end_read_request(struct bio * bi, int error) int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); char b[BDEVNAME_SIZE]; struct md_rdev *rdev = NULL; - + sector_t s; for (i=0 ; i<disks; i++) if (bi == &sh->dev[i].req) @@ -1671,6 +1710,10 @@ static void raid5_end_read_request(struct bio * bi, int error) if (!rdev) rdev = conf->disks[i].rdev; + if (use_new_offset(conf, sh)) + s = sh->sector + rdev->new_data_offset; + else + s = sh->sector + rdev->data_offset; if (uptodate) { set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { @@ -1683,8 +1726,7 @@ static void raid5_end_read_request(struct bio * bi, int error) "md/raid:%s: read error corrected" " (%lu sectors at %llu on %s)\n", mdname(conf->mddev), STRIPE_SECTORS, - (unsigned long long)(sh->sector - + rdev->data_offset), + (unsigned long long)s, bdevname(rdev->bdev, b)); atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); clear_bit(R5_ReadError, &sh->dev[i].flags); @@ -1704,8 +1746,7 @@ static void raid5_end_read_request(struct bio * bi, int error) "md/raid:%s: read error on replacement device " "(sector %llu on %s).\n", mdname(conf->mddev), - (unsigned long long)(sh->sector - + rdev->data_offset), + (unsigned long long)s, bdn); else if (conf->mddev->degraded >= conf->max_degraded) printk_ratelimited( @@ -1713,8 +1754,7 @@ static void raid5_end_read_request(struct bio * bi, int error) "md/raid:%s: read error not correctable " "(sector %llu on %s).\n", mdname(conf->mddev), - (unsigned long long)(sh->sector - + rdev->data_offset), + (unsigned long long)s, bdn); else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) /* Oh, no!!! */ @@ -1723,8 +1763,7 @@ static void raid5_end_read_request(struct bio * bi, int error) "md/raid:%s: read error NOT corrected!! " "(sector %llu on %s).\n", mdname(conf->mddev), - (unsigned long long)(sh->sector - + rdev->data_offset), + (unsigned long long)s, bdn); else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) @@ -3561,7 +3600,7 @@ finish: if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { rdev = conf->disks[i].rdev; rdev_clear_badblocks(rdev, sh->sector, - STRIPE_SECTORS); + STRIPE_SECTORS, 0); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { @@ -3570,7 +3609,7 @@ finish: /* rdev have been moved down */ rdev = conf->disks[i].rdev; rdev_clear_badblocks(rdev, sh->sector, - STRIPE_SECTORS); + STRIPE_SECTORS, 0); rdev_dec_pending(rdev, conf->mddev); } } @@ -3842,6 +3881,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) raid_bio->bi_next = (void*)rdev; align_bi->bi_bdev = rdev->bdev; align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); + /* No reshape active, so we can trust rdev->data_offset */ align_bi->bi_sector += rdev->data_offset; if (!bio_fits_rdev(align_bi) || @@ -3953,12 +3993,10 @@ static void make_request(struct mddev *mddev, struct bio * bi) plugged = mddev_check_plugged(mddev); for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); - int disks, data_disks; int previous; retry: previous = 0; - disks = conf->raid_disks; prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); if (unlikely(conf->reshape_progress != MaxSector)) { /* spinlock is needed as reshape_progress may be @@ -3970,13 +4008,12 @@ static void make_request(struct mddev *mddev, struct bio * bi) * to check again. */ spin_lock_irq(&conf->device_lock); - if (mddev->delta_disks < 0 + if (mddev->reshape_backwards ? logical_sector < conf->reshape_progress : logical_sector >= conf->reshape_progress) { - disks = conf->previous_raid_disks; previous = 1; } else { - if (mddev->delta_disks < 0 + if (mddev->reshape_backwards ? logical_sector < conf->reshape_safe : logical_sector >= conf->reshape_safe) { spin_unlock_irq(&conf->device_lock); @@ -3986,7 +4023,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) } spin_unlock_irq(&conf->device_lock); } - data_disks = disks - conf->max_degraded; new_sector = raid5_compute_sector(conf, logical_sector, previous, @@ -4009,7 +4045,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) */ int must_retry = 0; spin_lock_irq(&conf->device_lock); - if (mddev->delta_disks < 0 + if (mddev->reshape_backwards ? logical_sector >= conf->reshape_progress : logical_sector < conf->reshape_progress) /* mismatch, need to try again */ @@ -4108,11 +4144,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk if (sector_nr == 0) { /* If restarting in the middle, skip the initial sectors */ - if (mddev->delta_disks < 0 && + if (mddev->reshape_backwards && conf->reshape_progress < raid5_size(mddev, 0, 0)) { sector_nr = raid5_size(mddev, 0, 0) - conf->reshape_progress; - } else if (mddev->delta_disks >= 0 && + } else if (!mddev->reshape_backwards && conf->reshape_progress > 0) sector_nr = conf->reshape_progress; sector_div(sector_nr, new_data_disks); @@ -4133,13 +4169,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk else reshape_sectors = mddev->chunk_sectors; - /* we update the metadata when there is more than 3Meg - * in the block range (that is rather arbitrary, should - * probably be time based) or when the data about to be - * copied would over-write the source of the data at - * the front of the range. - * i.e. one new_stripe along from reshape_progress new_maps - * to after where reshape_safe old_maps to + /* We update the metadata at least every 10 seconds, or when + * the data about to be copied would over-write the source of + * the data at the front of the range. i.e. one new_stripe + * along from reshape_progress new_maps to after where + * reshape_safe old_maps to */ writepos = conf->reshape_progress; sector_div(writepos, new_data_disks); @@ -4147,7 +4181,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk sector_div(readpos, data_disks); safepos = conf->reshape_safe; sector_div(safepos, data_disks); - if (mddev->delta_disks < 0) { + if (mddev->reshape_backwards) { writepos -= min_t(sector_t, reshape_sectors, writepos); readpos += reshape_sectors; safepos += reshape_sectors; @@ -4157,11 +4191,29 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk safepos -= min_t(sector_t, reshape_sectors, safepos); } + /* Having calculated the 'writepos' possibly use it + * to set 'stripe_addr' which is where we will write to. + */ + if (mddev->reshape_backwards) { + BUG_ON(conf->reshape_progress == 0); + stripe_addr = writepos; + BUG_ON((mddev->dev_sectors & + ~((sector_t)reshape_sectors - 1)) + - reshape_sectors - stripe_addr + != sector_nr); + } else { + BUG_ON(writepos != sector_nr + reshape_sectors); + stripe_addr = sector_nr; + } + /* 'writepos' is the most advanced device address we might write. * 'readpos' is the least advanced device address we might read. * 'safepos' is the least address recorded in the metadata as having * been reshaped. - * If 'readpos' is behind 'writepos', then there is no way that we can + * If there is a min_offset_diff, these are adjusted either by + * increasing the safepos/readpos if diff is negative, or + * increasing writepos if diff is positive. + * If 'readpos' is then behind 'writepos', there is no way that we can * ensure safety in the face of a crash - that must be done by userspace * making a backup of the data. So in that case there is no particular * rush to update metadata. @@ -4174,7 +4226,13 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk * Maybe that number should be configurable, but I'm not sure it is * worth it.... maybe it could be a multiple of safemode_delay??? */ - if ((mddev->delta_disks < 0 + if (conf->min_offset_diff < 0) { + safepos += -conf->min_offset_diff; + readpos += -conf->min_offset_diff; + } else + writepos += conf->min_offset_diff; + + if ((mddev->reshape_backwards ? (safepos > writepos && readpos < writepos) : (safepos < writepos && readpos > writepos)) || time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { @@ -4195,17 +4253,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } - if (mddev->delta_disks < 0) { - BUG_ON(conf->reshape_progress == 0); - stripe_addr = writepos; - BUG_ON((mddev->dev_sectors & - ~((sector_t)reshape_sectors - 1)) - - reshape_sectors - stripe_addr - != sector_nr); - } else { - BUG_ON(writepos != sector_nr + reshape_sectors); - stripe_addr = sector_nr; - } INIT_LIST_HEAD(&stripes); for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { int j; @@ -4239,7 +4286,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk list_add(&sh->lru, &stripes); } spin_lock_irq(&conf->device_lock); - if (mddev->delta_disks < 0) + if (mddev->reshape_backwards) conf->reshape_progress -= reshape_sectors * new_data_disks; else conf->reshape_progress += reshape_sectors * new_data_disks; @@ -4952,16 +4999,42 @@ static int run(struct mddev *mddev) struct md_rdev *rdev; sector_t reshape_offset = 0; int i; + long long min_offset_diff = 0; + int first = 1; if (mddev->recovery_cp != MaxSector) printk(KERN_NOTICE "md/raid:%s: not clean" " -- starting background reconstruction\n", mdname(mddev)); + + rdev_for_each(rdev, mddev) { + long long diff; + if (rdev->raid_disk < 0) + continue; + diff = (rdev->new_data_offset - rdev->data_offset); + if (first) { + min_offset_diff = diff; + first = 0; + } else if (mddev->reshape_backwards && + diff < min_offset_diff) + min_offset_diff = diff; + else if (!mddev->reshape_backwards && + diff > min_offset_diff) + min_offset_diff = diff; + } + if (mddev->reshape_position != MaxSector) { /* Check that we can continue the reshape. - * Currently only disks can change, it must - * increase, and we must be past the point where - * a stripe over-writes itself + * Difficulties arise if the stripe we would write to + * next is at or after the stripe we would read from next. + * For a reshape that changes the number of devices, this + * is only possible for a very short time, and mdadm makes + * sure that time appears to have past before assembling + * the array. So we fail if that time hasn't passed. + * For a reshape that keeps the number of devices the same + * mdadm must be monitoring the reshape can keeping the + * critical areas read-only and backed up. It will start + * the array in read-only mode, so we check for that. */ sector_t here_new, here_old; int old_disks; @@ -4993,26 +5066,34 @@ static int run(struct mddev *mddev) /* here_old is the first stripe that we might need to read * from */ if (mddev->delta_disks == 0) { + if ((here_new * mddev->new_chunk_sectors != + here_old * mddev->chunk_sectors)) { + printk(KERN_ERR "md/raid:%s: reshape position is" + " confused - aborting\n", mdname(mddev)); + return -EINVAL; + } /* We cannot be sure it is safe to start an in-place - * reshape. It is only safe if user-space if monitoring + * reshape. It is only safe if user-space is monitoring * and taking constant backups. * mdadm always starts a situation like this in * readonly mode so it can take control before * allowing any writes. So just check for that. */ - if ((here_new * mddev->new_chunk_sectors != - here_old * mddev->chunk_sectors) || - mddev->ro == 0) { - printk(KERN_ERR "md/raid:%s: in-place reshape must be started" - " in read-only mode - aborting\n", + if (abs(min_offset_diff) >= mddev->chunk_sectors && + abs(min_offset_diff) >= mddev->new_chunk_sectors) + /* not really in-place - so OK */; + else if (mddev->ro == 0) { + printk(KERN_ERR "md/raid:%s: in-place reshape " + "must be started in read-only mode " + "- aborting\n", mdname(mddev)); return -EINVAL; } - } else if (mddev->delta_disks < 0 - ? (here_new * mddev->new_chunk_sectors <= + } else if (mddev->reshape_backwards + ? (here_new * mddev->new_chunk_sectors + min_offset_diff <= here_old * mddev->chunk_sectors) : (here_new * mddev->new_chunk_sectors >= - here_old * mddev->chunk_sectors)) { + here_old * mddev->chunk_sectors + (-min_offset_diff))) { /* Reading from the same stripe as writing to - bad */ printk(KERN_ERR "md/raid:%s: reshape_position too early for " "auto-recovery - aborting.\n", @@ -5037,6 +5118,7 @@ static int run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); + conf->min_offset_diff = min_offset_diff; mddev->thread = conf->thread; conf->thread = NULL; mddev->private = conf; @@ -5182,9 +5264,12 @@ static int run(struct mddev *mddev) blk_queue_io_opt(mddev->queue, chunk_size * (conf->raid_disks - conf->max_degraded)); - rdev_for_each(rdev, mddev) + rdev_for_each(rdev, mddev) { disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->new_data_offset << 9); + } } return 0; @@ -5418,12 +5503,18 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) * any io in the removed space completes, but it hardly seems * worth it. */ + sector_t newsize; sectors &= ~((sector_t)mddev->chunk_sectors - 1); - md_set_array_sectors(mddev, raid5_size(mddev, sectors, - mddev->raid_disks)); - if (mddev->array_sectors > - raid5_size(mddev, sectors, mddev->raid_disks)) + newsize = raid5_size(mddev, sectors, mddev->raid_disks); + if (mddev->external_size && + mddev->array_sectors > newsize) return -EINVAL; + if (mddev->bitmap) { + int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); + if (ret) + return ret; + } + md_set_array_sectors(mddev, newsize); set_capacity(mddev->gendisk, mddev->array_sectors); revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && @@ -5468,9 +5559,6 @@ static int check_reshape(struct mddev *mddev) mddev->new_layout == mddev->layout && mddev->new_chunk_sectors == mddev->chunk_sectors) return 0; /* nothing to do */ - if (mddev->bitmap) - /* Cannot grow a bitmap yet */ - return -EBUSY; if (has_failed(conf)) return -EINVAL; if (mddev->delta_disks < 0) { @@ -5505,10 +5593,14 @@ static int raid5_start_reshape(struct mddev *mddev) if (!check_stripe_cache(mddev)) return -ENOSPC; - rdev_for_each(rdev, mddev) + if (has_failed(conf)) + return -EINVAL; + + rdev_for_each(rdev, mddev) { if (!test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) spares++; + } if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) /* Not enough devices even to make a degraded array @@ -5535,12 +5627,16 @@ static int raid5_start_reshape(struct mddev *mddev) conf->chunk_sectors = mddev->new_chunk_sectors; conf->prev_algo = conf->algorithm; conf->algorithm = mddev->new_layout; - if (mddev->delta_disks < 0) + conf->generation++; + /* Code that selects data_offset needs to see the generation update + * if reshape_progress has been set - so a memory barrier needed. + */ + smp_mb(); + if (mddev->reshape_backwards) conf->reshape_progress = raid5_size(mddev, 0, 0); else conf->reshape_progress = 0; conf->reshape_safe = conf->reshape_progress; - conf->generation++; spin_unlock_irq(&conf->device_lock); /* Add some new drives, as many as will fit. @@ -5592,6 +5688,9 @@ static int raid5_start_reshape(struct mddev *mddev) mddev->recovery = 0; spin_lock_irq(&conf->device_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; + rdev_for_each(rdev, mddev) + rdev->new_data_offset = rdev->data_offset; + smp_wmb(); conf->reshape_progress = MaxSector; mddev->reshape_position = MaxSector; spin_unlock_irq(&conf->device_lock); @@ -5610,9 +5709,13 @@ static void end_reshape(struct r5conf *conf) { if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { + struct md_rdev *rdev; spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; + rdev_for_each(rdev, conf->mddev) + rdev->data_offset = rdev->new_data_offset; + smp_wmb(); conf->reshape_progress = MaxSector; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); @@ -5652,17 +5755,18 @@ static void raid5_finish_reshape(struct mddev *mddev) d < conf->raid_disks - mddev->delta_disks; d++) { struct md_rdev *rdev = conf->disks[d].rdev; - if (rdev && - raid5_remove_disk(mddev, rdev) == 0) { - sysfs_unlink_rdev(mddev, rdev); - rdev->raid_disk = -1; - } + if (rdev) + clear_bit(In_sync, &rdev->flags); + rdev = conf->disks[d].replacement; + if (rdev) + clear_bit(In_sync, &rdev->flags); } } mddev->layout = conf->algorithm; mddev->chunk_sectors = conf->chunk_sectors; mddev->reshape_position = MaxSector; mddev->delta_disks = 0; + mddev->reshape_backwards = 0; } } |