@@ -1533,6 +1533,256 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
raid10_write_request(mddev, bio, r10_bio);
}
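+
+/*
+ * Split bio at 'sectors' and resubmit one part through the regular request
+ * path while the caller continues with the other part: if want_first is
+ * true the remainder is resubmitted and the first part is returned to the
+ * caller, otherwise the first part is resubmitted and the remainder is
+ * returned. The barrier is dropped around the resubmission to avoid a
+ * deadlock with the resubmitted bio.
+ */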
+static struct bio *raid10_split_bio(struct r10conf *conf,
+ struct bio *bio, sector_t sectors, bool want_first)
+{
+ struct bio *split;
+
+ split = bio_split(bio, sectors, GFP_NOIO, &conf->bio_split);
+ bio_chain(split, bio);
+ allow_barrier(conf);
+ if (want_first) {
+ generic_make_request(bio);
+ bio = split;
+ } else
+ generic_make_request(split);
+ wait_barrier(conf);
+
+ return bio;
+}
+
+static void raid10_end_discard_request(struct bio *bio)
+{
+ struct r10bio *r10_bio = bio->bi_private;
+ struct r10conf *conf = r10_bio->mddev->private;
+ struct md_rdev *rdev = NULL;
+ int dev;
+ int slot, repl;
+
+	/*
+	 * We don't care about the return value of the discard bio
+	 */
+ if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+ set_bit(R10BIO_Uptodate, &r10_bio->state);
+
+ dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+ if (repl)
+ rdev = conf->mirrors[dev].replacement;
+ if (!rdev) {
+		/* raid10_remove_disk uses smp_mb to make sure rdev is set to
+		 * replacement before setting replacement to NULL. It can read
+		 * rdev first without barrier protection even if replacement
+		 * is NULL
+		 */
+ smp_rmb();
+ rdev = conf->mirrors[dev].rdev;
+ }
+
+ if (atomic_dec_and_test(&r10_bio->remaining)) {
+ md_write_end(r10_bio->mddev);
+ raid_end_bio_io(r10_bio);
+ }
+
+ rdev_dec_pending(rdev, conf->mddev);
+}
+
+/* There are some limitations on handling discard bios:
+ * 1st, the discard size must be bigger than stripe_size*2.
+ * 2nd, if the discard bio spans the reshape region, we use the old way to
+ * handle it
+ */
+static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
+{
+ struct r10conf *conf = mddev->private;
+ struct geom *geo = &conf->geo;
+ struct r10bio *r10_bio;
+
+ int disk;
+ sector_t chunk;
+ unsigned int stripe_size;
+ sector_t split_size;
+
+ sector_t bio_start, bio_end;
+ sector_t first_stripe_index, last_stripe_index;
+ sector_t start_disk_offset;
+ unsigned int start_disk_index;
+ sector_t end_disk_offset;
+ unsigned int end_disk_index;
+ unsigned int remainder;
+
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ return -EAGAIN;
+
+ wait_barrier(conf);
+
+	/* Check reshape again to avoid a reshape that happens after checking
+	 * MD_RECOVERY_RESHAPE and before wait_barrier
+	 */
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ goto out;
+
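+	/* e.g. with 4 member disks and 64KiB (128-sector) chunks, i.e.
+	 * chunk_shift = 7, stripe_size = 4 << 7 = 512 sectors
+	 */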
+ stripe_size = geo->raid_disks << geo->chunk_shift;
+ bio_start = bio->bi_iter.bi_sector;
+ bio_end = bio_end_sector(bio);
+
+	/* A discard bio may be smaller than the stripe size, or it may cross
+	 * one stripe boundary while the discard region is larger than one
+	 * stripe. For the far offset layout, if the discard region is not
+	 * aligned with the stripe size, there are holes when we submit the
+	 * discard bios to the member disks. For simplicity, we only handle
+	 * discard bios whose region is bigger than stripe_size*2
+	 */
+ if (bio_sectors(bio) < stripe_size*2)
+ goto out;
+
+	/* For the far offset layout, if the bio is not aligned with the
+	 * stripe size, split off the parts that are not aligned with the
+	 * stripe size
+	 */
+ div_u64_rem(bio_start, stripe_size, &remainder);
+ if (geo->far_offset && remainder) {
+ split_size = stripe_size - remainder;
+ bio = raid10_split_bio(conf, bio, split_size, false);
+ }
+ div_u64_rem(bio_end, stripe_size, &remainder);
+ if (geo->far_offset && remainder) {
+ split_size = bio_sectors(bio) - remainder;
+ bio = raid10_split_bio(conf, bio, split_size, true);
+ }
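+	/* Worked example for the two splits above, assuming a hypothetical
+	 * far offset geometry with stripe_size = 512 sectors and a discard
+	 * bio covering sectors [100, 1700): the first split resubmits the
+	 * head [100, 512), the second split resubmits the tail [1536, 1700),
+	 * and the stripe-aligned middle [512, 1536) is handled below
+	 */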
+
+ r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
+ r10_bio->mddev = mddev;
+ r10_bio->state = 0;
+ r10_bio->sectors = 0;
+ memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
+
+ wait_blocked_dev(mddev, r10_bio);
+
+ r10_bio->master_bio = bio;
+
+ bio_start = bio->bi_iter.bi_sector;
+ bio_end = bio_end_sector(bio);
+
+	/* raid10 uses a chunk as the unit to store data, similar to raid0.
+	 * One stripe contains the chunks from all member disks (one chunk
+	 * from each disk at the same device offset). For layout details,
+	 * see 'man 4 md'
+	 */
+ chunk = bio_start >> geo->chunk_shift;
+ chunk *= geo->near_copies;
+ first_stripe_index = chunk;
+ start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
+ if (geo->far_offset)
+ first_stripe_index *= geo->far_copies;
+ start_disk_offset = (bio_start & geo->chunk_mask) +
+ (first_stripe_index << geo->chunk_shift);
+
+ chunk = bio_end >> geo->chunk_shift;
+ chunk *= geo->near_copies;
+ last_stripe_index = chunk;
+ end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
+ if (geo->far_offset)
+ last_stripe_index *= geo->far_copies;
+ end_disk_offset = (bio_end & geo->chunk_mask) +
+ (last_stripe_index << geo->chunk_shift);
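+	/* Worked example, assuming a hypothetical near layout with
+	 * raid_disks = 4, near_copies = 2 and 128-sector chunks
+	 * (chunk_shift = 7, chunk_mask = 127), discarding sectors
+	 * [1152, 3072): bio_start sits in chunk 9, so chunk = 9 * 2 = 18,
+	 * first_stripe_index = 18 / 4 = 4, start_disk_index = 18 % 4 = 2 and
+	 * start_disk_offset = 0 + (4 << 7) = 512; bio_end gives
+	 * chunk = 24 * 2 = 48, last_stripe_index = 12, end_disk_index = 0
+	 * and end_disk_offset = 0 + (12 << 7) = 1536
+	 */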
+
+ rcu_read_lock();
+ for (disk = 0; disk < geo->raid_disks; disk++) {
+ struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ struct md_rdev *rrdev = rcu_dereference(
+ conf->mirrors[disk].replacement);
+
+ r10_bio->devs[disk].bio = NULL;
+ r10_bio->devs[disk].repl_bio = NULL;
+
+ if (rdev && (test_bit(Faulty, &rdev->flags)))
+ rdev = NULL;
+ if (rrdev && (test_bit(Faulty, &rrdev->flags)))
+ rrdev = NULL;
+ if (!rdev && !rrdev)
+ continue;
+
+ if (rdev) {
+ r10_bio->devs[disk].bio = bio;
+ atomic_inc(&rdev->nr_pending);
+ }
+ if (rrdev) {
+ r10_bio->devs[disk].repl_bio = bio;
+ atomic_inc(&rrdev->nr_pending);
+ }
+ }
+ rcu_read_unlock();
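+	/* The nr_pending references taken above keep the rdevs alive after
+	 * the RCU read lock is dropped
+	 */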
+
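+	/* Bias remaining by one so the r10_bio cannot complete before all
+	 * member bios have been submitted
+	 */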
+ atomic_set(&r10_bio->remaining, 1);
+ for (disk = 0; disk < geo->raid_disks; disk++) {
+ sector_t dev_start, dev_end;
+ struct bio *mbio, *rbio = NULL;
+ struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ struct md_rdev *rrdev = rcu_dereference(
+ conf->mirrors[disk].replacement);
+
+		/*
+		 * Now start to calculate the start and end addresses for each
+		 * disk. The space between dev_start and dev_end is the
+		 * discard region.
+		 *
+		 * For dev_start, there are three cases to consider:
+		 * 1st, the disk is before start_disk_index; think of the disk
+		 * as sitting in the next stripe, so dev_start is the start
+		 * address of the next stripe.
+		 * 2nd, the disk is after start_disk_index; it is in the same
+		 * stripe as the first disk, so dev_start is the start address
+		 * of that stripe.
+		 * 3rd, for the first disk itself, use start_disk_offset
+		 * directly
+		 */
+ if (disk < start_disk_index)
+ dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
+ else if (disk > start_disk_index)
+ dev_start = first_stripe_index * mddev->chunk_sectors;
+ else
+ dev_start = start_disk_offset;
+
+ if (disk < end_disk_index)
+ dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
+ else if (disk > end_disk_index)
+ dev_end = last_stripe_index * mddev->chunk_sectors;
+ else
+ dev_end = end_disk_offset;
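+		/* Continuing the example above: disks 0 and 1 (before
+		 * start_disk_index 2) get dev_start = (4 + 1) * 128 = 640
+		 * while disks 2 and 3 start at 512; disk 0 (== end_disk_index)
+		 * and disks 1-3 all end at 1536, so each member discards
+		 * [dev_start, 1536)
+		 */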
+
+		/* We only handle discard bios whose size is >= stripe size,
+		 * so dev_end > dev_start always holds
+		 */
+ if (r10_bio->devs[disk].bio) {
+ mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+ mbio->bi_end_io = raid10_end_discard_request;
+ mbio->bi_private = r10_bio;
+ r10_bio->devs[disk].bio = mbio;
+ r10_bio->devs[disk].devnum = disk;
+ atomic_inc(&r10_bio->remaining);
+ md_submit_discard_bio(mddev, rdev, mbio,
+ dev_start + choose_data_offset(r10_bio, rdev),
+ dev_end - dev_start);
+ bio_endio(mbio);
+ }
+ if (r10_bio->devs[disk].repl_bio) {
+ rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+ rbio->bi_end_io = raid10_end_discard_request;
+ rbio->bi_private = r10_bio;
+ r10_bio->devs[disk].repl_bio = rbio;
+ r10_bio->devs[disk].devnum = disk;
+ atomic_inc(&r10_bio->remaining);
+ md_submit_discard_bio(mddev, rrdev, rbio,
+ dev_start + choose_data_offset(r10_bio, rrdev),
+ dev_end - dev_start);
+ bio_endio(rbio);
+ }
+ }
+
+ if (atomic_dec_and_test(&r10_bio->remaining)) {
+ md_write_end(r10_bio->mddev);
+ raid_end_bio_io(r10_bio);
+ }
+
+ return 0;
+out:
+ allow_barrier(conf);
+ return -EAGAIN;
+}
+
static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
{
struct r10conf *conf = mddev->private;
@@ -1547,6 +1797,10 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
if (!md_write_start(mddev, bio))
return false;
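+	/*
+	 * raid10_handle_discard returns 0 when it has taken over the bio;
+	 * on -EAGAIN fall through to the regular request path below.
+	 */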
+ if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
+ if (!raid10_handle_discard(mddev, bio))
+ return true;
+
/*
* If this request crosses a chunk boundary, we need to split
* it.
@@ -3777,7 +4031,7 @@ static int raid10_run(struct mddev *mddev)
chunk_size = mddev->chunk_sectors << 9;
if (mddev->queue) {
blk_queue_max_discard_sectors(mddev->queue,
- mddev->chunk_sectors);
+ UINT_MAX);
blk_queue_max_write_same_sectors(mddev->queue, 0);
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
blk_queue_io_min(mddev->queue, chunk_size);