[PATCH] md: support BIO_RW_BARRIER for md/raid1 We can only accept BARRIER requests if all slaves handle barriers, and that can, of course, change with time.... So we keep track of whether the whole array seems safe for barriers, and also whether each individual rdev handles barriers. We initially assumes barriers are OK. When writing the superblock we try a barrier, and if that fails, we flag things for no-barriers. This will usually clear the flags fairly quickly. If writing the superblock finds that BIO_RW_BARRIER is -ENOTSUPP, we need to resubmit, so introduce function "md_super_wait" which waits for requests to finish, and retries ENOTSUPP requests without the barrier flag. When writing the real raid1, write requests which were BIO_RW_BARRIER but which aresn't supported need to be retried. So raid1d is enhanced to do this, and when any bio write completes (i.e. no retry needed) we remove it from the r1bio, so that devices needing retry are easy to find. We should hardly ever get -ENOTSUPP errors when writing data to the raid. It should only happen if: 1/ the device used to support BARRIER, but now doesn't. Few devices change like this, though raid1 can! or 2/ the array has no persistent superblock, so there was no opportunity to pre-test for barriers when writing the superblock. Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au> Signed-off-by: Neil Brown <neilb@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>

commit: a9701a30470856408d08657eb1bd7ae29a146190 [log] [tgz]
author: NeilBrown <neilb@suse.de> Tue Nov 08 21:39:34 2005 -0800
committer: Linus Torvalds <torvalds@g5.osdl.org> Wed Nov 09 07:56:38 2005 -0800
tree: eb6ea8c82fdc1b50bf56abadeee63a935034cf27
parent: bd926c63b7a6843d3ce2728396c0891e54fce5c4 [diff] [blame]
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fb6b866..1cbf51f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c

@@ -301,7 +301,7 @@
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
-	int mirror, behind;
+	int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
 	conf_t *conf = mddev_to_conf(r1_bio->mddev);
 
 	if (bio->bi_size)
@@ -311,47 +311,54 @@
 		if (r1_bio->bios[mirror] == bio)
 			break;
 
-	/*
-	 * this branch is our 'one mirror IO has finished' event handler:
-	 */
-	if (!uptodate) {
-		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
-		/* an I/O failed, we can't clear the bitmap */
-		set_bit(R1BIO_Degraded, &r1_bio->state);
-	} else
+	if (error == -ENOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
+		set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
+		set_bit(R1BIO_BarrierRetry, &r1_bio->state);
+		r1_bio->mddev->barriers_work = 0;
+	} else {
 		/*
-		 * Set R1BIO_Uptodate in our master bio, so that
-		 * we will return a good error code for to the higher
-		 * levels even if IO on some other mirrored buffer fails.
-		 *
-		 * The 'master' represents the composite IO operation to
-		 * user-side. So if something waits for IO, then it will
-		 * wait for the 'master' bio.
+		 * this branch is our 'one mirror IO has finished' event handler:
 		 */
-		set_bit(R1BIO_Uptodate, &r1_bio->state);
+		r1_bio->bios[mirror] = NULL;
+		bio_put(bio);
+		if (!uptodate) {
+			md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
+			/* an I/O failed, we can't clear the bitmap */
+			set_bit(R1BIO_Degraded, &r1_bio->state);
+		} else
+			/*
+			 * Set R1BIO_Uptodate in our master bio, so that
+			 * we will return a good error code for to the higher
+			 * levels even if IO on some other mirrored buffer fails.
+			 *
+			 * The 'master' represents the composite IO operation to
+			 * user-side. So if something waits for IO, then it will
+			 * wait for the 'master' bio.
+			 */
+			set_bit(R1BIO_Uptodate, &r1_bio->state);
 
-	update_head_pos(mirror, r1_bio);
+		update_head_pos(mirror, r1_bio);
 
-	behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
-	if (behind) {
-		if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
-			atomic_dec(&r1_bio->behind_remaining);
+		if (behind) {
+			if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+				atomic_dec(&r1_bio->behind_remaining);
 
-		/* In behind mode, we ACK the master bio once the I/O has safely
-		 * reached all non-writemostly disks. Setting the Returned bit
-		 * ensures that this gets done only once -- we don't ever want to
-		 * return -EIO here, instead we'll wait */
+			/* In behind mode, we ACK the master bio once the I/O has safely
+			 * reached all non-writemostly disks. Setting the Returned bit
+			 * ensures that this gets done only once -- we don't ever want to
+			 * return -EIO here, instead we'll wait */
 
-		if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
-		    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
-			/* Maybe we can return now */
-			if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
-				struct bio *mbio = r1_bio->master_bio;
-				PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
-				       (unsigned long long) mbio->bi_sector,
-				       (unsigned long long) mbio->bi_sector +
-				       (mbio->bi_size >> 9) - 1);
-				bio_endio(mbio, mbio->bi_size, 0);
+			if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+			    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+				/* Maybe we can return now */
+				if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+					struct bio *mbio = r1_bio->master_bio;
+					PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
+					       (unsigned long long) mbio->bi_sector,
+					       (unsigned long long) mbio->bi_sector +
+					       (mbio->bi_size >> 9) - 1);
+					bio_endio(mbio, mbio->bi_size, 0);
+				}
 			}
 		}
 	}
@@ -361,8 +368,16 @@
 	 * already.
 	 */
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
+		if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
+			reschedule_retry(r1_bio);
+			/* Don't dec_pending yet, we want to hold
+			 * the reference over the retry
+			 */
+			return 0;
+		}
 		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
 			/* free extra copy of the data pages */
+/* FIXME bio has been freed!!! */
 			int i = bio->bi_vcnt;
 			while (i--)
 				__free_page(bio->bi_io_vec[i].bv_page);
@@ -648,8 +663,9 @@
 	struct bio_list bl;
 	struct page **behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
+	int do_barriers;
 
-	if (unlikely(bio_barrier(bio))) {
+	if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
 		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
 		return 0;
 	}
@@ -759,6 +775,10 @@
 	atomic_set(&r1_bio->remaining, 0);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
+	do_barriers = bio->bi_rw & BIO_RW_BARRIER;
+	if (do_barriers)
+		set_bit(R1BIO_Barrier, &r1_bio->state);
+
 	bio_list_init(&bl);
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio;
@@ -771,7 +791,7 @@
 		mbio->bi_sector	= r1_bio->sector + conf->mirrors[i].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io	= raid1_end_write_request;
-		mbio->bi_rw = WRITE;
+		mbio->bi_rw = WRITE | do_barriers;
 		mbio->bi_private = r1_bio;
 
 		if (behind_pages) {
@@ -1153,6 +1173,36 @@
 		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
 			sync_request_write(mddev, r1_bio);
 			unplug = 1;
+		} else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
+			/* some requests in the r1bio were BIO_RW_BARRIER
+			 * requests which failed with -ENOTSUPP.  Hohumm..
+			 * Better resubmit without the barrier.
+			 * We know which devices to resubmit for, because
+			 * all others have had their bios[] entry cleared.
+			 */
+			int i;
+			clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
+			clear_bit(R1BIO_Barrier, &r1_bio->state);
+			for (i=0; i < conf->raid_disks; i++)
+				if (r1_bio->bios[i]) {
+					struct bio_vec *bvec;
+					int j;
+
+					bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
+					/* copy pages from the failed bio, as
+					 * this might be a write-behind device */
+					__bio_for_each_segment(bvec, bio, j, 0)
+						bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
+					bio_put(r1_bio->bios[i]);
+					bio->bi_sector = r1_bio->sector +
+						conf->mirrors[i].rdev->data_offset;
+					bio->bi_bdev = conf->mirrors[i].rdev->bdev;
+					bio->bi_end_io = raid1_end_write_request;
+					bio->bi_rw = WRITE;
+					bio->bi_private = r1_bio;
+					r1_bio->bios[i] = bio;
+					generic_make_request(bio);
+				}
 		} else {
 			int disk;
 			bio = r1_bio->bios[r1_bio->read_disk];
commit	a9701a30470856408d08657eb1bd7ae29a146190	[log] [tgz]
author	NeilBrown <neilb@suse.de>	Tue Nov 08 21:39:34 2005 -0800
committer	Linus Torvalds <torvalds@g5.osdl.org>	Wed Nov 09 07:56:38 2005 -0800
tree	eb6ea8c82fdc1b50bf56abadeee63a935034cf27
parent	bd926c63b7a6843d3ce2728396c0891e54fce5c4 [diff] [blame]