[PATCH] md: improve the interface to sync_request

1/ change the return value (which is number-of-sectors synced)
 from 'int' to 'sector_t'.
 The number of sectors is usually small enough to fit
 in an int, but if resync needs to abort, it may want to return
 the total number of remaining sectors, which could be large.
 Also, since sector_t is unsigned, errors can no longer be
 returned as negative numbers, so 0 is returned instead.
2/ Add a 'skipped' output parameter (an 'int *') to allow the array to
 report that it skipped the sectors rather than requesting IO for them.
 This allows md to leave skipped sectors out of the speed calculations.
 Currently there is no important skipping, but the bitmap-based-resync
 that is coming will use this.

Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 177d2a7..fa608a1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3241,12 +3241,13 @@
 	mddev_t *mddev2;
 	unsigned int currspeed = 0,
 		 window;
-	sector_t max_sectors,j;
+	sector_t max_sectors,j, io_sectors;
 	unsigned long mark[SYNC_MARKS];
 	sector_t mark_cnt[SYNC_MARKS];
 	int last_mark,m;
 	struct list_head *tmp;
 	sector_t last_check;
+	int skipped = 0;
 
 	/* just incase thread restarts... */
 	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -3312,7 +3313,7 @@
 
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 		/* resync follows the size requested by the personality,
-		 * which default to physical size, but can be virtual size
+		 * which defaults to physical size, but can be virtual size
 		 */
 		max_sectors = mddev->resync_max_sectors;
 	else
@@ -3331,9 +3332,10 @@
 		j = mddev->recovery_cp;
 	else
 		j = 0;
+	io_sectors = 0;
 	for (m = 0; m < SYNC_MARKS; m++) {
 		mark[m] = jiffies;
-		mark_cnt[m] = j;
+		mark_cnt[m] = io_sectors;
 	}
 	last_mark = 0;
 	mddev->resync_mark = mark[last_mark];
@@ -3358,21 +3360,29 @@
 	}
 
 	while (j < max_sectors) {
-		int sectors;
+		sector_t sectors;
 
-		sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min);
-		if (sectors < 0) {
+		skipped = 0;
+		sectors = mddev->pers->sync_request(mddev, j, &skipped,
+					    currspeed < sysctl_speed_limit_min);
+		if (sectors == 0) {
 			set_bit(MD_RECOVERY_ERR, &mddev->recovery);
 			goto out;
 		}
-		atomic_add(sectors, &mddev->recovery_active);
+
+		if (!skipped) { /* actual IO requested */
+			io_sectors += sectors;
+			atomic_add(sectors, &mddev->recovery_active);
+		}
+
 		j += sectors;
 		if (j>1) mddev->curr_resync = j;
 
-		if (last_check + window > j || j == max_sectors)
+
+		if (last_check + window > io_sectors || j == max_sectors)
 			continue;
 
-		last_check = j;
+		last_check = io_sectors;
 
 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
 		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
@@ -3386,7 +3396,7 @@
 			mddev->resync_mark = mark[next];
 			mddev->resync_mark_cnt = mark_cnt[next];
 			mark[next] = jiffies;
-			mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
+			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
 			last_mark = next;
 		}
 
@@ -3413,7 +3423,8 @@
 		mddev->queue->unplug_fn(mddev->queue);
 		cond_resched();
 
-		currspeed = ((unsigned long)(j-mddev->resync_mark_cnt))/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
+		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
+			/((jiffies-mddev->resync_mark)/HZ +1) +1;
 
 		if (currspeed > sysctl_speed_limit_min) {
 			if ((currspeed > sysctl_speed_limit_max) ||
@@ -3433,7 +3444,7 @@
 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
 	/* tell personality that we are finished */
-	mddev->pers->sync_request(mddev, max_sectors, 1);
+	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
 
 	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
 	    mddev->curr_resync > 2 &&
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 3f1280b..3c5c916 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1010,7 +1010,7 @@
  * that can be installed to exclude normal IO requests.
  */
 
-static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
 	conf_t *conf = mddev_to_conf(mddev);
 	mirror_info_t *mirror;
@@ -1023,7 +1023,7 @@
 
 	if (!conf->r1buf_pool)
 		if (init_resync(conf))
-			return -ENOMEM;
+			return 0;
 
 	max_sector = mddev->size << 1;
 	if (sector_nr >= max_sector) {
@@ -1107,8 +1107,8 @@
 		/* There is nowhere to write, so all non-sync
 		 * drives must be failed - so we are finished
 		 */
-		int rv = max_sector - sector_nr;
-		md_done_sync(mddev, rv, 1);
+		sector_t rv = max_sector - sector_nr;
+		*skipped = 1;
 		put_buf(r1_bio);
 		rdev_dec_pending(conf->mirrors[disk].rdev, mddev);
 		return rv;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index bfc9f52..8476515 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1321,7 +1321,7 @@
  *
  */
 
-static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
 	conf_t *conf = mddev_to_conf(mddev);
 	r10bio_t *r10_bio;
@@ -1335,7 +1335,7 @@
 
 	if (!conf->r10buf_pool)
 		if (init_resync(conf))
-			return -ENOMEM;
+			return 0;
 
  skipped:
 	max_sector = mddev->size << 1;
@@ -1343,15 +1343,15 @@
 		max_sector = mddev->resync_max_sectors;
 	if (sector_nr >= max_sector) {
 		close_sync(conf);
+		*skipped = 1;
 		return sectors_skipped;
 	}
 	if (chunks_skipped >= conf->raid_disks) {
 		/* if there has been nothing to do on any drive,
 		 * then there is nothing to do at all..
 		 */
-		sector_t sec = max_sector - sector_nr;
-		md_done_sync(mddev, sec, 1);
-		return sec + sectors_skipped;
+		*skipped = 1;
+		return (max_sector - sector_nr) + sectors_skipped;
 	}
 
 	/* make sure whole request will fit in a chunk - if chunks
@@ -1565,17 +1565,22 @@
 		}
 	}
 
+	if (sectors_skipped)
+		/* pretend they weren't skipped, it makes
+		 * no important difference in this case
+		 */
+		md_done_sync(mddev, sectors_skipped, 1);
+
 	return sectors_skipped + nr_sectors;
  giveup:
 	/* There is nowhere to write, so all non-sync
 	 * drives must be failed, so try the next chunk...
 	 */
 	{
-	int sec = max_sector - sector_nr;
+	sector_t sec = max_sector - sector_nr;
 	sectors_skipped += sec;
 	chunks_skipped ++;
 	sector_nr = max_sector;
-	md_done_sync(mddev, sec, 1);
 	goto skipped;
 	}
 }
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 677ce49..1ce3f5a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1477,7 +1477,7 @@
 }
 
 /* FIXME go_faster isn't used */
-static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 	struct stripe_head *sh;
@@ -1500,8 +1500,8 @@
 	 * nothing we can do.
 	 */
 	if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
-		int rv = (mddev->size << 1) - sector_nr;
-		md_done_sync(mddev, rv, 1);
+		sector_t rv = (mddev->size << 1) - sector_nr;
+		*skipped = 1;
 		return rv;
 	}
 
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index fede16c..d9c3854 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -1636,7 +1636,7 @@
 }
 
 /* FIXME go_faster isn't used */
-static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
 	raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
 	struct stripe_head *sh;
@@ -1659,8 +1659,8 @@
 	 * nothing we can do.
 	 */
 	if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
-		int rv = (mddev->size << 1) - sector_nr;
-		md_done_sync(mddev, rv, 1);
+		sector_t rv = (mddev->size << 1) - sector_nr;
+		*skipped = 1;
 		return rv;
 	}