md: use separate bio pool for each md device.

bio_clone and bio_alloc allocate from a common bio pool.
If an md device is stacked with other devices that use this pool, or under
something like swap which uses the pool, then the multiple calls on
the pool can cause deadlocks.

So allocate a local bio pool for each md array and use that rather
than the common pool.

This pool is used both for regular IO and metadata updates.

Signed-off-by: NeilBrown <neilb@suse.de>
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 1a89878..339fdc6 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -210,7 +210,7 @@
 		}
 	}
 	if (failit) {
-		struct bio *b = bio_clone(bio, GFP_NOIO);
+		struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev);
 		b->bi_bdev = conf->rdev->bdev;
 		b->bi_private = bio;
 		b->bi_end_io = faulty_fail;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3149bf3..4e957f3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -149,6 +149,72 @@
 
 static int start_readonly;
 
+/* bio_clone_mddev
+ * like bio_clone, but with a local bio set
+ */
+
+static void mddev_bio_destructor(struct bio *bio)
+{
+	mddev_t *mddev, **mddevp;
+
+	mddevp = (void*)bio;
+	mddev = mddevp[-1];
+
+	bio_free(bio, mddev->bio_set);
+}
+
+struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
+			    mddev_t *mddev)
+{
+	struct bio *b;
+	mddev_t **mddevp;
+
+	if (!mddev || !mddev->bio_set)
+		return bio_alloc(gfp_mask, nr_iovecs);
+
+	b = bio_alloc_bioset(gfp_mask, nr_iovecs,
+			     mddev->bio_set);
+	if (!b)
+		return NULL;
+	mddevp = (void*)b;
+	mddevp[-1] = mddev;
+	b->bi_destructor = mddev_bio_destructor;
+	return b;
+}
+EXPORT_SYMBOL_GPL(bio_alloc_mddev);
+
+struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
+			    mddev_t *mddev)
+{
+	struct bio *b;
+	mddev_t **mddevp;
+
+	if (!mddev || !mddev->bio_set)
+		return bio_clone(bio, gfp_mask);
+
+	b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs,
+			     mddev->bio_set);
+	if (!b)
+		return NULL;
+	mddevp = (void*)b;
+	mddevp[-1] = mddev;
+	b->bi_destructor = mddev_bio_destructor;
+	__bio_clone(b, bio);
+	if (bio_integrity(bio)) {
+		int ret;
+
+		ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set);
+
+		if (ret < 0) {
+			bio_put(b);
+			return NULL;
+		}
+	}
+
+	return b;
+}
+EXPORT_SYMBOL_GPL(bio_clone_mddev);
+
 /*
  * We have a system wide 'event count' that is incremented
  * on any 'interesting' event, and readers of /proc/mdstat
@@ -321,7 +387,7 @@
 			atomic_inc(&rdev->nr_pending);
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
-			bi = bio_alloc(GFP_KERNEL, 0);
+			bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev);
 			bi->bi_end_io = md_end_flush;
 			bi->bi_private = rdev;
 			bi->bi_bdev = rdev->bdev;
@@ -428,6 +494,8 @@
 
 static void mddev_put(mddev_t *mddev)
 {
+	struct bio_set *bs = NULL;
+
 	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
 		return;
 	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
@@ -435,6 +503,8 @@
 		/* Array is not configured at all, and not held active,
 		 * so destroy it */
 		list_del(&mddev->all_mddevs);
+		bs = mddev->bio_set;
+		mddev->bio_set = NULL;
 		if (mddev->gendisk) {
 			/* We did a probe so need to clean up.  Call
 			 * queue_work inside the spinlock so that
@@ -447,6 +517,8 @@
 			kfree(mddev);
 	}
 	spin_unlock(&all_mddevs_lock);
+	if (bs)
+		bioset_free(bs);
 }
 
 void mddev_init(mddev_t *mddev)
@@ -690,7 +762,7 @@
 	 * if zero is reached.
 	 * If an error occurred, call md_error
 	 */
-	struct bio *bio = bio_alloc(GFP_NOIO, 1);
+	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
 
 	bio->bi_bdev = rdev->bdev;
 	bio->bi_sector = sector;
@@ -724,7 +796,7 @@
 int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
 		 struct page *page, int rw)
 {
-	struct bio *bio = bio_alloc(GFP_NOIO, 1);
+	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
 	struct completion event;
 	int ret;
 
@@ -4379,6 +4451,9 @@
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
 	}
 
+	if (mddev->bio_set == NULL)
+		mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev));
+
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
 	if (!pers || !try_module_get(pers->owner)) {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 5ee5371..d05bab5 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -331,6 +331,8 @@
 	struct attribute_group		*to_remove;
 	struct plug_handle		*plug; /* if used by personality */
 
+	struct bio_set			*bio_set;
+
 	/* Generic flush handling.
 	 * The last to finish preflush schedules a worker to submit
 	 * the rest of the request (without the REQ_FLUSH flag).
@@ -517,4 +519,8 @@
 
 extern void mddev_suspend(mddev_t *mddev);
 extern void mddev_resume(mddev_t *mddev);
+extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
+				   mddev_t *mddev);
+extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
+				   mddev_t *mddev);
 #endif /* _MD_MD_H */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 08c66af..54f60c9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -861,7 +861,7 @@
 		}
 		r1_bio->read_disk = rdisk;
 
-		read_bio = bio_clone(bio, GFP_NOIO);
+		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 
 		r1_bio->bios[rdisk] = read_bio;
 
@@ -950,7 +950,7 @@
 		if (!r1_bio->bios[i])
 			continue;
 
-		mbio = bio_clone(bio, GFP_NOIO);
+		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 		r1_bio->bios[i] = mbio;
 
 		mbio->bi_sector	= r1_bio->sector + conf->mirrors[i].rdev->data_offset;
@@ -1640,7 +1640,8 @@
 					mddev->ro ? IO_BLOCKED : NULL;
 				r1_bio->read_disk = disk;
 				bio_put(bio);
-				bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
+				bio = bio_clone_mddev(r1_bio->master_bio,
+						      GFP_NOIO, mddev);
 				r1_bio->bios[r1_bio->read_disk] = bio;
 				rdev = conf->mirrors[disk].rdev;
 				if (printk_ratelimit())
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d54122b..c67aa54 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -889,7 +889,7 @@
 		}
 		mirror = conf->mirrors + disk;
 
-		read_bio = bio_clone(bio, GFP_NOIO);
+		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 
 		r10_bio->devs[slot].bio = read_bio;
 
@@ -958,7 +958,7 @@
 		if (!r10_bio->devs[i].bio)
 			continue;
 
-		mbio = bio_clone(bio, GFP_NOIO);
+		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 		r10_bio->devs[i].bio = mbio;
 
 		mbio->bi_sector	= r10_bio->devs[i].addr+
@@ -1746,7 +1746,8 @@
 					       mdname(mddev),
 					       bdevname(rdev->bdev,b),
 					       (unsigned long long)r10_bio->sector);
-				bio = bio_clone(r10_bio->master_bio, GFP_NOIO);
+				bio = bio_clone_mddev(r10_bio->master_bio,
+						      GFP_NOIO, mddev);
 				r10_bio->devs[r10_bio->read_slot].bio = bio;
 				bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
 					+ rdev->data_offset;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8abc159..dc574f3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3876,9 +3876,9 @@
 		return 0;
 	}
 	/*
-	 * use bio_clone to make a copy of the bio
+	 * use bio_clone_mddev to make a copy of the bio
 	 */
-	align_bi = bio_clone(raid_bio, GFP_NOIO);
+	align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
 	if (!align_bi)
 		return 0;
 	/*