Btrfs: Add support for online device removal

This required a few structural changes to the code that manages bdev pointers:

The VFS super block now gets an anon-bdev instead of a pointer to the
lowest bdev.  This allows us to avoid swapping the super block bdev pointer
around at run time.

The code to read in the super block no longer goes through the extent
buffer interface.  Things got ugly keeping the mapping constant.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3b6f852..33ab165 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -505,7 +505,7 @@
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block super_copy;
-	struct extent_buffer *sb_buffer;
+	struct btrfs_super_block super_for_commit;
 	struct block_device *__bdev;
 	struct super_block *sb;
 	struct inode *btree_inode;
@@ -1208,6 +1208,7 @@
 
 /* struct btrfs_super_block */
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
 			 generation, 64);
 BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fabc31b..9d5424a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -78,9 +78,13 @@
 
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
-	spin_unlock(&em_tree->lock);
-	if (em)
+	if (em) {
+		em->bdev =
+			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+		spin_unlock(&em_tree->lock);
 		goto out;
+	}
+	spin_unlock(&em_tree->lock);
 
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em) {
@@ -90,7 +94,7 @@
 	em->start = 0;
 	em->len = (u64)-1;
 	em->block_start = 0;
-	em->bdev = inode->i_sb->s_bdev;
+	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
 	spin_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
@@ -435,11 +439,6 @@
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1);
 	BUG_ON(ret);
 
-	if (offset == BTRFS_SUPER_INFO_OFFSET) {
-		bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
-		submit_bio(rw, bio);
-		return 0;
-	}
 	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num);
 }
 
@@ -587,8 +586,7 @@
 	list = &fs_info->fs_devices->devices;
 	list_for_each(next, list) {
 		device = list_entry(next, struct btrfs_device, dev_list);
-		if (device->bdev && device->bdev != fs_info->sb->s_bdev)
-			close_bdev_excl(device->bdev);
+		close_bdev_excl(device->bdev);
 		device->bdev = NULL;
 	}
 	return 0;
@@ -1118,6 +1116,7 @@
 	u32 leafsize;
 	u32 blocksize;
 	u32 stripesize;
+	struct buffer_head *bh;
 	struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root),
 						 GFP_NOFS);
 	struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root),
@@ -1153,7 +1152,6 @@
 	spin_lock_init(&fs_info->new_trans_lock);
 
 	init_completion(&fs_info->kobj_unregister);
-	sb_set_blocksize(sb, BTRFS_SUPER_INFO_SIZE);
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
 	fs_info->chunk_root = chunk_root;
@@ -1170,6 +1168,9 @@
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
 
+	sb->s_blocksize = 4096;
+	sb->s_blocksize_bits = blksize_bits(4096);
+
 	/*
 	 * we set the i_size on the btree inode to the max possible int.
 	 * the real end of the address space is determined by all of
@@ -1229,19 +1230,16 @@
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
-	fs_info->sb_buffer = read_tree_block(tree_root,
-					     BTRFS_SUPER_INFO_OFFSET,
-					     4096);
 
-	if (!fs_info->sb_buffer)
+	bh = __bread(fs_devices->latest_bdev,
+		     BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+	if (!bh)
 		goto fail_iput;
 
-	read_extent_buffer(fs_info->sb_buffer, &fs_info->super_copy, 0,
-			   sizeof(fs_info->super_copy));
+	memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
+	brelse(bh);
 
-	read_extent_buffer(fs_info->sb_buffer, fs_info->fsid,
-			   (unsigned long)btrfs_super_fsid(fs_info->sb_buffer),
-			   BTRFS_FSID_SIZE);
+	memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
 
 	disk_super = &fs_info->super_copy;
 	if (!btrfs_super_root(disk_super))
@@ -1263,7 +1261,9 @@
 	tree_root->leafsize = leafsize;
 	tree_root->sectorsize = sectorsize;
 	tree_root->stripesize = stripesize;
-	sb_set_blocksize(sb, sectorsize);
+
+	sb->s_blocksize = sectorsize;
+	sb->s_blocksize_bits = blksize_bits(sectorsize);
 
 	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
 		    sizeof(disk_super->magic))) {
@@ -1339,7 +1339,6 @@
 fail_sys_array:
 	mutex_unlock(&fs_info->fs_mutex);
 fail_sb_buffer:
-	free_extent_buffer(fs_info->sb_buffer);
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 fail_iput:
 	iput(fs_info->btree_inode);
@@ -1380,41 +1379,44 @@
 	struct list_head *cur;
 	struct list_head *head = &root->fs_info->fs_devices->devices;
 	struct btrfs_device *dev;
-	struct extent_buffer *sb;
+	struct btrfs_super_block *sb;
 	struct btrfs_dev_item *dev_item;
 	struct buffer_head *bh;
 	int ret;
 	int do_barriers;
 	int max_errors;
 	int total_errors = 0;
+	u32 crc;
+	u64 flags;
 
 	max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
 	do_barriers = !btrfs_test_opt(root, NOBARRIER);
 
-	sb = root->fs_info->sb_buffer;
-	dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
-						      dev_item);
+	sb = &root->fs_info->super_for_commit;
+	dev_item = &sb->dev_item;
 	list_for_each(cur, head) {
 		dev = list_entry(cur, struct btrfs_device, dev_list);
-		btrfs_set_device_type(sb, dev_item, dev->type);
-		btrfs_set_device_id(sb, dev_item, dev->devid);
-		btrfs_set_device_total_bytes(sb, dev_item, dev->total_bytes);
-		btrfs_set_device_bytes_used(sb, dev_item, dev->bytes_used);
-		btrfs_set_device_io_align(sb, dev_item, dev->io_align);
-		btrfs_set_device_io_width(sb, dev_item, dev->io_width);
-		btrfs_set_device_sector_size(sb, dev_item, dev->sector_size);
-		write_extent_buffer(sb, dev->uuid,
-				    (unsigned long)btrfs_device_uuid(dev_item),
-				    BTRFS_UUID_SIZE);
+		btrfs_set_stack_device_type(dev_item, dev->type);
+		btrfs_set_stack_device_id(dev_item, dev->devid);
+		btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
+		btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
+		btrfs_set_stack_device_io_align(dev_item, dev->io_align);
+		btrfs_set_stack_device_io_width(dev_item, dev->io_width);
+		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
+		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
+		flags = btrfs_super_flags(sb);
+		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
 
-		btrfs_set_header_flag(sb, BTRFS_HEADER_FLAG_WRITTEN);
-		csum_tree_block(root, sb, 0);
 
-		bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET /
-			      root->fs_info->sb->s_blocksize,
+		crc = ~(u32)0;
+		crc = btrfs_csum_data(root, (char *)sb + BTRFS_CSUM_SIZE, crc,
+				      BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
+		btrfs_csum_final(crc, sb->csum);
+
+		bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET / 4096,
 			      BTRFS_SUPER_INFO_SIZE);
 
-		read_extent_buffer(sb, bh->b_data, 0, BTRFS_SUPER_INFO_SIZE);
+		memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
 		dev->pending_io = bh;
 
 		get_bh(bh);
@@ -1483,15 +1485,6 @@
 	int ret;
 
 	ret = write_all_supers(root);
-#if 0
-	if (!btrfs_test_opt(root, NOBARRIER))
-		blkdev_issue_flush(sb->s_bdev, NULL);
-	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, super);
-	ret = sync_page_range_nolock(btree_inode, btree_inode->i_mapping,
-				     super->start, super->len);
-	if (!btrfs_test_opt(root, NOBARRIER))
-		blkdev_issue_flush(sb->s_bdev, NULL);
-#endif
 	return ret;
 }
 
@@ -1570,8 +1563,6 @@
 	if (root->fs_info->dev_root->node);
 		free_extent_buffer(root->fs_info->dev_root->node);
 
-	free_extent_buffer(fs_info->sb_buffer);
-
 	btrfs_free_block_groups(root->fs_info);
 	del_fs_roots(fs_info);
 
@@ -1652,7 +1643,7 @@
 {
 	struct backing_dev_info *bdi;
 
-	bdi = root->fs_info->sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+	bdi = &root->fs_info->bdi;
 	if (root->fs_info->throttles && bdi_write_congested(bdi)) {
 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18)
 		congestion_wait(WRITE, HZ/20);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f94794a..c0e67bd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -147,6 +147,8 @@
 	u64 end;
 	int ret;
 
+	bytenr = max_t(u64, bytenr,
+		       BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
 	block_group_cache = &info->block_group_cache;
 	ret = find_first_extent_bit(block_group_cache,
 				    bytenr, &start, &end,
@@ -1059,16 +1061,25 @@
 	}
 }
 
-static u64 reduce_alloc_profile(u64 flags)
+static u64 reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
+	u64 num_devices = root->fs_info->fs_devices->num_devices;
+
+	if (num_devices == 1)
+		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+	if (num_devices < 4)
+		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
+
 	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
 	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-		      BTRFS_BLOCK_GROUP_RAID10)))
+		      BTRFS_BLOCK_GROUP_RAID10))) {
 		flags &= ~BTRFS_BLOCK_GROUP_DUP;
+	}
 
 	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
-	    (flags & BTRFS_BLOCK_GROUP_RAID10))
+	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
 		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
+	}
 
 	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
 	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
@@ -1078,7 +1089,6 @@
 	return flags;
 }
 
-
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags)
@@ -1089,7 +1099,7 @@
 	u64 num_bytes;
 	int ret;
 
-	flags = reduce_alloc_profile(flags);
+	flags = reduce_alloc_profile(extent_root, flags);
 
 	space_info = __find_space_info(extent_root->fs_info, flags);
 	if (!space_info) {
@@ -1169,6 +1179,21 @@
 	return 0;
 }
 
+/* first block_group_cache range start >= search_start; 0 if lookup fails */
+static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
+{
+	u64 found_start;
+	u64 found_end;
+
+	if (find_first_extent_bit(&root->fs_info->block_group_cache,
+				  search_start, &found_start, &found_end,
+				  BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
+				  BLOCK_GROUP_SYSTEM))
+		return 0;
+	return found_start;
+}
+
+
 static int update_pinned_extents(struct btrfs_root *root,
 				u64 bytenr, u64 num, int pin)
 {
@@ -1185,16 +1210,25 @@
 	}
 	while (num > 0) {
 		cache = btrfs_lookup_block_group(fs_info, bytenr);
-		WARN_ON(!cache);
-		len = min(num, cache->key.offset -
-			  (bytenr - cache->key.objectid));
+		if (!cache) {
+			u64 first = first_logical_byte(root, bytenr);
+			WARN_ON(first < bytenr);
+			len = min(first - bytenr, num);
+		} else {
+			len = min(num, cache->key.offset -
+				  (bytenr - cache->key.objectid));
+		}
 		if (pin) {
-			cache->pinned += len;
-			cache->space_info->bytes_pinned += len;
+			if (cache) {
+				cache->pinned += len;
+				cache->space_info->bytes_pinned += len;
+			}
 			fs_info->total_pinned += len;
 		} else {
-			cache->pinned -= len;
-			cache->space_info->bytes_pinned -= len;
+			if (cache) {
+				cache->pinned -= len;
+				cache->space_info->bytes_pinned -= len;
+			}
 			fs_info->total_pinned -= len;
 		}
 		bytenr += len;
@@ -1547,7 +1581,7 @@
 				     int data)
 {
 	int ret;
-	u64 orig_search_start = search_start;
+	u64 orig_search_start;
 	struct btrfs_root * root = orig_root->fs_info->extent_root;
 	struct btrfs_fs_info *info = root->fs_info;
 	u64 total_needed = num_bytes;
@@ -1577,6 +1611,9 @@
 		}
 	}
 
+	search_start = max(search_start, first_logical_byte(root, 0));
+	orig_search_start = search_start;
+
 	if (search_end == (u64)-1)
 		search_end = btrfs_super_total_bytes(&info->super_copy);
 
@@ -1751,7 +1788,7 @@
 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
 	}
 again:
-	data = reduce_alloc_profile(data);
+	data = reduce_alloc_profile(root, data);
 	if (root->ref_cows) {
 		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
@@ -2309,6 +2346,7 @@
 	struct file_ra_state *ra;
 	unsigned long total_read = 0;
 	unsigned long ra_pages;
+	struct btrfs_trans_handle *trans;
 
 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
 
@@ -2326,9 +2364,13 @@
 				       calc_ra(i, last_index, ra_pages));
 		}
 		total_read++;
+		if (((u64)i << PAGE_CACHE_SHIFT) > inode->i_size)
+			goto truncate_racing;
+
 		page = grab_cache_page(inode->i_mapping, i);
-		if (!page)
+		if (!page) {
 			goto out_unlock;
+		}
 		if (!PageUptodate(page)) {
 			btrfs_readpage(NULL, page);
 			lock_page(page);
@@ -2350,20 +2392,33 @@
 
 		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
-		set_page_dirty(page);
 		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);
+		set_page_dirty(page);
 
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		unlock_page(page);
 		page_cache_release(page);
-		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
 	}
+	balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+					   total_read);
 
 out_unlock:
 	kfree(ra);
+	trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
+	if (trans) {
+		btrfs_add_ordered_inode(inode);
+		btrfs_end_transaction(trans, BTRFS_I(inode)->root);
+		mark_inode_dirty(inode);
+	}
 	mutex_unlock(&inode->i_mutex);
 	return 0;
+
+truncate_racing:
+	vmtruncate(inode, inode->i_size);
+	balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+					   total_read);
+	goto out_unlock;
 }
 
 /*
@@ -2466,6 +2521,27 @@
 	return 0;
 }
 
+/*
+ * Delete the item at extent_key from the extent_root tree inside its own
+ * transaction.  A positive btrfs_search_slot() result becomes -EIO.
+ */
+static int noinline del_extent_zero(struct btrfs_root *extent_root,
+				    struct btrfs_path *path,
+				    struct btrfs_key *extent_key)
+{
+	struct btrfs_trans_handle *trans;
+	int err;
+
+	trans = btrfs_start_transaction(extent_root, 1);
+	err = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
+	if (err > 0)
+		err = -EIO;	/* presumably: no exact match for the key */
+	else if (err == 0)
+		err = btrfs_del_item(trans, extent_root, path);
+	btrfs_end_transaction(trans, extent_root);
+	return err;
+}
+
 static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 					struct btrfs_path *path,
 					struct btrfs_key *extent_key)
@@ -2477,6 +2553,10 @@
 	u32 item_size;
 	int ret = 0;
 
+	if (extent_key->objectid == 0) {
+		ret = del_extent_zero(extent_root, path, extent_key);
+		goto out;
+	}
 	key.objectid = extent_key->objectid;
 	key.type = BTRFS_EXTENT_REF_KEY;
 	key.offset = 0;
@@ -2490,15 +2570,24 @@
 		ret = 0;
 		leaf = path->nodes[0];
 		nritems = btrfs_header_nritems(leaf);
-		if (path->slots[0] == nritems)
-			goto out;
+		if (path->slots[0] == nritems) {
+			ret = btrfs_next_leaf(extent_root, path);
+			if (ret > 0) {
+				ret = 0;
+				goto out;
+			}
+			if (ret < 0)
+				goto out;
+		}
 
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (found_key.objectid != extent_key->objectid)
+		if (found_key.objectid != extent_key->objectid) {
 			break;
+		}
 
-		if (found_key.type != BTRFS_EXTENT_REF_KEY)
+		if (found_key.type != BTRFS_EXTENT_REF_KEY) {
 			break;
+		}
 
 		key.offset = found_key.offset + 1;
 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -2519,7 +2608,7 @@
 	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 
-	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy);
+	num_devices = root->fs_info->fs_devices->num_devices;
 	if (num_devices == 1) {
 		stripped |= BTRFS_BLOCK_GROUP_DUP;
 		stripped = flags & ~stripped;
@@ -2535,9 +2624,6 @@
 		return flags;
 	} else {
 		/* they already had raid on here, just return */
-		if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
-		    (flags & BTRFS_BLOCK_GROUP_RAID1)) {
-		}
 		if (flags & stripped)
 			return flags;
 
@@ -2570,7 +2656,7 @@
 	struct extent_buffer *leaf;
 	u32 nritems;
 	int ret;
-	int progress = 0;
+	int progress;
 
 	shrink_block_group = btrfs_lookup_block_group(root->fs_info,
 						      shrink_start);
@@ -2597,6 +2683,7 @@
 	shrink_block_group->ro = 1;
 
 	total_found = 0;
+	progress = 0;
 	key.objectid = shrink_start;
 	key.offset = 0;
 	key.type = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f7beb9b..b437d3b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2194,6 +2194,8 @@
 again:
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
+	if (em)
+		em->bdev = root->fs_info->fs_devices->latest_bdev;
 	spin_unlock(&em_tree->lock);
 
 	if (em) {
@@ -2212,7 +2214,7 @@
 
 	em->start = EXTENT_MAP_HOLE;
 	em->len = (u64)-1;
-	em->bdev = inode->i_sb->s_bdev;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	ret = btrfs_lookup_file_extent(trans, root, path,
 				       objectid, start, trans != NULL);
 	if (ret < 0) {
@@ -3101,6 +3103,27 @@
 	return ret;
 }
 
+/* BTRFS_IOC_RM_DEV: copy the vol args from userspace and remove the device */
+long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_vol_args *vol_args;
+	int ret;
+
+	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+	if (!vol_args)
+		return -ENOMEM;
+	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
+		ret = -EFAULT;
+		goto out;
+	}
+	/* name comes from userspace (assumed char array): force a NUL */
+	vol_args->name[sizeof(vol_args->name) - 1] = '\0';
+	ret = btrfs_rm_device(root, vol_args->name);
+out:
+	kfree(vol_args);
+	return ret;
+}
+
 int dup_item_to_inode(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       struct btrfs_path *path,
@@ -3294,6 +3317,8 @@
 		return btrfs_ioctl_resize(root, (void __user *)arg);
 	case BTRFS_IOC_ADD_DEV:
 		return btrfs_ioctl_add_dev(root, (void __user *)arg);
+	case BTRFS_IOC_RM_DEV:
+		return btrfs_ioctl_rm_dev(root, (void __user *)arg);
 	case BTRFS_IOC_BALANCE:
 		return btrfs_balance(root->fs_info->dev_root);
 	case BTRFS_IOC_CLONE:
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 7153dfa..020e5a8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -315,24 +315,12 @@
 	sb->s_dirt = 0;
 }
 
-/*
- * This is almost a copy of get_sb_bdev in fs/super.c.
- * We need the local copy to allow direct mounting of
- * subvolumes, but this could be easily integrated back
- * into the generic version.  --hch
- */
-
-/* start copy & paste */
-static int set_bdev_super(struct super_block *s, void *data)
+static int btrfs_test_super(struct super_block *s, void *data)
 {
-	s->s_bdev = data;
-	s->s_dev = s->s_bdev->bd_dev;
-	return 0;
-}
+	struct btrfs_fs_devices *test_fs_devices = data;
+	struct btrfs_root *root = btrfs_sb(s);
 
-static int test_bdev_super(struct super_block *s, void *data)
-{
-	return (void *)s->s_bdev == data;
+	return root->fs_info->fs_devices == test_fs_devices;
 }
 
 int btrfs_get_sb_bdev(struct file_system_type *fs_type,
@@ -354,14 +342,9 @@
 		return error;
 
 	bdev = fs_devices->lowest_bdev;
-	/*
-	 * once the super is inserted into the list by sget, s_umount
-	 * will protect the lockfs code from trying to start a snapshot
-	 * while we are mounting
-	 */
-	down(&bdev->bd_mount_sem);
-	s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
-	up(&bdev->bd_mount_sem);
+	btrfs_lock_volumes();
+	s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
+	btrfs_unlock_volumes();
 	if (IS_ERR(s))
 		goto error_s;
 
@@ -373,13 +356,11 @@
 			goto error_bdev;
 		}
 
-		close_bdev_excl(bdev);
 	} else {
 		char b[BDEVNAME_SIZE];
 
 		s->s_flags = flags;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
-		sb_set_blocksize(s, block_size(bdev));
 		error = btrfs_fill_super(s, fs_devices, data,
 					 flags & MS_SILENT ? 1 : 0);
 		if (error) {
@@ -458,7 +439,7 @@
 	.owner		= THIS_MODULE,
 	.name		= "btrfs",
 	.get_sb		= btrfs_get_sb,
-	.kill_sb	= kill_block_super,
+	.kill_sb	= kill_anon_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9826942..57746c1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -738,9 +738,8 @@
 				   chunk_root->node->start);
 	btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
 					 btrfs_header_level(chunk_root->node));
-	write_extent_buffer(root->fs_info->sb_buffer,
-			    &root->fs_info->super_copy, 0,
-			    sizeof(root->fs_info->super_copy));
+	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
+	       sizeof(root->fs_info->super_copy));
 
 	btrfs_copy_pinned(root, pinned_copy);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b381875..55da5f0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -45,6 +45,16 @@
 static DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
 
+void btrfs_lock_volumes(void)
+{
+	mutex_lock(&uuid_mutex);	/* uuid_mutex is static to this file */
+}
+
+void btrfs_unlock_volumes(void)
+{
+	mutex_unlock(&uuid_mutex);	/* pairs with btrfs_lock_volumes() */
+}
+
 int btrfs_cleanup_fs_uuids(void)
 {
 	struct btrfs_fs_devices *fs_devices;
@@ -193,12 +203,14 @@
 			ret = PTR_ERR(bdev);
 			goto fail;
 		}
+		set_blocksize(bdev, 4096);
 		if (device->devid == fs_devices->latest_devid)
 			fs_devices->latest_bdev = bdev;
 		if (device->devid == fs_devices->lowest_devid) {
 			fs_devices->lowest_bdev = bdev;
 		}
 		device->bdev = bdev;
+
 	}
 	mutex_unlock(&uuid_mutex);
 	return 0;
@@ -393,6 +405,9 @@
 	struct btrfs_path *path;
 	struct btrfs_root *root = device->dev_root;
 	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf = NULL;
+	struct btrfs_dev_extent *extent = NULL;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -403,8 +418,25 @@
 	key.type = BTRFS_DEV_EXTENT_KEY;
 
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0) {
+		ret = btrfs_previous_item(root, path, key.objectid,
+					  BTRFS_DEV_EXTENT_KEY);
+		BUG_ON(ret);
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		extent = btrfs_item_ptr(leaf, path->slots[0],
+					struct btrfs_dev_extent);
+		BUG_ON(found_key.offset > start || found_key.offset +
+		       btrfs_dev_extent_length(leaf, extent) < start);
+		ret = 0;
+	} else if (ret == 0) {
+		leaf = path->nodes[0];
+		extent = btrfs_item_ptr(leaf, path->slots[0],
+					struct btrfs_dev_extent);
+	}
 	BUG_ON(ret);
 
+	device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
 	ret = btrfs_del_item(trans, root, path);
 	BUG_ON(ret);
 
@@ -593,6 +625,170 @@
 	return ret;
 }
 
+/*
+ * Delete the dev item of device from the chunk tree, unlink the device from
+ * the in-memory lists and subtract it from the totals kept in super_copy.
+ */
+static int btrfs_rm_dev_item(struct btrfs_root *root,
+			     struct btrfs_device *device)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_fs_devices *fs_devices;
+	struct btrfs_super_block *super;
+	struct btrfs_device *next_dev;
+	struct block_device *bdev = device->bdev;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	u64 old_bytes;
+	u64 old_count;
+	int ret;
+
+	root = root->fs_info->chunk_root;
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.type = BTRFS_DEV_ITEM_KEY;
+	key.offset = device->devid;
+
+	trans = btrfs_start_transaction(root, 1);
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0)
+		ret = -ENOENT;
+	if (ret)
+		goto out;
+
+	ret = btrfs_del_item(trans, root, path);
+	if (ret)
+		goto out;
+
+	/*
+	 * the device is zero sized by now: unlink it and repoint any cached
+	 * bdev that still refers to it at the new head of the devices list
+	 */
+	list_del_init(&device->dev_list);
+	list_del_init(&device->dev_alloc_list);
+	fs_devices = root->fs_info->fs_devices;
+	next_dev = list_entry(fs_devices->devices.next, struct btrfs_device,
+			      dev_list);
+	if (fs_devices->lowest_bdev == bdev)
+		fs_devices->lowest_bdev = next_dev->bdev;
+	if (root->fs_info->sb->s_bdev == bdev)
+		root->fs_info->sb->s_bdev = next_dev->bdev;
+	if (fs_devices->latest_bdev == bdev)
+		fs_devices->latest_bdev = next_dev->bdev;
+
+	super = &root->fs_info->super_copy;
+	old_bytes = btrfs_super_total_bytes(super);
+	btrfs_set_super_total_bytes(super, old_bytes - device->total_bytes);
+	old_count = btrfs_super_num_devices(super);
+	btrfs_set_super_num_devices(super, old_count - 1);
+out:
+	/* the transaction is committed on the error paths as well */
+	btrfs_free_path(path);
+	btrfs_commit_transaction(trans, root);
+	return ret;
+}
+
+/*
+ * Remove the device at device_path from the mounted filesystem: shrink it to
+ * zero, delete its dev item and clear its super block magic.  0 or -errno.
+ */
+int btrfs_rm_device(struct btrfs_root *root, char *device_path)
+{
+	struct btrfs_device *device;
+	struct block_device *bdev;
+	struct buffer_head *bh;
+	struct btrfs_super_block *disk_super;
+	u64 all_avail;
+	u64 devid;
+	int ret = 0;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	mutex_lock(&uuid_mutex);
+
+	all_avail = root->fs_info->avail_data_alloc_bits |
+		root->fs_info->avail_system_alloc_bits |
+		root->fs_info->avail_metadata_alloc_bits;
+
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
+	    root->fs_info->fs_devices->num_devices <= 4) {
+		printk("btrfs: unable to go below four devices on raid10\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
+	    root->fs_info->fs_devices->num_devices <= 2) {
+		printk("btrfs: unable to go below two devices on raid1\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
+	if (IS_ERR(bdev)) {
+		ret = PTR_ERR(bdev);
+		goto out;
+	}
+
+	bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+	if (!bh) {
+		ret = -EIO;
+		goto error_close;
+	}
+	disk_super = (struct btrfs_super_block *)bh->b_data;
+	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
+		    sizeof(disk_super->magic)) ||
+	    memcmp(disk_super->fsid, root->fs_info->fsid, BTRFS_FSID_SIZE)) {
+		ret = -ENOENT;
+		goto error_brelse;
+	}
+	devid = le64_to_cpu(disk_super->dev_item.devid);
+	device = btrfs_find_device(root, devid, NULL);
+	if (!device) {
+		ret = -ENOENT;
+		goto error_brelse;
+	}
+
+	/* lowered before the shrink; error_undo restores it on failure */
+	root->fs_info->fs_devices->num_devices--;
+
+	ret = btrfs_shrink_device(device, 0);
+	if (ret)
+		goto error_undo;
+
+	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
+	if (ret)
+		goto error_undo;
+
+	/* make sure this device isn't detected as part of the FS anymore */
+	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+	set_buffer_dirty(bh);
+	sync_dirty_buffer(bh);
+	brelse(bh);
+
+	/* one close for the device struct or super_block */
+	close_bdev_excl(device->bdev);
+	/* one close for us */
+	close_bdev_excl(device->bdev);
+
+	kfree(device->name);
+	kfree(device);
+	goto out;
+
+error_undo:
+	root->fs_info->fs_devices->num_devices++;
+error_brelse:
+	brelse(bh);
+error_close:
+	close_bdev_excl(bdev);
+out:
+	mutex_unlock(&uuid_mutex);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 {
 	struct btrfs_trans_handle *trans;
@@ -831,13 +1027,17 @@
 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
 	spin_unlock(&em_tree->lock);
 
-	BUG_ON(em->start > chunk_offset || em->start + em->len < chunk_offset);
+	BUG_ON(em->start > chunk_offset ||
+	       em->start + em->len < chunk_offset);
 	map = (struct map_lookup *)em->bdev;
 
 	for (i = 0; i < map->num_stripes; i++) {
 		ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
 					    map->stripes[i].physical);
 		BUG_ON(ret);
+
+		ret = btrfs_update_device(trans, map->stripes[i].dev);
+		BUG_ON(ret);
 	}
 	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
 			       chunk_offset);
@@ -847,11 +1047,8 @@
 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
 		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
 		BUG_ON(ret);
-		goto out;
 	}
 
-
-
 	spin_lock(&em_tree->lock);
 	remove_extent_mapping(em_tree, em);
 	kfree(map);
@@ -861,7 +1058,6 @@
 	free_extent_map(em);
 	spin_unlock(&em_tree->lock);
 
-out:
 	/* once for us */
 	free_extent_map(em);
 
@@ -1449,7 +1645,7 @@
 		return 0;
 
 	if (!em) {
-		printk("unable to find logical %Lu\n", logical);
+		printk("unable to find logical %Lu len %Lu\n", logical, *length);
 		BUG();
 	}
 
@@ -1712,6 +1908,7 @@
 
 	logical = key->offset;
 	length = btrfs_chunk_length(leaf, chunk);
+
 	spin_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
 	spin_unlock(&map_tree->map_tree.lock);
@@ -1845,7 +2042,7 @@
 int btrfs_read_sys_array(struct btrfs_root *root)
 {
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
-	struct extent_buffer *sb = root->fs_info->sb_buffer;
+	struct extent_buffer *sb;
 	struct btrfs_disk_key *disk_key;
 	struct btrfs_chunk *chunk;
 	u8 *ptr;
@@ -1857,6 +2054,12 @@
 	u32 cur;
 	struct btrfs_key key;
 
+	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
+					  BTRFS_SUPER_INFO_SIZE);
+	if (!sb)
+		return -ENOMEM;
+	btrfs_set_buffer_uptodate(sb);
+	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
 	array_size = btrfs_super_sys_array_size(super_copy);
 
 	ptr = super_copy->sys_chunk_array;
@@ -1867,8 +2070,7 @@
 		disk_key = (struct btrfs_disk_key *)ptr;
 		btrfs_disk_key_to_cpu(&key, disk_key);
 
-		len = sizeof(*disk_key);
-		ptr += len;
+		len = sizeof(*disk_key); ptr += len;
 		sb_ptr += len;
 		cur += len;
 
@@ -1887,6 +2089,7 @@
 		sb_ptr += len;
 		cur += len;
 	}
+	free_extent_buffer(sb);
 	return ret;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index a9663e92..0f94a69 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -125,6 +125,7 @@
 int btrfs_add_device(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root,
 		     struct btrfs_device *device);
+int btrfs_rm_device(struct btrfs_root *root, char *device_path);
 int btrfs_cleanup_fs_uuids(void);
 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
@@ -136,4 +137,6 @@
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
 int btrfs_balance(struct btrfs_root *dev_root);
+void btrfs_unlock_volumes(void);
+void btrfs_lock_volumes(void);
 #endif