Btrfs: crash recovery fixes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 606a19b..9eb6465 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -75,6 +75,17 @@
 	struct btrfs_node *cow_node;
 	int ret;
 
+	WARN_ON(!buffer_uptodate(buf));
+	if (trans->transaction != root->fs_info->running_transaction) {
+		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+		       root->fs_info->running_transaction->transid);
+		WARN_ON(1);
+	}
+	if (trans->transid != root->fs_info->generation) {
+		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+		       root->fs_info->generation);
+		WARN_ON(1);
+	}
 	if (btrfs_header_generation(btrfs_buffer_header(buf)) ==
 				    trans->transid) {
 		*cow_ret = buf;
@@ -107,7 +118,7 @@
 		btrfs_free_extent(trans, root, bh_blocknr(buf), 1, 1);
 	}
 	btrfs_block_release(root, buf);
-	mark_buffer_dirty(cow);
+	btrfs_mark_buffer_dirty(cow);
 	*cow_ret = cow;
 	return 0;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1998f86..0287bd5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1013,18 +1013,13 @@
 	memmove(dst, src, nr);
 }
 
-static inline void btrfs_mark_buffer_dirty(struct buffer_head *bh)
-{
-	WARN_ON(!atomic_read(&bh->b_count));
-	mark_buffer_dirty(bh);
-}
-
 /* helper function to cast into the data area of the leaf. */
 #define btrfs_item_ptr(leaf, slot, type) \
 	((type *)(btrfs_leaf_data(leaf) + \
 	btrfs_item_offset((leaf)->items + (slot))))
 
 /* extent-tree.c */
+int btrfs_copy_pinned(struct btrfs_root *root, struct radix_tree_root *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 							 btrfs_fs_info *info,
 							 u64 blocknr);
@@ -1044,8 +1039,9 @@
 		  struct buffer_head *buf);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, u64 blocknr, u64 num_blocks, int pin);
-int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
-			       btrfs_root *root);
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct radix_tree_root *unpin_radix);
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 blocknr, u64 num_blocks);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7081729..d1bf5bc 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -270,14 +270,6 @@
 	return NULL;
 }
 
-int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		     struct buffer_head *buf)
-{
-	WARN_ON(atomic_read(&buf->b_count) == 0);
-	mark_buffer_dirty(buf);
-	return 0;
-}
-
 int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		     struct buffer_head *buf)
 {
@@ -621,6 +613,20 @@
 	return 0;
 }
 
+void btrfs_mark_buffer_dirty(struct buffer_head *bh)
+{	/* run two debugging sanity checks on bh, then mark it dirty */
+	struct btrfs_root *root = BTRFS_I(bh->b_page->mapping->host)->root;
+	u64 transid = btrfs_header_generation(btrfs_buffer_header(bh));
+	WARN_ON(!atomic_read(&bh->b_count));	/* warn if b_count is zero */
+	if (transid != root->fs_info->generation) {	/* header vs fs_info */
+		printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n",
+			(unsigned long long)bh->b_blocknr, (unsigned long long)transid,
+			(unsigned long long)root->fs_info->generation);
+		WARN_ON(1);
+	}
+	mark_buffer_dirty(bh);	/* still dirtied when a check above fires */
+}
+
 void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf)
 {
 	brelse(buf);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c4a695a..9e2c261 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -78,4 +78,5 @@
 int btrfs_releasepage(struct page *page, gfp_t flags);
 void btrfs_btree_balance_dirty(struct btrfs_root *root);
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
+void btrfs_mark_buffer_dirty(struct buffer_head *bh);
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 01dc305..14b9326 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -523,6 +523,7 @@
 	}
 	return 0;
 fail:
+	WARN_ON(1);
 	for (i =0; i < faili; i++) {
 		if (leaf) {
 			u64 disk_blocknr;
@@ -572,7 +573,7 @@
 	bi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			    struct btrfs_block_group_item);
 	memcpy(bi, &cache->item, sizeof(*bi));
-	mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_release_path(extent_root, path);
 fail:
 	finish_current_insert(trans, extent_root);
@@ -739,8 +740,30 @@
 	return ret;
 }
 
-int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
-			       btrfs_root *root)
+/* Walk fs_info->pinned_radix in batches, setting each found bit in @copy. */
+int btrfs_copy_pinned(struct btrfs_root *root, struct radix_tree_root *copy)
+{
+	struct radix_tree_root *pinned = &root->fs_info->pinned_radix;
+	unsigned long found[8];
+	u64 next = 0;
+	int nr, slot;
+
+	for (;;) {
+		nr = find_first_radix_bit(pinned, found, next,
+					  ARRAY_SIZE(found));
+		if (nr == 0)
+			break;
+		for (slot = 0; slot < nr; slot++) {
+			set_radix_bit(copy, found[slot]);
+			next = found[slot] + 1;	/* next lookup starts past it */
+		}
+	}
+	return 0;	/* always 0; set_radix_bit() results are not checked */
+}
+
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct radix_tree_root *unpin_radix)
 {
 	unsigned long gang[8];
 	struct inode *btree_inode = root->fs_info->btree_inode;
@@ -752,7 +775,7 @@
 	struct radix_tree_root *extent_radix = &root->fs_info->extent_map_radix;
 
 	while(1) {
-		ret = find_first_radix_bit(pinned_radix, gang, 0,
+		ret = find_first_radix_bit(unpin_radix, gang, 0,
 					   ARRAY_SIZE(gang));
 		if (!ret)
 			break;
@@ -760,6 +783,7 @@
 			first = gang[0];
 		for (i = 0; i < ret; i++) {
 			clear_radix_bit(pinned_radix, gang[i]);
+			clear_radix_bit(unpin_radix, gang[i]);
 			block_group = btrfs_lookup_block_group(root->fs_info,
 							       gang[i]);
 			if (block_group) {
@@ -1309,6 +1333,7 @@
 	if (data) {
 		ret = find_free_extent(trans, root, 0, 0,
 				       search_end, 0, &prealloc_key, 0, 0, 0);
+		BUG_ON(ret);
 		if (ret)
 			return ret;
 		exclude_nr = info->extent_tree_prealloc_nr;
@@ -1319,6 +1344,7 @@
 	ret = find_free_extent(trans, root, num_blocks, search_start,
 			       search_end, hint_block, ins,
 			       exclude_start, exclude_nr, data);
+	BUG_ON(ret);
 	if (ret)
 		return ret;
 
@@ -1334,10 +1360,12 @@
 	if (!data) {
 		exclude_start = ins->objectid;
 		exclude_nr = ins->offset;
+		hint_block = exclude_start + exclude_nr;
 		ret = find_free_extent(trans, root, 0, search_start,
 				       search_end, hint_block,
 				       &prealloc_key, exclude_start,
 				       exclude_nr, 0);
+		BUG_ON(ret);
 		if (ret)
 			return ret;
 	}
@@ -1348,6 +1376,7 @@
 	ret = btrfs_insert_item(trans, extent_root, ins, &extent_item,
 				sizeof(extent_item));
 
+	BUG_ON(ret);
 	finish_current_insert(trans, extent_root);
 	pending_ret = del_pending_extents(trans, extent_root);
 	if (ret) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index fef7ba1..2456cc3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -127,7 +127,7 @@
 		     ptr, kaddr + bh_offset(bh),
 		     size);
 	kunmap_atomic(kaddr, KM_USER0);
-	mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
 fail:
 	btrfs_free_path(path);
 	ret = btrfs_end_transaction(trans, root);
@@ -211,11 +211,13 @@
 	int found_type;
 	int found_extent;
 	int found_inline;
+	int recow;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 	while(1) {
+		recow = 0;
 		btrfs_release_path(root, path);
 		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
 					       search_start, -1);
@@ -244,6 +246,10 @@
 		if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY) {
 			goto out;
 		}
+		if (recow) {
+			search_start = key.offset;
+			continue;
+		}
 		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
 			extent = btrfs_item_ptr(leaf, slot,
 						struct btrfs_file_extent_item);
@@ -274,6 +280,7 @@
 				nextret = btrfs_next_leaf(root, path);
 				if (nextret)
 					goto out;
+				recow = 1;
 			} else {
 				path->slots[0]++;
 			}
@@ -321,7 +328,7 @@
 				}
 				btrfs_set_file_extent_num_blocks(extent,
 								 new_num);
-				mark_buffer_dirty(path->nodes[0]);
+				btrfs_mark_buffer_dirty(path->nodes[0]);
 			} else {
 				WARN_ON(1);
 			}
@@ -452,6 +459,8 @@
 			err = -ENOMEM;
 			goto failed_release;
 		}
+		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
+		wait_on_page_writeback(pages[i]);
 	}
 
 	mutex_lock(&root->fs_info->fs_mutex);
@@ -522,8 +531,6 @@
 	mutex_unlock(&root->fs_info->fs_mutex);
 
 	for (i = 0; i < num_pages; i++) {
-		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
-		wait_on_page_writeback(pages[i]);
 		offset = pos & (PAGE_CACHE_SIZE -1);
 		this_write = min((size_t)PAGE_CACHE_SIZE - offset, write_bytes);
 		if (!page_has_buffers(pages[i])) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index eba06e7..4fc0367 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -506,7 +506,7 @@
 							 extent_num_blocks);
 				inode->i_blocks -= (orig_num_blocks -
 					extent_num_blocks) << 3;
-				mark_buffer_dirty(path->nodes[0]);
+				btrfs_mark_buffer_dirty(path->nodes[0]);
 			} else {
 				extent_start =
 					btrfs_file_extent_disk_blocknr(fi);
@@ -2020,7 +2020,7 @@
 	btrfs_set_header_owner(&leaf->header, root->root_key.objectid);
 	memcpy(leaf->header.fsid, root->fs_info->disk_super->fsid,
 	       sizeof(leaf->header.fsid));
-	mark_buffer_dirty(subvol);
+	btrfs_mark_buffer_dirty(subvol);
 
 	inode_item = &root_item.inode;
 	memset(inode_item, 0, sizeof(*inode_item));
@@ -2497,7 +2497,7 @@
 	ptr = btrfs_file_extent_inline_start(ei);
 	btrfs_memcpy(root, path->nodes[0]->b_data,
 		     ptr, symname, name_len);
-	mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_free_path(path);
 	inode->i_op = &btrfs_symlink_inode_operations;
 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3b2face..bec38ae 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -85,6 +85,8 @@
 
 	if (root != root->fs_info->tree_root && root->last_trans <
 	    running_trans_id) {
+		WARN_ON(root == root->fs_info->extent_root);
+		WARN_ON(root->ref_cows != 1);
 		if (root->root_item.refs != 0) {
 			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
 					   (unsigned long)root->root_key.objectid,
@@ -113,10 +115,11 @@
 
 	mutex_lock(&root->fs_info->trans_mutex);
 	cur_trans = root->fs_info->running_transaction;
+	WARN_ON(cur_trans != trans->transaction);
 	WARN_ON(cur_trans->num_writers < 1);
+	cur_trans->num_writers--;
 	if (waitqueue_active(&cur_trans->writer_wait))
 		wake_up(&cur_trans->writer_wait);
-	cur_trans->num_writers--;
 	put_transaction(cur_trans);
 	mutex_unlock(&root->fs_info->trans_mutex);
 	memset(trans, 0, sizeof(*trans));
@@ -194,6 +197,7 @@
 			   struct btrfs_transaction *commit)
 {
 	DEFINE_WAIT(wait);
+	mutex_lock(&root->fs_info->trans_mutex);
 	while(!commit->commit_done) {
 		prepare_to_wait(&commit->commit_wait, &wait,
 				TASK_UNINTERRUPTIBLE);
@@ -203,6 +207,7 @@
 		schedule();
 		mutex_lock(&root->fs_info->trans_mutex);
 	}
+	mutex_unlock(&root->fs_info->trans_mutex);
 	finish_wait(&commit->commit_wait, &wait);
 	return 0;
 }
@@ -279,7 +284,6 @@
 						&root->root_item);
 			if (err)
 				break;
-
 			refs = btrfs_root_refs(&tmp_item);
 			btrfs_set_root_refs(&tmp_item, refs - 1);
 			err = btrfs_update_root(trans, root->fs_info->tree_root,
@@ -333,31 +337,53 @@
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_transaction *prev_trans = NULL;
 	struct list_head dirty_fs_roots;
+	struct radix_tree_root pinned_copy;
 	DEFINE_WAIT(wait);
 
+	init_bit_radix(&pinned_copy);
 	INIT_LIST_HEAD(&dirty_fs_roots);
 
 	mutex_lock(&root->fs_info->trans_mutex);
 	if (trans->transaction->in_commit) {
 		cur_trans = trans->transaction;
 		trans->transaction->use_count++;
+		mutex_unlock(&root->fs_info->trans_mutex);
 		btrfs_end_transaction(trans, root);
+
+		mutex_unlock(&root->fs_info->fs_mutex);
 		ret = wait_for_commit(root, cur_trans);
 		BUG_ON(ret);
 		put_transaction(cur_trans);
-		mutex_unlock(&root->fs_info->trans_mutex);
+		mutex_lock(&root->fs_info->fs_mutex);
 		return 0;
 	}
-	cur_trans = trans->transaction;
 	trans->transaction->in_commit = 1;
+	cur_trans = trans->transaction;
+	if (cur_trans->list.prev != &root->fs_info->trans_list) {
+		prev_trans = list_entry(cur_trans->list.prev,
+					struct btrfs_transaction, list);
+		if (!prev_trans->commit_done) {
+			prev_trans->use_count++;
+			mutex_unlock(&root->fs_info->fs_mutex);
+			mutex_unlock(&root->fs_info->trans_mutex);
+
+			wait_for_commit(root, prev_trans);
+			put_transaction(prev_trans);
+
+			mutex_lock(&root->fs_info->fs_mutex);
+			mutex_lock(&root->fs_info->trans_mutex);
+		}
+	}
 	while (trans->transaction->num_writers > 1) {
 		WARN_ON(cur_trans != trans->transaction);
 		prepare_to_wait(&trans->transaction->writer_wait, &wait,
 				TASK_UNINTERRUPTIBLE);
 		if (trans->transaction->num_writers <= 1)
 			break;
+		mutex_unlock(&root->fs_info->fs_mutex);
 		mutex_unlock(&root->fs_info->trans_mutex);
 		schedule();
+		mutex_lock(&root->fs_info->fs_mutex);
 		mutex_lock(&root->fs_info->trans_mutex);
 		finish_wait(&trans->transaction->writer_wait, &wait);
 	}
@@ -372,34 +398,22 @@
 
 	cur_trans = root->fs_info->running_transaction;
 	root->fs_info->running_transaction = NULL;
-	if (cur_trans->list.prev != &root->fs_info->trans_list) {
-		prev_trans = list_entry(cur_trans->list.prev,
-					struct btrfs_transaction, list);
-		if (prev_trans->commit_done)
-			prev_trans = NULL;
-		else
-			prev_trans->use_count++;
-	}
 	btrfs_set_super_generation(&root->fs_info->super_copy,
 				   cur_trans->transid);
 	btrfs_set_super_root(&root->fs_info->super_copy,
 			     bh_blocknr(root->fs_info->tree_root->node));
 	memcpy(root->fs_info->disk_super, &root->fs_info->super_copy,
 	       sizeof(root->fs_info->super_copy));
+
+	btrfs_copy_pinned(root, &pinned_copy);
+
 	mutex_unlock(&root->fs_info->trans_mutex);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	ret = btrfs_write_and_wait_transaction(trans, root);
-	if (prev_trans) {
-		mutex_lock(&root->fs_info->trans_mutex);
-		wait_for_commit(root, prev_trans);
-		put_transaction(prev_trans);
-		mutex_unlock(&root->fs_info->trans_mutex);
-	}
 	BUG_ON(ret);
 	write_ctree_super(trans, root);
-
 	mutex_lock(&root->fs_info->fs_mutex);
-	btrfs_finish_extent_commit(trans, root);
+	btrfs_finish_extent_commit(trans, root, &pinned_copy);
 	mutex_lock(&root->fs_info->trans_mutex);
 	cur_trans->commit_done = 1;
 	wake_up(&cur_trans->commit_wait);