Btrfs: Drop locks in btrfs_search_slot when reading a tree block.

One lock per btree block can make for significant contention if everyone
has to wait for IO at the high levels of the btree.  This drops the locks
held by a path when reads are needed during a tree search, so other
searchers are not stuck behind the IO.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
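
Rough sketch of the idea, for readers who want the shape of the change
without walking the diff: before descending into a child block that is
not already up to date in memory, the search drops every lock it holds,
performs the slow read unlocked, and restarts from the root.  This is
not btrfs code; the node type and helpers (release_path, slow_read) are
made up for illustration, and the single-child tree keeps it small.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_LEVEL 8

struct node {
	pthread_mutex_t lock;
	bool cached;		/* stands in for "buffer is uptodate" */
	struct node *child;	/* one child keeps the sketch small   */
	int level;
};

static void release_path(struct node **path, int depth)
{
	int i;

	for (i = 0; i < depth; i++)
		pthread_mutex_unlock(&path[i]->lock);
}

static void slow_read(struct node *n)
{
	/* stands in for waiting on IO; no locks may be held here */
	n->cached = true;
}

static struct node *search(struct node *root)
{
	struct node *path[MAX_LEVEL];
	struct node *b;
	int depth;

again:
	depth = 0;
	pthread_mutex_lock(&root->lock);
	path[depth++] = root;
	b = root;

	while (b->child) {
		struct node *next = b->child;

		if (!next->cached && b->level > 1) {
			/* high in the tree: drop every lock, read, retry */
			release_path(path, depth);
			slow_read(next);
			goto again;
		}
		if (!next->cached)
			slow_read(next);	/* near the leaves, just read */

		pthread_mutex_lock(&next->lock);
		path[depth++] = next;
		b = next;
	}

	/* a real path hands locked nodes back to the caller instead */
	release_path(path, depth);
	return b;
}

int main(void)
{
	static struct node leaf = { PTHREAD_MUTEX_INITIALIZER, false, NULL, 0 };
	static struct node root = { PTHREAD_MUTEX_INITIALIZER, true, &leaf, 2 };

	printf("search ended at level %d\n", search(&root)->level);
	return 0;
}

The retry is cheap: by the time the search restarts, the block that
forced the release is already cached, so the second descent does not
block again on the same read.
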
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dff4da0..1b756fa 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -63,7 +63,6 @@
void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
{
int i;
- int skip = p->skip_locking;
int keep = p->keep_locks;
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
@@ -76,7 +75,6 @@
free_extent_buffer(p->nodes[i]);
}
memset(p, 0, sizeof(*p));
- p->skip_locking = skip;
p->keep_locks = keep;
}
@@ -1137,7 +1135,6 @@
return;
node = path->nodes[level];
- WARN_ON(!path->skip_locking && !btrfs_tree_locked(node));
search = btrfs_node_blockptr(node, slot);
blocksize = btrfs_level_size(root, level - 1);
@@ -1192,6 +1189,7 @@
{
int i;
int skip_level = level;
+ int no_skips = 0;
struct extent_buffer *t;
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -1199,27 +1197,24 @@
break;
if (!path->locks[i])
break;
- if (path->slots[i] == 0) {
+ if (!no_skips && path->slots[i] == 0) {
skip_level = i + 1;
continue;
}
- if (path->keep_locks) {
+ if (!no_skips && path->keep_locks) {
u32 nritems;
t = path->nodes[i];
nritems = btrfs_header_nritems(t);
- if (nritems < 2 || path->slots[i] >= nritems - 2) {
-if (path->keep_locks) {
-//printk("path %p skip level now %d\n", path, skip_level);
-}
+ if (nritems < 1 || path->slots[i] >= nritems - 1) {
skip_level = i + 1;
continue;
}
}
+ if (skip_level < i && i >= lowest_unlock)
+ no_skips = 1;
+
t = path->nodes[i];
if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
-if (path->keep_locks) {
-//printk("path %p unlocking level %d slot %d nritems %d skip_level %d\n", path, i, path->slots[i], btrfs_header_nritems(t), skip_level);
-}
btrfs_tree_unlock(t);
path->locks[i] = 0;
}
@@ -1244,6 +1239,7 @@
ins_len, int cow)
{
struct extent_buffer *b;
+ struct extent_buffer *tmp;
int slot;
int ret;
int level;
@@ -1263,10 +1259,7 @@
if (ins_len < 0)
lowest_unlock = 2;
again:
- if (!p->skip_locking)
- b = btrfs_lock_root_node(root);
- else
- b = btrfs_root_node(root);
+ b = btrfs_lock_root_node(root);
while (b) {
level = btrfs_header_level(b);
@@ -1286,8 +1279,7 @@
WARN_ON(1);
level = btrfs_header_level(b);
p->nodes[level] = b;
- if (!p->skip_locking)
- p->locks[level] = 1;
+ p->locks[level] = 1;
ret = check_block(root, p, level);
if (ret)
return -1;
@@ -1328,10 +1320,29 @@
reada_for_search(root, p, level, slot,
key->objectid);
- b = read_node_slot(root, b, slot);
- if (!p->skip_locking)
- btrfs_tree_lock(b);
- unlock_up(p, level + 1, lowest_unlock);
+ tmp = btrfs_find_tree_block(root,
+ btrfs_node_blockptr(b, slot),
+ btrfs_level_size(root, level - 1));
+ if (tmp && btrfs_buffer_uptodate(tmp,
+ btrfs_node_ptr_generation(b, slot))) {
+ b = tmp;
+ } else {
+ /*
+ * reduce lock contention at high levels
+ * of the btree by dropping locks before
+ * we read.
+ */
+ if (level > 1) {
+ btrfs_release_path(NULL, p);
+ if (tmp)
+ free_extent_buffer(tmp);
+ goto again;
+ } else {
+ b = read_node_slot(root, b, slot);
+ }
+ }
+ btrfs_tree_lock(b);
+ unlock_up(p, level, lowest_unlock);
} else {
p->slots[level] = slot;
if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
@@ -3007,17 +3018,8 @@
reada_for_search(root, path, level, slot, 0);
next = read_node_slot(root, c, slot);
- if (!path->skip_locking) {
- if (!btrfs_tree_locked(c)) {
- int i;
- WARN_ON(1);
-printk("path %p no lock on level %d\n", path, level);
-for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
-printk("path %p level %d slot %d nritems %d\n", path, i, path->slots[i], btrfs_header_nritems(path->nodes[i]));
-}
- }
- btrfs_tree_lock(next);
- }
+ WARN_ON(!btrfs_tree_locked(c));
+ btrfs_tree_lock(next);
break;
}
path->slots[level] = slot;
@@ -3035,10 +3037,8 @@
if (level == 1 && path->locks[1] && path->reada)
reada_for_search(root, path, level, slot, 0);
next = read_node_slot(root, next, 0);
- if (!path->skip_locking) {
- WARN_ON(!btrfs_tree_locked(path->nodes[level]));
- btrfs_tree_lock(next);
- }
+ WARN_ON(!btrfs_tree_locked(path->nodes[level]));
+ btrfs_tree_lock(next);
}
done:
unlock_up(path, 0, 1);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 692b8ea..9ea12d4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -336,7 +336,6 @@
/* keep some upper locks as we walk down */
int keep_locks;
int lowest_level;
- int skip_locking;
};
/*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f638803..ffc363d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1684,6 +1684,7 @@
#else
blk_congestion_wait(WRITE, HZ/20);
#endif
+
}
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 890b9e9..0905653 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -88,7 +88,6 @@
return -ENOMEM;
path->reada = 2;
- path->skip_locking = 1;
first_free = block_group->key.objectid;
key.objectid = block_group->key.objectid;
key.offset = 0;