[PATCH] inotify: lock avoidance with parent watch status in dentry

Previous inotify work avoidance is good when inotify is completely unused,
but it breaks down if even a single watch is in place anywhere in the
system.  Robin Holt notices that udev is one such culprit - it slows down a
512-thread application on a 512 CPU system from 6 seconds to 22 minutes.

Solve this by adding a flag in the dentry that tells inotify whether or not
its parent inode has a watch on it.  Event queueing to parent will skip
taking locks if this flag is cleared.  Setting and clearing of this flag on
all child dentries versus event delivery: this is no in terms of race
cases, and that was shown to be equivalent to always performing the check.

The essential behaviour is that activity occuring _after_ a watch has been
added and _before_ it has been removed, will generate events.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Robert Love <rml@novell.com>
Cc: John McCutchan <ttb@tentacle.dhs.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/fs/dcache.c b/fs/dcache.c
index 139e5fd..0f7ec12 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -802,6 +802,7 @@
 	if (inode)
 		list_add(&entry->d_alias, &inode->i_dentry);
 	entry->d_inode = inode;
+	fsnotify_d_instantiate(entry, inode);
 	spin_unlock(&dcache_lock);
 	security_d_instantiate(entry, inode);
 }
@@ -853,6 +854,7 @@
 	list_add(&entry->d_alias, &inode->i_dentry);
 do_negative:
 	entry->d_inode = inode;
+	fsnotify_d_instantiate(entry, inode);
 	spin_unlock(&dcache_lock);
 	security_d_instantiate(entry, inode);
 	return NULL;
@@ -983,6 +985,7 @@
 		new = __d_find_alias(inode, 1);
 		if (new) {
 			BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
+			fsnotify_d_instantiate(new, inode);
 			spin_unlock(&dcache_lock);
 			security_d_instantiate(new, inode);
 			d_rehash(dentry);
@@ -992,6 +995,7 @@
 			/* d_instantiate takes dcache_lock, so we do it by hand */
 			list_add(&dentry->d_alias, &inode->i_dentry);
 			dentry->d_inode = inode;
+			fsnotify_d_instantiate(dentry, inode);
 			spin_unlock(&dcache_lock);
 			security_d_instantiate(dentry, inode);
 			d_rehash(dentry);
@@ -1176,6 +1180,9 @@
 	spin_lock(&dentry->d_lock);
 	isdir = S_ISDIR(dentry->d_inode->i_mode);
 	if (atomic_read(&dentry->d_count) == 1) {
+		/* remove this and other inotify debug checks after 2.6.18 */
+		dentry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED;
+
 		dentry_iput(dentry);
 		fsnotify_nameremove(dentry, isdir);
 		return;
@@ -1342,6 +1349,7 @@
 
 	list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
 	spin_unlock(&target->d_lock);
+	fsnotify_d_move(dentry);
 	spin_unlock(&dentry->d_lock);
 	write_sequnlock(&rename_lock);
 	spin_unlock(&dcache_lock);
diff --git a/fs/inotify.c b/fs/inotify.c
index 0ee39ef..a61e93e 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -38,7 +38,6 @@
 #include <asm/ioctls.h>
 
 static atomic_t inotify_cookie;
-static atomic_t inotify_watches;
 
 static kmem_cache_t *watch_cachep;
 static kmem_cache_t *event_cachep;
@@ -381,6 +380,48 @@
 }
 
 /*
+ * inotify_inode_watched - returns nonzero if there are watches on this inode
+ * and zero otherwise.  We call this lockless, we do not care if we race.
+ */
+static inline int inotify_inode_watched(struct inode *inode)
+{
+	return !list_empty(&inode->inotify_watches);
+}
+
+/*
+ * Get child dentry flag into synch with parent inode.
+ * Flag should always be clear for negative dentrys.
+ */
+static void set_dentry_child_flags(struct inode *inode, int watched)
+{
+	struct dentry *alias;
+
+	spin_lock(&dcache_lock);
+	list_for_each_entry(alias, &inode->i_dentry, d_alias) {
+		struct dentry *child;
+
+		list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
+			if (!child->d_inode) {
+				WARN_ON(child->d_flags & DCACHE_INOTIFY_PARENT_WATCHED);
+				continue;
+			}
+			spin_lock(&child->d_lock);
+			if (watched) {
+				WARN_ON(child->d_flags &
+						DCACHE_INOTIFY_PARENT_WATCHED);
+				child->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
+			} else {
+				WARN_ON(!(child->d_flags &
+					DCACHE_INOTIFY_PARENT_WATCHED));
+				child->d_flags&=~DCACHE_INOTIFY_PARENT_WATCHED;
+			}
+			spin_unlock(&child->d_lock);
+		}
+	}
+	spin_unlock(&dcache_lock);
+}
+
+/*
  * create_watch - creates a watch on the given device.
  *
  * Callers must hold dev->mutex.  Calls inotify_dev_get_wd() so may sleep.
@@ -426,7 +467,6 @@
 	get_inotify_watch(watch);
 
 	atomic_inc(&dev->user->inotify_watches);
-	atomic_inc(&inotify_watches);
 
 	return watch;
 }
@@ -458,8 +498,10 @@
 	list_del(&watch->i_list);
 	list_del(&watch->d_list);
 
+	if (!inotify_inode_watched(watch->inode))
+		set_dentry_child_flags(watch->inode, 0);
+
 	atomic_dec(&dev->user->inotify_watches);
-	atomic_dec(&inotify_watches);
 	idr_remove(&dev->idr, watch->wd);
 	put_inotify_watch(watch);
 }
@@ -481,16 +523,39 @@
 	remove_watch_no_event(watch, dev);
 }
 
+/* Kernel API */
+
 /*
- * inotify_inode_watched - returns nonzero if there are watches on this inode
- * and zero otherwise.  We call this lockless, we do not care if we race.
+ * inotify_d_instantiate - instantiate dcache entry for inode
  */
-static inline int inotify_inode_watched(struct inode *inode)
+void inotify_d_instantiate(struct dentry *entry, struct inode *inode)
 {
-	return !list_empty(&inode->inotify_watches);
+	struct dentry *parent;
+
+	if (!inode)
+		return;
+
+	WARN_ON(entry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED);
+	spin_lock(&entry->d_lock);
+	parent = entry->d_parent;
+	if (inotify_inode_watched(parent->d_inode))
+		entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
+	spin_unlock(&entry->d_lock);
 }
 
-/* Kernel API */
+/*
+ * inotify_d_move - dcache entry has been moved
+ */
+void inotify_d_move(struct dentry *entry)
+{
+	struct dentry *parent;
+
+	parent = entry->d_parent;
+	if (inotify_inode_watched(parent->d_inode))
+		entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
+	else
+		entry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED;
+}
 
 /**
  * inotify_inode_queue_event - queue an event to all watches on this inode
@@ -538,7 +603,7 @@
 	struct dentry *parent;
 	struct inode *inode;
 
-	if (!atomic_read (&inotify_watches))
+	if (!(dentry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED))
 		return;
 
 	spin_lock(&dentry->d_lock);
@@ -993,6 +1058,9 @@
 		goto out;
 	}
 
+	if (!inotify_inode_watched(inode))
+		set_dentry_child_flags(inode, 1);
+
 	/* Add the watch to the device's and the inode's list */
 	list_add(&watch->d_list, &dev->watches);
 	list_add(&watch->i_list, &inode->inotify_watches);
@@ -1065,7 +1133,6 @@
 	inotify_max_user_watches = 8192;
 
 	atomic_set(&inotify_cookie, 0);
-	atomic_set(&inotify_watches, 0);
 
 	watch_cachep = kmem_cache_create("inotify_watch_cache",
 					 sizeof(struct inotify_watch),