thp: allocate memory in khugepaged outside of mmap_sem write mode

This tries to be more friendly to filesystems in userland, i.e. userland
backends that allocate memory in their I/O paths and that could deadlock
if khugepaged holds the mmap_sem of the userland backend in write mode
while allocating memory.  The memory allocation may have to wait for
writeback I/O completion from the daemon, and the daemon in turn may be
blocked in mmap_sem read mode if it takes a page fault and wasn't using
mlock for the memory required for I/O submission and completion.
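
A simplified sketch of the ordering change in the CONFIG_NUMA case
(error paths and the vma/pmd revalidation details are omitted; with
!CONFIG_NUMA the hugepage is preallocated into *hpage and only the
memcg charge moves under the read lock):

Before:

	/* khugepaged_scan_pmd() */
	up_read(&mm->mmap_sem);
	collapse_huge_page(mm, address, hpage);
		down_write(&mm->mmap_sem);
		/* ... find and revalidate the vma/pmd ... */
		new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
		mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL);

After:

	/* khugepaged_scan_pmd(), mmap_sem still held in read mode */
	collapse_huge_page(mm, address, hpage, vma);
		new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
		mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL);
		up_read(&mm->mmap_sem);
		down_write(&mm->mmap_sem);
		/* ... revalidate the vma/pmd, they may have changed ... */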

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f6559e7..bce6e12 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1664,9 +1664,9 @@
 
 static void collapse_huge_page(struct mm_struct *mm,
 			       unsigned long address,
-			       struct page **hpage)
+			       struct page **hpage,
+			       struct vm_area_struct *vma)
 {
-	struct vm_area_struct *vma;
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd, _pmd;
@@ -1680,9 +1680,34 @@
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 #ifndef CONFIG_NUMA
 	VM_BUG_ON(!*hpage);
+	new_page = *hpage;
 #else
 	VM_BUG_ON(*hpage);
+	/*
+	 * Allocate the page while the vma is still valid and under
+	 * the mmap_sem read mode so there is no memory allocation
+	 * later when we take the mmap_sem in write mode. This is more
+	 * friendly behavior (OTOH it may actually hide bugs) to
+	 * filesystems in userland with daemons allocating memory in
+	 * the userland I/O paths.  Allocating memory with the
+	 * mmap_sem in read mode is also a good idea to allow greater
+	 * scalability.
+	 */
+	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+	if (unlikely(!new_page)) {
+		up_read(&mm->mmap_sem);
+		*hpage = ERR_PTR(-ENOMEM);
+		return;
+	}
 #endif
+	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+		up_read(&mm->mmap_sem);
+		put_page(new_page);
+		return;
+	}
+
+	/* after allocating the hugepage upgrade to mmap_sem write mode */
+	up_read(&mm->mmap_sem);
 
 	/*
 	 * Prevent all access to pagetables with the exception of
@@ -1720,18 +1745,6 @@
 	if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
 		goto out;
 
-#ifndef CONFIG_NUMA
-	new_page = *hpage;
-#else
-	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
-	if (unlikely(!new_page)) {
-		*hpage = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-#endif
-	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
-		goto out_put_page;
-
 	anon_vma_lock(vma->anon_vma);
 
 	pte = pte_offset_map(pmd, address);
@@ -1759,7 +1772,7 @@
 		spin_unlock(&mm->page_table_lock);
 		anon_vma_unlock(vma->anon_vma);
 		mem_cgroup_uncharge_page(new_page);
-		goto out_put_page;
+		goto out;
 	}
 
 	/*
@@ -1798,15 +1811,15 @@
 	*hpage = NULL;
 #endif
 	khugepaged_pages_collapsed++;
-out:
+out_up_write:
 	up_write(&mm->mmap_sem);
 	return;
 
-out_put_page:
+out:
 #ifdef CONFIG_NUMA
 	put_page(new_page);
 #endif
-	goto out;
+	goto out_up_write;
 }
 
 static int khugepaged_scan_pmd(struct mm_struct *mm,
@@ -1865,10 +1878,9 @@
 		ret = 1;
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
-	if (ret) {
-		up_read(&mm->mmap_sem);
-		collapse_huge_page(mm, address, hpage);
-	}
+	if (ret)
+		/* collapse_huge_page will return with the mmap_sem released */
+		collapse_huge_page(mm, address, hpage, vma);
 out:
 	return ret;
 }
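
The resulting error-path layout of collapse_huge_page(), as a rough
outline of the code after the hunks above (only the labels and the
lock/refcount operations are shown):

	/* entered with mmap_sem held in read mode by khugepaged_scan_pmd() */
	allocate new_page (CONFIG_NUMA) or take it from *hpage (!CONFIG_NUMA);
	charge new_page to the memcg, bailing out on failure (up_read() + put_page());
	up_read(&mm->mmap_sem);
	down_write(&mm->mmap_sem);
	if (/* the vma or pmd revalidation fails */)
		goto out;
	/* ... collapse the range into new_page ... */
	khugepaged_pages_collapsed++;
out_up_write:
	up_write(&mm->mmap_sem);
	return;
out:
#ifdef CONFIG_NUMA
	put_page(new_page);	/* the !CONFIG_NUMA page stays in *hpage */
#endif
	goto out_up_write;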