ANDROID: Make SPF aware of fast mremaps
SPF attempts page faults without taking the mmap lock, but takes the
PTL. If there is a concurrent fast mremap (at PMD/PUD level), this
can lead to a UAF as fast mremap will only take the PTL locks at the
PMD/PUD level. SPF cannot take the PTL locks at the larger subtree
granularity since this introduces much contention in the page fault
paths.
To address the race:
1) Fast mremaps wait until there are no users of the VMA.
2) Speculative faults detect ongoing fast mremaps and fallback
to conventional fault handling (taking mmap read lock).
Since this race condition is very rare, the performance impact is
negligible.
Bug: 263177905
Change-Id: If9755aa4261337fe180e3093a3cefaae8ac9ff1a
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
This commit is contained in:
@@ -3380,6 +3380,8 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
|
||||
extern wait_queue_head_t vma_users_wait;
|
||||
extern atomic_t vma_user_waiters;
|
||||
|
||||
bool __pte_map_lock(struct vm_fault *vmf);
|
||||
|
||||
|
||||
28
mm/memory.c
28
mm/memory.c
@@ -217,6 +217,23 @@ struct vm_area_struct *get_vma(struct mm_struct *mm, unsigned long addr)
|
||||
|
||||
rcu_read_lock();
|
||||
vma = find_vma_from_tree(mm, addr);
|
||||
|
||||
/*
|
||||
* atomic_inc_unless_negative() also protects from races with
|
||||
* fast mremap.
|
||||
*
|
||||
* If there is a concurrent fast mremap, bail out since the entire
|
||||
* PMD/PUD subtree may have been remapped.
|
||||
*
|
||||
* This is usually safe for conventional mremap since it takes the
|
||||
* PTE locks as does SPF. However fast mremap only takes the lock
|
||||
* at the PMD/PUD level which is ok as it is done with the mmap
|
||||
 * write lock held. But since SPF, as the term implies, forgoes
|
||||
* taking the mmap read lock and also cannot take PTL lock at the
|
||||
 * larger PMD/PUD granularity, since it would introduce huge
|
||||
* contention in the page fault path; fall back to regular fault
|
||||
* handling.
|
||||
*/
|
||||
if (vma) {
|
||||
if (vma->vm_start > addr ||
|
||||
!atomic_inc_unless_negative(&vma->file_ref_count))
|
||||
@@ -232,7 +249,16 @@ void put_vma(struct vm_area_struct *vma)
|
||||
int new_ref_count;
|
||||
|
||||
new_ref_count = atomic_dec_return(&vma->file_ref_count);
|
||||
if (new_ref_count < 0)
|
||||
|
||||
/*
|
||||
* Implicit smp_mb due to atomic_dec_return.
|
||||
*
|
||||
* If this is the last reference, wake up the mremap waiter
|
||||
* (if any).
|
||||
*/
|
||||
if (new_ref_count == 0 && unlikely(atomic_read(&vma_user_waiters) > 0))
|
||||
wake_up(&vma_users_wait);
|
||||
else if (new_ref_count < 0)
|
||||
vm_area_free_no_check(vma);
|
||||
}
|
||||
|
||||
|
||||
106
mm/mremap.c
106
mm/mremap.c
@@ -219,17 +219,77 @@ static inline bool arch_supports_page_table_move(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
|
||||
DECLARE_WAIT_QUEUE_HEAD(vma_users_wait);
|
||||
atomic_t vma_user_waiters = ATOMIC_INIT(0);
|
||||
|
||||
static inline void wait_for_vma_users(struct vm_area_struct *vma)
|
||||
{
|
||||
/*
|
||||
* If we have the only reference, swap the refcount to -1. This
|
||||
* will prevent other concurrent references by get_vma() for SPFs.
|
||||
*/
|
||||
if (likely(atomic_cmpxchg(&vma->file_ref_count, 0, -1) == 0))
|
||||
return;
|
||||
|
||||
/* Indicate we are waiting for other users of the VMA to finish. */
|
||||
atomic_inc(&vma_user_waiters);
|
||||
|
||||
/* Failed atomic_cmpxchg; no implicit barrier, use an explicit one. */
|
||||
smp_mb();
|
||||
|
||||
/*
|
||||
 * Callers cannot handle failure; sleep uninterruptibly until there
|
||||
* are no other users of this VMA.
|
||||
*
|
||||
* We don't need to worry about references from concurrent waiters,
|
||||
* since this is only used in the context of fast mremaps, with
|
||||
* exclusive mmap write lock held.
|
||||
*/
|
||||
wait_event(vma_users_wait, atomic_cmpxchg(&vma->file_ref_count, 0, -1) == 0);
|
||||
|
||||
atomic_dec(&vma_user_waiters);
|
||||
}
|
||||
|
||||
/*
|
||||
* Restore the VMA reference count to 0 after a fast mremap.
|
||||
*/
|
||||
static inline void restore_vma_ref_count(struct vm_area_struct *vma)
|
||||
{
|
||||
/*
|
||||
 * This should only be called after a corresponding
|
||||
* wait_for_vma_users()
|
||||
*/
|
||||
VM_BUG_ON_VMA(atomic_cmpxchg(&vma->file_ref_count, -1, 0) != -1,
|
||||
vma);
|
||||
}
|
||||
#else /* !CONFIG_SPECULATIVE_PAGE_FAULT */
|
||||
static inline void wait_for_vma_users(struct vm_area_struct *vma)
|
||||
{
|
||||
}
|
||||
static inline void restore_vma_ref_count(struct vm_area_struct *vma)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
|
||||
|
||||
/*
|
||||
* Speculative page fault handlers will not detect page table changes done
|
||||
* without ptl locking.
|
||||
*/
|
||||
#if defined(CONFIG_HAVE_MOVE_PMD) && !defined(CONFIG_SPECULATIVE_PAGE_FAULT)
|
||||
#ifdef CONFIG_HAVE_MOVE_PMD
|
||||
static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
|
||||
unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
|
||||
{
|
||||
spinlock_t *old_ptl, *new_ptl;
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
pmd_t pmd;
|
||||
bool ret;
|
||||
|
||||
/*
|
||||
* Wait for concurrent users, since these can potentially be
|
||||
* speculative page faults.
|
||||
*/
|
||||
wait_for_vma_users(vma);
|
||||
|
||||
if (!arch_supports_page_table_move())
|
||||
return false;
|
||||
@@ -256,8 +316,10 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
|
||||
* One alternative might be to just unmap the target pmd at
|
||||
* this point, and verify that it really is empty. We'll see.
|
||||
*/
|
||||
if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
|
||||
return false;
|
||||
if (WARN_ON_ONCE(!pmd_none(*new_pmd))) {
|
||||
ret = false;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* We don't have to worry about the ordering of src and dst
|
||||
@@ -280,7 +342,11 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
|
||||
spin_unlock(new_ptl);
|
||||
spin_unlock(old_ptl);
|
||||
|
||||
return true;
|
||||
ret = true;
|
||||
|
||||
out:
|
||||
restore_vma_ref_count(vma);
|
||||
return ret;
|
||||
}
|
||||
#else
|
||||
static inline bool move_normal_pmd(struct vm_area_struct *vma,
|
||||
@@ -291,27 +357,33 @@ static inline bool move_normal_pmd(struct vm_area_struct *vma,
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Speculative page fault handlers will not detect page table changes done
|
||||
* without ptl locking.
|
||||
*/
|
||||
#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD) && \
|
||||
!defined(CONFIG_SPECULATIVE_PAGE_FAULT)
|
||||
#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
|
||||
static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
|
||||
unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
|
||||
{
|
||||
spinlock_t *old_ptl, *new_ptl;
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
pud_t pud;
|
||||
bool ret;
|
||||
|
||||
if (!arch_supports_page_table_move())
|
||||
return false;
|
||||
/*
|
||||
* Wait for concurrent users, since these can potentially be
|
||||
* speculative page faults.
|
||||
*/
|
||||
wait_for_vma_users(vma);
|
||||
|
||||
if (!arch_supports_page_table_move()) {
|
||||
ret = false;
|
||||
goto out;
|
||||
}
|
||||
/*
|
||||
* The destination pud shouldn't be established, free_pgtables()
|
||||
* should have released it.
|
||||
*/
|
||||
if (WARN_ON_ONCE(!pud_none(*new_pud)))
|
||||
return false;
|
||||
if (WARN_ON_ONCE(!pud_none(*new_pud))) {
|
||||
ret = false;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* We don't have to worry about the ordering of src and dst
|
||||
@@ -334,7 +406,11 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
|
||||
spin_unlock(new_ptl);
|
||||
spin_unlock(old_ptl);
|
||||
|
||||
return true;
|
||||
ret = true;
|
||||
|
||||
out:
|
||||
restore_vma_ref_count(vma);
|
||||
return ret;
|
||||
}
|
||||
#else
|
||||
static inline bool move_normal_pud(struct vm_area_struct *vma,
|
||||
|
||||
Reference in New Issue
Block a user