diff options
Diffstat (limited to 'mm/mmu_notifier.c')
-rw-r--r-- | mm/mmu_notifier.c | 147 |
1 files changed, 86 insertions, 61 deletions
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 71c78115c45..88fa54d158e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -14,10 +14,14 @@ | |||
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
16 | #include <linux/err.h> | 16 | #include <linux/err.h> |
17 | #include <linux/srcu.h> | ||
17 | #include <linux/rcupdate.h> | 18 | #include <linux/rcupdate.h> |
18 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
19 | #include <linux/slab.h> | 20 | #include <linux/slab.h> |
20 | 21 | ||
22 | /* global SRCU for all MMs */ | ||
23 | static struct srcu_struct srcu; | ||
24 | |||
21 | /* | 25 | /* |
22 | * This function can't run concurrently against mmu_notifier_register | 26 | * This function can't run concurrently against mmu_notifier_register |
23 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap | 27 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap |
@@ -25,58 +29,61 @@ | |||
25 | * in parallel despite there being no task using this mm any more, | 29 | * in parallel despite there being no task using this mm any more, |
26 | * through the vmas outside of the exit_mmap context, such as with | 30 | * through the vmas outside of the exit_mmap context, such as with |
27 | * vmtruncate. This serializes against mmu_notifier_unregister with | 31 | * vmtruncate. This serializes against mmu_notifier_unregister with |
28 | * the mmu_notifier_mm->lock in addition to RCU and it serializes | 32 | * the mmu_notifier_mm->lock in addition to SRCU and it serializes |
29 | * against the other mmu notifiers with RCU. struct mmu_notifier_mm | 33 | * against the other mmu notifiers with SRCU. struct mmu_notifier_mm |
30 | * can't go away from under us as exit_mmap holds an mm_count pin | 34 | * can't go away from under us as exit_mmap holds an mm_count pin |
31 | * itself. | 35 | * itself. |
32 | */ | 36 | */ |
33 | void __mmu_notifier_release(struct mm_struct *mm) | 37 | void __mmu_notifier_release(struct mm_struct *mm) |
34 | { | 38 | { |
35 | struct mmu_notifier *mn; | 39 | struct mmu_notifier *mn; |
36 | struct hlist_node *n; | 40 | int id; |
37 | 41 | ||
38 | /* | 42 | /* |
39 | * RCU here will block mmu_notifier_unregister until | 43 | * srcu_read_lock() here will block synchronize_srcu() in |
40 | * ->release returns. | 44 | * mmu_notifier_unregister() until all registered |
45 | * ->release() callouts this function makes have | ||
46 | * returned. | ||
41 | */ | 47 | */ |
42 | rcu_read_lock(); | 48 | id = srcu_read_lock(&srcu); |
43 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) | ||
44 | /* | ||
45 | * if ->release runs before mmu_notifier_unregister it | ||
46 | * must be handled as it's the only way for the driver | ||
47 | * to flush all existing sptes and stop the driver | ||
48 | * from establishing any more sptes before all the | ||
49 | * pages in the mm are freed. | ||
50 | */ | ||
51 | if (mn->ops->release) | ||
52 | mn->ops->release(mn, mm); | ||
53 | rcu_read_unlock(); | ||
54 | |||
55 | spin_lock(&mm->mmu_notifier_mm->lock); | 49 | spin_lock(&mm->mmu_notifier_mm->lock); |
56 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | 50 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { |
57 | mn = hlist_entry(mm->mmu_notifier_mm->list.first, | 51 | mn = hlist_entry(mm->mmu_notifier_mm->list.first, |
58 | struct mmu_notifier, | 52 | struct mmu_notifier, |
59 | hlist); | 53 | hlist); |
54 | |||
60 | /* | 55 | /* |
61 | * We arrived before mmu_notifier_unregister so | 56 | * Unlink. This will prevent mmu_notifier_unregister() |
62 | * mmu_notifier_unregister will do nothing other than | 57 | * from also making the ->release() callout. |
63 | * to wait ->release to finish and | ||
64 | * mmu_notifier_unregister to return. | ||
65 | */ | 58 | */ |
66 | hlist_del_init_rcu(&mn->hlist); | 59 | hlist_del_init_rcu(&mn->hlist); |
60 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
61 | |||
62 | /* | ||
63 | * Clear sptes. (see 'release' description in mmu_notifier.h) | ||
64 | */ | ||
65 | if (mn->ops->release) | ||
66 | mn->ops->release(mn, mm); | ||
67 | |||
68 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
67 | } | 69 | } |
68 | spin_unlock(&mm->mmu_notifier_mm->lock); | 70 | spin_unlock(&mm->mmu_notifier_mm->lock); |
69 | 71 | ||
70 | /* | 72 | /* |
71 | * synchronize_rcu here prevents mmu_notifier_release to | 73 | * All callouts to ->release() which we have done are complete. |
72 | * return to exit_mmap (which would proceed freeing all pages | 74 | * Allow synchronize_srcu() in mmu_notifier_unregister() to complete |
73 | * in the mm) until the ->release method returns, if it was | 75 | */ |
74 | * invoked by mmu_notifier_unregister. | 76 | srcu_read_unlock(&srcu, id); |
75 | * | 77 | |
76 | * The mmu_notifier_mm can't go away from under us because one | 78 | /* |
77 | * mm_count is hold by exit_mmap. | 79 | * mmu_notifier_unregister() may have unlinked a notifier and may |
80 | * still be calling out to it. Additionally, other notifiers | ||
81 | * may have been active via vmtruncate() et al. Block here | ||
82 | * to ensure that all notifier callouts for this mm have been | ||
83 | * completed and the sptes are really cleaned up before returning | ||
84 | * to exit_mmap(). | ||
78 | */ | 85 | */ |
79 | synchronize_rcu(); | 86 | synchronize_srcu(&srcu); |
80 | } | 87 | } |
81 | 88 | ||
82 | /* | 89 | /* |
@@ -89,14 +96,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | |||
89 | { | 96 | { |
90 | struct mmu_notifier *mn; | 97 | struct mmu_notifier *mn; |
91 | struct hlist_node *n; | 98 | struct hlist_node *n; |
92 | int young = 0; | 99 | int young = 0, id; |
93 | 100 | ||
94 | rcu_read_lock(); | 101 | id = srcu_read_lock(&srcu); |
95 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 102 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
96 | if (mn->ops->clear_flush_young) | 103 | if (mn->ops->clear_flush_young) |
97 | young |= mn->ops->clear_flush_young(mn, mm, address); | 104 | young |= mn->ops->clear_flush_young(mn, mm, address); |
98 | } | 105 | } |
99 | rcu_read_unlock(); | 106 | srcu_read_unlock(&srcu, id); |
100 | 107 | ||
101 | return young; | 108 | return young; |
102 | } | 109 | } |
@@ -106,9 +113,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm, | |||
106 | { | 113 | { |
107 | struct mmu_notifier *mn; | 114 | struct mmu_notifier *mn; |
108 | struct hlist_node *n; | 115 | struct hlist_node *n; |
109 | int young = 0; | 116 | int young = 0, id; |
110 | 117 | ||
111 | rcu_read_lock(); | 118 | id = srcu_read_lock(&srcu); |
112 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 119 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
113 | if (mn->ops->test_young) { | 120 | if (mn->ops->test_young) { |
114 | young = mn->ops->test_young(mn, mm, address); | 121 | young = mn->ops->test_young(mn, mm, address); |
@@ -116,7 +123,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm, | |||
116 | break; | 123 | break; |
117 | } | 124 | } |
118 | } | 125 | } |
119 | rcu_read_unlock(); | 126 | srcu_read_unlock(&srcu, id); |
120 | 127 | ||
121 | return young; | 128 | return young; |
122 | } | 129 | } |
@@ -126,8 +133,9 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, | |||
126 | { | 133 | { |
127 | struct mmu_notifier *mn; | 134 | struct mmu_notifier *mn; |
128 | struct hlist_node *n; | 135 | struct hlist_node *n; |
136 | int id; | ||
129 | 137 | ||
130 | rcu_read_lock(); | 138 | id = srcu_read_lock(&srcu); |
131 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 139 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
132 | if (mn->ops->change_pte) | 140 | if (mn->ops->change_pte) |
133 | mn->ops->change_pte(mn, mm, address, pte); | 141 | mn->ops->change_pte(mn, mm, address, pte); |
@@ -138,7 +146,7 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, | |||
138 | else if (mn->ops->invalidate_page) | 146 | else if (mn->ops->invalidate_page) |
139 | mn->ops->invalidate_page(mn, mm, address); | 147 | mn->ops->invalidate_page(mn, mm, address); |
140 | } | 148 | } |
141 | rcu_read_unlock(); | 149 | srcu_read_unlock(&srcu, id); |
142 | } | 150 | } |
143 | 151 | ||
144 | void __mmu_notifier_invalidate_page(struct mm_struct *mm, | 152 | void __mmu_notifier_invalidate_page(struct mm_struct *mm, |
@@ -146,13 +154,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm, | |||
146 | { | 154 | { |
147 | struct mmu_notifier *mn; | 155 | struct mmu_notifier *mn; |
148 | struct hlist_node *n; | 156 | struct hlist_node *n; |
157 | int id; | ||
149 | 158 | ||
150 | rcu_read_lock(); | 159 | id = srcu_read_lock(&srcu); |
151 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 160 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
152 | if (mn->ops->invalidate_page) | 161 | if (mn->ops->invalidate_page) |
153 | mn->ops->invalidate_page(mn, mm, address); | 162 | mn->ops->invalidate_page(mn, mm, address); |
154 | } | 163 | } |
155 | rcu_read_unlock(); | 164 | srcu_read_unlock(&srcu, id); |
156 | } | 165 | } |
157 | 166 | ||
158 | void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | 167 | void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, |
@@ -160,13 +169,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | |||
160 | { | 169 | { |
161 | struct mmu_notifier *mn; | 170 | struct mmu_notifier *mn; |
162 | struct hlist_node *n; | 171 | struct hlist_node *n; |
172 | int id; | ||
163 | 173 | ||
164 | rcu_read_lock(); | 174 | id = srcu_read_lock(&srcu); |
165 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 175 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
166 | if (mn->ops->invalidate_range_start) | 176 | if (mn->ops->invalidate_range_start) |
167 | mn->ops->invalidate_range_start(mn, mm, start, end); | 177 | mn->ops->invalidate_range_start(mn, mm, start, end); |
168 | } | 178 | } |
169 | rcu_read_unlock(); | 179 | srcu_read_unlock(&srcu, id); |
170 | } | 180 | } |
171 | 181 | ||
172 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | 182 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, |
@@ -174,13 +184,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | |||
174 | { | 184 | { |
175 | struct mmu_notifier *mn; | 185 | struct mmu_notifier *mn; |
176 | struct hlist_node *n; | 186 | struct hlist_node *n; |
187 | int id; | ||
177 | 188 | ||
178 | rcu_read_lock(); | 189 | id = srcu_read_lock(&srcu); |
179 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 190 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
180 | if (mn->ops->invalidate_range_end) | 191 | if (mn->ops->invalidate_range_end) |
181 | mn->ops->invalidate_range_end(mn, mm, start, end); | 192 | mn->ops->invalidate_range_end(mn, mm, start, end); |
182 | } | 193 | } |
183 | rcu_read_unlock(); | 194 | srcu_read_unlock(&srcu, id); |
184 | } | 195 | } |
185 | 196 | ||
186 | static int do_mmu_notifier_register(struct mmu_notifier *mn, | 197 | static int do_mmu_notifier_register(struct mmu_notifier *mn, |
@@ -192,6 +203,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, | |||
192 | 203 | ||
193 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | 204 | BUG_ON(atomic_read(&mm->mm_users) <= 0); |
194 | 205 | ||
206 | /* | ||
207 | * Verify that mmu_notifier_init() has already run and the global srcu is | ||
208 | * initialized. | ||
209 | */ | ||
210 | BUG_ON(!srcu.per_cpu_ref); | ||
211 | |||
195 | ret = -ENOMEM; | 212 | ret = -ENOMEM; |
196 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); | 213 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); |
197 | if (unlikely(!mmu_notifier_mm)) | 214 | if (unlikely(!mmu_notifier_mm)) |
@@ -274,8 +291,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm) | |||
274 | /* | 291 | /* |
275 | * This releases the mm_count pin automatically and frees the mm | 292 | * This releases the mm_count pin automatically and frees the mm |
276 | * structure if it was the last user of it. It serializes against | 293 | * structure if it was the last user of it. It serializes against |
277 | * running mmu notifiers with RCU and against mmu_notifier_unregister | 294 | * running mmu notifiers with SRCU and against mmu_notifier_unregister |
278 | * with the unregister lock + RCU. All sptes must be dropped before | 295 | * with the unregister lock + SRCU. All sptes must be dropped before |
279 | * calling mmu_notifier_unregister. ->release or any other notifier | 296 | * calling mmu_notifier_unregister. ->release or any other notifier |
280 | * method may be invoked concurrently with mmu_notifier_unregister, | 297 | * method may be invoked concurrently with mmu_notifier_unregister, |
281 | * and only after mmu_notifier_unregister returned we're guaranteed | 298 | * and only after mmu_notifier_unregister returned we're guaranteed |
@@ -285,35 +302,43 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
285 | { | 302 | { |
286 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | 303 | BUG_ON(atomic_read(&mm->mm_count) <= 0); |
287 | 304 | ||
305 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
288 | if (!hlist_unhashed(&mn->hlist)) { | 306 | if (!hlist_unhashed(&mn->hlist)) { |
289 | /* | 307 | int id; |
290 | * RCU here will force exit_mmap to wait ->release to finish | ||
291 | * before freeing the pages. | ||
292 | */ | ||
293 | rcu_read_lock(); | ||
294 | 308 | ||
295 | /* | 309 | /* |
296 | * exit_mmap will block in mmu_notifier_release to | 310 | * Ensure we synchronize up with __mmu_notifier_release(). |
297 | * guarantee ->release is called before freeing the | ||
298 | * pages. | ||
299 | */ | 311 | */ |
312 | id = srcu_read_lock(&srcu); | ||
313 | |||
314 | hlist_del_rcu(&mn->hlist); | ||
315 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
316 | |||
300 | if (mn->ops->release) | 317 | if (mn->ops->release) |
301 | mn->ops->release(mn, mm); | 318 | mn->ops->release(mn, mm); |
302 | rcu_read_unlock(); | ||
303 | 319 | ||
304 | spin_lock(&mm->mmu_notifier_mm->lock); | 320 | /* |
305 | hlist_del_rcu(&mn->hlist); | 321 | * Allow __mmu_notifier_release() to complete. |
322 | */ | ||
323 | srcu_read_unlock(&srcu, id); | ||
324 | } else | ||
306 | spin_unlock(&mm->mmu_notifier_mm->lock); | 325 | spin_unlock(&mm->mmu_notifier_mm->lock); |
307 | } | ||
308 | 326 | ||
309 | /* | 327 | /* |
310 | * Wait any running method to finish, of course including | 328 | * Wait for any running method to finish, including ->release() if it |
311 | * ->release if it was run by mmu_notifier_relase instead of us. | 329 | * was run by __mmu_notifier_release() instead of us. |
312 | */ | 330 | */ |
313 | synchronize_rcu(); | 331 | synchronize_srcu(&srcu); |
314 | 332 | ||
315 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | 333 | BUG_ON(atomic_read(&mm->mm_count) <= 0); |
316 | 334 | ||
317 | mmdrop(mm); | 335 | mmdrop(mm); |
318 | } | 336 | } |
319 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); | 337 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); |
338 | |||
339 | static int __init mmu_notifier_init(void) | ||
340 | { | ||
341 | return init_srcu_struct(&srcu); | ||
342 | } | ||
343 | |||
344 | module_init(mmu_notifier_init); | ||