author      Omar Sandoval          2019-09-16 13:30:53 -0500
committer   Greg Kroah-Hartman     2019-12-31 05:37:44 -0600
commit      6d52fb75cd543ae3f5ff443294ce3bef56bb12fe (patch)
tree        8cfb4803d79f647cb4990e5dd82f0602faa43dfe
parent      9e5ae20bb9b5e37d9ec07fe7933e14b4bc19f75f (diff)
btrfs: don't prematurely free work in run_ordered_work()
[ Upstream commit c495dcd6fbe1dce51811a76bb85b4675f6494938 ]

We hit the following very strange deadlock on a system with Btrfs on a
loop device backed by another Btrfs filesystem:

1. The top (loop device) filesystem queues an async_cow work item from
   cow_file_range_async(). We'll call this work X.
2. Worker thread A starts work X (normal_work_helper()).
3. Worker thread A executes the ordered work for the top filesystem
   (run_ordered_work()).
4. Worker thread A finishes the ordered work for work X and frees X
   (work->ordered_free()).
5. Worker thread A executes another ordered work and gets blocked on I/O
   to the bottom filesystem (still in run_ordered_work()).
6. Meanwhile, the bottom filesystem allocates and queues an async_cow
   work item which happens to be the recently-freed X.
7. The workqueue code sees that X is already being executed by worker
   thread A, so it schedules X to be executed _after_ worker thread A
   finishes (see the find_worker_executing_work() call in
   process_one_work()).

Now, the top filesystem is waiting for I/O on the bottom filesystem, but
the bottom filesystem is waiting for the top filesystem to finish, so we
deadlock.

This happens because we are breaking the workqueue assumption that a
work item cannot be recycled while it still depends on other work. Fix
it by waiting to free the work item until we are done with all of the
related ordered work.

P.S.:

One might ask why the workqueue code doesn't try to detect a recycled
work item. It actually does try by checking whether the work item has
the same work function (find_worker_executing_work()), but in our case
the function is the same. This is the only key that the workqueue code
has available to compare, short of adding an additional, layer-violating
"custom key". Considering that we're the only ones that have ever hit
this, we should just play by the rules.

Unfortunately, we haven't been able to create a minimal reproducer other
than our full container setup using a compress-force=zstd filesystem on
top of another compress-force=zstd filesystem.

Suggested-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
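A minimal sketch of why the workqueue cannot tell a recycled work item apart from the one still being executed, as described in the P.S. above. The struct and helper names here are hypothetical and simplified (this is not the kernel's struct work_struct or find_worker_executing_work()); it only illustrates that the item's address and its work function are the only keys available for comparison:

	#include <stdbool.h>

	/* Hypothetical, simplified work item; not the kernel's struct work_struct. */
	struct work_item {
		void (*func)(struct work_item *work);
	};

	/*
	 * Conceptually what the reentrancy check keys on: the item's address and
	 * its work function. A freed work item that is reallocated at the same
	 * address with the same work function passes this test, so its execution
	 * is deferred until the "running" item finishes -- the dependency loop
	 * described in the commit message.
	 */
	static bool looks_like_same_work(const struct work_item *running,
					 const struct work_item *queued)
	{
		return running == queued && running->func == queued->func;
	}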
-rw-r--r--  fs/btrfs/async-thread.c | 56
1 file changed, 44 insertions(+), 12 deletions(-)
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index e00c8a9fd5bb..72d7589072f5 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -265,16 +265,17 @@ out:
 	}
 }
 
-static void run_ordered_work(struct __btrfs_workqueue *wq)
+static void run_ordered_work(struct __btrfs_workqueue *wq,
+			     struct btrfs_work *self)
 {
 	struct list_head *list = &wq->ordered_list;
 	struct btrfs_work *work;
 	spinlock_t *lock = &wq->list_lock;
 	unsigned long flags;
+	void *wtag;
+	bool free_self = false;
 
 	while (1) {
-		void *wtag;
-
 		spin_lock_irqsave(lock, flags);
 		if (list_empty(list))
 			break;
@@ -300,16 +301,47 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
 		list_del(&work->ordered_list);
 		spin_unlock_irqrestore(lock, flags);
 
-		/*
-		 * We don't want to call the ordered free functions with the
-		 * lock held though. Save the work as tag for the trace event,
-		 * because the callback could free the structure.
-		 */
-		wtag = work;
-		work->ordered_free(work);
-		trace_btrfs_all_work_done(wq->fs_info, wtag);
+		if (work == self) {
+			/*
+			 * This is the work item that the worker is currently
+			 * executing.
+			 *
+			 * The kernel workqueue code guarantees non-reentrancy
+			 * of work items. I.e., if a work item with the same
+			 * address and work function is queued twice, the second
+			 * execution is blocked until the first one finishes. A
+			 * work item may be freed and recycled with the same
+			 * work function; the workqueue code assumes that the
+			 * original work item cannot depend on the recycled work
+			 * item in that case (see find_worker_executing_work()).
+			 *
+			 * Note that the work of one Btrfs filesystem may depend
+			 * on the work of another Btrfs filesystem via, e.g., a
+			 * loop device. Therefore, we must not allow the current
+			 * work item to be recycled until we are really done,
+			 * otherwise we break the above assumption and can
+			 * deadlock.
+			 */
+			free_self = true;
+		} else {
+			/*
+			 * We don't want to call the ordered free functions with
+			 * the lock held though. Save the work as tag for the
+			 * trace event, because the callback could free the
+			 * structure.
+			 */
+			wtag = work;
+			work->ordered_free(work);
+			trace_btrfs_all_work_done(wq->fs_info, wtag);
+		}
 	}
 	spin_unlock_irqrestore(lock, flags);
+
+	if (free_self) {
+		wtag = self;
+		self->ordered_free(self);
+		trace_btrfs_all_work_done(wq->fs_info, wtag);
+	}
 }
 
 static void normal_work_helper(struct btrfs_work *work)
@@ -337,7 +369,7 @@ static void normal_work_helper(struct btrfs_work *work)
 	work->func(work);
 	if (need_order) {
 		set_bit(WORK_DONE_BIT, &work->flags);
-		run_ordered_work(wq);
+		run_ordered_work(wq, work);
 	}
 	if (!need_order)
 		trace_btrfs_all_work_done(wq->fs_info, wtag);
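For reference, a condensed userspace sketch of the deferred-free pattern the patch introduces (the struct, field, and function names here are illustrative only, not kernel API, and the locking is omitted): while draining the ordered list, the item the worker is currently executing is only flagged, and its free callback runs after every other ordered item has been handled, so its memory cannot be recycled while dependent ordered work is still pending.

	#include <stdbool.h>
	#include <stddef.h>

	/* Hypothetical, simplified ordered-work item. */
	struct item {
		struct item *next;
		void (*free_fn)(struct item *item);
	};

	static void process_ordered(struct item **list, struct item *self)
	{
		bool free_self = false;
		struct item *work;

		while ((work = *list) != NULL) {
			*list = work->next;

			if (work == self) {
				/* Defer; freeing now would allow recycling. */
				free_self = true;
			} else {
				work->free_fn(work);
			}
		}

		/* Only free the currently executing item once everything else is done. */
		if (free_self)
			self->free_fn(self);
	}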