
Problem: We received a report of a workload that causes an xfs task to be blocked for more than 120 seconds on log reservation via iomap_ioend completion batching. kernel: err [5636141.631454] INFO: task xfs-conv/dm-4:1788 blocked for more than 122 seconds. kernel: info [267022.728862] Workqueue: xfs-conv/dm-4 xfs_end_io [xfs] kernel: info [267022.728864] Call Trace: kernel: info [267022.728870] __schedule+0x340/0x810 kernel: info [267022.728876] schedule+0x51/0xc0 kernel: info [267022.728913] xlog_grant_head_wait+0xc7/0x200 [xfs] kernel: info [267022.728950] xlog_grant_head_check+0xd0/0x110 [xfs] kernel: info [267022.728985] xfs_log_reserve+0xc3/0x1e0 [xfs] kernel: info [267022.729023] xfs_trans_reserve+0x156/0x1b0 [xfs] kernel: info [267022.729184] xfs_trans_alloc+0xc6/0x190 [xfs] kernel: info [267022.729317] xfs_iomap_write_unwritten+0xaa/0x2c0 [xfs] kernel: info [267022.729333] ? stop_one_cpu+0x71/0xa0 kernel: info [267022.729347] ? set_cpus_allowed_ptr+0x10/0x10 kernel: info [267022.729396] xfs_end_ioend+0xc4/0x100 [xfs] kernel: info [267022.729444] ? xfs_setfilesize_ioend+0x60/0x60 [xfs] kernel: info [267022.729491] xfs_end_io+0xb9/0xe0 [xfs] kernel: info [267022.729505] process_one_work+0x1a1/0x370 kernel: info [267022.729516] rescuer_thread+0x207/0x350 kernel: info [267022.729528] ? worker_thread+0x370/0x370 kernel: info [267022.729537] kthread+0x12e/0x150 kernel: info [267022.729548] ? __kthread_cancel_work+0x40/0x40 kernel: info [267022.729559] ret_from_fork+0x1f/0x30 After that, the ssh connection to the controller hangs; after pressing Ctrl+C, it drops into a shell and the prompt displays '-sh-4.2$'. Solution: Remove the preallocated transaction from xfs append ioends to avoid the ioend completion batching log reservation deadlock. We still process the append ioend completions via the workqueue, but now let the wq task allocate the transaction, as is done for the other ioend types. 
Backport the four patches from upstream (git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git) for Debian-based StarlingX. For CentOS-based StarlingX, one additional patch, 0034-xfs-use-current-journal_info-for-detecting-transacti.patch, is taken from the stable tree (git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git, linux-5.10.y branch). The kernel for Debian-based StarlingX has already been upgraded to v5.10.152, which includes this fix, so the patch is applied only to the CentOS-based one. TestPlan: Pass: Execute the bonnie++ test on an xfs filesystem successfully, without a kernel panic or any xfs anomalies in the kernel logs. $mkfs.xfs /dev/sdc1 $mount /dev/sdc1 ~/xfstests $sudo bonnie++ -u root:root -d ~/xfstests Debian: Pass: build-pkgs -c -a Pass: build-image Pass: boot successfully with std/rt. CentOS: Pass: build-pkgs Pass: build-iso Pass: boot successfully with std/rt. Closes-Bug: 1996269 Signed-off-by: Zhixiong Chi <zhixiong.chi@windriver.com> Change-Id: I1e5b85111b2b54cd249c116724b952042f9d781f
270 lines
8.0 KiB
Diff
270 lines
8.0 KiB
Diff
From aebe2063884ef5e8a969d73d75e1086ea67d8033 Mon Sep 17 00:00:00 2001
|
|
From: Dave Chinner <dchinner@redhat.com>
|
|
Date: Sun, 3 Jul 2022 08:04:50 +0300
|
|
Subject: [PATCH 1/5] xfs: use current->journal_info for detecting transaction
|
|
recursion
|
|
|
|
commit 756b1c343333a5aefcc26b0409f3fd16f72281bf upstream.
|
|
|
|
Because the iomap code using PF_MEMALLOC_NOFS to detect transaction
|
|
recursion in XFS is just wrong. Remove it from the iomap code and
|
|
replace it with XFS specific internal checks using
|
|
current->journal_info instead.
|
|
|
|
[djwong: This change also realigns the lifetime of NOFS flag changes to
|
|
match the incore transaction, instead of the inconsistent scheme we have
|
|
now.]
|
|
|
|
Fixes: 9070733b4efa ("xfs: abstract PF_FSTRANS to PF_MEMALLOC_NOFS")
|
|
Signed-off-by: Dave Chinner <dchinner@redhat.com>
|
|
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
|
|
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
|
|
Reviewed-by: Christoph Hellwig <hch@lst.de>
|
|
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
|
|
Acked-by: Darrick J. Wong <djwong@kernel.org>
|
|
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
[commit b261cd005ab980c4018634a849f77e036bfd4f80 from
|
|
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
|
|
linux-5.10.y branch]
|
|
Signed-off-by: Zhixiong Chi <zhixiong.chi@windriver.com>
|
|
---
|
|
fs/iomap/buffered-io.c | 7 -------
|
|
fs/xfs/libxfs/xfs_btree.c | 12 ++++++++++--
|
|
fs/xfs/xfs_aops.c | 17 +++++++++++++++--
|
|
fs/xfs/xfs_trans.c | 20 +++++---------------
|
|
fs/xfs/xfs_trans.h | 30 ++++++++++++++++++++++++++++++
|
|
5 files changed, 60 insertions(+), 26 deletions(-)
|
|
|
|
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
|
|
index cd9f7baa5..47279fe00 100644
|
|
--- a/fs/iomap/buffered-io.c
|
|
+++ b/fs/iomap/buffered-io.c
|
|
@@ -1459,13 +1459,6 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
|
|
PF_MEMALLOC))
|
|
goto redirty;
|
|
|
|
- /*
|
|
- * Given that we do not allow direct reclaim to call us, we should
|
|
- * never be called in a recursive filesystem reclaim context.
|
|
- */
|
|
- if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
|
|
- goto redirty;
|
|
-
|
|
/*
|
|
* Is this page beyond the end of the file?
|
|
*
|
|
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
|
|
index 2d25bab68..fe87ecd76 100644
|
|
--- a/fs/xfs/libxfs/xfs_btree.c
|
|
+++ b/fs/xfs/libxfs/xfs_btree.c
|
|
@@ -2814,7 +2814,7 @@ xfs_btree_split_worker(
|
|
struct xfs_btree_split_args *args = container_of(work,
|
|
struct xfs_btree_split_args, work);
|
|
unsigned long pflags;
|
|
- unsigned long new_pflags = PF_MEMALLOC_NOFS;
|
|
+ unsigned long new_pflags = 0;
|
|
|
|
/*
|
|
* we are in a transaction context here, but may also be doing work
|
|
@@ -2826,12 +2826,20 @@ xfs_btree_split_worker(
|
|
new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
|
|
|
|
current_set_flags_nested(&pflags, new_pflags);
|
|
+ xfs_trans_set_context(args->cur->bc_tp);
|
|
|
|
args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
|
|
args->key, args->curp, args->stat);
|
|
- complete(args->done);
|
|
|
|
+ xfs_trans_clear_context(args->cur->bc_tp);
|
|
current_restore_flags_nested(&pflags, new_pflags);
|
|
+
|
|
+ /*
|
|
+ * Do not access args after complete() has run here. We don't own args
|
|
+ * and the owner may run and free args before we return here.
|
|
+ */
|
|
+ complete(args->done);
|
|
+
|
|
}
|
|
|
|
/*
|
|
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
|
|
index 4304c6416..b4186d666 100644
|
|
--- a/fs/xfs/xfs_aops.c
|
|
+++ b/fs/xfs/xfs_aops.c
|
|
@@ -62,7 +62,7 @@ xfs_setfilesize_trans_alloc(
|
|
* We hand off the transaction to the completion thread now, so
|
|
* clear the flag here.
|
|
*/
|
|
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
|
|
+ xfs_trans_clear_context(tp);
|
|
return 0;
|
|
}
|
|
|
|
@@ -125,7 +125,7 @@ xfs_setfilesize_ioend(
|
|
* thus we need to mark ourselves as being in a transaction manually.
|
|
* Similarly for freeze protection.
|
|
*/
|
|
- current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
|
|
+ xfs_trans_set_context(tp);
|
|
__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
|
|
|
|
/* we abort the update if there was an IO error */
|
|
@@ -568,6 +568,12 @@ xfs_vm_writepage(
|
|
{
|
|
struct xfs_writepage_ctx wpc = { };
|
|
|
|
+ if (WARN_ON_ONCE(current->journal_info)) {
|
|
+ redirty_page_for_writepage(wbc, page);
|
|
+ unlock_page(page);
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops);
|
|
}
|
|
|
|
@@ -578,6 +584,13 @@ xfs_vm_writepages(
|
|
{
|
|
struct xfs_writepage_ctx wpc = { };
|
|
|
|
+ /*
|
|
+ * Writing back data in a transaction context can result in recursive
|
|
+ * transactions. This is bad, so issue a warning and get out of here.
|
|
+ */
|
|
+ if (WARN_ON_ONCE(current->journal_info))
|
|
+ return 0;
|
|
+
|
|
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
|
|
return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
|
|
}
|
|
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
|
|
index c94e71f74..2d7deacea 100644
|
|
--- a/fs/xfs/xfs_trans.c
|
|
+++ b/fs/xfs/xfs_trans.c
|
|
@@ -68,6 +68,7 @@ xfs_trans_free(
|
|
xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
|
|
|
|
trace_xfs_trans_free(tp, _RET_IP_);
|
|
+ xfs_trans_clear_context(tp);
|
|
if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
|
|
sb_end_intwrite(tp->t_mountp->m_super);
|
|
xfs_trans_free_dqinfo(tp);
|
|
@@ -119,7 +120,8 @@ xfs_trans_dup(
|
|
|
|
ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
|
|
tp->t_rtx_res = tp->t_rtx_res_used;
|
|
- ntp->t_pflags = tp->t_pflags;
|
|
+
|
|
+ xfs_trans_switch_context(tp, ntp);
|
|
|
|
/* move deferred ops over to the new tp */
|
|
xfs_defer_move(ntp, tp);
|
|
@@ -153,9 +155,6 @@ xfs_trans_reserve(
|
|
int error = 0;
|
|
bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
|
|
|
|
- /* Mark this thread as being in a transaction */
|
|
- current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
|
|
-
|
|
/*
|
|
* Attempt to reserve the needed disk blocks by decrementing
|
|
* the number needed from the number available. This will
|
|
@@ -163,10 +162,8 @@ xfs_trans_reserve(
|
|
*/
|
|
if (blocks > 0) {
|
|
error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
|
|
- if (error != 0) {
|
|
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
|
|
+ if (error != 0)
|
|
return -ENOSPC;
|
|
- }
|
|
tp->t_blk_res += blocks;
|
|
}
|
|
|
|
@@ -240,9 +237,6 @@ xfs_trans_reserve(
|
|
xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
|
|
tp->t_blk_res = 0;
|
|
}
|
|
-
|
|
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
|
|
-
|
|
return error;
|
|
}
|
|
|
|
@@ -266,6 +260,7 @@ xfs_trans_alloc(
|
|
tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
|
|
if (!(flags & XFS_TRANS_NO_WRITECOUNT))
|
|
sb_start_intwrite(mp->m_super);
|
|
+ xfs_trans_set_context(tp);
|
|
|
|
/*
|
|
* Zero-reservation ("empty") transactions can't modify anything, so
|
|
@@ -878,7 +873,6 @@ __xfs_trans_commit(
|
|
|
|
xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
|
|
|
|
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
|
|
xfs_trans_free(tp);
|
|
|
|
/*
|
|
@@ -910,7 +904,6 @@ __xfs_trans_commit(
|
|
xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
|
|
tp->t_ticket = NULL;
|
|
}
|
|
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
|
|
xfs_trans_free_items(tp, !!error);
|
|
xfs_trans_free(tp);
|
|
|
|
@@ -970,9 +963,6 @@ xfs_trans_cancel(
|
|
tp->t_ticket = NULL;
|
|
}
|
|
|
|
- /* mark this thread as no longer being in a transaction */
|
|
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
|
|
-
|
|
xfs_trans_free_items(tp, dirty);
|
|
xfs_trans_free(tp);
|
|
}
|
|
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
|
|
index 084658946..075eeade4 100644
|
|
--- a/fs/xfs/xfs_trans.h
|
|
+++ b/fs/xfs/xfs_trans.h
|
|
@@ -268,4 +268,34 @@ xfs_trans_item_relog(
|
|
return lip->li_ops->iop_relog(lip, tp);
|
|
}
|
|
|
|
+static inline void
|
|
+xfs_trans_set_context(
|
|
+ struct xfs_trans *tp)
|
|
+{
|
|
+ ASSERT(current->journal_info == NULL);
|
|
+ tp->t_pflags = memalloc_nofs_save();
|
|
+ current->journal_info = tp;
|
|
+}
|
|
+
|
|
+static inline void
|
|
+xfs_trans_clear_context(
|
|
+ struct xfs_trans *tp)
|
|
+{
|
|
+ if (current->journal_info == tp) {
|
|
+ memalloc_nofs_restore(tp->t_pflags);
|
|
+ current->journal_info = NULL;
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline void
|
|
+xfs_trans_switch_context(
|
|
+ struct xfs_trans *old_tp,
|
|
+ struct xfs_trans *new_tp)
|
|
+{
|
|
+ ASSERT(current->journal_info == old_tp);
|
|
+ new_tp->t_pflags = old_tp->t_pflags;
|
|
+ old_tp->t_pflags = 0;
|
|
+ current->journal_info = new_tp;
|
|
+}
|
|
+
|
|
#endif /* __XFS_TRANS_H__ */
|
|
--
|
|
2.34.1
|
|
|