 bf940a84c8
			
		
	
	bf940a84c8
	
	
	
		
			
			We encountered an unexpected system reboot during a stress test. Based on the kernel warning backtrace, a race condition in the RCU subsystem appears to have caused this issue. It is similar to the issue reported at https://lore.kernel.org/all/20210917211148.GU4156@paulmck-ThinkPad-P17-Gen-1/#t where Guillaume Morin applied two patches and could no longer reproduce the problem. The first patch, commit 2431774f04 [rcu: Mark accesses to rcu_state.n_force_qs], is already in the upstream Linux kernel. The second patch, https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git/commit/?h=rcu/next&id=325a2030b90376d179a129794e2fae2b24d73923 [rcu: Tighten rcu_advance_cbs_nowake() checks], was provided by Paul E. McKenney and has not yet been merged into mainline (by default it is slated for the v5.17 merge window). Basically, the rcu_advance_cbs() == true warning in rcu_advance_cbs_nowake() fires, and then everything eventually gets stuck on RCU synchronization. WARNING: CPU: 35 PID: 2743975 at kernel/rcu/tree.c:1589 rcu_advance_cbs_nowake+0x78/0x80 ...... Call Trace: call_rcu+0x173/0x5c0 task_work_run+0x6d/0xa0 exit_to_user_mode_prepare+0x130/0x140 syscall_exit_to_user_mode+0x27/0x1d0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Verification: The formal regression tests were carried out by colleagues in the test team at Wind River; they include userspace package tests, LTP and POSIX tests, basic networking tests, etc. Closes-Bug: #1952710 Signed-off-by: Jiping Ma <jiping.ma2@windriver.com> Change-Id: I4ee7d3d007edced81c9ac43c5850f941f1d393ee
		
			
				
	
	
		
			65 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Diff
		
	
	
	
	
	
			
		
		
	
	
			65 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Diff
		
	
	
	
	
	
| From e21cab0b7f1a6634ee57950600b68f6df960e3f5 Mon Sep 17 00:00:00 2001
 | |
| From: "Paul E. McKenney" <paulmck@kernel.org>
 | |
| Date: Tue, 20 Jul 2021 06:16:27 -0700
 | |
| Subject: [PATCH] rcu: Mark accesses to rcu_state.n_force_qs
 | |
| 
 | |
| This commit marks accesses to the rcu_state.n_force_qs.  These data
 | |
| races are hard to make happen, but syzkaller was equal to the task.
 | |
| 
 | |
| Reported-by: syzbot+e08a83a1940ec3846cd5@syzkaller.appspotmail.com
 | |
| Acked-by: Marco Elver <elver@google.com>
 | |
| Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
 | |
| (cherry picked from commit 2431774f04d1050292054c763070021bade7b151)
 | |
| Signed-off-by: Jiping Ma <jiping.ma2@windriver.com>
 | |
| ---
 | |
|  kernel/rcu/tree.c | 10 +++++-----
 | |
|  1 file changed, 5 insertions(+), 5 deletions(-)
 | |
| 
 | |
| diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
 | |
| index 4cc5af7f2b91..30cb9af8284c 100644
 | |
| --- a/kernel/rcu/tree.c
 | |
| +++ b/kernel/rcu/tree.c
 | |
| @@ -1890,7 +1890,7 @@ static void rcu_gp_fqs(bool first_time)
 | |
|  	struct rcu_node *rnp = rcu_get_root();
 | |
|  
 | |
|  	WRITE_ONCE(rcu_state.gp_activity, jiffies);
 | |
| -	rcu_state.n_force_qs++;
 | |
| +	WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);
 | |
|  	if (first_time) {
 | |
|  		/* Collect dyntick-idle snapshots. */
 | |
|  		force_qs_rnp(dyntick_save_progress_counter);
 | |
| @@ -2532,7 +2532,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
 | |
|  	/* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
 | |
|  	if (count == 0 && rdp->qlen_last_fqs_check != 0) {
 | |
|  		rdp->qlen_last_fqs_check = 0;
 | |
| -		rdp->n_force_qs_snap = rcu_state.n_force_qs;
 | |
| +		rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
 | |
|  	} else if (count < rdp->qlen_last_fqs_check - qhimark)
 | |
|  		rdp->qlen_last_fqs_check = count;
 | |
|  
 | |
| @@ -2878,10 +2878,10 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
 | |
|  		} else {
 | |
|  			/* Give the grace period a kick. */
 | |
|  			rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
 | |
| -			if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
 | |
| +			if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap &&
 | |
|  			    rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
 | |
|  				rcu_force_quiescent_state();
 | |
| -			rdp->n_force_qs_snap = rcu_state.n_force_qs;
 | |
| +			rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
 | |
|  			rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
 | |
|  		}
 | |
|  	}
 | |
| @@ -3988,7 +3988,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
 | |
|  	/* Set up local state, ensuring consistent view of global state. */
 | |
|  	raw_spin_lock_irqsave_rcu_node(rnp, flags);
 | |
|  	rdp->qlen_last_fqs_check = 0;
 | |
| -	rdp->n_force_qs_snap = rcu_state.n_force_qs;
 | |
| +	rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
 | |
|  	rdp->blimit = blimit;
 | |
|  	if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
 | |
|  	    !rcu_segcblist_is_offloaded(&rdp->cblist))
 | |
| -- 
 | |
| 2.31.1
 | |
| 
 |