mirror of
https://kernel.googlesource.com/pub/scm/linux/kernel/git/stable/linux-stable.git
synced 2025-09-14 11:19:08 +10:00
net/mlx5: Fix lockdep assertion on sync reset unload event
Fix lockdep assertion triggered during sync reset unload event. When the
sync reset flow is initiated using the devlink reload fw_activate
option, the PF already holds the devlink lock while handling unload
event. In this case, delegate sync reset unload event handling back to
the devlink callback process to avoid double-locking and resolve the
lockdep warning.
Kernel log:
WARNING: CPU: 9 PID: 1578 at devl_assert_locked+0x31/0x40
[...]
Call Trace:
<TASK>
mlx5_unload_one_devl_locked+0x2c/0xc0 [mlx5_core]
mlx5_sync_reset_unload_event+0xaf/0x2f0 [mlx5_core]
process_one_work+0x222/0x640
worker_thread+0x199/0x350
kthread+0x10b/0x230
? __pfx_worker_thread+0x10/0x10
? __pfx_kthread+0x10/0x10
ret_from_fork+0x8e/0x100
? __pfx_kthread+0x10/0x10
ret_from_fork_asm+0x1a/0x30
</TASK>
Fixes: 7a9770f1bf ("net/mlx5: Handle sync reset unload event")
Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250825143435.598584-7-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
parent 34cc6a5491
commit 902a8bc23a
@ -160,7 +160,7 @@ static int mlx5_devlink_reload_fw_activate(struct devlink *devlink, struct netli
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
mlx5_unload_one_devl_locked(dev, false);
|
||||
mlx5_sync_reset_unload_flow(dev, true);
|
||||
err = mlx5_health_wait_pci_up(dev);
|
||||
if (err)
|
||||
NL_SET_ERR_MSG_MOD(extack, "FW activate aborted, PCI reads fail after reset");
|
||||
|
@ -12,7 +12,8 @@ enum {
|
||||
MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST,
|
||||
MLX5_FW_RESET_FLAGS_PENDING_COMP,
|
||||
MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS,
|
||||
MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED
|
||||
MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED,
|
||||
MLX5_FW_RESET_FLAGS_UNLOAD_EVENT,
|
||||
};
|
||||
|
||||
struct mlx5_fw_reset {
|
||||
@ -219,7 +220,7 @@ int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev)
|
||||
return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL0, 0, 0, false);
|
||||
}
|
||||
|
||||
static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev, bool unloaded)
|
||||
static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
|
||||
{
|
||||
struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
|
||||
struct devlink *devlink = priv_to_devlink(dev);
|
||||
@ -228,8 +229,7 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev, bool unload
|
||||
if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
|
||||
complete(&fw_reset->done);
|
||||
} else {
|
||||
if (!unloaded)
|
||||
mlx5_unload_one(dev, false);
|
||||
mlx5_sync_reset_unload_flow(dev, false);
|
||||
if (mlx5_health_wait_pci_up(dev))
|
||||
mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n");
|
||||
else
|
||||
@ -272,7 +272,7 @@ static void mlx5_sync_reset_reload_work(struct work_struct *work)
|
||||
|
||||
mlx5_sync_reset_clear_reset_requested(dev, false);
|
||||
mlx5_enter_error_state(dev, true);
|
||||
mlx5_fw_reset_complete_reload(dev, false);
|
||||
mlx5_fw_reset_complete_reload(dev);
|
||||
}
|
||||
|
||||
#define MLX5_RESET_POLL_INTERVAL (HZ / 10)
|
||||
@ -586,65 +586,23 @@ static int mlx5_sync_pci_reset(struct mlx5_core_dev *dev, u8 reset_method)
|
||||
return err;
|
||||
}
|
||||
|
||||
static void mlx5_sync_reset_now_event(struct work_struct *work)
|
||||
void mlx5_sync_reset_unload_flow(struct mlx5_core_dev *dev, bool locked)
|
||||
{
|
||||
struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
|
||||
reset_now_work);
|
||||
struct mlx5_core_dev *dev = fw_reset->dev;
|
||||
int err;
|
||||
|
||||
if (mlx5_sync_reset_clear_reset_requested(dev, false))
|
||||
return;
|
||||
|
||||
mlx5_core_warn(dev, "Sync Reset now. Device is going to reset.\n");
|
||||
|
||||
err = mlx5_cmd_fast_teardown_hca(dev);
|
||||
if (err) {
|
||||
mlx5_core_warn(dev, "Fast teardown failed, no reset done, err %d\n", err);
|
||||
goto done;
|
||||
}
|
||||
|
||||
err = mlx5_sync_pci_reset(dev, fw_reset->reset_method);
|
||||
if (err) {
|
||||
mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, no reset done, err %d\n", err);
|
||||
set_bit(MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, &fw_reset->reset_flags);
|
||||
}
|
||||
|
||||
mlx5_enter_error_state(dev, true);
|
||||
done:
|
||||
fw_reset->ret = err;
|
||||
mlx5_fw_reset_complete_reload(dev, false);
|
||||
}
|
||||
|
||||
static void mlx5_sync_reset_unload_event(struct work_struct *work)
|
||||
{
|
||||
struct mlx5_fw_reset *fw_reset;
|
||||
struct mlx5_core_dev *dev;
|
||||
struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
|
||||
unsigned long timeout;
|
||||
int poll_freq = 20;
|
||||
bool reset_action;
|
||||
u8 rst_state;
|
||||
int err;
|
||||
|
||||
fw_reset = container_of(work, struct mlx5_fw_reset, reset_unload_work);
|
||||
dev = fw_reset->dev;
|
||||
|
||||
if (mlx5_sync_reset_clear_reset_requested(dev, false))
|
||||
return;
|
||||
|
||||
mlx5_core_warn(dev, "Sync Reset Unload. Function is forced down.\n");
|
||||
|
||||
err = mlx5_cmd_fast_teardown_hca(dev);
|
||||
if (err)
|
||||
mlx5_core_warn(dev, "Fast teardown failed, unloading, err %d\n", err);
|
||||
else
|
||||
mlx5_enter_error_state(dev, true);
|
||||
|
||||
if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags))
|
||||
if (locked)
|
||||
mlx5_unload_one_devl_locked(dev, false);
|
||||
else
|
||||
mlx5_unload_one(dev, false);
|
||||
|
||||
if (!test_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags))
|
||||
return;
|
||||
|
||||
mlx5_set_fw_rst_ack(dev);
|
||||
mlx5_core_warn(dev, "Sync Reset Unload done, device reset expected\n");
|
||||
|
||||
@ -672,17 +630,73 @@ static void mlx5_sync_reset_unload_event(struct work_struct *work)
|
||||
goto done;
|
||||
}
|
||||
|
||||
mlx5_core_warn(dev, "Sync Reset, got reset action. rst_state = %u\n", rst_state);
|
||||
mlx5_core_warn(dev, "Sync Reset, got reset action. rst_state = %u\n",
|
||||
rst_state);
|
||||
if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ) {
|
||||
err = mlx5_sync_pci_reset(dev, fw_reset->reset_method);
|
||||
if (err) {
|
||||
mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, err %d\n", err);
|
||||
mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, err %d\n",
|
||||
err);
|
||||
fw_reset->ret = err;
|
||||
}
|
||||
}
|
||||
|
||||
done:
|
||||
mlx5_fw_reset_complete_reload(dev, true);
|
||||
clear_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags);
|
||||
}
|
||||
|
||||
static void mlx5_sync_reset_now_event(struct work_struct *work)
|
||||
{
|
||||
struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
|
||||
reset_now_work);
|
||||
struct mlx5_core_dev *dev = fw_reset->dev;
|
||||
int err;
|
||||
|
||||
if (mlx5_sync_reset_clear_reset_requested(dev, false))
|
||||
return;
|
||||
|
||||
mlx5_core_warn(dev, "Sync Reset now. Device is going to reset.\n");
|
||||
|
||||
err = mlx5_cmd_fast_teardown_hca(dev);
|
||||
if (err) {
|
||||
mlx5_core_warn(dev, "Fast teardown failed, no reset done, err %d\n", err);
|
||||
goto done;
|
||||
}
|
||||
|
||||
err = mlx5_sync_pci_reset(dev, fw_reset->reset_method);
|
||||
if (err) {
|
||||
mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, no reset done, err %d\n", err);
|
||||
set_bit(MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, &fw_reset->reset_flags);
|
||||
}
|
||||
|
||||
mlx5_enter_error_state(dev, true);
|
||||
done:
|
||||
fw_reset->ret = err;
|
||||
mlx5_fw_reset_complete_reload(dev);
|
||||
}
|
||||
|
||||
static void mlx5_sync_reset_unload_event(struct work_struct *work)
|
||||
{
|
||||
struct mlx5_fw_reset *fw_reset;
|
||||
struct mlx5_core_dev *dev;
|
||||
int err;
|
||||
|
||||
fw_reset = container_of(work, struct mlx5_fw_reset, reset_unload_work);
|
||||
dev = fw_reset->dev;
|
||||
|
||||
if (mlx5_sync_reset_clear_reset_requested(dev, false))
|
||||
return;
|
||||
|
||||
set_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags);
|
||||
mlx5_core_warn(dev, "Sync Reset Unload. Function is forced down.\n");
|
||||
|
||||
err = mlx5_cmd_fast_teardown_hca(dev);
|
||||
if (err)
|
||||
mlx5_core_warn(dev, "Fast teardown failed, unloading, err %d\n", err);
|
||||
else
|
||||
mlx5_enter_error_state(dev, true);
|
||||
|
||||
mlx5_fw_reset_complete_reload(dev);
|
||||
}
|
||||
|
||||
static void mlx5_sync_reset_abort_event(struct work_struct *work)
|
||||
|
@ -12,6 +12,7 @@ int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel,
|
||||
int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev);
|
||||
|
||||
int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev);
|
||||
void mlx5_sync_reset_unload_flow(struct mlx5_core_dev *dev, bool locked);
|
||||
int mlx5_fw_reset_verify_fw_complete(struct mlx5_core_dev *dev,
|
||||
struct netlink_ext_ack *extack);
|
||||
void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev);
|
||||
|
Loading…
Reference in New Issue
Block a user