From 1a9c7a1a25fd6e7d78efede58e852f2baf2c7990 Mon Sep 17 00:00:00 2001 From: hselasky Date: Mon, 26 Mar 2018 20:36:32 +0000 Subject: MFC r330646: Fix race between PCI error handlers and health work in mlx5core. linux commit 05ac2c0b7438ea08c5d54b48797acf9b22cb2f6f Submitted by: Matthew Finlay Sponsored by: Mellanox Technologies --- sys/dev/mlx5/driver.h | 4 ++++ sys/dev/mlx5/mlx5_core/mlx5_health.c | 24 +++++++++++++++++++++++- sys/dev/mlx5/mlx5_core/mlx5_main.c | 7 ++++++- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/sys/dev/mlx5/driver.h b/sys/dev/mlx5/driver.h index 1f6520a..b5cce4a 100644 --- a/sys/dev/mlx5/driver.h +++ b/sys/dev/mlx5/driver.h @@ -482,7 +482,10 @@ struct mlx5_core_health { u32 prev; int miss_counter; bool sick; + /* wq spinlock to synchronize draining */ + spinlock_t wq_lock; struct workqueue_struct *wq; + unsigned long flags; struct work_struct work; }; @@ -871,6 +874,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev); int mlx5_health_init(struct mlx5_core_dev *dev); void mlx5_start_health_poll(struct mlx5_core_dev *dev); void mlx5_stop_health_poll(struct mlx5_core_dev *dev); +void mlx5_drain_health_wq(struct mlx5_core_dev *dev); #define mlx5_buf_alloc_node(dev, size, direct, buf, node) \ mlx5_buf_alloc(dev, size, direct, buf) diff --git a/sys/dev/mlx5/mlx5_core/mlx5_health.c b/sys/dev/mlx5/mlx5_core/mlx5_health.c index eb77ea2..4e84a34 100644 --- a/sys/dev/mlx5/mlx5_core/mlx5_health.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_health.c @@ -43,6 +43,10 @@ enum { MLX5_NIC_IFC_NO_DRAM_NIC = 2 }; +enum { + MLX5_DROP_NEW_HEALTH_WORK, +}; + static u8 get_nic_interface(struct mlx5_core_dev *dev) { return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3; @@ -242,7 +246,13 @@ static void poll_health(unsigned long data) if (in_fatal(dev) && !health->sick) { health->sick = true; print_health_info(dev); - queue_work(health->wq, &health->work); + spin_lock(&health->wq_lock); + if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags)) + queue_work(health->wq, &health->work); + else + dev_err(&dev->pdev->dev, + "new health works are not permitted at this stage\n"); + spin_unlock(&health->wq_lock); } } @@ -252,6 +262,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev) init_timer(&health->timer); health->sick = 0; + clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); health->health = &dev->iseg->health; health->health_counter = &dev->iseg->health_counter; @@ -267,6 +278,16 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev) del_timer_sync(&health->timer); } +void mlx5_drain_health_wq(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + + spin_lock(&health->wq_lock); + set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); + spin_unlock(&health->wq_lock); + cancel_work_sync(&health->work); +} + void mlx5_health_cleanup(struct mlx5_core_dev *dev) { struct mlx5_core_health *health = &dev->priv.health; @@ -293,6 +314,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev) if (!health->wq) return -ENOMEM; + spin_lock_init(&health->wq_lock); INIT_WORK(&health->work, health_care); return 0; diff --git a/sys/dev/mlx5/mlx5_core/mlx5_main.c b/sys/dev/mlx5/mlx5_core/mlx5_main.c index ddf70c3..a8ac5b8 100644 --- a/sys/dev/mlx5/mlx5_core/mlx5_main.c +++ b/sys/dev/mlx5/mlx5_core/mlx5_main.c @@ -1243,7 +1243,12 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, dev_info(&pdev->dev, "%s was called\n", __func__); mlx5_enter_error_state(dev); mlx5_unload_one(dev, priv); - mlx5_pci_disable_device(dev); + if (state) { + pci_save_state(pdev->dev.bsddev); + mlx5_drain_health_wq(dev); + mlx5_pci_disable_device(dev); + } + return state == pci_channel_io_perm_failure ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; } -- cgit v1.1