From 7f9ad1cf867a445b55ef87cec3db03b35a361ba3 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 9 Sep 2021 17:11:26 +0800 Subject: [PATCH] tcmur_device: skip reporting the event if device in recovery If the device is in recovery, we can defer reporting the event in the recovery when reopening the device. And if the device is stopped or stopping we can just skip it. Just wait for the report event to finish when recoverying the device, because the recovery will close and then open the device during which the private data maybe released. And it may cause use-after-free crash in report event routine. Signed-off-by: Xiubo Li --- main.c | 14 +++++++++++++- rbd.c | 9 +++------ target.c | 6 ++++++ tcmu-runner.h | 2 +- tcmur_device.c | 28 ++++++++++++++++++++++++++-- tcmur_device.h | 3 +++ 6 files changed, 52 insertions(+), 10 deletions(-) diff --git a/main.c b/main.c index 70b42335..1ad82d06 100644 --- a/main.c +++ b/main.c @@ -1044,9 +1044,15 @@ static int dev_added(struct tcmu_device *dev) goto cleanup_format_lock; } + ret = pthread_cond_init(&rdev->report_event_cond, NULL); + if (ret) { + ret = -ret; + goto cleanup_rdev_lock; + } + ret = setup_io_work_queue(dev); if (ret < 0) - goto cleanup_rdev_lock; + goto cleanup_event_cond; ret = setup_aio_tracking(rdev); if (ret < 0) @@ -1088,6 +1094,8 @@ static int dev_added(struct tcmu_device *dev) cleanup_aio_tracking(rdev); cleanup_io_work_queue: cleanup_io_work_queue(dev, true); +cleanup_event_cond: + pthread_cond_destroy(&rdev->report_event_cond); cleanup_rdev_lock: pthread_mutex_destroy(&rdev->rdev_lock); cleanup_format_lock: @@ -1130,6 +1138,10 @@ static void dev_removed(struct tcmu_device *dev) tcmur_destroy_work(rdev->event_work); + ret = pthread_cond_destroy(&rdev->report_event_cond); + if (ret != 0) + tcmu_err("could not cleanup report event cond %d\n", ret); + ret = pthread_mutex_destroy(&rdev->rdev_lock); if (ret != 0) tcmu_err("could not cleanup state lock %d\n", ret); diff --git a/rbd.c b/rbd.c index b3ea9d6b..4d151a5d 100644 --- a/rbd.c +++ b/rbd.c @@ -146,17 +146,14 @@ static int tcmu_rbd_service_status_update(struct tcmu_device *dev, #endif /* RBD_LOCK_ACQUIRE_SUPPORT */ -static int tcmu_rbd_report_event(struct tcmu_device *dev) +static int tcmu_rbd_report_event(struct tcmu_device *dev, bool has_lock) { - struct tcmur_device *rdev = tcmu_dev_get_private(dev); - /* * We ignore the specific event and report all the current counter * values, because tools like gwcli/dashboard may not see every * update, and we do not want one event to overwrite the info. */ - return tcmu_rbd_service_status_update(dev, - rdev->lock_state == TCMUR_DEV_LOCK_WRITE_LOCKED ? true : false); + return tcmu_rbd_service_status_update(dev, has_lock); } static int tcmu_rbd_service_register(struct tcmu_device *dev) @@ -215,7 +212,7 @@ static int tcmu_rbd_service_register(struct tcmu_device *dev) goto free_meta_buf; } - ret = tcmu_rbd_report_event(dev); + ret = tcmu_rbd_report_event(dev, false); free_meta_buf: free(metadata_buf); diff --git a/target.c b/target.c index 51a0d97c..91cc2d8e 100644 --- a/target.c +++ b/target.c @@ -252,6 +252,12 @@ static void tgt_port_grp_recovery_work_fn(void *arg) */ list_for_each_safe(&tpg->devs, rdev, tmp_rdev, recovery_entry) { list_del(&rdev->recovery_entry); + + pthread_mutex_lock(&rdev->rdev_lock); + if (rdev->flags & TCMUR_DEV_FLAG_REPORTING_EVENT) + pthread_cond_wait(&rdev->report_event_cond, &rdev->rdev_lock); + pthread_mutex_unlock(&rdev->rdev_lock); + ret = __tcmu_reopen_dev(rdev->dev, -1); if (ret) { tcmu_dev_err(rdev->dev, "Could not reinitialize device. (err %d).\n", diff --git a/tcmu-runner.h b/tcmu-runner.h index 844904c6..6157d9e9 100644 --- a/tcmu-runner.h +++ b/tcmu-runner.h @@ -161,7 +161,7 @@ struct tcmur_handler { * * Return 0 on success and a -Exyz error code on error. */ - int (*report_event)(struct tcmu_device *dev); + int (*report_event)(struct tcmu_device *dev, bool has_lock); /* * If the lock is acquired and the tag is not TCMU_INVALID_LOCK_TAG, diff --git a/tcmur_device.c b/tcmur_device.c index 09b29ba9..9ee62ae6 100644 --- a/tcmur_device.c +++ b/tcmur_device.c @@ -86,8 +86,8 @@ int __tcmu_reopen_dev(struct tcmu_device *dev, int retries) rhandler->close(dev); } - pthread_mutex_lock(&rdev->rdev_lock); ret = -EIO; + pthread_mutex_lock(&rdev->rdev_lock); while (ret != 0 && !(rdev->flags & TCMUR_DEV_FLAG_STOPPING) && (retries < 0 || attempt <= retries)) { pthread_mutex_unlock(&rdev->rdev_lock); @@ -128,7 +128,10 @@ int tcmu_reopen_dev(struct tcmu_device *dev, int retries) pthread_mutex_unlock(&rdev->rdev_lock); return -EBUSY; } + rdev->flags |= TCMUR_DEV_FLAG_IN_RECOVERY; + if (rdev->flags & TCMUR_DEV_FLAG_REPORTING_EVENT) + pthread_cond_wait(&rdev->report_event_cond, &rdev->rdev_lock); pthread_mutex_unlock(&rdev->rdev_lock); return __tcmu_reopen_dev(dev, retries); @@ -159,6 +162,7 @@ static void __tcmu_report_event(void *data) struct tcmu_device *dev = data; struct tcmur_device *rdev = tcmu_dev_get_private(dev); struct tcmur_handler *rhandler = tcmu_get_runner_handler(dev); + bool has_lock; int ret; /* @@ -168,9 +172,29 @@ static void __tcmu_report_event(void *data) sleep(1); pthread_mutex_lock(&rdev->rdev_lock); - ret = rhandler->report_event(dev); + /* + * If the device is in recovery, will skip reporting the event + * this time because the event should be report when the device + * is reopening. + */ + if (rdev->flags & (TCMUR_DEV_FLAG_IN_RECOVERY | + TCMUR_DEV_FLAG_STOPPING | + TCMUR_DEV_FLAG_STOPPED)) { + pthread_mutex_unlock(&rdev->rdev_lock); + return; + } + + rdev->flags |= TCMUR_DEV_FLAG_REPORTING_EVENT; + has_lock = rdev->lock_state == TCMUR_DEV_LOCK_WRITE_LOCKED; + pthread_mutex_unlock(&rdev->rdev_lock); + + ret = rhandler->report_event(dev, has_lock); if (ret) tcmu_dev_err(dev, "Could not report events. Error %d.\n", ret); + + pthread_mutex_lock(&rdev->rdev_lock); + rdev->flags &= ~TCMUR_DEV_FLAG_REPORTING_EVENT; + pthread_cond_signal(&rdev->report_event_cond); pthread_mutex_unlock(&rdev->rdev_lock); } diff --git a/tcmur_device.h b/tcmur_device.h index cd37aabe..3c7a449e 100644 --- a/tcmur_device.h +++ b/tcmur_device.h @@ -22,6 +22,7 @@ #define TCMUR_DEV_FLAG_IS_OPEN (1 << 2) #define TCMUR_DEV_FLAG_STOPPING (1 << 3) #define TCMUR_DEV_FLAG_STOPPED (1 << 4) +#define TCMUR_DEV_FLAG_REPORTING_EVENT (1 << 5) #define TCMUR_UA_DEV_SIZE_CHANGED 0 @@ -83,6 +84,8 @@ struct tcmur_device { int cmd_time_out; + pthread_cond_t report_event_cond; + pthread_spinlock_t cmds_list_lock; /* protects cmds_list */ struct list_head cmds_list; };