From 5637abcbea871f3fd7711faea7bd501fe27c75fd Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Mon, 26 Jan 2026 13:53:00 +0800 Subject: [PATCH] DAOS-18361 chk: handle CHK engine side inconsistency in parallel On CHK engine side, most of inconsistencies can be handled in parallel. For each of them, create dedicated ULT to handle the inconsistency and report (including interaction) to CHK leader independently. So even if some ULT was blocked for some reason, such as waiting for interaction, it will not affect the other inconsistencies to be handled in parallel. Test-tag: recovery Signed-off-by: Fan Yong --- src/chk/chk_common.c | 34 ++- src/chk/chk_engine.c | 544 +++++++++++++++++++++++++++----------- src/chk/chk_internal.h | 13 + src/chk/chk_leader.c | 58 ++-- src/tests/suite/daos_cr.c | 115 ++++++++ 5 files changed, 582 insertions(+), 182 deletions(-) diff --git a/src/chk/chk_common.c b/src/chk/chk_common.c index adf5d068523..8790e4c02ed 100644 --- a/src/chk/chk_common.c +++ b/src/chk/chk_common.c @@ -76,6 +76,7 @@ chk_pool_alloc(struct btr_instance *tins, d_iov_t *key_iov, d_iov_t *val_iov, D_INIT_LIST_HEAD(&cpr->cpr_shutdown_link); D_INIT_LIST_HEAD(&cpr->cpr_shard_list); D_INIT_LIST_HEAD(&cpr->cpr_pending_list); + D_INIT_LIST_HEAD(&cpr->cpr_ult_list); cpr->cpr_refs = 1; uuid_copy(cpr->cpr_uuid, cpb->cpb_uuid); cpr->cpr_thread = ABT_THREAD_NULL; @@ -930,6 +931,27 @@ chk_pool_shard_cleanup(struct chk_instance *ins) } } +int +chk_pending_lookup(struct chk_instance *ins, uint64_t seq, struct chk_pending_rec **cpr) +{ + d_iov_t kiov; + d_iov_t riov; + int rc; + + d_iov_set(&riov, NULL, 0); + d_iov_set(&kiov, &seq, sizeof(seq)); + + ABT_rwlock_rdlock(ins->ci_abt_lock); + rc = dbtree_lookup(ins->ci_pending_hdl, &kiov, &riov); + ABT_rwlock_unlock(ins->ci_abt_lock); + if (rc == 0) + *cpr = (struct chk_pending_rec *)riov.iov_buf; + else + *cpr = NULL; + + return rc; +} + int chk_pending_add(struct chk_instance *ins, d_list_t *pool_head, d_list_t *rank_head, uuid_t uuid, uint64_t seq, uint32_t rank, uint32_t cla, uint32_t option_nr, uint32_t *options, @@ -985,12 +1007,14 @@ chk_pending_del(struct chk_instance *ins, uint64_t seq, struct chk_pending_rec * d_iov_set(&kiov, &seq, sizeof(seq)); ABT_rwlock_wrlock(ins->ci_abt_lock); - rc = dbtree_delete(ins->ci_pending_hdl, BTR_PROBE_EQ, &kiov, &riov); + rc = dbtree_delete(ins->ci_pending_hdl, BTR_PROBE_EQ, &kiov, cpr == NULL ? NULL : &riov); ABT_rwlock_unlock(ins->ci_abt_lock); - if (rc == 0) - *cpr = (struct chk_pending_rec *)riov.iov_buf; - else - *cpr = NULL; + if (cpr != NULL) { + if (rc == 0) + *cpr = (struct chk_pending_rec *)riov.iov_buf; + else + *cpr = NULL; + } D_CDEBUG(rc != 0, DLOG_ERR, DLOG_DBG, "Del pending record with gen "DF_X64", seq "DF_X64": "DF_RC"\n", diff --git a/src/chk/chk_engine.c b/src/chk/chk_engine.c index 3dfb7b9b705..9c8064ecb0a 100644 --- a/src/chk/chk_engine.c +++ b/src/chk/chk_engine.c @@ -262,28 +262,41 @@ chk_engine_post_repair(struct chk_pool_rec *cpr, int *result, bool update) return *result != 0 ? *result : rc; } -static int -chk_engine_pm_orphan(struct chk_pool_rec *cpr, d_rank_t rank, int index, uint32_t status) +struct chk_pm_orphan_args { + struct chk_pool_rec *cpoa_pool; + d_rank_t cpoa_rank; + int cpoa_index; + uint32_t cpoa_status; +}; + +static void +chk_engine_pm_orphan_ult(void *args) { - struct chk_instance *ins = cpr->cpr_ins; - struct chk_property *prop = &ins->ci_prop; - struct chk_bookmark *cbk = &cpr->cpr_bk; - d_rank_list_t ranks = { 0 }; - struct chk_report_unit cru = { 0 }; - char *strs[2]; - d_iov_t iovs[2]; - d_sg_list_t sgl; - d_sg_list_t *details = NULL; - Chk__CheckInconsistClass cla; - Chk__CheckInconsistAction act; - char msg[CHK_MSG_BUFLEN] = { 0 }; - uint64_t seq = 0; - uint32_t options[2]; - uint32_t option_nr = 0; - uint32_t detail_nr = 0; - int decision = -1; - int result = 0; - int rc = 0; + struct chk_engine_ult *ult; + struct chk_pm_orphan_args *cpoa = args; + struct chk_pool_rec *cpr = cpoa->cpoa_pool; + struct chk_instance *ins = cpr->cpr_ins; + struct chk_property *prop = &ins->ci_prop; + struct chk_bookmark *cbk = &cpr->cpr_bk; + d_rank_list_t ranks = {0}; + struct chk_report_unit cru = {0}; + char *strs[2]; + d_iov_t iovs[2]; + d_sg_list_t sgl; + d_sg_list_t *details = NULL; + Chk__CheckInconsistClass cla; + Chk__CheckInconsistAction act; + char msg[CHK_MSG_BUFLEN] = {0}; + uint64_t seq = 0; + uint32_t options[2]; + uint32_t option_nr = 0; + uint32_t detail_nr = 0; + uint32_t status = cpoa->cpoa_status; + d_rank_t rank = cpoa->cpoa_rank; + int index = cpoa->cpoa_index; + int decision = -1; + int result = 0; + int rc = 0; /* * NOTE: The subsequent check after handling orphan pm entry will not access the @@ -453,32 +466,78 @@ chk_engine_pm_orphan(struct chk_pool_rec *cpr, d_rank_t rank, int index, uint32_ goto report; out: - return chk_engine_post_repair(cpr, &result, rc <= 0); + ult = container_of(args, struct chk_engine_ult, ceu_data); + ult->ceu_result = chk_engine_post_repair(cpr, &result, rc <= 0); } static int -chk_engine_pm_dangling(struct chk_pool_rec *cpr, struct pool_map *map, struct pool_component *comp, - uint32_t status) +chk_engine_pm_orphan(struct chk_pool_rec *cpr, d_rank_t rank, int index, uint32_t status) { - struct chk_instance *ins = cpr->cpr_ins; - struct chk_property *prop = &ins->ci_prop; - struct chk_bookmark *cbk = &cpr->cpr_bk; - struct chk_report_unit cru = { 0 }; - char *strs[2]; - d_iov_t iovs[2]; - d_sg_list_t sgl; - d_sg_list_t *details = NULL; - Chk__CheckInconsistClass cla; - Chk__CheckInconsistAction act; - char suggested[CHK_MSG_BUFLEN] = { 0 }; - char msg[CHK_MSG_BUFLEN] = { 0 }; - uint64_t seq = 0; - uint32_t options[2]; - uint32_t option_nr = 0; - uint32_t detail_nr = 0; - int decision = -1; - int result = 0; - int rc = 0; + struct chk_engine_ult *ult; + struct chk_pm_orphan_args *args; + int rc; + + D_ALLOC(ult, sizeof(*ult) + sizeof(*args)); + if (ult == NULL) + return -DER_NOMEM; + + args = (struct chk_pm_orphan_args *)&ult->ceu_data[0]; + args->cpoa_pool = cpr; + args->cpoa_rank = rank; + args->cpoa_index = index; + args->cpoa_status = status; + + ult->ceu_ult = ABT_THREAD_NULL; + d_list_add_tail(&ult->ceu_link, &cpr->cpr_ult_list); + + rc = dss_ult_create(chk_engine_pm_orphan_ult, args, DSS_XS_SYS, 0, 0, &ult->ceu_ult); + if (rc != 0) { + d_list_del(&ult->ceu_link); + D_FREE(ult); + D_ERROR(DF_ENGINE " failed to create ULT to handle PM orphan for pool " DF_UUID + ", rank %u, index %d, status %s: " DF_RC "\n", + DP_ENGINE(cpr->cpr_ins), DP_UUID(cpr->cpr_uuid), rank, index, + pool_map_status2name(status), DP_RC(rc)); + } + + return rc; +} + +struct chk_pm_dangling_args { + struct chk_pool_rec *cpda_pool; + struct pool_map *cpda_map; + struct pool_component *cpda_comp; + uint32_t cpda_status; +}; + +static void +chk_engine_pm_dangling_ult(void *args) +{ + struct chk_engine_ult *ult; + struct chk_pm_dangling_args *cpda = args; + struct pool_map *map = cpda->cpda_map; + struct pool_component *comp = cpda->cpda_comp; + struct chk_pool_rec *cpr = cpda->cpda_pool; + struct chk_instance *ins = cpr->cpr_ins; + struct chk_property *prop = &ins->ci_prop; + struct chk_bookmark *cbk = &cpr->cpr_bk; + struct chk_report_unit cru = {0}; + char *strs[2]; + d_iov_t iovs[2]; + d_sg_list_t sgl; + d_sg_list_t *details = NULL; + Chk__CheckInconsistClass cla; + Chk__CheckInconsistAction act; + char suggested[CHK_MSG_BUFLEN] = {0}; + char msg[CHK_MSG_BUFLEN] = {0}; + uint64_t seq = 0; + uint32_t options[2]; + uint32_t option_nr = 0; + uint32_t detail_nr = 0; + uint32_t status = cpda->cpda_status; + int decision = -1; + int result = 0; + int rc = 0; D_ASSERTF(status == PO_COMP_ST_DOWNOUT || status == PO_COMP_ST_DOWN, "Unexpected pool map status %u\n", status); @@ -635,20 +694,67 @@ chk_engine_pm_dangling(struct chk_pool_rec *cpr, struct pool_map *map, struct po goto report; out: - return chk_engine_post_repair(cpr, &result, rc <= 0); + ult = container_of(args, struct chk_engine_ult, ceu_data); + ult->ceu_result = chk_engine_post_repair(cpr, &result, rc <= 0); } static int -chk_engine_pm_unknown_target(struct chk_pool_rec *cpr, struct pool_component *comp) +chk_engine_pm_dangling(struct chk_pool_rec *cpr, struct pool_map *map, struct pool_component *comp, + uint32_t status) { - struct chk_instance *ins = cpr->cpr_ins; - struct chk_bookmark *cbk = &cpr->cpr_bk; - struct chk_report_unit cru = { 0 }; - Chk__CheckInconsistClass cla; - Chk__CheckInconsistAction act; - char msg[CHK_MSG_BUFLEN] = { 0 }; - uint64_t seq = 0; - int rc; + struct chk_engine_ult *ult; + struct chk_pm_dangling_args *args; + int rc; + + D_ALLOC(ult, sizeof(*ult) + sizeof(*args)); + if (ult == NULL) + return -DER_NOMEM; + + args = (struct chk_pm_dangling_args *)&ult->ceu_data[0]; + args->cpda_pool = cpr; + args->cpda_map = map; + args->cpda_comp = comp; + args->cpda_status = status; + + ult->ceu_ult = ABT_THREAD_NULL; + d_list_add_tail(&ult->ceu_link, &cpr->cpr_ult_list); + + rc = dss_ult_create(chk_engine_pm_dangling_ult, args, DSS_XS_SYS, 0, 0, &ult->ceu_ult); + if (rc != 0) { + d_list_del(&ult->ceu_link); + D_FREE(ult); + D_ERROR(DF_ENGINE " failed to create ULT to handle PM dangling for pool " DF_UUID + ", type %x, index %d, id %u, status %s: " DF_RC "\n", + DP_ENGINE(cpr->cpr_ins), DP_UUID(cpr->cpr_uuid), comp->co_type, + comp->co_index, comp->co_id, pool_map_status2name(status), DP_RC(rc)); + } + + return rc; +} + +struct chk_engine_unknown_args { + struct chk_pool_rec *ceua_pool; + struct chk_cont_rec *ceua_cont; + struct pool_component *ceua_comp; + uint32_t ceua_exp_tgt_nr; +}; + +static void +chk_engine_handle_unknown_ult(void *args) +{ + struct chk_engine_ult *ult; + struct chk_engine_unknown_args *ceua = args; + struct chk_pool_rec *cpr = ceua->ceua_pool; + struct chk_cont_rec *ccr = ceua->ceua_cont; + struct pool_component *comp = ceua->ceua_comp; + struct chk_instance *ins = cpr->cpr_ins; + struct chk_bookmark *cbk = &cpr->cpr_bk; + struct chk_report_unit cru = {0}; + Chk__CheckInconsistClass cla; + Chk__CheckInconsistAction act; + char msg[CHK_MSG_BUFLEN] = {0}; + uint64_t seq = 0; + int rc; cla = CHK__CHECK_INCONSIST_CLASS__CIC_UNKNOWN; act = CHK__CHECK_INCONSIST_ACTION__CIA_IGNORE; @@ -663,22 +769,89 @@ chk_engine_pm_unknown_target(struct chk_pool_rec *cpr, struct pool_component *co cru.cru_rank = dss_self_rank(); cru.cru_pool = (uuid_t *)&cpr->cpr_uuid; cru.cru_pool_label = cpr->cpr_label; - snprintf(msg, CHK_MSG_BUFLEN - 1, - "Check engine detects unknown target entry in pool map for pool " - DF_UUIDF", rank %u, index %u, status %u, skip it. You can change " - "its status via DAOS debug tool if it is not for downgraded case.\n", - DP_UUID(cpr->cpr_uuid), comp->co_rank, comp->co_index, comp->co_status); + + if (ccr != NULL) { + cru.cru_cont = (uuid_t *)&ccr->ccr_uuid; + if (ccr->ccr_label_prop != NULL && ccr->ccr_label_prop->dpp_entries != NULL) + cru.cru_cont_label = ccr->ccr_label_prop->dpp_entries[0].dpe_str; + snprintf(msg, CHK_MSG_BUFLEN - 1, + "The container " DF_UUID " in the pool " DF_UUID + " lost some shards: %u vs %u\n", + DP_UUID(ccr->ccr_uuid), DP_UUID(cpr->cpr_uuid), ccr->ccr_tgt_nr, + ceua->ceua_exp_tgt_nr); + } else { + D_ASSERT(comp != NULL); + + snprintf( + msg, CHK_MSG_BUFLEN - 1, + "Check engine detects unknown target entry in pool map for pool " DF_UUIDF + ", rank %u, index %u, status %u, skip it. You can change its status via DAOS " + "debug tool if it is not for downgraded case.\n", + DP_UUID(cpr->cpr_uuid), comp->co_rank, comp->co_index, comp->co_status); + } + cru.cru_msg = msg; cru.cru_result = 0; rc = chk_engine_report(&cru, &seq, NULL); - D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, - DF_ENGINE" detects unknown target entry in pool map for pool "DF_UUIDF", rank %u, " - "target %u, action %u (no interact), handle_rc 0, report_rc %d, decision 0\n", - DP_ENGINE(ins), DP_UUID(cpr->cpr_uuid), comp->co_rank, comp->co_index, act, rc); + if (ccr != NULL) + D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, + DF_ENGINE " detects incomplete container " DF_UUIDF "/" DF_UUID "shards " + "%u vs %u, ignore it.\n", + DP_ENGINE(ins), DP_UUID(cpr->cpr_uuid), DP_UUID(ccr->ccr_uuid), + ccr->ccr_tgt_nr, ceua->ceua_exp_tgt_nr); + else + D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, + DF_ENGINE " detects unknown target entry in pool map for pool " DF_UUIDF + ", rank %u, target %u, ignore it, report_rc %d\n", + DP_ENGINE(ins), DP_UUID(cpr->cpr_uuid), comp->co_rank, comp->co_index, rc); - return chk_engine_post_repair(cpr, &rc, rc <= 0); + ult = container_of(args, struct chk_engine_ult, ceu_data); + ult->ceu_result = chk_engine_post_repair(cpr, &rc, rc <= 0); +} + +static int +chk_engine_handle_unknown(struct chk_pool_rec *cpr, struct chk_cont_rec *ccr, + struct pool_component *comp, uint32_t exp_tgt_nr) +{ + struct chk_engine_ult *ult; + struct chk_engine_unknown_args *args; + int rc; + + D_ALLOC(ult, sizeof(*ult) + sizeof(*args)); + if (ult == NULL) + return -DER_NOMEM; + + args = (struct chk_engine_unknown_args *)&ult->ceu_data[0]; + args->ceua_pool = cpr; + args->ceua_cont = ccr; + args->ceua_comp = comp; + args->ceua_exp_tgt_nr = exp_tgt_nr; + + ult->ceu_ult = ABT_THREAD_NULL; + d_list_add_tail(&ult->ceu_link, &cpr->cpr_ult_list); + + rc = dss_ult_create(chk_engine_handle_unknown_ult, args, DSS_XS_SYS, 0, 0, &ult->ceu_ult); + if (rc != 0) { + d_list_del(&ult->ceu_link); + D_FREE(ult); + if (ccr != NULL) + D_ERROR(DF_ENGINE + " failed to create ULT to handle incomplete container " DF_UUID + "/" DF_UUID ", shards %d/%d: " DF_RC "\n", + DP_ENGINE(cpr->cpr_ins), DP_UUID(cpr->cpr_uuid), + DP_UUID(ccr->ccr_uuid), ccr->ccr_tgt_nr, exp_tgt_nr, DP_RC(rc)); + else + D_ERROR(DF_ENGINE + " failed to create ULT to handle PM unknown for pool " DF_UUID + ", type %x, index %d, id %u, status %s: " DF_RC "\n", + DP_ENGINE(cpr->cpr_ins), DP_UUID(cpr->cpr_uuid), comp->co_type, + comp->co_index, comp->co_id, pool_map_status2name(comp->co_status), + DP_RC(rc)); + } + + return rc; } static int @@ -750,7 +923,7 @@ chk_engine_pool_mbs_one(struct chk_pool_rec *cpr, struct pool_map *map, struct c * layout? It is better to keep it there with reporting it * to admin who can adjust the status via DAOS debug tool. */ - rc = chk_engine_pm_unknown_target(cpr, comp); + rc = chk_engine_handle_unknown(cpr, NULL, comp, 0); break; } @@ -947,6 +1120,45 @@ chk_engine_bad_pool_label(struct chk_pool_rec *cpr, struct ds_pool_svc *svc) return chk_engine_post_repair(cpr, &result, rc <= 0); } +struct chk_cont_cleanup_args { + struct chk_pool_rec *ccca_pool; + struct chk_cont_rec *ccca_cont; + struct cont_svc *ccca_svc; +}; + +static int +chk_engine_cont_cleanup_one(struct chk_pool_rec *cpr, struct chk_cont_rec *ccr, + struct cont_svc *svc, void (*func)(void *)) +{ + struct chk_engine_ult *ult; + struct chk_cont_cleanup_args *args; + int rc; + + D_ALLOC(ult, sizeof(*ult) + sizeof(*args)); + if (ult == NULL) + return -DER_NOMEM; + + args = (struct chk_cont_cleanup_args *)&ult->ceu_data[0]; + args->ccca_pool = cpr; + args->ccca_cont = ccr; + args->ccca_svc = svc; + + ult->ceu_ult = ABT_THREAD_NULL; + d_list_add_tail(&ult->ceu_link, &cpr->cpr_ult_list); + + rc = dss_ult_create(func, args, DSS_XS_SYS, 0, 0, &ult->ceu_ult); + if (rc != 0) { + d_list_del(&ult->ceu_link); + D_FREE(ult); + D_ERROR(DF_ENGINE " failed to create ULT to cleanup container " DF_UUID "/" DF_UUID + ": " DF_RC "\n", + DP_ENGINE(cpr->cpr_ins), DP_UUID(cpr->cpr_uuid), DP_UUID(ccr->ccr_uuid), + DP_RC(rc)); + } + + return rc; +} + static int chk_engine_cont_list_init(uuid_t pool, struct chk_cont_list_aggregator *aggregator) { @@ -1004,27 +1216,32 @@ chk_engine_cont_list_remote_cb(struct chk_co_rpc_cb_args *cb_args) cb_args->cb_nr); } -static int -chk_engine_cont_orphan(struct chk_pool_rec *cpr, struct chk_cont_rec *ccr, struct cont_svc *svc) +static void +chk_engine_cont_orphan_ult(void *args) { - struct chk_instance *ins = cpr->cpr_ins; - struct chk_property *prop = &ins->ci_prop; - struct chk_bookmark *cbk = &cpr->cpr_bk; - struct chk_report_unit cru = { 0 }; - char *strs[2]; - d_iov_t iovs[2]; - d_sg_list_t sgl; - d_sg_list_t *details = NULL; - Chk__CheckInconsistClass cla; - Chk__CheckInconsistAction act; - char msg[CHK_MSG_BUFLEN] = { 0 }; - uint64_t seq = 0; - uint32_t options[2]; - uint32_t option_nr = 0; - uint32_t detail_nr = 0; - int decision = -1; - int result = 0; - int rc = 0; + struct chk_engine_ult *ult; + struct chk_cont_cleanup_args *ccca = args; + struct chk_pool_rec *cpr = ccca->ccca_pool; + struct chk_cont_rec *ccr = ccca->ccca_cont; + struct cont_svc *svc = ccca->ccca_svc; + struct chk_instance *ins = cpr->cpr_ins; + struct chk_property *prop = &ins->ci_prop; + struct chk_bookmark *cbk = &cpr->cpr_bk; + struct chk_report_unit cru = {0}; + char *strs[2]; + d_iov_t iovs[2]; + d_sg_list_t sgl; + d_sg_list_t *details = NULL; + Chk__CheckInconsistClass cla; + Chk__CheckInconsistAction act; + char msg[CHK_MSG_BUFLEN] = {0}; + uint64_t seq = 0; + uint32_t options[2]; + uint32_t option_nr = 0; + uint32_t detail_nr = 0; + int decision = -1; + int result = 0; + int rc = 0; cla = CHK__CHECK_INCONSIST_CLASS__CIC_CONT_NONEXIST_ON_PS; act = prop->cp_policies[cla]; @@ -1167,7 +1384,8 @@ chk_engine_cont_orphan(struct chk_pool_rec *cpr, struct chk_cont_rec *ccr, struc /* NOTE: For orphan container, mark it as 'skip' since we do not support to add it back. */ ccr->ccr_skip = 1; - return chk_engine_post_repair(cpr, &result, rc <= 0); + ult = container_of(args, struct chk_engine_ult, ceu_data); + ult->ceu_result = chk_engine_post_repair(cpr, &result, rc <= 0); } static daos_prop_t * @@ -1267,29 +1485,34 @@ chk_engine_ccr2label(struct chk_cont_rec *ccr, bool prefer_target) return NULL; } -static int -chk_engine_cont_set_label(struct chk_pool_rec *cpr, struct chk_cont_rec *ccr, struct cont_svc *svc) +static void +chk_engine_cont_set_label_ult(void *args) { - struct chk_instance *ins = cpr->cpr_ins; - struct chk_property *prop = &ins->ci_prop; - struct chk_bookmark *cbk = &cpr->cpr_bk; - daos_prop_t *prop_tmp = NULL; - struct chk_report_unit cru = { 0 }; - char strs[3][CHK_MSG_BUFLEN]; - d_iov_t iovs[3]; - d_sg_list_t sgl; - d_sg_list_t *details = NULL; - char *label = NULL; - Chk__CheckInconsistClass cla; - Chk__CheckInconsistAction act; - char msg[CHK_MSG_BUFLEN] = { 0 }; - uint64_t seq = 0; - uint32_t options[3]; - uint32_t option_nr = 0; - uint32_t detail_nr = 0; - int decision = -1; - int result = 0; - int rc = 0; + struct chk_engine_ult *ult; + struct chk_cont_cleanup_args *ccca = args; + struct chk_pool_rec *cpr = ccca->ccca_pool; + struct chk_cont_rec *ccr = ccca->ccca_cont; + struct cont_svc *svc = ccca->ccca_svc; + struct chk_instance *ins = cpr->cpr_ins; + struct chk_property *prop = &ins->ci_prop; + struct chk_bookmark *cbk = &cpr->cpr_bk; + daos_prop_t *prop_tmp = NULL; + struct chk_report_unit cru = {0}; + char strs[3][CHK_MSG_BUFLEN]; + d_iov_t iovs[3]; + d_sg_list_t sgl; + d_sg_list_t *details = NULL; + char *label = NULL; + Chk__CheckInconsistClass cla; + Chk__CheckInconsistAction act; + char msg[CHK_MSG_BUFLEN] = {0}; + uint64_t seq = 0; + uint32_t options[3]; + uint32_t option_nr = 0; + uint32_t detail_nr = 0; + int decision = -1; + int result = 0; + int rc = 0; cla = CHK__CHECK_INCONSIST_CLASS__CIC_CONT_BAD_LABEL; act = prop->cp_policies[cla]; @@ -1550,7 +1773,8 @@ chk_engine_cont_set_label(struct chk_pool_rec *cpr, struct chk_cont_rec *ccr, st daos_prop_free(prop_tmp); - return chk_engine_post_repair(cpr, &result, rc <= 0); + ult = container_of(args, struct chk_engine_ult, ceu_data); + ult->ceu_result = chk_engine_post_repair(cpr, &result, rc <= 0); } static int @@ -1599,10 +1823,7 @@ chk_engine_cont_cleanup(struct chk_pool_rec *cpr, struct ds_pool_svc *ds_svc, struct chk_instance *ins = cpr->cpr_ins; struct cont_svc *svc; struct chk_cont_rec *ccr; - char msg[CHK_MSG_BUFLEN]; - struct chk_cont_label_cb_args cclca = {0}; - struct chk_report_unit cru = {0}; - uint64_t seq; + struct chk_cont_label_cb_args cclca = {0}; uint32_t exp_tgt_nr = 0; int rc = 0; bool failout; @@ -1635,7 +1856,7 @@ chk_engine_cont_cleanup(struct chk_pool_rec *cpr, struct ds_pool_svc *ds_svc, continue; } - rc = chk_engine_cont_orphan(cpr, ccr, svc); + rc = chk_engine_cont_cleanup_one(cpr, ccr, svc, chk_engine_cont_orphan_ult); if (rc != 0) goto out; } @@ -1652,7 +1873,8 @@ chk_engine_cont_cleanup(struct chk_pool_rec *cpr, struct ds_pool_svc *ds_svc, continue; if (!ccr->ccr_label_checked) { - rc = chk_engine_cont_set_label(cpr, ccr, svc); + rc = chk_engine_cont_cleanup_one(cpr, ccr, svc, + chk_engine_cont_set_label_ult); if (rc != 0) goto out; } @@ -1660,24 +1882,7 @@ chk_engine_cont_cleanup(struct chk_pool_rec *cpr, struct ds_pool_svc *ds_svc, if (likely(ccr->ccr_tgt_nr >= exp_tgt_nr)) continue; - snprintf( - msg, CHK_MSG_BUFLEN - 1, - "The container " DF_UUID " in the pool " DF_UUID " lost some shards: %u/%u\n", - DP_UUID(ccr->ccr_uuid), DP_UUID(cpr->cpr_uuid), ccr->ccr_tgt_nr, exp_tgt_nr); - - cru.cru_msg = msg; - cru.cru_gen = cpr->cpr_bk.cb_gen; - cru.cru_cla = CHK__CHECK_INCONSIST_CLASS__CIC_UNKNOWN; - cru.cru_act = CHK__CHECK_INCONSIST_ACTION__CIA_IGNORE; - cru.cru_rank = dss_self_rank(); - cru.cru_pool = (uuid_t *)&cpr->cpr_uuid; - cru.cru_pool_label = cpr->cpr_label; - cru.cru_cont = (uuid_t *)&ccr->ccr_uuid; - if (ccr->ccr_label_prop != NULL && ccr->ccr_label_prop->dpp_entries != NULL) - cru.cru_cont_label = ccr->ccr_label_prop->dpp_entries[0].dpe_str; - - seq = 0; - chk_engine_report(&cru, &seq, NULL); + chk_engine_handle_unknown(cpr, ccr, NULL, exp_tgt_nr); } out: @@ -1728,6 +1933,26 @@ chk_engine_pool_notify(struct chk_pool_rec *cpr) } } +static int +chk_engine_wait_ults(struct chk_pool_rec *cpr) +{ + struct chk_engine_ult *ult; + int rc = 0; + + while ((ult = d_list_pop_entry(&cpr->cpr_ult_list, struct chk_engine_ult, ceu_link)) != + NULL) { + if (ult->ceu_ult != ABT_THREAD_NULL) + ABT_thread_free(&ult->ceu_ult); + + if (rc == 0) + rc = ult->ceu_result; + + D_FREE(ult); + } + + return rc; +} + static void chk_engine_pool_ult(void *args) { @@ -1788,6 +2013,11 @@ chk_engine_pool_ult(void *args) if (rc != 0 || cpr->cpr_skip || cpr->cpr_stop) goto out; + /* Wait for the ULTs that handle orphan, dangling and unknown pool map components. */ + rc = chk_engine_wait_ults(cpr); + if (rc != 0) + goto out; + if (cpr->cpr_map_refreshed) { /* * Under dryrun mode, we cannot make the changed pool map to be used by @@ -1867,6 +2097,10 @@ chk_engine_pool_ult(void *args) rc = ds_pool_svc_schedule_reconf(svc); out: + rc1 = chk_engine_wait_ults(cpr); + if (rc == 0) + rc = rc1; + chk_engine_cont_list_fini(&aggregator); if (map != NULL) pool_map_decref(map); @@ -3177,13 +3411,13 @@ chk_engine_set_policy(uint64_t gen, uint32_t policy_nr, struct chk_policy *polic static int chk_engine_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) { - struct chk_instance *ins = chk_engine; - struct chk_pending_rec *cpr = NULL; - struct chk_pending_rec *tmp = NULL; - struct chk_pool_rec *pool = NULL; - d_iov_t kiov; - d_iov_t riov; - int rc; + struct chk_instance *ins = chk_engine; + struct chk_pending_rec *cpr = NULL; + struct chk_pool_rec *pool = NULL; + d_iov_t kiov; + d_iov_t riov; + int rc; + bool report_fail = false; D_ASSERT(cru->cru_pool != NULL); @@ -3220,22 +3454,19 @@ chk_engine_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) cru->cru_detail_nr, cru->cru_details, *seq); if (unlikely(rc == -DER_AGAIN)) { D_ASSERT(cru->cru_act == CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT); + D_ASSERT(cpr != NULL); - rc = chk_pending_del(ins, *seq, &tmp); - if (rc == 0) - D_ASSERT(tmp == NULL); - else if (rc != -DER_NONEXIST) - goto log; - - chk_pending_destroy(cpr); + cpr->cpr_busy = 0; + chk_pending_del(ins, *seq, NULL); cpr = NULL; goto new_seq; } - /* Check cpr->cpr_action for the case of "dmg check repair" by race. */ - if (rc == 0 && pool != NULL && - likely(cpr->cpr_action == CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT)) + if (rc != 0) + report_fail = true; + else if (pool != NULL && cpr->cpr_action == CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT) + /* Check cpr->cpr_action for the case of "dmg check repair" by race. */ pool->cpr_bk.cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_PENDING; log: @@ -3273,11 +3504,18 @@ chk_engine_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) goto again; out: - if (pool != NULL && pool->cpr_bk.cb_pool_status == CHK__CHECK_POOL_STATUS__CPS_PENDING) - pool->cpr_bk.cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_CHECKING; + if (cpr != NULL) { + if (report_fail) { + cpr->cpr_busy = 0; + chk_pending_del(ins, *seq, NULL); + } else { + chk_pending_destroy(cpr); + } + } - if (cpr != NULL) - chk_pending_destroy(cpr); + if (pool != NULL && pool->cpr_bk.cb_pool_status == CHK__CHECK_POOL_STATUS__CPS_PENDING && + d_list_empty(&pool->cpr_pending_list)) + pool->cpr_bk.cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_CHECKING; return rc; } diff --git a/src/chk/chk_internal.h b/src/chk/chk_internal.h index e4d6d52f3fd..34de268d959 100644 --- a/src/chk/chk_internal.h +++ b/src/chk/chk_internal.h @@ -591,6 +591,14 @@ struct chk_pool_shard { chk_pool_free_data_t cps_free_cb; }; +struct chk_engine_ult { + /* Link into chk_pool_rec::cpr_ult_list. */ + d_list_t ceu_link; + ABT_thread ceu_ult; + int ceu_result; + char ceu_data[0]; +}; + /* Check engine uses it to trace pools. Query logic uses it to organize the result. */ struct chk_pool_rec { /* Link into chk_instance::ci_pool_list. */ @@ -601,6 +609,8 @@ struct chk_pool_rec { d_list_t cpr_shard_list; /* The list of chk_pending_rec. */ d_list_t cpr_pending_list; + /* The list of active ULTs that are handling some inconsistency and maybe blocked. */ + d_list_t cpr_ult_list; uint32_t cpr_shard_nr; uint32_t cpr_started:1, cpr_start_post:1, @@ -742,6 +752,8 @@ int chk_pool_add_shard(daos_handle_t hdl, d_list_t *head, uuid_t uuid, d_rank_t void chk_pool_shard_cleanup(struct chk_instance *ins); +int chk_pending_lookup(struct chk_instance *ins, uint64_t seq, struct chk_pending_rec **cpr); + int chk_pending_add(struct chk_instance *ins, d_list_t *pool_head, d_list_t *rank_head, uuid_t uuid, uint64_t seq, uint32_t rank, uint32_t cla, uint32_t option_nr, uint32_t *options, struct chk_pending_rec **cpr); @@ -1039,6 +1051,7 @@ chk_pool_put(struct chk_pool_rec *cpr) D_ASSERT(cpr->cpr_thread == ABT_THREAD_NULL); D_ASSERT(d_list_empty(&cpr->cpr_pending_list)); D_ASSERT(d_list_empty(&cpr->cpr_shutdown_link)); + D_ASSERT(d_list_empty(&cpr->cpr_ult_list)); while ((cps = d_list_pop_entry(&cpr->cpr_shard_list, struct chk_pool_shard, cps_link)) != NULL) { diff --git a/src/chk/chk_leader.c b/src/chk/chk_leader.c index 3c381346663..f6bece1ce03 100644 --- a/src/chk/chk_leader.c +++ b/src/chk/chk_leader.c @@ -3462,12 +3462,10 @@ chk_leader_act_internal(struct chk_instance *ins, uint64_t seq, uint32_t act) d_iov_t riov; int rc; - rc = chk_pending_del(ins, seq, &pending); + rc = chk_pending_lookup(ins, seq, &pending); if (rc != 0) goto out; - D_ASSERT(pending->cpr_busy); - if (pending->cpr_on_leader) { ABT_mutex_lock(pending->cpr_mutex); /* @@ -3477,20 +3475,25 @@ chk_leader_act_internal(struct chk_instance *ins, uint64_t seq, uint32_t act) pending->cpr_action = act; ABT_cond_broadcast(pending->cpr_cond); ABT_mutex_unlock(pending->cpr_mutex); + chk_pending_del(ins, seq, &pending); } else { d_iov_set(&riov, NULL, 0); d_iov_set(&kiov, pending->cpr_uuid, sizeof(uuid_t)); rc = dbtree_lookup(ins->ci_pool_hdl, &kiov, &riov); - if (rc == 0) { + if (rc == 0) pool = (struct chk_pool_rec *)riov.iov_buf; - if (pool->cpr_bk.cb_pool_status == CHK__CHECK_POOL_STATUS__CPS_PENDING) - pool->cpr_bk.cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_CHECKING; - } rc = chk_act_remote(ins->ci_ranks, ins->ci_bk.cb_gen, seq, pending->cpr_class, act, pending->cpr_rank); + if (rc == 0) { + pending->cpr_busy = 0; + chk_pending_del(ins, seq, NULL); - chk_pending_destroy(pending); + if (pool != NULL && + pool->cpr_bk.cb_pool_status == CHK__CHECK_POOL_STATUS__CPS_PENDING && + d_list_empty(&pool->cpr_pending_list)) + pool->cpr_bk.cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_CHECKING; + } } out: @@ -3584,14 +3587,15 @@ chk_leader_set_policy(uint32_t policy_nr, struct chk_policy *policies) int chk_leader_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) { - struct chk_instance *ins = chk_leader; - struct chk_bookmark *cbk = &ins->ci_bk; - struct chk_pending_rec *cpr = NULL; - struct chk_pool_rec *pool = NULL; - struct chk_rank_rec *crr = NULL; - d_iov_t kiov; - d_iov_t riov; - int rc; + struct chk_instance *ins = chk_leader; + struct chk_bookmark *cbk = &ins->ci_bk; + struct chk_pending_rec *cpr = NULL; + struct chk_pool_rec *pool = NULL; + struct chk_rank_rec *crr = NULL; + d_iov_t kiov; + d_iov_t riov; + int rc; + bool upcall_fail = false; CHK_IS_READY(ins); @@ -3658,9 +3662,10 @@ chk_leader_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) cru->cru_cont, cru->cru_cont_label, cru->cru_obj, cru->cru_dkey, cru->cru_akey, cru->cru_msg, cru->cru_option_nr, cru->cru_options, cru->cru_detail_nr, cru->cru_details); - /* Check cpr->cpr_action for the case of "dmg check repair" by race. */ - if (rc == 0 && pool != NULL && - likely(cpr->cpr_action == CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT)) + if (rc != 0) + upcall_fail = true; + else if (pool != NULL && cpr->cpr_action == CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT) + /* Check cpr->cpr_action for the case of "dmg check repair" by race. */ pool->cpr_bk.cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_PENDING; log: @@ -3700,14 +3705,19 @@ chk_leader_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) goto again; out: + if (cpr != NULL) { + if (upcall_fail) { + cpr->cpr_busy = 0; + chk_pending_del(ins, *seq, NULL); + } else if (rc != 0 || decision != NULL) { + chk_pending_destroy(cpr); + } + } + if (pool != NULL && pool->cpr_bk.cb_pool_status == CHK__CHECK_POOL_STATUS__CPS_PENDING && - (rc != 0 || (cpr != NULL && - cpr->cpr_action != CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT))) + d_list_empty(&pool->cpr_pending_list)) pool->cpr_bk.cb_pool_status = CHK__CHECK_POOL_STATUS__CPS_CHECKING; - if ((rc != 0 || decision != NULL) && cpr != NULL) - chk_pending_destroy(cpr); - return rc; } diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c index f2be8fbc056..e791ec7372e 100644 --- a/src/tests/suite/daos_cr.c +++ b/src/tests/suite/daos_cr.c @@ -3903,6 +3903,119 @@ cr_lost_rank0(void **state) cr_cleanup(arg, &pool, 1); } +/* + * 1. Create two pools and three containers for each. + * 2. Fault injection to make all containers' label inconsistent. + * 3. Start checker with CONT_BAD_LABEL:CIA_INTERACT + * 4. Query checker, should show six interactions. + * 5. Check repair the container label with trust PS (pool/container service). + * 6. Query checker, container label should have been repaired. + * 7. Switch to normal mode and verify the container label. + * 8. Cleanup. + */ +static void +cr_scan_cont_parallel(void **state) +{ + test_arg_t *arg = *state; + struct test_pool pools[2] = {0}; + struct test_cont conts[6] = {0}; + struct daos_check_info dcis[2] = {0}; + char *label = NULL; + uint32_t classes[3]; + uint32_t actions[3]; + int rc; + int i; + int j; + bool once; + + FAULT_INJECTION_REQUIRED(); + + print_message("CR30: scan multiple containers in parallel\n"); + + for (i = 0; i < 2; i++) { + rc = cr_pool_create(state, &pools[i], true, TCC_NONE); + assert_rc_equal(rc, 0); + + for (j = 0; j < 3; j++) { + rc = cr_cont_create(state, &pools[i], &conts[i * 3 + j], 1); + assert_rc_equal(rc, 0); + } + } + + rc = cr_system_stop(false); + assert_rc_equal(rc, 0); + + rc = cr_mode_switch(true); + assert_rc_equal(rc, 0); + + rc = cr_check_start(TCSF_RESET, 0, NULL, "CONT_BAD_LABEL:CIA_INTERACT"); + assert_rc_equal(rc, 0); + + for (i = 0; i < 3; i++) { + classes[i] = TCC_CONT_BAD_LABEL; + actions[i] = TCA_INTERACT; + } + + for (i = 0; i < 2; i++, once = false) { +again: + cr_pool_wait(1, &pools[i].pool_uuid, &dcis[i]); + + rc = cr_ins_verify(&dcis[i], TCIS_RUNNING); + assert_rc_equal(rc, 0); + + rc = cr_pool_verify(&dcis[i], pools[i].pool_uuid, TCPS_PENDING, 3, classes, actions, + NULL); + /* Report for different containers maybe asynchronously, wait 3 seconds and retry */ + if (rc == -DER_INVAL && !once) { + sleep(3); + once = true; + goto again; + } + + assert_rc_equal(rc, 0); + } + + rc = cr_check_set_policy(TCPF_NONE, "CONT_BAD_LABEL:CIA_TRUST_PS"); + assert_rc_equal(rc, 0); + + for (i = 0; i < 3; i++) + actions[i] = TCA_TRUST_PS; + + for (i = 0; i < 2; i++) { + cr_ins_wait(1, &pools[i].pool_uuid, &dcis[i]); + + rc = cr_ins_verify(&dcis[i], TCIS_COMPLETED); + assert_rc_equal(rc, 0); + + rc = cr_pool_verify(&dcis[i], pools[i].pool_uuid, TCPS_CHECKED, 3, classes, actions, + NULL); + assert_rc_equal(rc, 0); + } + + rc = cr_mode_switch(false); + assert_rc_equal(rc, 0); + + rc = cr_system_start(); + assert_rc_equal(rc, 0); + + for (i = 0; i < 2; i++) { + for (j = 0; j < 3; j++) { + rc = cr_cont_get_label(state, &pools[i], &conts[i * 3 + j], j == 0, &label); + assert_rc_equal(rc, 0); + + D_ASSERTF(strcmp(label, conts[i * 3 + j].label) == 0, + "Cont (" DF_UUID ") label is not repaired: %s vs %s\n", + DP_UUID(conts[i * 3 + j].uuid), label, conts[i * 3 + j].label); + + D_FREE(label); + } + + cr_dci_fini(&dcis[i]); + } + + cr_cleanup(arg, pools, 2); +} + /* clang-format off */ static const struct CMUnitTest cr_tests[] = { { "CR1: start checker for specified pools", @@ -3963,6 +4076,8 @@ static const struct CMUnitTest cr_tests[] = { cr_maintenance_mode, async_disable, test_case_teardown}, { "CR29: CR with rank 0 excluded at the beginning", cr_lost_rank0, async_disable, test_case_teardown}, + { "CR30: scan multiple containers in parallel", + cr_scan_cont_parallel, async_disable, test_case_teardown}, }; /* clang-format on */