From a8854ece2f6805aebf139334e7e40bcf95bdaeac Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Fri, 23 Jan 2026 16:39:51 +0800 Subject: [PATCH] DAOS-17535 chk: misc improvements for CR logic - b26 Include the following: 1. When creating the CHK IV namespace, make the secondary group the same as the primary group. Otherwise, the CHK logic may hit DER_NONEXIST when communicating via IV. 2. Integrate the CHK IV namespace create and destroy APIs, clean up related logic, and redefine the version. 3. Get the rank list and IV namespace version from the CHK leader when rejoining. Adjust the CHK_REJOIN RPC for related changes. 4. Remove unsupported functionality for checking the specified 'phase'. 5. Add a new test for the case of losing some engine(s) before starting the checker. 6. Use a dedicated ULT to handle dead rank events, so that the handling is not affected by checker start or stop. Then even if the check scheduler has exited, a subsequent check query can still work against the latest rank list. Test-tag: recovery Signed-off-by: Fan Yong --- src/chk/chk_common.c | 22 +- src/chk/chk_engine.c | 213 ++++++------- src/chk/chk_internal.h | 125 +++----- src/chk/chk_iv.c | 102 +++++- src/chk/chk_leader.c | 295 +++++++++--------- src/chk/chk_rpc.c | 41 ++- src/chk/chk_srv.c | 19 +- src/engine/server_iv.c | 4 +- src/gurt/misc.c | 23 +- src/include/daos_srv/daos_chk.h | 7 +- src/include/gurt/common.h | 12 +- src/mgmt/srv_chk.c | 5 +- src/mgmt/srv_drpc.c | 4 +- src/mgmt/srv_internal.h | 7 +- src/mgmt/tests/mocks.c | 4 +- src/object/cli_coll.c | 5 +- src/tests/ftest/recovery/cat_recov_core.yaml | 6 +- .../ftest/recovery/container_cleanup.yaml | 2 +- .../container_list_consolidation.yaml | 2 +- src/tests/ftest/recovery/ms_membership.yaml | 5 + src/tests/ftest/recovery/pool_cleanup.yaml | 2 +- .../recovery/pool_list_consolidation.yaml | 2 +- src/tests/suite/daos_cr.c | 64 +++- 23 files changed, 559 insertions(+), 412 deletions(-) diff --git a/src/chk/chk_common.c b/src/chk/chk_common.c index a39cb94632a..2a7164ba757 100644 ---
a/src/chk/chk_common.c +++ b/src/chk/chk_common.c @@ -1019,8 +1019,7 @@ chk_pending_destroy(struct chk_pending_rec *cpr) } int -chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase, - uint32_t policy_nr, struct chk_policy *policies, +chk_prop_prepare(d_rank_t leader, uint32_t flags, uint32_t policy_nr, struct chk_policy *policies, d_rank_list_t *ranks, struct chk_property *prop) { int rc = 0; @@ -1033,11 +1032,8 @@ chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase, prop->cp_flags &= ~CHK__CHECK_FLAG__CF_FAILOUT; if (flags & CHK__CHECK_FLAG__CF_NO_AUTO) prop->cp_flags &= ~CHK__CHECK_FLAG__CF_AUTO; - prop->cp_flags |= flags & ~(CHK__CHECK_FLAG__CF_RESET | - CHK__CHECK_FLAG__CF_ORPHAN_POOL | - CHK__CHECK_FLAG__CF_NO_FAILOUT | - CHK__CHECK_FLAG__CF_NO_AUTO); - prop->cp_phase = phase; + prop->cp_flags |= flags & ~(CHK__CHECK_FLAG__CF_RESET | CHK__CHECK_FLAG__CF_ORPHAN_POOL | + CHK__CHECK_FLAG__CF_NO_FAILOUT | CHK__CHECK_FLAG__CF_NO_AUTO); if (ranks != NULL) prop->cp_rank_nr = ranks->rl_nr; @@ -1195,12 +1191,7 @@ chk_ins_cleanup(struct chk_instance *ins) chk_stop_sched(ins); ins->ci_inited = 0; - chk_iv_ns_cleanup(&ins->ci_iv_ns); - - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } + chk_iv_ns_destroy(ins); } int @@ -1215,7 +1206,8 @@ chk_ins_init(struct chk_instance **p_ins) if (ins == NULL) D_GOTO(out_init, rc = -DER_NOMEM); - ins->ci_sched = ABT_THREAD_NULL; + ins->ci_sched = ABT_THREAD_NULL; + ins->ci_dead_rank_ult = ABT_THREAD_NULL; ins->ci_rank_hdl = DAOS_HDL_INVAL; D_INIT_LIST_HEAD(&ins->ci_rank_list); @@ -1281,6 +1273,8 @@ chk_ins_fini(struct chk_instance **p_ins) D_ASSERT(daos_handle_is_inval(ins->ci_pending_hdl)); D_ASSERT(d_list_empty(&ins->ci_pool_shutdown_list)); + D_ASSERT(ins->ci_dead_rank_ult == ABT_THREAD_NULL); + if (ins->ci_sched != ABT_THREAD_NULL) ABT_thread_free(&ins->ci_sched); diff --git a/src/chk/chk_engine.c b/src/chk/chk_engine.c index a5f7f1a4cfd..cd0de1138fa 
100644 --- a/src/chk/chk_engine.c +++ b/src/chk/chk_engine.c @@ -219,8 +219,7 @@ chk_engine_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_statu iv.ci_to_leader = 1; /* Notify the leader that check instance exit on the engine. */ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_TO_ROOT, - CRT_IV_SYNC_NONE, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_TO_ROOT, CRT_IV_SYNC_NONE); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" on rank %u notify leader for its exit, status %u: rc = %d\n", DP_ENGINE(ins), dss_self_rank(), ins_status, rc); @@ -1670,8 +1669,7 @@ chk_engine_pool_notify(struct chk_pool_rec *cpr) * to all engines. Otherwise, the engine out of the pool map cannot get * the notification. */ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER, - true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" on rank %u notify pool shards for "DF_UUIDF", phase %u, " "ins_status %u, pool_status %u: rc = %d\n", @@ -1683,8 +1681,7 @@ chk_engine_pool_notify(struct chk_pool_rec *cpr) iv.ci_from_psl = 0; iv.ci_to_leader = 1; /* Synchronously notify the check leader with the new check status/phase. 
*/ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_TO_ROOT, - CRT_IV_SYNC_NONE, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_TO_ROOT, CRT_IV_SYNC_NONE); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" on rank %u notify check leader for "DF_UUIDF", phase %u, " "ins_status %u, pool_status %u: rc = %d\n", @@ -2007,9 +2004,8 @@ chk_engine_sched(void *args) static int chk_engine_start_prep(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *ranks, - uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], uint64_t gen, int phase, uint32_t api_flags, - d_rank_t leader, uint32_t flags) + uint32_t policy_nr, struct chk_policy *policies, int pool_nr, uuid_t pools[], + uint64_t gen, uint32_t api_flags, d_rank_t leader, uint32_t flags) { struct chk_traverse_pools_args ctpa = { 0 }; struct chk_bookmark *cbk = &ins->ci_bk; @@ -2091,8 +2087,7 @@ chk_engine_start_prep(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *rank init: if (!chk_is_on_leader(gen, leader, true)) { - rc = chk_prop_prepare(leader, api_flags, phase, policy_nr, policies, rank_list, - prop); + rc = chk_prop_prepare(leader, api_flags, policy_nr, policies, rank_list, prop); if (rc != 0) goto out; @@ -2220,16 +2215,15 @@ chk_engine_pool_filter(uuid_t uuid, void *arg, int *phase) int chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, int pool_nr, uuid_t pools[], uint32_t api_flags, - int phase, d_rank_t leader, uint32_t flags, uuid_t iv_uuid, + uint32_t ns_ver, d_rank_t leader, uint32_t flags, uuid_t iv_uuid, struct ds_pool_clues *clues) { - struct chk_instance *ins = chk_engine; - struct chk_bookmark *cbk = &ins->ci_bk; - struct umem_attr uma = { 0 }; - char uuid_str[DAOS_UUID_STR_SIZE]; - d_rank_t myrank = dss_self_rank(); - int rc; - int rc1; + struct chk_instance *ins = chk_engine; + struct chk_bookmark *cbk = &ins->ci_bk; + struct umem_attr uma = {0}; + d_rank_t myrank = dss_self_rank(); + int rc; + 
int rc1; rc = chk_ins_can_start(ins); if (rc != 0) @@ -2251,12 +2245,7 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t polic if (ins->ci_sched != ABT_THREAD_NULL) ABT_thread_free(&ins->ci_sched); - chk_iv_ns_cleanup(&ins->ci_iv_ns); - - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } + chk_iv_ns_destroy(ins); uma.uma_id = UMEM_CLASS_VMEM; @@ -2270,27 +2259,20 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t polic if (rc != 0) goto out_tree; - rc = chk_engine_start_prep(ins, rank_nr, ranks, policy_nr, policies, - pool_nr, pools, gen, phase, api_flags, leader, flags); + rc = chk_engine_start_prep(ins, rank_nr, ranks, policy_nr, policies, pool_nr, pools, gen, + api_flags, leader, flags); if (rc != 0) goto out_tree; if (chk_is_on_leader(gen, leader, true)) { ins->ci_iv_ns = chk_leader_get_iv_ns(); - if (unlikely(ins->ci_iv_ns == NULL)) - goto out_tree; + D_ASSERT(ins->ci_iv_ns != NULL); + + ins->ci_ns_ver = ns_ver; } else { - uuid_unparse_lower(iv_uuid, uuid_str); - rc = crt_group_secondary_create(uuid_str, NULL, ins->ci_ranks, &ins->ci_iv_group); + rc = chk_iv_ns_create(ins, iv_uuid, leader, ns_ver); if (rc != 0) goto out_tree; - - rc = ds_iv_ns_create(dss_get_module_info()->dmi_ctx, iv_uuid, ins->ci_iv_group, - &ins->ci_iv_id, &ins->ci_iv_ns); - if (rc != 0) - goto out_group; - - ds_iv_ns_update(ins->ci_iv_ns, leader, ins->ci_iv_ns->iv_master_term + 1); } uuid_copy(cbk->cb_iv_uuid, iv_uuid); @@ -2324,12 +2306,7 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t polic D_WARN(DF_ENGINE" failed to update engine bookmark: "DF_RC"\n", DP_ENGINE(ins), DP_RC(rc1)); } - chk_iv_ns_cleanup(&ins->ci_iv_ns); -out_group: - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } + chk_iv_ns_destroy(ins); out_tree: chk_destroy_pending_tree(ins); chk_destroy_pool_tree(ins); @@ 
-2337,17 +2314,18 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t polic ins->ci_starting = 0; out_log: if (rc >= 0) { - D_INFO(DF_ENGINE " %s on rank %u with api_flags %x, phase %d, leader %u, " - "flags %x, iv "DF_UUIDF": rc %d\n", + D_INFO(DF_ENGINE " %s on rank %u with api_flags %x, ns_ver %d, leader %u, " + "flags %x, iv " DF_UUIDF ": rc %d\n", DP_ENGINE(ins), chk_is_ins_reset(ins, api_flags) ? "start" : "resume", - myrank, api_flags, phase, leader, flags, DP_UUID(iv_uuid), rc); + myrank, api_flags, ns_ver, leader, flags, DP_UUID(iv_uuid), rc); chk_ranks_dump(ins->ci_ranks->rl_nr, ins->ci_ranks->rl_ranks); chk_pools_dump(&ins->ci_pool_list, pool_nr, pools); } else { - D_ERROR(DF_ENGINE" failed to start on rank %u with %d pools, api_flags %x, " - "phase %d, leader %u, flags %x, gen "DF_X64", iv "DF_UUIDF": "DF_RC"\n", - DP_ENGINE(ins), myrank, pool_nr, api_flags, phase, leader, flags, gen, + D_ERROR(DF_ENGINE " failed to start on rank %u with %d pools, api_flags %x, " + "ns_ver %d, leader %u, flags %x, gen " DF_X64 ", iv " DF_UUIDF + ": " DF_RC "\n", + DP_ENGINE(ins), myrank, pool_nr, api_flags, ns_ver, leader, flags, gen, DP_UUID(iv_uuid), DP_RC(rc)); } @@ -2373,7 +2351,7 @@ chk_engine_stop(uint64_t gen, int pool_nr, uuid_t pools[], uint32_t *flags) if (cbk->cb_magic != CHK_BK_MAGIC_ENGINE) D_GOTO(log, rc = -DER_NOTAPPLICABLE); - if (ins->ci_starting) + if (ins->ci_starting || ins->ci_rejoining) D_GOTO(log, rc = -DER_BUSY); if (ins->ci_stopping || ins->ci_sched_exiting) @@ -2604,34 +2582,46 @@ chk_engine_query(uint64_t gen, int pool_nr, uuid_t pools[], uint32_t *ins_status int chk_engine_mark_rank_dead(uint64_t gen, d_rank_t rank, uint32_t version) { - struct chk_instance *ins = chk_engine; - struct chk_property *prop = &ins->ci_prop; - struct chk_bookmark *cbk = &ins->ci_bk; - d_rank_list_t *rank_list = NULL; - int rc = 0; + struct chk_instance *ins = chk_engine; + struct chk_property *prop = &ins->ci_prop; + struct chk_bookmark 
*cbk = &ins->ci_bk; + int rc = 0; CHK_IS_READY(ins); if (cbk->cb_gen != gen) D_GOTO(out, rc = -DER_NOTAPPLICABLE); - rc = chk_prop_fetch(prop, &rank_list); - if (rc != 0) - goto out; + /* For check engine on the leader, reload rank list that has been refreshed by leader. */ + if (chk_is_on_leader(cbk->cb_gen, prop->cp_leader, true)) { + d_rank_list_free(ins->ci_ranks); + ins->ci_ranks = NULL; + } + + if (ins->ci_ranks == NULL) { + rc = chk_prop_fetch(prop, &ins->ci_ranks); + if (rc != 0) + goto out; - D_ASSERT(rank_list != NULL); + /* For check engine on the leader, it's done. */ + if (chk_is_on_leader(cbk->cb_gen, prop->cp_leader, true)) { + ins->ci_ns_ver = version; + goto out; + } + } - /* For check engine on the leader, related rank has already been marked as "dead". */ - if (chk_is_on_leader(cbk->cb_gen, prop->cp_leader, true)) - goto group; + if (unlikely(ins->ci_ranks == NULL)) + D_GOTO(out, rc = -DER_NOTAPPLICABLE); - if (!chk_remove_rank_from_list(rank_list, rank)) + if (!chk_remove_rank_from_list(ins->ci_ranks, rank)) D_GOTO(out, rc = -DER_NOTAPPLICABLE); prop->cp_rank_nr--; - rc = chk_prop_update(prop, rank_list); + rc = chk_prop_update(prop, ins->ci_ranks); if (rc != 0) - goto out; + ins->ci_skip_oog = 1; + else + rc = chk_iv_ns_update(ins, version); /* * NOTE: If the rank dead before DAOS check start, then subsequent check start will @@ -2652,19 +2642,7 @@ chk_engine_mark_rank_dead(uint64_t gen, d_rank_t rank, uint32_t version) * sometime later as the DAOS check going. 
*/ -group: - if (ins->ci_iv_group != NULL) - rc = crt_group_secondary_modify(ins->ci_iv_group, rank_list, rank_list, - CRT_GROUP_MOD_OP_REPLACE, version); - out: - if (rc == 0) { - d_rank_list_free(ins->ci_ranks); - ins->ci_ranks = rank_list; - rank_list = NULL; - } - - d_rank_list_free(rank_list); if (rc != -DER_NOTAPPLICABLE) D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" on rank %u mark rank %u as dead with gen " @@ -3351,19 +3329,19 @@ chk_engine_notify(struct chk_iv *iv) void chk_engine_rejoin(void *args) { - struct chk_instance *ins = chk_engine; - struct chk_property *prop = &ins->ci_prop; - struct chk_bookmark *cbk = &ins->ci_bk; - uuid_t *pools = NULL; - struct chk_iv iv = { 0 }; - struct umem_attr uma = { 0 }; - char uuid_str[DAOS_UUID_STR_SIZE]; - d_rank_t myrank = dss_self_rank(); - uint32_t pool_nr = 0; - uint32_t flags = 0; - int rc = 0; - int rc1; - bool need_join = false; + struct chk_instance *ins = chk_engine; + struct chk_property *prop = &ins->ci_prop; + struct chk_bookmark *cbk = &ins->ci_bk; + d_rank_list_t *ranks = NULL; + uuid_t *pools = NULL; + struct chk_iv iv = {0}; + struct umem_attr uma = {0}; + d_rank_t myrank = dss_self_rank(); + uint32_t pool_nr = 0; + uint32_t flags = 0; + int rc = 0; + int rc1; + bool need_join = false; if (cbk->cb_magic != CHK_BK_MAGIC_ENGINE) goto out_log; @@ -3372,7 +3350,7 @@ chk_engine_rejoin(void *args) cbk->cb_ins_status != CHK__CHECK_INST_STATUS__CIS_PAUSED) goto out_log; - /* We do NOT support leader (and its associated engine ) to rejoin former check instance. */ + /* We do NOT support leader (and its associated engine) to rejoin former check instance. 
*/ if (chk_is_on_leader(cbk->cb_gen, prop->cp_leader, true)) goto out_log; @@ -3407,22 +3385,10 @@ chk_engine_rejoin(void *args) if (rc != 0) goto out_tree; - uuid_unparse_lower(cbk->cb_iv_uuid, uuid_str); - rc = crt_group_secondary_create(uuid_str, NULL, ins->ci_ranks, &ins->ci_iv_group); - if (rc != 0) - goto out_tree; - - rc = ds_iv_ns_create(dss_get_module_info()->dmi_ctx, cbk->cb_iv_uuid, ins->ci_iv_group, - &ins->ci_iv_id, &ins->ci_iv_ns); - if (rc != 0) - goto out_group; - - ds_iv_ns_update(ins->ci_iv_ns, prop->cp_leader, ins->ci_iv_ns->iv_master_term + 1); - again: /* Ask leader whether this engine can rejoin or not. */ rc = chk_rejoin_remote(prop->cp_leader, cbk->cb_gen, myrank, cbk->cb_iv_uuid, &flags, - &pool_nr, &pools); + &ins->ci_ns_ver, &pool_nr, &pools, &ranks); if (rc != 0) { if ((rc == -DER_OOG || rc == -DER_GRPVER) && !ins->ci_pause) { D_INFO(DF_ENGINE" Someone is not ready %d, let's rejoin after 1 sec\n", @@ -3432,14 +3398,22 @@ chk_engine_rejoin(void *args) goto again; } - goto out_iv; + goto out_tree; } - if (pool_nr == 0) { + if (ranks == NULL || pool_nr == 0) { need_join = false; - D_GOTO(out_iv, rc = 1); + D_GOTO(out_tree, rc = 1); } + d_rank_list_free(ins->ci_ranks); + ins->ci_ranks = ranks; + ranks = NULL; + + rc = chk_iv_ns_create(ins, cbk->cb_iv_uuid, prop->cp_leader, ins->ci_ns_ver); + if (rc != 0) + goto out_tree; + rc = chk_pools_load_list(ins, cbk->cb_gen, 0, pool_nr, pools, NULL); if (rc != 0) goto out_notify; @@ -3479,21 +3453,17 @@ chk_engine_rejoin(void *args) iv.ci_to_leader = 1; /* Notify the leader that check instance exit on the engine. 
*/ - rc1 = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_TO_ROOT, CRT_IV_SYNC_NONE, true); + rc1 = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_TO_ROOT, CRT_IV_SYNC_NONE); D_CDEBUG(rc1 != 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" on rank %u notify leader for its exit, status %u: rc1 = %d\n", DP_ENGINE(ins), myrank, cbk->cb_ins_status, rc1); -out_iv: - chk_iv_ns_cleanup(&ins->ci_iv_ns); -out_group: - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } + chk_iv_ns_destroy(ins); out_tree: chk_destroy_pending_tree(ins); chk_destroy_pool_tree(ins); out_log: + d_rank_list_free(ranks); + D_FREE(pools); if (need_join) D_CDEBUG(rc < 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" rejoin on rank %u with iv "DF_UUIDF": "DF_RC"\n", @@ -3517,6 +3487,8 @@ chk_engine_setup(void) * related local inconsistency firstly. */ + chk_report_seq_init(ins); + rc = chk_bk_fetch_engine(cbk); if (rc == -DER_NONEXIST) goto prop; @@ -3560,15 +3532,14 @@ chk_engine_setup(void) rc = chk_prop_fetch(&ins->ci_prop, &ins->ci_ranks); if (rc == -DER_NONEXIST) rc = 0; -fini: - if (rc != 0) { - chk_ins_fini(&ins); - } else { - chk_report_seq_init(ins); + if (rc == 0) { ins->ci_inited = 1; ins->ci_pause = 0; } +fini: + if (rc != 0) + chk_engine_cleanup(); return rc; } diff --git a/src/chk/chk_internal.h b/src/chk/chk_internal.h index 039815576db..7074276c991 100644 --- a/src/chk/chk_internal.h +++ b/src/chk/chk_internal.h @@ -72,6 +72,7 @@ struct chk_pool_mbs { uint32_t *cpm_tgt_status; }; +/* clang-format off */ /* * CHK_START: * From check leader to check engine to start the check instance on specified pool(s) or all pools. 
@@ -79,7 +80,7 @@ struct chk_pool_mbs { #define DAOS_ISEQ_CHK_START \ ((uint64_t) (csi_gen) CRT_VAR) \ ((uint32_t) (csi_flags) CRT_VAR) \ - ((int32_t) (csi_phase) CRT_VAR) \ + ((int32_t) (csi_ns_ver) CRT_VAR) \ ((d_rank_t) (csi_leader_rank) CRT_VAR) \ ((uint32_t) (csi_api_flags) CRT_VAR) \ ((uuid_t) (csi_iv_uuid) CRT_VAR) \ @@ -254,6 +255,7 @@ CRT_RPC_DECLARE(chk_pool_mbs, DAOS_ISEQ_CHK_POOL_MBS, DAOS_OSEQ_CHK_POOL_MBS); CRT_RPC_DECLARE(chk_report, DAOS_ISEQ_CHK_REPORT, DAOS_OSEQ_CHK_REPORT); +/* clang-format off */ /* * CHK_REJOIN: * From check engine to check leader to require rejoin former check instance after the engine @@ -268,9 +270,13 @@ CRT_RPC_DECLARE(chk_report, DAOS_ISEQ_CHK_REPORT, DAOS_OSEQ_CHK_REPORT); #define DAOS_OSEQ_CHK_REJOIN \ ((int32_t) (cro_status) CRT_VAR) \ ((uint32_t) (cro_flags) CRT_VAR) \ + ((uint32_t) (cro_ns_ver) CRT_VAR) \ + ((uint32_t) (cro_padding) CRT_VAR) \ + ((d_rank_t) (cro_ranks) CRT_ARRAY) \ ((uuid_t) (cro_pools) CRT_ARRAY) CRT_RPC_DECLARE(chk_rejoin, DAOS_ISEQ_CHK_REJOIN, DAOS_OSEQ_CHK_REJOIN); +/* clang-format on */ /* dkey for check DB under sys_db */ #define CHK_DB_TABLE "chk" @@ -485,16 +491,12 @@ struct chk_bookmark { * 'reset' for all pools. */ struct chk_property { - d_rank_t cp_leader; - Chk__CheckFlag cp_flags; - Chk__CheckInconsistAction cp_policies[CHK_POLICY_MAX]; - /* - * NOTE: Preserve for supporting to continue the check until the specified phase in the - * future. -1 means to check all phases. - */ - int32_t cp_phase; + d_rank_t cp_leader; + Chk__CheckFlag cp_flags; + Chk__CheckInconsistAction cp_policies[CHK_POLICY_MAX]; + uint32_t cp_padding; /* How many ranks (ever or should) take part in the check instance. 
*/ - uint32_t cp_rank_nr; + uint32_t cp_rank_nr; }; /* @@ -537,6 +539,7 @@ struct chk_instance { d_list_t ci_dead_ranks; ABT_thread ci_sched; + ABT_thread ci_dead_rank_ult; ABT_rwlock ci_abt_lock; ABT_mutex ci_abt_mutex; ABT_cond ci_abt_cond; @@ -544,20 +547,12 @@ struct chk_instance { /* Generator for report event, pending repair actions, and so on. */ uint64_t ci_seq; - uint32_t ci_is_leader:1, - ci_sched_running:1, - ci_sched_exiting:1, - ci_for_orphan:1, - ci_orphan_done:1, /* leader has processed orphan pools. */ - ci_pool_stopped:1, /* check on some pools have been stopped. */ - ci_starting:1, - ci_stopping:1, - ci_started:1, - ci_inited:1, - ci_pause:1, - ci_rejoining:1, - ci_implicated:1; - uint32_t ci_start_flags; + uint32_t ci_is_leader : 1, ci_sched_running : 1, ci_sched_exiting : 1, ci_for_orphan : 1, + ci_orphan_done : 1, ci_pool_stopped : 1, /* check on some pools have been stopped. */ + ci_starting : 1, ci_stopping : 1, ci_started : 1, ci_inited : 1, ci_pause : 1, + ci_skip_oog : 1, ci_rejoining : 1, ci_implicated : 1; + uint32_t ci_start_flags; + uint32_t ci_ns_ver; }; struct chk_iv { @@ -738,9 +733,8 @@ int chk_pending_wakeup(struct chk_instance *ins, struct chk_pending_rec *cpr); void chk_pending_destroy(struct chk_pending_rec *cpr); -int chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase, - uint32_t policy_nr, struct chk_policy *policies, - d_rank_list_t *ranks, struct chk_property *prop); +int chk_prop_prepare(d_rank_t leader, uint32_t flags, uint32_t policy_nr, + struct chk_policy *policies, d_rank_list_t *ranks, struct chk_property *prop); uint32_t chk_pool_merge_status(uint32_t status_a, uint32_t status_b); @@ -757,7 +751,7 @@ void chk_ins_fini(struct chk_instance **p_ins); int chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], uint32_t api_flags, int phase, d_rank_t leader, + uuid_t pools[], uint32_t api_flags, uint32_t ns_ver, d_rank_t 
leader, uint32_t flags, uuid_t iv_uuid, struct ds_pool_clues *clues); int chk_engine_stop(uint64_t gen, int pool_nr, uuid_t pools[], uint32_t *flags); @@ -792,7 +786,14 @@ void chk_engine_fini(void); /* chk_iv.c */ -int chk_iv_update(void *ns, struct chk_iv *iv, uint32_t shortcut, uint32_t sync_mode, bool retry); +void chk_iv_ns_destroy(struct chk_instance *ins); + +int chk_iv_ns_create(struct chk_instance *ins, uuid_t uuid, d_rank_t leader, uint32_t ns_ver); + +int chk_iv_ns_update(struct chk_instance *ins, uint32_t ns_ver); + +int chk_iv_update(struct chk_instance *ins, struct chk_iv *iv, uint32_t shortcut, + uint32_t sync_mode); int chk_iv_init(void); @@ -808,8 +809,8 @@ int chk_leader_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) int chk_leader_notify(struct chk_iv *iv); -int chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, int *pool_nr, - uuid_t **pools); +int chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, + uint32_t *ns_ver, int *pool_nr, uuid_t **pools, d_rank_list_t **ranks); int chk_leader_setup(void); @@ -823,8 +824,8 @@ void chk_leader_fini(void); int chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], uint32_t api_flags, int phase, d_rank_t leader, uint32_t flags, - uuid_t iv_uuid, chk_co_rpc_cb_t start_cb, void *args); + uuid_t pools[], uint32_t api_flags, uint32_t ns_ver, d_rank_t leader, + uint32_t flags, uuid_t iv_uuid, chk_co_rpc_cb_t start_cb, void *args); int chk_stop_remote(d_rank_list_t *rank_list, uint64_t gen, int pool_nr, uuid_t pools[], chk_co_rpc_cb_t stop_cb, void *args); @@ -853,7 +854,7 @@ int chk_report_remote(d_rank_t leader, uint64_t gen, uint32_t cla, uint32_t act, uint32_t detail_nr, d_sg_list_t *details, uint64_t seq); int chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, - uint32_t 
*pool_nr, uuid_t **pools); + uint32_t *ns_ver, uint32_t *pool_nr, uuid_t **pools, d_rank_list_t **ranks); /* chk_updcall.c */ @@ -917,41 +918,24 @@ chk_ins_set_fail(struct chk_instance *ins, uint32_t phase) static inline bool chk_rank_in_list(d_rank_list_t *rlist, d_rank_t rank) { - int i; - bool found = false; - - /* TBD: more efficiently search for the sorted ranks list. */ - - for (i = 0; i < rlist->rl_nr; i++) { - if (rlist->rl_ranks[i] == rank) { - found = true; - break; - } - } - - return found; + return d_rank_list_bsearch(rlist, rank, NULL); } static inline bool chk_remove_rank_from_list(d_rank_list_t *rlist, d_rank_t rank) { - int i; - bool found = false; - - /* TBD: more efficiently search for the sorted ranks list. */ - - for (i = 0; i < rlist->rl_nr; i++) { - if (rlist->rl_ranks[i] == rank) { - found = true; - rlist->rl_nr--; - /* The leader rank will always be in the rank list. */ - D_ASSERT(rlist->rl_nr > 0); - - if (i < rlist->rl_nr) - memmove(&rlist->rl_ranks[i], &rlist->rl_ranks[i + 1], - sizeof(rlist->rl_ranks[i]) * (rlist->rl_nr - i)); - break; - } + int idx = -1; + bool found = false; + + if (d_rank_list_bsearch(rlist, rank, &idx)) { + D_ASSERT(rlist->rl_nr > 0); + D_ASSERT(idx >= 0); + + rlist->rl_nr--; + if (idx < rlist->rl_nr) + memmove(&rlist->rl_ranks[idx], &rlist->rl_ranks[idx + 1], + sizeof(rlist->rl_ranks[idx]) * (rlist->rl_nr - idx)); + found = true; } return found; @@ -1003,17 +987,6 @@ chk_query_free(struct chk_query_pool_shard *shards, uint32_t shard_nr) } } -static inline void -chk_iv_ns_cleanup(struct ds_iv_ns **ns) -{ - if (*ns != NULL) { - if ((*ns)->iv_refcount == 1) - ds_iv_ns_cleanup(*ns); - ds_iv_ns_put(*ns); - *ns = NULL; - } -} - static inline void chk_pool_get(struct chk_pool_rec *cpr) { @@ -1212,7 +1185,7 @@ chk_ins_can_start(struct chk_instance *ins) if (ins->ci_starting) return -DER_INPROGRESS; - if (ins->ci_stopping || ins->ci_sched_exiting) + if (ins->ci_stopping || ins->ci_sched_exiting || ins->ci_rejoining) return 
-DER_BUSY; if (ins->ci_sched_running) diff --git a/src/chk/chk_iv.c b/src/chk/chk_iv.c index 299c1554856..712fd474c21 100644 --- a/src/chk/chk_iv.c +++ b/src/chk/chk_iv.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -181,13 +182,76 @@ struct ds_iv_class_ops chk_iv_ops = { .ivc_value_alloc = chk_iv_value_alloc, }; +void +chk_iv_ns_destroy(struct chk_instance *ins) +{ + if (ins->ci_iv_ns != NULL) { + if (ins->ci_iv_ns->iv_refcount == 1) + ds_iv_ns_cleanup(ins->ci_iv_ns); + ds_iv_ns_put(ins->ci_iv_ns); + ins->ci_iv_ns = NULL; + } + + if (ins->ci_iv_group != NULL) { + crt_group_secondary_destroy(ins->ci_iv_group); + ins->ci_iv_group = NULL; + } +} + int -chk_iv_update(void *ns, struct chk_iv *iv, uint32_t shortcut, uint32_t sync_mode, bool retry) +chk_iv_ns_create(struct chk_instance *ins, uuid_t uuid, d_rank_t leader, uint32_t ns_ver) { - d_sg_list_t sgl; - d_iov_t iov; - struct ds_iv_key key; - int rc; + char uuid_str[DAOS_UUID_STR_SIZE]; + int rc; + + uuid_unparse_lower(uuid, uuid_str); + rc = crt_group_secondary_create(uuid_str, NULL, NULL, &ins->ci_iv_group); + if (rc != 0) + goto out; + + rc = ds_iv_ns_create(dss_get_module_info()->dmi_ctx, uuid, ins->ci_iv_group, &ins->ci_iv_id, + &ins->ci_iv_ns); + if (rc != 0) + goto out; + + rc = chk_iv_ns_update(ins, ns_ver); + if (rc == 0) { + ds_iv_ns_update(ins->ci_iv_ns, leader, ins->ci_iv_ns->iv_master_term + 1); + ins->ci_skip_oog = 0; + } + +out: + if (rc != 0) + chk_iv_ns_destroy(ins); + return rc; +} + +int +chk_iv_ns_update(struct chk_instance *ins, uint32_t ns_ver) +{ + int rc; + + /* Let secondary rank == primary rank. 
*/ + rc = crt_group_secondary_modify(ins->ci_iv_group, ins->ci_ranks, ins->ci_ranks, + CRT_GROUP_MOD_OP_REPLACE, ns_ver); + if (rc == 0) + ins->ci_ns_ver = ns_ver; + else + ins->ci_skip_oog = 1; + + return rc; +} + +int +chk_iv_update(struct chk_instance *ins, struct chk_iv *iv, uint32_t shortcut, uint32_t sync_mode) +{ + d_sg_list_t sgl; + d_iov_t iov; + struct ds_iv_key key; + uint32_t ver; + int try_cnt = 0; + int wait_cnt = 0; + int rc; iv->ci_rank = dss_self_rank(); iv->ci_seq = d_hlc_get(); @@ -208,9 +272,35 @@ chk_iv_update(void *ns, struct chk_iv *iv, uint32_t shortcut, uint32_t sync_mode memset(&key, 0, sizeof(key)); key.class_id = IV_CHK; - rc = ds_iv_update(ns, &key, &sgl, shortcut, sync_mode, 0, retry); + +again: + try_cnt++; + ver = ins->ci_ns_ver; + rc = ds_iv_update(ins->ci_iv_ns, &key, &sgl, shortcut, sync_mode, 0, true); + if (likely(rc != -DER_OOG)) + goto out; + + if (try_cnt % 10 == 0) + D_WARN("CHK iv " DF_X64 "/" DF_X64 " retry because of -DER_OOG for more " + "than %d times.\n", + iv->ci_gen, iv->ci_seq, try_cnt); + + /* Wait chk_deak_rank_ult to sync the IV namespace. */ + while (ver == ins->ci_ns_ver && ins->ci_skip_oog == 0 && ins->ci_pause == 0) { + dss_sleep(500); + if (++wait_cnt % 40 == 0) + D_WARN("CHK iv " DF_X64 "/" DF_X64 " is blocked because of DER_OOG " + "for %d seconds.\n", + iv->ci_gen, iv->ci_seq, wait_cnt / 2); + } + + if (ins->ci_pause || ins->ci_skip_oog) + goto out; + + goto again; } +out: D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, "CHK iv "DF_X64"/"DF_X64" on rank %u, phase %u, ins_status %u, " "pool_status %u, to_leader %s, from_psl %s: rc = %d\n", diff --git a/src/chk/chk_leader.c b/src/chk/chk_leader.c index e51aa415670..01b35cc158e 100644 --- a/src/chk/chk_leader.c +++ b/src/chk/chk_leader.c @@ -244,8 +244,7 @@ chk_leader_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_statu iv.ci_ins_status = ins_status; /* Synchronously notify the engines that the check leader exit. 
*/ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, - CRT_IV_SYNC_EAGER, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_LEADER" notify the engines its exit, status %u: rc = %d\n", DP_LEADER(ins), ins_status, rc); @@ -318,8 +317,7 @@ chk_leader_post_repair(struct chk_instance *ins, struct chk_pool_rec *cpr, iv.ci_pool_status = cbk->cb_pool_status; /* Synchronously notify the engines that check on the pool got failure. */ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, - CRT_IV_SYNC_EAGER, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_LEADER" notify the engines that the check for pool " DF_UUIDF" is done with status %u: rc = %d\n", @@ -2076,8 +2074,7 @@ chk_leader_pool_ult(void *arg) iv.ci_phase = cbk->cb_phase; iv.ci_pool_status = CHK__CHECK_POOL_STATUS__CPS_FAILED; - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, - CRT_IV_SYNC_EAGER, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_LEADER" notify engines to exit check for pool "DF_UUIDF" failure: %d\n", DP_LEADER(ins), DP_UUID(cpr->cpr_uuid), rc); @@ -2097,21 +2094,22 @@ chk_leader_pool_ult(void *arg) static void chk_leader_mark_rank_dead(struct chk_instance *ins, struct chk_dead_rank *cdr) { - struct chk_property *prop = &ins->ci_prop; - struct chk_bookmark *cbk = &ins->ci_bk; - uint32_t version = cbk->cb_gen - prop->cp_rank_nr - 1; - int rc = 0; + struct chk_property *prop = &ins->ci_prop; + struct chk_bookmark *cbk = &ins->ci_bk; + uint32_t version = ins->ci_ns_ver + 1; + int rc = 0; if (!chk_remove_rank_from_list(ins->ci_ranks, cdr->cdr_rank)) D_GOTO(out, rc = -DER_NOTAPPLICABLE); prop->cp_rank_nr--; rc = chk_prop_update(prop, ins->ci_ranks); - if (rc != 0) + if (rc != 0) { + ins->ci_skip_oog = 1; goto out; + } - rc = 
crt_group_secondary_modify(ins->ci_iv_group, ins->ci_ranks, ins->ci_ranks, - CRT_GROUP_MOD_OP_REPLACE, version); + rc = chk_iv_ns_update(ins, version); if (rc != 0) goto out; @@ -2150,17 +2148,15 @@ chk_leader_mark_rank_dead(struct chk_instance *ins, struct chk_dead_rank *cdr) static void chk_leader_sched(void *args) { - struct chk_instance *ins = args; - struct chk_bookmark *cbk = &ins->ci_bk; - struct chk_dead_rank *cdr; - struct chk_iv iv = { 0 }; - uint32_t ins_phase; - uint32_t ins_status; - uint32_t pool_status; - int done = 0; - int rc = 0; - bool bcast = false; - bool more_dead; + struct chk_instance *ins = args; + struct chk_bookmark *cbk = &ins->ci_bk; + struct chk_iv iv = {0}; + uint32_t ins_phase; + uint32_t ins_status; + uint32_t pool_status; + int done = 0; + int rc = 0; + bool bcast = false; D_INFO(DF_LEADER" scheduler enter at phase %u\n", DP_LEADER(ins), cbk->cb_phase); @@ -2191,29 +2187,9 @@ chk_leader_sched(void *args) while (1) { dss_sleep(300); -check_dead: - ABT_mutex_lock(ins->ci_abt_mutex); - if (!d_list_empty(&ins->ci_dead_ranks)) { - cdr = d_list_pop_entry(&ins->ci_dead_ranks, struct chk_dead_rank, cdr_link); - if (!d_list_empty(&ins->ci_dead_ranks)) - more_dead = true; - else - more_dead = false; - } else { - cdr = NULL; - more_dead = false; - } - ABT_mutex_unlock(ins->ci_abt_mutex); - - if (cdr != NULL) - chk_leader_mark_rank_dead(ins, cdr); - if (chk_leader_need_stop(ins, &rc)) D_GOTO(out, bcast = (rc > 0 ? true : false)); - if (more_dead) - goto check_dead; - /* * TBD: The leader may need to detect engines' status/phase actively, otherwise * if some engine failed to notify the leader for its status/phase changes, @@ -2229,8 +2205,7 @@ chk_leader_sched(void *args) iv.ci_ins_status = CHK__CHECK_INST_STATUS__CIS_RUNNING; /* Synchronously notify engines that orphan pools have been processed. 
*/ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, - CRT_IV_SYNC_EAGER, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_LEADER" notify engines that orphan pools have been process: %d\n", DP_LEADER(ins), rc); @@ -2401,8 +2376,8 @@ chk_leader_ranks_prepare(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *r static int chk_leader_start_prep(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *ranks, - uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], int phase, d_rank_t leader, uint32_t flags) + uint32_t policy_nr, struct chk_policy *policies, int pool_nr, uuid_t pools[], + d_rank_t leader, uint32_t flags) { struct chk_property *prop = &ins->ci_prop; struct chk_bookmark *cbk = &ins->ci_bk; @@ -2485,7 +2460,7 @@ chk_leader_start_prep(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *rank cbk->cb_version = DAOS_CHK_VERSION; init: - rc = chk_prop_prepare(leader, flags, phase, policy_nr, policies, rank_list, prop); + rc = chk_prop_prepare(leader, flags, policy_nr, policies, rank_list, prop); if (rc != 0) goto out; @@ -2626,8 +2601,7 @@ chk_leader_start_post(struct chk_instance *ins) * to notify the engine for the check done, that is not fatal. That * can be redo in next check instance. 
*/ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, - CRT_IV_SYNC_EAGER, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_LEADER" notify engines the pool "DF_UUIDF" is checked: %d\n", DP_LEADER(ins), DP_UUID(cpr->cpr_uuid), rc); @@ -2807,20 +2781,20 @@ chk_leader_start_cb(struct chk_co_rpc_cb_args *cb_args) int chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, - int pool_nr, uuid_t pools[], uint32_t api_flags, int phase) + int pool_nr, uuid_t pools[], uint32_t api_flags) { - struct chk_instance *ins = chk_leader; - struct chk_bookmark *cbk = &ins->ci_bk; - uuid_t *c_pools = NULL; - struct umem_attr uma = { 0 }; - uuid_t dummy_pool = { 0 }; - char uuid_str[DAOS_UUID_STR_SIZE]; - uint64_t old_gen = cbk->cb_gen; - d_rank_t myrank = dss_self_rank(); - uint32_t flags = api_flags; - int c_pool_nr = 0; - int rc; - int rc1; + struct chk_instance *ins = chk_leader; + struct chk_bookmark *cbk = &ins->ci_bk; + uuid_t *c_pools = NULL; + struct umem_attr uma = {0}; + uuid_t dummy_pool = {0}; + uint64_t old_gen = cbk->cb_gen; + d_rank_t myrank = dss_self_rank(); + uint32_t flags = api_flags; + uint32_t ns_ver = (uint32_t)daos_wallclock_secs(); + int c_pool_nr = 0; + int rc; + int rc1; rc = chk_ins_can_start(ins); if (rc != 0) @@ -2845,13 +2819,7 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c if (ins->ci_sched != ABT_THREAD_NULL) ABT_thread_free(&ins->ci_sched); - chk_iv_ns_cleanup(&ins->ci_iv_ns); - - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } - + chk_iv_ns_destroy(ins); uma.uma_id = UMEM_CLASS_VMEM; rc = dbtree_create_inplace(DBTREE_CLASS_CHK_RANK, 0, CHK_BTREE_ORDER, &uma, @@ -2870,8 +2838,8 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c goto out_tree; reset: - rc = chk_leader_start_prep(ins, rank_nr, 
ranks, policy_nr, policies, pool_nr, pools, - phase, myrank, flags); + rc = chk_leader_start_prep(ins, rank_nr, ranks, policy_nr, policies, pool_nr, pools, myrank, + flags); if (rc == 1 && !(flags & CHK__CHECK_FLAG__CF_RESET)) { /* Former check instance has done, let's re-start from the beginning. */ flags |= CHK__CHECK_FLAG__CF_RESET; @@ -2885,18 +2853,10 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c goto remote; uuid_generate(dummy_pool); - uuid_unparse_lower(dummy_pool, uuid_str); - rc = crt_group_secondary_create(uuid_str, NULL, ins->ci_ranks, &ins->ci_iv_group); + rc = chk_iv_ns_create(ins, dummy_pool, myrank, ns_ver); if (rc != 0) goto out_tree; - rc = ds_iv_ns_create(dss_get_module_info()->dmi_ctx, dummy_pool, ins->ci_iv_group, - &ins->ci_iv_id, &ins->ci_iv_ns); - if (rc != 0) - goto out_group; - - ds_iv_ns_update(ins->ci_iv_ns, myrank, ins->ci_iv_ns->iv_master_term + 1); - if (d_list_empty(&ins->ci_pool_list)) { c_pool_nr = pool_nr; c_pools = pools; @@ -2908,7 +2868,7 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c remote: rc = chk_start_remote(ins->ci_ranks, cbk->cb_gen, rank_nr, ranks, policy_nr, policies, - c_pool_nr, c_pools, flags, phase, myrank, ins->ci_start_flags, + c_pool_nr, c_pools, flags, ns_ver, myrank, ins->ci_start_flags, dummy_pool, chk_leader_start_cb, ins); if (rc != 0) { if (rc == -DER_OOG || rc == -DER_GRPVER || rc == -DER_AGAIN) { @@ -2951,10 +2911,9 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c goto out_stop_pools; } - D_INFO("Leader %s check with api_flags %x, phase %d, leader %u, flags %x, gen " DF_X64 - " iv "DF_UUIDF": rc %d\n", - chk_is_ins_reset(ins, flags) ? "start" : "resume", api_flags, phase, myrank, - ins->ci_start_flags, cbk->cb_gen, DP_UUID(dummy_pool), rc); + D_INFO("Leader %s with api_flags %x, leader %u, flags %x, gen " DF_X64 " iv " DF_UUIDF "\n", + chk_is_ins_reset(ins, flags) ? 
"start" : "resume", api_flags, myrank, + ins->ci_start_flags, cbk->cb_gen, DP_UUID(dummy_pool)); chk_ranks_dump(ins->ci_ranks->rl_nr, ins->ci_ranks->rl_ranks); chk_pools_dump(&ins->ci_pool_list, c_pool_nr > 0 ? c_pool_nr : pool_nr, @@ -2977,8 +2936,6 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c D_WARN(DF_LEADER" failed to rollback failed check start: "DF_RC"\n", DP_LEADER(ins), DP_RC(rc1)); out_iv: - chk_iv_ns_cleanup(&ins->ci_iv_ns); -out_group: if (cbk->cb_ins_status == CHK__CHECK_INST_STATUS__CIS_RUNNING || cbk->cb_gen != old_gen) { cbk->cb_gen = old_gen; if (cbk->cb_ins_status == CHK__CHECK_INST_STATUS__CIS_RUNNING) { @@ -2990,17 +2947,16 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c D_WARN(DF_LEADER" failed to update leader bookmark: "DF_RC"\n", DP_LEADER(ins), DP_RC(rc1)); } - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; + chk_iv_ns_destroy(ins); out_tree: chk_leader_destroy_trees(ins); ins->ci_starting = 0; out_log: - D_CDEBUG(likely(rc < 0), DLOG_ERR, DLOG_INFO, - "Leader %s to start check on %u ranks for %d pools with " - "api_flags %x, phase %d, leader %u, gen "DF_X64": rc = %d\n", - rc < 0 ? "failed" : "try", rank_nr, pool_nr, api_flags, phase, - myrank, cbk->cb_gen, rc); + DL_CDEBUG(likely(rc < 0), DLOG_ERR, DLOG_INFO, rc, + "Leader %s to start check on %u ranks for %d pools with api_flags %x, ns_ver %d, " + "leader %u, gen " DF_X64, + rc < 0 ? 
"failed" : "try", rank_nr, pool_nr, api_flags, ns_ver, myrank, + cbk->cb_gen); if (unlikely(rc > 0)) rc = 0; @@ -3242,22 +3198,25 @@ int chk_leader_query(int pool_nr, uuid_t pools[], chk_query_head_cb_t head_cb, chk_query_pool_cb_t pool_cb, void *buf) { - struct chk_instance *ins = chk_leader; - struct chk_bookmark *cbk = &ins->ci_bk; - struct chk_query_args *cqa = NULL; - struct chk_pool_rec *cpr; - struct chk_pool_rec *tmp; - struct chk_pool_shard *cps; - struct chk_query_pool_shard *shard; - d_iov_t kiov; - d_iov_t riov; - uint64_t gen = cbk->cb_gen; - uint32_t status; - uint32_t phase; - uint32_t idx = 0; - int rc; - int i; - bool skip; + struct chk_instance *ins = chk_leader; + struct chk_bookmark *cbk = &ins->ci_bk; + struct chk_query_args *cqa = NULL; + struct chk_pool_rec *cpr; + struct chk_pool_rec *tmp; + struct chk_pool_shard *cps; + struct chk_query_pool_shard *shard; + d_iov_t kiov; + d_iov_t riov; + uint64_t gen = cbk->cb_gen; + uint32_t status; + uint32_t phase; + uint32_t idx = 0; + uint32_t ver; + int try_cnt = 0; + int wait_cnt = 0; + int rc; + int i; + bool skip; CHK_IS_READY(ins); @@ -3286,22 +3245,38 @@ chk_leader_query(int pool_nr, uuid_t pools[], chk_query_head_cb_t head_cb, D_GOTO(out, rc = -DER_NOMEM); again: - rc = chk_query_remote(ins->ci_ranks, gen, pool_nr, pools, chk_leader_query_cb, cqa); + try_cnt++; + ver = ins->ci_ns_ver; + rc = chk_query_remote(ins->ci_ranks, gen, pool_nr, pools, chk_leader_query_cb, cqa); if (rc != 0) { - if (rc == -DER_OOG || rc == -DER_GRPVER || rc == -DER_AGAIN) { - D_INFO(DF_LEADER" Someone is not ready %d, let's retry query after 1 sec\n", - DP_LEADER(ins), rc); - if (!d_list_empty(&cqa->cqa_list)) { - chk_cqa_free(cqa); - cqa = chk_cqa_alloc(ins); - if (cqa == NULL) - D_GOTO(out, rc = -DER_NOMEM); - } - dss_sleep(1000); - goto again; + if (rc != -DER_OOG && rc != -DER_GRPVER && rc != -DER_AGAIN) + goto out; + + if (try_cnt % 10 == 0) + D_WARN("Leader (" DF_X64 ") query retried because of %d for %d 
times.\n", + gen, rc, try_cnt); + + while (ver == ins->ci_ns_ver && ins->ci_skip_oog == 0 && ins->ci_pause == 0) { + dss_sleep(500); + if (++wait_cnt % 40 == 0) + D_WARN("Leader (" DF_X64 ") query is blocked because of %d for " + "about %d seconds.\n", + gen, rc, wait_cnt / 2); + if (rc != -DER_OOG) + break; } - goto out; + if (ins->ci_pause || ins->ci_skip_oog) + goto out; + + if (!d_list_empty(&cqa->cqa_list)) { + chk_cqa_free(cqa); + cqa = chk_cqa_alloc(ins); + if (cqa == NULL) + D_GOTO(out, rc = -DER_NOMEM); + } + + goto again; } d_list_for_each_entry(cpr, &ins->ci_pool_list, cpr_link) { @@ -3743,8 +3718,8 @@ chk_leader_notify(struct chk_iv *iv) } int -chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, int *pool_nr, - uuid_t **pools) +chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, uint32_t *ns_ver, + int *pool_nr, uuid_t **pools, d_rank_list_t **ranks) { struct chk_instance *ins = chk_leader; struct chk_bookmark *cbk = &ins->ci_bk; @@ -3771,7 +3746,9 @@ chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, if (ins->ci_orphan_done) *flags = CRF_ORPHAN_DONE; - rc = chk_leader_pools2list(ins, pool_nr, pools); + *ns_ver = ins->ci_ns_ver; + *ranks = ins->ci_ranks; + rc = chk_leader_pools2list(ins, pool_nr, pools); out: D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, @@ -3789,6 +3766,9 @@ chk_rank_event_cb(d_rank_t rank, uint64_t incarnation, enum crt_event_source src struct chk_dead_rank *cdr = NULL; int rc = 0; + if (ins->ci_ranks == NULL) + D_GOTO(out, rc = -DER_NOTAPPLICABLE); + /* Ignore the event that is not applicable to current rank. 
*/ if (src != CRT_EVS_SWIM) @@ -3797,9 +3777,6 @@ chk_rank_event_cb(d_rank_t rank, uint64_t incarnation, enum crt_event_source src if (type != CRT_EVT_DEAD && type != CRT_EVT_ALIVE) D_GOTO(out, rc = -DER_NOTAPPLICABLE); - if (!ins->ci_sched_running) - D_GOTO(out, rc = -DER_NOTAPPLICABLE); - if (type == CRT_EVT_DEAD) { D_ALLOC_PTR(cdr); if (cdr == NULL) @@ -3834,6 +3811,34 @@ chk_rank_event_cb(d_rank_t rank, uint64_t incarnation, enum crt_event_source src DP_LEADER(ins), rank, type == CRT_EVT_DEAD ? "dead" : "alive", DP_RC(rc)); } +static void +chk_dead_rank_ult(void *args) +{ + struct chk_instance *ins = args; + struct chk_dead_rank *cdr; + + while (ins->ci_inited) { + cdr = NULL; + if (!d_list_empty(&ins->ci_dead_ranks)) { + ABT_mutex_lock(ins->ci_abt_mutex); + if (likely(!d_list_empty(&ins->ci_dead_ranks))) + cdr = d_list_pop_entry(&ins->ci_dead_ranks, struct chk_dead_rank, + cdr_link); + ABT_mutex_unlock(ins->ci_abt_mutex); + } + + if (cdr != NULL) + chk_leader_mark_rank_dead(ins, cdr); + + if (d_list_empty(&ins->ci_dead_ranks)) + dss_sleep(500); + } + + while ((cdr = d_list_pop_entry(&ins->ci_dead_ranks, struct chk_dead_rank, cdr_link)) != + NULL) + D_FREE(cdr); +} + int chk_leader_setup(void) { @@ -3849,6 +3854,8 @@ chk_leader_setup(void) * related local inconsistency firstly. 
*/ + chk_report_seq_init(ins); + rc = chk_bk_fetch_leader(cbk); if (rc == -DER_NONEXIST) goto prop; @@ -3901,17 +3908,19 @@ chk_leader_setup(void) prop: rc = chk_prop_fetch(&ins->ci_prop, &ins->ci_ranks); - if (rc == 0 || rc == -DER_NONEXIST) + if (rc != 0 && rc != -DER_NONEXIST) + goto fini; + + ins->ci_inited = 1; + ins->ci_pause = 0; + + rc = dss_ult_create(chk_dead_rank_ult, ins, DSS_XS_SYS, 0, 0, &ins->ci_dead_rank_ult); + if (rc == 0) rc = crt_register_event_cb(chk_rank_event_cb, NULL); -fini: - if (rc != 0) { - chk_ins_fini(&ins); - } else { - chk_report_seq_init(ins); - ins->ci_inited = 1; - ins->ci_pause = 0; - } +fini: + if (rc != 0) + chk_leader_cleanup(); return rc; } @@ -3920,8 +3929,13 @@ chk_leader_cleanup(void) { struct chk_instance *ins = chk_leader; + crt_unregister_event_cb(chk_rank_event_cb, NULL); + chk_ins_cleanup(ins); D_ASSERT(d_list_empty(&ins->ci_rank_list)); + + if (ins->ci_dead_rank_ult != ABT_THREAD_NULL) + ABT_thread_free(&ins->ci_dead_rank_ult); } int @@ -3939,6 +3953,5 @@ chk_leader_init(void) void chk_leader_fini(void) { - crt_unregister_event_cb(chk_rank_event_cb, NULL); chk_ins_fini(&chk_leader); } diff --git a/src/chk/chk_rpc.c b/src/chk/chk_rpc.c index e340d482cec..65f71cec6ff 100644 --- a/src/chk/chk_rpc.c +++ b/src/chk/chk_rpc.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. 
+ * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -491,8 +492,8 @@ chk_sg_rpc_prepare(d_rank_t rank, crt_opcode_t opc, crt_rpc_t **req) int chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, - uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], uint32_t api_flags, int phase, d_rank_t leader, uint32_t flags, + uint32_t policy_nr, struct chk_policy *policies, int pool_nr, uuid_t pools[], + uint32_t api_flags, uint32_t ns_ver, d_rank_t leader, uint32_t flags, uuid_t iv_uuid, chk_co_rpc_cb_t start_cb, void *args) { struct chk_co_rpc_cb_args cb_args = { 0 }; @@ -508,12 +509,12 @@ chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_ran if (rc != 0) goto out; - csi = crt_req_get(req); - csi->csi_gen = gen; - csi->csi_flags = flags; - csi->csi_phase = phase; + csi = crt_req_get(req); + csi->csi_gen = gen; + csi->csi_flags = flags; + csi->csi_ns_ver = ns_ver; csi->csi_leader_rank = leader; - csi->csi_api_flags = api_flags; + csi->csi_api_flags = api_flags; uuid_copy(csi->csi_iv_uuid, iv_uuid); csi->csi_ranks.ca_count = rank_nr; csi->csi_ranks.ca_arrays = ranks; @@ -569,9 +570,9 @@ chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_ran crt_req_decref(req); } - D_CDEBUG(rc < 0, DLOG_ERR, DLOG_INFO, - "Rank %u start checker, gen "DF_X64", flags %x, phase %d, iv "DF_UUIDF":"DF_RC"\n", - leader, gen, flags, phase, DP_UUID(iv_uuid), DP_RC(rc)); + DL_CDEBUG(rc < 0, DLOG_ERR, DLOG_INFO, rc, + "Rank %u start checker, gen " DF_X64 ", flags %x, ns_ver %d, iv " DF_UUIDF, + leader, gen, flags, ns_ver, DP_UUID(iv_uuid)); return rc; } @@ -982,7 +983,7 @@ int chk_report_remote(d_rank_t leader, uint64_t gen, uint32_t cla, uint32_t act, int chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, - uint32_t *pool_nr, uuid_t **pools) + uint32_t *ns_ver, uint32_t 
*pool_nr, uuid_t **pools, d_rank_list_t **ranks) { crt_rpc_t *req = NULL; struct chk_rejoin_in *cri; @@ -1005,8 +1006,22 @@ chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uuid_t iv_uuid, cro = crt_reply_get(req); rc = cro->cro_status; - if (rc == 0 && cro->cro_pools.ca_count > 0) { - *flags = cro->cro_flags; + if (rc != 0) + goto out; + + *flags = cro->cro_flags; + *ns_ver = cro->cro_ns_ver; + + if (cro->cro_ranks.ca_count > 0) { + *ranks = d_rank_list_alloc(cro->cro_ranks.ca_count); + if (*ranks == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + memcpy((*ranks)->rl_ranks, cro->cro_ranks.ca_arrays, + sizeof(d_rank_t) * cro->cro_ranks.ca_count); + } + + if (cro->cro_pools.ca_count > 0) { D_ALLOC(tmp, cro->cro_pools.ca_count); if (tmp == NULL) D_GOTO(out, rc = -DER_NOMEM); diff --git a/src/chk/chk_srv.c b/src/chk/chk_srv.c index 86ed5a7d2e5..442579d8411 100644 --- a/src/chk/chk_srv.c +++ b/src/chk/chk_srv.c @@ -27,7 +27,7 @@ ds_chk_start_hdlr(crt_rpc_t *rpc) rc = chk_engine_start(csi->csi_gen, csi->csi_ranks.ca_count, csi->csi_ranks.ca_arrays, csi->csi_policies.ca_count, csi->csi_policies.ca_arrays, csi->csi_uuids.ca_count, csi->csi_uuids.ca_arrays, csi->csi_api_flags, - csi->csi_phase, csi->csi_leader_rank, csi->csi_flags, + csi->csi_ns_ver, csi->csi_leader_rank, csi->csi_flags, csi->csi_iv_uuid, &clues); if (rc > 0) { D_ALLOC_PTR(rank); @@ -249,18 +249,21 @@ ds_chk_report_hdlr(crt_rpc_t *rpc) static void ds_chk_rejoin_hdlr(crt_rpc_t *rpc) { - struct chk_rejoin_in *cri = crt_req_get(rpc); - struct chk_rejoin_out *cro = crt_reply_get(rpc); - uuid_t *pools = NULL; - int pool_nr = 0; - int rc; + struct chk_rejoin_in *cri = crt_req_get(rpc); + struct chk_rejoin_out *cro = crt_reply_get(rpc); + uuid_t *pools = NULL; + d_rank_list_t *ranks = NULL; + int pool_nr = 0; + int rc; rc = chk_leader_rejoin(cri->cri_gen, cri->cri_rank, cri->cri_iv_uuid, &cro->cro_flags, - &pool_nr, &pools); + &cro->cro_ns_ver, &pool_nr, &pools, &ranks); cro->cro_status = rc; if (rc == 
0) { - cro->cro_pools.ca_count = pool_nr; + cro->cro_ranks.ca_count = ranks->rl_nr; + cro->cro_ranks.ca_arrays = ranks->rl_ranks; + cro->cro_pools.ca_count = pool_nr; cro->cro_pools.ca_arrays = pools; } diff --git a/src/engine/server_iv.c b/src/engine/server_iv.c index fbeb40b712c..4d8271155a6 100644 --- a/src/engine/server_iv.c +++ b/src/engine/server_iv.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2017-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1126,7 +1126,7 @@ _iv_op(struct ds_iv_ns *ns, struct ds_iv_key *key, d_sg_list_t *value, rc = iv_op_internal(ns, key, value, sync, shortcut, opc); if (retry && !ns->iv_stop && (daos_rpc_retryable_rc(rc) || rc == -DER_NOTLEADER || rc == -DER_BUSY)) { - if (rc == -DER_GRPVER && engine_in_check()) { + if ((rc == -DER_GRPVER || rc == -DER_OOG) && engine_in_check()) { /* * Under check mode, the pool shard on peer rank/target does * not exist, then it will reply "-DER_GRPVER" that is normal diff --git a/src/gurt/misc.c b/src/gurt/misc.c index 094b3cc1a56..afca34ab918 100644 --- a/src/gurt/misc.c +++ b/src/gurt/misc.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -550,9 +550,24 @@ d_rank_list_shuffle(d_rank_list_t *rank_list) } /** - * Must be previously sorted or not modified at all in order to guarantee - * consistent indexes. - **/ + * Binary search \a rank in the sorted \a rank_list. 
+ */ + +bool +d_rank_list_bsearch(d_rank_list_t *rank_list, d_rank_t rank, int *idx) +{ + d_rank_t *pos = NULL; + + if (rank_list != NULL) { + pos = bsearch(&rank, rank_list->rl_ranks, rank_list->rl_nr, sizeof(rank), + rank_compare); + if (pos != NULL && idx != NULL) + *idx = ((void *)pos - (void *)rank_list->rl_ranks) / sizeof(rank); + } + + return pos != NULL; +} + bool d_rank_list_find(d_rank_list_t *rank_list, d_rank_t rank, int *idx) { diff --git a/src/include/daos_srv/daos_chk.h b/src/include/daos_srv/daos_chk.h index 59e34305d28..608e53d97b7 100644 --- a/src/include/daos_srv/daos_chk.h +++ b/src/include/daos_srv/daos_chk.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -73,9 +74,9 @@ typedef int (*chk_query_pool_cb_t)(struct chk_query_pool_shard *shard, uint32_t typedef int (*chk_prop_cb_t)(void *buf, uint32_t policies[], int cnt, uint32_t flags); -int chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, - struct chk_policy *policies, int pool_nr, uuid_t pools[], - uint32_t api_flags, int phase); +int +chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, + int pool_nr, uuid_t pools[], uint32_t api_flags); int chk_leader_stop(int pool_nr, uuid_t pools[]); diff --git a/src/include/gurt/common.h b/src/include/gurt/common.h index 241dcd9b74a..5742ee3f5bf 100644 --- a/src/include/gurt/common.h +++ b/src/include/gurt/common.h @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -460,6 +460,7 @@ d_power2_nbits(unsigned int val) return val == LOWEST_BIT_SET(val) ? 
shift - 1 : shift; } +/* clang-format off */ int d_rank_list_dup(d_rank_list_t **dst, const d_rank_list_t *src); int d_rank_list_dup_sort_uniq(d_rank_list_t **dst, const d_rank_list_t *src); void d_rank_list_filter(d_rank_list_t *src_set, d_rank_list_t *dst_set, @@ -471,6 +472,7 @@ void d_rank_list_free(d_rank_list_t *rank_list); int d_rank_list_copy(d_rank_list_t *dst, d_rank_list_t *src); void d_rank_list_shuffle(d_rank_list_t *rank_list); void d_rank_list_sort(d_rank_list_t *rank_list); +bool d_rank_list_bsearch(d_rank_list_t *rank_list, d_rank_t rank, int *idx); bool d_rank_list_find(d_rank_list_t *rank_list, d_rank_t rank, int *idx); void d_rank_list_del_at(d_rank_list_t *list, int idx); int d_rank_list_del(d_rank_list_t *rank_list, d_rank_t rank); @@ -482,15 +484,13 @@ int d_rank_list_append(d_rank_list_t *rank_list, d_rank_t rank); int d_rank_list_dump(d_rank_list_t *rank_list, d_string_t name, int name_len); d_rank_list_t *uint32_array_to_rank_list(uint32_t *ints, size_t len); int rank_list_to_uint32_array(d_rank_list_t *rl, uint32_t **ints, size_t *len); -int - d_rank_list_to_str(d_rank_list_t *rank_list, char **rank_str); - +int d_rank_list_to_str(d_rank_list_t *rank_list, char **rank_str); d_rank_range_list_t *d_rank_range_list_alloc(uint32_t size); d_rank_range_list_t *d_rank_range_list_realloc(d_rank_range_list_t *range_list, uint32_t size); d_rank_range_list_t *d_rank_range_list_create_from_ranks(d_rank_list_t *rank_list); -int - d_rank_range_list_str(d_rank_range_list_t *list, char **ranks_str); +int d_rank_range_list_str(d_rank_range_list_t *list, char **ranks_str); void d_rank_range_list_free(d_rank_range_list_t *range_list); +/* clang-format on */ #ifdef FAULT_INJECTION diff --git a/src/mgmt/srv_chk.c b/src/mgmt/srv_chk.c index a9c1ce1c3b0..7cd32be1273 100644 --- a/src/mgmt/srv_chk.c +++ b/src/mgmt/srv_chk.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2022 Intel Corporation. 
+ * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -47,7 +48,7 @@ ds_mgmt_chk_parse_uuid(int pool_nr, char **pools, uuid_t **p_uuids) int ds_mgmt_check_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, Mgmt__CheckInconsistPolicy **policies, int32_t pool_nr, char **pools, - uint32_t flags, int32_t phase) + uint32_t flags) { uuid_t *uuids = NULL; struct chk_policy *ply = NULL; @@ -69,7 +70,7 @@ ds_mgmt_check_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, } } - rc = chk_leader_start(rank_nr, ranks, policy_nr, ply, pool_nr, uuids, flags, phase); + rc = chk_leader_start(rank_nr, ranks, policy_nr, ply, pool_nr, uuids, flags); out: D_FREE(uuids); diff --git a/src/mgmt/srv_drpc.c b/src/mgmt/srv_drpc.c index 6d44faa796c..7874fb81b09 100644 --- a/src/mgmt/srv_drpc.c +++ b/src/mgmt/srv_drpc.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2563,7 +2563,7 @@ ds_mgmt_drpc_check_start(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) D_INFO("Received request to start check\n"); rc = ds_mgmt_check_start(req->n_ranks, req->ranks, req->n_policies, req->policies, - req->n_uuids, req->uuids, req->flags, -1 /* phase */); + req->n_uuids, req->uuids, req->flags); if (rc < 0) D_ERROR("Failed to start check: "DF_RC"\n", DP_RC(rc)); diff --git a/src/mgmt/srv_internal.h b/src/mgmt/srv_internal.h index 95f2c809494..a0c4d7d332c 100644 --- a/src/mgmt/srv_internal.h +++ b/src/mgmt/srv_internal.h @@ -129,9 +129,10 @@ int const char *user, const char *group); /** srv_chk.c */ -int ds_mgmt_check_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, - Mgmt__CheckInconsistPolicy **policies, int pool_nr, char **pools, - uint32_t flags, int phase); +int + ds_mgmt_check_start(uint32_t rank_nr, 
d_rank_t *ranks, uint32_t policy_nr, + Mgmt__CheckInconsistPolicy **policies, int pool_nr, char **pools, + uint32_t flags); int ds_mgmt_check_stop(int pool_nr, char **pools); int ds_mgmt_check_query(int pool_nr, char **pools, chk_query_head_cb_t head_cb, chk_query_pool_cb_t pool_cb, void *buf); diff --git a/src/mgmt/tests/mocks.c b/src/mgmt/tests/mocks.c index 9a11890513e..7ed2b0277f5 100644 --- a/src/mgmt/tests/mocks.c +++ b/src/mgmt/tests/mocks.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -627,7 +627,7 @@ mock_ds_mgmt_dev_set_faulty_setup(void) int ds_mgmt_check_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, Mgmt__CheckInconsistPolicy **policies, int pool_nr, char **pools, - uint32_t flags, int phase) + uint32_t flags) { return 0; } diff --git a/src/object/cli_coll.c b/src/object/cli_coll.c index d517e3269d6..ef10ed439a5 100644 --- a/src/object/cli_coll.c +++ b/src/object/cli_coll.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2024 Intel Corporation. 
+ * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -724,8 +725,8 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo if (rc == 0) { if (!shard->do_rebuilding && !shard->do_reintegrating) { tmp_tgt.dct_rank = shard->do_target_rank; - dct = bsearch(&tmp_tgt, coa->coa_dcts, coa->coa_dct_nr, - sizeof(tmp_tgt), &dc_coll_sort_cmp); + dct = bsearch(&tmp_tgt, coa->coa_dcts, coa->coa_dct_nr, + sizeof(tmp_tgt), dc_coll_sort_cmp); D_ASSERT(dct != NULL); goto gen_mbs; diff --git a/src/tests/ftest/recovery/cat_recov_core.yaml b/src/tests/ftest/recovery/cat_recov_core.yaml index 8e4553e30ec..7792dd408e2 100644 --- a/src/tests/ftest/recovery/cat_recov_core.yaml +++ b/src/tests/ftest/recovery/cat_recov_core.yaml @@ -6,6 +6,7 @@ server_config: engines_per_host: 2 engines: 0: + targets: 4 pinned_numa_node: 0 nr_xs_helpers: 0 log_file: daos_server0.log @@ -19,6 +20,7 @@ server_config: - D_LOG_STDERR_IN_LOG=1 storage: auto 1: + targets: 4 pinned_numa_node: 1 nr_xs_helpers: 0 log_file: daos_server1.log @@ -33,7 +35,9 @@ server_config: storage: auto transport_config: allow_insecure: true - system_ram_reserved: 64 +pool: + scm_size: 6G + nvme_size: 80G agent_config: transport_config: allow_insecure: true diff --git a/src/tests/ftest/recovery/container_cleanup.yaml b/src/tests/ftest/recovery/container_cleanup.yaml index 346378a1543..301ab604e84 100644 --- a/src/tests/ftest/recovery/container_cleanup.yaml +++ b/src/tests/ftest/recovery/container_cleanup.yaml @@ -14,7 +14,7 @@ server_config: system_ram_reserved: 1 pool: - size: 5G + size: 15G container: type: POSIX diff --git a/src/tests/ftest/recovery/container_list_consolidation.yaml b/src/tests/ftest/recovery/container_list_consolidation.yaml index 346378a1543..301ab604e84 100644 --- a/src/tests/ftest/recovery/container_list_consolidation.yaml +++ b/src/tests/ftest/recovery/container_list_consolidation.yaml @@ -14,7 +14,7 @@ 
server_config: system_ram_reserved: 1 pool: - size: 5G + size: 15G container: type: POSIX diff --git a/src/tests/ftest/recovery/ms_membership.yaml b/src/tests/ftest/recovery/ms_membership.yaml index 3e82da75b68..8377608ac8a 100644 --- a/src/tests/ftest/recovery/ms_membership.yaml +++ b/src/tests/ftest/recovery/ms_membership.yaml @@ -7,13 +7,18 @@ server_config: engines_per_host: 2 engines: 0: + targets: 4 pinned_numa_node: 0 nr_xs_helpers: 1 log_file: daos_server0.log storage: auto 1: + targets: 4 pinned_numa_node: 1 nr_xs_helpers: 1 log_file: daos_server1.log storage: auto system_ram_reserved: 1 + +pool: + size: 80G diff --git a/src/tests/ftest/recovery/pool_cleanup.yaml b/src/tests/ftest/recovery/pool_cleanup.yaml index 1b57a320f42..a776c8b7a5c 100644 --- a/src/tests/ftest/recovery/pool_cleanup.yaml +++ b/src/tests/ftest/recovery/pool_cleanup.yaml @@ -14,4 +14,4 @@ server_config: system_ram_reserved: 1 pool: - size: 5G + size: 15G diff --git a/src/tests/ftest/recovery/pool_list_consolidation.yaml b/src/tests/ftest/recovery/pool_list_consolidation.yaml index 37a548e1702..7336e97157d 100644 --- a/src/tests/ftest/recovery/pool_list_consolidation.yaml +++ b/src/tests/ftest/recovery/pool_list_consolidation.yaml @@ -17,7 +17,7 @@ setup: start_servers_once: False pool: - size: 60G + size: 100G container: control_method: daos diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c index 12b4a9958dc..c9304a7d1ff 100644 --- a/src/tests/suite/daos_cr.c +++ b/src/tests/suite/daos_cr.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2023-2024 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1310,7 +1311,7 @@ cr_engine_interaction(void **state) rc = cr_system_start(); assert_rc_equal(rc, 0); - /* Former connection for the pool has been evicted by checkre. Let's re-connect the pool. */ + /* Former connection for the pool has been evicted by checker. Let's re-connect the pool. 
*/ rc = cr_cont_get_label(state, &pool, &cont, true, &label); assert_rc_equal(rc, 0); @@ -1676,7 +1677,7 @@ cr_stop_engine_interaction(void **state) rc = cr_system_start(); assert_rc_equal(rc, 0); - /* Former connection for the pool has been evicted by checkre. Let's re-connect the pool. */ + /* Former connection for the pool has been evicted by checker. Let's re-connect the pool. */ rc = cr_cont_get_label(state, &pool, &cont, true, &label); assert_rc_equal(rc, 0); @@ -3785,6 +3786,62 @@ cr_maintenance_mode(void **state) cr_cleanup(arg, &pool, 1); } +/* + * 1. Exclude rank 0. + * 2. Create pool without inconsistency. + * 3. Start checker without options. + * 4. Query checker, it should be completed instead of being blocked. + * 5. Switch to normal mode and cleanup. + */ +static void +cr_lost_rank0(void **state) +{ + test_arg_t *arg = *state; + struct test_pool pool = {0}; + struct daos_check_info dci = {0}; + int rc; + + print_message("CR29: CR with rank 0 excluded at the beginning\n"); + + print_message("CR: excluding the rank 0 ...\n"); + rc = dmg_system_exclude_rank(dmg_config_file, 0); + assert_rc_equal(rc, 0); + + rc = cr_pool_create(state, &pool, false, TCC_NONE); + assert_rc_equal(rc, 0); + + rc = cr_system_stop(false); + assert_rc_equal(rc, 0); + + rc = cr_mode_switch(true); + assert_rc_equal(rc, 0); + + rc = cr_check_start(TCSF_RESET, 0, NULL, NULL); + assert_rc_equal(rc, 0); + + cr_ins_wait(1, &pool.pool_uuid, &dci); + + rc = cr_ins_verify(&dci, TCIS_COMPLETED); + assert_rc_equal(rc, 0); + + rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_CHECKED, 0, NULL, NULL, NULL); + assert_rc_equal(rc, 0); + + /* Reint the rank for subsequent test. 
*/ + rc = cr_rank_reint(0, true); + assert_rc_equal(rc, 0); + + rc = cr_mode_switch(false); + assert_rc_equal(rc, 0); + + rc = cr_system_start(); + assert_rc_equal(rc, 0); + + cr_dci_fini(&dci); + cr_cleanup(arg, &pool, 1); +} + +/* clang-format off */ static const struct CMUnitTest cr_tests[] = { { "CR1: start checker for specified pools", cr_start_specified, async_disable, test_case_teardown}, @@ -3842,7 +3899,10 @@ static const struct CMUnitTest cr_tests[] = { cr_handle_fail_pool2, async_disable, test_case_teardown}, { "CR28: maintenance mode after dry-run check", cr_maintenance_mode, async_disable, test_case_teardown}, + { "CR29: CR with rank 0 excluded at the beginning", + cr_lost_rank0, async_disable, test_case_teardown}, }; +/* clang-format on */ static int cr_setup(void **state)