diff --git a/src/chk/chk_common.c b/src/chk/chk_common.c index a39cb94632a..2a7164ba757 100644 --- a/src/chk/chk_common.c +++ b/src/chk/chk_common.c @@ -1019,8 +1019,7 @@ chk_pending_destroy(struct chk_pending_rec *cpr) } int -chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase, - uint32_t policy_nr, struct chk_policy *policies, +chk_prop_prepare(d_rank_t leader, uint32_t flags, uint32_t policy_nr, struct chk_policy *policies, d_rank_list_t *ranks, struct chk_property *prop) { int rc = 0; @@ -1033,11 +1032,8 @@ chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase, prop->cp_flags &= ~CHK__CHECK_FLAG__CF_FAILOUT; if (flags & CHK__CHECK_FLAG__CF_NO_AUTO) prop->cp_flags &= ~CHK__CHECK_FLAG__CF_AUTO; - prop->cp_flags |= flags & ~(CHK__CHECK_FLAG__CF_RESET | - CHK__CHECK_FLAG__CF_ORPHAN_POOL | - CHK__CHECK_FLAG__CF_NO_FAILOUT | - CHK__CHECK_FLAG__CF_NO_AUTO); - prop->cp_phase = phase; + prop->cp_flags |= flags & ~(CHK__CHECK_FLAG__CF_RESET | CHK__CHECK_FLAG__CF_ORPHAN_POOL | + CHK__CHECK_FLAG__CF_NO_FAILOUT | CHK__CHECK_FLAG__CF_NO_AUTO); if (ranks != NULL) prop->cp_rank_nr = ranks->rl_nr; @@ -1195,12 +1191,7 @@ chk_ins_cleanup(struct chk_instance *ins) chk_stop_sched(ins); ins->ci_inited = 0; - chk_iv_ns_cleanup(&ins->ci_iv_ns); - - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } + chk_iv_ns_destroy(ins); } int @@ -1215,7 +1206,8 @@ chk_ins_init(struct chk_instance **p_ins) if (ins == NULL) D_GOTO(out_init, rc = -DER_NOMEM); - ins->ci_sched = ABT_THREAD_NULL; + ins->ci_sched = ABT_THREAD_NULL; + ins->ci_dead_rank_ult = ABT_THREAD_NULL; ins->ci_rank_hdl = DAOS_HDL_INVAL; D_INIT_LIST_HEAD(&ins->ci_rank_list); @@ -1281,6 +1273,8 @@ chk_ins_fini(struct chk_instance **p_ins) D_ASSERT(daos_handle_is_inval(ins->ci_pending_hdl)); D_ASSERT(d_list_empty(&ins->ci_pool_shutdown_list)); + D_ASSERT(ins->ci_dead_rank_ult == ABT_THREAD_NULL); + if (ins->ci_sched != ABT_THREAD_NULL) 
ABT_thread_free(&ins->ci_sched); diff --git a/src/chk/chk_engine.c b/src/chk/chk_engine.c index a5f7f1a4cfd..cd0de1138fa 100644 --- a/src/chk/chk_engine.c +++ b/src/chk/chk_engine.c @@ -219,8 +219,7 @@ chk_engine_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_statu iv.ci_to_leader = 1; /* Notify the leader that check instance exit on the engine. */ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_TO_ROOT, - CRT_IV_SYNC_NONE, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_TO_ROOT, CRT_IV_SYNC_NONE); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" on rank %u notify leader for its exit, status %u: rc = %d\n", DP_ENGINE(ins), dss_self_rank(), ins_status, rc); @@ -1670,8 +1669,7 @@ chk_engine_pool_notify(struct chk_pool_rec *cpr) * to all engines. Otherwise, the engine out of the pool map cannot get * the notification. */ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER, - true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" on rank %u notify pool shards for "DF_UUIDF", phase %u, " "ins_status %u, pool_status %u: rc = %d\n", @@ -1683,8 +1681,7 @@ chk_engine_pool_notify(struct chk_pool_rec *cpr) iv.ci_from_psl = 0; iv.ci_to_leader = 1; /* Synchronously notify the check leader with the new check status/phase. 
*/ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_TO_ROOT, - CRT_IV_SYNC_NONE, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_TO_ROOT, CRT_IV_SYNC_NONE); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" on rank %u notify check leader for "DF_UUIDF", phase %u, " "ins_status %u, pool_status %u: rc = %d\n", @@ -2007,9 +2004,8 @@ chk_engine_sched(void *args) static int chk_engine_start_prep(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *ranks, - uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], uint64_t gen, int phase, uint32_t api_flags, - d_rank_t leader, uint32_t flags) + uint32_t policy_nr, struct chk_policy *policies, int pool_nr, uuid_t pools[], + uint64_t gen, uint32_t api_flags, d_rank_t leader, uint32_t flags) { struct chk_traverse_pools_args ctpa = { 0 }; struct chk_bookmark *cbk = &ins->ci_bk; @@ -2091,8 +2087,7 @@ chk_engine_start_prep(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *rank init: if (!chk_is_on_leader(gen, leader, true)) { - rc = chk_prop_prepare(leader, api_flags, phase, policy_nr, policies, rank_list, - prop); + rc = chk_prop_prepare(leader, api_flags, policy_nr, policies, rank_list, prop); if (rc != 0) goto out; @@ -2220,16 +2215,15 @@ chk_engine_pool_filter(uuid_t uuid, void *arg, int *phase) int chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, int pool_nr, uuid_t pools[], uint32_t api_flags, - int phase, d_rank_t leader, uint32_t flags, uuid_t iv_uuid, + uint32_t ns_ver, d_rank_t leader, uint32_t flags, uuid_t iv_uuid, struct ds_pool_clues *clues) { - struct chk_instance *ins = chk_engine; - struct chk_bookmark *cbk = &ins->ci_bk; - struct umem_attr uma = { 0 }; - char uuid_str[DAOS_UUID_STR_SIZE]; - d_rank_t myrank = dss_self_rank(); - int rc; - int rc1; + struct chk_instance *ins = chk_engine; + struct chk_bookmark *cbk = &ins->ci_bk; + struct umem_attr uma = {0}; + d_rank_t myrank = dss_self_rank(); + int rc; + 
int rc1; rc = chk_ins_can_start(ins); if (rc != 0) @@ -2251,12 +2245,7 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t polic if (ins->ci_sched != ABT_THREAD_NULL) ABT_thread_free(&ins->ci_sched); - chk_iv_ns_cleanup(&ins->ci_iv_ns); - - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } + chk_iv_ns_destroy(ins); uma.uma_id = UMEM_CLASS_VMEM; @@ -2270,27 +2259,20 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t polic if (rc != 0) goto out_tree; - rc = chk_engine_start_prep(ins, rank_nr, ranks, policy_nr, policies, - pool_nr, pools, gen, phase, api_flags, leader, flags); + rc = chk_engine_start_prep(ins, rank_nr, ranks, policy_nr, policies, pool_nr, pools, gen, + api_flags, leader, flags); if (rc != 0) goto out_tree; if (chk_is_on_leader(gen, leader, true)) { ins->ci_iv_ns = chk_leader_get_iv_ns(); - if (unlikely(ins->ci_iv_ns == NULL)) - goto out_tree; + D_ASSERT(ins->ci_iv_ns != NULL); + + ins->ci_ns_ver = ns_ver; } else { - uuid_unparse_lower(iv_uuid, uuid_str); - rc = crt_group_secondary_create(uuid_str, NULL, ins->ci_ranks, &ins->ci_iv_group); + rc = chk_iv_ns_create(ins, iv_uuid, leader, ns_ver); if (rc != 0) goto out_tree; - - rc = ds_iv_ns_create(dss_get_module_info()->dmi_ctx, iv_uuid, ins->ci_iv_group, - &ins->ci_iv_id, &ins->ci_iv_ns); - if (rc != 0) - goto out_group; - - ds_iv_ns_update(ins->ci_iv_ns, leader, ins->ci_iv_ns->iv_master_term + 1); } uuid_copy(cbk->cb_iv_uuid, iv_uuid); @@ -2324,12 +2306,7 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t polic D_WARN(DF_ENGINE" failed to update engine bookmark: "DF_RC"\n", DP_ENGINE(ins), DP_RC(rc1)); } - chk_iv_ns_cleanup(&ins->ci_iv_ns); -out_group: - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } + chk_iv_ns_destroy(ins); out_tree: chk_destroy_pending_tree(ins); chk_destroy_pool_tree(ins); @@ 
-2337,17 +2314,18 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t polic ins->ci_starting = 0; out_log: if (rc >= 0) { - D_INFO(DF_ENGINE " %s on rank %u with api_flags %x, phase %d, leader %u, " - "flags %x, iv "DF_UUIDF": rc %d\n", + D_INFO(DF_ENGINE " %s on rank %u with api_flags %x, ns_ver %d, leader %u, " + "flags %x, iv " DF_UUIDF ": rc %d\n", DP_ENGINE(ins), chk_is_ins_reset(ins, api_flags) ? "start" : "resume", - myrank, api_flags, phase, leader, flags, DP_UUID(iv_uuid), rc); + myrank, api_flags, ns_ver, leader, flags, DP_UUID(iv_uuid), rc); chk_ranks_dump(ins->ci_ranks->rl_nr, ins->ci_ranks->rl_ranks); chk_pools_dump(&ins->ci_pool_list, pool_nr, pools); } else { - D_ERROR(DF_ENGINE" failed to start on rank %u with %d pools, api_flags %x, " - "phase %d, leader %u, flags %x, gen "DF_X64", iv "DF_UUIDF": "DF_RC"\n", - DP_ENGINE(ins), myrank, pool_nr, api_flags, phase, leader, flags, gen, + D_ERROR(DF_ENGINE " failed to start on rank %u with %d pools, api_flags %x, " + "ns_ver %d, leader %u, flags %x, gen " DF_X64 ", iv " DF_UUIDF + ": " DF_RC "\n", + DP_ENGINE(ins), myrank, pool_nr, api_flags, ns_ver, leader, flags, gen, DP_UUID(iv_uuid), DP_RC(rc)); } @@ -2373,7 +2351,7 @@ chk_engine_stop(uint64_t gen, int pool_nr, uuid_t pools[], uint32_t *flags) if (cbk->cb_magic != CHK_BK_MAGIC_ENGINE) D_GOTO(log, rc = -DER_NOTAPPLICABLE); - if (ins->ci_starting) + if (ins->ci_starting || ins->ci_rejoining) D_GOTO(log, rc = -DER_BUSY); if (ins->ci_stopping || ins->ci_sched_exiting) @@ -2604,34 +2582,46 @@ chk_engine_query(uint64_t gen, int pool_nr, uuid_t pools[], uint32_t *ins_status int chk_engine_mark_rank_dead(uint64_t gen, d_rank_t rank, uint32_t version) { - struct chk_instance *ins = chk_engine; - struct chk_property *prop = &ins->ci_prop; - struct chk_bookmark *cbk = &ins->ci_bk; - d_rank_list_t *rank_list = NULL; - int rc = 0; + struct chk_instance *ins = chk_engine; + struct chk_property *prop = &ins->ci_prop; + struct chk_bookmark 
*cbk = &ins->ci_bk; + int rc = 0; CHK_IS_READY(ins); if (cbk->cb_gen != gen) D_GOTO(out, rc = -DER_NOTAPPLICABLE); - rc = chk_prop_fetch(prop, &rank_list); - if (rc != 0) - goto out; + /* For check engine on the leader, reload rank list that has been refreshed by leader. */ + if (chk_is_on_leader(cbk->cb_gen, prop->cp_leader, true)) { + d_rank_list_free(ins->ci_ranks); + ins->ci_ranks = NULL; + } + + if (ins->ci_ranks == NULL) { + rc = chk_prop_fetch(prop, &ins->ci_ranks); + if (rc != 0) + goto out; - D_ASSERT(rank_list != NULL); + /* For check engine on the leader, it's done. */ + if (chk_is_on_leader(cbk->cb_gen, prop->cp_leader, true)) { + ins->ci_ns_ver = version; + goto out; + } + } - /* For check engine on the leader, related rank has already been marked as "dead". */ - if (chk_is_on_leader(cbk->cb_gen, prop->cp_leader, true)) - goto group; + if (unlikely(ins->ci_ranks == NULL)) + D_GOTO(out, rc = -DER_NOTAPPLICABLE); - if (!chk_remove_rank_from_list(rank_list, rank)) + if (!chk_remove_rank_from_list(ins->ci_ranks, rank)) D_GOTO(out, rc = -DER_NOTAPPLICABLE); prop->cp_rank_nr--; - rc = chk_prop_update(prop, rank_list); + rc = chk_prop_update(prop, ins->ci_ranks); if (rc != 0) - goto out; + ins->ci_skip_oog = 1; + else + rc = chk_iv_ns_update(ins, version); /* * NOTE: If the rank dead before DAOS check start, then subsequent check start will @@ -2652,19 +2642,7 @@ chk_engine_mark_rank_dead(uint64_t gen, d_rank_t rank, uint32_t version) * sometime later as the DAOS check going. 
*/ -group: - if (ins->ci_iv_group != NULL) - rc = crt_group_secondary_modify(ins->ci_iv_group, rank_list, rank_list, - CRT_GROUP_MOD_OP_REPLACE, version); - out: - if (rc == 0) { - d_rank_list_free(ins->ci_ranks); - ins->ci_ranks = rank_list; - rank_list = NULL; - } - - d_rank_list_free(rank_list); if (rc != -DER_NOTAPPLICABLE) D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" on rank %u mark rank %u as dead with gen " @@ -3351,19 +3329,19 @@ chk_engine_notify(struct chk_iv *iv) void chk_engine_rejoin(void *args) { - struct chk_instance *ins = chk_engine; - struct chk_property *prop = &ins->ci_prop; - struct chk_bookmark *cbk = &ins->ci_bk; - uuid_t *pools = NULL; - struct chk_iv iv = { 0 }; - struct umem_attr uma = { 0 }; - char uuid_str[DAOS_UUID_STR_SIZE]; - d_rank_t myrank = dss_self_rank(); - uint32_t pool_nr = 0; - uint32_t flags = 0; - int rc = 0; - int rc1; - bool need_join = false; + struct chk_instance *ins = chk_engine; + struct chk_property *prop = &ins->ci_prop; + struct chk_bookmark *cbk = &ins->ci_bk; + d_rank_list_t *ranks = NULL; + uuid_t *pools = NULL; + struct chk_iv iv = {0}; + struct umem_attr uma = {0}; + d_rank_t myrank = dss_self_rank(); + uint32_t pool_nr = 0; + uint32_t flags = 0; + int rc = 0; + int rc1; + bool need_join = false; if (cbk->cb_magic != CHK_BK_MAGIC_ENGINE) goto out_log; @@ -3372,7 +3350,7 @@ chk_engine_rejoin(void *args) cbk->cb_ins_status != CHK__CHECK_INST_STATUS__CIS_PAUSED) goto out_log; - /* We do NOT support leader (and its associated engine ) to rejoin former check instance. */ + /* We do NOT support leader (and its associated engine) to rejoin former check instance. 
*/ if (chk_is_on_leader(cbk->cb_gen, prop->cp_leader, true)) goto out_log; @@ -3407,22 +3385,10 @@ chk_engine_rejoin(void *args) if (rc != 0) goto out_tree; - uuid_unparse_lower(cbk->cb_iv_uuid, uuid_str); - rc = crt_group_secondary_create(uuid_str, NULL, ins->ci_ranks, &ins->ci_iv_group); - if (rc != 0) - goto out_tree; - - rc = ds_iv_ns_create(dss_get_module_info()->dmi_ctx, cbk->cb_iv_uuid, ins->ci_iv_group, - &ins->ci_iv_id, &ins->ci_iv_ns); - if (rc != 0) - goto out_group; - - ds_iv_ns_update(ins->ci_iv_ns, prop->cp_leader, ins->ci_iv_ns->iv_master_term + 1); - again: /* Ask leader whether this engine can rejoin or not. */ rc = chk_rejoin_remote(prop->cp_leader, cbk->cb_gen, myrank, cbk->cb_iv_uuid, &flags, - &pool_nr, &pools); + &ins->ci_ns_ver, &pool_nr, &pools, &ranks); if (rc != 0) { if ((rc == -DER_OOG || rc == -DER_GRPVER) && !ins->ci_pause) { D_INFO(DF_ENGINE" Someone is not ready %d, let's rejoin after 1 sec\n", @@ -3432,14 +3398,22 @@ chk_engine_rejoin(void *args) goto again; } - goto out_iv; + goto out_tree; } - if (pool_nr == 0) { + if (ranks == NULL || pool_nr == 0) { need_join = false; - D_GOTO(out_iv, rc = 1); + D_GOTO(out_tree, rc = 1); } + d_rank_list_free(ins->ci_ranks); + ins->ci_ranks = ranks; + ranks = NULL; + + rc = chk_iv_ns_create(ins, cbk->cb_iv_uuid, prop->cp_leader, ins->ci_ns_ver); + if (rc != 0) + goto out_tree; + rc = chk_pools_load_list(ins, cbk->cb_gen, 0, pool_nr, pools, NULL); if (rc != 0) goto out_notify; @@ -3479,21 +3453,17 @@ chk_engine_rejoin(void *args) iv.ci_to_leader = 1; /* Notify the leader that check instance exit on the engine. 
*/ - rc1 = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_TO_ROOT, CRT_IV_SYNC_NONE, true); + rc1 = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_TO_ROOT, CRT_IV_SYNC_NONE); D_CDEBUG(rc1 != 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" on rank %u notify leader for its exit, status %u: rc1 = %d\n", DP_ENGINE(ins), myrank, cbk->cb_ins_status, rc1); -out_iv: - chk_iv_ns_cleanup(&ins->ci_iv_ns); -out_group: - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } + chk_iv_ns_destroy(ins); out_tree: chk_destroy_pending_tree(ins); chk_destroy_pool_tree(ins); out_log: + d_rank_list_free(ranks); + D_FREE(pools); if (need_join) D_CDEBUG(rc < 0, DLOG_ERR, DLOG_INFO, DF_ENGINE" rejoin on rank %u with iv "DF_UUIDF": "DF_RC"\n", @@ -3517,6 +3487,8 @@ chk_engine_setup(void) * related local inconsistency firstly. */ + chk_report_seq_init(ins); + rc = chk_bk_fetch_engine(cbk); if (rc == -DER_NONEXIST) goto prop; @@ -3560,15 +3532,14 @@ chk_engine_setup(void) rc = chk_prop_fetch(&ins->ci_prop, &ins->ci_ranks); if (rc == -DER_NONEXIST) rc = 0; -fini: - if (rc != 0) { - chk_ins_fini(&ins); - } else { - chk_report_seq_init(ins); + if (rc == 0) { ins->ci_inited = 1; ins->ci_pause = 0; } +fini: + if (rc != 0) + chk_engine_cleanup(); return rc; } diff --git a/src/chk/chk_internal.h b/src/chk/chk_internal.h index 039815576db..7074276c991 100644 --- a/src/chk/chk_internal.h +++ b/src/chk/chk_internal.h @@ -72,6 +72,7 @@ struct chk_pool_mbs { uint32_t *cpm_tgt_status; }; +/* clang-format off */ /* * CHK_START: * From check leader to check engine to start the check instance on specified pool(s) or all pools. 
@@ -79,7 +80,7 @@ struct chk_pool_mbs { #define DAOS_ISEQ_CHK_START \ ((uint64_t) (csi_gen) CRT_VAR) \ ((uint32_t) (csi_flags) CRT_VAR) \ - ((int32_t) (csi_phase) CRT_VAR) \ + ((int32_t) (csi_ns_ver) CRT_VAR) \ ((d_rank_t) (csi_leader_rank) CRT_VAR) \ ((uint32_t) (csi_api_flags) CRT_VAR) \ ((uuid_t) (csi_iv_uuid) CRT_VAR) \ @@ -254,6 +255,7 @@ CRT_RPC_DECLARE(chk_pool_mbs, DAOS_ISEQ_CHK_POOL_MBS, DAOS_OSEQ_CHK_POOL_MBS); CRT_RPC_DECLARE(chk_report, DAOS_ISEQ_CHK_REPORT, DAOS_OSEQ_CHK_REPORT); +/* clang-format off */ /* * CHK_REJOIN: * From check engine to check leader to require rejoin former check instance after the engine @@ -268,9 +270,13 @@ CRT_RPC_DECLARE(chk_report, DAOS_ISEQ_CHK_REPORT, DAOS_OSEQ_CHK_REPORT); #define DAOS_OSEQ_CHK_REJOIN \ ((int32_t) (cro_status) CRT_VAR) \ ((uint32_t) (cro_flags) CRT_VAR) \ + ((uint32_t) (cro_ns_ver) CRT_VAR) \ + ((uint32_t) (cro_padding) CRT_VAR) \ + ((d_rank_t) (cro_ranks) CRT_ARRAY) \ ((uuid_t) (cro_pools) CRT_ARRAY) CRT_RPC_DECLARE(chk_rejoin, DAOS_ISEQ_CHK_REJOIN, DAOS_OSEQ_CHK_REJOIN); +/* clang-format on */ /* dkey for check DB under sys_db */ #define CHK_DB_TABLE "chk" @@ -485,16 +491,12 @@ struct chk_bookmark { * 'reset' for all pools. */ struct chk_property { - d_rank_t cp_leader; - Chk__CheckFlag cp_flags; - Chk__CheckInconsistAction cp_policies[CHK_POLICY_MAX]; - /* - * NOTE: Preserve for supporting to continue the check until the specified phase in the - * future. -1 means to check all phases. - */ - int32_t cp_phase; + d_rank_t cp_leader; + Chk__CheckFlag cp_flags; + Chk__CheckInconsistAction cp_policies[CHK_POLICY_MAX]; + uint32_t cp_padding; /* How many ranks (ever or should) take part in the check instance. 
*/ - uint32_t cp_rank_nr; + uint32_t cp_rank_nr; }; /* @@ -537,6 +539,7 @@ struct chk_instance { d_list_t ci_dead_ranks; ABT_thread ci_sched; + ABT_thread ci_dead_rank_ult; ABT_rwlock ci_abt_lock; ABT_mutex ci_abt_mutex; ABT_cond ci_abt_cond; @@ -544,20 +547,12 @@ struct chk_instance { /* Generator for report event, pending repair actions, and so on. */ uint64_t ci_seq; - uint32_t ci_is_leader:1, - ci_sched_running:1, - ci_sched_exiting:1, - ci_for_orphan:1, - ci_orphan_done:1, /* leader has processed orphan pools. */ - ci_pool_stopped:1, /* check on some pools have been stopped. */ - ci_starting:1, - ci_stopping:1, - ci_started:1, - ci_inited:1, - ci_pause:1, - ci_rejoining:1, - ci_implicated:1; - uint32_t ci_start_flags; + uint32_t ci_is_leader : 1, ci_sched_running : 1, ci_sched_exiting : 1, ci_for_orphan : 1, + ci_orphan_done : 1, ci_pool_stopped : 1, /* check on some pools have been stopped. */ + ci_starting : 1, ci_stopping : 1, ci_started : 1, ci_inited : 1, ci_pause : 1, + ci_skip_oog : 1, ci_rejoining : 1, ci_implicated : 1; + uint32_t ci_start_flags; + uint32_t ci_ns_ver; }; struct chk_iv { @@ -738,9 +733,8 @@ int chk_pending_wakeup(struct chk_instance *ins, struct chk_pending_rec *cpr); void chk_pending_destroy(struct chk_pending_rec *cpr); -int chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase, - uint32_t policy_nr, struct chk_policy *policies, - d_rank_list_t *ranks, struct chk_property *prop); +int chk_prop_prepare(d_rank_t leader, uint32_t flags, uint32_t policy_nr, + struct chk_policy *policies, d_rank_list_t *ranks, struct chk_property *prop); uint32_t chk_pool_merge_status(uint32_t status_a, uint32_t status_b); @@ -757,7 +751,7 @@ void chk_ins_fini(struct chk_instance **p_ins); int chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], uint32_t api_flags, int phase, d_rank_t leader, + uuid_t pools[], uint32_t api_flags, uint32_t ns_ver, d_rank_t 
leader, uint32_t flags, uuid_t iv_uuid, struct ds_pool_clues *clues); int chk_engine_stop(uint64_t gen, int pool_nr, uuid_t pools[], uint32_t *flags); @@ -792,7 +786,14 @@ void chk_engine_fini(void); /* chk_iv.c */ -int chk_iv_update(void *ns, struct chk_iv *iv, uint32_t shortcut, uint32_t sync_mode, bool retry); +void chk_iv_ns_destroy(struct chk_instance *ins); + +int chk_iv_ns_create(struct chk_instance *ins, uuid_t uuid, d_rank_t leader, uint32_t ns_ver); + +int chk_iv_ns_update(struct chk_instance *ins, uint32_t ns_ver); + +int chk_iv_update(struct chk_instance *ins, struct chk_iv *iv, uint32_t shortcut, + uint32_t sync_mode); int chk_iv_init(void); @@ -808,8 +809,8 @@ int chk_leader_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) int chk_leader_notify(struct chk_iv *iv); -int chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, int *pool_nr, - uuid_t **pools); +int chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, + uint32_t *ns_ver, int *pool_nr, uuid_t **pools, d_rank_list_t **ranks); int chk_leader_setup(void); @@ -823,8 +824,8 @@ void chk_leader_fini(void); int chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], uint32_t api_flags, int phase, d_rank_t leader, uint32_t flags, - uuid_t iv_uuid, chk_co_rpc_cb_t start_cb, void *args); + uuid_t pools[], uint32_t api_flags, uint32_t ns_ver, d_rank_t leader, + uint32_t flags, uuid_t iv_uuid, chk_co_rpc_cb_t start_cb, void *args); int chk_stop_remote(d_rank_list_t *rank_list, uint64_t gen, int pool_nr, uuid_t pools[], chk_co_rpc_cb_t stop_cb, void *args); @@ -853,7 +854,7 @@ int chk_report_remote(d_rank_t leader, uint64_t gen, uint32_t cla, uint32_t act, uint32_t detail_nr, d_sg_list_t *details, uint64_t seq); int chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, - uint32_t 
*pool_nr, uuid_t **pools); + uint32_t *ns_ver, uint32_t *pool_nr, uuid_t **pools, d_rank_list_t **ranks); /* chk_updcall.c */ @@ -917,41 +918,24 @@ chk_ins_set_fail(struct chk_instance *ins, uint32_t phase) static inline bool chk_rank_in_list(d_rank_list_t *rlist, d_rank_t rank) { - int i; - bool found = false; - - /* TBD: more efficiently search for the sorted ranks list. */ - - for (i = 0; i < rlist->rl_nr; i++) { - if (rlist->rl_ranks[i] == rank) { - found = true; - break; - } - } - - return found; + return d_rank_list_bsearch(rlist, rank, NULL); } static inline bool chk_remove_rank_from_list(d_rank_list_t *rlist, d_rank_t rank) { - int i; - bool found = false; - - /* TBD: more efficiently search for the sorted ranks list. */ - - for (i = 0; i < rlist->rl_nr; i++) { - if (rlist->rl_ranks[i] == rank) { - found = true; - rlist->rl_nr--; - /* The leader rank will always be in the rank list. */ - D_ASSERT(rlist->rl_nr > 0); - - if (i < rlist->rl_nr) - memmove(&rlist->rl_ranks[i], &rlist->rl_ranks[i + 1], - sizeof(rlist->rl_ranks[i]) * (rlist->rl_nr - i)); - break; - } + int idx = -1; + bool found = false; + + if (d_rank_list_bsearch(rlist, rank, &idx)) { + D_ASSERT(rlist->rl_nr > 0); + D_ASSERT(idx >= 0); + + rlist->rl_nr--; + if (idx < rlist->rl_nr) + memmove(&rlist->rl_ranks[idx], &rlist->rl_ranks[idx + 1], + sizeof(rlist->rl_ranks[idx]) * (rlist->rl_nr - idx)); + found = true; } return found; @@ -1003,17 +987,6 @@ chk_query_free(struct chk_query_pool_shard *shards, uint32_t shard_nr) } } -static inline void -chk_iv_ns_cleanup(struct ds_iv_ns **ns) -{ - if (*ns != NULL) { - if ((*ns)->iv_refcount == 1) - ds_iv_ns_cleanup(*ns); - ds_iv_ns_put(*ns); - *ns = NULL; - } -} - static inline void chk_pool_get(struct chk_pool_rec *cpr) { @@ -1212,7 +1185,7 @@ chk_ins_can_start(struct chk_instance *ins) if (ins->ci_starting) return -DER_INPROGRESS; - if (ins->ci_stopping || ins->ci_sched_exiting) + if (ins->ci_stopping || ins->ci_sched_exiting || ins->ci_rejoining) return 
-DER_BUSY; if (ins->ci_sched_running) diff --git a/src/chk/chk_iv.c b/src/chk/chk_iv.c index 299c1554856..712fd474c21 100644 --- a/src/chk/chk_iv.c +++ b/src/chk/chk_iv.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -181,13 +182,76 @@ struct ds_iv_class_ops chk_iv_ops = { .ivc_value_alloc = chk_iv_value_alloc, }; +void +chk_iv_ns_destroy(struct chk_instance *ins) +{ + if (ins->ci_iv_ns != NULL) { + if (ins->ci_iv_ns->iv_refcount == 1) + ds_iv_ns_cleanup(ins->ci_iv_ns); + ds_iv_ns_put(ins->ci_iv_ns); + ins->ci_iv_ns = NULL; + } + + if (ins->ci_iv_group != NULL) { + crt_group_secondary_destroy(ins->ci_iv_group); + ins->ci_iv_group = NULL; + } +} + int -chk_iv_update(void *ns, struct chk_iv *iv, uint32_t shortcut, uint32_t sync_mode, bool retry) +chk_iv_ns_create(struct chk_instance *ins, uuid_t uuid, d_rank_t leader, uint32_t ns_ver) { - d_sg_list_t sgl; - d_iov_t iov; - struct ds_iv_key key; - int rc; + char uuid_str[DAOS_UUID_STR_SIZE]; + int rc; + + uuid_unparse_lower(uuid, uuid_str); + rc = crt_group_secondary_create(uuid_str, NULL, NULL, &ins->ci_iv_group); + if (rc != 0) + goto out; + + rc = ds_iv_ns_create(dss_get_module_info()->dmi_ctx, uuid, ins->ci_iv_group, &ins->ci_iv_id, + &ins->ci_iv_ns); + if (rc != 0) + goto out; + + rc = chk_iv_ns_update(ins, ns_ver); + if (rc == 0) { + ds_iv_ns_update(ins->ci_iv_ns, leader, ins->ci_iv_ns->iv_master_term + 1); + ins->ci_skip_oog = 0; + } + +out: + if (rc != 0) + chk_iv_ns_destroy(ins); + return rc; +} + +int +chk_iv_ns_update(struct chk_instance *ins, uint32_t ns_ver) +{ + int rc; + + /* Let secondary rank == primary rank. 
*/ + rc = crt_group_secondary_modify(ins->ci_iv_group, ins->ci_ranks, ins->ci_ranks, + CRT_GROUP_MOD_OP_REPLACE, ns_ver); + if (rc == 0) + ins->ci_ns_ver = ns_ver; + else + ins->ci_skip_oog = 1; + + return rc; +} + +int +chk_iv_update(struct chk_instance *ins, struct chk_iv *iv, uint32_t shortcut, uint32_t sync_mode) +{ + d_sg_list_t sgl; + d_iov_t iov; + struct ds_iv_key key; + uint32_t ver; + int try_cnt = 0; + int wait_cnt = 0; + int rc; iv->ci_rank = dss_self_rank(); iv->ci_seq = d_hlc_get(); @@ -208,9 +272,35 @@ chk_iv_update(void *ns, struct chk_iv *iv, uint32_t shortcut, uint32_t sync_mode memset(&key, 0, sizeof(key)); key.class_id = IV_CHK; - rc = ds_iv_update(ns, &key, &sgl, shortcut, sync_mode, 0, retry); + +again: + try_cnt++; + ver = ins->ci_ns_ver; + rc = ds_iv_update(ins->ci_iv_ns, &key, &sgl, shortcut, sync_mode, 0, true); + if (likely(rc != -DER_OOG)) + goto out; + + if (try_cnt % 10 == 0) + D_WARN("CHK iv " DF_X64 "/" DF_X64 " retry because of -DER_OOG for more " + "than %d times.\n", + iv->ci_gen, iv->ci_seq, try_cnt); + + /* Wait for chk_dead_rank_ult to sync the IV namespace. */ + while (ver == ins->ci_ns_ver && ins->ci_skip_oog == 0 && ins->ci_pause == 0) { + dss_sleep(500); + if (++wait_cnt % 40 == 0) + D_WARN("CHK iv " DF_X64 "/" DF_X64 " is blocked because of DER_OOG " + "for %d seconds.\n", + iv->ci_gen, iv->ci_seq, wait_cnt / 2); + } + + if (ins->ci_pause || ins->ci_skip_oog) + goto out; + + goto again; } +out: D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, "CHK iv "DF_X64"/"DF_X64" on rank %u, phase %u, ins_status %u, " "pool_status %u, to_leader %s, from_psl %s: rc = %d\n", diff --git a/src/chk/chk_leader.c b/src/chk/chk_leader.c index e51aa415670..01b35cc158e 100644 --- a/src/chk/chk_leader.c +++ b/src/chk/chk_leader.c @@ -244,8 +244,7 @@ chk_leader_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_statu iv.ci_ins_status = ins_status; /* Synchronously notify the engines that the check leader exit. 
*/ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, - CRT_IV_SYNC_EAGER, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_LEADER" notify the engines its exit, status %u: rc = %d\n", DP_LEADER(ins), ins_status, rc); @@ -318,8 +317,7 @@ chk_leader_post_repair(struct chk_instance *ins, struct chk_pool_rec *cpr, iv.ci_pool_status = cbk->cb_pool_status; /* Synchronously notify the engines that check on the pool got failure. */ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, - CRT_IV_SYNC_EAGER, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_LEADER" notify the engines that the check for pool " DF_UUIDF" is done with status %u: rc = %d\n", @@ -2076,8 +2074,7 @@ chk_leader_pool_ult(void *arg) iv.ci_phase = cbk->cb_phase; iv.ci_pool_status = CHK__CHECK_POOL_STATUS__CPS_FAILED; - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, - CRT_IV_SYNC_EAGER, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_LEADER" notify engines to exit check for pool "DF_UUIDF" failure: %d\n", DP_LEADER(ins), DP_UUID(cpr->cpr_uuid), rc); @@ -2097,21 +2094,22 @@ chk_leader_pool_ult(void *arg) static void chk_leader_mark_rank_dead(struct chk_instance *ins, struct chk_dead_rank *cdr) { - struct chk_property *prop = &ins->ci_prop; - struct chk_bookmark *cbk = &ins->ci_bk; - uint32_t version = cbk->cb_gen - prop->cp_rank_nr - 1; - int rc = 0; + struct chk_property *prop = &ins->ci_prop; + struct chk_bookmark *cbk = &ins->ci_bk; + uint32_t version = ins->ci_ns_ver + 1; + int rc = 0; if (!chk_remove_rank_from_list(ins->ci_ranks, cdr->cdr_rank)) D_GOTO(out, rc = -DER_NOTAPPLICABLE); prop->cp_rank_nr--; rc = chk_prop_update(prop, ins->ci_ranks); - if (rc != 0) + if (rc != 0) { + ins->ci_skip_oog = 1; goto out; + } - rc = 
crt_group_secondary_modify(ins->ci_iv_group, ins->ci_ranks, ins->ci_ranks, - CRT_GROUP_MOD_OP_REPLACE, version); + rc = chk_iv_ns_update(ins, version); if (rc != 0) goto out; @@ -2150,17 +2148,15 @@ chk_leader_mark_rank_dead(struct chk_instance *ins, struct chk_dead_rank *cdr) static void chk_leader_sched(void *args) { - struct chk_instance *ins = args; - struct chk_bookmark *cbk = &ins->ci_bk; - struct chk_dead_rank *cdr; - struct chk_iv iv = { 0 }; - uint32_t ins_phase; - uint32_t ins_status; - uint32_t pool_status; - int done = 0; - int rc = 0; - bool bcast = false; - bool more_dead; + struct chk_instance *ins = args; + struct chk_bookmark *cbk = &ins->ci_bk; + struct chk_iv iv = {0}; + uint32_t ins_phase; + uint32_t ins_status; + uint32_t pool_status; + int done = 0; + int rc = 0; + bool bcast = false; D_INFO(DF_LEADER" scheduler enter at phase %u\n", DP_LEADER(ins), cbk->cb_phase); @@ -2191,29 +2187,9 @@ chk_leader_sched(void *args) while (1) { dss_sleep(300); -check_dead: - ABT_mutex_lock(ins->ci_abt_mutex); - if (!d_list_empty(&ins->ci_dead_ranks)) { - cdr = d_list_pop_entry(&ins->ci_dead_ranks, struct chk_dead_rank, cdr_link); - if (!d_list_empty(&ins->ci_dead_ranks)) - more_dead = true; - else - more_dead = false; - } else { - cdr = NULL; - more_dead = false; - } - ABT_mutex_unlock(ins->ci_abt_mutex); - - if (cdr != NULL) - chk_leader_mark_rank_dead(ins, cdr); - if (chk_leader_need_stop(ins, &rc)) D_GOTO(out, bcast = (rc > 0 ? true : false)); - if (more_dead) - goto check_dead; - /* * TBD: The leader may need to detect engines' status/phase actively, otherwise * if some engine failed to notify the leader for its status/phase changes, @@ -2229,8 +2205,7 @@ chk_leader_sched(void *args) iv.ci_ins_status = CHK__CHECK_INST_STATUS__CIS_RUNNING; /* Synchronously notify engines that orphan pools have been processed. 
*/ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, - CRT_IV_SYNC_EAGER, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_LEADER" notify engines that orphan pools have been process: %d\n", DP_LEADER(ins), rc); @@ -2401,8 +2376,8 @@ chk_leader_ranks_prepare(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *r static int chk_leader_start_prep(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *ranks, - uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], int phase, d_rank_t leader, uint32_t flags) + uint32_t policy_nr, struct chk_policy *policies, int pool_nr, uuid_t pools[], + d_rank_t leader, uint32_t flags) { struct chk_property *prop = &ins->ci_prop; struct chk_bookmark *cbk = &ins->ci_bk; @@ -2485,7 +2460,7 @@ chk_leader_start_prep(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *rank cbk->cb_version = DAOS_CHK_VERSION; init: - rc = chk_prop_prepare(leader, flags, phase, policy_nr, policies, rank_list, prop); + rc = chk_prop_prepare(leader, flags, policy_nr, policies, rank_list, prop); if (rc != 0) goto out; @@ -2626,8 +2601,7 @@ chk_leader_start_post(struct chk_instance *ins) * to notify the engine for the check done, that is not fatal. That * can be redo in next check instance. 
*/ - rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, - CRT_IV_SYNC_EAGER, true); + rc = chk_iv_update(ins, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, DF_LEADER" notify engines the pool "DF_UUIDF" is checked: %d\n", DP_LEADER(ins), DP_UUID(cpr->cpr_uuid), rc); @@ -2807,20 +2781,20 @@ chk_leader_start_cb(struct chk_co_rpc_cb_args *cb_args) int chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, - int pool_nr, uuid_t pools[], uint32_t api_flags, int phase) + int pool_nr, uuid_t pools[], uint32_t api_flags) { - struct chk_instance *ins = chk_leader; - struct chk_bookmark *cbk = &ins->ci_bk; - uuid_t *c_pools = NULL; - struct umem_attr uma = { 0 }; - uuid_t dummy_pool = { 0 }; - char uuid_str[DAOS_UUID_STR_SIZE]; - uint64_t old_gen = cbk->cb_gen; - d_rank_t myrank = dss_self_rank(); - uint32_t flags = api_flags; - int c_pool_nr = 0; - int rc; - int rc1; + struct chk_instance *ins = chk_leader; + struct chk_bookmark *cbk = &ins->ci_bk; + uuid_t *c_pools = NULL; + struct umem_attr uma = {0}; + uuid_t dummy_pool = {0}; + uint64_t old_gen = cbk->cb_gen; + d_rank_t myrank = dss_self_rank(); + uint32_t flags = api_flags; + uint32_t ns_ver = (uint32_t)daos_wallclock_secs(); + int c_pool_nr = 0; + int rc; + int rc1; rc = chk_ins_can_start(ins); if (rc != 0) @@ -2845,13 +2819,7 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c if (ins->ci_sched != ABT_THREAD_NULL) ABT_thread_free(&ins->ci_sched); - chk_iv_ns_cleanup(&ins->ci_iv_ns); - - if (ins->ci_iv_group != NULL) { - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; - } - + chk_iv_ns_destroy(ins); uma.uma_id = UMEM_CLASS_VMEM; rc = dbtree_create_inplace(DBTREE_CLASS_CHK_RANK, 0, CHK_BTREE_ORDER, &uma, @@ -2870,8 +2838,8 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c goto out_tree; reset: - rc = chk_leader_start_prep(ins, rank_nr, 
ranks, policy_nr, policies, pool_nr, pools, - phase, myrank, flags); + rc = chk_leader_start_prep(ins, rank_nr, ranks, policy_nr, policies, pool_nr, pools, myrank, + flags); if (rc == 1 && !(flags & CHK__CHECK_FLAG__CF_RESET)) { /* Former check instance has done, let's re-start from the beginning. */ flags |= CHK__CHECK_FLAG__CF_RESET; @@ -2885,18 +2853,10 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c goto remote; uuid_generate(dummy_pool); - uuid_unparse_lower(dummy_pool, uuid_str); - rc = crt_group_secondary_create(uuid_str, NULL, ins->ci_ranks, &ins->ci_iv_group); + rc = chk_iv_ns_create(ins, dummy_pool, myrank, ns_ver); if (rc != 0) goto out_tree; - rc = ds_iv_ns_create(dss_get_module_info()->dmi_ctx, dummy_pool, ins->ci_iv_group, - &ins->ci_iv_id, &ins->ci_iv_ns); - if (rc != 0) - goto out_group; - - ds_iv_ns_update(ins->ci_iv_ns, myrank, ins->ci_iv_ns->iv_master_term + 1); - if (d_list_empty(&ins->ci_pool_list)) { c_pool_nr = pool_nr; c_pools = pools; @@ -2908,7 +2868,7 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c remote: rc = chk_start_remote(ins->ci_ranks, cbk->cb_gen, rank_nr, ranks, policy_nr, policies, - c_pool_nr, c_pools, flags, phase, myrank, ins->ci_start_flags, + c_pool_nr, c_pools, flags, ns_ver, myrank, ins->ci_start_flags, dummy_pool, chk_leader_start_cb, ins); if (rc != 0) { if (rc == -DER_OOG || rc == -DER_GRPVER || rc == -DER_AGAIN) { @@ -2951,10 +2911,9 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c goto out_stop_pools; } - D_INFO("Leader %s check with api_flags %x, phase %d, leader %u, flags %x, gen " DF_X64 - " iv "DF_UUIDF": rc %d\n", - chk_is_ins_reset(ins, flags) ? "start" : "resume", api_flags, phase, myrank, - ins->ci_start_flags, cbk->cb_gen, DP_UUID(dummy_pool), rc); + D_INFO("Leader %s with api_flags %x, leader %u, flags %x, gen " DF_X64 " iv " DF_UUIDF "\n", + chk_is_ins_reset(ins, flags) ? 
"start" : "resume", api_flags, myrank, + ins->ci_start_flags, cbk->cb_gen, DP_UUID(dummy_pool)); chk_ranks_dump(ins->ci_ranks->rl_nr, ins->ci_ranks->rl_ranks); chk_pools_dump(&ins->ci_pool_list, c_pool_nr > 0 ? c_pool_nr : pool_nr, @@ -2977,8 +2936,6 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c D_WARN(DF_LEADER" failed to rollback failed check start: "DF_RC"\n", DP_LEADER(ins), DP_RC(rc1)); out_iv: - chk_iv_ns_cleanup(&ins->ci_iv_ns); -out_group: if (cbk->cb_ins_status == CHK__CHECK_INST_STATUS__CIS_RUNNING || cbk->cb_gen != old_gen) { cbk->cb_gen = old_gen; if (cbk->cb_ins_status == CHK__CHECK_INST_STATUS__CIS_RUNNING) { @@ -2990,17 +2947,16 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c D_WARN(DF_LEADER" failed to update leader bookmark: "DF_RC"\n", DP_LEADER(ins), DP_RC(rc1)); } - crt_group_secondary_destroy(ins->ci_iv_group); - ins->ci_iv_group = NULL; + chk_iv_ns_destroy(ins); out_tree: chk_leader_destroy_trees(ins); ins->ci_starting = 0; out_log: - D_CDEBUG(likely(rc < 0), DLOG_ERR, DLOG_INFO, - "Leader %s to start check on %u ranks for %d pools with " - "api_flags %x, phase %d, leader %u, gen "DF_X64": rc = %d\n", - rc < 0 ? "failed" : "try", rank_nr, pool_nr, api_flags, phase, - myrank, cbk->cb_gen, rc); + DL_CDEBUG(likely(rc < 0), DLOG_ERR, DLOG_INFO, rc, + "Leader %s to start check on %u ranks for %d pools with api_flags %x, ns_ver %d, " + "leader %u, gen " DF_X64, + rc < 0 ? 
"failed" : "try", rank_nr, pool_nr, api_flags, ns_ver, myrank, + cbk->cb_gen); if (unlikely(rc > 0)) rc = 0; @@ -3242,22 +3198,25 @@ int chk_leader_query(int pool_nr, uuid_t pools[], chk_query_head_cb_t head_cb, chk_query_pool_cb_t pool_cb, void *buf) { - struct chk_instance *ins = chk_leader; - struct chk_bookmark *cbk = &ins->ci_bk; - struct chk_query_args *cqa = NULL; - struct chk_pool_rec *cpr; - struct chk_pool_rec *tmp; - struct chk_pool_shard *cps; - struct chk_query_pool_shard *shard; - d_iov_t kiov; - d_iov_t riov; - uint64_t gen = cbk->cb_gen; - uint32_t status; - uint32_t phase; - uint32_t idx = 0; - int rc; - int i; - bool skip; + struct chk_instance *ins = chk_leader; + struct chk_bookmark *cbk = &ins->ci_bk; + struct chk_query_args *cqa = NULL; + struct chk_pool_rec *cpr; + struct chk_pool_rec *tmp; + struct chk_pool_shard *cps; + struct chk_query_pool_shard *shard; + d_iov_t kiov; + d_iov_t riov; + uint64_t gen = cbk->cb_gen; + uint32_t status; + uint32_t phase; + uint32_t idx = 0; + uint32_t ver; + int try_cnt = 0; + int wait_cnt = 0; + int rc; + int i; + bool skip; CHK_IS_READY(ins); @@ -3286,22 +3245,38 @@ chk_leader_query(int pool_nr, uuid_t pools[], chk_query_head_cb_t head_cb, D_GOTO(out, rc = -DER_NOMEM); again: - rc = chk_query_remote(ins->ci_ranks, gen, pool_nr, pools, chk_leader_query_cb, cqa); + try_cnt++; + ver = ins->ci_ns_ver; + rc = chk_query_remote(ins->ci_ranks, gen, pool_nr, pools, chk_leader_query_cb, cqa); if (rc != 0) { - if (rc == -DER_OOG || rc == -DER_GRPVER || rc == -DER_AGAIN) { - D_INFO(DF_LEADER" Someone is not ready %d, let's retry query after 1 sec\n", - DP_LEADER(ins), rc); - if (!d_list_empty(&cqa->cqa_list)) { - chk_cqa_free(cqa); - cqa = chk_cqa_alloc(ins); - if (cqa == NULL) - D_GOTO(out, rc = -DER_NOMEM); - } - dss_sleep(1000); - goto again; + if (rc != -DER_OOG && rc != -DER_GRPVER && rc != -DER_AGAIN) + goto out; + + if (try_cnt % 10 == 0) + D_WARN("Leader (" DF_X64 ") query retried because of %d for %d 
times.\n", + gen, rc, try_cnt); + + while (ver == ins->ci_ns_ver && ins->ci_skip_oog == 0 && ins->ci_pause == 0) { + dss_sleep(500); + if (++wait_cnt % 40 == 0) + D_WARN("Leader (" DF_X64 ") query is blocked because of %d for " + "about %d seconds.\n", + gen, rc, wait_cnt / 2); + if (rc != -DER_OOG) + break; } - goto out; + if (ins->ci_pause || ins->ci_skip_oog) + goto out; + + if (!d_list_empty(&cqa->cqa_list)) { + chk_cqa_free(cqa); + cqa = chk_cqa_alloc(ins); + if (cqa == NULL) + D_GOTO(out, rc = -DER_NOMEM); + } + + goto again; } d_list_for_each_entry(cpr, &ins->ci_pool_list, cpr_link) { @@ -3743,8 +3718,8 @@ chk_leader_notify(struct chk_iv *iv) } int -chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, int *pool_nr, - uuid_t **pools) +chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, uint32_t *ns_ver, + int *pool_nr, uuid_t **pools, d_rank_list_t **ranks) { struct chk_instance *ins = chk_leader; struct chk_bookmark *cbk = &ins->ci_bk; @@ -3771,7 +3746,9 @@ chk_leader_rejoin(uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, if (ins->ci_orphan_done) *flags = CRF_ORPHAN_DONE; - rc = chk_leader_pools2list(ins, pool_nr, pools); + *ns_ver = ins->ci_ns_ver; + *ranks = ins->ci_ranks; + rc = chk_leader_pools2list(ins, pool_nr, pools); out: D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, @@ -3789,6 +3766,9 @@ chk_rank_event_cb(d_rank_t rank, uint64_t incarnation, enum crt_event_source src struct chk_dead_rank *cdr = NULL; int rc = 0; + if (ins->ci_ranks == NULL) + D_GOTO(out, rc = -DER_NOTAPPLICABLE); + /* Ignore the event that is not applicable to current rank. 
*/ if (src != CRT_EVS_SWIM) @@ -3797,9 +3777,6 @@ chk_rank_event_cb(d_rank_t rank, uint64_t incarnation, enum crt_event_source src if (type != CRT_EVT_DEAD && type != CRT_EVT_ALIVE) D_GOTO(out, rc = -DER_NOTAPPLICABLE); - if (!ins->ci_sched_running) - D_GOTO(out, rc = -DER_NOTAPPLICABLE); - if (type == CRT_EVT_DEAD) { D_ALLOC_PTR(cdr); if (cdr == NULL) @@ -3834,6 +3811,34 @@ chk_rank_event_cb(d_rank_t rank, uint64_t incarnation, enum crt_event_source src DP_LEADER(ins), rank, type == CRT_EVT_DEAD ? "dead" : "alive", DP_RC(rc)); } +static void +chk_dead_rank_ult(void *args) +{ + struct chk_instance *ins = args; + struct chk_dead_rank *cdr; + + while (ins->ci_inited) { + cdr = NULL; + if (!d_list_empty(&ins->ci_dead_ranks)) { + ABT_mutex_lock(ins->ci_abt_mutex); + if (likely(!d_list_empty(&ins->ci_dead_ranks))) + cdr = d_list_pop_entry(&ins->ci_dead_ranks, struct chk_dead_rank, + cdr_link); + ABT_mutex_unlock(ins->ci_abt_mutex); + } + + if (cdr != NULL) + chk_leader_mark_rank_dead(ins, cdr); + + if (d_list_empty(&ins->ci_dead_ranks)) + dss_sleep(500); + } + + while ((cdr = d_list_pop_entry(&ins->ci_dead_ranks, struct chk_dead_rank, cdr_link)) != + NULL) + D_FREE(cdr); +} + int chk_leader_setup(void) { @@ -3849,6 +3854,8 @@ chk_leader_setup(void) * related local inconsistency firstly. 
*/ + chk_report_seq_init(ins); + rc = chk_bk_fetch_leader(cbk); if (rc == -DER_NONEXIST) goto prop; @@ -3901,17 +3908,19 @@ chk_leader_setup(void) prop: rc = chk_prop_fetch(&ins->ci_prop, &ins->ci_ranks); - if (rc == 0 || rc == -DER_NONEXIST) + if (rc != 0 && rc != -DER_NONEXIST) + goto fini; + + ins->ci_inited = 1; + ins->ci_pause = 0; + + rc = dss_ult_create(chk_dead_rank_ult, ins, DSS_XS_SYS, 0, 0, &ins->ci_dead_rank_ult); + if (rc == 0) rc = crt_register_event_cb(chk_rank_event_cb, NULL); -fini: - if (rc != 0) { - chk_ins_fini(&ins); - } else { - chk_report_seq_init(ins); - ins->ci_inited = 1; - ins->ci_pause = 0; - } +fini: + if (rc != 0) + chk_leader_cleanup(); return rc; } @@ -3920,8 +3929,13 @@ chk_leader_cleanup(void) { struct chk_instance *ins = chk_leader; + crt_unregister_event_cb(chk_rank_event_cb, NULL); + chk_ins_cleanup(ins); D_ASSERT(d_list_empty(&ins->ci_rank_list)); + + if (ins->ci_dead_rank_ult != ABT_THREAD_NULL) + ABT_thread_free(&ins->ci_dead_rank_ult); } int @@ -3939,6 +3953,5 @@ chk_leader_init(void) void chk_leader_fini(void) { - crt_unregister_event_cb(chk_rank_event_cb, NULL); chk_ins_fini(&chk_leader); } diff --git a/src/chk/chk_rpc.c b/src/chk/chk_rpc.c index e340d482cec..65f71cec6ff 100644 --- a/src/chk/chk_rpc.c +++ b/src/chk/chk_rpc.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. 
+ * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -491,8 +492,8 @@ chk_sg_rpc_prepare(d_rank_t rank, crt_opcode_t opc, crt_rpc_t **req) int chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, - uint32_t policy_nr, struct chk_policy *policies, int pool_nr, - uuid_t pools[], uint32_t api_flags, int phase, d_rank_t leader, uint32_t flags, + uint32_t policy_nr, struct chk_policy *policies, int pool_nr, uuid_t pools[], + uint32_t api_flags, uint32_t ns_ver, d_rank_t leader, uint32_t flags, uuid_t iv_uuid, chk_co_rpc_cb_t start_cb, void *args) { struct chk_co_rpc_cb_args cb_args = { 0 }; @@ -508,12 +509,12 @@ chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_ran if (rc != 0) goto out; - csi = crt_req_get(req); - csi->csi_gen = gen; - csi->csi_flags = flags; - csi->csi_phase = phase; + csi = crt_req_get(req); + csi->csi_gen = gen; + csi->csi_flags = flags; + csi->csi_ns_ver = ns_ver; csi->csi_leader_rank = leader; - csi->csi_api_flags = api_flags; + csi->csi_api_flags = api_flags; uuid_copy(csi->csi_iv_uuid, iv_uuid); csi->csi_ranks.ca_count = rank_nr; csi->csi_ranks.ca_arrays = ranks; @@ -569,9 +570,9 @@ chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_ran crt_req_decref(req); } - D_CDEBUG(rc < 0, DLOG_ERR, DLOG_INFO, - "Rank %u start checker, gen "DF_X64", flags %x, phase %d, iv "DF_UUIDF":"DF_RC"\n", - leader, gen, flags, phase, DP_UUID(iv_uuid), DP_RC(rc)); + DL_CDEBUG(rc < 0, DLOG_ERR, DLOG_INFO, rc, + "Rank %u start checker, gen " DF_X64 ", flags %x, ns_ver %d, iv " DF_UUIDF, + leader, gen, flags, ns_ver, DP_UUID(iv_uuid)); return rc; } @@ -982,7 +983,7 @@ int chk_report_remote(d_rank_t leader, uint64_t gen, uint32_t cla, uint32_t act, int chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, - uint32_t *pool_nr, uuid_t **pools) + uint32_t *ns_ver, uint32_t 
*pool_nr, uuid_t **pools, d_rank_list_t **ranks) { crt_rpc_t *req = NULL; struct chk_rejoin_in *cri; @@ -1005,8 +1006,22 @@ chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uuid_t iv_uuid, cro = crt_reply_get(req); rc = cro->cro_status; - if (rc == 0 && cro->cro_pools.ca_count > 0) { - *flags = cro->cro_flags; + if (rc != 0) + goto out; + + *flags = cro->cro_flags; + *ns_ver = cro->cro_ns_ver; + + if (cro->cro_ranks.ca_count > 0) { + *ranks = d_rank_list_alloc(cro->cro_ranks.ca_count); + if (*ranks == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + memcpy((*ranks)->rl_ranks, cro->cro_ranks.ca_arrays, + sizeof(d_rank_t) * cro->cro_ranks.ca_count); + } + + if (cro->cro_pools.ca_count > 0) { D_ALLOC(tmp, cro->cro_pools.ca_count); if (tmp == NULL) D_GOTO(out, rc = -DER_NOMEM); diff --git a/src/chk/chk_srv.c b/src/chk/chk_srv.c index 86ed5a7d2e5..442579d8411 100644 --- a/src/chk/chk_srv.c +++ b/src/chk/chk_srv.c @@ -27,7 +27,7 @@ ds_chk_start_hdlr(crt_rpc_t *rpc) rc = chk_engine_start(csi->csi_gen, csi->csi_ranks.ca_count, csi->csi_ranks.ca_arrays, csi->csi_policies.ca_count, csi->csi_policies.ca_arrays, csi->csi_uuids.ca_count, csi->csi_uuids.ca_arrays, csi->csi_api_flags, - csi->csi_phase, csi->csi_leader_rank, csi->csi_flags, + csi->csi_ns_ver, csi->csi_leader_rank, csi->csi_flags, csi->csi_iv_uuid, &clues); if (rc > 0) { D_ALLOC_PTR(rank); @@ -249,18 +249,21 @@ ds_chk_report_hdlr(crt_rpc_t *rpc) static void ds_chk_rejoin_hdlr(crt_rpc_t *rpc) { - struct chk_rejoin_in *cri = crt_req_get(rpc); - struct chk_rejoin_out *cro = crt_reply_get(rpc); - uuid_t *pools = NULL; - int pool_nr = 0; - int rc; + struct chk_rejoin_in *cri = crt_req_get(rpc); + struct chk_rejoin_out *cro = crt_reply_get(rpc); + uuid_t *pools = NULL; + d_rank_list_t *ranks = NULL; + int pool_nr = 0; + int rc; rc = chk_leader_rejoin(cri->cri_gen, cri->cri_rank, cri->cri_iv_uuid, &cro->cro_flags, - &pool_nr, &pools); + &cro->cro_ns_ver, &pool_nr, &pools, &ranks); cro->cro_status = rc; if (rc == 
0) { - cro->cro_pools.ca_count = pool_nr; + cro->cro_ranks.ca_count = ranks->rl_nr; + cro->cro_ranks.ca_arrays = ranks->rl_ranks; + cro->cro_pools.ca_count = pool_nr; cro->cro_pools.ca_arrays = pools; } diff --git a/src/engine/server_iv.c b/src/engine/server_iv.c index fbeb40b712c..4d8271155a6 100644 --- a/src/engine/server_iv.c +++ b/src/engine/server_iv.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2017-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1126,7 +1126,7 @@ _iv_op(struct ds_iv_ns *ns, struct ds_iv_key *key, d_sg_list_t *value, rc = iv_op_internal(ns, key, value, sync, shortcut, opc); if (retry && !ns->iv_stop && (daos_rpc_retryable_rc(rc) || rc == -DER_NOTLEADER || rc == -DER_BUSY)) { - if (rc == -DER_GRPVER && engine_in_check()) { + if ((rc == -DER_GRPVER || rc == -DER_OOG) && engine_in_check()) { /* * Under check mode, the pool shard on peer rank/target does * not exist, then it will reply "-DER_GRPVER" that is normal diff --git a/src/gurt/misc.c b/src/gurt/misc.c index 094b3cc1a56..afca34ab918 100644 --- a/src/gurt/misc.c +++ b/src/gurt/misc.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -550,9 +550,24 @@ d_rank_list_shuffle(d_rank_list_t *rank_list) } /** - * Must be previously sorted or not modified at all in order to guarantee - * consistent indexes. - **/ + * Binary search \a rank in the sorted \a rank_list. 
+ */ + +bool +d_rank_list_bsearch(d_rank_list_t *rank_list, d_rank_t rank, int *idx) +{ + d_rank_t *pos = NULL; + + if (rank_list != NULL) { + pos = bsearch(&rank, rank_list->rl_ranks, rank_list->rl_nr, sizeof(rank), + rank_compare); + if (pos != NULL && idx != NULL) + *idx = ((void *)pos - (void *)rank_list->rl_ranks) / sizeof(rank); + } + + return pos != NULL; +} + bool d_rank_list_find(d_rank_list_t *rank_list, d_rank_t rank, int *idx) { diff --git a/src/include/daos_srv/daos_chk.h b/src/include/daos_srv/daos_chk.h index 59e34305d28..608e53d97b7 100644 --- a/src/include/daos_srv/daos_chk.h +++ b/src/include/daos_srv/daos_chk.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -73,9 +74,9 @@ typedef int (*chk_query_pool_cb_t)(struct chk_query_pool_shard *shard, uint32_t typedef int (*chk_prop_cb_t)(void *buf, uint32_t policies[], int cnt, uint32_t flags); -int chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, - struct chk_policy *policies, int pool_nr, uuid_t pools[], - uint32_t api_flags, int phase); +int +chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, + int pool_nr, uuid_t pools[], uint32_t api_flags); int chk_leader_stop(int pool_nr, uuid_t pools[]); diff --git a/src/include/gurt/common.h b/src/include/gurt/common.h index 241dcd9b74a..5742ee3f5bf 100644 --- a/src/include/gurt/common.h +++ b/src/include/gurt/common.h @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -460,6 +460,7 @@ d_power2_nbits(unsigned int val) return val == LOWEST_BIT_SET(val) ? 
shift - 1 : shift; } +/* clang-format off */ int d_rank_list_dup(d_rank_list_t **dst, const d_rank_list_t *src); int d_rank_list_dup_sort_uniq(d_rank_list_t **dst, const d_rank_list_t *src); void d_rank_list_filter(d_rank_list_t *src_set, d_rank_list_t *dst_set, @@ -471,6 +472,7 @@ void d_rank_list_free(d_rank_list_t *rank_list); int d_rank_list_copy(d_rank_list_t *dst, d_rank_list_t *src); void d_rank_list_shuffle(d_rank_list_t *rank_list); void d_rank_list_sort(d_rank_list_t *rank_list); +bool d_rank_list_bsearch(d_rank_list_t *rank_list, d_rank_t rank, int *idx); bool d_rank_list_find(d_rank_list_t *rank_list, d_rank_t rank, int *idx); void d_rank_list_del_at(d_rank_list_t *list, int idx); int d_rank_list_del(d_rank_list_t *rank_list, d_rank_t rank); @@ -482,15 +484,13 @@ int d_rank_list_append(d_rank_list_t *rank_list, d_rank_t rank); int d_rank_list_dump(d_rank_list_t *rank_list, d_string_t name, int name_len); d_rank_list_t *uint32_array_to_rank_list(uint32_t *ints, size_t len); int rank_list_to_uint32_array(d_rank_list_t *rl, uint32_t **ints, size_t *len); -int - d_rank_list_to_str(d_rank_list_t *rank_list, char **rank_str); - +int d_rank_list_to_str(d_rank_list_t *rank_list, char **rank_str); d_rank_range_list_t *d_rank_range_list_alloc(uint32_t size); d_rank_range_list_t *d_rank_range_list_realloc(d_rank_range_list_t *range_list, uint32_t size); d_rank_range_list_t *d_rank_range_list_create_from_ranks(d_rank_list_t *rank_list); -int - d_rank_range_list_str(d_rank_range_list_t *list, char **ranks_str); +int d_rank_range_list_str(d_rank_range_list_t *list, char **ranks_str); void d_rank_range_list_free(d_rank_range_list_t *range_list); +/* clang-format on */ #ifdef FAULT_INJECTION diff --git a/src/mgmt/srv_chk.c b/src/mgmt/srv_chk.c index a9c1ce1c3b0..7cd32be1273 100644 --- a/src/mgmt/srv_chk.c +++ b/src/mgmt/srv_chk.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2022 Intel Corporation. 
+ * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -47,7 +48,7 @@ ds_mgmt_chk_parse_uuid(int pool_nr, char **pools, uuid_t **p_uuids) int ds_mgmt_check_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, Mgmt__CheckInconsistPolicy **policies, int32_t pool_nr, char **pools, - uint32_t flags, int32_t phase) + uint32_t flags) { uuid_t *uuids = NULL; struct chk_policy *ply = NULL; @@ -69,7 +70,7 @@ ds_mgmt_check_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, } } - rc = chk_leader_start(rank_nr, ranks, policy_nr, ply, pool_nr, uuids, flags, phase); + rc = chk_leader_start(rank_nr, ranks, policy_nr, ply, pool_nr, uuids, flags); out: D_FREE(uuids); diff --git a/src/mgmt/srv_drpc.c b/src/mgmt/srv_drpc.c index 6d44faa796c..7874fb81b09 100644 --- a/src/mgmt/srv_drpc.c +++ b/src/mgmt/srv_drpc.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2563,7 +2563,7 @@ ds_mgmt_drpc_check_start(Drpc__Call *drpc_req, Drpc__Response *drpc_resp) D_INFO("Received request to start check\n"); rc = ds_mgmt_check_start(req->n_ranks, req->ranks, req->n_policies, req->policies, - req->n_uuids, req->uuids, req->flags, -1 /* phase */); + req->n_uuids, req->uuids, req->flags); if (rc < 0) D_ERROR("Failed to start check: "DF_RC"\n", DP_RC(rc)); diff --git a/src/mgmt/srv_internal.h b/src/mgmt/srv_internal.h index 95f2c809494..a0c4d7d332c 100644 --- a/src/mgmt/srv_internal.h +++ b/src/mgmt/srv_internal.h @@ -129,9 +129,10 @@ int const char *user, const char *group); /** srv_chk.c */ -int ds_mgmt_check_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, - Mgmt__CheckInconsistPolicy **policies, int pool_nr, char **pools, - uint32_t flags, int phase); +int + ds_mgmt_check_start(uint32_t rank_nr, 
d_rank_t *ranks, uint32_t policy_nr, + Mgmt__CheckInconsistPolicy **policies, int pool_nr, char **pools, + uint32_t flags); int ds_mgmt_check_stop(int pool_nr, char **pools); int ds_mgmt_check_query(int pool_nr, char **pools, chk_query_head_cb_t head_cb, chk_query_pool_cb_t pool_cb, void *buf); diff --git a/src/mgmt/tests/mocks.c b/src/mgmt/tests/mocks.c index 9a11890513e..7ed2b0277f5 100644 --- a/src/mgmt/tests/mocks.c +++ b/src/mgmt/tests/mocks.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -627,7 +627,7 @@ mock_ds_mgmt_dev_set_faulty_setup(void) int ds_mgmt_check_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, Mgmt__CheckInconsistPolicy **policies, int pool_nr, char **pools, - uint32_t flags, int phase) + uint32_t flags) { return 0; } diff --git a/src/object/cli_coll.c b/src/object/cli_coll.c index d517e3269d6..ef10ed439a5 100644 --- a/src/object/cli_coll.c +++ b/src/object/cli_coll.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2024 Intel Corporation. 
+ * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -724,8 +725,8 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo if (rc == 0) { if (!shard->do_rebuilding && !shard->do_reintegrating) { tmp_tgt.dct_rank = shard->do_target_rank; - dct = bsearch(&tmp_tgt, coa->coa_dcts, coa->coa_dct_nr, - sizeof(tmp_tgt), &dc_coll_sort_cmp); + dct = bsearch(&tmp_tgt, coa->coa_dcts, coa->coa_dct_nr, + sizeof(tmp_tgt), dc_coll_sort_cmp); D_ASSERT(dct != NULL); goto gen_mbs; diff --git a/src/tests/ftest/recovery/cat_recov_core.yaml b/src/tests/ftest/recovery/cat_recov_core.yaml index 8e4553e30ec..7792dd408e2 100644 --- a/src/tests/ftest/recovery/cat_recov_core.yaml +++ b/src/tests/ftest/recovery/cat_recov_core.yaml @@ -6,6 +6,7 @@ server_config: engines_per_host: 2 engines: 0: + targets: 4 pinned_numa_node: 0 nr_xs_helpers: 0 log_file: daos_server0.log @@ -19,6 +20,7 @@ server_config: - D_LOG_STDERR_IN_LOG=1 storage: auto 1: + targets: 4 pinned_numa_node: 1 nr_xs_helpers: 0 log_file: daos_server1.log @@ -33,7 +35,9 @@ server_config: storage: auto transport_config: allow_insecure: true - system_ram_reserved: 64 +pool: + scm_size: 6G + nvme_size: 80G agent_config: transport_config: allow_insecure: true diff --git a/src/tests/ftest/recovery/container_cleanup.yaml b/src/tests/ftest/recovery/container_cleanup.yaml index 346378a1543..301ab604e84 100644 --- a/src/tests/ftest/recovery/container_cleanup.yaml +++ b/src/tests/ftest/recovery/container_cleanup.yaml @@ -14,7 +14,7 @@ server_config: system_ram_reserved: 1 pool: - size: 5G + size: 15G container: type: POSIX diff --git a/src/tests/ftest/recovery/container_list_consolidation.yaml b/src/tests/ftest/recovery/container_list_consolidation.yaml index 346378a1543..301ab604e84 100644 --- a/src/tests/ftest/recovery/container_list_consolidation.yaml +++ b/src/tests/ftest/recovery/container_list_consolidation.yaml @@ -14,7 +14,7 @@ 
server_config: system_ram_reserved: 1 pool: - size: 5G + size: 15G container: type: POSIX diff --git a/src/tests/ftest/recovery/ms_membership.yaml b/src/tests/ftest/recovery/ms_membership.yaml index 3e82da75b68..8377608ac8a 100644 --- a/src/tests/ftest/recovery/ms_membership.yaml +++ b/src/tests/ftest/recovery/ms_membership.yaml @@ -7,13 +7,18 @@ server_config: engines_per_host: 2 engines: 0: + targets: 4 pinned_numa_node: 0 nr_xs_helpers: 1 log_file: daos_server0.log storage: auto 1: + targets: 4 pinned_numa_node: 1 nr_xs_helpers: 1 log_file: daos_server1.log storage: auto system_ram_reserved: 1 + +pool: + size: 80G diff --git a/src/tests/ftest/recovery/pool_cleanup.yaml b/src/tests/ftest/recovery/pool_cleanup.yaml index 1b57a320f42..a776c8b7a5c 100644 --- a/src/tests/ftest/recovery/pool_cleanup.yaml +++ b/src/tests/ftest/recovery/pool_cleanup.yaml @@ -14,4 +14,4 @@ server_config: system_ram_reserved: 1 pool: - size: 5G + size: 15G diff --git a/src/tests/ftest/recovery/pool_list_consolidation.yaml b/src/tests/ftest/recovery/pool_list_consolidation.yaml index 37a548e1702..7336e97157d 100644 --- a/src/tests/ftest/recovery/pool_list_consolidation.yaml +++ b/src/tests/ftest/recovery/pool_list_consolidation.yaml @@ -17,7 +17,7 @@ setup: start_servers_once: False pool: - size: 60G + size: 100G container: control_method: daos diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c index 12b4a9958dc..c9304a7d1ff 100644 --- a/src/tests/suite/daos_cr.c +++ b/src/tests/suite/daos_cr.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2023-2024 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1310,7 +1311,7 @@ cr_engine_interaction(void **state) rc = cr_system_start(); assert_rc_equal(rc, 0); - /* Former connection for the pool has been evicted by checkre. Let's re-connect the pool. */ + /* Former connection for the pool has been evicted by checker. Let's re-connect the pool. 
*/ rc = cr_cont_get_label(state, &pool, &cont, true, &label); assert_rc_equal(rc, 0); @@ -1676,7 +1677,7 @@ cr_stop_engine_interaction(void **state) rc = cr_system_start(); assert_rc_equal(rc, 0); - /* Former connection for the pool has been evicted by checkre. Let's re-connect the pool. */ + /* Former connection for the pool has been evicted by checker. Let's re-connect the pool. */ rc = cr_cont_get_label(state, &pool, &cont, true, &label); assert_rc_equal(rc, 0); @@ -3785,6 +3786,62 @@ cr_maintenance_mode(void **state) cr_cleanup(arg, &pool, 1); } +/* + * 1. Exclude rank 0. + * 2. Create pool without inconsistency. + * 3. Start checker without options. + * 4. Query checker, it should be completed instead of being blocked. + * 5. Switch to normal mode and cleanup. + */ +static void +cr_lost_rank0(void **state) +{ + test_arg_t *arg = *state; + struct test_pool pool = {0}; + struct daos_check_info dci = {0}; + int rc; + + print_message("CR29: CR with rank 0 excluded at the beginning\n"); + + print_message("CR: excluding the rank 0 ...\n"); + rc = dmg_system_exclude_rank(dmg_config_file, 0); + assert_rc_equal(rc, 0); + + rc = cr_pool_create(state, &pool, false, TCC_NONE); + assert_rc_equal(rc, 0); + + rc = cr_system_stop(false); + assert_rc_equal(rc, 0); + + rc = cr_mode_switch(true); + assert_rc_equal(rc, 0); + + rc = cr_check_start(TCSF_RESET, 0, NULL, NULL); + assert_rc_equal(rc, 0); + + cr_ins_wait(1, &pool.pool_uuid, &dci); + + rc = cr_ins_verify(&dci, TCIS_COMPLETED); + assert_rc_equal(rc, 0); + + rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_CHECKED, 0, NULL, NULL, NULL); + assert_rc_equal(rc, 0); + + /* Reint the rank for subsequent test. 
*/ + rc = cr_rank_reint(0, true); + assert_rc_equal(rc, 0); + + rc = cr_mode_switch(false); + assert_rc_equal(rc, 0); + + rc = cr_system_start(); + assert_rc_equal(rc, 0); + + cr_dci_fini(&dci); + cr_cleanup(arg, &pool, 1); +} + +/* clang-format off */ static const struct CMUnitTest cr_tests[] = { { "CR1: start checker for specified pools", cr_start_specified, async_disable, test_case_teardown}, @@ -3842,7 +3899,10 @@ static const struct CMUnitTest cr_tests[] = { cr_handle_fail_pool2, async_disable, test_case_teardown}, { "CR28: maintenance mode after dry-run check", cr_maintenance_mode, async_disable, test_case_teardown}, + { "CR29: CR with rank 0 excluded at the beginning", + cr_lost_rank0, async_disable, test_case_teardown}, }; +/* clang-format on */ static int cr_setup(void **state)