From 2b96e5ade6280e7fc29d65c0255d876572c04b84 Mon Sep 17 00:00:00 2001 From: m-a-blommaert <61234439+m-a-blommaert@users.noreply.github.com> Date: Thu, 3 Apr 2025 00:38:24 +0200 Subject: [PATCH 01/18] Add pymoo_minimize.py pymoo_minimize.py contains some skeleton code for the integration with Pymoo --- kernel_tuner/strategies/pymoo_minimize.py | 65 +++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 kernel_tuner/strategies/pymoo_minimize.py diff --git a/kernel_tuner/strategies/pymoo_minimize.py b/kernel_tuner/strategies/pymoo_minimize.py new file mode 100644 index 000000000..c5405ee15 --- /dev/null +++ b/kernel_tuner/strategies/pymoo_minimize.py @@ -0,0 +1,65 @@ +"""The Pymoo strategy that uses a minimizer method for searching through the parameter space.""" + +import pymoo.optimize +import pymoo.core + +from kernel_tuner import util +from kernel_tuner.searchspace import Searchspace +from kernel_tuner.strategies.common import ( + CostFunc, + get_options, + get_strategy_docstring, + setup_method_arguments, + setup_method_options, +) + +# TODO: Add the PyMOO algorithms +supported_methods = [] + +_options = dict(method=(f"Pymoo optimization algorithm to use, choose any from {supported_methods}", "")) + +def tune(searchspace: Searchspace, runner, tuning_options): + + # TODO: + # The idea is to create a Problem, Algorithm, and Termination + # then use to run `pymoo.optimize.minimize` + # so I basically need to write some adapter/integration code + + method = get_options(tuning_options.strategy_options, _options)[0] + + # scale variables in x to make 'eps' relevant for multiple variables + cost_func = CostFunc(searchspace, tuning_options, runner, scaling=True) + + bounds, x0, _ = cost_func.get_bounds_x0_eps() + kwargs = setup_method_arguments(method, bounds) + options = setup_method_options(method, tuning_options) + + # TODO: make a pymoo.core.problem.Problem + # * use `searchspace`, `runner`, and `cost_func` to define the problem + # * use etc to define the problem + problem = None # pymoo.core.problem.Problem() + + # TODO: make a pymoo.core.algorithm.Algorithm + # * use `method` to select the algorithm + # * use etc to define the algorithm + algorithm = None # pymoo.core.algorithm.Algorithm() + + # TODO: + termination = None # pymoo.core.termination.Termination() + + # TODO: change the rest of the code to work with `Pymoo` + + opt_result = None + try: + opt_result = pymoo.optimize.minimize(problem, algorithm, termination) + except util.StopCriterionReached as e: + if tuning_options.verbose: + print(e) + + if opt_result and tuning_options.verbose: + print(opt_result.message) + + return cost_func.results + + +tune.__doc__ = get_strategy_docstring("Pymoo minimize", _options) From d1d1540bb1d603335fdb3e0201a67359e0f4cdf9 Mon Sep 17 00:00:00 2001 From: maric-a-b <61234439+maric-a-b@users.noreply.github.com> Date: Fri, 9 May 2025 00:37:45 +0200 Subject: [PATCH 02/18] add "error" field to result dict and convert existing code to use it --- kernel_tuner/core.py | 14 +++++++++++--- kernel_tuner/file_utils.py | 20 ++++++++++++-------- kernel_tuner/interface.py | 11 ++++++++++- kernel_tuner/runners/sequential.py | 10 ++++++++-- kernel_tuner/runners/simulation.py | 4 ++++ kernel_tuner/strategies/common.py | 20 +++++++++++++++----- kernel_tuner/util.py | 7 +++++++ 7 files changed, 67 insertions(+), 19 deletions(-) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 655779337..594ea3b78 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -480,11 +480,15 @@ def 
benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett print( f"skipping config {util.get_instance_string(instance.params)} reason: too many resources requested for launch" ) - result[objective] = util.RuntimeFailedConfig() + # result[objective] = util.RuntimeFailedConfig() + result['error'] = util.RuntimeFailedConfig() else: logging.debug("benchmark encountered runtime failure: " + str(e)) print("Error while benchmarking:", instance.name) raise e + + assert util.check_result_type(result), "The error in a result MUST be an actual error." + return result def check_kernel_output( @@ -571,7 +575,8 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, instance = self.create_kernel_instance(kernel_source, kernel_options, params, verbose) if isinstance(instance, util.ErrorConfig): - result[to.objective] = util.InvalidConfig() + # result[to.objective] = util.InvalidConfig() + result['error'] = util.InvalidConfig() else: # Preprocess the argument list. This is required to deal with `MixedPrecisionArray`s gpu_args = _preprocess_gpu_arguments(gpu_args, params) @@ -581,7 +586,8 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, start_compilation = time.perf_counter() func = self.compile_kernel(instance, verbose) if not func: - result[to.objective] = util.CompilationFailedConfig() + # result[to.objective] = util.CompilationFailedConfig() + result['error'] = util.CompilationFailedConfig() else: # add shared memory arguments to compiled module if kernel_options.smem_args is not None: @@ -635,6 +641,8 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, result["verification_time"] = last_verification_time or 0 result["benchmark_time"] = last_benchmark_time or 0 + assert util.check_result_type(result), "The error in a result MUST be an actual error." 
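+        # Note: check_result_type() (added to util.py in this patch) only verifies
+        # that, when an 'error' entry is present, it holds a util.ErrorConfig
+        # instance; it does not validate any other field of the result dict.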
+ return result def compile_kernel(self, instance, verbose): diff --git a/kernel_tuner/file_utils.py b/kernel_tuner/file_utils.py index e5d3dcb90..fb53d5956 100644 --- a/kernel_tuner/file_utils.py +++ b/kernel_tuner/file_utils.py @@ -32,20 +32,20 @@ def output_file_schema(target): return current_version, json_string -def get_configuration_validity(objective) -> str: +def get_configuration_validity(error) -> str: """Convert internal Kernel Tuner error to string.""" errorstring: str - if not isinstance(objective, util.ErrorConfig): + if not isinstance(error, util.ErrorConfig): errorstring = "correct" else: - if isinstance(objective, util.CompilationFailedConfig): + if isinstance(error, util.CompilationFailedConfig): errorstring = "compile" - elif isinstance(objective, util.RuntimeFailedConfig): + elif isinstance(error, util.RuntimeFailedConfig): errorstring = "runtime" - elif isinstance(objective, util.InvalidConfig): + elif isinstance(error, util.InvalidConfig): errorstring = "constraints" else: - raise ValueError(f"Unkown objective type {type(objective)}, value {objective}") + raise ValueError(f"Unkown error type {type(error)}, value {error}") return errorstring @@ -110,7 +110,8 @@ def store_output_file(output_filename: str, results, tune_params, objective="tim out["times"] = timings # encode the validity of the configuration - out["invalidity"] = get_configuration_validity(result[objective]) + # out["invalidity"] = get_configuration_validity(result[objective]) + out["invalidity"] = get_configuration_validity(result['error']) # Kernel Tuner does not support producing results of configs that fail the correctness check # therefore correctness is always 1 @@ -127,7 +128,10 @@ def store_output_file(output_filename: str, results, tune_params, objective="tim # In Kernel Tuner we currently support only one objective at a time, this can be a user-defined # metric that combines scores from multiple different quantities into a single value to support # multi-objective tuning however. 
- out["objectives"] = [objective] + # NOTE(maric): With PyMOO integrated we do support multi-objective tuning without scalarization + objectives = [objective] if isinstance(objective, str) else list(objective) + assert isinstance(objectives, list) + out["objectives"] = objectives # append to output output_data.append(out) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 97ae22848..5d2891218 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -587,7 +587,16 @@ def tune_kernel( _check_user_input(kernel_name, kernelsource, arguments, block_size_names) # default objective if none is specified - objective, objective_higher_is_better = get_objective_defaults(objective, objective_higher_is_better) + # if len(list(objective)) == 1: + # objective, objective_higher_is_better = get_objective_defaults(objective, objective_higher_is_better) + + if isinstance(objective, str): + objective = list(objective) + + if isinstance(objective_higher_is_better, bool): + objective_higher_is_better = list(objective_higher_is_better) + + assert len(list(objective)) == len(list(objective_higher_is_better)) # check for forbidden names in tune parameters util.check_tune_params_list(tune_params, observers, simulation_mode=simulation_mode) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index aeebd5116..95bfff500 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -90,13 +90,17 @@ def run(self, parameter_space, tuning_options): result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + assert util.check_result_type(result) + params.update(result) - if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): + # if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): + if 'error' in result: logging.debug('kernel configuration was skipped silently due to compile or runtime failure') # only compute metrics on configs that have not errored - if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): + # if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): + if 'error' in params: params = process_metrics(params, tuning_options.metrics) # get the framework time by estimating based on other times @@ -113,6 +117,8 @@ def run(self, parameter_space, tuning_options): # add configuration to cache store_cache(x_int, params, tuning_options) + assert util.check_result_type(params) + # all visited configurations are added to results to provide a trace for optimization strategies results.append(params) diff --git a/kernel_tuner/runners/simulation.py b/kernel_tuner/runners/simulation.py index 22c7c667c..5134f0fbd 100644 --- a/kernel_tuner/runners/simulation.py +++ b/kernel_tuner/runners/simulation.py @@ -90,6 +90,8 @@ def run(self, parameter_space, tuning_options): if tuning_options.cache and x_int in tuning_options.cache: result = tuning_options.cache[x_int].copy() + assert util.check_result_type(result) + # Simulate behavior of sequential runner that when a configuration is # served from the cache by the sequential runner, the compile_time, # verification_time, and benchmark_time are set to 0. 
@@ -124,6 +126,8 @@ def run(self, parameter_space, tuning_options): self.start_time = perf_counter() result['framework_time'] = total_time - self.last_strategy_time + assert util.check_result_type(result) + results.append(result) continue diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index d01eae937..d7acd4ed9 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -92,7 +92,10 @@ def __call__(self, x, check_restrictions=True): legal = util.check_restrictions(self.searchspace.restrictions, params_dict, self.tuning_options.verbose) if not legal: result = params_dict - result[self.tuning_options.objective] = util.InvalidConfig() + # result[self.tuning_options.objective] = util.InvalidConfig() + result['error'] = util.InvalidConfig() + + assert legal == ('error' not in result), "A legal config MUST NOT have an error result." if legal: # compile and benchmark this instance @@ -109,10 +112,17 @@ def __call__(self, x, check_restrictions=True): self.runner.last_strategy_start_time = perf_counter() # get numerical return value, taking optimization direction into account - return_value = result[self.tuning_options.objective] or sys.float_info.max - return_value = return_value if not self.tuning_options.objective_higher_is_better else -return_value - - return return_value + return_values = [] + for obj, higher_is_better in zip(self.tuning_options.objective, self.tuning_options.objective_higher_is_better): + return_value = result[obj] if 'error' not in result else sys.float_info.max + return_value = return_value if not higher_is_better else -return_value + return_values.append(return_value) + + if len(return_values) == 1: + return return_values[0] + else: + # NOTE: MAYBE make this a tuple() + return return_values def get_bounds_x0_eps(self): """Compute bounds, x0 (the initial guess), and eps.""" diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 710b59e0d..054ef5453 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -79,6 +79,13 @@ def default(self, obj): return super(NpEncoder, self).default(obj) +def check_result_type(r): + "Check if the result has the right format." + if 'error' in r: + return isinstance(r['error'], ErrorConfig) + return True + + class TorchPlaceHolder: def __init__(self): self.Tensor = Exception # using Exception here as a type that will never be among kernel arguments From 133d35cc4913e77fb41376e2ba9f7ed1c5e2160d Mon Sep 17 00:00:00 2001 From: maric-a-b <61234439+maric-a-b@users.noreply.github.com> Date: Fri, 9 May 2025 00:39:54 +0200 Subject: [PATCH 03/18] add rough support for multiple objectis through pymoo --- kernel_tuner/interface.py | 6 +- kernel_tuner/strategies/pymoo_minimize.py | 91 ++++++++++++++++++++--- 2 files changed, 83 insertions(+), 14 deletions(-) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 5d2891218..f7adef3e2 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -57,6 +57,7 @@ pso, random_sample, simulated_annealing, + pymoo_minimize, ) strategy_map = { @@ -75,6 +76,7 @@ "simulated_annealing": simulated_annealing, "firefly_algorithm": firefly_algorithm, "bayes_opt": bayes_opt, + "pymoo_minimize": pymoo_minimize, } @@ -425,7 +427,7 @@ def __deepcopy__(self, _): """Optimization objective to sort results on, consisting of a string that also occurs in results as a metric or observed quantity, default 'time'. 
Please see :ref:`objectives`.""", - "string", + "str | list[str]", ), ), ( @@ -433,7 +435,7 @@ def __deepcopy__(self, _): ( """boolean that specifies whether the objective should be maximized (True) or minimized (False), default False.""", - "bool", + "bool | list[bool]", ), ), ( diff --git a/kernel_tuner/strategies/pymoo_minimize.py b/kernel_tuner/strategies/pymoo_minimize.py index c5405ee15..c31ff0797 100644 --- a/kernel_tuner/strategies/pymoo_minimize.py +++ b/kernel_tuner/strategies/pymoo_minimize.py @@ -1,7 +1,6 @@ """The Pymoo strategy that uses a minimizer method for searching through the parameter space.""" -import pymoo.optimize -import pymoo.core +import numpy as np from kernel_tuner import util from kernel_tuner.searchspace import Searchspace @@ -12,6 +11,14 @@ setup_method_arguments, setup_method_options, ) +from kernel_tuner.strategies.genetic_algorithm import mutate + +from pymoo.optimize import minimize +from pymoo.core.problem import ElementwiseProblem +from pymoo.core.sampling import Sampling +from pymoo.core.mutation import Mutation +from pymoo.operators.crossover.ux import UX +from pymoo.algorithms.moo.nsga2 import NSGA2 # TODO: Add the PyMOO algorithms supported_methods = [] @@ -28,38 +35,98 @@ def tune(searchspace: Searchspace, runner, tuning_options): method = get_options(tuning_options.strategy_options, _options)[0] # scale variables in x to make 'eps' relevant for multiple variables - cost_func = CostFunc(searchspace, tuning_options, runner, scaling=True) + cost_func = CostFunc(searchspace, tuning_options, runner, scaling=False) bounds, x0, _ = cost_func.get_bounds_x0_eps() kwargs = setup_method_arguments(method, bounds) options = setup_method_options(method, tuning_options) - # TODO: make a pymoo.core.problem.Problem - # * use `searchspace`, `runner`, and `cost_func` to define the problem - # * use etc to define the problem - problem = None # pymoo.core.problem.Problem() + problem = KernelTunerProblem( + f = cost_func, + n_var = len(tuning_options.tune_params), + n_obj = len(tuning_options.objective), + ) # TODO: make a pymoo.core.algorithm.Algorithm # * use `method` to select the algorithm # * use etc to define the algorithm - algorithm = None # pymoo.core.algorithm.Algorithm() + + # algorithm_type = get_algorithm + algorithm = NSGA2( + pop_size=100, + sampling=SearchspaceRandomSampling(searchspace), + crossover=UX(prob=0.6), + mutation=MutateToNeighbor(searchspace, prob=0.5), + ) # TODO: + # - CostFunc throws exception when done, so isn't really needed termination = None # pymoo.core.termination.Termination() - # TODO: change the rest of the code to work with `Pymoo` - opt_result = None try: - opt_result = pymoo.optimize.minimize(problem, algorithm, termination) + opt_result = minimize(problem, algorithm, termination) except util.StopCriterionReached as e: + print(f"Stopped because of {e}") if tuning_options.verbose: print(e) if opt_result and tuning_options.verbose: - print(opt_result.message) + print(f"{opt_result.message=}") + # print(f"{opt_result.message=}") + # print(f"{cost_func.results=}") return cost_func.results tune.__doc__ = get_strategy_docstring("Pymoo minimize", _options) + + +class KernelTunerProblem(ElementwiseProblem): + def __init__(self, f, n_var, n_obj): + super().__init__( + n_var = n_var, + n_obj = n_obj, + ) + self.f = f + + def _evaluate(self, x, out, *args, **kwargs): + F = self.f(x) + out["F"] = F + + +class SearchspaceRandomSampling(Sampling): + def __init__(self, searchspace): + super().__init__() + self.ss = searchspace + + def 
_do(self, problem, n_samples, **kwargs): + X = self.ss.get_random_sample(n_samples) + return X + + +class MutateToNeighbor(Mutation): + def __init__( + self, + searchspace : Searchspace, + prob=1.0, + prob_var=None, + **kwargs + ): + super().__init__( + prob=prob, + prob_var=prob_var, + **kwargs, + ) + self.ss = searchspace + + def _do(self, problem, X, **kwargs): + Xm = np.empty_like(X) + for i in range(X.shape[0]): + neighbors = self.ss.get_neighbors_indices_no_cache(tuple(X[i]), neighbor_method="Hamming") + # copy X[i] to result in case there are no neighbors + if len(neighbors) > 0: + Xm[i] = neighbors[np.random.choice(len(neighbors))] + else: + Xm[i] = X[i] + return Xm From 541972cb2c60a0dac395a2e1a4b6e78eee0e47d7 Mon Sep 17 00:00:00 2001 From: maric-a-b <61234439+maric-a-b@users.noreply.github.com> Date: Wed, 13 Aug 2025 21:42:26 +0200 Subject: [PATCH 04/18] pareto front algo --- kernel_tuner/util.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 054ef5453..297538b30 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -400,11 +400,43 @@ def get_best_config(results, objective, objective_higher_is_better=False): ignore_val = sys.float_info.max if not objective_higher_is_better else -sys.float_info.max best_config = func( results, - key=lambda x: x[objective] if isinstance(x[objective], float) else ignore_val, + key=lambda x: x[objective] if 'error' not in x and isinstance(x[objective], float) else ignore_val, ) return best_config +def get_pareto_front(results, objective, objective_higher_is_better): + assert isinstance(objective, list) + + nonerror_results = list(filter(lambda x: "error" not in x, results)) + front = [] + + # A point `p` in a finite set of points `S` is said to be maximal or non-dominated if there is no other point `q` in `S` whose `q(i)` are all >= `p(i)` + # So for all q there must be a q(i) such that q(i) < p(i) + for p in nonerror_results: + p_nondom = True + for q in nonerror_results: + if p is q: + continue + # \forall(i): q(i) >= p(i)? + flag = True + for i, higher_is_better in zip(objective, objective_higher_is_better): + p_i, q_i = p[i], q[i] + if not higher_is_better: + p_i, q_i = -p_i, -q_i + if q_i < p_i: + flag = False + break + if flag: + p_nondom = False + break + if p_nondom: + p["optimal"] = True + front.append(p) + + return front + + def get_config_string(params, keys=None, units=None): """Return a compact string representation of a measurement.""" From daca5900603d61f010f4590bf986d418f4f47a6a Mon Sep 17 00:00:00 2001 From: maric-a-b <61234439+maric-a-b@users.noreply.github.com> Date: Wed, 5 Nov 2025 10:55:07 +0100 Subject: [PATCH 05/18] forgot to commit for a very long time... 
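The commit message above gives no usage example, so here is a minimal sketch of how the additions in this patch fit together: tune_cache() replays an existing cache file in simulation mode, the new objectives argument maps each objective to its direction, and the Pymoo strategy consumes them. The cache path and the "GFLOP/s" objective are hypothetical and assume both quantities are recorded in the cache:

    from kernel_tuner import tune_cache

    results, env = tune_cache(
        "convolution_A100.json",                      # hypothetical cache file
        strategy="pymoo_minimize",
        strategy_options={"pop_size": 100, "max_fevals": 2000},
        objectives={"time": False, "GFLOP/s": True},  # objective -> higher_is_better
    )
    # env["best_config"] holds the Pareto front when more than one objective is given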
--- .gitignore | 3 +- kernel_tuner/__init__.py | 2 +- kernel_tuner/interface.py | 65 ++++- kernel_tuner/runners/simulation.py | 3 +- kernel_tuner/strategies/common.py | 29 ++- kernel_tuner/strategies/pymoo_minimize.py | 284 ++++++++++++++++------ kernel_tuner/util.py | 140 ++++++++--- 7 files changed, 400 insertions(+), 126 deletions(-) diff --git a/.gitignore b/.gitignore index 43bd95c2b..5034957b4 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ deploy_key temp_*.* .python-version .nox +.venv ### Visual Studio Code ### !.vscode/settings.json @@ -37,4 +38,4 @@ temp_*.* .LSOverride .vscode -.idea \ No newline at end of file +.idea diff --git a/kernel_tuner/__init__.py b/kernel_tuner/__init__.py index b64d69813..3f575faa0 100644 --- a/kernel_tuner/__init__.py +++ b/kernel_tuner/__init__.py @@ -1,5 +1,5 @@ from kernel_tuner.integration import store_results, create_device_targets -from kernel_tuner.interface import tune_kernel, run_kernel +from kernel_tuner.interface import tune_kernel, tune_cache, run_kernel from importlib.metadata import version diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index f7adef3e2..0c0282ad0 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -579,6 +579,7 @@ def tune_kernel( observers=None, objective=None, objective_higher_is_better=None, + objectives=None, ): start_overhead_time = perf_counter() if log: @@ -593,10 +594,17 @@ def tune_kernel( # objective, objective_higher_is_better = get_objective_defaults(objective, objective_higher_is_better) if isinstance(objective, str): - objective = list(objective) + objective = [objective] if isinstance(objective_higher_is_better, bool): - objective_higher_is_better = list(objective_higher_is_better) + objective_higher_is_better = [objective_higher_is_better] + + if objectives: + if isinstance(objectives, dict): + objective = list(objectives.keys()) + objective_higher_is_better = list(objectives.values()) + else: + raise ValueError("objectives should be a dict of (objective, higher_is_better) pairs") assert len(list(objective)) == len(list(objective_higher_is_better)) @@ -693,13 +701,34 @@ def tune_kernel( # finished iterating over search space if results: # checks if results is not empty - best_config = util.get_best_config(results, objective, objective_higher_is_better) - # add the best configuration to env - env['best_config'] = best_config - if not device_options.quiet: - units = getattr(runner, "units", None) - print("best performing configuration:") - util.print_config_output(tune_params, best_config, device_options.quiet, metrics, units) + if len(list(objective)) == 1: + objective = objective[0] + objective_higher_is_better = objective_higher_is_better[0] + best_config = util.get_best_config(results, objective, objective_higher_is_better) + print(best_config) + # add the best configuration to env + env['best_config'] = best_config + if not device_options.quiet: + units = getattr(runner, "units", None) + print(f"\nBEST PERFORMING CONFIGURATION FOR OBJECTIVE {objective}:") + keys = list(tune_params.keys()) + keys += [objective] + if metrics: + keys += list(metrics.keys()) + print(util.get_config_string(best_config, keys, units)) + else: + pareto_front = util.get_pareto_results(results, objective, objective_higher_is_better) + # add the best configuration to env + env['best_config'] = pareto_front + if not device_options.quiet: + units = getattr(runner, "units", None) + keys = list(tune_params.keys()) + keys += list(objective) + if metrics: + keys += 
list(metrics.keys) + print(f"\nBEST PERFORMING CONFIGURATIONS FOR OBJECTIVES: {objective}:") + for best_config in pareto_front: + print(util.get_config_string(best_config, keys, units)) elif not device_options.quiet: print("no results to report") @@ -714,6 +743,24 @@ def tune_kernel( tune_kernel.__doc__ = _tune_kernel_docstring + +def tune_cache( + cache, + restrictions = None, + **kwargs, +): + tune_args = util.tune_args_from_cache_file(cache) + if restrictions: + new_restrictions = [tune_args['restrictions']] + if isinstance(restrictions, list): + new_restrictions.extend(restrictions) + else: + new_restrictions.append(restrictions) + tune_args['restrictions'] = new_restrictions + tune_args.update(kwargs) + return tune_kernel(simulation_mode=True, **tune_args) + + _run_kernel_docstring = """Compile and run a single kernel Compiles and runs a single kernel once, given a specific instance of the kernels tuning parameters. diff --git a/kernel_tuner/runners/simulation.py b/kernel_tuner/runners/simulation.py index 5134f0fbd..1ede287dd 100644 --- a/kernel_tuner/runners/simulation.py +++ b/kernel_tuner/runners/simulation.py @@ -47,7 +47,8 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob :type iterations: int """ self.quiet = device_options.quiet - self.dev = SimulationDevice(1024, dict(device_name="Simulation"), self.quiet) + # NOTE(maric): had to increase max_threas so the default restraints would pass + self.dev = SimulationDevice(1_000_000_000, dict(device_name="Simulation"), self.quiet) self.kernel_source = kernel_source self.simulation_mode = True diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index d7acd4ed9..e47f1edb0 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -60,11 +60,15 @@ def __init__(self, searchspace: Searchspace, tuning_options, runner, *, scaling= self.scaling = scaling self.searchspace = searchspace self.results = [] + self.total_config_count = 0 + self.illegal_config_count = 0 def __call__(self, x, check_restrictions=True): """Cost function used by almost all strategies.""" self.runner.last_strategy_time = 1000 * (perf_counter() - self.runner.last_strategy_start_time) + self.total_config_count += 1 + # error value to return for numeric optimizers that need a numerical value logging.debug('_cost_func called') logging.debug('x: ' + str(x)) @@ -94,10 +98,12 @@ def __call__(self, x, check_restrictions=True): result = params_dict # result[self.tuning_options.objective] = util.InvalidConfig() result['error'] = util.InvalidConfig() - - assert legal == ('error' not in result), "A legal config MUST NOT have an error result." + self.illegal_config_count += 1 if legal: + assert ('error' not in result), "A legal config MUST NOT have an error result." 
+ if 'error' in result: exit() + # compile and benchmark this instance res = self.runner.run([params], self.tuning_options) result = res[0] @@ -111,18 +117,17 @@ def __call__(self, x, check_restrictions=True): # upon returning from this function control will be given back to the strategy, so reset the start time self.runner.last_strategy_start_time = perf_counter() - # get numerical return value, taking optimization direction into account - return_values = [] - for obj, higher_is_better in zip(self.tuning_options.objective, self.tuning_options.objective_higher_is_better): - return_value = result[obj] if 'error' not in result else sys.float_info.max - return_value = return_value if not higher_is_better else -return_value - return_values.append(return_value) + # get the cost of the result + cost_vec = util.get_result_cost( + result, + self.tuning_options.objective, + self.tuning_options.objective_higher_is_better + ) - if len(return_values) == 1: - return return_values[0] + if len(cost_vec) == 1: + return cost_vec[0] else: - # NOTE: MAYBE make this a tuple() - return return_values + return cost_vec def get_bounds_x0_eps(self): """Compute bounds, x0 (the initial guess), and eps.""" diff --git a/kernel_tuner/strategies/pymoo_minimize.py b/kernel_tuner/strategies/pymoo_minimize.py index c31ff0797..25060c04c 100644 --- a/kernel_tuner/strategies/pymoo_minimize.py +++ b/kernel_tuner/strategies/pymoo_minimize.py @@ -2,131 +2,277 @@ import numpy as np +from pymoo.algorithms.moo.nsga2 import NSGA2 +from pymoo.algorithms.moo.nsga3 import NSGA3 +from pymoo.core.mutation import Mutation +from pymoo.core.problem import ElementwiseProblem +from pymoo.core.sampling import Sampling +from pymoo.core.termination import NoTermination, Termination +from pymoo.core.repair import Repair +from pymoo.operators.crossover.ux import UniformCrossover +from pymoo.operators.crossover.pntx import TwoPointCrossover +from pymoo.optimize import minimize +from pymoo.util.ref_dirs import get_reference_directions +from pymoo.indicators.igd import IGD + from kernel_tuner import util from kernel_tuner.searchspace import Searchspace from kernel_tuner.strategies.common import ( CostFunc, - get_options, get_strategy_docstring, setup_method_arguments, - setup_method_options, ) -from kernel_tuner.strategies.genetic_algorithm import mutate - -from pymoo.optimize import minimize -from pymoo.core.problem import ElementwiseProblem -from pymoo.core.sampling import Sampling -from pymoo.core.mutation import Mutation -from pymoo.operators.crossover.ux import UX -from pymoo.algorithms.moo.nsga2 import NSGA2 # TODO: Add the PyMOO algorithms -supported_methods = [] +supported_methods = [ + "NSGA2", + "NSGA3", +] -_options = dict(method=(f"Pymoo optimization algorithm to use, choose any from {supported_methods}", "")) +_options = { + "method": (f"Pymoo optimization algorithm to use, choose any from {supported_methods}", "NSGA2"), + "pop_size": ("Initial population size", 100), +} -def tune(searchspace: Searchspace, runner, tuning_options): - # TODO: - # The idea is to create a Problem, Algorithm, and Termination - # then use to run `pymoo.optimize.minimize` - # so I basically need to write some adapter/integration code +def tune( + searchspace: Searchspace, + runner, + tuning_options, +): + strategy_options = tuning_options.strategy_options + + if "method" in strategy_options: + method = strategy_options["method"] + else: + (_, method) = _options["method"] + print(f"{method=}") - method = get_options(tuning_options.strategy_options, _options)[0] 
+ if "pop_size" in strategy_options: + pop_size = strategy_options["pop_size"] + else: + (_, pop_size) = _options["pop_size"] + print(f"{pop_size=}") # scale variables in x to make 'eps' relevant for multiple variables cost_func = CostFunc(searchspace, tuning_options, runner, scaling=False) bounds, x0, _ = cost_func.get_bounds_x0_eps() kwargs = setup_method_arguments(method, bounds) - options = setup_method_options(method, tuning_options) - problem = KernelTunerProblem( - f = cost_func, - n_var = len(tuning_options.tune_params), - n_obj = len(tuning_options.objective), + problem = TuningProblem( + cost_func=cost_func, + n_var=len(tuning_options.tune_params), + n_obj=len(tuning_options.objective), ) - # TODO: make a pymoo.core.algorithm.Algorithm - # * use `method` to select the algorithm - # * use etc to define the algorithm - - # algorithm_type = get_algorithm - algorithm = NSGA2( - pop_size=100, - sampling=SearchspaceRandomSampling(searchspace), - crossover=UX(prob=0.6), - mutation=MutateToNeighbor(searchspace, prob=0.5), - ) + # algorithm_type = get_algorithm(method) + algorithm = None + if method == "NSGA2": + algorithm = NSGA2( + pop_size = pop_size, + sampling = SearchspaceRandomSampling(searchspace), + crossover = TwoPointCrossover(), + mutation = MutateToNeighbor(searchspace, prob = 0.5), + repair = RepairConfig(), + # save_history = True, + ) + elif method == "NSGA3": + algorithm = NSGA3( + pop_size = pop_size, + ref_dirs = get_reference_directions("das-dennis", len(tuning_options.objective), n_partitions = 26), + sampling = SearchspaceRandomSampling(searchspace), + crossover = UniformCrossover(prob = 0.6), + mutation = MutateToNeighbor(searchspace, prob = 0.5), + # repair = MyRepair(), + # save_history = True, + ) # TODO: # - CostFunc throws exception when done, so isn't really needed - termination = None # pymoo.core.termination.Termination() + termination = None + if "max_fevals" in tuning_options.strategy_options or "time_limit" in tuning_options.strategy_options: + termination = NoTermination() + + pf = problem.pareto_front() + igd_ind = IGD(pf, zero_to_one=True) - opt_result = None try: - opt_result = minimize(problem, algorithm, termination) + _ = algorithm.setup( + problem, + # termination = termination, + termination=("n_gen", 20), + seed=1, + verbose=True, + ) + + while algorithm.has_next(): + algorithm.next() + + illegal_count = cost_func.illegal_config_count + total_count = cost_func.total_config_count + print(f"config valid: {total_count - illegal_count}/{total_count} ({100 * (1 - (illegal_count / total_count)):.4}%)") + + print("IGD: ", igd_ind(algorithm.opt.get("F"))) + except util.StopCriterionReached as e: - print(f"Stopped because of {e}") if tuning_options.verbose: - print(e) + print(f"Stopped because of {e}") + + opt_result = cost_func.results if opt_result and tuning_options.verbose: print(f"{opt_result.message=}") - # print(f"{opt_result.message=}") - # print(f"{cost_func.results=}") - return cost_func.results + return opt_result tune.__doc__ = get_strategy_docstring("Pymoo minimize", _options) -class KernelTunerProblem(ElementwiseProblem): - def __init__(self, f, n_var, n_obj): +class TuningProblem(ElementwiseProblem): + def __init__( + self, + cost_func: CostFunc, + n_var, + n_obj, + **kwargs, + ): super().__init__( n_var = n_var, n_obj = n_obj, + **kwargs, ) - self.f = f + self.cost_func = cost_func + self.searchspace = cost_func.searchspace + self.tuning_options = cost_func.tuning_options - def _evaluate(self, x, out, *args, **kwargs): - F = self.f(x) + 
def _evaluate( + self, + x, + out, + *args, + **kwargs, + ): + F = self.cost_func(x) out["F"] = F + def _calc_pareto_front( + self, + *args, + **kwargs + ) -> np.ndarray | None: + # Can only compute the pareto front if we are in simulation mode. + if not self.tuning_options.simulation_mode: + return None + + objectives = self.tuning_options.objective + higher_is_better = self.tuning_options.objective_higher_is_better + pareto_results = util.get_pareto_results( + list(self.tuning_options.cache.values()), + objectives, + higher_is_better, + ) + + pareto_front_list = list() + for res in pareto_results: + cost = util.get_result_cost(res, objectives, higher_is_better) + pareto_front_list.append(cost) + + return np.array(pareto_front_list) + + +class TuningTermination(Termination): + def __init__( + self, + tuning_options, + ): + super().__init__() + self.tuning_options = tuning_options + self.reason = None + + def _update( + self, + algorithm, + ): + try: + util.check_stop_criterion(self.tuning_options) + print(f"progress: {len(self.tuning_options.unique_results) / self.tuning_options.max_fevals}") + return 0.0 + except util.StopCriterionReached as e: + self.terminate() + self.reason = e + return 1.0 + class SearchspaceRandomSampling(Sampling): - def __init__(self, searchspace): + def __init__( + self, + searchspace, + ): super().__init__() - self.ss = searchspace + self.searchspace = searchspace - def _do(self, problem, n_samples, **kwargs): - X = self.ss.get_random_sample(n_samples) + def _do( + self, + problem, + n_samples: int, + **kwargs, + ): + X = self.searchspace.get_random_sample(n_samples) return X class MutateToNeighbor(Mutation): def __init__( - self, - searchspace : Searchspace, - prob=1.0, - prob_var=None, - **kwargs - ): + self, + searchspace: Searchspace, + prob=1.0, + prob_var=None, + **kwargs + ): super().__init__( prob=prob, prob_var=prob_var, **kwargs, ) - self.ss = searchspace - - def _do(self, problem, X, **kwargs): - Xm = np.empty_like(X) - for i in range(X.shape[0]): - neighbors = self.ss.get_neighbors_indices_no_cache(tuple(X[i]), neighbor_method="Hamming") - # copy X[i] to result in case there are no neighbors - if len(neighbors) > 0: - Xm[i] = neighbors[np.random.choice(len(neighbors))] - else: - Xm[i] = X[i] - return Xm + self.searchspace = searchspace + + def _do( + self, + problem: TuningProblem, + X: np.ndarray, + **kwargs, + ): + for ind_index in range(X.shape[0]): + params_config_tuple = tuple(X[ind_index]) + neighbors_indices = self.searchspace.get_neighbors_indices_no_cache(params_config_tuple, neighbor_method="Hamming") + if len(neighbors_indices) > 0: + neighbor_index = neighbors_indices[np.random.choice(len(neighbors_indices))] + neighbor = self.searchspace.get_param_configs_at_indices([neighbor_index])[0] + X[ind_index] = np.array(neighbor) + + return X + + +class RepairConfig(Repair): + + def _do( + self, + problem: TuningProblem, + X : np.ndarray, + **kwargs, + ) -> np.ndarray: + for ind_index in range(X.shape[0]): + params_config_tuple = tuple(X[ind_index]) + if problem.searchspace.is_param_config_valid(params_config_tuple): + continue + for neighbor_method in ["strictly-adjacent", "adjacent", "Hamming"]: + neighbors_indices = problem.searchspace.get_neighbors_indices_no_cache(params_config_tuple, neighbor_method) + if len(neighbors_indices) > 0: + neighbor_index = neighbors_indices[np.random.choice(len(neighbors_indices))] + neighbor = problem.searchspace.get_param_configs_at_indices([neighbor_index])[0] + X[ind_index] = np.array(neighbor) + break + + 
return X diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 297538b30..3cced7390 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -79,8 +79,27 @@ def default(self, obj): return super(NpEncoder, self).default(obj) +def get_result_cost( + result: dict, + objectives: list[str], + objective_higher_is_better: list[bool] +) -> list[float]: + """Returns the cost of a result, taking the objective directions into account.""" + # return the highest cost for invalid results + if 'error' in result: + return [sys.float_info.max] * len(objectives) + + cost_vec = list() + for objective, is_maximizer in zip(objectives, objective_higher_is_better): + objective_value = result[objective] + cost = -objective_value if is_maximizer else objective_value + cost_vec.append(cost) + + return cost_vec + + def check_result_type(r): - "Check if the result has the right format." + """Check if the result has the right format.""" if 'error' in r: return isinstance(r['error'], ErrorConfig) return True @@ -198,10 +217,20 @@ def check_argument_list(kernel_name, kernel_string, args): def check_stop_criterion(to): """Checks if max_fevals is reached or time limit is exceeded.""" - if "max_fevals" in to and len(to.unique_results) >= to.max_fevals: - raise StopCriterionReached("max_fevals reached") - if "time_limit" in to and (((time.perf_counter() - to.start_time) + (to.simulated_time * 1e-3)) > to.time_limit): - raise StopCriterionReached("time limit exceeded") + if "max_fevals" in to: + if to.verbose: + print(f"Progress: {len(to.unique_results)/to.max_fevals}") + if len(to.unique_results) >= to.max_fevals: + raise StopCriterionReached("max_fevals reached") + if "time_limit" in to: + # if to.verbose: + # print(f"Progress: {((time.perf_counter() - to.start_time) + (to.simulated_time * 1e-3)) / to.time_limit}") + # if (((time.perf_counter() - to.start_time) + (to.simulated_time * 1e-3)) > to.time_limit): + # raise StopCriterionReached("time limit exceeded") + if to.verbose: + print(f"Progress: {((time.perf_counter() - to.start_time)) / to.time_limit}") + if (((time.perf_counter() - to.start_time)) > to.time_limit): + raise StopCriterionReached("time limit exceeded") def check_tune_params_list(tune_params, observers, simulation_mode=False): @@ -405,35 +434,37 @@ def get_best_config(results, objective, objective_higher_is_better=False): return best_config -def get_pareto_front(results, objective, objective_higher_is_better): - assert isinstance(objective, list) +def get_pareto_results( + results: list[dict], + objectives: list[str], + objective_higher_is_better: list[bool], + mark_optima=True +): + assert isinstance(results, list) + assert isinstance(objectives, list) - nonerror_results = list(filter(lambda x: "error" not in x, results)) - front = [] + cost_points = list() + for res in results: + cost_point = get_result_cost(res, objectives, objective_higher_is_better) + cost_points.append(cost_point) + + cost_points = np.asarray(cost_points, dtype=float) + is_efficient = np.ones(cost_points.shape[0], dtype=bool) + + # A point `p` in a finite set of points `S` is said to be non-dominated if there is no other point `q` in `S` where `q(i) <= p(i)` for all `i` + for idx, cost_point in enumerate(cost_points): + if not is_efficient[idx]: + continue + is_efficient[is_efficient] = np.any(cost_points[is_efficient] <= cost_point, axis=1) + + # select and mark the optimal points + front = list() + for idx in np.flatnonzero(is_efficient): + res = results[idx] + if mark_optima: + res['optimal'] = True + 
front.append(res) - # A point `p` in a finite set of points `S` is said to be maximal or non-dominated if there is no other point `q` in `S` whose `q(i)` are all >= `p(i)` - # So for all q there must be a q(i) such that q(i) < p(i) - for p in nonerror_results: - p_nondom = True - for q in nonerror_results: - if p is q: - continue - # \forall(i): q(i) >= p(i)? - flag = True - for i, higher_is_better in zip(objective, objective_higher_is_better): - p_i, q_i = p[i], q[i] - if not higher_is_better: - p_i, q_i = -p_i, -q_i - if q_i < p_i: - flag = False - break - if flag: - p_nondom = False - break - if p_nondom: - p["optimal"] = True - front.append(p) - return front @@ -1197,7 +1228,8 @@ def process_cache(cache, kernel_options, tuning_options, runner): # if file exists else: - cached_data = read_cache(cache) + # cached_data = read_cache(cache) + cached_data = read_cache(cache, open_cache=(not runner.simulation_mode)) # if in simulation mode, use the device name from the cache file as the runner device name if runner.simulation_mode: @@ -1327,3 +1359,45 @@ def cuda_error_check(error): if error != nvrtc.nvrtcResult.NVRTC_SUCCESS: _, desc = nvrtc.nvrtcGetErrorString(error) raise RuntimeError(f"NVRTC error: {desc.decode()}") + + +def restriction_from_cache(cache: dict): + param_config_string_set = set( + param_config_string + for param_config_string, result in cache['cache'].items() + if 'error' not in result + ) + + # print(f"WTH: {len(config_strings)}/{len(list(cache['cache'].keys()))}") + + def _restrictions_func(params_config: dict) -> bool: + nonlocal param_config_string_set + + param_config_string = ",".join(map(str, params_config.values())) + return param_config_string in param_config_string_set + + return _restrictions_func + + +def tune_args_from_cache_file(cache_file_path) -> dict: + with open(cache_file_path, mode="r") as cache_file: + cache = json.load(cache_file) + + tune_args = dict( + kernel_name=cache['kernel_name'], + kernel_source="", + problem_size=tuple(cache['problem_size']), + arguments=[], + tune_params=cache['tune_params'], + restrictions=restriction_from_cache(cache), + cache=cache_file_path, + ) + + return tune_args + + +def results_from_cache_file(cache_file_path) -> list[dict]: + with open(cache_file_path, mode="r") as cache_file: + cache = json.load(cache_file) + + return list(cache['cache'].values()) From ee08aaf76412123613f088757c6978ef10d15f03 Mon Sep 17 00:00:00 2001 From: maric-a-b <61234439+maric-a-b@users.noreply.github.com> Date: Sun, 4 Jan 2026 01:20:22 +0100 Subject: [PATCH 06/18] Replace the error key "error" with "__error__" to make collision with user defined params and metrics far less likely. 
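To make the motivation concrete (a hypothetical case, not from the codebase, assuming "from kernel_tuner import util"): a user-defined metric or tunable parameter that happens to be named "error" would have collided with the old sentinel key, whereas the dunder name stays out of the user's namespace:

    # user data: here "error" is a legitimate metric value, not a failure marker
    result = {"block_size_x": 128, "time": 1.84, "error": 0.0013}

    # tuner sentinel: a failed configuration is now marked unambiguously
    failed = {"block_size_x": 1024, "__error__": util.RuntimeFailedConfig()}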
--- kernel_tuner/core.py | 8 ++++---- kernel_tuner/file_utils.py | 2 +- kernel_tuner/runners/sequential.py | 4 ++-- kernel_tuner/strategies/common.py | 3 +-- kernel_tuner/util.py | 10 +++++----- 5 files changed, 13 insertions(+), 14 deletions(-) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 594ea3b78..6fb478ce2 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -481,14 +481,14 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett f"skipping config {util.get_instance_string(instance.params)} reason: too many resources requested for launch" ) # result[objective] = util.RuntimeFailedConfig() - result['error'] = util.RuntimeFailedConfig() + result['__error__'] = util.RuntimeFailedConfig() else: logging.debug("benchmark encountered runtime failure: " + str(e)) print("Error while benchmarking:", instance.name) raise e assert util.check_result_type(result), "The error in a result MUST be an actual error." - + return result def check_kernel_output( @@ -576,7 +576,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, instance = self.create_kernel_instance(kernel_source, kernel_options, params, verbose) if isinstance(instance, util.ErrorConfig): # result[to.objective] = util.InvalidConfig() - result['error'] = util.InvalidConfig() + result['__error__'] = util.InvalidConfig() else: # Preprocess the argument list. This is required to deal with `MixedPrecisionArray`s gpu_args = _preprocess_gpu_arguments(gpu_args, params) @@ -587,7 +587,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, func = self.compile_kernel(instance, verbose) if not func: # result[to.objective] = util.CompilationFailedConfig() - result['error'] = util.CompilationFailedConfig() + result['__error__'] = util.CompilationFailedConfig() else: # add shared memory arguments to compiled module if kernel_options.smem_args is not None: diff --git a/kernel_tuner/file_utils.py b/kernel_tuner/file_utils.py index fb53d5956..9d7b7042c 100644 --- a/kernel_tuner/file_utils.py +++ b/kernel_tuner/file_utils.py @@ -111,7 +111,7 @@ def store_output_file(output_filename: str, results, tune_params, objective="tim # encode the validity of the configuration # out["invalidity"] = get_configuration_validity(result[objective]) - out["invalidity"] = get_configuration_validity(result['error']) + out["invalidity"] = get_configuration_validity(result['__error__']) # Kernel Tuner does not support producing results of configs that fail the correctness check # therefore correctness is always 1 diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 95bfff500..5a6f1a5a3 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -95,12 +95,12 @@ def run(self, parameter_space, tuning_options): params.update(result) # if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): - if 'error' in result: + if '__error__' in result: logging.debug('kernel configuration was skipped silently due to compile or runtime failure') # only compute metrics on configs that have not errored # if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): - if 'error' in params: + if '__error__' in params: params = process_metrics(params, tuning_options.metrics) # get the framework time by estimating based on other times diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 
e47f1edb0..4dc3a1bd4 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -96,8 +96,7 @@ def __call__(self, x, check_restrictions=True): legal = util.check_restrictions(self.searchspace.restrictions, params_dict, self.tuning_options.verbose) if not legal: result = params_dict - # result[self.tuning_options.objective] = util.InvalidConfig() - result['error'] = util.InvalidConfig() + result['__error__'] = util.InvalidConfig() self.illegal_config_count += 1 if legal: diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 3cced7390..b1d5380fe 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -86,7 +86,7 @@ def get_result_cost( ) -> list[float]: """Returns the cost of a result, taking the objective directions into account.""" # return the highest cost for invalid results - if 'error' in result: + if '__error__' in result: return [sys.float_info.max] * len(objectives) cost_vec = list() @@ -100,8 +100,8 @@ def get_result_cost( def check_result_type(r): """Check if the result has the right format.""" - if 'error' in r: - return isinstance(r['error'], ErrorConfig) + if '__error__' in r: + return isinstance(r['__error__'], ErrorConfig) return True @@ -429,7 +429,7 @@ def get_best_config(results, objective, objective_higher_is_better=False): ignore_val = sys.float_info.max if not objective_higher_is_better else -sys.float_info.max best_config = func( results, - key=lambda x: x[objective] if 'error' not in x and isinstance(x[objective], float) else ignore_val, + key=lambda x: x[objective] if '__error__' not in x and isinstance(x[objective], float) else ignore_val, ) return best_config @@ -1365,7 +1365,7 @@ def restriction_from_cache(cache: dict): param_config_string_set = set( param_config_string for param_config_string, result in cache['cache'].items() - if 'error' not in result + if '__error__' not in result ) # print(f"WTH: {len(config_strings)}/{len(list(cache['cache'].keys()))}") From 5022750e5a1f0072d44246c0d6ebff1554a0580c Mon Sep 17 00:00:00 2001 From: maric-a-b <61234439+maric-a-b@users.noreply.github.com> Date: Sun, 4 Jan 2026 01:21:32 +0100 Subject: [PATCH 07/18] New pymoo version needs at least Python 3.10 --- noxfile.py | 6 +++--- pyproject.toml | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/noxfile.py b/noxfile.py index e32bbb588..fe26ef1d4 100644 --- a/noxfile.py +++ b/noxfile.py @@ -15,7 +15,7 @@ # set the test parameters verbose = False -python_versions_to_test = ["3.9", "3.10", "3.11", "3.12"] +python_versions_to_test = ["3.10", "3.11", "3.12"] nox.options.stop_on_first_error = True nox.options.error_on_missing_interpreters = True nox.options.default_venv_backend = 'virtualenv' @@ -38,7 +38,7 @@ def create_settings(session: Session) -> None: venvbackend = nox.options.default_venv_backend envdir = "" # conversion from old notenv.txt - if noxenv_file_path.exists(): + if noxenv_file_path.exists(): venvbackend = noxenv_file_path.read_text().strip() noxenv_file_path.unlink() # write the settings @@ -91,7 +91,7 @@ def check_development_environment(session: Session) -> None: # packages = re.findall(r"• Installing .* | • Updating .*", output, flags=re.MULTILINE) # assert packages is not None session.warn(f""" - Your development environment is out of date ({installs} installs, {updates} updates). + Your development environment is out of date ({installs} installs, {updates} updates). Update with 'poetry install --sync', using '--with' and '-E' for optional dependencies, extras respectively. 
Note: {removals} packages are not in the specification (i.e. installed manually) and may be removed. To preview changes, run 'poetry install --sync --dry-run' (with optional dependencies and extras).""") diff --git a/pyproject.toml b/pyproject.toml index 48034bf15..2791b929c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,7 @@ generate-setup-file = false # ATTENTION: if anything is changed here, run `poetry update` [tool.poetry.dependencies] -python = ">=3.9,<3.13" # NOTE when changing the supported Python versions, also change the test versions in the noxfile +python = ">=3.10,<3.13" # NOTE when changing the supported Python versions, also change the test versions in the noxfile numpy = "^1.26.0" # Python 3.12 requires numpy at least 1.26 scipy = ">=1.11.0" packaging = "*" # required by file_utils @@ -84,6 +84,7 @@ hip-python = { version = "*", optional = true } # Tutorial (for the notebooks used in the examples) jupyter = { version = "^1.0.0", optional = true } matplotlib = { version = "^3.5.0", optional = true } +pymoo = "^0.6.1.6" [tool.poetry.extras] cuda = ["pycuda", "nvidia-ml-py", "pynvml"] From cd3f16afde81ee913e22242e0ef56c0220f6ef1f Mon Sep 17 00:00:00 2001 From: maric-a-b <61234439+maric-a-b@users.noreply.github.com> Date: Sun, 4 Jan 2026 01:20:40 +0100 Subject: [PATCH 08/18] Use pymoo's impl of non dom sorting --- kernel_tuner/util.py | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index b1d5380fe..06328b730 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -43,6 +43,8 @@ from kernel_tuner.observers.nvml import NVMLObserver +from pymoo.util.nds.find_non_dominated import find_non_dominated + # number of special values to insert when a configuration cannot be measured @@ -443,29 +445,24 @@ def get_pareto_results( assert isinstance(results, list) assert isinstance(objectives, list) - cost_points = list() - for res in results: - cost_point = get_result_cost(res, objectives, objective_higher_is_better) - cost_points.append(cost_point) - - cost_points = np.asarray(cost_points, dtype=float) - is_efficient = np.ones(cost_points.shape[0], dtype=bool) - - # A point `p` in a finite set of points `S` is said to be non-dominated if there is no other point `q` in `S` where `q(i) <= p(i)` for all `i` - for idx, cost_point in enumerate(cost_points): - if not is_efficient[idx]: + n_rows = len(results) + n_cols = len(objectives) + Y = np.empty((n_rows, n_cols), dtype=float) + for row_idx, result in enumerate(results): + if "__error__" in result: + Y[row_idx, :] = sys.float_info.max continue - is_efficient[is_efficient] = np.any(cost_points[is_efficient] <= cost_point, axis=1) - - # select and mark the optimal points - front = list() - for idx in np.flatnonzero(is_efficient): - res = results[idx] - if mark_optima: - res['optimal'] = True - front.append(res) - - return front + for col_idx, (objective_name, higher_is_better) in enumerate(zip(objectives, objective_higher_is_better)): + y = result[objective_name] + # negate for maximizers to optimize through minimization + Y[row_idx, col_idx] = -y if higher_is_better else y + + pf_indices = find_non_dominated(Y) + pf = [results[idx] for idx in pf_indices] + if mark_optima: + for p in pf: + p["optimal"] = True + return pf def get_config_string(params, keys=None, units=None): From e8cbcda6f83e1070a9e2014640fcd57531125484 Mon Sep 17 00:00:00 2001 From: maric-a-b <61234439+maric-a-b@users.noreply.github.com> Date: Sun, 4 
Jan 2026 01:20:45 +0100 Subject: [PATCH 09/18] - update tune_cache() interface - improve inferred restrictions - improve check_restriction() to make it less inefficient for `FunctionConstraint`s --- kernel_tuner/interface.py | 20 +++++++++------ kernel_tuner/util.py | 53 +++++++++++++++++---------------------- 2 files changed, 35 insertions(+), 38 deletions(-) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 0c0282ad0..0a78bc051 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -744,21 +744,25 @@ def tune_kernel( tune_kernel.__doc__ = _tune_kernel_docstring -def tune_cache( - cache, +def tune_cache(*, + cache_path, restrictions = None, **kwargs, ): - tune_args = util.tune_args_from_cache_file(cache) + cache = util.read_cache(cache_path, open_cache=False) + tune_args = util.infer_args_from_cache(cache) + _restrictions = [util.infer_restrictions_from_cache(cache)] + + # Add the user provided restrictions if restrictions: - new_restrictions = [tune_args['restrictions']] if isinstance(restrictions, list): - new_restrictions.extend(restrictions) + _restrictions.extend(restrictions) else: - new_restrictions.append(restrictions) - tune_args['restrictions'] = new_restrictions + raise ValueError("The restrictions must be a list()") + tune_args.update(kwargs) - return tune_kernel(simulation_mode=True, **tune_args) + + return tune_kernel(**tune_args, cache=cache_path, restrictions=_restrictions, simulation_mode=True) _run_kernel_docstring = """Compile and run a single kernel diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 06328b730..5d872ce76 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -282,8 +282,11 @@ def check_block_size_params_names_list(block_size_names, tune_params): def check_restriction(restrict, params: dict) -> bool: """Check whether a configuration meets a search space restriction.""" + # if it's a function python-constraint it can be called directly + if isinstance(restrict, FunctionConstraint): + return restrict._func(*params.values()) # if it's a python-constraint, convert to function and execute - if isinstance(restrict, Constraint): + elif isinstance(restrict, Constraint): restrict = convert_constraint_restriction(restrict) return restrict(list(params.values())) # if it's a string, fill in the parameters and evaluate @@ -1358,43 +1361,33 @@ def cuda_error_check(error): raise RuntimeError(f"NVRTC error: {desc.decode()}") -def restriction_from_cache(cache: dict): - param_config_string_set = set( - param_config_string - for param_config_string, result in cache['cache'].items() +def infer_restrictions_from_cache(cache: dict): + param_names = cache["tune_params_keys"] + valid_param_config_set = set( + tuple(result[param_name] for param_name in param_names) + for result in cache['cache'].values() if '__error__' not in result ) - # print(f"WTH: {len(config_strings)}/{len(list(cache['cache'].keys()))}") - - def _restrictions_func(params_config: dict) -> bool: - nonlocal param_config_string_set - - param_config_string = ",".join(map(str, params_config.values())) - return param_config_string in param_config_string_set + def restrictions_func(*param_values) -> bool: + nonlocal valid_param_config_set + return param_values in valid_param_config_set - return _restrictions_func + return FunctionConstraint(restrictions_func) -def tune_args_from_cache_file(cache_file_path) -> dict: - with open(cache_file_path, mode="r") as cache_file: - cache = json.load(cache_file) - - tune_args = dict( - 
kernel_name=cache['kernel_name'], - kernel_source="", - problem_size=tuple(cache['problem_size']), - arguments=[], - tune_params=cache['tune_params'], - restrictions=restriction_from_cache(cache), - cache=cache_file_path, +def infer_args_from_cache(cache: dict) -> dict: + inferred_args = dict( + kernel_name = cache['kernel_name'], + kernel_source = "", + problem_size = tuple(cache['problem_size']), + arguments = [], + tune_params = cache['tune_params'], + # restrictions = infer_restrictions_from_cache(cache), ) - return tune_args - + return inferred_args -def results_from_cache_file(cache_file_path) -> list[dict]: - with open(cache_file_path, mode="r") as cache_file: - cache = json.load(cache_file) +def get_results_from_cache(cache) -> list[dict]: return list(cache['cache'].values()) From 1fa76b2368f1e9a2c6ba015afb9f23212db20d48 Mon Sep 17 00:00:00 2001 From: maric-a-b <61234439+maric-a-b@users.noreply.github.com> Date: Sun, 4 Jan 2026 01:24:38 +0100 Subject: [PATCH 10/18] fixed and improved version of the pymoo strat --- kernel_tuner/interface.py | 5 +- kernel_tuner/strategies/pymoo_minimize.py | 230 +++++++++++----------- 2 files changed, 117 insertions(+), 118 deletions(-) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 0a78bc051..42a6577e2 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -76,7 +76,8 @@ "simulated_annealing": simulated_annealing, "firefly_algorithm": firefly_algorithm, "bayes_opt": bayes_opt, - "pymoo_minimize": pymoo_minimize, + "nsga2": pymoo_minimize, + "nsga3": pymoo_minimize, } @@ -466,6 +467,7 @@ def __deepcopy__(self, _): ("metrics", ("specifies user-defined metrics, please see :ref:`metrics`.", "dict")), ("simulation_mode", ("Simulate an auto-tuning search from an existing cachefile", "bool")), ("observers", ("""A list of Observers to use during tuning, please see :ref:`observers`.""", "list")), + ("seed", ("""The random seed.""", "int")), ] ) @@ -580,6 +582,7 @@ def tune_kernel( objective=None, objective_higher_is_better=None, objectives=None, + seed=None, ): start_overhead_time = perf_counter() if log: diff --git a/kernel_tuner/strategies/pymoo_minimize.py b/kernel_tuner/strategies/pymoo_minimize.py index 25060c04c..0ac530941 100644 --- a/kernel_tuner/strategies/pymoo_minimize.py +++ b/kernel_tuner/strategies/pymoo_minimize.py @@ -1,92 +1,112 @@ """The Pymoo strategy that uses a minimizer method for searching through the parameter space.""" +from typing import assert_never import numpy as np from pymoo.algorithms.moo.nsga2 import NSGA2 from pymoo.algorithms.moo.nsga3 import NSGA3 -from pymoo.core.mutation import Mutation +from pymoo.core.algorithm import Algorithm from pymoo.core.problem import ElementwiseProblem -from pymoo.core.sampling import Sampling +from pymoo.core.duplicate import ElementwiseDuplicateElimination from pymoo.core.termination import NoTermination, Termination +from pymoo.core.sampling import Sampling +from pymoo.core.mutation import Mutation from pymoo.core.repair import Repair -from pymoo.operators.crossover.ux import UniformCrossover from pymoo.operators.crossover.pntx import TwoPointCrossover -from pymoo.optimize import minimize -from pymoo.util.ref_dirs import get_reference_directions -from pymoo.indicators.igd import IGD from kernel_tuner import util +from kernel_tuner.runners.runner import Runner from kernel_tuner.searchspace import Searchspace from kernel_tuner.strategies.common import ( CostFunc, get_strategy_docstring, - setup_method_arguments, ) -# TODO: Add the PyMOO algorithms 
-supported_methods = [ - "NSGA2", - "NSGA3", +from enum import StrEnum + +class SupportedAlgos(StrEnum): + NSGA2 = "nsga2" + NSGA3 = "nsga3" + +supported_algos = [ algo.value for algo in SupportedAlgos ] + +supported_crossover_opers = [ + # "uniform-crossover", + # "single-point-crossover", + "two-point-crossover", ] _options = { - "method": (f"Pymoo optimization algorithm to use, choose any from {supported_methods}", "NSGA2"), - "pop_size": ("Initial population size", 100), + "pop_size": ("Initial population size", 20), + "crossover_operator": ("The crossover operator", "two-point-crossover"), + "crossover_prob": ("Crossover probability", 1.0), + "mutation_prob": ("Mutation probability", 0.1), + "ref_dirs_list": ("The list of reference directions on the unit hyperplane in the objective space to guide NSGA-III, see https://pymoo.org/misc/reference_directions.html for more information.", []), } +_option_defaults = { key: option_pair[1] for key, option_pair in _options.items() } + def tune( searchspace: Searchspace, - runner, + runner: Runner, tuning_options, ): + algo_name: str = tuning_options.strategy strategy_options = tuning_options.strategy_options - if "method" in strategy_options: - method = strategy_options["method"] + algo_name = algo_name.lower() + if algo_name not in SupportedAlgos: + raise ValueError(f"\"{algo_name}\" is not supported. The supported algorithms are: {supported_algos}\n") else: - (_, method) = _options["method"] - print(f"{method=}") + algo_name = SupportedAlgos(algo_name) - if "pop_size" in strategy_options: - pop_size = strategy_options["pop_size"] - else: - (_, pop_size) = _options["pop_size"] - print(f"{pop_size=}") + pop_size = strategy_options.get("pop_size", _option_defaults["pop_size"]) + crossover_prob = strategy_options.get("crossover_prob", _option_defaults["crossover_prob"]) + mutation_prob = strategy_options.get("mutation_prob", _option_defaults["mutation_prob"]) + ref_dirs_list = strategy_options.get("ref_dirs_list", _option_defaults["ref_dirs_list"]) - # scale variables in x to make 'eps' relevant for multiple variables - cost_func = CostFunc(searchspace, tuning_options, runner, scaling=False) + if algo_name == "nsga3" and len(ref_dirs_list) == 0: + raise ValueError("NSGA-III requires reference directions to be specified, but they are missing.") - bounds, x0, _ = cost_func.get_bounds_x0_eps() - kwargs = setup_method_arguments(method, bounds) + cost_func = CostFunc(searchspace, tuning_options, runner, scaling=False) problem = TuningProblem( - cost_func=cost_func, - n_var=len(tuning_options.tune_params), - n_obj=len(tuning_options.objective), + cost_func = cost_func, + n_var = len(tuning_options.tune_params), + n_obj = len(tuning_options.objective), ) + sampling = TuningSearchspaceRandomSampling(searchspace) + crossover = TwoPointCrossover(prob = crossover_prob) + mutation = TuningParamConfigNeighborhoodMutation(prob = mutation_prob, searchspace = searchspace) + repair = TuningParamConfigRepair() + eliminate_duplicates = TuningParamConfigDuplicateElimination() + # algorithm_type = get_algorithm(method) - algorithm = None - if method == "NSGA2": - algorithm = NSGA2( - pop_size = pop_size, - sampling = SearchspaceRandomSampling(searchspace), - crossover = TwoPointCrossover(), - mutation = MutateToNeighbor(searchspace, prob = 0.5), - repair = RepairConfig(), - # save_history = True, - ) - elif method == "NSGA3": - algorithm = NSGA3( - pop_size = pop_size, - ref_dirs = get_reference_directions("das-dennis", len(tuning_options.objective), n_partitions = 
26),
-            sampling = SearchspaceRandomSampling(searchspace),
-            crossover = UniformCrossover(prob = 0.6),
-            mutation = MutateToNeighbor(searchspace, prob = 0.5),
-            # repair = MyRepair(),
-            # save_history = True,
-        )
+    algo: Algorithm
+    match algo_name:
+        case SupportedAlgos.NSGA2:
+            algo = NSGA2(
+                pop_size = pop_size,
+                sampling = sampling,
+                crossover = crossover,
+                mutation = mutation,
+                repair = repair,
+                eliminate_duplicates = eliminate_duplicates,
+            )
+        case SupportedAlgos.NSGA3:
+            algo = NSGA3(
+                pop_size = pop_size,
+                ref_dirs = ref_dirs_list,
+                sampling = sampling,
+                crossover = crossover,
+                mutation = mutation,
+                repair = repair,
+                eliminate_duplicates = eliminate_duplicates,
+            )
+        case _ as unreachable:
+            assert_never(unreachable)

     # TODO:
     # - CostFunc throws exception when done, so isn't really needed
@@ -94,37 +114,28 @@ def tune(
     if "max_fevals" in tuning_options.strategy_options or "time_limit" in tuning_options.strategy_options:
         termination = NoTermination()

-    pf = problem.pareto_front()
-    igd_ind = IGD(pf, zero_to_one=True)
-
     try:
-        _ = algorithm.setup(
+        algo.setup(
             problem,
-            # termination = termination,
-            termination=("n_gen", 20),
-            seed=1,
-            verbose=True,
+            termination = termination,
+            verbose = tuning_options.verbose,
+            progress = tuning_options.verbose,
+            seed = tuning_options.seed,
         )

-        while algorithm.has_next():
-            algorithm.next()
-
-        illegal_count = cost_func.illegal_config_count
-        total_count = cost_func.total_config_count
-        print(f"config valid: {total_count - illegal_count}/{total_count} ({100 * (1 - (illegal_count / total_count)):.4}%)")
-
-        print("IGD: ", igd_ind(algorithm.opt.get("F")))
+        while algo.has_next():
+            algo.next()

     except util.StopCriterionReached as e:
         if tuning_options.verbose:
             print(f"Stopped because of {e}")

-    opt_result = cost_func.results
+    results = cost_func.results

-    if opt_result and tuning_options.verbose:
-        print(f"{opt_result.message=}")
+    if results and tuning_options.verbose:
+        print(f"{len(results)=}")

-    return opt_result
+    return results


 tune.__doc__ = get_strategy_docstring("Pymoo minimize", _options)
@@ -134,8 +145,8 @@ class TuningProblem(ElementwiseProblem):
     def __init__(
         self,
         cost_func: CostFunc,
-        n_var,
-        n_obj,
+        n_var: int,
+        n_obj: int,
         **kwargs,
     ):
         super().__init__(
@@ -147,21 +158,12 @@ def __init__(
         self.searchspace = cost_func.searchspace
         self.tuning_options = cost_func.tuning_options

-    def _evaluate(
-        self,
-        x,
-        out,
-        *args,
-        **kwargs,
-    ):
-        F = self.cost_func(x)
+    def _evaluate( self, x, out, *args, **kwargs, ):
+        # A copy of `x` is made to make sure sharing does not happen
+        F = self.cost_func(tuple(x))
         out["F"] = F

-    def _calc_pareto_front(
-        self,
-        *args,
-        **kwargs
-    ) -> np.ndarray | None:
+    def _calc_pareto_front( self, *args, **kwargs, ):
         # Can only compute the pareto front if we are in simulation mode.
         if not self.tuning_options.simulation_mode:
             return None
@@ -179,14 +181,11 @@ def _calc_pareto_front(
             cost = util.get_result_cost(res, objectives, higher_is_better)
             pareto_front_list.append(cost)

-        return np.array(pareto_front_list)
+        return np.array(pareto_front_list, dtype=float)


 class TuningTermination(Termination):
-    def __init__(
-        self,
-        tuning_options,
-    ):
+    def __init__( self, tuning_options, ):
         super().__init__()
         self.tuning_options = tuning_options
         self.reason = None
@@ -205,35 +204,26 @@ def _update(
         return 1.0


-class SearchspaceRandomSampling(Sampling):
-    def __init__(
-        self,
-        searchspace,
-    ):
+class TuningSearchspaceRandomSampling(Sampling):
+    def __init__( self, searchspace, ):
         super().__init__()
         self.searchspace = searchspace

-    def _do(
-        self,
-        problem,
-        n_samples: int,
-        **kwargs,
-    ):
-        X = self.searchspace.get_random_sample(n_samples)
-        return X
+    def _do( self, problem, n_samples: int, **kwargs, ):
+        sample = self.searchspace.get_random_sample(n_samples)
+        return np.array(sample, dtype=object)


-class MutateToNeighbor(Mutation):
+class TuningParamConfigNeighborhoodMutation(Mutation):
     def __init__(
         self,
+        prob,
         searchspace: Searchspace,
-        prob=1.0,
-        prob_var=None,
         **kwargs
     ):
         super().__init__(
-            prob=prob,
-            prob_var=prob_var,
+            prob = prob,
+            # prob_var = None,
             **kwargs,
         )
         self.searchspace = searchspace
@@ -244,27 +234,27 @@ def _do(
         X: np.ndarray,
         **kwargs,
     ):
-        for ind_index in range(X.shape[0]):
-            params_config_tuple = tuple(X[ind_index])
+        for X_index in range(X.shape[0]):
+            params_config_tuple = tuple(X[X_index])
             neighbors_indices = self.searchspace.get_neighbors_indices_no_cache(params_config_tuple, neighbor_method="Hamming")
             if len(neighbors_indices) > 0:
                 neighbor_index = neighbors_indices[np.random.choice(len(neighbors_indices))]
                 neighbor = self.searchspace.get_param_configs_at_indices([neighbor_index])[0]
-                X[ind_index] = np.array(neighbor)
+                X[X_index] = np.array(neighbor, dtype=object)

         return X


-class RepairConfig(Repair):
+class TuningParamConfigRepair(Repair):
     def _do(
         self,
         problem: TuningProblem,
-        X : np.ndarray,
+        X: np.ndarray,
         **kwargs,
-    ) -> np.ndarray:
-        for ind_index in range(X.shape[0]):
-            params_config_tuple = tuple(X[ind_index])
+    ):
+        for X_index in range(X.shape[0]):
+            params_config_tuple = tuple(X[X_index])
             if problem.searchspace.is_param_config_valid(params_config_tuple):
                 continue
             for neighbor_method in ["strictly-adjacent", "adjacent", "Hamming"]:
@@ -272,7 +262,13 @@ def _do(
                 if len(neighbors_indices) > 0:
                     neighbor_index = neighbors_indices[np.random.choice(len(neighbors_indices))]
                     neighbor = problem.searchspace.get_param_configs_at_indices([neighbor_index])[0]
-                    X[ind_index] = np.array(neighbor)
+                    X[X_index] = np.array(neighbor, dtype=object)
                     break

         return X
+
+
+class TuningParamConfigDuplicateElimination(ElementwiseDuplicateElimination):
+
+    def is_equal(self, a, b):
+        return np.all(a.X == b.X)

From 393a79bf33fe9bc808cdabb9a5d5e14fe7864f73 Mon Sep 17 00:00:00 2001
From: maric-a-b <61234439+maric-a-b@users.noreply.github.com>
Date: Sun, 4 Jan 2026 01:25:47 +0100
Subject: [PATCH 11/18] improves infeasible evals counter

---
 kernel_tuner/runners/sequential.py | 11 +++++++++--
 kernel_tuner/runners/simulation.py | 10 ++++++----
 kernel_tuner/strategies/common.py | 11 ++++-------
 3 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py
index 5a6f1a5a3..79d11f889 100644
--- a/kernel_tuner/runners/sequential.py
+++ b/kernel_tuner/runners/sequential.py
@@ -44,8 +44,15 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob
         #move data to the GPU
         self.gpu_args = self.dev.ready_argument_list(kernel_options.arguments)

+        # It is the task of the cost function to increment these counters
+        self.config_eval_count = 0
+        self.infeasable_config_eval_count = 0
+
     def get_environment(self, tuning_options):
-        return self.dev.get_environment()
+        env = self.dev.get_environment()
+        env["config_eval_count"] = self.config_eval_count
+        env["infeasable_config_eval_count"] = self.infeasable_config_eval_count
+        return env

     def run(self, parameter_space, tuning_options):
         """Iterate through the entire parameter space using a single Python process.
@@ -104,7 +111,7 @@ def run(self, parameter_space, tuning_options):
                 params = process_metrics(params, tuning_options.metrics)

             # get the framework time by estimating based on other times
-            total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time)
+            total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time)
             params['strategy_time'] = self.last_strategy_time
             params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0)
             params['timestamp'] = str(datetime.now(timezone.utc))
diff --git a/kernel_tuner/runners/simulation.py b/kernel_tuner/runners/simulation.py
index 1ede287dd..cd181288a 100644
--- a/kernel_tuner/runners/simulation.py
+++ b/kernel_tuner/runners/simulation.py
@@ -59,10 +59,16 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob
         self.last_strategy_time = 0
         self.units = {}

+        # It is the task of the cost function to increment these counters
+        self.config_eval_count = 0
+        self.infeasable_config_eval_count = 0
+
     def get_environment(self, tuning_options):
         env = self.dev.get_environment()
         env["simulation"] = True
         env["simulated_time"] = tuning_options.simulated_time
+        env["config_eval_count"] = self.config_eval_count
+        env["infeasable_config_eval_count"] = self.infeasable_config_eval_count
         return env

     def run(self, parameter_space, tuning_options):
@@ -91,8 +97,6 @@ def run(self, parameter_space, tuning_options):
             if tuning_options.cache and x_int in tuning_options.cache:
                 result = tuning_options.cache[x_int].copy()

-                assert util.check_result_type(result)
-
                 # Simulate behavior of sequential runner that when a configuration is
                 # served from the cache by the sequential runner, the compile_time,
                 # verification_time, and benchmark_time are set to 0.
@@ -127,8 +131,6 @@ def run(self, parameter_space, tuning_options): self.start_time = perf_counter() result['framework_time'] = total_time - self.last_strategy_time - assert util.check_result_type(result) - results.append(result) continue diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 4dc3a1bd4..1901476f2 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -60,15 +60,11 @@ def __init__(self, searchspace: Searchspace, tuning_options, runner, *, scaling= self.scaling = scaling self.searchspace = searchspace self.results = [] - self.total_config_count = 0 - self.illegal_config_count = 0 def __call__(self, x, check_restrictions=True): """Cost function used by almost all strategies.""" self.runner.last_strategy_time = 1000 * (perf_counter() - self.runner.last_strategy_start_time) - self.total_config_count += 1 - # error value to return for numeric optimizers that need a numerical value logging.debug('_cost_func called') logging.debug('x: ' + str(x)) @@ -76,6 +72,8 @@ def __call__(self, x, check_restrictions=True): # check if max_fevals is reached or time limit is exceeded util.check_stop_criterion(self.tuning_options) + self.runner.config_eval_count += 1 + # snap values in x to nearest actual value for each parameter, unscale x if needed if self.snap: if self.scaling: @@ -97,11 +95,10 @@ def __call__(self, x, check_restrictions=True): if not legal: result = params_dict result['__error__'] = util.InvalidConfig() - self.illegal_config_count += 1 + self.runner.infeasable_config_eval_count += 1 if legal: - assert ('error' not in result), "A legal config MUST NOT have an error result." - if 'error' in result: exit() + assert ('__error__' not in result), "A legal config MUST NOT have an error result." 
# compile and benchmark this instance res = self.runner.run([params], self.tuning_options) From e576048ceae09def97e202f98cc3258ab2964309 Mon Sep 17 00:00:00 2001 From: maric-a-b <61234439+maric-a-b@users.noreply.github.com> Date: Sun, 4 Jan 2026 01:28:58 +0100 Subject: [PATCH 12/18] Removes superfluous print --- kernel_tuner/interface.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 42a6577e2..a158edc19 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -708,7 +708,6 @@ def tune_kernel( objective = objective[0] objective_higher_is_better = objective_higher_is_better[0] best_config = util.get_best_config(results, objective, objective_higher_is_better) - print(best_config) # add the best configuration to env env['best_config'] = best_config if not device_options.quiet: From 86f5ea7e54ab6053d75d2f21c950de1f319b81f5 Mon Sep 17 00:00:00 2001 From: maric-a-b <61234439+maric-a-b@users.noreply.github.com> Date: Wed, 21 Jan 2026 19:57:42 +0100 Subject: [PATCH 13/18] removes unused func --- kernel_tuner/util.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 5d872ce76..fc2c941ed 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1387,7 +1387,3 @@ def infer_args_from_cache(cache: dict) -> dict: ) return inferred_args - - -def get_results_from_cache(cache) -> list[dict]: - return list(cache['cache'].values()) From f6c795b26ddabd2efb3f44f26b7e58b07d115a24 Mon Sep 17 00:00:00 2001 From: maric-a-b <61234439+maric-a-b@users.noreply.github.com> Date: Wed, 21 Jan 2026 21:09:38 +0100 Subject: [PATCH 14/18] improves `tune_kernel()` code --- kernel_tuner/interface.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index a158edc19..c13a3d0a1 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -592,23 +592,19 @@ def tune_kernel( _check_user_input(kernel_name, kernelsource, arguments, block_size_names) - # default objective if none is specified - # if len(list(objective)) == 1: - # objective, objective_higher_is_better = get_objective_defaults(objective, objective_higher_is_better) - - if isinstance(objective, str): - objective = [objective] - - if isinstance(objective_higher_is_better, bool): - objective_higher_is_better = [objective_higher_is_better] - if objectives: if isinstance(objectives, dict): objective = list(objectives.keys()) objective_higher_is_better = list(objectives.values()) else: raise ValueError("objectives should be a dict of (objective, higher_is_better) pairs") + else: + objective, objective_higher_is_better = get_objective_defaults(objective, objective_higher_is_better) + objective = [objective] + objective_higher_is_better = [objective_higher_is_better] + assert isinstance(objective, list) + assert isinstance(objective_higher_is_better, list) assert len(list(objective)) == len(list(objective_higher_is_better)) # check for forbidden names in tune parameters From 0dba06b65eabbd222b16125888d25486bda9f9f9 Mon Sep 17 00:00:00 2001 From: maric-a-b <61234439+maric-a-b@users.noreply.github.com> Date: Wed, 21 Jan 2026 21:09:55 +0100 Subject: [PATCH 15/18] makes tests compat with changes --- test/strategies/test_common.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/strategies/test_common.py b/test/strategies/test_common.py index 29ead8615..945290494 100644 --- a/test/strategies/test_common.py +++ 
b/test/strategies/test_common.py @@ -19,6 +19,8 @@ def fake_runner(): runner = Mock() runner.last_strategy_start_time = perf_counter() runner.run.return_value = [fake_result] + runner.config_eval_count = 0 + runner.infeasable_config_eval_count = 0 return runner @@ -29,7 +31,7 @@ def test_cost_func(): x = [1, 4] tuning_options = Options(scaling=False, snap=False, tune_params=tune_params, restrictions=None, strategy_options={}, cache={}, unique_results={}, - objective="time", objective_higher_is_better=False, metrics=None) + objective=["time"], objective_higher_is_better=[False], metrics=None) runner = fake_runner() time = CostFunc(Searchspace(tune_params, None, 1024), tuning_options, runner)(x) @@ -41,7 +43,7 @@ def restrictions(_): tuning_options = Options(scaling=False, snap=False, tune_params=tune_params, restrictions=restrictions, strategy_options={}, verbose=True, cache={}, unique_results={}, - objective="time", objective_higher_is_better=False, metrics=None) + objective=["time"], objective_higher_is_better=[False], metrics=None) time = CostFunc(Searchspace(tune_params, restrictions, 1024), tuning_options, runner)(x) assert time == sys.float_info.max From c0a20b3b9b3466231d6916c050e2f5fd133a7bd4 Mon Sep 17 00:00:00 2001 From: maric-a-b <61234439+maric-a-b@users.noreply.github.com> Date: Wed, 21 Jan 2026 21:22:02 +0100 Subject: [PATCH 16/18] fix import issue --- kernel_tuner/runners/sequential.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 79d11f889..2a8481973 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -5,6 +5,7 @@ from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner +import kernel_tuner.util as util from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache From 0efc4bcd7b84e6b17272f04a78a01f064955c685 Mon Sep 17 00:00:00 2001 From: maric-a-b <61234439+maric-a-b@users.noreply.github.com> Date: Wed, 21 Jan 2026 21:25:57 +0100 Subject: [PATCH 17/18] fix seq runner --- kernel_tuner/runners/sequential.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 2a8481973..dae34a3c8 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -102,13 +102,11 @@ def run(self, parameter_space, tuning_options): params.update(result) - # if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): if '__error__' in result: logging.debug('kernel configuration was skipped silently due to compile or runtime failure') # only compute metrics on configs that have not errored - # if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): - if '__error__' in params: + if tuning_options.metrics and '__error__' not in params: params = process_metrics(params, tuning_options.metrics) # get the framework time by estimating based on other times From a20f18eb5391bc5b1f861021c5591596310ea080 Mon Sep 17 00:00:00 2001 From: maric-a-b <61234439+maric-a-b@users.noreply.github.com> Date: Wed, 21 Jan 2026 21:32:39 +0100 Subject: [PATCH 18/18] removes old commented out code --- kernel_tuner/core.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 6fb478ce2..1cd47d297 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -480,7 +480,6 @@ def benchmark(self, 
func, gpu_args, instance, verbose, objective, skip_nvml_sett print( f"skipping config {util.get_instance_string(instance.params)} reason: too many resources requested for launch" ) - # result[objective] = util.RuntimeFailedConfig() result['__error__'] = util.RuntimeFailedConfig() else: logging.debug("benchmark encountered runtime failure: " + str(e)) @@ -575,7 +574,6 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, instance = self.create_kernel_instance(kernel_source, kernel_options, params, verbose) if isinstance(instance, util.ErrorConfig): - # result[to.objective] = util.InvalidConfig() result['__error__'] = util.InvalidConfig() else: # Preprocess the argument list. This is required to deal with `MixedPrecisionArray`s @@ -586,7 +584,6 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, start_compilation = time.perf_counter() func = self.compile_kernel(instance, verbose) if not func: - # result[to.objective] = util.CompilationFailedConfig() result['__error__'] = util.CompilationFailedConfig() else: # add shared memory arguments to compiled module
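
For reference, below is a minimal usage sketch of the strategies introduced by this series, assuming all patches above are applied. The strategy names ("nsga2"/"nsga3"), the strategy options (pop_size, crossover_prob, mutation_prob, ref_dirs_list, max_fevals), the objectives dict, and the seed argument are taken from the diffs; the vector_add kernel, the problem size, and the GFLOP/s metric are illustrative placeholders only, not part of the patches.

import numpy as np
import kernel_tuner
from pymoo.util.ref_dirs import get_reference_directions

# Illustrative CUDA kernel; block_size_x is the tunable parameter.
kernel_string = """
__global__ void vector_add(float *c, float *a, float *b, int n) {
    int i = blockIdx.x * block_size_x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
"""

size = 10_000_000
a = np.random.randn(size).astype(np.float32)
b = np.random.randn(size).astype(np.float32)
c = np.zeros_like(a)
args = [c, a, b, np.int32(size)]

tune_params = {"block_size_x": [32, 64, 128, 256, 512, 1024]}

# GFLOP/s is a user-defined metric here, so it can serve as a second objective.
metrics = {"GFLOP/s": lambda p: (size / 1e9) / (p["time"] / 1e3)}

results, env = kernel_tuner.tune_kernel(
    "vector_add",
    kernel_string,
    size,
    args,
    tune_params,
    metrics=metrics,
    strategy="nsga3",
    strategy_options={
        "pop_size": 20,
        "crossover_prob": 1.0,
        "mutation_prob": 0.1,
        # NSGA-III needs reference directions in objective space (the
        # ref_dirs_list option added above); two objectives -> two dimensions.
        "ref_dirs_list": get_reference_directions("das-dennis", 2, n_partitions=12),
        "max_fevals": 200,
    },
    # minimize kernel time, maximize the GFLOP/s metric
    objectives={"time": False, "GFLOP/s": True},
    seed=1,
)

With ref_dirs_list omitted, strategy="nsga2" can be used instead; any option left out falls back to the defaults listed in _options of pymoo_minimize.py.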