3 changes: 2 additions & 1 deletion .gitignore
@@ -25,6 +25,7 @@ deploy_key
temp_*.*
.python-version
.nox
.venv

### Visual Studio Code ###
!.vscode/settings.json
@@ -37,4 +38,4 @@ temp_*.*
.LSOverride

.vscode
.idea
.idea
2 changes: 1 addition & 1 deletion kernel_tuner/__init__.py
@@ -1,5 +1,5 @@
from kernel_tuner.integration import store_results, create_device_targets
from kernel_tuner.interface import tune_kernel, run_kernel
from kernel_tuner.interface import tune_kernel, tune_cache, run_kernel

from importlib.metadata import version

11 changes: 8 additions & 3 deletions kernel_tuner/core.py
@@ -480,11 +480,14 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett
print(
f"skipping config {util.get_instance_string(instance.params)} reason: too many resources requested for launch"
)
result[objective] = util.RuntimeFailedConfig()
result['__error__'] = util.RuntimeFailedConfig()
else:
logging.debug("benchmark encountered runtime failure: " + str(e))
print("Error while benchmarking:", instance.name)
raise e

assert util.check_result_type(result), "The error in a result MUST be an actual error."

return result

def check_kernel_output(
@@ -571,7 +574,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options,

instance = self.create_kernel_instance(kernel_source, kernel_options, params, verbose)
if isinstance(instance, util.ErrorConfig):
result[to.objective] = util.InvalidConfig()
result['__error__'] = util.InvalidConfig()
else:
# Preprocess the argument list. This is required to deal with `MixedPrecisionArray`s
gpu_args = _preprocess_gpu_arguments(gpu_args, params)
@@ -581,7 +584,7 @@
start_compilation = time.perf_counter()
func = self.compile_kernel(instance, verbose)
if not func:
result[to.objective] = util.CompilationFailedConfig()
result['__error__'] = util.CompilationFailedConfig()
else:
# add shared memory arguments to compiled module
if kernel_options.smem_args is not None:
@@ -635,6 +638,8 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options,
result["verification_time"] = last_verification_time or 0
result["benchmark_time"] = last_benchmark_time or 0

assert util.check_result_type(result), "The error in a result MUST be an actual error."

return result

def compile_kernel(self, instance, verbose):
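Both hunks above assert util.check_result_type(result) after a result is built, but that helper itself is outside this diff. The following is a minimal sketch of the contract those assertions suggest, assuming it only validates the new '__error__' convention; the name and body are illustrative, not the actual util implementation.

from kernel_tuner import util

def check_result_type(result: dict) -> bool:
    """Return True when a result carries no error, or carries an actual ErrorConfig under '__error__'."""
    if "__error__" not in result:
        return True
    return isinstance(result["__error__"], util.ErrorConfig)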
20 changes: 12 additions & 8 deletions kernel_tuner/file_utils.py
@@ -32,20 +32,20 @@ def output_file_schema(target):
return current_version, json_string


def get_configuration_validity(objective) -> str:
def get_configuration_validity(error) -> str:
"""Convert internal Kernel Tuner error to string."""
errorstring: str
if not isinstance(objective, util.ErrorConfig):
if not isinstance(error, util.ErrorConfig):
errorstring = "correct"
else:
if isinstance(objective, util.CompilationFailedConfig):
if isinstance(error, util.CompilationFailedConfig):
errorstring = "compile"
elif isinstance(objective, util.RuntimeFailedConfig):
elif isinstance(error, util.RuntimeFailedConfig):
errorstring = "runtime"
elif isinstance(objective, util.InvalidConfig):
elif isinstance(error, util.InvalidConfig):
errorstring = "constraints"
else:
raise ValueError(f"Unkown objective type {type(objective)}, value {objective}")
raise ValueError(f"Unknown error type {type(error)}, value {error}")
return errorstring
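A small usage sketch of the renamed helper, assuming the ErrorConfig subclasses referenced above; any value that is not an ErrorConfig maps to "correct".

from kernel_tuner import util
from kernel_tuner.file_utils import get_configuration_validity

print(get_configuration_validity(0.42))                            # "correct"
print(get_configuration_validity(util.CompilationFailedConfig()))  # "compile"
print(get_configuration_validity(util.RuntimeFailedConfig()))      # "runtime"
print(get_configuration_validity(util.InvalidConfig()))            # "constraints"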


@@ -110,7 +110,8 @@ def store_output_file(output_filename: str, results, tune_params, objective="tim
out["times"] = timings

# encode the validity of the configuration
out["invalidity"] = get_configuration_validity(result[objective])
# out["invalidity"] = get_configuration_validity(result[objective])
out["invalidity"] = get_configuration_validity(result['__error__'])

# Kernel Tuner does not support producing results of configs that fail the correctness check
# therefore correctness is always 1
@@ -127,7 +128,10 @@
# In Kernel Tuner we currently support only one objective at a time, this can be a user-defined
# metric that combines scores from multiple different quantities into a single value to support
# multi-objective tuning however.
out["objectives"] = [objective]
# NOTE(maric): With PyMOO integrated we do support multi-objective tuning without scalarization
objectives = [objective] if isinstance(objective, str) else list(objective)
assert isinstance(objectives, list)
out["objectives"] = objectives

# append to output
output_data.append(out)
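The normalization above, shown standalone: a single objective string becomes a one-element list, while any other iterable of objective names is materialized as a list before being written to the output file. The helper name and the sample values are illustrative.

def normalize_objectives(objective):
    # mirrors the expression used in store_output_file above
    return [objective] if isinstance(objective, str) else list(objective)

assert normalize_objectives("time") == ["time"]
assert normalize_objectives(("time", "energy")) == ["time", "energy"]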
82 changes: 71 additions & 11 deletions kernel_tuner/interface.py
@@ -57,6 +57,7 @@
pso,
random_sample,
simulated_annealing,
pymoo_minimize,
)

strategy_map = {
@@ -75,6 +76,8 @@
"simulated_annealing": simulated_annealing,
"firefly_algorithm": firefly_algorithm,
"bayes_opt": bayes_opt,
"nsga2": pymoo_minimize,
"nsga3": pymoo_minimize,
}
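The two new entries route the nsga2 and nsga3 strategy names to the PyMOO-backed pymoo_minimize strategy. Below is a hedged sketch of how this could be combined with the objectives dict introduced further down in tune_kernel; the kernel, tunable parameters, and the GFLOP/s metric are illustrative, and the final print assumes the runner counters added in this PR are surfaced through the returned env.

import numpy as np
import kernel_tuner

kernel_string = """
__global__ void vector_add(float *c, const float *a, const float *b, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
"""

size = 10_000_000
a = np.random.randn(size).astype(np.float32)
b = np.random.randn(size).astype(np.float32)
c = np.zeros_like(a)
n = np.int32(size)

tune_params = {"block_size_x": [64, 128, 256, 512, 1024]}

# GFLOP/s derived from the measured time in ms; one add per element
metrics = {"GFLOP/s": lambda p: (size / 1e9) / (p["time"] / 1e3)}

results, env = kernel_tuner.tune_kernel(
    "vector_add", kernel_string, size, [c, a, b, n], tune_params,
    metrics=metrics,
    strategy="nsga2",
    # maps each objective name to whether higher is better (see tune_kernel below)
    objectives={"time": False, "GFLOP/s": True},
)

# counters added to the runner environment in this PR
print(env["config_eval_count"], env["infeasable_config_eval_count"])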


@@ -425,15 +428,15 @@ def __deepcopy__(self, _):
"""Optimization objective to sort results on, consisting of a string
that also occurs in results as a metric or observed quantity, default 'time'.
Please see :ref:`objectives`.""",
"string",
"str | list[str]",
),
),
(
"objective_higher_is_better",
(
"""boolean that specifies whether the objective should
be maximized (True) or minimized (False), default False.""",
"bool",
"bool | list[bool]",
),
),
(
@@ -464,6 +467,7 @@ def __deepcopy__(self, _):
("metrics", ("specifies user-defined metrics, please see :ref:`metrics`.", "dict")),
("simulation_mode", ("Simulate an auto-tuning search from an existing cachefile", "bool")),
("observers", ("""A list of Observers to use during tuning, please see :ref:`observers`.""", "list")),
("seed", ("""The random seed.""", "int")),
]
)

@@ -577,6 +581,8 @@ def tune_kernel(
observers=None,
objective=None,
objective_higher_is_better=None,
objectives=None,
seed=None,
):
start_overhead_time = perf_counter()
if log:
@@ -586,8 +592,20 @@

_check_user_input(kernel_name, kernelsource, arguments, block_size_names)

# default objective if none is specified
objective, objective_higher_is_better = get_objective_defaults(objective, objective_higher_is_better)
if objectives:
if isinstance(objectives, dict):
objective = list(objectives.keys())
objective_higher_is_better = list(objectives.values())
else:
raise ValueError("objectives should be a dict of (objective, higher_is_better) pairs")
else:
objective, objective_higher_is_better = get_objective_defaults(objective, objective_higher_is_better)
objective = [objective]
objective_higher_is_better = [objective_higher_is_better]

assert isinstance(objective, list)
assert isinstance(objective_higher_is_better, list)
assert len(list(objective)) == len(list(objective_higher_is_better))

# check for forbidden names in tune parameters
util.check_tune_params_list(tune_params, observers, simulation_mode=simulation_mode)
@@ -682,13 +700,33 @@

# finished iterating over search space
if results: # checks if results is not empty
best_config = util.get_best_config(results, objective, objective_higher_is_better)
# add the best configuration to env
env['best_config'] = best_config
if not device_options.quiet:
units = getattr(runner, "units", None)
print("best performing configuration:")
util.print_config_output(tune_params, best_config, device_options.quiet, metrics, units)
if len(list(objective)) == 1:
objective = objective[0]
objective_higher_is_better = objective_higher_is_better[0]
best_config = util.get_best_config(results, objective, objective_higher_is_better)
# add the best configuration to env
env['best_config'] = best_config
if not device_options.quiet:
units = getattr(runner, "units", None)
print(f"\nBEST PERFORMING CONFIGURATION FOR OBJECTIVE {objective}:")
keys = list(tune_params.keys())
keys += [objective]
if metrics:
keys += list(metrics.keys())
print(util.get_config_string(best_config, keys, units))
else:
pareto_front = util.get_pareto_results(results, objective, objective_higher_is_better)
# add the best configuration to env
env['best_config'] = pareto_front
if not device_options.quiet:
units = getattr(runner, "units", None)
keys = list(tune_params.keys())
keys += list(objective)
if metrics:
keys += list(metrics.keys())
print(f"\nBEST PERFORMING CONFIGURATIONS FOR OBJECTIVES {objective}:")
for best_config in pareto_front:
print(util.get_config_string(best_config, keys, units))
elif not device_options.quiet:
print("no results to report")
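util.get_pareto_results is called in the multi-objective branch above but is not part of this diff. The following is a minimal sketch of the non-dominated filtering it presumably performs, assuming the results passed in are error-free and each contain a numeric value per objective; the name and implementation are illustrative.

def get_pareto_results(results, objectives, higher_is_better):
    def cost(res):
        # orient every objective so that lower is always better
        return [-res[o] if maximize else res[o]
                for o, maximize in zip(objectives, higher_is_better)]

    def dominates(a, b):
        # a dominates b if it is no worse anywhere and strictly better somewhere
        return all(x <= y for x, y in zip(a, b)) and any(x < y for x, y in zip(a, b))

    costs = [cost(res) for res in results]
    return [res for res, c in zip(results, costs)
            if not any(dominates(other, c) for other in costs if other is not c)]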

@@ -703,6 +741,28 @@

tune_kernel.__doc__ = _tune_kernel_docstring


def tune_cache(*,
cache_path,
restrictions = None,
**kwargs,
):
cache = util.read_cache(cache_path, open_cache=False)
tune_args = util.infer_args_from_cache(cache)
_restrictions = [util.infer_restrictions_from_cache(cache)]

# Add the user provided restrictions
if restrictions:
if isinstance(restrictions, list):
_restrictions.extend(restrictions)
else:
raise ValueError("The restrictions must be a list()")

tune_args.update(kwargs)

return tune_kernel(**tune_args, cache=cache_path, restrictions=_restrictions, simulation_mode=True)
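A hedged usage sketch of the new tune_cache entry point; the cachefile path and the extra restriction are illustrative. The helper infers kernel arguments and tunable parameters from the cache, appends any user-provided restrictions, forwards the remaining keyword arguments to tune_kernel, and always runs in simulation mode.

import kernel_tuner

results, env = kernel_tuner.tune_cache(
    cache_path="convolution_A100_cache.json",               # hypothetical existing cachefile
    restrictions=["block_size_x * block_size_y <= 1024"],   # appended to the inferred restrictions
    strategy="random_sample",
    strategy_options={"max_fevals": 100},
)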


_run_kernel_docstring = """Compile and run a single kernel

Compiles and runs a single kernel once, given a specific instance of the kernels tuning parameters.
20 changes: 16 additions & 4 deletions kernel_tuner/runners/sequential.py
@@ -5,6 +5,7 @@

from kernel_tuner.core import DeviceInterface
from kernel_tuner.runners.runner import Runner
import kernel_tuner.util as util
from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache


@@ -44,8 +45,15 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob
#move data to the GPU
self.gpu_args = self.dev.ready_argument_list(kernel_options.arguments)

# It is the task of the cost function to increment these counters
self.config_eval_count = 0
self.infeasable_config_eval_count = 0

def get_environment(self, tuning_options):
return self.dev.get_environment()
env = self.dev.get_environment()
env["config_eval_count"] = self.config_eval_count
env["infeasable_config_eval_count"] = self.infeasable_config_eval_count
return env

def run(self, parameter_space, tuning_options):
"""Iterate through the entire parameter space using a single Python process.
@@ -90,17 +98,19 @@ def run(self, parameter_space, tuning_options):

result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options)

assert util.check_result_type(result)

params.update(result)

if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig):
if '__error__' in result:
logging.debug('kernel configuration was skipped silently due to compile or runtime failure')

# only compute metrics on configs that have not errored
if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig):
if tuning_options.metrics and '__error__' not in params:
params = process_metrics(params, tuning_options.metrics)

# get the framework time by estimating based on other times
total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time)
total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time)
params['strategy_time'] = self.last_strategy_time
params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0)
params['timestamp'] = str(datetime.now(timezone.utc))
@@ -113,6 +123,8 @@
# add configuration to cache
store_cache(x_int, params, tuning_options)

assert util.check_result_type(params)

# all visited configurations are added to results to provide a trace for optimization strategies
results.append(params)

9 changes: 8 additions & 1 deletion kernel_tuner/runners/simulation.py
@@ -47,7 +47,8 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob
:type iterations: int
"""
self.quiet = device_options.quiet
self.dev = SimulationDevice(1024, dict(device_name="Simulation"), self.quiet)
# NOTE(maric): had to increase max_threads so the default restrictions would pass
self.dev = SimulationDevice(1_000_000_000, dict(device_name="Simulation"), self.quiet)

self.kernel_source = kernel_source
self.simulation_mode = True
@@ -58,10 +59,16 @@
self.last_strategy_time = 0
self.units = {}

# It is the task of the cost function to increment these counters
self.config_eval_count = 0
self.infeasable_config_eval_count = 0

def get_environment(self, tuning_options):
env = self.dev.get_environment()
env["simulation"] = True
env["simulated_time"] = tuning_options.simulated_time
env["config_eval_count"] = self.config_eval_count
env["infeasable_config_eval_count"] = self.infeasable_config_eval_count
return env

def run(self, parameter_space, tuning_options):
Expand Down
21 changes: 16 additions & 5 deletions kernel_tuner/strategies/common.py
@@ -72,6 +72,8 @@ def __call__(self, x, check_restrictions=True):
# check if max_fevals is reached or time limit is exceeded
util.check_stop_criterion(self.tuning_options)

self.runner.config_eval_count += 1

# snap values in x to nearest actual value for each parameter, unscale x if needed
if self.snap:
if self.scaling:
@@ -92,9 +94,12 @@
legal = util.check_restrictions(self.searchspace.restrictions, params_dict, self.tuning_options.verbose)
if not legal:
result = params_dict
result[self.tuning_options.objective] = util.InvalidConfig()
result['__error__'] = util.InvalidConfig()
self.runner.infeasable_config_eval_count += 1

if legal:
assert ('__error__' not in result), "A legal config MUST NOT have an error result."

# compile and benchmark this instance
res = self.runner.run([params], self.tuning_options)
result = res[0]
@@ -108,11 +113,17 @@
# upon returning from this function control will be given back to the strategy, so reset the start time
self.runner.last_strategy_start_time = perf_counter()

# get numerical return value, taking optimization direction into account
return_value = result[self.tuning_options.objective] or sys.float_info.max
return_value = return_value if not self.tuning_options.objective_higher_is_better else -return_value
# get the cost of the result
cost_vec = util.get_result_cost(
result,
self.tuning_options.objective,
self.tuning_options.objective_higher_is_better
)

return return_value
if len(cost_vec) == 1:
return cost_vec[0]
else:
return cost_vec
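util.get_result_cost is also introduced outside this diff. Below is a sketch of the contract implied by this call site, assuming failed configurations receive a large finite penalty per objective and maximized objectives are negated so the strategies can always minimize; the function body is illustrative, not the actual util implementation.

import sys
from kernel_tuner import util

def get_result_cost(result, objectives, higher_is_better):
    costs = []
    for objective, maximize in zip(objectives, higher_is_better):
        value = result.get(objective)
        if "__error__" in result or isinstance(value, util.ErrorConfig) or value is None:
            # failed or missing configurations get a large penalty for every objective
            costs.append(sys.float_info.max)
        else:
            costs.append(-value if maximize else value)
    return costs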

def get_bounds_x0_eps(self):
"""Compute bounds, x0 (the initial guess), and eps."""
Expand Down
Loading