Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 1 addition & 63 deletions graph_net_bench/torch/eval_backend_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from contextlib import redirect_stdout, redirect_stderr
from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend
from graph_net_bench import test_compiler_util
from .util.timing import measure_performance


def register_op_lib(op_lib):
Expand Down Expand Up @@ -129,69 +130,6 @@ def get_input_dict(args):
}


def measure_performance(model_call, args, compiler):
    """Benchmark `model_call` and return (outputs, timing statistics).

    Runs the model once to obtain outputs, performs `args.warmup` untimed
    warmup calls, then times `args.trials` trials.  On CUDA devices each
    trial records both wall-clock ("e2e") and CUDA-event ("gpu") durations;
    on other devices only wall-clock time is collected.

    Args:
        model_call: zero-argument callable that executes the model once.
        args: namespace providing `warmup`, `trials`, and `device`.
        compiler: backend object exposing `synchronize()` for device sync.

    Returns:
        (outs, stats): the model outputs from the first call, and a dict
        mapping metric name ("e2e" and, on CUDA, "gpu") to the aggregate
        statistics produced by `test_compiler_util.get_timing_stats`.
    """
    stats = {}
    outs = model_call()

    # Warmup runs (excluded from timing; lets compile/JIT caches settle).
    for _ in range(args.warmup):
        model_call()
    compiler.synchronize()

    print(
        f"[Profiling] Warm up {args.warmup}, Trials {args.trials}",
        file=sys.stderr,
        flush=True,
    )

    if "cuda" in args.device:
        # Drop cached allocator blocks so trials start from a clean pool.
        torch.cuda.empty_cache()
        e2e_times = []
        gpu_times = []

        for i in range(args.trials):
            # End-to-end timing (naive_timer)
            duration_box = test_compiler_util.DurationBox(-1)
            with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
                # GPU-only timing (CUDA Events)
                start_event = torch.cuda.Event(enable_timing=True)
                end_event = torch.cuda.Event(enable_timing=True)
                start_event.record()

                model_call()

                end_event.record()
                # Wait for queued kernels so both timers cover the full run.
                compiler.synchronize()

            # elapsed_time is only valid after the synchronize above.
            gpu_time_ms = start_event.elapsed_time(end_event)
            e2e_times.append(duration_box.value)
            gpu_times.append(gpu_time_ms)
            print(
                f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms",
                file=sys.stderr,
                flush=True,
            )

        stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times)
        stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times)

    else:  # CPU or other devices: wall-clock timing only.
        e2e_times = []
        for i in range(args.trials):
            duration_box = test_compiler_util.DurationBox(-1)
            with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
                model_call()
            print(
                f"Trial {i + 1}: e2e={duration_box.value:.5f} ms",
                file=sys.stderr,
                flush=True,
            )
            e2e_times.append(duration_box.value)
        stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times)

    return outs, stats


def eval_single_model_with_single_backend(args):
check_and_complete_args(args)
set_seed(args.seed)
Expand Down
90 changes: 90 additions & 0 deletions graph_net_bench/torch/util/timing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import torch
import sys
from graph_net_bench import test_compiler_util


def measure_performance(model_call, args, compiler):
    """Benchmark `model_call` and return (outputs, timing statistics).

    Runs the model once to obtain outputs, performs `args.warmup` untimed
    warmup calls, then delegates `args.trials` timed trials to a
    device-appropriate trial executor (CUDA-event timing on CUDA devices,
    wall-clock only elsewhere).

    Args:
        model_call: zero-argument callable that executes the model once.
        args: namespace providing `warmup`, `trials`, and `device`.
        compiler: backend object exposing `synchronize()` for device sync.

    Returns:
        (outs, stats): the model outputs from the first call, and a dict
        mapping metric name (e.g. "e2e", "gpu") to the aggregate
        statistics produced by `test_compiler_util.get_timing_stats`.
    """
    # Fix: the original initialized `stats = {}` here, which was dead code —
    # it was unconditionally rebuilt below.
    outs = model_call()

    # Warmup runs (excluded from timing; lets compile/JIT caches settle).
    for _ in range(args.warmup):
        model_call()
    compiler.synchronize()

    print(
        f"[Profiling] Warm up {args.warmup}, Trials {args.trials}",
        file=sys.stderr,
        flush=True,
    )

    if "cuda" in args.device:
        # Drop cached allocator blocks so trials start from a clean pool.
        torch.cuda.empty_cache()
        executor = CUDATrialExecutor(model_call, compiler)
    else:
        executor = NoneCUDATrialExecutor(model_call, compiler)

    timings = run_benchmark(args.trials, executor)

    stats = {
        name: test_compiler_util.get_timing_stats(values)
        for name, values in timings.items()
    }

    return outs, stats


def run_benchmark(trials, executor):
    """Run `trials` timed trials and collect per-metric duration lists.

    Args:
        trials: number of trials to execute.
        executor: object whose `run_one_trial()` returns a dict mapping
            metric name to a duration in milliseconds.

    Returns:
        Dict mapping each metric name to the list of its per-trial values.
    """
    collected = {}
    for trial_no in range(1, trials + 1):
        sample = executor.run_one_trial()
        for metric, value in sample.items():
            if metric not in collected:
                collected[metric] = []
            collected[metric].append(value)
        log_trial(trial_no, sample)
    return collected


def log_trial(idx, timings):
    """Print one trial's timings to stderr, e.g. 'Trial 3: e2e=1.23000 ms'."""
    pieces = []
    for metric, duration in timings.items():
        pieces.append(f"{metric}={duration:.5f} ms")
    msg = ", ".join(pieces)
    print(f"Trial {idx}: {msg}", file=sys.stderr, flush=True)


class BaseTrialExecutor:
    """Strategy interface for running one timed trial of a model call.

    Subclasses implement `run_one_trial` and return a dict mapping metric
    name to a duration in milliseconds.
    """

    def __init__(self, model_call, compiler):
        # Zero-argument callable that runs the model once, plus the
        # backend used for device synchronization.
        self.model_call = model_call
        self.compiler = compiler

    def run_one_trial(self):
        """Execute one trial; must be overridden by subclasses."""
        raise NotImplementedError


class NoneCUDATrialExecutor(BaseTrialExecutor):
    """Trial executor for CPU/non-CUDA devices: wall-clock timing only."""

    def run_one_trial(self):
        """Time one model call end-to-end; returns {"e2e": milliseconds}."""
        box = test_compiler_util.DurationBox(-1)
        timer = test_compiler_util.naive_timer(box, self.compiler.synchronize)
        with timer:
            self.model_call()
        return {"e2e": box.value}


class CUDATrialExecutor(BaseTrialExecutor):
    """Trial executor for CUDA devices: wall-clock plus CUDA-event timing."""

    def run_one_trial(self):
        """Time one model call.

        Returns:
            {"e2e": wall-clock ms from naive_timer,
             "gpu": device ms measured between two CUDA events}.
        """
        box = test_compiler_util.DurationBox(-1)

        begin = torch.cuda.Event(enable_timing=True)
        finish = torch.cuda.Event(enable_timing=True)

        with test_compiler_util.naive_timer(box, self.compiler.synchronize):
            begin.record()
            self.model_call()
            finish.record()
            # Wait for queued kernels inside the timer so the e2e figure
            # covers the full device execution.
            self.compiler.synchronize()

        # elapsed_time is only valid once both events have completed,
        # which the synchronize above guarantees.
        device_ms = begin.elapsed_time(finish)

        return {
            "e2e": box.value,
            "gpu": device_ms,
        }