diff --git a/.gitignore b/.gitignore index 48627d6f2..44c501486 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,5 @@ virtualenv-[0-9]*[0-9] # Files used by run-pylint.sh .pylintrc.yml .run-pylint.py + +.codex diff --git a/examples/python/compute-examples/finite-difference-2-5D.py b/examples/python/compute-examples/finite-difference-2-5D.py new file mode 100644 index 000000000..52fc25a89 --- /dev/null +++ b/examples/python/compute-examples/finite-difference-2-5D.py @@ -0,0 +1,292 @@ +import time + +import namedisl as nisl +import numpy as np +import numpy.linalg as la + +import pyopencl as cl + +import loopy as lp +from loopy.transform.compute import compute +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 + + +def centered_second_derivative_coefficients(radius: int, dtype) -> np.ndarray: + offsets = np.arange(-radius, radius + 1, dtype=dtype) + powers = np.arange(2 * radius + 1) + vandermonde = offsets[np.newaxis, :] ** powers[:, np.newaxis] + rhs = np.zeros(2 * radius + 1, dtype=dtype) + rhs[2] = 2 + + return np.linalg.solve(vandermonde, rhs).astype(dtype) + + +# FIXME: choose a better test case +def f(x, y, z): + return x**2 + y**2 + z**2 + + +def laplacian_f(x, y, z): + return 6 * np.ones_like(x) + + +def benchmark_executor(ex, queue, args, warmup: int, iterations: int) -> float: + if iterations <= 0: + raise ValueError("iterations must be positive") + + evt = None + for _ in range(warmup): + evt, _ = ex(queue, **args) + if evt is not None: + evt.wait() + + start = time.perf_counter() + for _ in range(iterations): + evt, _ = ex(queue, **args) + if evt is not None: + evt.wait() + end = time.perf_counter() + + return (end - start) / iterations + + +def laplacian_flop_count(npts: int, stencil_width: int) -> int: + radius = stencil_width // 2 + output_points = (npts - 2 * radius) ** 3 + return 4 * stencil_width * output_points + + +def main( + npts: int = 64, + stencil_width: int = 5, + use_compute: bool = False, + print_device_code: bool = False, + print_kernel: bool = False, + run_kernel: bool = False, + warmup: int = 3, + iterations: int = 10 + ) -> float | None: + if stencil_width <= 0 or stencil_width % 2 == 0: + raise ValueError("stencil_width must be a positive odd integer") + + pts = np.linspace(-1, 1, num=npts, endpoint=True) + h = pts[1] - pts[0] + + x, y, z = np.meshgrid(*(pts,)*3) + + dtype = np.float64 + x = x.reshape(*(npts,)*3).astype(dtype) + y = y.reshape(*(npts,)*3).astype(dtype) + z = z.reshape(*(npts,)*3).astype(dtype) + + m = stencil_width + r = m // 2 + c = (centered_second_derivative_coefficients(r, dtype) / h**2).astype(dtype) + + bm = bn = 16 + bk = 32 + + knl = lp.make_kernel( + "{ [i, j, k, l] : r <= i, j, k < npts - r and -r <= l < r + 1 }", + """ + u_(is, js, ks) := u[is, js, ks] + + lap_u[i,j,k] = sum( + [l], + c[l+r] * (u_(i-l,j,k) + u_(i,j-l,k) + u_(i,j,k-l)) + ) + """, + [ + lp.GlobalArg("u", dtype=dtype, shape=(npts, npts, npts)), + lp.GlobalArg("lap_u", dtype=dtype, shape=(npts, npts, npts), + is_output=True), + lp.GlobalArg("c", dtype=dtype, shape=(m,)) + ], + lang_version=LOOPY_USE_LANGUAGE_VERSION_2018_2 + ) + + knl = lp.fix_parameters(knl, npts=npts, r=r) + + knl = lp.split_iname(knl, "i", bm, inner_iname="ii", outer_iname="io") + knl = lp.split_iname(knl, "j", bn, inner_iname="ji", outer_iname="jo") + knl = lp.split_iname(knl, "k", bk, inner_iname="ki", outer_iname="ko") + + if use_compute: + plane_map = nisl.make_map(f"""{{ + [is, js, ks] -> [io, ii_s, jo, ji_s, ko, ki] : + is = io * {bm} + ii_s - {r} and + js = jo * {bn} + ji_s - {r} and + ks = ko * {bk} + ki + }}""") + + knl = compute( + knl, + "u_", + compute_map=plane_map, + storage_indices=["ii_s", "ji_s"], + + temporary_name="u_ij_plane", + temporary_address_space=lp.AddressSpace.LOCAL, + temporary_dtype=dtype, + + compute_insn_id="u_plane_compute" + ) + + ring_buffer_map = nisl.make_map(f"""{{ + [is, js, ks] -> [io, ii, jo, ji, ko, ki, kb] : + is = io * {bm} + ii and + js = jo * {bn} + ji and + kb = ks - (ko * {bk} + ki) + {r} + }}""") + + knl = compute( + knl, + "u_", + compute_map=ring_buffer_map, + storage_indices=["kb"], + + temporary_name="u_k_buf", + temporary_address_space=lp.AddressSpace.PRIVATE, + temporary_dtype=dtype, + + compute_insn_id="u_ring_buf_compute", + inames_to_advance=["ki"] + ) + + nt = 16 + knl = lp.split_iname( + knl, "ii_s", nt, outer_iname="ii_s_tile", inner_iname="ii_s_local" + ) + + knl = lp.split_iname( + knl, "ji_s", nt, outer_iname="ji_s_tile", inner_iname="ji_s_local" + ) + + knl = lp.tag_inames(knl, { + # 2D plane compute storage loops + "ii_s_local": "l.1", + "ji_s_local": "l.0", + + # force the use of registers by unrolling + "kb": "unr" + }) + + knl = lp.tag_inames(knl, { + # outer block loops + "io": "g.2", + "jo": "g.1", + "ko": "g.0", + + # inner tile loops + "ii": "l.1", + "ji": "l.0", + }) + + if print_device_code: + print(lp.generate_code_v2(knl).device_code()) + + if print_kernel: + print(knl) + + if not run_kernel: + return None + + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + ex = knl.executor(queue) + + f_vals = f(x, y, z) + + import pyopencl.array as cl_array + f_vals_cl = cl_array.to_device(queue, f_vals) + c_cl = cl_array.to_device(queue, c) + lap_u_cl = cl_array.zeros(queue, (npts,)*3, dtype=f_vals_cl.dtype) + avg_time_per_iter = benchmark_executor( + ex, queue, {"u": f_vals_cl, "c": c_cl, "lap_u": lap_u_cl}, + warmup=warmup, iterations=iterations) + avg_gflops = laplacian_flop_count(npts, stencil_width) / avg_time_per_iter / 1e9 + + _, lap_fd = ex(queue, u=f_vals_cl, c=c_cl, lap_u=lap_u_cl) + lap_true = laplacian_f(x, y, z) + sl = (slice(r, npts - r),)*3 + + rel_err = la.norm(lap_true[sl] - lap_fd[0].get()[sl]) / la.norm(lap_true[sl]) + + print(20 * "=", "Finite difference report", 20 * "=") + print(f"Variant : {'compute' if use_compute else 'baseline'}") + print(f"Grid points : {npts}^3") + print(f"Stencil width: {stencil_width}") + print(f"Iterations : warmup = {warmup}, timed = {iterations}") + print(f"Average time per iteration: {avg_time_per_iter:.6e} s") + print(f"Average throughput: {avg_gflops:.3f} GFLOP/s") + print(f"Relative error: {rel_err:.3e}") + print((40 + len(" Finite difference report ")) * "=") + + return avg_time_per_iter + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + + _ = parser.add_argument("--npoints", action="store", type=int, default=64) + _ = parser.add_argument("--stencil-width", action="store", type=int, default=5) + + _ = parser.add_argument("--compare", action="store_true") + _ = parser.add_argument("--compute", action="store_true") + _ = parser.add_argument("--run-kernel", action="store_true") + _ = parser.add_argument("--no-run-kernel", action="store_false", + dest="run_kernel") + _ = parser.add_argument("--print-device-code", action="store_true") + _ = parser.add_argument("--print-kernel", action="store_true") + _ = parser.add_argument("--warmup", action="store", type=int, default=3) + _ = parser.add_argument("--iterations", action="store", type=int, default=10) + + args = parser.parse_args() + + if args.compare: + print("Running example without compute...") + no_compute_time = main( + npts=args.npoints, + stencil_width=args.stencil_width, + use_compute=False, + print_device_code=args.print_device_code, + print_kernel=args.print_kernel, + run_kernel=True, + warmup=args.warmup, + iterations=args.iterations, + ) + print(50 * "=", "\n") + + print("Running example with compute...") + compute_time = main( + npts=args.npoints, + stencil_width=args.stencil_width, + use_compute=True, + print_device_code=args.print_device_code, + print_kernel=args.print_kernel, + run_kernel=True, + warmup=args.warmup, + iterations=args.iterations, + ) + print(50 * "=", "\n") + + assert no_compute_time is not None + assert compute_time is not None + speedup = no_compute_time / compute_time + print(f"Speedup: {speedup:.3f}x") + time_reduction = (1 - compute_time / no_compute_time) * 100 + print(f"Relative time reduction: {time_reduction:.2f}%") + else: + _ = main( + npts=args.npoints, + stencil_width=args.stencil_width, + use_compute=args.compute, + print_device_code=args.print_device_code, + print_kernel=args.print_kernel, + run_kernel=args.run_kernel, + warmup=args.warmup, + iterations=args.iterations, + ) diff --git a/examples/python/compute-examples/l2p-3d-tensor-product-compute.py b/examples/python/compute-examples/l2p-3d-tensor-product-compute.py new file mode 100644 index 000000000..0a615c5a3 --- /dev/null +++ b/examples/python/compute-examples/l2p-3d-tensor-product-compute.py @@ -0,0 +1,484 @@ +"""Benchmark a 3D Cartesian Taylor L2P microkernel with Loopy compute. + +FMM kernel class: kernel-independent Cartesian Taylor/asymptotic local +expansion evaluation in three spatial dimensions. This script does not +directly evaluate the 3D Laplace, Helmholtz, or biharmonic Green's function. +The dense local coefficient tensor ``gamma`` is assumed to have already been +formed by the relevant FMM translation machinery; this benchmark isolates the +target-side monomial contraction. + +The kernel evaluates a dense 3D tensor-product local expansion at many target +points: + + phi[itgt] = sum_{q0,q1,q2} gamma[q0, q1, q2] + * x[itgt]**q0 / q0! + * y[itgt]**q1 / q1! + * z[itgt]**q2 / q2! + +The baseline variants are GPU-parallel kernels over target blocks that expand +the basis substitutions inline. The compute variants use +:func:`loopy.transform.compute.compute` to materialize the x, y, and z basis +values into private temporaries. The script includes both a direct tiled +compute schedule and an optimized register-tiled schedule, following the style +of Loopy's compute matmul example. + +Use ``--compare`` to run the naive parallel baseline and the optimized compute +kernel, validate both against the NumPy reference, and report timing, modeled +GFLOP/s, speedup, and relative error. +""" + +import os +import time + + +os.environ.setdefault("XDG_CACHE_HOME", "/tmp") + +import namedisl as nisl +import numpy as np +import numpy.linalg as la + +import loopy as lp +from loopy.transform.compute import compute +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 + + +def inv_factorials(order: int, dtype) -> np.ndarray: + result = np.empty(order + 1, dtype=dtype) + result[0] = 1 + for i in range(1, order + 1): + result[i] = result[i - 1] / i + return result + + +def reference_l2p_3d( + gamma: np.ndarray, + x: np.ndarray, + y: np.ndarray, + z: np.ndarray, + inv_fact: np.ndarray + ) -> np.ndarray: + order = gamma.shape[0] - 1 + result = np.empty_like(x) + + for itgt in range(x.size): + acc = 0 + for q0 in range(order + 1): + x_basis = x[itgt]**q0 * inv_fact[q0] + for q1 in range(order + 1): + y_basis = y[itgt]**q1 * inv_fact[q1] + for q2 in range(order + 1): + z_basis = z[itgt]**q2 * inv_fact[q2] + acc += gamma[q0, q1, q2] * x_basis * y_basis * z_basis + result[itgt] = acc + + return result + + +def make_kernel( + ntargets: int, + order: int, + dtype + ) -> lp.TranslationUnit: + knl = lp.make_kernel( + "{ [itgt, q0, q1, q2] : " + "0 <= itgt < ntargets and 0 <= q0, q1, q2 <= p }", + """ + x_basis_(itgt_arg, q0_arg) := ( + x[itgt_arg] ** q0_arg * inv_fact[q0_arg] + ) + + y_basis_(itgt_arg, q1_arg) := ( + y[itgt_arg] ** q1_arg * inv_fact[q1_arg] + ) + + z_basis_(itgt_arg, q2_arg) := ( + z[itgt_arg] ** q2_arg * inv_fact[q2_arg] + ) + + phi[itgt] = sum( + [q0, q1, q2], + gamma[q0, q1, q2] + * x_basis_(itgt, q0) + * y_basis_(itgt, q1) + * z_basis_(itgt, q2) + ) + """, + [ + lp.GlobalArg("x", dtype=dtype, shape=(ntargets,)), + lp.GlobalArg("y", dtype=dtype, shape=(ntargets,)), + lp.GlobalArg("z", dtype=dtype, shape=(ntargets,)), + lp.GlobalArg("inv_fact", dtype=dtype, shape=(order + 1,)), + lp.GlobalArg( + "gamma", + dtype=dtype, + shape=(order + 1, order + 1, order + 1), + ), + lp.GlobalArg("phi", dtype=dtype, shape=(ntargets,), is_output=True), + ], + lang_version=LOOPY_USE_LANGUAGE_VERSION_2018_2, + ) + return lp.fix_parameters(knl, ntargets=ntargets, p=order) + + +def split_targets( + knl: lp.TranslationUnit, + target_block_size: int + ) -> lp.TranslationUnit: + knl = lp.split_iname( + knl, + "itgt", + target_block_size, + inner_iname="itgt_inner", + outer_iname="itgt_block", + ) + return lp.tag_inames(knl, {"itgt_block": "g.0"}) + + +def block_private_l2p_3d( + knl: lp.TranslationUnit, + target_block_size: int, + dtype + ) -> lp.TranslationUnit: + knl = split_targets(knl, target_block_size) + + x_basis_map = nisl.make_map(f"""{{ + [itgt_arg, q0_arg] -> [itgt_block, itgt_s, q0_s] : + itgt_arg = itgt_block * {target_block_size} + itgt_s and + q0_arg = q0_s + }}""") + + knl = compute( + knl, + "x_basis_", + compute_map=x_basis_map, + storage_indices=["itgt_s", "q0_s"], + temporary_name="x_basis_tile", + temporary_address_space=lp.AddressSpace.PRIVATE, + temporary_dtype=dtype, + compute_insn_id="x_basis_compute", + ) + + y_basis_map = nisl.make_map(f"""{{ + [itgt_arg, q1_arg] -> [itgt_block, itgt_s, q1_s] : + itgt_arg = itgt_block * {target_block_size} + itgt_s and + q1_arg = q1_s + }}""") + + knl = compute( + knl, + "y_basis_", + compute_map=y_basis_map, + storage_indices=["itgt_s", "q1_s"], + temporary_name="y_basis_tile", + temporary_address_space=lp.AddressSpace.PRIVATE, + temporary_dtype=dtype, + compute_insn_id="y_basis_compute", + ) + + z_basis_map = nisl.make_map(f"""{{ + [itgt_arg, q2_arg] -> [itgt_block, itgt_s, q2_s] : + itgt_arg = itgt_block * {target_block_size} + itgt_s and + q2_arg = q2_s + }}""") + + return compute( + knl, + "z_basis_", + compute_map=z_basis_map, + storage_indices=["itgt_s", "q2_s"], + temporary_name="z_basis_tile", + temporary_address_space=lp.AddressSpace.PRIVATE, + temporary_dtype=dtype, + compute_insn_id="z_basis_compute", + ) + + +def register_tiled_l2p_3d( + knl: lp.TranslationUnit, + target_block_size: int, + dtype + ) -> lp.TranslationUnit: + knl = split_targets(knl, target_block_size) + + x_basis_map = nisl.make_map(f"""{{ + [itgt_arg, q0_arg] -> [itgt_block, itgt_inner, q0_s] : + itgt_arg = itgt_block * {target_block_size} + itgt_inner and + q0_arg = q0_s + }}""") + + knl = compute( + knl, + "x_basis_", + compute_map=x_basis_map, + storage_indices=["q0_s"], + temporary_name="x_basis_reg", + temporary_address_space=lp.AddressSpace.PRIVATE, + temporary_dtype=dtype, + compute_insn_id="x_basis_compute", + ) + + y_basis_map = nisl.make_map(f"""{{ + [itgt_arg, q1_arg] -> [itgt_block, itgt_inner, q1_s] : + itgt_arg = itgt_block * {target_block_size} + itgt_inner and + q1_arg = q1_s + }}""") + + knl = compute( + knl, + "y_basis_", + compute_map=y_basis_map, + storage_indices=["q1_s"], + temporary_name="y_basis_reg", + temporary_address_space=lp.AddressSpace.PRIVATE, + temporary_dtype=dtype, + compute_insn_id="y_basis_compute", + ) + + z_basis_map = nisl.make_map(f"""{{ + [itgt_arg, q2_arg] -> [itgt_block, itgt_inner, q2_s] : + itgt_arg = itgt_block * {target_block_size} + itgt_inner and + q2_arg = q2_s + }}""") + + knl = compute( + knl, + "z_basis_", + compute_map=z_basis_map, + storage_indices=["q2_s"], + temporary_name="z_basis_reg", + temporary_address_space=lp.AddressSpace.PRIVATE, + temporary_dtype=dtype, + compute_insn_id="z_basis_compute", + ) + + return lp.tag_inames(knl, { + "itgt_inner": "l.0", + "q0_s": "unr", + "q1_s": "unr", + "q2_s": "unr", + "q0": "unr", + "q1": "unr", + "q2": "unr", + }) + + +def operation_model( + ntargets: int, + order: int, + target_block_size: int + ) -> tuple[int, int]: + ncoeff = order + 1 + inline_basis_evals = 3 * ntargets * ncoeff**3 + tiled_compute_basis_evals = 3 * ntargets * ncoeff + return inline_basis_evals, tiled_compute_basis_evals + + +def l2p_3d_flop_count(ntargets: int, order: int, use_compute: bool) -> int: + ncoeff = order + 1 + + contraction_flops = 4 * ntargets * ncoeff**3 + if use_compute: + basis_scale_flops = 3 * ntargets * ncoeff + else: + basis_scale_flops = 3 * ntargets * ncoeff**3 + + return contraction_flops + basis_scale_flops + + +def benchmark_executor(ex, queue, args, warmup: int, iterations: int) -> float: + evt = None + for _ in range(warmup): + evt, _ = ex(queue, **args) + if evt is not None: + evt.wait() + + start = time.perf_counter() + for _ in range(iterations): + evt, _ = ex(queue, **args) + if evt is not None: + evt.wait() + end = time.perf_counter() + + return (end - start) / iterations + + +def run_kernel( + knl: lp.TranslationUnit, + x: np.ndarray, + y: np.ndarray, + z: np.ndarray, + inv_fact: np.ndarray, + gamma: np.ndarray, + warmup: int, + iterations: int + ) -> tuple[np.ndarray, float]: + import pyopencl as cl + import pyopencl.array as cl_array + + ctx = cl.create_some_context(interactive=False) + queue = cl.CommandQueue(ctx) + ex = knl.executor(queue) + + x_cl = cl_array.to_device(queue, x) + y_cl = cl_array.to_device(queue, y) + z_cl = cl_array.to_device(queue, z) + inv_fact_cl = cl_array.to_device(queue, inv_fact) + gamma_cl = cl_array.to_device(queue, gamma) + phi_cl = cl_array.zeros(queue, x.shape, dtype=x.dtype) + + elapsed = benchmark_executor( + ex, + queue, + { + "x": x_cl, + "y": y_cl, + "z": z_cl, + "inv_fact": inv_fact_cl, + "gamma": gamma_cl, + "phi": phi_cl, + }, + warmup=warmup, + iterations=iterations, + ) + + _, out = ex( + queue, x=x_cl, y=y_cl, z=z_cl, inv_fact=inv_fact_cl, + gamma=gamma_cl, phi=phi_cl) + return out[0].get(), elapsed + + +def main( + ntargets: int = 256, + order: int = 8, + target_block_size: int = 32, + use_compute: bool = False, + use_block_private_compute: bool = False, + compare: bool = False, + print_kernel: bool = False, + print_device_code: bool = False, + run: bool = False, + warmup: int = 3, + iterations: int = 10 + ) -> None: + if ntargets % target_block_size: + raise ValueError("ntargets must be divisible by target_block_size") + + dtype = np.float64 + rng = np.random.default_rng(22) + x = rng.uniform(-0.25, 0.25, size=ntargets).astype(dtype) + y = rng.uniform(-0.25, 0.25, size=ntargets).astype(dtype) + z = rng.uniform(-0.25, 0.25, size=ntargets).astype(dtype) + inv_fact = inv_factorials(order, dtype) + gamma = rng.normal(size=(order + 1, order + 1, order + 1)).astype(dtype) + reference = reference_l2p_3d(gamma, x, y, z, inv_fact) + + inline_evals, compute_evals = operation_model( + ntargets, order, target_block_size) + + if compare: + variants = ["inline", "register-tiled compute"] + elif use_block_private_compute: + variants = ["block-private compute"] + elif use_compute: + variants = ["register-tiled compute"] + else: + variants = ["inline"] + + timings: dict[str, float] = {} + for variant in variants: + knl = make_kernel(ntargets, order, dtype) + + if variant == "inline": + knl = split_targets(knl, target_block_size) + knl = lp.tag_inames(knl, { + "itgt_inner": "l.0", + "q0": "unr", + "q1": "unr", + "q2": "unr", + }) + elif variant == "block-private compute": + knl = block_private_l2p_3d(knl, target_block_size, dtype) + elif variant == "register-tiled compute": + knl = register_tiled_l2p_3d(knl, target_block_size, dtype) + else: + raise ValueError(f"unknown variant '{variant}'") + + variant_uses_compute = variant != "inline" + modeled_flops = l2p_3d_flop_count( + ntargets, order, use_compute=variant_uses_compute) + + print(20 * "=", "3D L2P basis report", 20 * "=") + print(f"Variant : {variant}") + print(f"Targets : {ntargets}") + print(f"Order : {order}") + print(f"Target block: {target_block_size}") + print(f"Inline basis evaluations: {inline_evals}") + print(f"Tiled compute evaluations: {compute_evals}") + print(f"Modeled flop count: {modeled_flops}") + + if print_kernel: + print(knl) + + if print_device_code: + print(lp.generate_code_v2(knl).device_code()) + + if run or compare: + try: + result, elapsed = run_kernel( + knl, x, y, z, inv_fact, gamma, + warmup=warmup, iterations=iterations) + except Exception as exc: + print(f"Runtime execution unavailable: {exc}") + else: + rel_err = la.norm(result - reference) / la.norm(reference) + gflops = modeled_flops / elapsed * 1e-9 + timings[variant] = elapsed + print(f"Average time per iteration: {elapsed:.6e} s") + print(f"Modeled throughput: {gflops:.3f} GFLOP/s") + print(f"Relative error: {rel_err:.3e}") + + print((40 + len(" 3D L2P basis report ")) * "=") + + if ( + compare + and "inline" in timings + and "register-tiled compute" in timings): + speedup = timings["inline"] / timings["register-tiled compute"] + time_reduction = ( + 1 - timings["register-tiled compute"] / timings["inline"]) * 100 + print(f"Speedup: {speedup:.3f}x") + print(f"Relative time reduction: {time_reduction:.2f}%") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + _ = parser.add_argument("--ntargets", action="store", type=int, default=256) + _ = parser.add_argument("--order", action="store", type=int, default=8) + _ = parser.add_argument("--target-block-size", action="store", + type=int, default=32) + _ = parser.add_argument("--compute", action="store_true") + _ = parser.add_argument("--block-private-compute", action="store_true") + _ = parser.add_argument("--compare", action="store_true") + _ = parser.add_argument("--run-kernel", action="store_true") + _ = parser.add_argument("--print-kernel", action="store_true") + _ = parser.add_argument("--print-device-code", action="store_true") + _ = parser.add_argument("--warmup", action="store", type=int, default=3) + _ = parser.add_argument("--iterations", action="store", type=int, default=10) + + args = parser.parse_args() + + main( + ntargets=args.ntargets, + order=args.order, + target_block_size=args.target_block_size, + use_compute=args.compute, + use_block_private_compute=args.block_private_compute, + compare=args.compare, + print_kernel=args.print_kernel, + print_device_code=args.print_device_code, + run=args.run_kernel, + warmup=args.warmup, + iterations=args.iterations, + ) diff --git a/examples/python/compute-examples/l2p-tiled-basis-compute.py b/examples/python/compute-examples/l2p-tiled-basis-compute.py new file mode 100644 index 000000000..077024cf0 --- /dev/null +++ b/examples/python/compute-examples/l2p-tiled-basis-compute.py @@ -0,0 +1,350 @@ +"""Benchmark a 2D Cartesian Taylor L2P microkernel with Loopy compute. + +FMM kernel class: kernel-independent Cartesian Taylor/asymptotic local +expansion evaluation. This script does not evaluate a particular Green's +function such as the 2D Laplace or Helmholtz kernel. The local coefficients +``gamma`` are treated as already available; if they came from a Laplace FMM, +this kernel is the L2P monomial-contraction stage after the Laplace-specific +coefficient/derivative work has already happened. + +The kernel evaluates a tensor-product Taylor-like local expansion at many +target points: + + phi[itgt] = sum_{q0,q1} gamma[q0, q1] + * x[itgt]**q0 / q0! + * y[itgt]**q1 / q1! + +The inline variant is a parallel GPU kernel over target blocks that expands the +two basis substitutions at every use inside the coefficient sum. The compute +variant uses :func:`loopy.transform.compute.compute` to materialize the x and y +basis values into private temporaries for each target, so the powers/factorial +scalings are reused across the inner coefficient loops instead of recomputed. + +Use ``--compare`` to run both GPU-parallel variants, check against the NumPy +reference implementation, and report timing, modeled GFLOP/s, speedup, and +relative error. +""" + +import os +import time + + +os.environ.setdefault("XDG_CACHE_HOME", "/tmp") + +import namedisl as nisl +import numpy as np +import numpy.linalg as la + +import loopy as lp +from loopy.transform.compute import compute +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 + + +def inv_factorials(order: int, dtype) -> np.ndarray: + result = np.empty(order + 1, dtype=dtype) + result[0] = 1 + for i in range(1, order + 1): + result[i] = result[i - 1] / i + return result + + +def reference_l2p( + gamma: np.ndarray, + x: np.ndarray, + y: np.ndarray, + inv_fact: np.ndarray + ) -> np.ndarray: + order = gamma.shape[0] - 1 + result = np.empty_like(x) + + for itgt in range(x.size): + acc = 0 + for q0 in range(order + 1): + x_basis = x[itgt]**q0 * inv_fact[q0] + for q1 in range(order + 1): + y_basis = y[itgt]**q1 * inv_fact[q1] + acc += gamma[q0, q1] * x_basis * y_basis + result[itgt] = acc + + return result + + +def make_kernel( + ntargets: int, + order: int, + target_block_size: int, + dtype, + use_compute: bool = False + ) -> lp.TranslationUnit: + if ntargets % target_block_size: + raise ValueError("ntargets must be divisible by target_block_size") + + knl = lp.make_kernel( + "{ [itgt, q0, q1] : 0 <= itgt < ntargets and 0 <= q0, q1 <= p }", + """ + x_basis_(itgt_arg, q0_arg) := ( + x[itgt_arg] ** q0_arg * inv_fact[q0_arg] + ) + + y_basis_(itgt_arg, q1_arg) := ( + y[itgt_arg] ** q1_arg * inv_fact[q1_arg] + ) + + phi[itgt] = sum( + [q0, q1], + gamma[q0, q1] * x_basis_(itgt, q0) * y_basis_(itgt, q1) + ) + """, + [ + lp.GlobalArg("x", dtype=dtype, shape=(ntargets,)), + lp.GlobalArg("y", dtype=dtype, shape=(ntargets,)), + lp.GlobalArg("inv_fact", dtype=dtype, shape=(order + 1,)), + lp.GlobalArg("gamma", dtype=dtype, shape=(order + 1, order + 1)), + lp.GlobalArg("phi", dtype=dtype, shape=(ntargets,), is_output=True), + ], + lang_version=LOOPY_USE_LANGUAGE_VERSION_2018_2, + ) + + knl = lp.fix_parameters(knl, ntargets=ntargets, p=order) + knl = lp.split_iname( + knl, + "itgt", + target_block_size, + inner_iname="itgt_inner", + outer_iname="itgt_block", + ) + + if use_compute: + x_basis_map = nisl.make_map(f"""{{ + [itgt_arg, q0_arg] -> [itgt_block, itgt_inner, q0_s] : + itgt_arg = itgt_block * {target_block_size} + itgt_inner and + q0_arg = q0_s + }}""") + + knl = compute( + knl, + "x_basis_", + compute_map=x_basis_map, + storage_indices=["q0_s"], + temporary_name="x_basis_reg", + temporary_address_space=lp.AddressSpace.PRIVATE, + temporary_dtype=dtype, + compute_insn_id="x_basis_compute", + ) + + y_basis_map = nisl.make_map(f"""{{ + [itgt_arg, q1_arg] -> [itgt_block, itgt_inner, q1_s] : + itgt_arg = itgt_block * {target_block_size} + itgt_inner and + q1_arg = q1_s + }}""") + + knl = compute( + knl, + "y_basis_", + compute_map=y_basis_map, + storage_indices=["q1_s"], + temporary_name="y_basis_reg", + temporary_address_space=lp.AddressSpace.PRIVATE, + temporary_dtype=dtype, + compute_insn_id="y_basis_compute", + ) + + iname_tags = { + "itgt_block": "g.0", + "itgt_inner": "l.0", + "q0": "unr", + "q1": "unr", + } + if use_compute: + iname_tags.update({ + "q0_s": "unr", + "q1_s": "unr", + }) + + knl = lp.tag_inames(knl, iname_tags) + return knl + + +def operation_model( + ntargets: int, + order: int, + target_block_size: int + ) -> tuple[int, int]: + ncoeff = order + 1 + inline_basis_evals = 2 * ntargets * ncoeff**2 + tiled_compute_basis_evals = 2 * ntargets * ncoeff + return inline_basis_evals, tiled_compute_basis_evals + + +def l2p_flop_count(ntargets: int, order: int, use_compute: bool) -> int: + ncoeff = order + 1 + + contraction_flops = 3 * ntargets * ncoeff**2 + if use_compute: + basis_scale_flops = 2 * ntargets * ncoeff + else: + basis_scale_flops = 2 * ntargets * ncoeff**2 + + return contraction_flops + basis_scale_flops + + +def benchmark_executor(ex, queue, args, warmup: int, iterations: int) -> float: + evt = None + for _ in range(warmup): + evt, _ = ex(queue, **args) + if evt is not None: + evt.wait() + + start = time.perf_counter() + for _ in range(iterations): + evt, _ = ex(queue, **args) + if evt is not None: + evt.wait() + end = time.perf_counter() + + return (end - start) / iterations + + +def run_kernel( + knl: lp.TranslationUnit, + x: np.ndarray, + y: np.ndarray, + inv_fact: np.ndarray, + gamma: np.ndarray, + warmup: int, + iterations: int + ) -> tuple[np.ndarray, float]: + import pyopencl as cl + import pyopencl.array as cl_array + + ctx = cl.create_some_context(interactive=False) + queue = cl.CommandQueue(ctx) + ex = knl.executor(queue) + + x_cl = cl_array.to_device(queue, x) + y_cl = cl_array.to_device(queue, y) + inv_fact_cl = cl_array.to_device(queue, inv_fact) + gamma_cl = cl_array.to_device(queue, gamma) + phi_cl = cl_array.zeros(queue, x.shape, dtype=x.dtype) + + elapsed = benchmark_executor( + ex, + queue, + { + "x": x_cl, + "y": y_cl, + "inv_fact": inv_fact_cl, + "gamma": gamma_cl, + "phi": phi_cl, + }, + warmup=warmup, + iterations=iterations, + ) + + _, out = ex( + queue, x=x_cl, y=y_cl, inv_fact=inv_fact_cl, + gamma=gamma_cl, phi=phi_cl) + return out[0].get(), elapsed + + +def main( + ntargets: int = 256, + order: int = 12, + target_block_size: int = 32, + use_compute: bool = False, + compare: bool = False, + print_kernel: bool = False, + print_device_code: bool = False, + run: bool = False, + warmup: int = 3, + iterations: int = 10 + ) -> None: + dtype = np.float64 + rng = np.random.default_rng(14) + x = rng.uniform(-0.25, 0.25, size=ntargets).astype(dtype) + y = rng.uniform(-0.25, 0.25, size=ntargets).astype(dtype) + inv_fact = inv_factorials(order, dtype) + gamma = rng.normal(size=(order + 1, order + 1)).astype(dtype) + reference = reference_l2p(gamma, x, y, inv_fact) + + inline_evals, compute_evals = operation_model( + ntargets, order, target_block_size) + + variants = [False, True] if compare else [use_compute] + timings: dict[bool, float] = {} + for variant_uses_compute in variants: + knl = make_kernel( + ntargets, order, target_block_size, dtype, + use_compute=variant_uses_compute) + modeled_flops = l2p_flop_count( + ntargets, order, use_compute=variant_uses_compute) + + print(20 * "=", "L2P basis report", 20 * "=") + print(f"Variant : {'tiled compute' if variant_uses_compute else 'inline'}") + print(f"Targets : {ntargets}") + print(f"Order : {order}") + print(f"Target block: {target_block_size}") + print(f"Inline basis evaluations: {inline_evals}") + print(f"Tiled compute evaluations: {compute_evals}") + print(f"Modeled flop count: {modeled_flops}") + + if print_kernel: + print(knl) + + if print_device_code: + print(lp.generate_code_v2(knl).device_code()) + + if run or compare: + try: + result, elapsed = run_kernel( + knl, x, y, inv_fact, gamma, + warmup=warmup, iterations=iterations) + except Exception as exc: + print(f"Runtime execution unavailable: {exc}") + else: + rel_err = la.norm(result - reference) / la.norm(reference) + gflops = modeled_flops / elapsed * 1e-9 + timings[variant_uses_compute] = elapsed + print(f"Average time per iteration: {elapsed:.6e} s") + print(f"Modeled throughput: {gflops:.3f} GFLOP/s") + print(f"Relative error: {rel_err:.3e}") + + print((40 + len(" L2P basis report ")) * "=") + + if compare and False in timings and True in timings: + speedup = timings[False] / timings[True] + time_reduction = (1 - timings[True] / timings[False]) * 100 + print(f"Speedup: {speedup:.3f}x") + print(f"Relative time reduction: {time_reduction:.2f}%") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + _ = parser.add_argument("--ntargets", action="store", type=int, default=256) + _ = parser.add_argument("--order", action="store", type=int, default=12) + _ = parser.add_argument("--target-block-size", action="store", + type=int, default=32) + _ = parser.add_argument("--compute", action="store_true") + _ = parser.add_argument("--compare", action="store_true") + _ = parser.add_argument("--run-kernel", action="store_true") + _ = parser.add_argument("--print-kernel", action="store_true") + _ = parser.add_argument("--print-device-code", action="store_true") + _ = parser.add_argument("--warmup", action="store", type=int, default=3) + _ = parser.add_argument("--iterations", action="store", type=int, default=10) + + args = parser.parse_args() + + main( + ntargets=args.ntargets, + order=args.order, + target_block_size=args.target_block_size, + use_compute=args.compute, + compare=args.compare, + print_kernel=args.print_kernel, + print_device_code=args.print_device_code, + run=args.run_kernel, + warmup=args.warmup, + iterations=args.iterations, + ) diff --git a/examples/python/compute-examples/m2m-sum-factorization.py b/examples/python/compute-examples/m2m-sum-factorization.py new file mode 100644 index 000000000..0ca11b84e --- /dev/null +++ b/examples/python/compute-examples/m2m-sum-factorization.py @@ -0,0 +1,641 @@ +"""Benchmark compressed Cartesian Taylor M2M sum factorization with compute. + +FMM kernel class: compressed Cartesian Taylor/asymptotic multipole-to-multipole +translation. This is the binomial center-shift part of an FMM translation, not +a direct evaluation of a Green's function such as Laplace, Helmholtz, or +biharmonic. The translation weights are powers of the center displacement +divided by factorials; PDE-specific derivative generation, compression +matrices, and recompression are outside this microbenchmark. + +The stored-index pattern here is intentionally simple. The 2D mode stores the +two coordinate axes, and the 3D mode stores the three coordinate axes. That +captures the sum-factorized structure from Section 4.2.3, but it is not the +full compressed 3D Laplace stored set, which would retain PDE-derived +hyperplane layers with O(p**2) stored coefficients rather than only O(p) axis +coefficients. + +This script models a multipole-to-multipole-like translation in 2D or 3D where +the input expansion is stored only on the coordinate axes. For 2D, the stored +coefficients are ``beta[zeta0, 0]`` and ``beta[0, zeta1]``. For 3D, they are +``beta[zeta0, 0, 0]``, ``beta[0, zeta1, 0]``, and ``beta[0, 0, zeta2]``. The +output still fills the full tensor-product coefficient grid. + +The inline variant is a GPU-parallel kernel over output coefficient indices +that expands the one-dimensional translation sums at each output. The compute +variant uses :func:`loopy.transform.compute.compute` to materialize those axis +sums into private temporaries and reuses them across an ILP tile of the last +output axis. In 2D it tiles ``eta1``; in 3D it tiles ``eta2`` while reusing the +``eta0`` and ``eta1`` axis sums across that register tile. + +Use ``--dimension 2`` or ``--dimension 3`` to choose the kernel. Use +``--compare`` to run both GPU-parallel variants, check against the NumPy +reference implementation, and report timing, modeled GFLOP/s, speedup, and +relative error. +""" + +import os +import time + + +os.environ.setdefault("XDG_CACHE_HOME", "/tmp") + +import namedisl as nisl +import numpy as np +import numpy.linalg as la + +import pymbolic.primitives as p + +import loopy as lp +import loopy.transform.compute as compute_mod +from loopy.symbolic import DependencyMapper +from loopy.transform.compute import compute +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 + + +def enable_compute_for_reduction_substitutions() -> None: + """Let compute inspect substitution rules whose expressions are reductions.""" + + def gather_vars(expr): + deps = DependencyMapper()(expr) + var_names = set() + for dep in deps: + if isinstance(dep, p.Variable): + var_names.add(dep.name) + elif ( + isinstance(dep, p.Subscript) + and isinstance(dep.aggregate, p.Variable)): + var_names.add(dep.aggregate.name) + + return var_names + + compute_mod._gather_vars = gather_vars + + +def translation_weights(h: np.ndarray, order: int) -> np.ndarray: + weights = np.empty((len(h), order + 1), dtype=h.dtype) + weights[:, 0] = 1 + + for axis in range(len(h)): + for n in range(1, order + 1): + weights[axis, n] = weights[axis, n - 1] * h[axis] / n + + return weights + + +def make_axis_compressed_coefficients( + order: int, + dimension: int, + dtype + ) -> np.ndarray: + rng = np.random.default_rng(12) + beta = np.zeros(dimension * (order + 1,), dtype=dtype) + + for axis in range(dimension): + axis_slice = [0] * dimension + axis_slice[axis] = slice(None) + beta[tuple(axis_slice)] = rng.normal(size=order + 1) + + beta[(0,) * dimension] = rng.normal() + + return beta + + +def reference_axis_m2m_2d(beta: np.ndarray, weights: np.ndarray) -> np.ndarray: + order = beta.shape[0] - 1 + sigma = np.empty_like(beta) + + for eta0 in range(order + 1): + for eta1 in range(order + 1): + acc = 0 + + for zeta1 in range(eta1 + 1): + acc += ( + weights[0, eta0] + * weights[1, eta1 - zeta1] + * beta[0, zeta1] + ) + + for zeta0 in range(eta0 + 1): + acc += ( + weights[0, eta0 - zeta0] + * weights[1, eta1] + * beta[zeta0, 0] + ) + + acc -= weights[0, eta0] * weights[1, eta1] * beta[0, 0] + sigma[eta0, eta1] = acc + + return sigma + + +def reference_axis_m2m_3d(beta: np.ndarray, weights: np.ndarray) -> np.ndarray: + order = beta.shape[0] - 1 + sigma = np.empty_like(beta) + + for eta0 in range(order + 1): + for eta1 in range(order + 1): + for eta2 in range(order + 1): + acc = 0 + + for zeta0 in range(eta0 + 1): + acc += ( + weights[0, eta0 - zeta0] + * weights[1, eta1] + * weights[2, eta2] + * beta[zeta0, 0, 0] + ) + + for zeta1 in range(eta1 + 1): + acc += ( + weights[0, eta0] + * weights[1, eta1 - zeta1] + * weights[2, eta2] + * beta[0, zeta1, 0] + ) + + for zeta2 in range(eta2 + 1): + acc += ( + weights[0, eta0] + * weights[1, eta1] + * weights[2, eta2 - zeta2] + * beta[0, 0, zeta2] + ) + + acc -= ( + 2 + * weights[0, eta0] + * weights[1, eta1] + * weights[2, eta2] + * beta[0, 0, 0] + ) + sigma[eta0, eta1, eta2] = acc + + return sigma + + +def make_kernel_2d( + order: int, + eta_tile_size: int, + dtype, + use_compute: bool = False + ) -> lp.TranslationUnit: + if (order + 1) % eta_tile_size: + raise ValueError("order + 1 must be divisible by eta_tile_size") + + knl = lp.make_kernel( + "{ [eta0, eta1, zeta0, zeta1] : 0 <= eta0, eta1, zeta0, zeta1 <= p }", + """ + x_axis_sum_(eta0_arg) := sum( + [zeta0], + if( + zeta0 <= eta0_arg, + w[0, eta0_arg - zeta0] * beta[zeta0, 0], + 0 + ) + ) + + y_axis_sum_(eta1_arg) := sum( + [zeta1], + if( + zeta1 <= eta1_arg, + w[1, eta1_arg - zeta1] * beta[0, zeta1], + 0 + ) + ) + + sigma[eta0, eta1] = ( + w[0, eta0] * y_axis_sum_(eta1) + + w[1, eta1] * x_axis_sum_(eta0) + - w[0, eta0] * w[1, eta1] * beta[0, 0] + ) + """, + [ + lp.GlobalArg("beta", dtype=dtype, shape=(order + 1, order + 1)), + lp.GlobalArg("w", dtype=dtype, shape=(2, order + 1)), + lp.GlobalArg("sigma", dtype=dtype, shape=(order + 1, order + 1), + is_output=True), + ], + lang_version=LOOPY_USE_LANGUAGE_VERSION_2018_2, + ) + + knl = lp.fix_parameters(knl, p=order) + knl = lp.split_iname( + knl, + "eta1", + eta_tile_size, + inner_iname="eta1_inner", + outer_iname="eta1_block", + ) + + if use_compute: + x_axis_sum_map = nisl.make_map("""{ + [eta0_arg] -> [eta0, x_slot] : + eta0_arg = eta0 and x_slot = 0 + }""") + knl = compute( + knl, + "x_axis_sum_", + compute_map=x_axis_sum_map, + storage_indices=["x_slot"], + placement_inames=["eta1_block"], + temporary_name="x_axis_sum_vec", + temporary_address_space=lp.AddressSpace.PRIVATE, + temporary_dtype=dtype, + compute_insn_id="x_axis_sum_compute", + ) + + y_axis_sum_map = nisl.make_map(f"""{{ + [eta1_arg] -> [eta1_block, y_slot] : + eta1_arg = eta1_block * {eta_tile_size} + y_slot + }}""") + knl = compute( + knl, + "y_axis_sum_", + compute_map=y_axis_sum_map, + storage_indices=["y_slot"], + placement_inames=["eta0"], + temporary_name="y_axis_sum_vec", + temporary_address_space=lp.AddressSpace.PRIVATE, + temporary_dtype=dtype, + compute_insn_id="y_axis_sum_compute", + ) + knl = lp.tag_inames(knl, { + "x_slot": "unr", + "y_slot": "unr", + }) + + knl = lp.tag_inames(knl, { + "eta0": "g.1", + "eta1_block": "g.0", + "eta1_inner": "ilp", + }) + return knl + + +def make_kernel_3d( + order: int, + eta_tile_size: int, + dtype, + use_compute: bool = False + ) -> lp.TranslationUnit: + if (order + 1) % eta_tile_size: + raise ValueError("order + 1 must be divisible by eta_tile_size") + + knl = lp.make_kernel( + """ + { [eta0, eta1, eta2, zeta0, zeta1, zeta2] : + 0 <= eta0, eta1, eta2, zeta0, zeta1, zeta2 <= p } + """, + """ + x_axis_sum_(eta0_arg) := sum( + [zeta0], + if( + zeta0 <= eta0_arg, + w[0, eta0_arg - zeta0] * beta[zeta0, 0, 0], + 0 + ) + ) + + y_axis_sum_(eta1_arg) := sum( + [zeta1], + if( + zeta1 <= eta1_arg, + w[1, eta1_arg - zeta1] * beta[0, zeta1, 0], + 0 + ) + ) + + z_axis_sum_(eta2_arg) := sum( + [zeta2], + if( + zeta2 <= eta2_arg, + w[2, eta2_arg - zeta2] * beta[0, 0, zeta2], + 0 + ) + ) + + sigma[eta0, eta1, eta2] = ( + w[1, eta1] * w[2, eta2] * x_axis_sum_(eta0) + + w[0, eta0] * w[2, eta2] * y_axis_sum_(eta1) + + w[0, eta0] * w[1, eta1] * z_axis_sum_(eta2) + - 2 * w[0, eta0] * w[1, eta1] * w[2, eta2] * beta[0, 0, 0] + ) + """, + [ + lp.GlobalArg( + "beta", dtype=dtype, shape=(order + 1, order + 1, order + 1)), + lp.GlobalArg("w", dtype=dtype, shape=(3, order + 1)), + lp.GlobalArg( + "sigma", dtype=dtype, + shape=(order + 1, order + 1, order + 1), + is_output=True), + ], + lang_version=LOOPY_USE_LANGUAGE_VERSION_2018_2, + ) + + knl = lp.fix_parameters(knl, p=order) + knl = lp.split_iname( + knl, + "eta2", + eta_tile_size, + inner_iname="eta2_inner", + outer_iname="eta2_block", + ) + + if use_compute: + x_axis_sum_map = nisl.make_map(""" + { + [eta0_arg] -> [eta0, x_slot] : + eta0_arg = eta0 and x_slot = 0 + } + """) + knl = compute( + knl, + "x_axis_sum_", + compute_map=x_axis_sum_map, + storage_indices=["x_slot"], + placement_inames=["eta1", "eta2_block"], + temporary_name="x_axis_sum_vec", + temporary_address_space=lp.AddressSpace.PRIVATE, + temporary_dtype=dtype, + compute_insn_id="x_axis_sum_compute", + ) + + y_axis_sum_map = nisl.make_map(""" + { + [eta1_arg] -> [eta1, y_slot] : + eta1_arg = eta1 and y_slot = 0 + } + """) + knl = compute( + knl, + "y_axis_sum_", + compute_map=y_axis_sum_map, + storage_indices=["y_slot"], + placement_inames=["eta0", "eta2_block"], + temporary_name="y_axis_sum_vec", + temporary_address_space=lp.AddressSpace.PRIVATE, + temporary_dtype=dtype, + compute_insn_id="y_axis_sum_compute", + ) + + z_axis_sum_map = nisl.make_map(f"""{{ + [eta2_arg] -> [eta2_block, z_slot] : + eta2_arg = eta2_block * {eta_tile_size} + z_slot + }}""") + knl = compute( + knl, + "z_axis_sum_", + compute_map=z_axis_sum_map, + storage_indices=["z_slot"], + placement_inames=["eta0", "eta1"], + temporary_name="z_axis_sum_vec", + temporary_address_space=lp.AddressSpace.PRIVATE, + temporary_dtype=dtype, + compute_insn_id="z_axis_sum_compute", + ) + knl = lp.tag_inames(knl, { + "x_slot": "unr", + "y_slot": "unr", + "z_slot": "unr", + }) + + knl = lp.tag_inames(knl, { + "eta0": "g.2", + "eta1": "g.1", + "eta2_block": "g.0", + "eta2_inner": "ilp", + }) + return knl + + +def make_kernel( + order: int, + eta_tile_size: int, + dimension: int, + dtype, + use_compute: bool = False + ) -> lp.TranslationUnit: + if dimension == 2: + return make_kernel_2d(order, eta_tile_size, dtype, use_compute) + if dimension == 3: + return make_kernel_3d(order, eta_tile_size, dtype, use_compute) + raise ValueError("dimension must be 2 or 3") + + +def reference_axis_m2m(beta: np.ndarray, weights: np.ndarray) -> np.ndarray: + dimension = beta.ndim + if dimension == 2: + return reference_axis_m2m_2d(beta, weights) + if dimension == 3: + return reference_axis_m2m_3d(beta, weights) + raise ValueError("dimension must be 2 or 3") + + +def operation_model( + order: int, + eta_tile_size: int, + dimension: int + ) -> tuple[int, int]: + ncoeff = order + 1 + if dimension == 2: + inline_sum_terms = 2 * ncoeff**3 + tiled_compute_sum_terms = ncoeff**3 + ncoeff**3 // eta_tile_size + elif dimension == 3: + inline_sum_terms = 3 * ncoeff**4 + tiled_compute_sum_terms = ncoeff**4 + 2 * ncoeff**4 // eta_tile_size + else: + raise ValueError("dimension must be 2 or 3") + return inline_sum_terms, tiled_compute_sum_terms + + +def m2m_flop_count( + order: int, + eta_tile_size: int, + dimension: int, + use_compute: bool + ) -> int: + ncoeff = order + 1 + + if dimension == 2: + if use_compute: + sum_flops = 2 * ncoeff**3 + 2 * ncoeff**3 // eta_tile_size + else: + sum_flops = 4 * ncoeff**3 + correction_flops = 3 * ncoeff**2 + elif dimension == 3: + if use_compute: + sum_flops = 2 * ncoeff**4 + 4 * ncoeff**4 // eta_tile_size + else: + sum_flops = 6 * ncoeff**4 + correction_flops = 8 * ncoeff**3 + else: + raise ValueError("dimension must be 2 or 3") + + return sum_flops + correction_flops + + +def benchmark_executor(ex, queue, args, warmup: int, iterations: int) -> float: + if iterations <= 0: + raise ValueError("iterations must be positive") + + evt = None + for _ in range(warmup): + evt, _ = ex(queue, **args) + if evt is not None: + evt.wait() + + start = time.perf_counter() + for _ in range(iterations): + evt, _ = ex(queue, **args) + if evt is not None: + evt.wait() + end = time.perf_counter() + + return (end - start) / iterations + + +def run_kernel( + knl: lp.TranslationUnit, + beta: np.ndarray, + weights: np.ndarray, + warmup: int, + iterations: int + ) -> tuple[np.ndarray, float]: + import pyopencl as cl + import pyopencl.array as cl_array + + ctx = cl.create_some_context(interactive=False) + queue = cl.CommandQueue(ctx) + ex = knl.executor(queue) + + beta_cl = cl_array.to_device(queue, beta) + weights_cl = cl_array.to_device(queue, weights) + sigma_cl = cl_array.zeros(queue, beta.shape, dtype=beta.dtype) + + elapsed = benchmark_executor( + ex, + queue, + {"beta": beta_cl, "w": weights_cl, "sigma": sigma_cl}, + warmup=warmup, + iterations=iterations, + ) + + _, out = ex(queue, beta=beta_cl, w=weights_cl, sigma=sigma_cl) + return out[0].get(), elapsed + + +def main( + order: int = 16, + eta_tile_size: int = 8, + dimension: int = 2, + use_compute: bool = False, + compare: bool = False, + print_kernel: bool = False, + print_device_code: bool = False, + run: bool = False, + warmup: int = 3, + iterations: int = 10 + ) -> None: + if order < 0: + raise ValueError("order must be nonnegative") + if dimension not in (2, 3): + raise ValueError("dimension must be 2 or 3") + + dtype = np.float64 + h = np.array([0.25, -0.2, 0.15][:dimension], dtype=dtype) + weights = translation_weights(h, order) + beta = make_axis_compressed_coefficients(order, dimension, dtype) + reference = reference_axis_m2m(beta, weights) + + inline_terms, compute_terms = operation_model( + order, eta_tile_size, dimension) + + variants = [False, True] if compare else [use_compute] + timings: dict[bool, float] = {} + for variant_uses_compute in variants: + knl = make_kernel( + order, eta_tile_size, dimension, dtype, + use_compute=variant_uses_compute) + modeled_flops = m2m_flop_count( + order, eta_tile_size, dimension, + use_compute=variant_uses_compute) + + print(20 * "=", "Compressed M2M report", 20 * "=") + variant_name = ( + "compute sum-factorized" if variant_uses_compute else "inline" + ) + print(f"Variant: {variant_name}") + print(f"Dimension: {dimension}D") + print(f"Order : {order}") + print(f"Eta tile: {eta_tile_size}") + if dimension == 2: + print("Stored compressed set: zeta0 = 0 or zeta1 = 0") + else: + print("Stored compressed set: exactly one zeta axis may be nonzero") + print(f"Inline sum terms : {inline_terms}") + print(f"Tiled compute sum terms: {compute_terms}") + print(f"Modeled flop count : {modeled_flops}") + + if print_kernel: + print(knl) + + if print_device_code: + print(lp.generate_code_v2(knl).device_code()) + + if run or compare: + try: + result, elapsed = run_kernel( + knl, beta, weights, warmup=warmup, iterations=iterations) + except Exception as exc: + print(f"Runtime execution unavailable: {exc}") + else: + rel_err = la.norm(result - reference) / la.norm(reference) + gflops = modeled_flops / elapsed * 1e-9 + timings[variant_uses_compute] = elapsed + print(f"Average time per iteration: {elapsed:.6e} s") + print(f"Modeled throughput: {gflops:.3f} GFLOP/s") + print(f"Relative error: {rel_err:.3e}") + + print((40 + len(" Compressed M2M report ")) * "=") + + if compare and False in timings and True in timings: + speedup = timings[False] / timings[True] + time_reduction = (1 - timings[True] / timings[False]) * 100 + print(f"Speedup: {speedup:.3f}x") + print(f"Relative time reduction: {time_reduction:.2f}%") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + + _ = parser.add_argument("--order", action="store", type=int, default=16) + _ = parser.add_argument("--eta-tile-size", action="store", type=int, default=8) + _ = parser.add_argument("--dimension", action="store", type=int, choices=(2, 3), + default=2) + _ = parser.add_argument("--dim", action="store", type=int, choices=(2, 3), + dest="dimension") + _ = parser.add_argument("--compute", action="store_true") + _ = parser.add_argument("--compare", action="store_true") + _ = parser.add_argument("--run-kernel", action="store_true") + _ = parser.add_argument("--print-kernel", action="store_true") + _ = parser.add_argument("--print-device-code", action="store_true") + _ = parser.add_argument("--warmup", action="store", type=int, default=3) + _ = parser.add_argument("--iterations", action="store", type=int, default=10) + + args = parser.parse_args() + + main( + order=args.order, + eta_tile_size=args.eta_tile_size, + dimension=args.dimension, + use_compute=args.compute, + compare=args.compare, + print_kernel=args.print_kernel, + print_device_code=args.print_device_code, + run=args.run_kernel, + warmup=args.warmup, + iterations=args.iterations, + ) diff --git a/examples/python/compute-examples/matmul.py b/examples/python/compute-examples/matmul.py new file mode 100644 index 000000000..1bcdba6f1 --- /dev/null +++ b/examples/python/compute-examples/matmul.py @@ -0,0 +1,399 @@ + +import namedisl as nisl +import numpy as np +import numpy.linalg as la + +import pyopencl as cl +import pyopencl.array as cl_array + +import loopy as lp +from loopy.transform.compute import compute + + +def benchmark_kernel( + knl: lp.TranslationUnit, + queue: cl.CommandQueue, + a: np.ndarray, + b: np.ndarray, + nwarmup: int = 5, + niterations: int = 20 +): + ex = knl.executor(queue) + + a_cl = cl_array.to_device(queue, a) + b_cl = cl_array.to_device(queue, b) + c_cl = cl_array.zeros(queue, (a.shape[0], b.shape[1]), dtype=a_cl.dtype) + + start = cl.enqueue_marker(queue) + for _ in range(nwarmup): + ex(queue, a=a_cl, b=b_cl, c=c_cl) + end = cl.enqueue_marker(queue) + end.wait() + start.wait() + + start = cl.enqueue_marker(queue) + for _ in range(niterations): + ex(queue, a=a_cl, b=b_cl, c=c_cl) + end = cl.enqueue_marker(queue) + end.wait() + start.wait() + + total_ns = end.profile.end - start.profile.end + total_elapsed_s = total_ns * 1e-9 + s_per_iter = total_elapsed_s / niterations + + total_flops = 2 * a.shape[0] * a.shape[1] * b.shape[1] + gflops = (total_flops / s_per_iter) * 1e-9 + + c_ref = a @ b + _, c_res = ex(queue, a=a_cl, b=b_cl, c=c_cl) + + error = la.norm(c_res[0].get() - c_ref) / la.norm(c_ref) + + m, k = a.shape + _, n = b.shape + print("================= Results =================") + print(f"M = {m}, N = {n}, K = {k}") + print(f" Error = {error:.4}") + print(f" Total time (s): {total_elapsed_s:.4}") + print(f"Time per iter (s): {s_per_iter:.4}") + print(f" GFLOP/s: {gflops}") + print("===========================================") + + +def naive_matmul( + knl: lp.TranslationUnit, + bm: int, + bn: int, + bk: int + ) -> lp.TranslationUnit: + knl = lp.split_iname(knl, "i", bm, inner_iname="ii", outer_iname="io") + knl = lp.split_iname(knl, "j", bn, inner_iname="ji", outer_iname="jo") + knl = lp.split_iname(knl, "k", bk, inner_iname="ki", outer_iname="ko") + + iname_tags = { + "io": "g.1", + "jo": "g.0", + + "ii": "l.1", + "ji": "l.0" + } + + return lp.tag_inames(knl, iname_tags) + + +def shared_memory_tiled_matmul( + knl: lp.TranslationUnit, + bm: int, + bn: int, + bk: int + ) -> lp.TranslationUnit: + knl = lp.split_iname(knl, "i", bm, inner_iname="ii", outer_iname="io") + knl = lp.split_iname(knl, "j", bn, inner_iname="ji", outer_iname="jo") + knl = lp.split_iname(knl, "k", bk, inner_iname="ki", outer_iname="ko") + + compute_map_a = nisl.make_map(f"""{{ + [is, ks] -> [a_ii, io, a_ki, ko] : + is = io * {bm} + a_ii and + ks = ko * {bk} + a_ki + }}""") + + compute_map_b = nisl.make_map(f"""{{ + [ks, js] -> [b_ki, ko, b_ji, jo] : + js = jo * {bn} + b_ji and + ks = ko * {bk} + b_ki + }}""") + + knl = compute( + knl, + "a_", + compute_map=compute_map_a, + storage_indices=["a_ii", "a_ki"], + placement_inames=["jo"], + temporary_name="a_tile", + temporary_address_space=lp.AddressSpace.LOCAL, + compute_insn_id="a_load" + ) + + knl = compute( + knl, + "b_", + compute_map=compute_map_b, + storage_indices=["b_ki", "b_ji"], + placement_inames=["io"], + temporary_name="b_tile", + temporary_address_space=lp.AddressSpace.LOCAL, + compute_insn_id="b_load" + ) + + iname_tags = { + "io": "g.1", + "ii": "l.1", + + "jo": "g.0", + "ji": "l.0", + + "a_ii": "l.1", + "a_ki": "l.0", + + "b_ki": "l.1", + "b_ji": "l.0" + } + + return lp.tag_inames(knl, iname_tags) + + +def register_tiled_matmul( + knl: lp.TranslationUnit, + bm: int, + bn: int, + bk: int, + tm: int, + tn: int + ) -> lp.TranslationUnit: + + # {{{ shared-memory-level split / compute + + knl = lp.split_iname(knl, "i", bm, inner_iname="ii", outer_iname="io") + knl = lp.split_iname(knl, "j", bn, inner_iname="ji", outer_iname="jo") + knl = lp.split_iname(knl, "k", bk, inner_iname="ki", outer_iname="ko") + + compute_map_a = nisl.make_map(f"""{{ + [is, ks] -> [a_ii, io, a_ki, ko] : + is = io * {bm} + a_ii and + ks = ko * {bk} + a_ki + }}""") + + compute_map_b = nisl.make_map(f"""{{ + [ks, js] -> [b_ki, ko, b_ji, jo] : + js = jo * {bn} + b_ji and + ks = ko * {bk} + b_ki + }}""") + + knl = compute( + knl, + "a_", + compute_map=compute_map_a, + storage_indices=["a_ii", "a_ki"], + placement_inames=["jo"], + temporary_name="a_smem", + temporary_address_space=lp.AddressSpace.LOCAL, + compute_insn_id="a_load" + ) + + knl = compute( + knl, + "b_", + compute_map=compute_map_b, + storage_indices=["b_ki", "b_ji"], + placement_inames=["io"], + temporary_name="b_smem", + temporary_address_space=lp.AddressSpace.LOCAL, + compute_insn_id="b_load" + ) + + wg_size_i = bm // tm + wg_size_j = bn // tn + knl = lp.split_iname( + knl, + "a_ii", + wg_size_i, + inner_iname="a_local", + outer_iname="a_tile" + ) + + knl = lp.split_iname( + knl, + "b_ji", + wg_size_j, + inner_iname="b_local", + outer_iname="b_tile" + ) + + # }}} + + # {{{ register-level split / compute + + knl = lp.extract_subst( + knl, + "a_smem_", + "a_smem[is, ks]", + parameters="is, ks" + ) + + knl = lp.extract_subst( + knl, + "b_smem_", + "b_smem[ks, js]", + parameters="ks, js" + ) + + knl = lp.split_iname(knl, "ii", tm, + inner_iname="ii_reg", + outer_iname="ii_thr") + + knl = lp.split_iname(knl, "ji", tn, + inner_iname="ji_reg", + outer_iname="ji_thr") + + knl = lp.split_iname(knl, "ki", 8, + inner_iname="dot", + outer_iname="ki_outer") + + a_reg_tile = nisl.make_map(f"""{{ + [is, ks] -> [a_reg_i, ii_thr, ki_outer, dot] : + is = ii_thr * {tm} + a_reg_i and + ks = ki_outer * 8 + dot + }}""") + + b_reg_tile = nisl.make_map(f"""{{ + [ks, js] -> [b_reg_j, ki_outer, dot, ji_thr] : + ks = ki_outer * 8 + dot and + js = ji_thr * {tn} + b_reg_j + }}""") + + knl = compute( + knl, + "a_smem_", + compute_map=a_reg_tile, + storage_indices=["a_reg_i"], + placement_inames=["ji_thr", "io", "jo", "ko"], + temporary_name="a_reg", + temporary_address_space=lp.AddressSpace.PRIVATE, + compute_insn_id="a_reg_load" + ) + + knl = compute( + knl, + "b_smem_", + compute_map=b_reg_tile, + storage_indices=["b_reg_j"], + placement_inames=["ii_thr", "io", "jo", "ko"], + temporary_name="b_reg", + temporary_address_space=lp.AddressSpace.PRIVATE, + compute_insn_id="b_reg_load" + ) + + # }}} + + iname_tags = { + # global tiles + "io": "g.1", + "jo": "g.0", + + # a local storage axes + "a_local": "l.1", + "a_ki": "l.0", + + # b local storage axes + "b_local": "l.0", + "b_ki": "l.1", + + # register tiles + "ii_thr": "l.1", + "ji_thr": "l.0", + + # register storage axes + "a_reg_i": "ilp", + "b_reg_j": "ilp", + + # compute axes + "ii_reg": "ilp", + "ji_reg": "ilp" + } + + return lp.tag_inames(knl, iname_tags) + + +def main( + m: int = 1024, + n: int = 1024, + k: int = 1024, + bm: int = 64, + bn: int = 64, + bk: int = 32, + tm: int = 4, + tn: int = 4, + shared_memory_tiled: bool = False, + register_tiled: bool = False, + dtype: lp.ToLoopyTypeConvertible = np.float32, + print_kernel: bool = False, + print_device_code: bool = False + ) -> None: + + knl = lp.make_kernel( + "{ [i, j, k] : 0 <= i < M and 0 <= j < N and 0 <= k < K }", + """ + a_(is, ks) := a[is, ks] + b_(ks, js) := b[ks, js] + + c[i, j] = sum([k], a_(i, k) * b_(k, j)) + """, + [ + lp.GlobalArg("a", shape=(m, k), dtype=dtype), + lp.GlobalArg("b", shape=(k, n), dtype=dtype), + lp.GlobalArg("c", shape=(m, n), is_output=True) + ] + ) + + knl = lp.fix_parameters(knl, M=m, N=n, K=k) + + if shared_memory_tiled: + knl = shared_memory_tiled_matmul(knl, bm, bn, bk) + elif register_tiled: + knl = register_tiled_matmul(knl, bm, bn, bk, tm, tn) + else: + knl = naive_matmul(knl, bm, bn, bk) + + ctx = cl.create_some_context() + queue = cl.CommandQueue( + ctx, + properties=cl.command_queue_properties.PROFILING_ENABLE + ) + + rng = np.random.default_rng() + a = rng.standard_normal((m, k)).astype(dtype) + b = rng.standard_normal((k, n)).astype(dtype) + + benchmark_kernel(knl, queue, a, b) + + if print_kernel: + print(knl) + + if print_device_code: + print(lp.generate_code_v2(knl).device_code()) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + + _ = parser.add_argument("--m", action="store", type=int, default=1024) + _ = parser.add_argument("--n", action="store", type=int, default=1024) + _ = parser.add_argument("--k", action="store", type=int, default=1024) + + _ = parser.add_argument("--bm", action="store", type=int, default=64) + _ = parser.add_argument("--bn", action="store", type=int, default=64) + _ = parser.add_argument("--bk", action="store", type=int, default=16) + + _ = parser.add_argument("--tm", action="store", type=int, default=4) + _ = parser.add_argument("--tn", action="store", type=int, default=4) + + _ = parser.add_argument("--shared-memory-tiled", action="store_true") + _ = parser.add_argument("--register-tiled", action="store_true") + + _ = parser.add_argument("--print-kernel", action="store_true") + _ = parser.add_argument("--print-device-code", action="store_true") + + args = parser.parse_args() + + main( + m=args.m, n=args.n, k=args.k, + bm=args.bm, bn=args.bn, bk=args.bk, + tm=args.tm, tn=args.tn, + shared_memory_tiled=args.shared_memory_tiled, + register_tiled=args.register_tiled, + print_kernel=args.print_kernel, + print_device_code=args.print_device_code + ) diff --git a/examples/python/compute-examples/p2m-basis-compute.py b/examples/python/compute-examples/p2m-basis-compute.py new file mode 100644 index 000000000..ec9ae5ce3 --- /dev/null +++ b/examples/python/compute-examples/p2m-basis-compute.py @@ -0,0 +1,370 @@ +"""Benchmark a 2D Cartesian Taylor P2M microkernel with Loopy compute. + +FMM kernel class: kernel-independent Cartesian Taylor/asymptotic multipole +moment formation. This script does not evaluate a particular Green's function +such as the 2D Laplace or Helmholtz kernel. It builds the source monomial +moments that a Taylor FMM would later pair with kernel derivatives or translated +coefficients. In a Laplace FMM, the Laplace-specific derivative recurrence or +compressed representation lives outside this benchmark. + +The kernel forms tensor-product source moments from particle strengths: + + beta[q0, q1] = sum_{isrc} strength[isrc] + * x[isrc]**q0 / q0! + * y[isrc]**q1 / q1! + +The inline variant is a GPU-parallel reduction over sources for every output +coefficient. The compute variant splits the source and q1 loops and uses +:func:`loopy.transform.compute.compute` to precompute reusable x and y monomial +basis values in private temporaries. This tests whether compute can expose +source-tile and coefficient-tile reuse in a reduction-heavy P2M-like kernel. + +Use ``--compare`` to run both GPU-parallel variants, compare with the NumPy +reference result, and print timing, modeled GFLOP/s, speedup, and relative +error. +""" + +import os +import time + + +os.environ.setdefault("XDG_CACHE_HOME", "/tmp") + +import namedisl as nisl +import numpy as np +import numpy.linalg as la + +import loopy as lp +from loopy.transform.compute import compute +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 + + +def inv_factorials(order: int, dtype) -> np.ndarray: + result = np.empty(order + 1, dtype=dtype) + result[0] = 1 + for i in range(1, order + 1): + result[i] = result[i - 1] / i + return result + + +def reference_p2m( + strength: np.ndarray, + x: np.ndarray, + y: np.ndarray, + inv_fact: np.ndarray + ) -> np.ndarray: + order = inv_fact.size - 1 + beta = np.empty((order + 1, order + 1), dtype=x.dtype) + + for q0 in range(order + 1): + for q1 in range(order + 1): + acc = 0 + for isrc in range(x.size): + x_monom = x[isrc]**q0 * inv_fact[q0] + y_monom = y[isrc]**q1 * inv_fact[q1] + acc += strength[isrc] * x_monom * y_monom + beta[q0, q1] = acc + + return beta + + +def make_kernel( + nsources: int, + order: int, + q1_tile_size: int, + source_tile_size: int, + dtype, + use_compute: bool = False + ) -> lp.TranslationUnit: + if (order + 1) % q1_tile_size: + raise ValueError("order + 1 must be divisible by q1_tile_size") + if nsources % source_tile_size: + raise ValueError("nsources must be divisible by source_tile_size") + + knl = lp.make_kernel( + "{ [isrc, q0, q1] : 0 <= isrc < nsources and 0 <= q0, q1 <= p }", + """ + x_monom_(isrc_arg, q0_arg) := ( + x[isrc_arg] ** q0_arg * inv_fact[q0_arg] + ) + + y_monom_(isrc_arg, q1_arg) := ( + y[isrc_arg] ** q1_arg * inv_fact[q1_arg] + ) + + beta[q0, q1] = sum( + [isrc], + strength[isrc] * x_monom_(isrc, q0) * y_monom_(isrc, q1) + ) + """, + [ + lp.GlobalArg("x", dtype=dtype, shape=(nsources,)), + lp.GlobalArg("y", dtype=dtype, shape=(nsources,)), + lp.GlobalArg("strength", dtype=dtype, shape=(nsources,)), + lp.GlobalArg("inv_fact", dtype=dtype, shape=(order + 1,)), + lp.GlobalArg("beta", dtype=dtype, shape=(order + 1, order + 1), + is_output=True), + ], + lang_version=LOOPY_USE_LANGUAGE_VERSION_2018_2, + ) + + knl = lp.fix_parameters(knl, nsources=nsources, p=order) + knl = lp.split_iname( + knl, + "q1", + q1_tile_size, + inner_iname="q1_inner", + outer_iname="q1_outer", + ) + knl = lp.split_iname( + knl, + "isrc", + source_tile_size, + inner_iname="isrc_inner", + outer_iname="isrc_outer", + ) + + if use_compute: + x_monom_map = nisl.make_map(f"""{{ + [isrc_arg, q0_arg] -> [q0, isrc_outer, isrc_s] : + isrc_arg = isrc_outer * {source_tile_size} + isrc_s and + q0_arg = q0 + }}""") + + knl = compute( + knl, + "x_monom_", + compute_map=x_monom_map, + storage_indices=["isrc_s"], + placement_inames=["q1_outer"], + temporary_name="x_monom_for_q1_tile", + temporary_address_space=lp.AddressSpace.PRIVATE, + temporary_dtype=dtype, + compute_insn_id="x_monom_compute", + ) + + y_monom_map = nisl.make_map(f"""{{ + [isrc_arg, q1_arg] -> [q1_outer, q1_inner, isrc_outer, isrc_s] : + isrc_arg = isrc_outer * {source_tile_size} + isrc_s and + q1_arg = q1_outer * {q1_tile_size} + q1_inner + }}""") + + knl = compute( + knl, + "y_monom_", + compute_map=y_monom_map, + storage_indices=["isrc_s"], + placement_inames=["q0"], + temporary_name="y_monom_for_coeff", + temporary_address_space=lp.AddressSpace.PRIVATE, + temporary_dtype=dtype, + compute_insn_id="y_monom_compute", + ) + + return lp.tag_inames(knl, { + "q0": "g.1", + "q1_outer": "g.0", + "q1_inner": "ilp", + }) + + +def operation_model( + nsources: int, + order: int, + q1_tile_size: int + ) -> tuple[int, int]: + ncoeff = order + 1 + inline_monomial_evals = 2 * nsources * ncoeff**2 + compute_monomial_evals = ( + nsources * ncoeff**2 + + nsources * ncoeff**2 // q1_tile_size + ) + return inline_monomial_evals, compute_monomial_evals + + +def p2m_flop_count( + nsources: int, + order: int, + q1_tile_size: int, + use_compute: bool + ) -> int: + ncoeff = order + 1 + + contraction_flops = 3 * nsources * ncoeff**2 + if use_compute: + monomial_scale_flops = ( + nsources * ncoeff**2 + + nsources * ncoeff**2 // q1_tile_size + ) + else: + monomial_scale_flops = 2 * nsources * ncoeff**2 + + return contraction_flops + monomial_scale_flops + + +def benchmark_executor(ex, queue, args, warmup: int, iterations: int) -> float: + evt = None + for _ in range(warmup): + evt, _ = ex(queue, **args) + if evt is not None: + evt.wait() + + start = time.perf_counter() + for _ in range(iterations): + evt, _ = ex(queue, **args) + if evt is not None: + evt.wait() + end = time.perf_counter() + + return (end - start) / iterations + + +def run_kernel( + knl: lp.TranslationUnit, + x: np.ndarray, + y: np.ndarray, + strength: np.ndarray, + inv_fact: np.ndarray, + warmup: int, + iterations: int + ) -> tuple[np.ndarray, float]: + import pyopencl as cl + import pyopencl.array as cl_array + + ctx = cl.create_some_context(interactive=False) + queue = cl.CommandQueue(ctx) + ex = knl.executor(queue) + + x_cl = cl_array.to_device(queue, x) + y_cl = cl_array.to_device(queue, y) + strength_cl = cl_array.to_device(queue, strength) + inv_fact_cl = cl_array.to_device(queue, inv_fact) + beta_cl = cl_array.zeros(queue, (inv_fact.size, inv_fact.size), + dtype=x.dtype) + + elapsed = benchmark_executor( + ex, + queue, + { + "x": x_cl, + "y": y_cl, + "strength": strength_cl, + "inv_fact": inv_fact_cl, + "beta": beta_cl, + }, + warmup=warmup, + iterations=iterations, + ) + + _, out = ex( + queue, x=x_cl, y=y_cl, strength=strength_cl, + inv_fact=inv_fact_cl, beta=beta_cl) + return out[0].get(), elapsed + + +def main( + nsources: int = 256, + order: int = 12, + q1_tile_size: int = 13, + source_tile_size: int = 128, + use_compute: bool = False, + compare: bool = False, + print_kernel: bool = False, + print_device_code: bool = False, + run: bool = False, + warmup: int = 3, + iterations: int = 10 + ) -> None: + dtype = np.float64 + rng = np.random.default_rng(18) + x = rng.uniform(-0.25, 0.25, size=nsources).astype(dtype) + y = rng.uniform(-0.25, 0.25, size=nsources).astype(dtype) + strength = rng.normal(size=nsources).astype(dtype) + inv_fact = inv_factorials(order, dtype) + reference = reference_p2m(strength, x, y, inv_fact) + + inline_evals, compute_evals = operation_model( + nsources, order, q1_tile_size) + + variants = [False, True] if compare else [use_compute] + timings: dict[bool, float] = {} + for variant_uses_compute in variants: + knl = make_kernel( + nsources, order, q1_tile_size, source_tile_size, dtype, + use_compute=variant_uses_compute) + modeled_flops = p2m_flop_count( + nsources, order, q1_tile_size, + use_compute=variant_uses_compute) + + print(20 * "=", "P2M basis report", 20 * "=") + print(f"Variant: {'compute' if variant_uses_compute else 'inline'}") + print(f"Sources: {nsources}") + print(f"Order : {order}") + print(f"q1 tile: {q1_tile_size}") + print(f"Source tile: {source_tile_size}") + print(f"Inline monomial evaluations: {inline_evals}") + print(f"Compute monomial evaluations: {compute_evals}") + print(f"Modeled flop count: {modeled_flops}") + + if print_kernel: + print(knl) + + if print_device_code: + print(lp.generate_code_v2(knl).device_code()) + + if run or compare: + try: + result, elapsed = run_kernel( + knl, x, y, strength, inv_fact, + warmup=warmup, iterations=iterations) + except Exception as exc: + print(f"Runtime execution unavailable: {exc}") + else: + rel_err = la.norm(result - reference) / la.norm(reference) + gflops = modeled_flops / elapsed * 1e-9 + timings[variant_uses_compute] = elapsed + print(f"Average time per iteration: {elapsed:.6e} s") + print(f"Modeled throughput: {gflops:.3f} GFLOP/s") + print(f"Relative error: {rel_err:.3e}") + + print((40 + len(" P2M basis report ")) * "=") + + if compare and False in timings and True in timings: + speedup = timings[False] / timings[True] + time_reduction = (1 - timings[True] / timings[False]) * 100 + print(f"Speedup: {speedup:.3f}x") + print(f"Relative time reduction: {time_reduction:.2f}%") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + _ = parser.add_argument("--nsources", action="store", type=int, default=256) + _ = parser.add_argument("--order", action="store", type=int, default=12) + _ = parser.add_argument("--q1-tile-size", action="store", type=int, default=13) + _ = parser.add_argument("--source-tile-size", action="store", + type=int, default=128) + _ = parser.add_argument("--compute", action="store_true") + _ = parser.add_argument("--compare", action="store_true") + _ = parser.add_argument("--run-kernel", action="store_true") + _ = parser.add_argument("--print-kernel", action="store_true") + _ = parser.add_argument("--print-device-code", action="store_true") + _ = parser.add_argument("--warmup", action="store", type=int, default=3) + _ = parser.add_argument("--iterations", action="store", type=int, default=10) + + args = parser.parse_args() + + main( + nsources=args.nsources, + order=args.order, + q1_tile_size=args.q1_tile_size, + source_tile_size=args.source_tile_size, + use_compute=args.compute, + compare=args.compare, + print_kernel=args.print_kernel, + print_device_code=args.print_device_code, + run=args.run_kernel, + warmup=args.warmup, + iterations=args.iterations, + ) diff --git a/examples/python/compute-examples/run-compute-examples.sh b/examples/python/compute-examples/run-compute-examples.sh new file mode 100755 index 000000000..096a6d6f3 --- /dev/null +++ b/examples/python/compute-examples/run-compute-examples.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash +set -euo pipefail + +PYTHON="$(which python)" +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" + +cd "$SCRIPT_DIR" + +run_example() { + echo + echo "===== $* =====" + "$PYTHON" "$@" +} + +run_example finite-difference-2-5D.py \ + --npoints 96 \ + --stencil-width 9 \ + --compute \ + --run-kernel \ + --warmup 2 \ + --iterations 3 + +run_example wave-equation-ring-buffer.py \ + --ntime 4096 \ + --compute \ + --run-kernel \ + --warmup 2 \ + --iterations 5 + +run_example matmul.py \ + --m 512 \ + --n 512 \ + --k 512 \ + --bm 32 \ + --bn 32 \ + --bk 16 \ + --shared-memory-tiled + +run_example matmul.py \ + --m 512 \ + --n 512 \ + --k 512 \ + --bm 64 \ + --bn 64 \ + --bk 16 \ + --tm 4 \ + --tn 4 \ + --register-tiled + +run_example l2p-tiled-basis-compute.py \ + --ntargets 512 \ + --order 12 \ + --target-block-size 64 \ + --compare \ + --run-kernel \ + --warmup 2 \ + --iterations 3 + +run_example p2m-basis-compute.py \ + --nsources 512 \ + --order 12 \ + --q1-tile-size 13 \ + --source-tile-size 128 \ + --compare \ + --run-kernel \ + --warmup 2 \ + --iterations 3 + +run_example m2m-sum-factorization.py \ + --order 23 \ + --eta-tile-size 8 \ + --dimension 3 \ + --compare \ + --run-kernel \ + --warmup 2 \ + --iterations 3 + +run_example l2p-3d-tensor-product-compute.py \ + --ntargets 512 \ + --order 8 \ + --target-block-size 64 \ + --compare \ + --run-kernel \ + --warmup 2 \ + --iterations 3 diff --git a/examples/python/compute-examples/wave-equation-ring-buffer.py b/examples/python/compute-examples/wave-equation-ring-buffer.py new file mode 100644 index 000000000..039f38a6e --- /dev/null +++ b/examples/python/compute-examples/wave-equation-ring-buffer.py @@ -0,0 +1,210 @@ +import time + +import namedisl as nisl +import numpy as np +import numpy.linalg as la + +import pyopencl as cl + +import loopy as lp +from loopy.transform.compute import compute +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 + + +def benchmark_executor(ex, queue, args, warmup: int, iterations: int) -> float: + if iterations <= 0: + raise ValueError("iterations must be positive") + + evt = None + for _ in range(warmup): + evt, _ = ex(queue, **args) + if evt is not None: + evt.wait() + + start = time.perf_counter() + for _ in range(iterations): + evt, _ = ex(queue, **args) + if evt is not None: + evt.wait() + end = time.perf_counter() + + return (end - start) / iterations + + +def wave_flop_count(ntime: int) -> int: + return 5 * (ntime - 2) + + +def main( + ntime: int = 128, + use_compute: bool = False, + print_device_code: bool = False, + print_kernel: bool = False, + run_kernel: bool = False, + warmup: int = 3, + iterations: int = 10 + ) -> float | None: + dtype = np.float64 + + dt = dtype(1 / 512) + omega = dtype(2 * np.pi) + omega2 = dtype(omega**2) + + t = dt * np.arange(ntime, dtype=dtype) + u = np.cos(omega * t).astype(dtype) + + bt = 32 + + knl = lp.make_kernel( + "{ [t] : 1 <= t < ntime - 1 }", + """ + u_hist(ts) := u[ts] + + u_next[t + 1] = ( + 2 * u_hist(t) + - u_hist(t - 1) + - dt2 * omega2 * u_hist(t) + ) + """, + [ + lp.GlobalArg("u", dtype=dtype, shape=(ntime,)), + lp.GlobalArg("u_next", dtype=dtype, shape=(ntime,), + is_output=True), + lp.ValueArg("dt2", dtype=dtype), + lp.ValueArg("omega2", dtype=dtype), + ], + lang_version=LOOPY_USE_LANGUAGE_VERSION_2018_2, + ) + + knl = lp.fix_parameters(knl, ntime=ntime) + knl = lp.split_iname(knl, "t", bt, inner_iname="ti", outer_iname="to") + + if use_compute: + ring_buffer_map = nisl.make_map(f"""{{ + [ts] -> [to, ti, tb] : + tb = ts - (to * {bt} + ti) + 1 + }}""") + + knl = compute( + knl, + "u_hist", + compute_map=ring_buffer_map, + storage_indices=["tb"], + + temporary_name="u_time_buf", + temporary_address_space=lp.AddressSpace.PRIVATE, + temporary_dtype=dtype, + + compute_insn_id="u_time_buf_compute", + inames_to_advance=["ti"], + ) + + knl = lp.tag_inames(knl, {"tb": "unr"}) + + knl = lp.tag_inames(knl, {"to": "g.0"}) + + if print_device_code: + print(lp.generate_code_v2(knl).device_code()) + + if print_kernel: + print(knl) + + if not run_kernel: + return None + + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + ex = knl.executor(queue) + + dt2 = dtype(dt**2) + avg_time_per_iter = benchmark_executor( + ex, queue, {"u": u, "dt2": dt2, "omega2": omega2}, + warmup=warmup, iterations=iterations) + avg_gflops = wave_flop_count(ntime) / avg_time_per_iter / 1e9 + + _, out = ex(queue, u=u, dt2=dt2, omega2=omega2) + + ref = np.zeros_like(u) + for time_idx in range(1, ntime - 1): + ref[time_idx + 1] = ( + 2 * u[time_idx] + - u[time_idx - 1] + - dt2 * omega2 * u[time_idx] + ) + + sl = slice(2, ntime) + rel_err = la.norm(ref[sl] - out[0][sl]) / la.norm(ref[sl]) + + print(20 * "=", "Wave recurrence report", 20 * "=") + print(f"Variant : {'compute' if use_compute else 'baseline'}") + print(f"Time steps : {ntime}") + print(f"Iterations : warmup = {warmup}, timed = {iterations}") + print(f"Average time per iteration: {avg_time_per_iter:.6e} s") + print(f"Average throughput: {avg_gflops:.3f} GFLOP/s") + print(f"Relative error: {rel_err:.3e}") + print((40 + len(" Wave recurrence report ")) * "=") + + return avg_time_per_iter + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + + _ = parser.add_argument("--ntime", action="store", type=int, default=128) + + _ = parser.add_argument("--compare", action="store_true") + _ = parser.add_argument("--compute", action="store_true") + _ = parser.add_argument("--run-kernel", action="store_true") + _ = parser.add_argument("--no-run-kernel", action="store_false", + dest="run_kernel") + _ = parser.add_argument("--print-device-code", action="store_true") + _ = parser.add_argument("--print-kernel", action="store_true") + _ = parser.add_argument("--warmup", action="store", type=int, default=3) + _ = parser.add_argument("--iterations", action="store", type=int, default=10) + + args = parser.parse_args() + + if args.compare: + print("Running example without compute...") + no_compute_time = main( + ntime=args.ntime, + use_compute=False, + print_device_code=args.print_device_code, + print_kernel=args.print_kernel, + run_kernel=True, + warmup=args.warmup, + iterations=args.iterations, + ) + print(50 * "=", "\n") + + print("Running example with compute...") + compute_time = main( + ntime=args.ntime, + use_compute=True, + print_device_code=args.print_device_code, + print_kernel=args.print_kernel, + run_kernel=True, + warmup=args.warmup, + iterations=args.iterations, + ) + print(50 * "=", "\n") + + assert no_compute_time is not None + assert compute_time is not None + speedup = no_compute_time / compute_time + print(f"Speedup: {speedup:.3f}x") + time_reduction = (1 - compute_time / no_compute_time) * 100 + print(f"Relative time reduction: {time_reduction:.2f}%") + else: + _ = main( + ntime=args.ntime, + use_compute=args.compute, + print_device_code=args.print_device_code, + print_kernel=args.print_kernel, + run_kernel=args.run_kernel, + warmup=args.warmup, + iterations=args.iterations, + ) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 7bea68a24..a6aa60f22 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -148,6 +148,28 @@ class HappensAfter: variable_name: str | None instances_rel: isl.Map | None + def __setstate__(self, state: Mapping[str, Any] | tuple[Any, ...]) -> None: + if isinstance(state, abc_Mapping): + variable_name = state.get("variable_name") + instances_rel = state.get("instances_rel") + else: + variable_name = state[0] if len(state) == 2 else None + instances_rel = state[-1] if state else None + + object.__setattr__(self, "variable_name", variable_name) + object.__setattr__(self, "instances_rel", instances_rel) + + +def _normalize_happens_after(ha: HappensAfter | isl.Map | None) -> HappensAfter: + if isinstance(ha, HappensAfter): + if not hasattr(ha, "variable_name"): + object.__setattr__(ha, "variable_name", None) + if not hasattr(ha, "instances_rel"): + object.__setattr__(ha, "instances_rel", None) + return ha + + return HappensAfter(variable_name=None, instances_rel=ha) + # }}} @@ -363,7 +385,10 @@ def __init__(self, instances_rel=None) for after_id in happens_after}) elif isinstance(happens_after, dict): - happens_after = constantdict(happens_after) + happens_after = constantdict({ + after_id: _normalize_happens_after(ha) + for after_id, ha in happens_after.items() + }) else: raise TypeError("'happens_after' has unexpected type: " f"{type(happens_after)}") @@ -596,7 +621,7 @@ def __setstate__(self, val): if self.id is not None: # pylint:disable=access-member-before-definition self.id = intern(self.id) self.happens_after = constantdict({ - intern(after_id): ha + intern(after_id): _normalize_happens_after(ha) for after_id, ha in self.happens_after.items()}) self.groups = intern_frozenset_of_ids(self.groups) self.conflicts_with_groups = ( diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 4a67d69f5..4475f1664 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1248,7 +1248,7 @@ class ExpansionState: """ kernel: LoopKernel - instruction: InstructionBase + instruction: InstructionBase | None stack: tuple[ConcreteMatchable, ...] """A tuple representing the current expansion stack, as a tuple of ``(name, tag)`` pairs. @@ -1564,7 +1564,7 @@ def map_subst_rule( def __call__(self, # pyright: ignore[reportIncompatibleMethodOverride] expr: Expression, kernel: LoopKernel, - insn: InstructionBase, + insn: InstructionBase | None, *args: P.args, **kwargs: P.kwargs) -> Expression: """ :arg insn: A :class:`~loopy.kernel.InstructionBase` of which *expr* is @@ -2081,8 +2081,8 @@ def aff_to_expr(aff: isl.Aff) -> ArithmeticExpression: from pymbolic import var denom = aff.get_denominator_val().to_python() - result = (aff.get_constant_val()*denom).to_python() + for dt in [isl.dim_type.in_, isl.dim_type.param]: for i in range(aff.dim(dt)): coeff = (aff.get_coefficient_val(dt, i)*denom).to_python() @@ -2548,6 +2548,26 @@ def constraint_to_cond_expr(cns: isl.Constraint) -> ArithmeticExpression: # }}} +# {{{ MultiPwAff from sequence of pymbolic exprs + +def multi_pw_aff_from_exprs( + exprs: Sequence[Expression], + space: isl.Space + ) -> isl.MultiPwAff: + + mpwaff = isl.MultiPwAff.zero(space) + for i in range(len(exprs)): + local_space = mpwaff.get_at(i).get_space().domain() + mpwaff = mpwaff.set_pw_aff( + i, + pwaff_from_expr(local_space, exprs[i]) + ) + + return mpwaff + +# }}} + + # {{{ isl_set_from_expr class ConditionExpressionToBooleanOpsExpression(IdentityMapper[[]]): diff --git a/loopy/transform/compute.py b/loopy/transform/compute.py new file mode 100644 index 000000000..26a5ff3d9 --- /dev/null +++ b/loopy/transform/compute.py @@ -0,0 +1,877 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, TypeAlias + +import namedisl as nisl +from typing_extensions import override + +import islpy as isl +import pymbolic.primitives as p +from pymbolic import var +from pymbolic.mapper.substitutor import make_subst_func +from pytools.tag import Tag + +import loopy as lp +from loopy.kernel.tools import ( + DomainChanger, + find_most_recent_global_barrier, + kernel_has_global_barriers, +) +from loopy.match import StackMatch, parse_stack_match +from loopy.symbolic import ( + DependencyMapper, + ExpansionState, + RuleAwareIdentityMapper, + RuleAwareSubstitutionMapper, + SubstitutionRuleExpander, + SubstitutionRuleMappingContext, + multi_pw_aff_from_exprs, + pw_aff_to_expr, +) +from loopy.transform.precompute import contains_a_subst_rule_invocation +from loopy.translation_unit import for_each_kernel +from loopy.types import ToLoopyTypeConvertible, to_loopy_type + + +if TYPE_CHECKING: + from collections.abc import Mapping, Sequence, Set as AbstractSet + + from pymbolic.typing import Expression + + from loopy.kernel import LoopKernel + from loopy.kernel.data import AddressSpace + + +AccessTuple: TypeAlias = tuple[str, ...] + + +def _access_key(args: Sequence[Expression]) -> AccessTuple: + return tuple(str(arg) for arg in args) + + +def _base_name(name: str) -> str: + return name.removesuffix("_") + + +def _cur_name(name: str) -> str: + return f"{_base_name(name)}_cur" + + +def _prev_name(name: str) -> str: + return f"{_base_name(name)}_prev" + + +def _basic_set_to_predicates(bset: nisl.BasicSet) -> frozenset[Expression]: + isl_bset = bset.get_isl_object() + + predicates: Sequence[Expression] = [] + for constraint in isl_bset.get_constraints(): + expr = pw_aff_to_expr(constraint.get_aff()) + if constraint.is_equality(): + predicates.append(p.Comparison(expr, "==", 0)) + else: + predicates.append(p.Comparison(expr, ">=", 0)) + + return frozenset(predicates) + + +def _set_to_predicate_options( + set_: nisl.Set | nisl.BasicSet, +) -> Sequence[frozenset[Expression]]: + if isinstance(set_, nisl.BasicSet): + if set_.get_isl_object().is_empty(): + return [] + return [_basic_set_to_predicates(set_)] + + predicate_options: Sequence[frozenset[Expression]] = [] + for bset in set_.get_basic_sets(): + if not bset.get_isl_object().is_empty(): + predicate_options.append(_basic_set_to_predicates(bset)) + + return predicate_options + + +# helper for gathering names of variables in pymbolic expressions +def _gather_vars(expr: Expression) -> set[str]: + deps = DependencyMapper()(expr) + var_names: Sequence[str] = [] + for dep in deps: + if isinstance(dep, p.Variable): + var_names.append(dep.name) + elif isinstance(dep, p.Subscript) and isinstance(dep.aggregate, p.Variable): + var_names.append(dep.aggregate.name) + + return set(var_names) + + +def _existing_name_mapping( + map_: nisl.Map | nisl.BasicMap, name_mapping: Mapping[str, str] +) -> Mapping[str, str]: + names = map_.names + return { + source: target + for source, target in name_mapping.items() + if source in names and target in names + } + + +def _normalize_renamed_dims( + map_: nisl.Map | nisl.BasicMap, + name_mapping: Mapping[str, str], +) -> nisl.Map | nisl.BasicMap: + map_ = map_.equate_dims(_existing_name_mapping(map_, name_mapping)) + + names = map_.names + project_names = [ + renamed_name + for original_name, renamed_name in name_mapping.items() + if original_name in names and renamed_name in names + ] + map_ = map_.project_out(project_names) + + names = map_.names + rename_mapping = { + renamed_name: original_name + for original_name, renamed_name in name_mapping.items() + if original_name not in names and renamed_name in names + } + return map_.rename_dims(rename_mapping) + + +def _add_placement_inames_to_compute_map( + compute_map: nisl.Map, + storage_indices: Sequence[str], + placement_inames: Sequence[str], +) -> nisl.Map: + storage_set = frozenset(storage_indices) + placement_set = frozenset(placement_inames) + + if storage_set - compute_map.output_names: + raise ValueError( + "storage indices must appear in compute_map range: " + + ", ".join(sorted(storage_set - compute_map.output_names)) + ) + + if placement_set & storage_set: + raise ValueError( + "placement inames must not be storage indices: " + + ", ".join(sorted(placement_set & storage_set)) + ) + + parameter_context = placement_set & compute_map.parameter_names + if parameter_context: + raise ValueError( + "inames that appear in compute_map constraints must appear in ", + "the compute_map range, not placement_inames: " + + ", ".join(sorted(parameter_context)), + ) + + unsupported_context = placement_set & compute_map.input_names + if unsupported_context: + raise ValueError( + "placement inames must not appear in compute_map domain: " + + ", ".join(sorted(unsupported_context)) + ) + + context_to_add = [ + name for name in placement_inames if name not in compute_map.output_names + ] + return compute_map.add_output_names(context_to_add) + + +# {{{ gathering usage expressions + + +class UsageSiteExpressionGatherer(RuleAwareIdentityMapper[[]]): + """ + Gathers all expressions used as inputs to a particular substitution rule, + identified by name. + """ + + def __init__( + self, + rule_mapping_ctx: SubstitutionRuleMappingContext, + subst_expander: SubstitutionRuleExpander, + kernel: LoopKernel, + subst_name: str, + subst_tag: AbstractSet[Tag] | Tag | None = None, + ) -> None: + + super().__init__(rule_mapping_ctx) + + self.subst_expander: SubstitutionRuleExpander = subst_expander + self.kernel: LoopKernel = kernel + self.subst_name: str = subst_name + self.subst_tag: AbstractSet[Tag] | None = ( + {subst_tag} if isinstance(subst_tag, Tag) else subst_tag + ) + + self.usage_expressions: list[Sequence[Expression]] = [] + + @override + def map_subst_rule( + self, + name: str, + tags: AbstractSet[Tag] | None, + arguments: Sequence[Expression], + expn_state: ExpansionState, + ) -> Expression: + + if name != self.subst_name: + return super().map_subst_rule(name, tags, arguments, expn_state) + + if self.subst_tag is not None and self.subst_tag != tags: + return super().map_subst_rule(name, tags, arguments, expn_state) + + rule = self.rule_mapping_context.old_subst_rules[name] + arg_ctx = self.make_new_arg_context( + name, rule.arguments, arguments, expn_state.arg_context + ) + + self.usage_expressions.append([ + arg_ctx[arg_name] for arg_name in rule.arguments + ]) + + return 0 + + +# }}} + + +# {{{ substitution rule use replacement + + +class RuleInvocationReplacer(RuleAwareIdentityMapper[[]]): + def __init__( + self, + ctx: SubstitutionRuleMappingContext, + subst_name: str, + subst_tag: Sequence[Tag] | None, + usage_descriptors: Mapping[AccessTuple, nisl.Map | nisl.BasicMap], + storage_indices: Sequence[str], + temporary_name: str, + compute_insn_ids: str | Sequence[str], + temporary_address_space: AddressSpace, + footprint: nisl.Set, + compute_read_variables: AbstractSet[str], + ) -> None: + + super().__init__(ctx) + + self.subst_name: str = subst_name + self.subst_tag: Sequence[Tag] | None = subst_tag + + self.usage_descriptors: Mapping[AccessTuple, nisl.Map | nisl.BasicMap] = ( + usage_descriptors + ) + self.storage_indices: Sequence[str] = storage_indices + self.footprint: nisl.Set = footprint + self.compute_read_variables: frozenset[str] = frozenset(compute_read_variables) + self.compute_insn_depends_on: set[str] = set() + + self.temporary_name: str = temporary_name + self.temporary_address_space: AddressSpace = temporary_address_space + self.compute_insn_ids: frozenset[str] = ( + frozenset([compute_insn_ids]) + if isinstance(compute_insn_ids, str) + else frozenset(compute_insn_ids) + ) + + self.replaced_something: bool = False + + self.barrier_insn_depends_on: dict[str, set[str]] = {} + + @override + def map_subst_rule( + self, + name: str, + tags: AbstractSet[Tag] | None, + arguments: Sequence[Expression], + expn_state: ExpansionState, + ) -> Expression: + + rule = self.rule_mapping_context.old_subst_rules[name] + arg_ctx = self.make_new_arg_context( + name, rule.arguments, arguments, expn_state.arg_context + ) + args = [arg_ctx[arg_name] for arg_name in rule.arguments] + + # {{{ validation checks + + if name != self.subst_name: + return super().map_subst_rule(name, tags, arguments, expn_state) + + access_key = _access_key(args) + if access_key not in self.usage_descriptors: + return super().map_subst_rule(name, tags, arguments, expn_state) + + if len(arguments) != len(rule.arguments): + raise ValueError( + f"Number of arguments passed to rule {name} ", + f"does not match the signature of {name}.", + ) + + local_map = self.usage_descriptors[access_key] + temp_footprint = self.footprint.move_dims( + frozenset(self.footprint.names) - frozenset(self.storage_indices), + isl.dim_type.param, + ) + + if not local_map.range() <= temp_footprint: + return super().map_subst_rule(name, tags, arguments, expn_state) + + # }}} + + # {{{ get index expression in terms of global inames + + local_pwmaff = self.usage_descriptors[access_key].as_pw_multi_aff() + + index_exprs: Sequence[Expression] = [] + for dim in range(local_pwmaff.dim(isl.dim_type.out)): + index_exprs.append(pw_aff_to_expr(local_pwmaff.get_at(dim))) + + new_expression = var(self.temporary_name)[tuple(index_exprs)] + + # }}} + + self.replaced_something = True + + return new_expression + + @override + def map_kernel( + self, + kernel: LoopKernel, + within: StackMatch = lambda knl, insn, stack: True, + map_args: bool = True, + map_tvs: bool = True, + ) -> LoopKernel: + + new_insns: Sequence[lp.InstructionBase] = [] + replaced_insn_ids: set[str] = set() + excluded_insn_ids = set(self.compute_insn_ids) + recursive_dep_map = kernel.recursive_insn_dep_map() + for insn in kernel.instructions: + self.replaced_something = False + original_depends_on = insn.depends_on + + if isinstance( + insn, lp.MultiAssignmentBase + ) and not contains_a_subst_rule_invocation(kernel, insn): + new_insns.append(insn) + continue + + insn = insn.with_transformed_expressions( + lambda expr, insn=insn: self(expr, kernel, insn) + ) + + if self.replaced_something: + compute_read_depends_on: set[str] = set() + for dep in original_depends_on: + if dep in excluded_insn_ids: + continue + + dep_insn = kernel.id_to_insn[dep] + if ( + frozenset(dep_insn.assignee_var_names()) + & self.compute_read_variables + ): + compute_read_depends_on.update( + original_depends_on - excluded_insn_ids + ) + + use_dep_ids: frozenset[str] = self.compute_insn_ids + barrier_id = None + if kernel_has_global_barriers(kernel): + barrier_id = find_most_recent_global_barrier(kernel, insn.id) + + if barrier_id is not None: + compute_depends_on_barrier = any( + dep == barrier_id or barrier_id in recursive_dep_map[dep] + for dep in compute_read_depends_on + ) or any( + barrier_id in recursive_dep_map[compute_insn_id] + for compute_insn_id in self.compute_insn_ids + ) + + if ( + self.temporary_address_space == lp.AddressSpace.GLOBAL + and not compute_depends_on_barrier + ): + self.barrier_insn_depends_on.setdefault( + barrier_id, set() + ).update(self.compute_insn_ids) + use_dep_ids = frozenset([barrier_id]) + else: + self.compute_insn_depends_on.add(barrier_id) + + insn = insn.copy(depends_on=(insn.depends_on | use_dep_ids)) + replaced_insn_ids.add(insn.id) + + self.compute_insn_depends_on.update(compute_read_depends_on) + + new_insns.append(insn) + + self.compute_insn_depends_on -= replaced_insn_ids + + return kernel.copy(instructions=new_insns) + + +# }}} + + +def _build_reuse_updates( + compute_map: nisl.Map, + footprint: nisl.Set | nisl.BasicSet, + named_domain: nisl.Set | nisl.BasicSet, + storage_indices: Sequence[str], + context_set: frozenset[str], + inames_to_advance: Sequence[str] | None, + temporary_name: str, + compute_insn_id: str, +) -> tuple[ + list[lp.InstructionBase], + list[str], + Sequence[frozenset[Expression] | None], + frozenset[str], +]: + update_insns: list[lp.InstructionBase] = [] + update_insn_ids: list[str] = [] + refill_predicate_options: Sequence[frozenset[Expression] | None] = [None] + current_update_deps: frozenset[str] = frozenset() + + if inames_to_advance is None: + return ( + update_insns, + update_insn_ids, + refill_predicate_options, + current_update_deps, + ) + + advancing_set = frozenset(inames_to_advance) + + compute_map_cur = compute_map.rename_dims({ + name: _cur_name(name) for name in compute_map.output_names + }) + compute_map_prev = compute_map.rename_dims({ + name: _prev_name(name) for name in compute_map.output_names + }) + + reuse_map = compute_map_prev.reverse().apply_range(compute_map_cur) + reuse_map = reuse_map.add_constraint([ + ( + f"{name}_cur = {name}_prev + 1" + if name in advancing_set + else f"{name}_cur = {name}_prev" + ) + for name in context_set + ]) + + current_footprint = footprint.rename_dims({ + name: _cur_name(name) for name in footprint.names + }) + previous_footprint = footprint.rename_dims({ + name: _prev_name(name) for name in footprint.names + }) + + reuse_map = reuse_map.intersect_domain(previous_footprint) + reuse_map = reuse_map.intersect_range(current_footprint) + reuse_map = reuse_map - nisl.make_map( + "{ [" + + ", ".join(_prev_name(name) for name in footprint.names) + + "] -> [" + + ", ".join(_cur_name(name) for name in footprint.names) + + "] : " + + " and ".join( + f"{_cur_name(name)} = {_prev_name(name)}" for name in storage_indices + ) + + " }" + ) + + reused_current = reuse_map.range() + refill = current_footprint - reused_current + + cur_to_normal = {_cur_name(name): name for name in footprint.names} + reused_current = reused_current.rename_dims(cur_to_normal) + refill = refill.rename_dims(cur_to_normal) + + reused_context = named_domain.project_out_except(reused_current.names) + refill_context = named_domain.project_out_except(refill.names) + + reused_current = reused_current.gist(reused_context) + refill = refill.gist(refill_context) + + refill_predicate_options = _set_to_predicate_options(refill) + + storage_reuse_map = reuse_map.project_out_except( + frozenset(_prev_name(name) for name in storage_indices) + | frozenset(_cur_name(name) for name in storage_indices) + ) + storage_reuse_map = storage_reuse_map.rename_dims({ + _cur_name(name): name for name in storage_indices + }) + cur_to_prev = storage_reuse_map.reverse() + cur_to_prev_pwma = cur_to_prev.as_pw_multi_aff() + prev_expr_by_name = { + cur_to_prev_pwma.get_dim_name(isl.dim_type.out, dim): pw_aff_to_expr( + cur_to_prev_pwma.get_at(dim) + ) + for dim in range(cur_to_prev_pwma.dim(isl.dim_type.out)) + } + prev_storage_exprs = [ + prev_expr_by_name[_prev_name(name)] for name in storage_indices + ] + + shift_assignee = var(temporary_name)[tuple(var(idx) for idx in storage_indices)] + shift_expression = var(temporary_name)[tuple(prev_storage_exprs)] + + storage_set = frozenset(storage_indices) + shift_predicate_options = _set_to_predicate_options(reused_current) + for i, predicates in enumerate(shift_predicate_options): + shift_insn_id = ( + f"{compute_insn_id}_shift" + if len(shift_predicate_options) == 1 + else f"{compute_insn_id}_shift_{i}" + ) + update_insns.append( + lp.Assignment( + id=shift_insn_id, + assignee=shift_assignee, + expression=shift_expression, + within_inames=context_set | storage_set, + predicates=predicates, + depends_on=current_update_deps, + ) + ) + update_insn_ids.append(shift_insn_id) + current_update_deps = frozenset([shift_insn_id]) + + return update_insns, update_insn_ids, refill_predicate_options, current_update_deps + + +@for_each_kernel +def compute( + kernel: LoopKernel, + substitution: str, + compute_map: nisl.Map, + storage_indices: Sequence[str], + placement_inames: Sequence[str] = (), + inames_to_advance: Sequence[str] | None = None, + temporary_name: str | None = None, + temporary_address_space: AddressSpace | None = None, + temporary_dtype: ToLoopyTypeConvertible = None, + compute_insn_id: str | None = None, +) -> LoopKernel: + """ + Inserts an instruction to compute an expression given by :arg:`substitution` + and replaces all invocations of :arg:`substitution` with the result of the + inserted compute instruction. + + :arg substitution: The substitution rule for which the compute + transform should be applied. + + :arg compute_map: An :class:`isl.Map` representing a relation between + substitution rule indices and storage/context inames. Range inames that + are not storage indices are treated as context inames. + + :arg storage_indices: An ordered sequence of names of storage indices. Used + to create inames for the loops that cover the required set of compute points. + + :arg placement_inames: Context inames needed to place the compute + instruction, but not needed by the compute map constraints. + + :arg inames_to_advance: Inames that, when used to observe other contexts, + reveal a reuse relation between buffer states. + + :arg temporary_name: The name of the temporary variable. + + :arg temporary_address_space: The address space where the temporary variable + should live. + + :arg temporary_dtype: The data type of the temporary. + + :arg compute_insn_id: The ID of the inserted instruction. If appropriate, + this will also be used to determine the shift and refill instruction IDs. + """ + + if not temporary_name: + temporary_name = substitution + "_temp" + + # NOTE: we *could* allow compute to update existing temporaries, but for now + # just error if a temp with the same name already exists + if temporary_name in kernel.temporary_variables: + raise ValueError(f"A temporary named {temporary_name} already exists") + + if not compute_insn_id: + compute_insn_id = substitution + "_compute_insn" + + if temporary_address_space is None: + temporary_address_space = lp.AddressSpace.GLOBAL + + compute_map = _add_placement_inames_to_compute_map( + compute_map, storage_indices, placement_inames + ) + + parameter_inames = compute_map.parameter_names & kernel.all_inames() + if parameter_inames: + raise ValueError( + "inames that appear in compute_map constraints must appear in ", + "the compute_map range: " + ", ".join(sorted(parameter_inames)), + ) + + storage_set = frozenset(storage_indices) + context_set = compute_map.output_names - storage_set + + # interface names for composition must not match as required by namedisl + name_mapping = { + name: name + "_" + for name in compute_map.output_names + if name not in storage_indices + } + compute_map = compute_map.rename_dims(name_mapping) + + # {{{ setup and useful variables + + ctx = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator() + ) + + expander = SubstitutionRuleExpander(kernel.substitutions) + expr_gatherer = UsageSiteExpressionGatherer( + ctx, expander, kernel, substitution, None + ) + + _ = expr_gatherer.map_kernel(kernel) + usage_exprs = expr_gatherer.usage_expressions + + if len(usage_exprs) == 0: + raise ValueError(f"No usages of substitution rule {substitution} found") + + all_exprs = [expr for usage in usage_exprs for expr in usage] + used_inames: frozenset[str] = frozenset( + set.union(*(_gather_vars(expr) for expr in all_exprs)) + ) + + # }}} + + # {{{ construct necessary pieces; footprint + + # add compute inames to domain / kernel + domain_changer = DomainChanger(kernel, kernel.all_inames()) + named_domain = nisl.make_basic_set(domain_changer.domain) + + # restrict domain to used inames + local_domain = named_domain.project_out_except(used_inames) + + usage_map_space = nisl.make_map_from_domain_and_range( + local_domain, compute_map.domain() + ).get_space() + + footprint = nisl.make_set( + isl.Set.empty( + compute_map + .rename_dims({value: key for key, value in name_mapping.items()}) + .range() + .project_out_except(context_set | storage_set) + .get_space() + ) + ) + + usage_substs: Mapping[AccessTuple, nisl.Map | nisl.BasicMap] = {} + for usage in usage_exprs: + local_usage_mpwaff = multi_pw_aff_from_exprs(usage, usage_map_space) + + local_usage_map = nisl.make_map(local_usage_mpwaff.as_map()) + local_usage_map = local_usage_map.intersect_domain(local_domain) + + local_storage_map = _normalize_renamed_dims( + local_usage_map.apply_range(compute_map), name_mapping + ) + + # all usages must be accounted for in the derived storage domain + if not local_usage_map.domain() <= local_storage_map.domain(): + continue + + local_footprint = ( + local_storage_map + .move_dims(context_set, isl.dim_type.param) + .range() + .project_out_except(context_set | storage_set) + .move_dims(context_set, isl.dim_type.set) + ) + + footprint = footprint | local_footprint + + # clean up names + local_storage_map = local_storage_map.move_dims( + local_storage_map.names - ((used_inames - context_set) | storage_set), + isl.dim_type.param, + ) + + usage_substs[_access_key(usage)] = local_storage_map + + # }}} + + # {{{ compute bounds and update kernel domain + + # FIXME: convex hull is required because of loopy's current limitations + footprint = footprint.convex_hull() + + named_domain = named_domain & footprint + new_domains = domain_changer.get_domains_with(named_domain.get_isl_object()) + + kernel = kernel.copy(domains=new_domains) + + # }}} + + # {{{ create compute instruction in kernel + + # restore original names to compute map + compute_map = compute_map.rename_dims({ + value: key for key, value in name_mapping.items() + }) + + compute_pw_aff = compute_map.reverse().as_pw_multi_aff() + storage_ax_to_global_expr = { + compute_pw_aff.get_dim_name(isl.dim_type.out, dim): pw_aff_to_expr( + compute_pw_aff.get_at(dim) + ) + for dim in range(compute_pw_aff.dim(isl.dim_type.out)) + } + + expr_subst_map = RuleAwareSubstitutionMapper( + ctx, make_subst_func(storage_ax_to_global_expr), within=parse_stack_match(None) + ) + + subst_expr = kernel.substitutions[substitution].expression + compute_expression = expr_subst_map( + subst_expr, + kernel, + None, + ) + + compute_dep_ids = frozenset().union( + *( + kernel.writer_map().get(var_name, frozenset()) + for var_name in _gather_vars(compute_expression) + ) + ) + + assignee = var(temporary_name)[tuple(var(idx) for idx in storage_indices)] + + update_insns, update_insn_ids, refill_predicate_options, current_update_deps = ( + _build_reuse_updates( + compute_map, + footprint, + named_domain, + storage_indices, + context_set, + inames_to_advance, + temporary_name, + compute_insn_id, + ) + ) + + new_insns = list(kernel.instructions) + new_insns.extend(update_insns) + within_inames = compute_map.output_names + for i, predicates in enumerate(refill_predicate_options): + refill_insn_id = ( + compute_insn_id + if len(refill_predicate_options) == 1 + else f"{compute_insn_id}_refill_{i}" + ) + compute_insn = lp.Assignment( + id=refill_insn_id, + assignee=assignee, + expression=compute_expression, + within_inames=within_inames, + predicates=predicates, + depends_on=current_update_deps | compute_dep_ids, + ) + new_insns.append(compute_insn) + update_insn_ids.append(refill_insn_id) + current_update_deps = frozenset([refill_insn_id]) + + kernel = kernel.copy(instructions=new_insns) + + # }}} + + # {{{ replace invocations with new compute instruction + + ctx = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator() + ) + + replacer = RuleInvocationReplacer( + ctx, + substitution, + None, + usage_substs, + storage_indices, + temporary_name, + update_insn_ids, + temporary_address_space, + footprint, + _gather_vars(compute_expression), + ) + + kernel = replacer.map_kernel(kernel) + + if replacer.compute_insn_depends_on or replacer.barrier_insn_depends_on: + compute_insn_depends_on = frozenset(replacer.compute_insn_depends_on) + barrier_insn_depends_on = { + insn_id: frozenset(dep_ids) + for insn_id, dep_ids in replacer.barrier_insn_depends_on.items() + } + kernel = kernel.copy( + instructions=[ + insn.copy( + depends_on=( + insn.depends_on + | ( + compute_insn_depends_on + if insn.id in update_insn_ids + else frozenset() + ) + | barrier_insn_depends_on.get(insn.id, frozenset()) + ) + ) + if ( + (insn.id in update_insn_ids and compute_insn_depends_on) + or insn.id in barrier_insn_depends_on + ) + else insn + for insn in kernel.instructions + ] + ) + + # }}} + + # {{{ create temporary variable for result of compute + + loopy_type = to_loopy_type(temporary_dtype, allow_none=True) + + # bounds can be non-zero based + temp_bounds = tuple( + (pw_aff_to_expr(footprint.dim_max(dim)), pw_aff_to_expr(footprint.dim_min(dim))) + for dim in storage_indices + ) + + temp_var = lp.TemporaryVariable( + name=temporary_name, + dtype=loopy_type, + base_indices=tuple(min for _, min in temp_bounds), + shape=tuple((max - min) + 1 for (max, min) in temp_bounds), + address_space=temporary_address_space, + dim_names=tuple(storage_indices), + ) + + new_temp_vars = dict(kernel.temporary_variables) + new_temp_vars[temporary_name] = temp_var + + kernel = kernel.copy(temporary_variables=new_temp_vars) + + # }}} + + return kernel diff --git a/pyproject.toml b/pyproject.toml index 45f3e90c0..ead5c10ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,8 @@ dependencies = [ "typing-extensions>=4", "strenum>=0.4.15", + + "namedisl @ git+https://github.com/a-alveyblanc/namedisl.git@initial-functionality" ] [project.optional-dependencies] pyopencl = [ @@ -174,6 +176,8 @@ ue = "ue" ba = "ba" # Fortran Loopy floopy = "floopy" +# thread +thr = "thr" [tool.typos.files] extend-exclude = [ diff --git a/test/test_compute.py b/test/test_compute.py new file mode 100644 index 000000000..506271209 --- /dev/null +++ b/test/test_compute.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +import namedisl as nisl +import numpy as np + +import loopy as lp +from loopy.transform.compute import compute +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 + + +def test_compute_stub_simple_substitution_codegen() -> None: + knl = lp.make_kernel( + "{ [i] : 0 <= i < n }", + """ + u_(is) := u[is] + out[i] = u_(i) + """, + [ + lp.GlobalArg("u", shape=(16,), dtype=np.float32), + lp.GlobalArg("out", shape=(16,), dtype=np.float32, is_output=True), + ], + lang_version=LOOPY_USE_LANGUAGE_VERSION_2018_2, + ) + knl = lp.fix_parameters(knl, n=16) + + knl = compute( + knl, + "u_", + compute_map=nisl.make_map("{ [is] -> [i_s] : is = i_s }"), + storage_indices=["i_s"], + temporary_name="u_tmp", + temporary_address_space=lp.AddressSpace.PRIVATE, + temporary_dtype=np.float32, + ) + + code = lp.generate_code_v2(knl).device_code() + assert "float u_tmp[16]" in code + assert "u_tmp[i_s] = u[i_s]" in code + assert "out[i] = u_tmp[i]" in code diff --git a/test/test_transform.py b/test/test_transform.py index b6f87c108..4ed11e08c 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -21,6 +21,7 @@ """ import logging +from collections.abc import Mapping import numpy as np import pytest @@ -1745,6 +1746,109 @@ def test_duplicate_iname_not_read_only_nested(ctx_factory: cl.CtxFactory): lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) +@pytest.mark.parametrize("case", ( + {"M": 128, "N": 128, "K": 128, "BM": 32, "BN": 32, "BK": 16}, + {"M": 200, "N": 200, "K": 200, "BM": 32, "BN": 32, "BK": 16}, +)) +def test_compute_simple_tiled_matmul( + ctx_factory: cl.CtxFactory, + case: Mapping[str, int] + ): + + import namedisl as nisl + + m = case["M"] + n = case["N"] + k = case["K"] + bm = case["BM"] + bn = case["BN"] + bk = case["BK"] + + knl = lp.make_kernel( + "{ [i, j, k] : 0 <= i < M and 0 <= j < N and 0 <= k < K }", + """ + a_(is, ks) := a[is, ks] + b_(ks, js) := b[ks, js] + c[i, j] = sum([k], a_(i, k) * b_(k, j)) + """, + [ + lp.GlobalArg("a", shape=(m, k), dtype=np.float64), + lp.GlobalArg("b", shape=(k, n), dtype=np.float64), + lp.GlobalArg("c", shape=(m, n), dtype=np.float64, + is_output=True) + ] + ) + + knl = lp.fix_parameters(knl, M=m, N=n, K=k) + + # shared memory tile-level splitting + knl = lp.split_iname(knl, "i", bm, inner_iname="ii", outer_iname="io") + knl = lp.split_iname(knl, "j", bn, inner_iname="ji", outer_iname="jo") + knl = lp.split_iname(knl, "k", bk, inner_iname="ki", outer_iname="ko") + + compute_map_a = nisl.make_map(f"""{{ + [is, ks] -> [ii_s, io, ki_s, ko] : + is = io * {bm} + ii_s and + ks = ko * {bk} + ki_s + }}""") + + compute_map_b = nisl.make_map(f"""{{ + [ks, js] -> [ki_s, ko, ji_s, jo] : + js = jo * {bn} + ji_s and + ks = ko * {bk} + ki_s + }}""") + + from loopy.transform.compute import compute + knl = compute( + knl, + "a_", + compute_map=compute_map_a, + storage_indices=["ii_s", "ki_s"], + placement_inames=["jo"], + temporary_address_space=lp.AddressSpace.LOCAL, + temporary_dtype=np.float64 + ) + + knl = compute( + knl, + "b_", + compute_map=compute_map_b, + storage_indices=["ki_s", "ji_s"], + placement_inames=["io"], + temporary_address_space=lp.AddressSpace.LOCAL, + temporary_dtype=np.float64 + ) + + knl = lp.tag_inames( + knl, { + "io": "g.0", # outer block loop over block rows + "jo": "g.1", # outer block loop over block cols + + "ii": "l.0", # inner block loop over rows + "ji": "l.1", # inner block loop over cols + + "ii_s": "l.0", # inner storage loop over a rows + "ji_s": "l.0", # inner storage loop over b cols + "ki_s": "l.1" # inner storage loop over a cols / b rows + } + ) + + knl = lp.add_inames_for_unused_hw_axes(knl) + + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + rng = np.random.default_rng() + a = rng.standard_normal((m, k)) + b = rng.standard_normal((k, n)) + + ex = knl.executor(ctx) + _, out = ex(queue, a=a, b=b) + + import numpy.linalg as la + assert (la.norm((a @ b) - out) / la.norm(a @ b)) < 1e-15 + + if __name__ == "__main__": import sys if len(sys.argv) > 1: