import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp
# OpenCL context plus a command queue bound to it; both are reused below.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Edge length of the square operand matrices.
n = 1024

# Two n-by-n float32 operands, each produced by its own clrandom.rand call
# on the queue (first call -> a, second call -> b).
a, b = (cl.clrandom.rand(queue, (n, n), dtype=np.float32) for _ in range(2))
Here is the simple matrix-matrix multiplication kernel again:
# Build the kernel c[i, j] = sum over k of a[i, k]*b[k, j] on the domain
# 0 <= i, j, k < n, then declare a and b as float32 and let loopy infer the
# remaining dtypes.
knl = lp.add_and_infer_dtypes(
    lp.make_kernel(
        "{[i,j,k]: 0<=i,j,k<n}",
        "c[i, j] = sum(k, a[i, k]*b[k, j])",
    ),
    {"a": np.float32, "b": np.float32},
)
Let us determine the number of arithmetic operations being carried out:
# Operation counts for the untransformed kernel. The result is not stored:
# as a bare expression it is only displayed when run interactively.
lp.get_op_poly(knl)
Each entry of the result is easy to evaluate for a given set of parameters--just
use its .eval_with_dict method:
# Look up the entry keyed by (float32, "add") in the operation counts and
# evaluate it for n = 15.
op_counts = lp.get_op_poly(knl)
poly = op_counts[np.dtype(np.float32), "add"]
poly.eval_with_dict({"n": 15})
# Memory-access counts for the untransformed kernel (presumably global
# memory, going by the function name). The result is not stored; it is only
# displayed when run interactively.
lp.get_gmem_access_poly(knl)
# Derive the transformed kernel from knl, first stating that n is a multiple
# of 16 (the same 16 used as the split length below).
opt_knl = lp.assume(knl, "n mod 16 = 0")

# Split i and j into pieces of length 16 and tag the outer/inner parts
# (g.* / l.* are loopy iname tags -- see the loopy docs for their meaning).
opt_knl = lp.split_iname(
    opt_knl, "i", 16,
    outer_tag="g.0",
    inner_tag="l.1",
)
opt_knl = lp.split_iname(
    opt_knl, "j", 16,
    outer_tag="g.1",
    inner_tag="l.0",
)

# k is split the same way but left untagged.
opt_knl = lp.split_iname(opt_knl, "k", 16)

# Prefetch transformations, intentionally disabled for the first run;
# uncomment both lines to enable them.
#opt_knl = lp.add_prefetch(opt_knl, "a", "i_inner,k_inner")
#opt_knl = lp.add_prefetch(opt_knl, "b", "j_inner,k_inner")

# Turn on the write_cl option, then run the kernel on the queue with a and b;
# the call's return value is discarded.
opt_knl = lp.set_options(opt_knl, write_cl=True)
_ = opt_knl(queue, a=a, b=b)
Now count the memory accesses in the transformed version:
# Memory-access counts for the transformed kernel opt_knl. The result is not
# stored; it is only displayed when run interactively.
lp.get_gmem_access_poly(opt_knl)
Now enable the prefetch transformation above by uncommenting the two lp.add_prefetch lines.