import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp
# Set up an OpenCL context and a command queue to run kernels on.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Two random float32 input vectors of length n, generated via the queue.
n = 1024
a, b = (cl.clrandom.rand(queue, n, dtype=np.float32) for _ in range(2))

# Outer product: c[i, j] = a[i]*b[j] for every 0 <= i, j < n.
# NOTE(review): write_cl=True presumably makes loopy show the generated
# OpenCL code when the kernel is invoked -- confirm against loopy's options.
knl = lp.set_options(
    lp.make_kernel(
        "{[i,j]: 0<=i,j<n}",
        "c[i, j] = a[i]*b[j]",
    ),
    write_cl=True,
)

# Invoking the kernel yields an event plus a one-element tuple of outputs.
evt, (mat,) = knl(queue, a=a, b=b)
Invocation details are handled by a generated Python "wrapper":
# Variant of the kernel with write_wrapper turned on and write_cl turned off.
wknl = lp.set_options(
    knl,
    write_wrapper=True,
    write_cl=False,
)

# Run the variant on the same inputs.
evt, (mat,) = wknl(queue, a=a, b=b)
Next: transform the kernel. Example: split a loop into fixed-length "chunks".
# Split the "i" loop of the base kernel into chunks of length 4.
isplit_knl = lp.split_iname(knl, "i", 4)

# Run the transformed kernel on the same inputs.
evt, (mat,) = isplit_knl(queue, a=a, b=b)
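Want to get rid of the conditional? Tell loopy that n is a multiple of the chunk length.
Every loop axis ("iname") also comes with an implementation tag; here the inner loop is tagged "unr" (unroll).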
# Start again from the base kernel, this time declaring that n is a
# multiple of 4 before splitting "i" into chunks of that length.
isplit_knl = lp.assume(knl, "n mod 4 = 0")
isplit_knl = lp.split_iname(isplit_knl, "i", 4)

# Attach the "unr" implementation tag to the inner iname from the split.
isplit_knl = lp.tag_inames(isplit_knl, {"i_inner": "unr"})

evt, (mat,) = isplit_knl(queue, a=a, b=b)
You may also want to influence loop ordering.
"Map to GPU hardware axis" is an iname tag as well.
Use shortcuts for less typing:
# Split both loops of the base kernel into chunks of 16 and tag the
# resulting outer/inner inames in the same call ("g.*" / "l.*" tags).
split_knl = lp.split_iname(
    knl, "i", 16,
    outer_tag="g.0", inner_tag="l.0",
)
split_knl = lp.split_iname(
    split_knl, "j", 16,
    outer_tag="g.1", inner_tag="l.1",
)

evt, (mat,) = split_knl(queue, a=a, b=b)
Better! But still not much data reuse.
# Add a prefetch of "a" and then of "b" to the untransformed base kernel.
fetch1_knl = lp.add_prefetch(knl, "a")
fetch1_knl = lp.add_prefetch(fetch1_knl, "b")

evt, (mat,) = fetch1_knl(queue, a=a, b=b)
But this is useless for the GPU version. (demo)
We would like to fetch the entire "access footprint" of a loop.
# Build on the split/tagged kernel: prefetch "a" with respect to the
# "i_inner" iname and "b" with respect to the "j_inner" iname.
fetch_knl = lp.add_prefetch(split_knl, "a", ["i_inner"])
fetch_knl = lp.add_prefetch(fetch_knl, "b", ["j_inner"])

evt, (mat,) = fetch_knl(queue, a=a, b=b)
All those conditionals take time to evaluate!
# Same split-and-tag as before (chunks of 16 on "i" and "j"), but each
# split additionally passes slabs=(0,1).
# NOTE(review): slabs presumably lets loopy emit separate code for the
# boundary chunk so the bulk runs without bounds checks -- confirm in docs.
sfetch_knl = lp.split_iname(
    knl, "i", 16,
    outer_tag="g.0", inner_tag="l.0", slabs=(0,1),
)
sfetch_knl = lp.split_iname(
    sfetch_knl, "j", 16,
    outer_tag="g.1", inner_tag="l.1", slabs=(0,1),
)

# Prefetch both inputs with respect to the inner inames, as above.
sfetch_knl = lp.add_prefetch(sfetch_knl, "a", ["i_inner"])
sfetch_knl = lp.add_prefetch(sfetch_knl, "b", ["j_inner"])

evt, (mat,) = sfetch_knl(queue, a=a, b=b)