import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clmath
import pyopencl.clrandom
import loopy as lp
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
grid = np.linspace(0, 2*np.pi, 2048, endpoint=False)
h = grid[1] - grid[0]
u = cl.clmath.sin(cl.array.to_device(queue, grid))
We will implement computing the ODE right-hand side for Burgers' equation: $$ \frac{\partial u}{\partial t} + \frac{\partial }{\partial x} \left( \frac{u^2}2 \right) = 0, $$
Now, it is likely that the code fore the derivative is initially separate from the code for the flux function $f(u):=u^2/2$.
For simplicity, we will use central finite differences to build a kernel fin_diff_knl
to take the derivative:
fin_diff_knl = lp.make_kernel(
"{[i]: 1<=i<=n}",
"out[i] = -(f[i+1] - f[i-1])/h",
[lp.GlobalArg("out", shape="2+n"), ...])
print(fin_diff_knl)
Next, make the flux kernel as flux_knl
.
j
as the loop variable.f
and u
to be the right size.flux_knl = lp.make_kernel(
"{[j]: 1<=j<=n}",
"f[j] = u[j]**2/2",
[
lp.GlobalArg("f", shape="2+n"),
lp.GlobalArg("u", shape="2+n"),
])
print(flux_knl)
Next, fuse the two kernels together as fused_knl
, using lp.fuse_kernels([knl_a, knl_b])
:
fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl])
print(fused_knl)
Let's take a look at the generated code:
fused_knl = lp.set_options(fused_knl, write_cl=True)
evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))
Next, eliminate the separate loop to compute f
:
fused_knl = lp.assignment_to_subst(fused_knl, "f")
print(fused_knl)
Again, let's take a look at the generated code:
fused_knl = lp.set_options(fused_knl, write_cl=True)
evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))
For easier transformation, renumber the loop to start at 0, using affine_map_inames(kernel, old_inames, new_inames, equations)
:
fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i")
print(fused0_knl)
Now, map the kernel to OpenCL axes:
gpu_knl = lp.split_iname(fused0_knl, "inew", 128, outer_tag="g.0", inner_tag="l.0")
print(gpu_knl)
Finally, precompute the fluxes locally in each workgroup:
precomp_knl = lp.precompute(gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True)
precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
precomp_knl = lp.set_options(precomp_knl, return_dict=True)
evt, _ = precomp_knl(queue, u=u, h=h)
import matplotlib.pyplot as pt
# Forward Euler on a hyperbolic problem: terrible idea. Oh well.
dt = 0.1*h
u = cl.clmath.sin(cl.array.to_device(queue, grid))
for i in range(1400):
_, result_dict = precomp_knl(queue, u=u, h=h)
out = result_dict["out"]
out[0] = out[-1] = 0
u = u + dt*out
if i % 300 == 0:
pt.plot(u.get())