Loopy: Transforming a PDE to Code

In [1]:
import pymbolic.primitives as p

u = p.Variable("u")

We'll write code that evaluates $\triangle u$ using finite differences.

To that end, we define a new expression 'thing': An operator for the Laplacian.

In [2]:
class Laplacian(p.Expression):
    def __init__(self, child):
        self.child = child
        
    def __getinitargs__(self):
        return (self.child,)
    
    mapper_method = "map_laplacian"
        
pde = Laplacian(u)+u**2-1
pde
Out[2]:
Sum((Laplacian(Variable('u')), Power(Variable('u'), 2), -1))

Now we'll write code to turn Laplacians into their discrete finite difference forms, using i and j as formal indices, using

$$f''(x) \approx \frac{f(x+h) - 2 f(x) + f(x-h)}{h^{2}}$$

Pay close attention to indexing!

In [3]:
from pymbolic.mapper import IdentityMapper

i = p.Variable("i")
j = p.Variable("j")

ii = i+1
jj = j+1
In [4]:
class FDMapper(IdentityMapper):

    def map_variable(self, expr):

        return expr[i, j]

    

    def map_laplacian(self, expr):

        var = expr.child

        return (-4*var[ii,jj] + var[ii+1,jj] + var[ii-1,jj]

                + var[ii,jj+1] + var[ii,jj-1])
In [6]:
fd_mapper = FDMapper()
print(fd_mapper(pde))
u[i, j]**2 + -1 + (-4)*u[i + 1, j + 1] + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]

Now let's generate some code for this, using loopy:

In [7]:
import loopy as lp

result = p.Variable("result")

Observe the two parts of the loopy kernel description:

  • Polyhedral loop domain
  • Instructions
In [8]:
knl = lp.make_kernel(

    "{[i,j]: 0<=i,j<n}",

    [lp.ExpressionInstruction(result[i,j], fd_mapper(pde))],

    )

Kernels can always be inspected--simply use print:

In [9]:
print(knl)
---------------------------------------------------------------------------
KERNEL: loopy_kernel
---------------------------------------------------------------------------
ARGUMENTS:
n: ValueArg, type: <runtime>
result: GlobalArg, type: <runtime>, shape: (n, n), dim_tags: (N1:stride:n, N0:stride:1)
u: GlobalArg, type: <runtime>, shape: (2 + n, 2 + n), dim_tags: (N1:stride:2 + n, N0:stride:1)
---------------------------------------------------------------------------
DOMAINS:
[n] -> { [i, j] : i >= 0 and j >= 0 and i <= -1 + n and j <= -1 + n }
---------------------------------------------------------------------------
INAME IMPLEMENTATION TAGS:
i: None
j: None
---------------------------------------------------------------------------
INSTRUCTIONS:
[i,j]                                result[i, j] <- u[i, j]**2 + -1 + (-4)*u[i + 1, j + 1] + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]   # insn
---------------------------------------------------------------------------

Let's move towards running this code. To do so, we need pyopencl:

In [10]:
import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

And some data to work with:

In [11]:
n = 1000
u = cl.clrandom.rand(queue, (n+2,n+2), dtype=np.float32)

Now run the code, and tell loopy to print what it generates:

In [12]:
knl = lp.set_options(knl, write_cl=True)

evt, (result,) = knl(queue, u=u, n=n)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(int const n, __global float *restrict result, __global float const *restrict u)
{

  for (int j = 0; j <= -1 + n; ++j)
    for (int i = 0; i <= -1 + n; ++i)
      result[n * i + j] = u[(2 + n) * i + j] * u[(2 + n) * i + j] + -1.0f + -4.0f * u[(2 + n) * (i + 1) + j + 1] + u[(2 + n) * (i + 1 + 1) + j + 1] + u[(2 + n) * (i + 1 + -1) + j + 1] + u[(2 + n) * (i + 1) + j + 1 + 1] + u[(2 + n) * (i + 1) + j + 1 + -1];
}
/home/andreas/src/loopy/loopy/compiled.py:841: LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring
  kernel = get_one_scheduled_kernel(kernel)

That's obviously not very parallel. Introduce parallelism:

In [13]:
tknl = knl
tknl = lp.tag_inames(tknl, {"i": "g.0", "j": "g.1"})
evt, (result,) = tknl(queue, u=u)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(int const n, __global float *restrict result, __global float const *restrict u)
{

  result[n * gid(0) + gid(1)] = u[(2 + n) * gid(0) + gid(1)] * u[(2 + n) * gid(0) + gid(1)] + -1.0f + -4.0f * u[(2 + n) * (gid(0) + 1) + gid(1) + 1] + u[(2 + n) * (gid(0) + 1 + 1) + gid(1) + 1] + u[(2 + n) * (gid(0) + 1 + -1) + gid(1) + 1] + u[(2 + n) * (gid(0) + 1) + gid(1) + 1 + 1] + u[(2 + n) * (gid(0) + 1) + gid(1) + 1 + -1];
}

But OpenCL/CUDA require blocking to be efficient!

In [14]:
sknl = knl
sknl = lp.split_iname(sknl,
        "i", 16, outer_tag="g.1", inner_tag="l.1")
sknl = lp.split_iname(sknl,
        "j", 16, outer_tag="g.0", inner_tag="l.0")
evt, (result,) = sknl(queue, u=u)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(int const n, __global float *restrict result, __global float const *restrict u)
{

  if (
      -1 + -16 * gid(0) + -1 * lid(0) + n >= 0
      && -1 + -16 * gid(1) + -1 * lid(1) + n >= 0
    )
    result[n * (lid(1) + gid(1) * 16) + lid(0) + gid(0) * 16] = u[(2 + n) * (lid(1) + gid(1) * 16) + lid(0) + gid(0) * 16] * u[(2 + n) * (lid(1) + gid(1) * 16) + lid(0) + gid(0) * 16] + -1.0f + -4.0f * u[(2 + n) * (1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] + u[(2 + n) * (1 + 1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] + u[(2 + n) * (1 + -1 + lid(1) + gid(1) * 16) + 1 + lid(0) + gid(0) * 16] + u[(2 + n) * (1 + lid(1) + gid(1) * 16) + 1 + 1 + lid(0) + gid(0) * 16] + u[(2 + n) * (1 + lid(1) + gid(1) * 16) + 1 + -1 + lid(0) + gid(0) * 16];
}

How about some data reuse?

In [15]:
sknl = knl
sknl = lp.split_iname(sknl,
        "i", 16, outer_tag="g.1", inner_tag="l.1")
sknl = lp.split_iname(sknl,
        "j", 16, outer_tag="g.0", inner_tag="l.0")
sknl = lp.add_prefetch(sknl, "u",
    ["i_inner", "j_inner"],
    fetch_bounding_box=True)
evt, (result,) = sknl(queue, u=u, n=n)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(int const n, __global float *restrict result, __global float const *restrict u)
{
  __local float u_fetch[18 * 18];

  for (int u_dim_1_outer = 0; u_dim_1_outer <= 1; ++u_dim_1_outer)
    if (
        17 + -16 * u_dim_1_outer + -1 * lid(0) >= 0
        && 1 + -16 * u_dim_1_outer + -1 * lid(0) + -16 * gid(0) + n >= 0
      )
      for (int u_dim_0_outer = 0; u_dim_0_outer <= 1; ++u_dim_0_outer)
        if (
            1 + -16 * u_dim_0_outer + -1 * lid(1) + -16 * gid(1) + n >= 0
            && 17 + -16 * u_dim_0_outer + -1 * lid(1) >= 0
          )
          u_fetch[18 * (lid(1) + u_dim_0_outer * 16) + lid(0) + u_dim_1_outer * 16] = u[(2 + n) * (16 * gid(1) + lid(1) + u_dim_0_outer * 16) + 16 * gid(0) + lid(0) + u_dim_1_outer * 16];
  barrier(CLK_LOCAL_MEM_FENCE) /* for u_fetch (insn depends on u_fetch_rule) */;
  if (
      -1 + -16 * gid(0) + -1 * lid(0) + n >= 0
      && -1 + -16 * gid(1) + -1 * lid(1) + n >= 0
    )
    result[n * (lid(1) + gid(1) * 16) + lid(0) + gid(0) * 16] = u_fetch[18 * lid(1) + lid(0)] * u_fetch[18 * lid(1) + lid(0)] + -1.0f + -4.0f * u_fetch[18 * (1 + lid(1)) + 1 + lid(0)] + u_fetch[18 * (2 + lid(1)) + 1 + lid(0)] + u_fetch[18 * lid(1) + 1 + lid(0)] + u_fetch[18 * (1 + lid(1)) + 2 + lid(0)] + u_fetch[18 * (1 + lid(1)) + lid(0)];
}
In [ ]: