Loopy: Controlling data layout

Setup code

In [1]:
import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp
In [2]:
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

A kernel on a structured array

In [6]:
knl = lp.make_kernel(
    "{[el,dof, comp]: "
        "0<=el<nels "
        "and 0<=dof<14 "
        "and 0<=comp < 3}",
    "D[el, dof, comp] = eps[el] * E[el, dof, comp]")

knl = lp.set_options(knl, write_cl=True)
In [7]:
eps = np.random.randn(500)
E = cl.clrandom.rand(queue, (500, 14, 3), dtype=np.float64)
In [8]:
mknl = knl.copy()
evt, _ = mknl(queue, eps=eps, E=E)
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global double *restrict D, __global double const *restrict E, __global double const *restrict eps, int const nels)
{

  for (int comp = 0; comp <= 2; ++comp)
    if (-1 + nels >= 0)
      for (int dof = 0; dof <= 13; ++dof)
        for (int el = 0; el <= -1 + nels; ++el)
          D[42 * el + 3 * dof + comp] = eps[el] * E[42 * el + 3 * dof + comp];
}

Changing the layout

E and D are currently laid out as AoS. What if I want SoA?

In [9]:
mknl = knl

mknl  = lp.tag_data_axes(mknl, "E", "c,c,sep")
mknl = lp.tag_inames(mknl, {"comp": "unr"})
mknl = lp.set_loop_priority(mknl, "el,dof,comp")

# change data format of E
copy_knl = lp.make_copy_kernel("c,c,sep")
copy_knl = lp.fix_parameters(copy_knl, n2=3)
evt, E_new = copy_knl(queue, input=E)

evt, _ = mknl(queue, eps=eps, E=E_new)
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global double *restrict D, __global double const *restrict E_s0, __global double const *restrict E_s1, __global double const *restrict E_s2, __global double const *restrict eps, int const nels)
{

  for (int el = 0; el <= -1 + nels; ++el)
    for (int dof = 0; dof <= 13; ++dof)
    {
      D[42 * el + 3 * dof + 0] = eps[el] * E_s0[14 * el + dof];
      D[42 * el + 3 * dof + 1] = eps[el] * E_s1[14 * el + dof];
      D[42 * el + 3 * dof + 2] = eps[el] * E_s2[14 * el + dof];
    }
}

May want to add padding (demo).


Grouped padding exists as well.