Hello Loopy: Computing a Rank-One Matrix

Setup Code

In [2]:
import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp
In [3]:
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
In [4]:
n = 1024
a = cl.clrandom.rand(queue, n, dtype=np.float32)
b = cl.clrandom.rand(queue, n, dtype=np.float32)

The Initial Kernel

In [5]:
knl = lp.make_kernel(
    "{[i,j]: 0<=i,j<n}",
    "c[i, j] = a[i]*b[j]")
In [6]:
knl = lp.set_options(knl, write_cl=True)
evt, (mat,) = knl(queue, a=a, b=b)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)
{

  for (int j = 0; j <= -1 + n; ++j)
    for (int i = 0; i <= -1 + n; ++i)
      c[n * i + j] = a[i] * b[j];
}
/home/andreas/src/loopy/loopy/compiled.py:841: LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring
  kernel = get_one_scheduled_kernel(kernel)

Python glue for ease of use: The kernel "wrapper"

Invocation details handled by generated Python "wrapper":

  • Argument derivation
  • Data copying (from/to host, if requested)
  • Shape/type/data layout checks
In [7]:
wknl = lp.set_options(knl, write_wrapper=True, write_cl=False)
evt, (mat,) = wknl(queue, a=a, b=b)
from __future__ import division

import pyopencl as _lpy_cl
import pyopencl.array as _lpy_cl_array
import pyopencl.tools as _lpy_cl_tools
import numpy as _lpy_np
from struct import pack as _lpy_pack

def invoke_loopy_kernel_loopy_kernel(cl_kernel, queue, allocator=None, wait_for=None, out_host=None, a=None, b=None, c=None, n=None):
    if allocator is None:
        allocator = _lpy_cl_tools.DeferredAllocator(queue.context)

    # {{{ find integer arguments from shapes

    if n is None:
        if a is not None:
            n = a.shape[0]
        elif b is not None:
            n = b.shape[0]
        elif c is not None:
            n = c.shape[0]
        elif c is not None:
            n = c.shape[1]

    # }}}

    # {{{ find integer arguments from offsets

    # }}}

    # {{{ find integer arguments from strides

    # }}}

    # {{{ process n

    if n is None:
        raise RuntimeError("input argument 'n' must "
            "be supplied")
    cl_kernel.set_arg(3, _lpy_pack('i', n))

    # }}}

    # {{{ set up array arguments

    _lpy_encountered_numpy = False
    _lpy_encountered_dev = False

    # {{{ process a

    if isinstance(a, _lpy_np.ndarray):
        # synchronous, nothing to worry about
        a = _lpy_cl_array.to_device(queue, a, allocator=allocator)
        _lpy_encountered_numpy = True
    elif a is not None:
        _lpy_encountered_dev = True

    if a is None:
        raise RuntimeError("input argument 'a' must be supplied")

    if True:
        if a.dtype != _lpy_np.float32:
            raise TypeError("dtype mismatch on argument 'a' (got: %s, expected: float32)" % a.dtype)
        if a.shape != (n,):
            raise TypeError("shape mismatch on argument 'a' (got: %s, expected: %s)" % (a.shape, (n,)))
        if a.strides != (4,):
            raise TypeError("strides mismatch on argument 'a' (got: %s, expected: %s)" % (a.strides, (4,)))
        if a.offset:
            raise ValueError("Argument 'a' does not allow arrays with offsets. Try passing default_offset=loopy.auto to make_kernel().")

    cl_kernel.set_arg(0, a.base_data)

    # }}}

    # {{{ process b

    if isinstance(b, _lpy_np.ndarray):
        # synchronous, nothing to worry about
        b = _lpy_cl_array.to_device(queue, b, allocator=allocator)
        _lpy_encountered_numpy = True
    elif b is not None:
        _lpy_encountered_dev = True

    if b is None:
        raise RuntimeError("input argument 'b' must be supplied")

    if True:
        if b.dtype != _lpy_np.float32:
            raise TypeError("dtype mismatch on argument 'b' (got: %s, expected: float32)" % b.dtype)
        if b.shape != (n,):
            raise TypeError("shape mismatch on argument 'b' (got: %s, expected: %s)" % (b.shape, (n,)))
        if b.strides != (4,):
            raise TypeError("strides mismatch on argument 'b' (got: %s, expected: %s)" % (b.strides, (4,)))
        if b.offset:
            raise ValueError("Argument 'b' does not allow arrays with offsets. Try passing default_offset=loopy.auto to make_kernel().")

    cl_kernel.set_arg(1, b.base_data)

    # }}}

    # {{{ process c

    if isinstance(c, _lpy_np.ndarray):
        # synchronous, nothing to worry about
        c = _lpy_cl_array.to_device(queue, c, allocator=allocator)
        _lpy_encountered_numpy = True
    elif c is not None:
        _lpy_encountered_dev = True

    _lpy_made_by_loopy = False

    if c is None:
        _lpy_shape_0 = n
        _lpy_shape_1 = n
        _lpy_strides_0 = 4*n
        _lpy_strides_1 = 4
        assert _lpy_strides_0 > 0, "'c' has negative stride in axis 0"
        assert _lpy_strides_1 > 0, "'c' has negative stride in axis 1"
        _lpy_alloc_size = _lpy_strides_0*(_lpy_shape_0 + -1) + _lpy_strides_1*(_lpy_shape_1 + -1) + 4
        c = _lpy_cl_array.Array(queue, (_lpy_shape_0, _lpy_shape_1), _lpy_np.float32, strides=(_lpy_strides_0, _lpy_strides_1), data=allocator(_lpy_alloc_size), allocator=allocator)
        del _lpy_shape_0
        del _lpy_strides_0
        del _lpy_shape_1
        del _lpy_strides_1
        del _lpy_alloc_size

        _lpy_made_by_loopy = True

    if not _lpy_made_by_loopy:
        if c.dtype != _lpy_np.float32:
            raise TypeError("dtype mismatch on argument 'c' (got: %s, expected: float32)" % c.dtype)
        if c.shape != (n, n):
            raise TypeError("shape mismatch on argument 'c' (got: %s, expected: %s)" % (c.shape, (n, n,)))
        if c.strides != (4*n, 4):
            raise TypeError("strides mismatch on argument 'c' (got: %s, expected: %s)" % (c.strides, (4*n, 4)))
        if c.offset:
            raise ValueError("Argument 'c' does not allow arrays with offsets. Try passing default_offset=loopy.auto to make_kernel().")

    del _lpy_made_by_loopy

    cl_kernel.set_arg(2, c.base_data)

    # }}}

    # }}}

    _lpy_evt = _lpy_cl.enqueue_nd_range_kernel(queue, cl_kernel, (int(1),), (int(1),),  wait_for=wait_for, g_times_l=True)

    if out_host is None and (_lpy_encountered_numpy and not _lpy_encountered_dev):
        out_host = True
    if out_host:
        pass
        c = c.get(queue=queue)

    return _lpy_evt, (c,)
/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)
  warn(text, type)
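
As a quick illustration of the copying behavior described above (a sketch, not part of the original transcript): when only plain numpy arrays are passed in, the wrapper transfers them to the device, allocates c, runs the kernel, and hands the result back as a numpy array.

quiet_knl = lp.set_options(knl, write_cl=False)  # silence the code dump for this call

a_host = np.random.rand(n).astype(np.float32)
b_host = np.random.rand(n).astype(np.float32)

evt, (mat_host,) = quiet_knl(queue, a=a_host, b=b_host)

# Only host arrays went in, so the wrapper copies them to the device,
# allocates 'c', and copies the result back to the host.
assert isinstance(mat_host, np.ndarray)
assert mat_host.shape == (n, n)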

Transforming kernels: Loop Splitting

Next: transform the kernel. Example: split a loop into fixed-length "chunks".

In [7]:
isplit_knl = knl
isplit_knl = lp.split_iname(isplit_knl, "i", 4)

evt, (mat,) = isplit_knl(queue, a=a, b=b)
#define int_floor_div_pos_b(a,b) (                 ( (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b)                 )
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)
{

  for (int i_outer = 0; i_outer <= -1 + int_floor_div_pos_b(3 + n, 4); ++i_outer)
    for (int j = 0; j <= -1 + n; ++j)
      for (int i_inner = 0; i_inner <= 3; ++i_inner)
        if (-1 + -1 * i_inner + -4 * i_outer + n >= 0)
          c[n * (i_inner + i_outer * 4) + j] = a[i_inner + i_outer * 4] * b[j];
}

Want to get rid of the conditional?

Transforming kernels: Implementation Tags

Every loop axis ("iname") comes with an implementation tag.

In [8]:
isplit_knl = knl
isplit_knl = lp.assume(isplit_knl, "n mod 4 = 0")
isplit_knl = lp.split_iname(isplit_knl, "i", 4)
isplit_knl = lp.tag_inames(isplit_knl, {"i_inner": "unr"})

evt, (mat,) = isplit_knl(queue, a=a, b=b)
#define int_floor_div_pos_b(a,b) (                 ( (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b)                 )
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)
{

  for (int i_outer = 0; i_outer <= int_floor_div_pos_b(-4 + n, 4); ++i_outer)
    for (int j = 0; j <= -1 + n; ++j)
    {
      c[n * (0 + i_outer * 4) + j] = a[0 + i_outer * 4] * b[j];
      c[n * (1 + i_outer * 4) + j] = a[1 + i_outer * 4] * b[j];
      c[n * (2 + i_outer * 4) + j] = a[2 + i_outer * 4] * b[j];
      c[n * (3 + i_outer * 4) + j] = a[3 + i_outer * 4] * b[j];
    }
}

May want to influence loop ordering.
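
For instance (a sketch, not shown in the original output), a loop priority can request that j become the outer loop; current loopy spells this lp.prioritize_loops, older versions call it lp.set_loop_priority:

# Sketch: ask loopy to nest the i loop inside the j loop.
jorder_knl = lp.prioritize_loops(knl, "j,i")  # older loopy: lp.set_loop_priority
evt, (mat,) = jorder_knl(queue, a=a, b=b)     # with write_cl=True, the dump shows i nested inside j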


"Map to GPU hw axis" is an iname tag as well.

Use shortcuts for less typing:

In [9]:
split_knl = knl
split_knl = lp.split_iname(split_knl, "i", 16,
        outer_tag="g.0", inner_tag="l.0")
split_knl = lp.split_iname(split_knl, "j", 16,
        outer_tag="g.1", inner_tag="l.1")

evt, (mat,) = split_knl(queue, a=a, b=b)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)
{

  if (
      -1 + -16 * gid(1) + -1 * lid(1) + n >= 0
      && -1 + -16 * gid(0) + -1 * lid(0) + n >= 0
    )
    c[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = a[lid(0) + gid(0) * 16] * b[lid(1) + gid(1) * 16];
}
/home/andreas/src/loopy/loopy/diagnostic.py:60: LoopyAdvisory: No device parameter was passed to the PyOpenCLTarget. Perhaps you want to pass a device to benefit from additional checking. (add 'no_device_in_pre_codegen_checks' to silenced_warnings kernel argument to disable)
  warn(text, type)

Transforming kernels: Leveraging Data Reuse

Better! But still not much data reuse.

In [10]:
fetch1_knl = knl

fetch1_knl = lp.add_prefetch(fetch1_knl, "a")
fetch1_knl = lp.add_prefetch(fetch1_knl, "b")

evt, (mat,) = fetch1_knl(queue, a=a, b=b)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)
{
  float b_fetch_0;
  float a_fetch_0;

  for (int i = 0; i <= -1 + n; ++i)
  {
    a_fetch_0 = a[i];
    for (int j = 0; j <= -1 + n; ++j)
    {
      b_fetch_0 = b[j];
      c[n * i + j] = a_fetch_0 * b_fetch_0;
    }
  }
}

But this is useless for the GPU version. (demo)
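
A sketch of what that demo might look like (not in the original output): calling add_prefetch on the GPU-split kernel without naming any footprint inames only buffers a single element per work-item, so nothing is shared or reused.

gpu_fetch1_knl = lp.add_prefetch(split_knl, "a")
gpu_fetch1_knl = lp.add_prefetch(gpu_fetch1_knl, "b")

# Each work item handles a single (i, j), so these "prefetches" are just
# private scalar copies -- no reuse across the work group.
evt, (mat,) = gpu_fetch1_knl(queue, a=a, b=b)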


Would like to fetch entire "access footprint" of a loop.

In [11]:
fetch_knl = split_knl

fetch_knl = lp.add_prefetch(fetch_knl, "a", ["i_inner"])
fetch_knl = lp.add_prefetch(fetch_knl, "b", ["j_inner"])

evt, (mat,) = fetch_knl(queue, a=a, b=b)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)
{
  __local float b_fetch_0[16];
  __local float a_fetch_0[16];

  if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
    a_fetch_0[lid(0)] = a[lid(0) + 16 * gid(0)];
  if (-1 + -16 * gid(1) + -1 * lid(0) + n >= 0)
    b_fetch_0[lid(0)] = b[lid(0) + 16 * gid(1)];
  barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch_0 (insn depends on a_fetch) */;
  if (
      -1 + -16 * gid(1) + -1 * lid(1) + n >= 0
      && -1 + -16 * gid(0) + -1 * lid(0) + n >= 0
    )
    c[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = a_fetch_0[lid(0)] * b_fetch_0[lid(1)];
}

Transforming kernels: Eliminating Conditionals

All those conditionals take time to evaluate!

In [8]:
sfetch_knl = knl
sfetch_knl = lp.split_iname(sfetch_knl, "i", 16,
        outer_tag="g.0", inner_tag="l.0", slabs=(0,1))
sfetch_knl = lp.split_iname(sfetch_knl, "j", 16,
        outer_tag="g.1", inner_tag="l.1", slabs=(0,1))

sfetch_knl = lp.add_prefetch(sfetch_knl, "a", ["i_inner"])
sfetch_knl = lp.add_prefetch(sfetch_knl, "b", ["j_inner"])

evt, (mat,) = sfetch_knl(queue, a=a, b=b)
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)
{
  __local float a_fetch[16];
  __local float b_fetch[16];

  /* bulk slab for 'j_outer' */

  /* bulk slab for 'i_outer' */

  if (
      -17 + -16 * gid(1) + n >= 0
      && -17 + -16 * gid(0) + n >= 0
    )
  {
    b_fetch[lid(0)] = b[lid(0) + 16 * gid(1)];
    a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)];
    barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn depends on a_fetch_rule) */;
    c[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = a_fetch[lid(0)] * b_fetch[lid(1)];
  }
  /* final slab for 'i_outer' */

  if (
      16 + 16 * gid(0) + -1 * n >= 0
      && -17 + -16 * gid(1) + n >= 0
    )
  {
    b_fetch[lid(0)] = b[lid(0) + 16 * gid(1)];
    if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
      a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)];
    barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn depends on a_fetch_rule) */;
    if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
      c[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = a_fetch[lid(0)] * b_fetch[lid(1)];
  }
  /* final slab for 'j_outer' */

  /* bulk slab for 'i_outer' */

  if (
      16 + 16 * gid(1) + -1 * n >= 0
      && -17 + -16 * gid(0) + n >= 0
    )
  {
    if (-1 + -16 * gid(1) + -1 * lid(0) + n >= 0)
      b_fetch[lid(0)] = b[lid(0) + 16 * gid(1)];
    a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)];
    barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn depends on a_fetch_rule) */;
    if (-1 + -16 * gid(1) + -1 * lid(1) + n >= 0)
      c[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = a_fetch[lid(0)] * b_fetch[lid(1)];
  }
  /* final slab for 'i_outer' */

  if (
      -1 * gid(0) + gid(1) == 0
      && 16 + 16 * gid(0) + -1 * n >= 0
    )
  {
    if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
      b_fetch[lid(0)] = b[lid(0) + 16 * gid(1)];
    if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
      a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)];
    barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn depends on a_fetch_rule) */;
    if (
        -1 + -16 * gid(0) + -1 * lid(1) + n >= 0
        && -1 + -16 * gid(0) + -1 * lid(0) + n >= 0
      )
      c[n * (lid(0) + gid(0) * 16) + lid(1) + gid(1) * 16] = a_fetch[lid(0)] * b_fetch[lid(1)];
  }
}
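
As a sanity check (a sketch, not part of the original slides), any of the variants above can be compared against the same rank-one matrix computed on the host:

import numpy.linalg as la

ref = a.get()[:, None] * b.get()[None, :]   # host-side rank-one reference
print(la.norm(mat.get() - ref))             # should print (near) zero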