Loopy: Counting Operations

Setup code

In [1]:
import numpy as np
import pyopencl as cl
import pyopencl.array
import pyopencl.clrandom
import loopy as lp
In [2]:
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
In [18]:
n = 1024
a = cl.clrandom.rand(queue, (n, n), dtype=np.float32)
b = cl.clrandom.rand(queue, (n, n), dtype=np.float32)
/usr/lib/python3/dist-packages/IPython/core/completer.py:737: DeprecationWarning: inspect.getargspec() is deprecated, use inspect.signature() instead
  args,_,_1,defaults = inspect.getargspec(call_obj)

Operation-counting matrix multiplication

Here is the simple matrix-matrix multiplication kernel again:

In [7]:
knl = lp.make_kernel(
    "{[i,j,k]: 0<=i,j,k<n}",
    "c[i, j] = sum(k, a[i, k]*b[k, j])"
    )
knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "b":np.float32})

Counting flops

Let us determine the number of arithmetic operations being carried out:

In [10]:
lp.get_op_poly(knl)
Out[10]:
{(dtype('float32'), 'add'): PwQPolynomial("[n] -> { n^3 : n >= 1 }"),
 (dtype('float32'), 'mul'): PwQPolynomial("[n] -> { n^3 : n >= 1 }")}

The return type is easy to evaluate for a given set of parameters--just use the .eval_with_dict method:

In [15]:
poly = lp.get_op_poly(knl)[np.dtype(np.float32), "add"]
poly.eval_with_dict({"n": 15})
Out[15]:
3375

Counting memory access

In [17]:
lp.get_gmem_access_poly(knl)
Out[17]:
{(dtype('float32'),
  'uniform',
  'load'): PwQPolynomial("[n] -> { 2 * n^3 : n >= 1 }"),
 (dtype('float32'),
  'uniform',
  'store'): PwQPolynomial("[n] -> { n^2 : n >= 1 }")}

Operation-counting a transformed kernel

In [31]:
opt_knl = knl
opt_knl = lp.assume(opt_knl, "n mod 16 = 0")
opt_knl = lp.split_iname(opt_knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
opt_knl = lp.split_iname(opt_knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
opt_knl = lp.split_iname(opt_knl, "k", 16)
#opt_knl = lp.add_prefetch(opt_knl, "a", "i_inner,k_inner")
#opt_knl = lp.add_prefetch(opt_knl, "b", "j_inner,k_inner")

opt_knl = lp.set_options(opt_knl, write_cl=True)
_ = opt_knl(queue, a=a, b=b)
#define int_floor_div_pos_b(a,b) (                 ( (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b)                 )
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) loopy_kernel(__global float const *restrict a, __global float const *restrict b, __global float *restrict c, int const n)
{
  float acc_k_outer_k_inner;

  acc_k_outer_k_inner = 0.0f;
  for (int k_outer = 0; k_outer <= int_floor_div_pos_b(-16 + n, 16); ++k_outer)
    for (int k_inner = 0; k_inner <= 15; ++k_inner)
      acc_k_outer_k_inner = acc_k_outer_k_inner + a[n * (lid(1) + gid(0) * 16) + k_inner + k_outer * 16] * b[n * (k_inner + k_outer * 16) + lid(0) + gid(1) * 16];
  c[n * (lid(1) + gid(0) * 16) + lid(0) + gid(1) * 16] = acc_k_outer_k_inner;
}

Now count the memory accesses in the transformed version:

In [32]:
lp.get_gmem_access_poly(opt_knl)
Out[32]:
{(dtype('float32'),
  'consecutive',
  'load'): PwQPolynomial("[n] -> { (((512 - 192 * n + 24 * n^2 - n^3) * floor((n)/16)^3 + (1536 - 384 * n + 24 * n^2) * floor((n)/16)^4 + (1536 - 192 * n) * floor((n)/16)^5 + 512 * floor((n)/16)^6) + ((1536 - 192 * n - 24 * n^2 + 3 * n^3) * floor((n)/16)^2 + (3072 - 48 * n^2) * floor((n)/16)^3 + (1536 + 192 * n) * floor((n)/16)^4) * floor((15 + n)/16) + ((1536 + 192 * n - 24 * n^2 - 3 * n^3) * floor((n)/16) + 768 * n * floor((n)/16)^2 + (-3072 + 384 * n) * floor((n)/16)^3 - 1536 * floor((n)/16)^4) * floor((15 + n)/16)^2 + ((512 + 192 * n + 24 * n^2 + n^3) + (-3072 + 48 * n^2) * floor((n)/16) + (-3072 - 384 * n) * floor((n)/16)^2) * floor((15 + n)/16)^3 + ((-1536 - 384 * n - 24 * n^2) + (1536 - 192 * n) * floor((n)/16) + 1536 * floor((n)/16)^2) * floor((15 + n)/16)^4 + (1536 + 192 * n) * floor((15 + n)/16)^5 - 512 * floor((15 + n)/16)^6) : n >= 1 }"),
 (dtype('float32'),
  'consecutive',
  'store'): PwQPolynomial("[n] -> { (((8 * n - n^2) * floor((n)/16) + 8 * n * floor((n)/16)^2) + (8 * n + n^2) * floor((15 + n)/16) + -8 * n * floor((15 + n)/16)^2) : n >= 1 }"),
 (dtype('float32'),
  'nonconsecutive',
  'load'): PwQPolynomial("[n] -> { (((512 - 192 * n + 24 * n^2 - n^3) * floor((n)/16)^3 + (1536 - 384 * n + 24 * n^2) * floor((n)/16)^4 + (1536 - 192 * n) * floor((n)/16)^5 + 512 * floor((n)/16)^6) + ((1536 - 192 * n - 24 * n^2 + 3 * n^3) * floor((n)/16)^2 + (3072 - 48 * n^2) * floor((n)/16)^3 + (1536 + 192 * n) * floor((n)/16)^4) * floor((15 + n)/16) + ((1536 + 192 * n - 24 * n^2 - 3 * n^3) * floor((n)/16) + 768 * n * floor((n)/16)^2 + (-3072 + 384 * n) * floor((n)/16)^3 - 1536 * floor((n)/16)^4) * floor((15 + n)/16)^2 + ((512 + 192 * n + 24 * n^2 + n^3) + (-3072 + 48 * n^2) * floor((n)/16) + (-3072 - 384 * n) * floor((n)/16)^2) * floor((15 + n)/16)^3 + ((-1536 - 384 * n - 24 * n^2) + (1536 - 192 * n) * floor((n)/16) + 1536 * floor((n)/16)^2) * floor((15 + n)/16)^4 + (1536 + 192 * n) * floor((15 + n)/16)^5 - 512 * floor((15 + n)/16)^6) : n >= 1 }")}

Now enable the prefetch transformation above.