__kernel void MatrixVectorMul1(const __global float* M,
                               const __global float* V,
                               __global float* W,
                               uint width, uint height)
{
  // Each work-item computes one element of W
  uint y = get_global_id(0);
  if (y >= height) return;

  const __global float* row = M + y * width;

  float dotProduct = 0;
  for (uint x = 0; x < width; ++x)
    dotProduct += row[x] * V[x];

  W[y] = dotProduct;
}

__kernel void MatrixVectorMul2(const __global float* M,
                               const __global float* V,
                               __global float* W,
                               uint width, uint height)
{
  // Each work-item computes multiple elements of W
  for (uint y = get_global_id(0); y < height; y += get_global_size(0)) 
    {
      const __global float* row = M + y * width;

      float dotProduct = 0;
      for (uint x = 0; x < width; ++x)
        dotProduct += row[x] * V[x];
      
      W[y] = dotProduct;
    }
}

__kernel void MatrixVectorMul3(const __global float* M,
                               const __global float* V,
                               __global float* W,
                               uint width, uint height,
                               __local float* partialDotProduct)
{
  // Each work-group computes multiple elements of W
  for (uint y = get_group_id(0); y < height; y += get_num_groups(0)) 
    {
      const __global float* row = M + y * width;
      
      float sum = 0;
      for (uint x = get_local_id(0); x < width; x += get_local_size(0))
        sum += row[x] * V[x];
      
      partialDotProduct[get_local_id(0)] = sum;
      
      barrier(CLK_LOCAL_MEM_FENCE);
      
      if (get_local_id(0) == 0) 
        {
          float dotProduct = 0;
          for (uint t = 0; t < get_local_size(0); ++t)
            dotProduct += partialDotProduct[t];
          W[y] = dotProduct;
        }
      
      barrier(CLK_LOCAL_MEM_FENCE);
    }
}


__kernel void MatrixVectorMul4(const __global float* M,
                              const __global float* V,
                              __global float* W,
                              uint width, uint height,
                              __local float* partialDotProduct)
{
  // Each work-group computes multiple elements of W
  for (uint y = get_group_id(0); y < height; y += get_num_groups(0)) 
    {
      const __global float* row = M + y * width;
      
      float sum = 0;
      for (uint x = get_local_id(0); x < width; x += get_local_size(0))
        sum += row[x] * V[x];
      
      partialDotProduct[get_local_id(0)] = sum;
      
      for (uint stride = 1; stride < get_local_size(0); stride *= 2) 
        {
          barrier(CLK_LOCAL_MEM_FENCE);
          uint index = 2 * stride * get_local_id(0);
          
          if (index < get_local_size(0))
            partialDotProduct[index] +=
              partialDotProduct[index + stride];          
        }
      
      if (get_local_id(0) == 0)
        W[y] = partialDotProduct[0];
      
      barrier(CLK_LOCAL_MEM_FENCE);
    }
}

__kernel void MatrixVectorMul5(const __global float* M,
                              const __global float* V,
                              __global float* W,
                              uint width, uint height,
                              __local float* partialDotProduct)
{
  for (uint y = get_group_id(0); y < height; y += get_num_groups(0)) 
    {
      const __global float* row = M + y * width;
      
      float sum = 0;
      for (uint x = get_local_id(0); x < width; x += get_local_size(0))
	sum += row[x] * V[x];
      
      partialDotProduct[get_local_id(0)] = sum;
      
      for (uint stride = get_local_size(0)/2; stride > 0; stride /= 2) 
	{
	  barrier(CLK_LOCAL_MEM_FENCE);
	  
	  if (get_local_id(0) < stride) 
	    partialDotProduct[get_local_id(0)] +=
	      partialDotProduct[get_local_id(0) + stride];
	}
      
      if (get_local_id(0) == 0) W[y] = partialDotProduct[0];
      
      barrier(CLK_LOCAL_MEM_FENCE);
    }
}
