#define USE_VIENNACL_FAST_COPY
//#undef USE_VIENNACL_FAST_COPY

#include "libscl.h"
#include "vclmat.h"

#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/linalg/direct_solve.hpp"
#include "viennacl/linalg/prod.hpp" 
#include "viennacl/linalg/norm_2.hpp"

using namespace scl;
using namespace std;

const INTEGER p = 30;
const INTEGER n = 100000;

const REAL sig = 0.01;

int main(int argc, char** argp, char** envp)
{

  cout << '\n';
  #if defined USE_VIENNACL_FAST_COPY
    cout << *argp << " is using fast_copy" << '\n';
  #else
    cout << *argp << " is not using fast_copy" << '\n';
  #endif

  realmat X(n,p);
  realmat y(n,1);

  INT_32BIT seed = 128;

  for (INTEGER i=1; i<=n; ++i) {
    REAL sum = 0.0;
    for (INTEGER j=1; j<=p; ++j) {
      sum += X(i,j) = unsk(seed);
    }
    y[i] = sum + sig*unsk(seed);
  }

  stopwatch timer;

  realmat b = invpsd(T(X)*X)*(T(X)*y);

  REAL cpu_time = timer.time();

  cout << '\n';
  cout << "libscl least squares time = " << cpu_time << '\n';

  realmat bcopy = b;

  vclmat cpu_X(X);

  viennacl::matrix<float,viennacl::column_major> gpu_X(n,p);
  viennacl::matrix<float,viennacl::column_major> gpu_C(p,p);
  viennacl::vector<float> gpu_y(n);
  viennacl::vector<float> gpu_b(p);

  timer.reset();

  #if defined USE_VIENNACL_FAST_COPY
    viennacl::fast_copy(cpu_X.begin(),cpu_X.end(),gpu_X);
    viennacl::fast_copy(y.begin(),y.end(),gpu_y.begin());
  #else 
    viennacl::copy(cpu_X,gpu_X);
    viennacl::copy(y.begin(),y.end(),gpu_y.begin());
  #endif

  REAL vcl_time_1 = timer.time();

  cout << "viennacl X & y copy time = " << vcl_time_1 << '\n';

  timer.reset();

  gpu_C = viennacl::linalg::prod(trans(gpu_X), gpu_X);
  gpu_b = viennacl::linalg::prod(trans(gpu_X), gpu_y);
  viennacl::linalg::lu_factorize(gpu_C); 
  viennacl::linalg::lu_substitute(gpu_C, gpu_b);

  REAL vcl_time_2 = timer.time();
  
  cout << "viennacl least squares time = " << vcl_time_2 << '\n';

  timer.reset();

  #if defined USE_VIENNACL_FAST_COPY
    viennacl::fast_copy(gpu_b.begin(),gpu_b.end(),b.begin());
  #else
    viennacl::copy(gpu_b.begin(),gpu_b.end(),b.begin());
  #endif

  REAL vcl_time_3 = timer.time();

  cout << "viennacl b copy time = " << vcl_time_3 << '\n';

  unsigned int count = 0;

  for (INTEGER i = 1; i <= p ; ++i) {
    float err = bcopy[i] - b[i];
    float tol =  bcopy[i]*1.0e-4;
    err = err > 0 ? err : -err;
    tol = tol > 0 ? tol : -tol;
    if (err > tol) ++count;
  }

  REAL vcl_time = vcl_time_1 + vcl_time_2 + vcl_time_3;

  cout << "viennacl total time = " << vcl_time << '\n';

  cout << "GPU/CPU total time          = " 
       << fmt('f',8,4,100.0*vcl_time/cpu_time) << " per cent" << '\n';
  cout << "GPU/CPU least squares time  = " 
       << fmt('f',8,4,100.0*vcl_time_2/cpu_time) << " per cent" << '\n';

  cout << "viennacl errors = " << count << '\n';
  cout << '\n';

  return 0;
}
