/**  
@file 
@warning Automatically Generated
*/
/**  
 @warning AUTOMATICALLY GENERATED
*/




#ifndef RYSQ_KERNEL_QUADRATURE2_IMPL_HPP
#define RYSQ_KERNEL_QUADRATURE2_IMPL_HPP

#include "rysq-core.hpp"
#include "meta.hpp"
#include "kernel/forward.hpp"
#include "kernel/vector.hpp"

#include <boost/config.hpp>

namespace rysq {
namespace kernel {
namespace quadrature {

namespace recurrence {

    /**
       Scalar recurrence coefficient 0.5*A1*(1 - B*t2).
       The kernels in this file call it with a reciprocal as A1
       (1.0/A or 1.0/B).
       @param A1 factor that is halved
       @param B  multiplier applied to t2
       @param t2 value scaled by B before being subtracted from one
       @return (0.5*A1) * (1.0 - B*t2)
    */
    BOOST_GPU_ENABLED
    inline double coefficient(double A1, double B, double t2) {
	const double half = 0.5*A1;
	const double remainder = 1.0 - B*t2;
	return half*remainder;
    }
    
    /**
       Per-axis recurrence coefficient rAi[q] - B*rAB[q]*t2.
       @tparam q   component index read from both rAi and rAB
       @tparam RAI any type indexable with operator[]
       @tparam RAB any type indexable with operator[]
       @param rAi  source of the leading term rAi[q]
       @param B    multiplier of the subtracted term
       @param rAB  source of the scaled term rAB[q]
       @param t2   final factor of the subtracted term
       @return rAi[q] minus the product B*rAB[q]*t2
    */
    template<int q, class RAI, class RAB>
    BOOST_GPU_ENABLED
    inline double coefficient(const RAI &rAi, double B,
			      const RAB &rAB, double t2) {
	const double shift = B*rAB[q]*t2;
	return rAi[q] - shift;
    }

    // template<int q, int N>
    // BOOST_GPU_ENABLED vector<N> coefficient(const vector<3> &rAi, double B,
    // 					const vector<3> &rAB, double (&t2)[N]) {
    // 	vector<N> C;
    // 	for (int a = 0; a < N; ++a) {
    // 	    C[a] = coefficient<q>(rAi, B, rAB, t2[a]);
    // 	}
    // return C;
    // }

    /**
       Compile-time integer power: x multiplied N times, evaluated as
       x*(x*(...*(x*1))).
       @tparam N non-negative exponent, resolved at compile time
       @return x raised to the N-th power
    */
    template<size_t N>
    BOOST_GPU_ENABLED
    double pow(double base) {
	const double lower = pow<N-1>(base);
	return base*lower;
    }

    /** Recursion anchor: anything to the power zero is one. */
    template<>
    BOOST_GPU_ENABLED
    inline double pow<0>(double) {
	return 1.0;
    }

}

namespace mpl = boost::mpl;


/**
   Generated quadrature kernel for the braket <S P | S SP>.

   eval() accumulates 12 values per loop point into I, in the order the
   generator emitted them.  reorder() and both index() overloads describe
   the permutation from that emission order to the final layout.
   @warning generated code
*/
template<>
struct impl<meta::braket<rysq::S, rysq::P, rysq::S, rysq::SP> > {
    typedef void enable;
    static const bool value = true;
    /** Point count used by the fixed-size eval() overload. */
    static const int N = 2;

    /** Default accumulator: adds the contribution q to *Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
       Fixed-size entry point: evaluates N points and accumulates the
       12 results into I with the default update functor.
    */
    template<
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double (&I)[12]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
       Evaluates M points and accumulates into I[0..11] with the default
       update functor.
    */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
       Core evaluation.  For every a in [0, M) the 12 generated values are
       handed to update(value, I + slot), slot running from 0 to 11.
       @tparam M number of points read from t2 and W
       @tparam U callable taking (double, double*)
       @param C  C[0][0] scales slots 0-2, C[1][0] scales slots 3-11
       @param I  output; must provide at least 12 elements
    */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {

	// components of the two shift vectors, bound once outside the loop
	const double &ij_x = rij[0];
	const double &ij_y = rij[1];
	const double &ij_z = rij[2];

	const double &kl_x = rkl[0];
	const double &kl_y = rkl[1];
	const double &kl_z = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // term added to the three "same axis" products below
	    const double half_t2 = 0.5*t2[a];

	    // per-axis factors built from rAi (shifted by rij)
	    const double ix = recurrence::coefficient<0>(rAi, B, rAB, t2[a]) + ij_x;
	    const double iy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]) + ij_y;
	    const double iz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]) + ij_z;

	    // per-axis factors built from rBk (shifted by rkl)
	    const double kx = kl_x + recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    const double ky = kl_y + recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    const double kz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]) + kl_z;

	    // weight of this point combined with each contraction coefficient
	    const double w0 = C[0][0]*W[a];
	    const double w1 = C[1][0]*W[a];

	    update(w0*ix, I+0);
	    update(w0*iy, I+1);
	    update(w0*iz, I+2);
	    update(w1*(half_t2 + ix*kx), I+3);
	    update(w1*(iy*kx), I+4);
	    update(w1*(iz*kx), I+5);
	    update(w1*(half_t2 + iy*ky), I+6);
	    update(w1*(iz*ky), I+7);
	    update(w1*(ix*ky), I+8);
	    update(w1*(iz*kz + half_t2), I+9);
	    update(w1*(ix*kz), I+10);
	    update(w1*(iy*kz), I+11);

	}

    }

    /**
       Permutes I in place from emission order to final order: the value
       at generated position i ends up at position index(i).  Positions
       0-5 map to themselves, so only 6-11 are moved.
    */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[12]) {
	// save the six moving values before any slot is overwritten
	const double g6 = I[6];
	const double g7 = I[7];
	const double g8 = I[8];
	const double g9 = I[9];
	const double g10 = I[10];
	const double g11 = I[11];
	I[6] = g8;
	I[7] = g6;
	I[8] = g7;
	I[9] = g10;
	I[10] = g11;
	I[11] = g9;
    }

    /**
       Final position of the i-th generated value.
       No bounds check is performed: i must lie in [0, 12).
    */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	    0, 1, 2, 3, 4, 5, 7, 8, 6, 11, 9, 10
	};
	return permutation[i];
    }

    /**
       Writes the 12-entry permutation into idx[0..11]; idx must have
       room for 12 elements.
    */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	idx[0] = 0;
	idx[1] = 1;
	idx[2] = 2;
	idx[3] = 3;
	idx[4] = 4;
	idx[5] = 5;
	idx[6] = 7;
	idx[7] = 8;
	idx[8] = 6;
	idx[9] = 11;
	idx[10] = 9;
	idx[11] = 10;
    }


};

/**
   Generated quadrature kernel for the braket <S S | SP F>.

   eval() accumulates 40 values per loop point into I, in the order the
   generator emitted them.  reorder() and both index() overloads describe
   the permutation from that emission order to the final layout.
   NOTE(review): the permutation looks like (SP component) + 4*(F component)
   -- confirm against the generator.
   @warning generated code
*/
template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::SP, rysq::F> > {
    typedef void enable;
    static const bool value = true; 
    /// Point count used by the fixed-size eval() overload.
    static const int N = 3;

    /// Default accumulator: adds the contribution q to *Q.
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /// Fixed-size entry point: evaluates 3 points and accumulates the
    /// 40 results into I with the default update functor.
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<3> &t2, const vector<3> &W,
    // 		     const double (&C)[2][1],
	      double (&I)[40]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /// Evaluates M points and accumulates into I[0..39] with the default
    /// update functor.
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /// Core evaluation.  For every a in [0, M) the 40 generated values are
    /// handed to update(value, I + slot), slot running from 0 to 39.
    /// rAi and rij are accepted but not read by this specialization.
    /// @tparam M number of points read from t2 and W
    /// @tparam U callable taking (double, double*)
    /// @param C  each output is scaled by either C[0][0] or C[1][0]
    /// @param I  output; must provide at least 40 elements
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {


	// components of the rkl shift vector, bound once outside the loop
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // scalar coefficient 0.5*(1/B)*(1 - A*t2[a])
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);


	    // per-axis coefficients rBk[q] + A*rAB[q]*t2[a]
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

// Shorthand for the generated expressions: pow(x,n) expands to the
// compile-time recurrence::pow<n>(x).  Removed again after the loop body.
#define pow(x,y) recurrence::pow<y>((x))

	    // D shifted by the matching rkl component
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);

	    // f* below are subexpressions shared by several outputs; each is
	    // declared immediately before its first use.
	    update((C[0][0])*W[a]*(Kx*Ky*Kz), I+0);
	    double f1 = 3*pow(B01,2);
	    update((C[1][0])*W[a]*((3*B01*Kx*(Xkl + 2*Dx) + f1 + Dx*pow(Kx,3))), I+1);
	    update((C[1][0])*W[a]*((Dz*pow(Kz,3) + f1 + 3*B01*Kz*(2*Dz + Zkl))), I+2);
	    update((C[1][0])*W[a]*((Dy*pow(Ky,3) + f1 + 3*B01*Ky*(Ykl + 2*Dy))), I+3);
	    double f10 = (pow(Kz,2) + 3*B01);
	    update((C[1][0])*W[a]*(Dy*Kz*f10), I+4);
	    update((C[1][0])*W[a]*(Dx*Kz*f10), I+5);
	    update((C[0][0])*W[a]*(Kz*f10), I+6);
	    double f11 = (pow(Ky,2) + B01);
	    update((C[1][0])*W[a]*(Dx*Kz*f11), I+7);
	    update((C[0][0])*W[a]*(Kz*f11), I+8);
	    update((C[0][0])*W[a]*(Kx*f11), I+9);
	    update((C[1][0])*W[a]*(Dz*Kx*f11), I+10);
	    double f12 = (Dz*Kz + B01);
	    update((C[1][0])*W[a]*(Kx*Ky*f12), I+11);
	    update((C[1][0])*W[a]*(f11*f12), I+12);
	    double f13 = (3*B01 + pow(Kx,2));
	    update((C[1][0])*W[a]*(Dz*Kx*f13), I+13);
	    update((C[1][0])*W[a]*(Dy*Kx*f13), I+14);
	    update((C[0][0])*W[a]*(Kx*f13), I+15);
	    double f14 = (B01*(2*Zkl + 3*Dz) + Dz*pow(Kz,2));
	    update((C[1][0])*W[a]*(Kx*f14), I+16);
	    update((C[1][0])*W[a]*(Ky*f14), I+17);
	    double f15 = (B01 + Dy*Ky);
	    update((C[1][0])*W[a]*(Kx*Kz*f15), I+18);
	    double f2 = (B01*(2*Xkl + 3*Dx) + Dx*pow(Kx,2));
	    update((C[1][0])*W[a]*(Kz*f2), I+19);
	    update((C[1][0])*W[a]*(Ky*f2), I+20);
	    double f4 = (pow(Kz,2) + B01);
	    update((C[1][0])*W[a]*(Dy*Kx*f4), I+21);
	    update((C[0][0])*W[a]*(Kx*f4), I+22);
	    update((C[1][0])*W[a]*(Dx*Ky*f4), I+23);
	    update((C[0][0])*W[a]*(Ky*f4), I+24);
	    update((C[1][0])*W[a]*(f15*f4), I+25);
	    double f5 = (B01 + Dx*Kx);
	    update((C[1][0])*W[a]*(Ky*Kz*f5), I+26);
	    update((C[1][0])*W[a]*(f4*f5), I+27);
	    update((C[1][0])*W[a]*(f11*f5), I+28);
	    double f6 = (pow(Ky,2) + 3*B01);
	    update((C[1][0])*W[a]*(Dz*Ky*f6), I+29);
	    update((C[1][0])*W[a]*(Dx*Ky*f6), I+30);
	    update((C[0][0])*W[a]*(Ky*f6), I+31);
	    double f7 = (Dy*pow(Ky,2) + B01*(3*Dy + 2*Ykl));
	    update((C[1][0])*W[a]*(Kx*f7), I+32);
	    update((C[1][0])*W[a]*(Kz*f7), I+33);
	    double f9 = (pow(Kx,2) + B01);
	    update((C[1][0])*W[a]*(f12*f9), I+34);
	    update((C[1][0])*W[a]*(f15*f9), I+35);
	    update((C[0][0])*W[a]*(Kz*f9), I+36);
	    update((C[1][0])*W[a]*(Dy*Kz*f9), I+37);
	    update((C[0][0])*W[a]*(Ky*f9), I+38);
	    update((C[1][0])*W[a]*(Dz*Ky*f9), I+39);
#undef pow

	}

    }

    /// Permutes I in place from emission order to final order: the value
    /// at generated position i ends up at position index(i).
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[40]) {
	// scratch copy, so no value is overwritten before it has been read
	double T[40];
	for (int i = 0; i < 40; ++i) {
	    T[i] = I[i];
	}
	I[36] = T[0];
	I[1] = T[1];
	I[11] = T[2];
	I[6] = T[3];
	I[10] = T[4];
	I[9] = T[5];
	I[8] = T[6];
	I[25] = T[7];
	I[24] = T[8];
	I[20] = T[9];
	I[23] = T[10];
	I[39] = T[11];
	I[27] = T[12];
	I[3] = T[13];
	I[2] = T[14];
	I[0] = T[15];
	I[31] = T[16];
	I[35] = T[17];
	I[38] = T[18];
	I[17] = T[19];
	I[13] = T[20];
	I[30] = T[21];
	I[28] = T[22];
	I[33] = T[23];
	I[32] = T[24];
	I[34] = T[25];
	I[37] = T[26];
	I[29] = T[27];
	I[21] = T[28];
	I[7] = T[29];
	I[5] = T[30];
	I[4] = T[31];
	I[22] = T[32];
	I[26] = T[33];
	I[19] = T[34];
	I[14] = T[35];
	I[16] = T[36];
	I[18] = T[37];
	I[12] = T[38];
	I[15] = T[39];
    }

// Disabled alternative kept by the generator: the inverse lookup
// (final position -> generated position), with a bounds check.
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[40] = { 15, 1, 14, 13, 31, 30, 3, 29, 6, 5, 4, 2, 38, 20, 35, 39, 36, 19, 37, 34, 9, 28, 32, 10, 8, 7, 33, 12, 22, 27, 21, 16, 24, 23, 25, 17, 0, 26, 18, 11 };
// 	if (index < 40) return index_[index];
// 	return size_t(-1);
//     }

    /// Final position of the i-th generated value.
    /// No bounds check is performed: i must lie in [0, 40).
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    36, 1, 11, 6, 10, 9, 8, 25, 24, 20, 23, 39, 27, 3, 2, 0, 31, 35, 38, 17, 13, 30, 28, 33, 32, 34, 37, 29, 21, 7, 5, 4, 22, 26, 19, 14, 16, 18, 12, 15
	};
	return index[i];
    }

    /// Writes the same 40-entry permutation into idx[0..39]; idx must have
    /// room for 40 elements.
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 36;
	*idx++ = 1;
	*idx++ = 11;
	*idx++ = 6;
	*idx++ = 10;
	*idx++ = 9;
	*idx++ = 8;
	*idx++ = 25;
	*idx++ = 24;
	*idx++ = 20;
	*idx++ = 23;
	*idx++ = 39;
	*idx++ = 27;
	*idx++ = 3;
	*idx++ = 2;
	*idx++ = 0;
	*idx++ = 31;
	*idx++ = 35;
	*idx++ = 38;
	*idx++ = 17;
	*idx++ = 13;
	*idx++ = 30;
	*idx++ = 28;
	*idx++ = 33;
	*idx++ = 32;
	*idx++ = 34;
	*idx++ = 37;
	*idx++ = 29;
	*idx++ = 21;
	*idx++ = 7;
	*idx++ = 5;
	*idx++ = 4;
	*idx++ = 22;
	*idx++ = 26;
	*idx++ = 19;
	*idx++ = 14;
	*idx++ = 16;
	*idx++ = 18;
	*idx++ = 12;
	*idx++ = 15;
    }


};

/**
   Generated quadrature kernel for the braket <F F | S S>.

   eval() accumulates 100 values per loop point into I, in the order the
   generator emitted them.  reorder() and both index() overloads describe
   the permutation from that emission order to the final layout.
   NOTE(review): the permutation looks like (first F component) +
   10*(second F component) -- confirm against the generator.
   @warning generated code
*/
template<>
struct impl<meta::braket<rysq::F, rysq::F, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    /// Point count used by the fixed-size eval() overload.
    static const int N = 4;

    /// Default accumulator: adds the contribution q to *Q.
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /// Fixed-size entry point: evaluates 4 points and accumulates the
    /// 100 results into I with the default update functor.
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<4> &t2, const vector<4> &W,
    // 		     const double (&C)[1][1],
	      double (&I)[100]) {
	eval<4>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /// Evaluates M points and accumulates into I[0..99] with the default
    /// update functor.
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /// Core evaluation.  For every a in [0, M) the 100 generated values are
    /// handed to update(value, I + slot), slot running from 0 to 99.
    /// rBk and rkl are accepted but not read by this specialization.
    /// @tparam M number of points read from t2 and W
    /// @tparam U callable taking (double, double*)
    /// @param C  every output is scaled by C[0][0]
    /// @param I  output; must provide at least 100 elements
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	// components of the rij shift vector, bound once outside the loop
	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];



#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // scalar coefficient 0.5*(1/A)*(1 - B*t2[a])
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    // per-axis coefficients rAi[q] - B*rAB[q]*t2[a]
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);


// Shorthand for the generated expressions: pow(x,n) expands to the
// compile-time recurrence::pow<n>(x).  Removed again after the loop body.
#define pow(x,y) recurrence::pow<y>((x))

	    // C shifted by the matching rij component
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
	    // B10 plus the squared coefficient, per axis
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));

	    // f* below are subexpressions shared by several outputs; each is
	    // declared immediately before its first use.
	    update((C[0][0])*W[a]*(Py*(Cz*Zij + Pz)*(Xij*(Xij + 2*Cx) + Px)), I+0);
	    double f0 = (3*B10 + pow(Cx,2));
	    update((C[0][0])*W[a]*(Cz*Ix*Py*(Xij*(Xij + 2*Cx) + f0)), I+1);
	    update((C[0][0])*W[a]*(Cy*Ix*Pz*(Xij*(Xij + 2*Cx) + f0)), I+2);
	    double f10 = (3*pow(B10,2) + B10*(6*Cx*Xij + pow(Xij,2) + 6*pow(Cx,2)) + pow(Cx,2)*pow(Ix,2));
	    update((C[0][0])*W[a]*(f10*(Cz*Zij + Pz)), I+3);
	    update((C[0][0])*W[a]*(Cz*Iy*f10), I+4);
	    update((C[0][0])*W[a]*(Cy*Iz*f10), I+5);
	    double f12 = (3*B10 + pow(Iy,2));
	    update((C[0][0])*W[a]*(Cz*Iy*Px*f12), I+6);
	    update((C[0][0])*W[a]*(Cx*Iy*Pz*f12), I+7);
	    update((C[0][0])*W[a]*(Cx*Iy*f0*f12), I+8);
	    double f13 = (B10*(3*Cz + 2*Zij) + Cz*pow(Iz,2));
	    update((C[0][0])*W[a]*(Cy*f13*(Px + Cx*Xij)), I+9);
	    update((C[0][0])*W[a]*(Ix*Py*f13), I+10);
	    update((C[0][0])*W[a]*(Iy*Px*f13), I+11);
	    double f14 = (3*pow(B10,2) + B10*(6*Cy*Yij + pow(Yij,2) + 6*pow(Cy,2)) + pow(Cy,2)*pow(Iy,2));
	    update((C[0][0])*W[a]*(f14*(Cz*Zij + Pz)), I+12);
	    update((C[0][0])*W[a]*(Cz*Ix*f14), I+13);
	    update((C[0][0])*W[a]*(f14*(Px + Cx*Xij)), I+14);
	    update((C[0][0])*W[a]*(Cx*Iz*f14), I+15);
	    double f15 = (Cy*Iy + B10);
	    update((C[0][0])*W[a]*(f15*(Px + Cx*Xij)*(Cz*Zij + Pz)), I+16);
	    update((C[0][0])*W[a]*(Pz*f15*(Xij*(Xij + 2*Cx) + Px)), I+17);
	    update((C[0][0])*W[a]*(Cx*f13*f15), I+18);
	    update((C[0][0])*W[a]*(f10*f15), I+19);
	    double f18 = (B10*(3*Cx + 2*Xij) + Cx*pow(Ix,2));
	    update((C[0][0])*W[a]*(Cy*f18*(Cz*Zij + Pz)), I+20);
	    update((C[0][0])*W[a]*(Cz*f15*f18), I+21);
	    update((C[0][0])*W[a]*(Iy*Pz*f18), I+22);
	    update((C[0][0])*W[a]*(Iz*Py*f18), I+23);
	    double f19 = (Cy*pow(Iy,2) + B10*(3*Cy + 2*Yij));
	    update((C[0][0])*W[a]*(Cx*f19*(Cz*Zij + Pz)), I+24);
	    update((C[0][0])*W[a]*(Cz*f19*(Px + Cx*Xij)), I+25);
	    update((C[0][0])*W[a]*(Ix*Pz*f19), I+26);
	    update((C[0][0])*W[a]*(Iz*Px*f19), I+27);
	    double f2 = (3*pow(B10,2)*(5*Cz + 2*Zij) + B10*Cz*(3*pow(Zij,2) + 10*pow(Cz,2) + 12*Cz*Zij) + pow(Cz,3)*pow(Iz,2));
	    update((C[0][0])*W[a]*(Ix*f2), I+28);
	    update((C[0][0])*W[a]*(Iy*f2), I+29);
	    double f20 = (B10 + pow(Iy,2));
	    update((C[0][0])*W[a]*(Px*f20*(Cz*Zij + Pz)), I+30);
	    update((C[0][0])*W[a]*(Pz*f20*(Px + Cx*Xij)), I+31);
	    update((C[0][0])*W[a]*(Cx*Iz*f0*f20), I+32);
	    double f21 = (pow(Cy,2)*pow(Iy,3) + B10*Iy*(8*Cy*Yij + 10*pow(Cy,2) + pow(Yij,2)) + 3*pow(B10,2)*(5*Cy + 3*Yij));
	    update((C[0][0])*W[a]*(Cz*f21), I+33);
	    update((C[0][0])*W[a]*(Cx*f21), I+34);
	    double f22 = (3*pow(B10,2)*(3*Zij + 5*Cz) + B10*Iz*(8*Cz*Zij + 10*pow(Cz,2) + pow(Zij,2)) + pow(Cz,2)*pow(Iz,3));
	    update((C[0][0])*W[a]*(Cx*f22), I+35);
	    update((C[0][0])*W[a]*(Cy*f22), I+36);
	    double f23 = (3*pow(B10,2) + Cy*pow(Iy,3) + 3*B10*Iy*(Yij + 2*Cy));
	    update((C[0][0])*W[a]*(Cx*Cz*f23), I+37);
	    update((C[0][0])*W[a]*(Pz*f23), I+38);
	    update((C[0][0])*W[a]*(Px*f23), I+39);
	    double f25 = (pow(Cx,2)*pow(Ix,3) + B10*Ix*(8*Cx*Xij + pow(Xij,2) + 10*pow(Cx,2)) + 3*pow(B10,2)*(5*Cx + 3*Xij));
	    update((C[0][0])*W[a]*(Cy*f25), I+40);
	    update((C[0][0])*W[a]*(Cz*f25), I+41);
	    double f27 = (3*pow(B10,2) + 3*B10*Cz*(2*Cz + Zij) + Iz*pow(Cz,3));
	    update((C[0][0])*W[a]*(Ix*Iy*f27), I+42);
	    update((C[0][0])*W[a]*(f27*(Xij*(Xij + 2*Cx) + Px)), I+43);
	    update((C[0][0])*W[a]*(f20*f27), I+44);
	    double f28 = (B10 + pow(Iz,2));
	    update((C[0][0])*W[a]*(Py*f28*(Px + Cx*Xij)), I+45);
	    update((C[0][0])*W[a]*(Cx*Iy*f0*f28), I+46);
	    update((C[0][0])*W[a]*(Px*f15*f28), I+47);
	    double f29 = (3*B10 + pow(Iz,2));
	    update((C[0][0])*W[a]*(Cx*Iz*f0*f29), I+48);
	    update((C[0][0])*W[a]*(Cx*Iz*Py*f29), I+49);
	    update((C[0][0])*W[a]*(Cy*Iz*Px*f29), I+50);
	    double f3 = (B10*Cx*(12*Cx*Xij + 10*pow(Cx,2) + 3*pow(Xij,2)) + 3*pow(B10,2)*(5*Cx + 2*Xij) + pow(Cx,3)*pow(Ix,2));
	    update((C[0][0])*W[a]*(Iz*f3), I+51);
	    update((C[0][0])*W[a]*(Iy*f3), I+52);
	    double f30 = (3*pow(B10,2) + 3*B10*Ix*(Xij + 2*Cx) + Cx*pow(Ix,3));
	    update((C[0][0])*W[a]*(Cy*Cz*f30), I+53);
	    update((C[0][0])*W[a]*(Py*f30), I+54);
	    update((C[0][0])*W[a]*(Pz*f30), I+55);
	    double f31 = (3*pow(B10,2) + Cz*pow(Iz,3) + 3*B10*Iz*(2*Cz + Zij));
	    update((C[0][0])*W[a]*(Cx*Cy*f31), I+56);
	    update((C[0][0])*W[a]*(Px*f31), I+57);
	    update((C[0][0])*W[a]*(Py*f31), I+58);
	    double f32 = (3*B10 + pow(Cz,2));
	    update((C[0][0])*W[a]*(Cz*Ix*f32*(Xij*(Xij + 2*Cx) + f0)), I+59);
	    update((C[0][0])*W[a]*(Cz*Iy*f32*(Xij*(Xij + 2*Cx) + Px)), I+60);
	    update((C[0][0])*W[a]*(Cz*Ix*f20*f32), I+61);
	    update((C[0][0])*W[a]*(Cz*Iy*f12*f32), I+62);
	    double f33 = (3*B10 + pow(Cy,2));
	    update((C[0][0])*W[a]*(Cy*Ix*f33*(Xij*(Xij + 2*Cx) + f0)), I+63);
	    update((C[0][0])*W[a]*(Cy*Iz*f33*(Xij*(Xij + 2*Cx) + Px)), I+64);
	    update((C[0][0])*W[a]*(Cy*Ix*f28*f33), I+65);
	    update((C[0][0])*W[a]*(Cy*Iz*f29*f33), I+66);
	    double f34 = (B10*(3*Cx + Xij) + Ix*pow(Cx,2));
	    update((C[0][0])*W[a]*(Iy*f34*(Cz*Zij + Pz)), I+67);
	    update((C[0][0])*W[a]*(Cy*f28*f34), I+68);
	    update((C[0][0])*W[a]*(Cz*f20*f34), I+69);
	    update((C[0][0])*W[a]*(Iz*f15*f34), I+70);
	    update((C[0][0])*W[a]*(f13*f34), I+71);
	    update((C[0][0])*W[a]*(f19*f34), I+72);
	    double f35 = (Iz*pow(Cz,2) + B10*(3*Cz + Zij));
	    update((C[0][0])*W[a]*(Cy*f35*(Xij*(Xij + 2*Cx) + Px)), I+73);
	    update((C[0][0])*W[a]*(Iy*f35*(Px + Cx*Xij)), I+74);
	    update((C[0][0])*W[a]*(Cx*f20*f35), I+75);
	    update((C[0][0])*W[a]*(Ix*f15*f35), I+76);
	    update((C[0][0])*W[a]*(f19*f35), I+77);
	    update((C[0][0])*W[a]*(f18*f35), I+78);
	    double f36 = 15*pow(B10,3);
	    update((C[0][0])*W[a]*((3*B10*Cx*Ix*(5*Cx*Xij + 5*pow(Cx,2) + pow(Xij,2)) + f36 + 9*pow(B10,2)*(5*Cx*Xij + 5*pow(Cx,2) + pow(Xij,2)) + pow(Cx,3)*pow(Ix,3))), I+79);
	    update((C[0][0])*W[a]*((9*pow(B10,2)*(5*Cy*Yij + 5*pow(Cy,2) + pow(Yij,2)) + f36 + pow(Cy,3)*pow(Iy,3) + 3*B10*Cy*Iy*(5*Cy*Yij + 5*pow(Cy,2) + pow(Yij,2)))), I+80);
	    update((C[0][0])*W[a]*((f36 + 3*B10*Cz*Iz*(5*pow(Cz,2) + 5*Cz*Zij + pow(Zij,2)) + pow(Cz,3)*pow(Iz,3) + 9*pow(B10,2)*(5*pow(Cz,2) + 5*Cz*Zij + pow(Zij,2)))), I+81);
	    double f39 = (3*pow(B10,2) + Iy*pow(Cy,3) + 3*B10*Cy*(Yij + 2*Cy));
	    update((C[0][0])*W[a]*(Ix*Iz*f39), I+82);
	    update((C[0][0])*W[a]*(f39*(Xij*(Xij + 2*Cx) + Px)), I+83);
	    update((C[0][0])*W[a]*(f28*f39), I+84);
	    double f6 = (3*pow(B10,2) + pow(Cz,2)*pow(Iz,2) + B10*(6*Cz*Zij + pow(Zij,2) + 6*pow(Cz,2)));
	    update((C[0][0])*W[a]*(Cy*Ix*f6), I+85);
	    update((C[0][0])*W[a]*(Cx*Iy*f6), I+86);
	    update((C[0][0])*W[a]*(f6*(Px + Cx*Xij)), I+87);
	    update((C[0][0])*W[a]*(f15*f6), I+88);
	    double f7 = (B10*(3*Cy + Yij) + Iy*pow(Cy,2));
	    update((C[0][0])*W[a]*(Ix*f7*(Cz*Zij + Pz)), I+89);
	    update((C[0][0])*W[a]*(Iz*f7*(Px + Cx*Xij)), I+90);
	    update((C[0][0])*W[a]*(Cz*f7*(Xij*(Xij + 2*Cx) + Px)), I+91);
	    update((C[0][0])*W[a]*(Cx*f28*f7), I+92);
	    update((C[0][0])*W[a]*(f18*f7), I+93);
	    update((C[0][0])*W[a]*(f13*f7), I+94);
	    double f8 = (3*pow(B10,2) + 3*B10*Cx*(Xij + 2*Cx) + Ix*pow(Cx,3));
	    update((C[0][0])*W[a]*(Iy*Iz*f8), I+95);
	    update((C[0][0])*W[a]*(f20*f8), I+96);
	    update((C[0][0])*W[a]*(f28*f8), I+97);
	    double f9 = (3*pow(B10,2)*(5*Cy + 2*Yij) + B10*Cy*(12*Cy*Yij + 3*pow(Yij,2) + 10*pow(Cy,2)) + pow(Cy,3)*pow(Iy,2));
	    update((C[0][0])*W[a]*(Ix*f9), I+98);
	    update((C[0][0])*W[a]*(Iz*f9), I+99);
#undef pow

	}

    }

    /// Permutes I in place from emission order to final order: the value
    /// at generated position i ends up at position index(i).
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[100]) {
	// scratch copy, so no value is overwritten before it has been read
	double T[100];
	for (int i = 0; i < 100; ++i) {
	    T[i] = I[i];
	}
	I[46] = T[0];
	I[6] = T[1];
	I[8] = T[2];
	I[44] = T[3];
	I[34] = T[4];
	I[43] = T[5];
	I[14] = T[6];
	I[17] = T[7];
	I[10] = T[8];
	I[79] = T[9];
	I[76] = T[10];
	I[84] = T[11];
	I[66] = T[12];
	I[56] = T[13];
	I[55] = T[14];
	I[65] = T[15];
	I[99] = T[16];
	I[38] = T[17];
	I[89] = T[18];
	I[33] = T[19];
	I[49] = T[20];
	I[39] = T[21];
	I[37] = T[22];
	I[45] = T[23];
	I[69] = T[24];
	I[59] = T[25];
	I[58] = T[26];
	I[63] = T[27];
	I[72] = T[28];
	I[82] = T[29];
	I[64] = T[30];
	I[57] = T[31];
	I[60] = T[32];
	I[16] = T[33];
	I[15] = T[34];
	I[27] = T[35];
	I[28] = T[36];
	I[19] = T[37];
	I[18] = T[38];
	I[13] = T[39];
	I[3] = T[40];
	I[4] = T[41];
	I[92] = T[42];
	I[42] = T[43];
	I[62] = T[44];
	I[75] = T[45];
	I[80] = T[46];
	I[83] = T[47];
	I[20] = T[48];
	I[25] = T[49];
	I[23] = T[50];
	I[40] = T[51];
	I[30] = T[52];
	I[9] = T[53];
	I[5] = T[54];
	I[7] = T[55];
	I[29] = T[56];
	I[24] = T[57];
	I[26] = T[58];
	I[2] = T[59];
	I[32] = T[60];
	I[52] = T[61];
	I[12] = T[62];
	I[1] = T[63];
	I[41] = T[64];
	I[71] = T[65];
	I[21] = T[66];
	I[94] = T[67];
	I[73] = T[68];
	I[54] = T[69];
	I[93] = T[70];
	I[74] = T[71];
	I[53] = T[72];
	I[48] = T[73];
	I[97] = T[74];
	I[67] = T[75];
	I[98] = T[76];
	I[68] = T[77];
	I[47] = T[78];
	I[0] = T[79];
	I[11] = T[80];
	I[22] = T[81];
	I[91] = T[82];
	I[31] = T[83];
	I[81] = T[84];
	I[78] = T[85];
	I[87] = T[86];
	I[77] = T[87];
	I[88] = T[88];
	I[96] = T[89];
	I[95] = T[90];
	I[36] = T[91];
	I[85] = T[92];
	I[35] = T[93];
	I[86] = T[94];
	I[90] = T[95];
	I[50] = T[96];
	I[70] = T[97];
	I[51] = T[98];
	I[61] = T[99];
    }

// Disabled alternative kept by the generator: the inverse lookup
// (final position -> generated position), with a bounds check.
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[100] = { 79, 63, 59, 40, 41, 54, 1, 55, 2, 53, 8, 80, 62, 39, 6, 34, 33, 7, 38, 37, 48, 66, 81, 50, 57, 49, 58, 35, 36, 56, 52, 83, 60, 19, 4, 93, 91, 22, 17, 21, 51, 64, 43, 5, 3, 23, 0, 78, 73, 20, 96, 98, 61, 72, 69, 14, 13, 31, 26, 25, 32, 99, 44, 27, 30, 15, 12, 75, 77, 24, 97, 65, 28, 68, 71, 45, 10, 87, 85, 9, 46, 84, 29, 47, 11, 92, 94, 86, 88, 18, 95, 82, 42, 70, 67, 90, 89, 74, 76, 16 };
// 	if (index < 100) return index_[index];
// 	return size_t(-1);
//     }

    /// Final position of the i-th generated value.
    /// No bounds check is performed: i must lie in [0, 100).
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    46, 6, 8, 44, 34, 43, 14, 17, 10, 79, 76, 84, 66, 56, 55, 65, 99, 38, 89, 33, 49, 39, 37, 45, 69, 59, 58, 63, 72, 82, 64, 57, 60, 16, 15, 27, 28, 19, 18, 13, 3, 4, 92, 42, 62, 75, 80, 83, 20, 25, 23, 40, 30, 9, 5, 7, 29, 24, 26, 2, 32, 52, 12, 1, 41, 71, 21, 94, 73, 54, 93, 74, 53, 48, 97, 67, 98, 68, 47, 0, 11, 22, 91, 31, 81, 78, 87, 77, 88, 96, 95, 36, 85, 35, 86, 90, 50, 70, 51, 61
	};
	return index[i];
    }

    /// Writes the same 100-entry permutation into idx[0..99]; idx must have
    /// room for 100 elements.
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 46;
	*idx++ = 6;
	*idx++ = 8;
	*idx++ = 44;
	*idx++ = 34;
	*idx++ = 43;
	*idx++ = 14;
	*idx++ = 17;
	*idx++ = 10;
	*idx++ = 79;
	*idx++ = 76;
	*idx++ = 84;
	*idx++ = 66;
	*idx++ = 56;
	*idx++ = 55;
	*idx++ = 65;
	*idx++ = 99;
	*idx++ = 38;
	*idx++ = 89;
	*idx++ = 33;
	*idx++ = 49;
	*idx++ = 39;
	*idx++ = 37;
	*idx++ = 45;
	*idx++ = 69;
	*idx++ = 59;
	*idx++ = 58;
	*idx++ = 63;
	*idx++ = 72;
	*idx++ = 82;
	*idx++ = 64;
	*idx++ = 57;
	*idx++ = 60;
	*idx++ = 16;
	*idx++ = 15;
	*idx++ = 27;
	*idx++ = 28;
	*idx++ = 19;
	*idx++ = 18;
	*idx++ = 13;
	*idx++ = 3;
	*idx++ = 4;
	*idx++ = 92;
	*idx++ = 42;
	*idx++ = 62;
	*idx++ = 75;
	*idx++ = 80;
	*idx++ = 83;
	*idx++ = 20;
	*idx++ = 25;
	*idx++ = 23;
	*idx++ = 40;
	*idx++ = 30;
	*idx++ = 9;
	*idx++ = 5;
	*idx++ = 7;
	*idx++ = 29;
	*idx++ = 24;
	*idx++ = 26;
	*idx++ = 2;
	*idx++ = 32;
	*idx++ = 52;
	*idx++ = 12;
	*idx++ = 1;
	*idx++ = 41;
	*idx++ = 71;
	*idx++ = 21;
	*idx++ = 94;
	*idx++ = 73;
	*idx++ = 54;
	*idx++ = 93;
	*idx++ = 74;
	*idx++ = 53;
	*idx++ = 48;
	*idx++ = 97;
	*idx++ = 67;
	*idx++ = 98;
	*idx++ = 68;
	*idx++ = 47;
	*idx++ = 0;
	*idx++ = 11;
	*idx++ = 22;
	*idx++ = 91;
	*idx++ = 31;
	*idx++ = 81;
	*idx++ = 78;
	*idx++ = 87;
	*idx++ = 77;
	*idx++ = 88;
	*idx++ = 96;
	*idx++ = 95;
	*idx++ = 36;
	*idx++ = 85;
	*idx++ = 35;
	*idx++ = 86;
	*idx++ = 90;
	*idx++ = 50;
	*idx++ = 70;
	*idx++ = 51;
	*idx++ = 61;
    }


};

/**
   Generated quadrature kernel for the braket <SP D | S S>.

   eval() accumulates 24 values per loop point into I, in the order the
   generator emitted them.  reorder() and both index() overloads describe
   the permutation from that emission order to the final layout.
   @warning generated code
*/
template<>
struct impl<meta::braket<rysq::SP, rysq::D, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    /** Point count used by the fixed-size eval() overload. */
    static const int N = 2;

    /** Default accumulator: adds the contribution q to *Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
       Fixed-size entry point: evaluates N points and accumulates the
       24 results into I with the default update functor.
    */
    template<
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double (&I)[24]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
       Evaluates M points and accumulates into I[0..23] with the default
       update functor.
    */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
       Core evaluation.  For every a in [0, M) the 24 generated values are
       handed to update(value, I + slot), slot running from 0 to 23.
       rBk and rkl are accepted but not read by this specialization.
       @tparam M number of points read from t2 and W
       @tparam U callable taking (double, double*)
       @param C  each output is scaled by either C[0][0] or C[0][1]
       @param I  output; must provide at least 24 elements
    */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	// components of the rij shift vector, bound once outside the loop
	const double &ij_x = rij[0];
	const double &ij_y = rij[1];
	const double &ij_z = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // scalar coefficient 0.5*(1/A)*(1 - B*t2[a])
	    const double b10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    // per-axis coefficients rAi[q] - B*rAB[q]*t2[a]
	    const double cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    const double cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    const double cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // the same coefficients shifted by rij
	    const double ix = cx + ij_x;
	    const double iy = cy + ij_y;
	    const double iz = cz + ij_z;

	    // squares through the compile-time power helper
	    const double ix2 = recurrence::pow<2>(ix);
	    const double iy2 = recurrence::pow<2>(iy);
	    const double iz2 = recurrence::pow<2>(iz);

	    // terms that appear in several outputs
	    const double sxx = b10 + ix2;
	    const double syy = b10 + iy2;
	    const double szz = b10 + iz2;
	    const double txx = cx*ix + b10;
	    const double tyy = cy*iy + b10;
	    const double tzz = b10 + cz*iz;

	    // weight of this point combined with each contraction coefficient
	    const double w0 = C[0][0]*W[a];
	    const double w1 = C[0][1]*W[a];

	    update(w1*(cz*sxx), I+0);
	    update(w0*sxx, I+1);
	    update(w1*(cy*sxx), I+2);
	    update(w1*(iy*txx), I+3);
	    update(w1*(cz*syy), I+4);
	    update(w1*(cx*syy), I+5);
	    update(w0*syy, I+6);
	    update(w1*(ix*tyy), I+7);
	    update(w1*(cz*ix*iy), I+8);
	    update(w0*(ix*iy), I+9);
	    update(w1*(ix*tzz), I+10);
	    update(w1*(iz*txx), I+11);
	    update(w1*(iz*tyy), I+12);
	    update(w1*(cy*szz), I+13);
	    update(w1*(cx*szz), I+14);
	    update(w1*(iy*tzz), I+15);
	    update(w0*szz, I+16);
	    update(w1*(cy*ix*iz), I+17);
	    update(w1*(cx*iy*iz), I+18);
	    update(w0*(ix*iz), I+19);
	    update(w0*(iy*iz), I+20);
	    update(w1*(b10*(3*cx + 2*ij_x) + cx*ix2), I+21);
	    update(w1*(cy*iy2 + b10*(3*cy + 2*ij_y)), I+22);
	    update(w1*(b10*(3*cz + 2*ij_z) + cz*iz2), I+23);

	}

    }

    /**
       Permutes I in place from emission order to final order: the value
       at generated position i ends up at position index(i).  Written
       here as a gather, one assignment per final position.
    */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[24]) {
	// scratch copy, so no value is overwritten before it has been read
	double generated[24];
	for (int i = 0; i < 24; ++i) {
	    generated[i] = I[i];
	}
	I[0] = generated[1];
	I[1] = generated[21];
	I[2] = generated[2];
	I[3] = generated[0];
	I[4] = generated[6];
	I[5] = generated[5];
	I[6] = generated[22];
	I[7] = generated[4];
	I[8] = generated[16];
	I[9] = generated[14];
	I[10] = generated[13];
	I[11] = generated[23];
	I[12] = generated[9];
	I[13] = generated[3];
	I[14] = generated[7];
	I[15] = generated[8];
	I[16] = generated[19];
	I[17] = generated[11];
	I[18] = generated[17];
	I[19] = generated[10];
	I[20] = generated[20];
	I[21] = generated[18];
	I[22] = generated[12];
	I[23] = generated[15];
    }

    /**
       Final position of the i-th generated value.
       No bounds check is performed: i must lie in [0, 24).
    */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	    3, 0, 2, 13, 7, 5, 4, 14, 15, 12, 19, 17,
	    22, 10, 9, 23, 8, 18, 21, 16, 20, 1, 6, 11
	};
	return permutation[i];
    }

    /**
       Writes the 24-entry permutation into idx[0..23]; idx must have
       room for 24 elements.
    */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	idx[0] = 3;
	idx[1] = 0;
	idx[2] = 2;
	idx[3] = 13;
	idx[4] = 7;
	idx[5] = 5;
	idx[6] = 4;
	idx[7] = 14;
	idx[8] = 15;
	idx[9] = 12;
	idx[10] = 19;
	idx[11] = 17;
	idx[12] = 22;
	idx[13] = 10;
	idx[14] = 9;
	idx[15] = 23;
	idx[16] = 8;
	idx[17] = 18;
	idx[18] = 21;
	idx[19] = 16;
	idx[20] = 20;
	idx[21] = 1;
	idx[22] = 6;
	idx[23] = 11;
    }


};

/**
 * Quadrature kernel specialization for meta::braket<D, SP, P, S>.
 *
 * eval() accumulates 72 weighted terms per point into the output array.
 * reorder() and index() expose one fixed permutation of those 72 slots
 * (presumably from the order in which eval() emits the terms to the
 * layout callers expect -- confirm against the callers).
 * @warning generated code
 */
template<>
struct impl<meta::braket<rysq::D, rysq::SP, rysq::P, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 3;

    /** Default accumulator: adds the term @a q to the slot @a Q points to. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /**
     * Fixed-size entry point: evaluates N = 3 points into the 72 slots
     * of @a I, accumulating with the default update functor.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double (&I)[72]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Evaluates @a M points into the 72 slots starting at @a I,
     * accumulating with the default update functor.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel.  For every a in [0, M) it derives the recurrence
     * coefficients from t2[a] and hands 72 terms, each scaled by
     * C[0][0]*W[a] or C[0][1]*W[a], to @a update together with the
     * address of the slot I+k the term belongs to.
     *
     * @param A, B     scalars fed to recurrence::coefficient
     * @param rAi, rBk, rAB, rij  indexable with 0..2 (rkl is not read here)
     * @param t2, W    indexable with 0..M-1
     * @param C        the two coefficients applied to the terms
     * @param I        start of the 72 output slots
     * @param update   functor called as update(term, slot)
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double B00 = 0.5*t2[a];
	    const double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // the two prefactors every term of this point is scaled by
	    const double w0 = (C[0][0])*W[a];
	    const double w1 = (C[0][1])*W[a];

	    const double Ix = (Cx + Xij);
	    const double Iy = (Cy + Yij);
	    const double Iz = (Cz + Zij);
	    const double Px = (B10 + recurrence::pow<2>(Cx));
	    const double Py = (B10 + recurrence::pow<2>(Cy));
	    const double Pz = (B10 + recurrence::pow<2>(Cz));
	    const double Qx = (Cx*Dx + B00);
	    const double Qy = (Cy*Dy + B00);
	    const double Qz = (Cz*Dz + B00);

	    update(w0*(Cy*Cz*Dx), I+0);
	    update(w0*(Cx*Cz*Dy), I+1);
	    update(w0*(Cx*Cy*Dz), I+2);
	    update(w1*(Dy*Iz*Px), I+3);
	    update(w0*(Dy*Px), I+4);
	    update(w1*(Dz*Iy*Px), I+5);
	    update(w0*(Dz*Px), I+6);
	    update(w1*(Dz*Ix*Py), I+7);
	    update(w1*(Dx*Iz*Py), I+8);
	    update(w0*(Dx*Py), I+9);
	    update(w0*(Dz*Py), I+10);
	    update(w1*(Dx*Iy*Pz), I+11);
	    update(w1*(Dy*Ix*Pz), I+12);
	    update(w0*(Dy*Pz), I+13);
	    update(w0*(Dx*Pz), I+14);
	    update(w1*(Cz*Iy*Qx), I+15);
	    update(w0*(Cz*Qx), I+16);
	    update(w1*(Cy*Iz*Qx), I+17);
	    update(w0*(Cy*Qx), I+18);
	    update(w1*(Cz*Ix*Qy), I+19);
	    update(w0*(Cz*Qy), I+20);
	    update(w1*(Cx*Iz*Qy), I+21);
	    update(w0*(Cx*Qy), I+22);
	    update(w1*(Cx*Iy*Qz), I+23);
	    update(w0*(Cx*Qz), I+24);
	    update(w1*(Cy*Ix*Qz), I+25);
	    update(w0*(Cy*Qz), I+26);
	    update(w1*(Cy*Dz*(Px + Cx*Xij)), I+27);
	    update(w1*(Qy*(Px + Cx*Xij)), I+28);
	    update(w1*(Qz*(Px + Cx*Xij)), I+29);
	    update(w1*(Cz*Dy*(Px + Cx*Xij)), I+30);
	    update(w1*(Cx*Cy*(Dz*Zij + Qz)), I+31);
	    update(w1*(Cx*Dy*(Cz*Zij + Pz)), I+32);
	    update(w1*(Qy*(Cz*Zij + Pz)), I+33);
	    update(w1*(Qx*(Cz*Zij + Pz)), I+34);
	    update(w1*(Cy*Dx*(Cz*Zij + Pz)), I+35);
	    update(w1*(Px*(Dz*Zij + Qz)), I+36);
	    update(w1*(Py*(Dz*Zij + Qz)), I+37);

	    // f* values are subexpressions shared by the terms that follow them
	    const double f1 = (B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10));
	    update(w1*(Cx*f1), I+38);
	    update(w1*(Cz*f1), I+39);

	    const double f10 = (Dy*Iy + B00);
	    update(w1*(Cx*Cz*f10), I+40);
	    update(w1*(Px*f10), I+41);
	    update(w1*(Pz*f10), I+42);

	    const double f11 = (Cy*Iy + B10);
	    update(w1*(Cx*Dz*f11), I+43);
	    update(w1*(Cz*Dx*f11), I+44);
	    update(w1*(Qx*f11), I+45);
	    update(w1*(Qz*f11), I+46);

	    const double f12 = (B10*(3*Cx + Xij) + Ix*recurrence::pow<2>(Cx));
	    update(w1*(Dy*f12), I+47);
	    update(w1*(Dz*f12), I+48);

	    const double f14 = (Dx*Px + 2*B00*Cx);
	    update(w1*(Iz*f14), I+49);
	    update(w1*(Iy*f14), I+50);
	    update(w0*(f14), I+51);

	    const double f15 = (Dx*Ix + B00);
	    update(w1*(Cy*Cz*f15), I+52);
	    update(w1*(Py*f15), I+53);
	    update(w1*(Pz*f15), I+54);

	    const double f22 = (Iz*recurrence::pow<2>(Cz) + B10*(3*Cz + Zij));
	    update(w1*(Dx*f22), I+55);
	    update(w1*(Dy*f22), I+56);

	    const double f3 = 3*B00*B10;
	    update(w1*((Dx*Ix*recurrence::pow<2>(Cx) + B00*Cx*(3*Cx + 2*Xij) + B10*Dx*(3*Cx + Xij) + f3)), I+57);
	    update(w1*((Dy*Iy*recurrence::pow<2>(Cy) + f3 + B00*Cy*(3*Cy + 2*Yij) + B10*Dy*(3*Cy + Yij))), I+58);
	    update(w1*((B00*Cz*(3*Cz + 2*Zij) + Dz*Iz*recurrence::pow<2>(Cz) + B10*Dz*(3*Cz + Zij) + f3)), I+59);

	    const double f4 = (Dy*Py + 2*B00*Cy);
	    update(w1*(Ix*f4), I+60);
	    update(w1*(Iz*f4), I+61);
	    update(w0*(f4), I+62);

	    const double f5 = (B10*(3*Cy + Yij) + Iy*recurrence::pow<2>(Cy));
	    update(w1*(Dx*f5), I+63);
	    update(w1*(Dz*f5), I+64);

	    const double f6 = (Dz*Pz + 2*B00*Cz);
	    update(w1*(Cx*(f6 + Qz*Zij)), I+65);
	    update(w1*(Cy*(f6 + Qz*Zij)), I+66);
	    update(w1*(Iy*f6), I+67);
	    update(w1*(Ix*f6), I+68);
	    update(w0*(f6), I+69);

	    const double f8 = (Dx*(Cx*Ix + B10) + B00*(Xij + 2*Cx));
	    update(w1*(Cy*f8), I+70);
	    update(w1*(Cz*f8), I+71);

	}

    }

    /**
     * Permutes the 72 entries of @a I in place: the value found at
     * position i is moved to position index(i).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[72]) {
	// snapshot first, so the scatter never reads an overwritten slot
	double T[72];
	for (int i = 0; i < 72; ++i) {
	    T[i] = I[i];
	}
	for (int i = 0; i < 72; ++i) {
	    I[index(i)] = T[i];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[72] = { 51, 9, 14, 18, 16, 0, 57, 53, 54, 70, 71, 52, 50, 63, 11, 45, 15, 44, 49, 8, 55, 17, 34, 35, 4, 62, 13, 22, 1, 20, 47, 60, 12, 28, 30, 19, 41, 58, 42, 38, 40, 39, 3, 61, 56, 21, 32, 33, 6, 10, 69, 2, 24, 26, 48, 7, 68, 27, 29, 25, 5, 64, 67, 43, 23, 46, 36, 37, 59, 31, 65, 66 };
// 	if (index < 72) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns entry @a i of the 72-element index permutation.
     * @param i position to look up; must lie in [0, 72), it is not checked
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short table[72] = {
	    5, 28, 51, 42, 24, 60, 48, 55, 19, 1, 49, 14,
	    32, 26, 2, 16, 4, 21, 3, 35, 29, 45, 27, 64,
	    52, 59, 53, 57, 33, 58, 34, 69, 46, 47, 22, 23,
	    66, 67, 39, 41, 40, 36, 38, 63, 17, 15, 65, 30,
	    54, 18, 12, 0, 11, 7, 8, 20, 44, 6, 37, 68,
	    31, 43, 25, 13, 61, 70, 71, 62, 56, 50, 9, 10
	};
	return table[i];
    }

    /**
     * Writes the whole 72-element index permutation to @a idx, so that
     * idx[k] == index(k) for every k in [0, 72).
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 72; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Quadrature kernel specialization for meta::braket<SP, P, S, S>.
 *
 * eval() accumulates 12 weighted terms per point into the output array.
 * reorder() and index() expose one fixed permutation of those 12 slots
 * (presumably from the order in which eval() emits the terms to the
 * layout callers expect -- confirm against the callers).
 * @warning generated code
 */
template<>
struct impl<meta::braket<rysq::SP, rysq::P, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulator: adds the term @a q to the slot @a Q points to. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /**
     * Fixed-size entry point: evaluates N = 2 points into the 12 slots
     * of @a I, accumulating with the default update functor.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double (&I)[12]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Evaluates @a M points into the 12 slots starting at @a I,
     * accumulating with the default update functor.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel.  For every a in [0, M) it derives B10 and Cx/Cy/Cz
     * from t2[a] and hands 12 terms, each scaled by C[0][0]*W[a] or
     * C[0][1]*W[a], to @a update together with the address of the slot
     * I+k the term belongs to.
     *
     * @param A, B     scalars fed to recurrence::coefficient
     * @param rAi, rAB, rij  indexable with 0..2 (rBk and rkl are not read here)
     * @param t2, W    indexable with 0..M-1
     * @param C        the two coefficients applied to the terms
     * @param I        start of the 12 output slots
     * @param update   functor called as update(term, slot)
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // the two prefactors every term of this point is scaled by
	    const double w0 = (C[0][0])*W[a];
	    const double w1 = (C[0][1])*W[a];

	    const double Ix = (Cx + Xij);
	    const double Iy = (Cy + Yij);
	    const double Iz = (Cz + Zij);

	    // x block
	    update(w1*(Cx*Ix + B10), I+0);
	    update(w1*(Cz*Ix), I+1);
	    update(w1*(Cy*Ix), I+2);
	    update(w0*(Ix), I+3);

	    // y block
	    update(w1*(Cy*Iy + B10), I+4);
	    update(w1*(Cz*Iy), I+5);
	    update(w1*(Cx*Iy), I+6);
	    update(w0*(Iy), I+7);

	    // z block
	    update(w1*(B10 + Cz*Iz), I+8);
	    update(w1*(Cx*Iz), I+9);
	    update(w1*(Cy*Iz), I+10);
	    update(w0*(Iz), I+11);

	}

    }

    /**
     * Permutes the 12 entries of @a I in place: the value found at
     * position i is moved to position index(i).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[12]) {
	// snapshot first, so the scatter never reads an overwritten slot
	double T[12];
	for (int i = 0; i < 12; ++i) {
	    T[i] = I[i];
	}
	for (int i = 0; i < 12; ++i) {
	    I[index(i)] = T[i];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[12] = { 3, 0, 2, 1, 7, 6, 4, 5, 11, 9, 10, 8 };
// 	if (index < 12) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns entry @a i of the 12-element index permutation.
     * @param i position to look up; must lie in [0, 12), it is not checked
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short table[12] = {
	    1, 3, 2, 0, 6, 7, 5, 4, 11, 9, 10, 8
	};
	return table[i];
    }

    /**
     * Writes the whole 12-element index permutation to @a idx, so that
     * idx[k] == index(k) for every k in [0, 12).
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 12; ++k) {
	    idx[k] = index(k);
	}
    }


};

template<>
struct impl<meta::braket<rysq::SP, rysq::SP, rysq::SP, rysq::P> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /** Default accumulator: adds the term @a q to the slot @a Q points to. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /**
     * Fixed-size entry point: evaluates 3 points into the 192 slots of
     * @a I, accumulating with a default-constructed update functor.
     * All other arguments are forwarded unchanged to eval<3>().
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][4],
	      double (&I)[192]) {
	const update accumulate = update();
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Evaluates @a M points into the slots starting at @a I,
     * accumulating with a default-constructed update functor.
     * All other arguments are forwarded unchanged to the overload that
     * takes the functor explicitly.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][4],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel.  For every a in [0, M) it derives the recurrence
     * coefficients from t2[a] and passes 192 terms to @a update, each
     * one scaled by one of the C[i][j] coefficients and by W[a], together
     * with the address I+k of the slot the term belongs to.
     *
     * @param A, B     scalars fed to recurrence::coefficient
     * @param rAi, rBk, rAB, rij, rkl  indexable with 0..2
     * @param t2, W    indexable with 0..M-1 (presumably the quadrature
     *                 roots and weights -- confirm against the caller)
     * @param C        2x4 table of coefficients applied to the terms
     * @param I        start of the 192 output slots
     * @param update   functor called as update(term, slot)
     * @warning generated code: the order of the update() calls fixes
     *          which slot receives which term.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][4],
	      double *I, const U &update) {

	// components 0..2 of rij
	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	// components 0..2 of rkl
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


	// one pass per index a into t2 and W; M is a compile-time count
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // recurrence coefficients of this point, all functions of t2[a]
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // pow(x, n) in the expressions below expands to the compile-time
	    // recurrence::pow<n>(x); the macro is removed again at the end
	    // of the loop body
#define pow(x,y) recurrence::pow<y>((x))

	    // subexpressions shared by many of the terms below
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));
 	    double Qy = (Cy*Dy + B00);

	    // each update() hands one term, scaled by C[i][j]*W[a], to slot I+k;
	    // the f* locals are further shared subexpressions
	    update((C[1][1])*W[a]*(Cy*Dz*Kx), I+0);
	    update((C[1][0])*W[a]*(Dz*Kx), I+1);
	    update((C[1][2])*W[a]*(Dz*Iy*Kx), I+2);
	    update((C[0][2])*W[a]*(Iy*Kx), I+3);
	    update((C[0][3])*W[a]*(Cz*Iy*Kx), I+4);
	    update((C[0][1])*W[a]*(Cz*Kx), I+5);
	    update((C[1][1])*W[a]*(Cz*Dy*Kx), I+6);
	    update((C[1][0])*W[a]*(Dy*Kx), I+7);
	    update((C[1][2])*W[a]*(Dy*Iz*Kx), I+8);
	    update((C[0][2])*W[a]*(Iz*Kx), I+9);
	    update((C[0][3])*W[a]*(Cy*Iz*Kx), I+10);
	    update((C[0][1])*W[a]*(Cy*Kx), I+11);
	    update((C[0][0])*W[a]*(Kx), I+12);
	    update((C[1][2])*W[a]*(Dz*Ix*Ky), I+13);
	    update((C[0][2])*W[a]*(Ix*Ky), I+14);
	    update((C[0][3])*W[a]*(Cz*Ix*Ky), I+15);
	    update((C[0][1])*W[a]*(Cz*Ky), I+16);
	    update((C[1][1])*W[a]*(Cz*Dx*Ky), I+17);
	    update((C[1][0])*W[a]*(Dx*Ky), I+18);
	    update((C[1][2])*W[a]*(Dx*Iz*Ky), I+19);
	    update((C[0][2])*W[a]*(Iz*Ky), I+20);
	    update((C[0][3])*W[a]*(Cx*Iz*Ky), I+21);
	    update((C[0][1])*W[a]*(Cx*Ky), I+22);
	    update((C[1][1])*W[a]*(Cx*Dz*Ky), I+23);
	    update((C[1][0])*W[a]*(Dz*Ky), I+24);
	    update((C[0][0])*W[a]*(Ky), I+25);
	    update((C[1][2])*W[a]*(Ix*(Dz*Kz + B01)), I+26);
	    update((C[1][3])*W[a]*(Cy*Ix*(Dz*Kz + B01)), I+27);
	    update((C[1][1])*W[a]*(Cy*(Dz*Kz + B01)), I+28);
	    update((C[1][2])*W[a]*(Iy*(Dz*Kz + B01)), I+29);
	    update((C[1][3])*W[a]*(Cx*Iy*(Dz*Kz + B01)), I+30);
	    update((C[1][1])*W[a]*(Cx*(Dz*Kz + B01)), I+31);
	    update((C[1][0])*W[a]*((Dz*Kz + B01)), I+32);
	    update((C[1][1])*W[a]*(Cx*Dy*Kz), I+33);
	    update((C[1][2])*W[a]*(Dy*Ix*Kz), I+34);
	    update((C[0][3])*W[a]*(Cy*Ix*Kz), I+35);
	    update((C[1][1])*W[a]*(Cy*Dx*Kz), I+36);
	    update((C[1][2])*W[a]*(Dx*Iy*Kz), I+37);
	    update((C[0][3])*W[a]*(Cx*Iy*Kz), I+38);
	    update((C[0][1])*W[a]*(Cx*Kz), I+39);
	    update((C[0][1])*W[a]*(Cy*Kz), I+40);
	    update((C[0][2])*W[a]*(Ix*Kz), I+41);
	    update((C[0][2])*W[a]*(Iy*Kz), I+42);
	    update((C[1][0])*W[a]*(Dx*Kz), I+43);
	    update((C[1][0])*W[a]*(Dy*Kz), I+44);
	    update((C[0][0])*W[a]*(Kz), I+45);
	    update((C[1][3])*W[a]*(Ix*Kz*Qy), I+46);
	    update((C[1][3])*W[a]*(Iz*Kx*Qy), I+47);
	    update((C[1][1])*W[a]*(Kx*Qy), I+48);
	    update((C[1][1])*W[a]*(Kz*Qy), I+49);
	    update((C[1][3])*W[a]*(Cx*Kz*(Dy*Yij + Qy)), I+50);
	    update((C[1][3])*W[a]*(Cz*Kx*(Dy*Yij + Qy)), I+51);
	    update((C[1][2])*W[a]*(Kx*(Dy*Yij + Qy)), I+52);
	    update((C[1][2])*W[a]*(Kz*(Dy*Yij + Qy)), I+53);
	    double f1 = B00*Zkl;
	    double f19 = Cz*Zkl;
	    double f14 = Cz*pow(Dz,2);
	    update((C[1][3])*W[a]*(Cy*(f14 + Dz*(f19 + 2*B00 + Kz*Zij) + f1 + B01*Iz)), I+54);
	    update((C[1][3])*W[a]*(Cx*(f14 + Dz*(f19 + 2*B00 + Kz*Zij) + f1 + B01*Iz)), I+55);
	    update((C[1][2])*W[a]*((f14 + Dz*(f19 + 2*B00 + Kz*Zij) + f1 + B01*Iz)), I+56);
	    double f2 = Cx*Dx;
	    update((C[1][3])*W[a]*(Ky*(Dx*Px + B00*(Xij + 2*Cx) + f2*Xij)), I+57);
	    update((C[1][3])*W[a]*(Kz*(Dx*Px + B00*(Xij + 2*Cx) + f2*Xij)), I+58);
	    update((C[1][3])*W[a]*(Cy*Kz*(B00 + f2 + Dx*Xij)), I+59);
	    update((C[1][2])*W[a]*(Kz*(B00 + f2 + Dx*Xij)), I+60);
	    update((C[1][3])*W[a]*(Cz*Ky*(B00 + f2 + Dx*Xij)), I+61);
	    update((C[1][2])*W[a]*(Ky*(B00 + f2 + Dx*Xij)), I+62);
	    update((C[1][3])*W[a]*(Iz*Ky*(B00 + f2)), I+63);
	    update((C[1][1])*W[a]*(Ky*(B00 + f2)), I+64);
	    update((C[1][3])*W[a]*(Iy*Kz*(B00 + f2)), I+65);
	    update((C[1][1])*W[a]*(Kz*(B00 + f2)), I+66);
	    double f27 = Cx*Xkl;
	    update((C[1][3])*W[a]*(Dz*(Xij*(f27 + f2) + Kx*Px + B00*(Xij + 2*Cx))), I+67);
	    update((C[1][3])*W[a]*(Dy*(Xij*(f27 + f2) + Kx*Px + B00*(Xij + 2*Cx))), I+68);
	    update((C[0][3])*W[a]*((Xij*(f27 + f2) + Kx*Px + B00*(Xij + 2*Cx))), I+69);
	    update((C[1][3])*W[a]*((Dy*Yij + Qy)*(f27 + B00 + f2)), I+70);
	    update((C[1][3])*W[a]*(Cy*Dz*(f27 + Kx*Xij + B00 + f2)), I+71);
	    update((C[1][2])*W[a]*(Dz*(f27 + Kx*Xij + B00 + f2)), I+72);
	    update((C[1][3])*W[a]*(Qy*(f27 + Kx*Xij + B00 + f2)), I+73);
	    update((C[1][2])*W[a]*(Dy*(f27 + Kx*Xij + B00 + f2)), I+74);
	    update((C[1][3])*W[a]*(Cz*Dy*(f27 + Kx*Xij + B00 + f2)), I+75);
	    update((C[0][3])*W[a]*(Cz*(f27 + Kx*Xij + B00 + f2)), I+76);
	    update((C[0][2])*W[a]*((f27 + Kx*Xij + B00 + f2)), I+77);
	    update((C[0][3])*W[a]*(Cy*(f27 + Kx*Xij + B00 + f2)), I+78);
	    update((C[1][3])*W[a]*(Dz*Iy*(f27 + B00 + f2)), I+79);
	    update((C[1][1])*W[a]*(Dz*(f27 + B00 + f2)), I+80);
	    update((C[1][1])*W[a]*(Dy*(f27 + B00 + f2)), I+81);
	    update((C[1][3])*W[a]*(Dy*Iz*(f27 + B00 + f2)), I+82);
	    update((C[0][3])*W[a]*(Iz*(f27 + B00 + f2)), I+83);
	    update((C[0][1])*W[a]*((f27 + B00 + f2)), I+84);
	    update((C[0][3])*W[a]*(Iy*(f27 + B00 + f2)), I+85);
	    double f31 = (Cx*Ix + B10);
	    update((C[1][3])*W[a]*(f31*(Dz*Kz + B01)), I+86);
	    update((C[1][3])*W[a]*(Dz*Ky*f31), I+87);
	    update((C[0][3])*W[a]*(Ky*f31), I+88);
	    update((C[1][3])*W[a]*(Dy*Kz*f31), I+89);
	    update((C[0][3])*W[a]*(Kz*f31), I+90);
	    double f35 = (B01 + Dx*Kx);
	    update((C[1][2])*W[a]*(Iy*f35), I+91);
	    update((C[1][3])*W[a]*(Cz*Iy*f35), I+92);
	    update((C[1][1])*W[a]*(Cz*f35), I+93);
	    update((C[1][2])*W[a]*(Iz*f35), I+94);
	    update((C[1][3])*W[a]*(Cy*Iz*f35), I+95);
	    update((C[1][1])*W[a]*(Cy*f35), I+96);
	    update((C[1][0])*W[a]*(f35), I+97);
	    double f36 = (Cy*Iy + B10);
	    update((C[1][3])*W[a]*(Dx*Kz*f36), I+98);
	    update((C[1][3])*W[a]*(f36*(Dz*Kz + B01)), I+99);
	    update((C[0][3])*W[a]*(Kz*f36), I+100);
	    update((C[1][3])*W[a]*(f35*f36), I+101);
	    update((C[0][3])*W[a]*(Kx*f36), I+102);
	    update((C[1][3])*W[a]*(Dz*Kx*f36), I+103);
	    double f38 = Cy*Ykl;
	    update((C[1][3])*W[a]*((B00 + f2)*(f38 + Ky*Yij + Qy)), I+104);
	    update((C[1][2])*W[a]*(Dx*(f38 + Ky*Yij + Qy)), I+105);
	    update((C[1][3])*W[a]*(Cz*Dx*(f38 + Ky*Yij + Qy)), I+106);
	    update((C[0][3])*W[a]*(Cz*(f38 + Ky*Yij + Qy)), I+107);
	    update((C[1][2])*W[a]*(Dz*(f38 + Ky*Yij + Qy)), I+108);
	    update((C[1][3])*W[a]*(Cx*Dz*(f38 + Ky*Yij + Qy)), I+109);
	    update((C[0][3])*W[a]*(Cx*(f38 + Ky*Yij + Qy)), I+110);
	    update((C[0][2])*W[a]*((f38 + Ky*Yij + Qy)), I+111);
	    update((C[1][3])*W[a]*((f38 + Qy)*(B00 + f2 + Dx*Xij)), I+112);
	    update((C[1][1])*W[a]*(Dz*(f38 + Qy)), I+113);
	    update((C[1][3])*W[a]*(Dz*Ix*(f38 + Qy)), I+114);
	    update((C[0][3])*W[a]*(Ix*(f38 + Qy)), I+115);
	    update((C[1][1])*W[a]*(Dx*(f38 + Qy)), I+116);
	    update((C[1][3])*W[a]*(Dx*Iz*(f38 + Qy)), I+117);
	    update((C[0][3])*W[a]*(Iz*(f38 + Qy)), I+118);
	    update((C[0][1])*W[a]*((f38 + Qy)), I+119);
	    double f42 = (B01 + Dy*Ky);
	    update((C[1][3])*W[a]*(Cz*Ix*f42), I+120);
	    update((C[1][3])*W[a]*(Cx*Iz*f42), I+121);
	    update((C[1][1])*W[a]*(Cx*f42), I+122);
	    update((C[1][1])*W[a]*(Cz*f42), I+123);
	    update((C[1][2])*W[a]*(Ix*f42), I+124);
	    update((C[1][3])*W[a]*(f31*f42), I+125);
	    update((C[1][2])*W[a]*(Iz*f42), I+126);
	    update((C[1][0])*W[a]*(f42), I+127);
	    double f47 = Cx*pow(Dx,2);
	    double f11 = 2*B00*Dx;
	    update((C[1][3])*W[a]*(Cy*(Xkl*(B00 + f2) + f11 + f47 + B01*Ix + Dx*Kx*Xij)), I+128);
	    update((C[1][3])*W[a]*(Cz*(Xkl*(B00 + f2) + f11 + f47 + B01*Ix + Dx*Kx*Xij)), I+129);
	    update((C[1][2])*W[a]*((Xkl*(B00 + f2) + f11 + f47 + B01*Ix + Dx*Kx*Xij)), I+130);
	    double f44 = B01*Cx;
	    update((C[1][3])*W[a]*(Iy*(Xkl*(B00 + f2) + f11 + f44 + f47)), I+131);
	    update((C[1][3])*W[a]*(Iz*(Xkl*(B00 + f2) + f11 + f44 + f47)), I+132);
	    update((C[1][1])*W[a]*((Xkl*(B00 + f2) + f11 + f44 + f47)), I+133);
	    double f49 = Cz*Dz;
	    update((C[1][3])*W[a]*(Dy*(Kz*Pz + Zij*(f19 + f49) + B00*(2*Cz + Zij))), I+134);
	    update((C[1][3])*W[a]*(Dx*(Kz*Pz + Zij*(f19 + f49) + B00*(2*Cz + Zij))), I+135);
	    update((C[0][3])*W[a]*((Kz*Pz + Zij*(f19 + f49) + B00*(2*Cz + Zij))), I+136);
	    update((C[1][3])*W[a]*(Ky*(f49*Zij + Dz*Pz + B00*(2*Cz + Zij))), I+137);
	    update((C[1][3])*W[a]*(Kx*(f49*Zij + Dz*Pz + B00*(2*Cz + Zij))), I+138);
	    update((C[1][3])*W[a]*((f38 + Qy)*(f49 + B00 + Dz*Zij)), I+139);
	    update((C[1][3])*W[a]*(Qy*(f19 + f49 + B00 + Kz*Zij)), I+140);
	    update((C[1][3])*W[a]*((B00 + f2)*(f19 + f49 + B00 + Kz*Zij)), I+141);
	    update((C[1][2])*W[a]*(Dx*(f19 + f49 + B00 + Kz*Zij)), I+142);
	    update((C[1][3])*W[a]*(Cy*Dx*(f19 + f49 + B00 + Kz*Zij)), I+143);
	    update((C[0][3])*W[a]*(Cy*(f19 + f49 + B00 + Kz*Zij)), I+144);
	    update((C[1][2])*W[a]*(Dy*(f19 + f49 + B00 + Kz*Zij)), I+145);
	    update((C[1][3])*W[a]*(Cx*Dy*(f19 + f49 + B00 + Kz*Zij)), I+146);
	    update((C[0][3])*W[a]*(Cx*(f19 + f49 + B00 + Kz*Zij)), I+147);
	    update((C[0][2])*W[a]*((f19 + f49 + B00 + Kz*Zij)), I+148);
	    update((C[1][3])*W[a]*((f27 + B00 + f2)*(f49 + B00 + Dz*Zij)), I+149);
	    update((C[1][3])*W[a]*((Dy*Yij + Qy)*(f19 + f49 + B00)), I+150);
	    update((C[1][3])*W[a]*((f49 + B00)*(f38 + Ky*Yij + Qy)), I+151);
	    update((C[1][3])*W[a]*((B00 + f2 + Dx*Xij)*(f19 + f49 + B00)), I+152);
	    update((C[1][3])*W[a]*((f49 + B00)*(f27 + Kx*Xij + B00 + f2)), I+153);
	    update((C[1][3])*W[a]*(Ix*Ky*(f49 + B00)), I+154);
	    update((C[1][3])*W[a]*(Cx*Ky*(f49 + B00 + Dz*Zij)), I+155);
	    update((C[1][2])*W[a]*(Ky*(f49 + B00 + Dz*Zij)), I+156);
	    update((C[1][3])*W[a]*(Cy*Kx*(f49 + B00 + Dz*Zij)), I+157);
	    update((C[1][2])*W[a]*(Kx*(f49 + B00 + Dz*Zij)), I+158);
	    update((C[1][3])*W[a]*(Iy*Kx*(f49 + B00)), I+159);
	    update((C[1][1])*W[a]*(Kx*(f49 + B00)), I+160);
	    update((C[1][1])*W[a]*(Dx*(f19 + f49 + B00)), I+161);
	    update((C[1][3])*W[a]*(Dx*Iy*(f19 + f49 + B00)), I+162);
	    update((C[0][3])*W[a]*(Iy*(f19 + f49 + B00)), I+163);
	    update((C[1][1])*W[a]*(Dy*(f19 + f49 + B00)), I+164);
	    update((C[1][3])*W[a]*(Dy*Ix*(f19 + f49 + B00)), I+165);
	    update((C[0][3])*W[a]*(Ix*(f19 + f49 + B00)), I+166);
	    update((C[0][1])*W[a]*((f19 + f49 + B00)), I+167);
	    update((C[1][1])*W[a]*(Ky*(f49 + B00)), I+168);
	    double f53 = 2*pow(B00,2);
	    double f3 = B01*B10;
	    update((C[1][3])*W[a]*((f53 + Px*pow(Dx,2) + f44*(Cx + Xij) + f3 + Dx*Px*Xkl + B00*(2*f27 + 4*f2 + Xij*(Xkl + 2*Dx)) + Xij*(f47 + f2*Xkl))), I+169);
	    double f58 = B01*Cz;
	    update((C[1][3])*W[a]*((f53 + Pz*pow(Dz,2) + Dz*Pz*Zkl + f58*(Cz + Zij) + Zij*(f14 + f1 + Dz*f19) + f3 + 2*Cz*f1 + 2*B00*(2*f49 + Dz*Zij))), I+170);
	    double f41 = 2*B00*Dz;
	    update((C[1][3])*W[a]*(Iy*(f14 + f41 + f58 + f1 + Dz*f19)), I+171);
	    update((C[1][3])*W[a]*(Ix*(f14 + f41 + f58 + f1 + Dz*f19)), I+172);
	    update((C[1][1])*W[a]*((f14 + f41 + f58 + f1 + Dz*f19)), I+173);
	    double f59 = (B10 + Cz*Iz);
	    update((C[1][3])*W[a]*(Dx*Ky*f59), I+174);
	    update((C[1][3])*W[a]*(f42*f59), I+175);
	    update((C[0][3])*W[a]*(Ky*f59), I+176);
	    update((C[1][3])*W[a]*(f35*f59), I+177);
	    update((C[1][3])*W[a]*(Dy*Kx*f59), I+178);
	    update((C[0][3])*W[a]*(Kx*f59), I+179);
	    double f52 = Cy*pow(Dy,2);
	    double f25 = B01*Cy;
	    double f6 = B00*Ykl;
	    double f7 = Cy*Dy;
	    update((C[1][3])*W[a]*((Yij*(f52 + f25 + f7*Ykl + f6) + f53 + Dy*Py*Ykl + B01*pow(Cy,2) + f3 + Py*pow(Dy,2) + 2*Cy*f6 + 2*B00*(Dy*Yij + 2*f7))), I+180);
	    double f32 = 2*B00*Dy;
	    update((C[1][3])*W[a]*(Cz*(f52 + f25 + f32 + f7*Ykl + Yij*(B01 + Dy*Ky) + f6)), I+181);
	    update((C[1][3])*W[a]*(Cx*(f52 + f25 + f32 + f7*Ykl + Yij*(B01 + Dy*Ky) + f6)), I+182);
	    update((C[1][2])*W[a]*((f52 + f25 + f32 + f7*Ykl + Yij*(B01 + Dy*Ky) + f6)), I+183);
	    update((C[1][3])*W[a]*(Iz*(f52 + f25 + f32 + f7*Ykl + f6)), I+184);
	    update((C[1][3])*W[a]*(Ix*(f52 + f25 + f32 + f7*Ykl + f6)), I+185);
	    update((C[1][1])*W[a]*((f52 + f25 + f32 + f7*Ykl + f6)), I+186);
	    update((C[1][3])*W[a]*(Dz*(B00*(Yij + 2*Cy) + Yij*(f38 + f7) + Ky*Py)), I+187);
	    update((C[1][3])*W[a]*(Dx*(B00*(Yij + 2*Cy) + Yij*(f38 + f7) + Ky*Py)), I+188);
	    update((C[0][3])*W[a]*((B00*(Yij + 2*Cy) + Yij*(f38 + f7) + Ky*Py)), I+189);
	    update((C[1][3])*W[a]*(Kx*(f7*Yij + B00*(Yij + 2*Cy) + Dy*Py)), I+190);
	    update((C[1][3])*W[a]*(Kz*(f7*Yij + B00*(Yij + 2*Cy) + Dy*Py)), I+191);
#undef pow

	}

    }

    /**
     * Permutes the 192 entries of @a I in place: the value found at
     * position i is moved to position order[i].  The table below holds
     * the same values that index() returns.
     * @param I array of 192 values, overwritten with the permuted values
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[192]) {
	// destination slot for every source position
	const unsigned short order[192] = {
	    50, 48, 56, 8, 11, 3, 35, 32, 44, 12, 14, 2, 0, 116, 68, 71,
	    67, 83, 80, 92, 76, 77, 65, 113, 112, 64, 180, 182, 178, 184, 185, 177,
	    176, 161, 164, 134, 146, 152, 137, 129, 130, 132, 136, 144, 160, 128, 166, 46,
	    34, 162, 169, 43, 40, 168, 190, 189, 188, 85, 149, 150, 148, 87, 84, 93,
	    81, 153, 145, 53, 37, 5, 41, 54, 52, 38, 36, 39, 7, 4, 6, 57,
	    49, 33, 45, 13, 1, 9, 181, 117, 69, 165, 133, 24, 27, 19, 28, 30,
	    18, 16, 154, 186, 138, 26, 10, 58, 89, 88, 91, 75, 120, 121, 73, 72,
	    86, 114, 118, 70, 82, 94, 78, 66, 103, 109, 97, 99, 100, 101, 108, 96,
	    22, 23, 20, 25, 29, 17, 175, 159, 143, 127, 63, 126, 174, 157, 156, 158,
	    142, 172, 173, 141, 140, 61, 171, 123, 151, 55, 119, 125, 124, 62, 60, 59,
	    51, 147, 155, 139, 163, 167, 135, 131, 115, 21, 191, 187, 183, 179, 95, 111,
	    79, 31, 47, 15, 106, 107, 105, 104, 110, 102, 98, 122, 90, 74, 42, 170
	};
	// snapshot first, so the scatter never reads an overwritten slot
	double scratch[192];
	for (int src = 0; src < 192; ++src) {
	    scratch[src] = I[src];
	}
	for (int src = 0; src < 192; ++src) {
	    I[order[src]] = scratch[src];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[192] = { 12, 84, 11, 5, 77, 69, 78, 76, 3, 85, 102, 4, 9, 83, 10, 179, 97, 133, 96, 93, 130, 169, 128, 129, 91, 131, 101, 92, 94, 132, 95, 177, 7, 81, 48, 6, 74, 68, 73, 75, 52, 70, 190, 51, 8, 82, 47, 178, 1, 80, 0, 160, 72, 67, 71, 153, 2, 79, 103, 159, 158, 149, 157, 138, 25, 22, 119, 16, 14, 88, 115, 15, 111, 110, 189, 107, 20, 21, 118, 176, 18, 64, 116, 17, 62, 57, 112, 61, 105, 104, 188, 106, 19, 63, 117, 174, 127, 122, 186, 123, 124, 125, 185, 120, 183, 182, 180, 181, 126, 121, 184, 175, 24, 23, 113, 168, 13, 87, 114, 154, 108, 109, 187, 151, 156, 155, 139, 137, 45, 39, 40, 167, 41, 90, 35, 166, 42, 38, 100, 163, 148, 147, 144, 136, 43, 66, 36, 161, 60, 58, 59, 152, 37, 65, 98, 162, 142, 141, 143, 135, 44, 33, 49, 164, 34, 89, 46, 165, 53, 50, 191, 150, 145, 146, 140, 134, 32, 31, 28, 173, 26, 86, 27, 172, 29, 30, 99, 171, 56, 55, 54, 170 };
// 	if (index < 192) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Look up one entry of the 192-element index permutation.
     *
     * Entry k holds the destination slot used by reorder() for element k
     * (reorder() performs I[entry k] = T[k]).
     *
     * @param i position in the permutation; not range-checked, so the
     *          caller must guarantee 0 <= i < 192
     * @return the permuted index stored at position i
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	    50, 48, 56, 8, 11, 3, 35, 32, 44, 12, 14, 2, 0, 116, 68, 71, 67, 83, 80, 92, 76, 77, 65, 113, 112, 64, 180, 182, 178, 184, 185, 177, 176, 161, 164, 134, 146, 152, 137, 129, 130, 132, 136, 144, 160, 128, 166, 46, 34, 162, 169, 43, 40, 168, 190, 189, 188, 85, 149, 150, 148, 87, 84, 93, 81, 153, 145, 53, 37, 5, 41, 54, 52, 38, 36, 39, 7, 4, 6, 57, 49, 33, 45, 13, 1, 9, 181, 117, 69, 165, 133, 24, 27, 19, 28, 30, 18, 16, 154, 186, 138, 26, 10, 58, 89, 88, 91, 75, 120, 121, 73, 72, 86, 114, 118, 70, 82, 94, 78, 66, 103, 109, 97, 99, 100, 101, 108, 96, 22, 23, 20, 25, 29, 17, 175, 159, 143, 127, 63, 126, 174, 157, 156, 158, 142, 172, 173, 141, 140, 61, 171, 123, 151, 55, 119, 125, 124, 62, 60, 59, 51, 147, 155, 139, 163, 167, 135, 131, 115, 21, 191, 187, 183, 179, 95, 111, 79, 31, 47, 15, 106, 107, 105, 104, 110, 102, 98, 122, 90, 74, 42, 170
	};
	return permutation[i];
    }

    /**
     * Write the whole 192-entry index permutation to idx[0] .. idx[191].
     *
     * The sequence is the same one that index(int) returns entry by entry.
     *
     * @param idx output buffer; must provide room for 192 elements of U
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const unsigned short order[] = {
	    50, 48, 56, 8, 11, 3, 35, 32, 44, 12, 14, 2, 0, 116, 68, 71, 67, 83, 80, 92, 76, 77, 65, 113, 112, 64, 180, 182, 178, 184, 185, 177, 176, 161, 164, 134, 146, 152, 137, 129, 130, 132, 136, 144, 160, 128, 166, 46, 34, 162, 169, 43, 40, 168, 190, 189, 188, 85, 149, 150, 148, 87, 84, 93, 81, 153, 145, 53, 37, 5, 41, 54, 52, 38, 36, 39, 7, 4, 6, 57, 49, 33, 45, 13, 1, 9, 181, 117, 69, 165, 133, 24, 27, 19, 28, 30, 18, 16, 154, 186, 138, 26, 10, 58, 89, 88, 91, 75, 120, 121, 73, 72, 86, 114, 118, 70, 82, 94, 78, 66, 103, 109, 97, 99, 100, 101, 108, 96, 22, 23, 20, 25, 29, 17, 175, 159, 143, 127, 63, 126, 174, 157, 156, 158, 142, 172, 173, 141, 140, 61, 171, 123, 151, 55, 119, 125, 124, 62, 60, 59, 51, 147, 155, 139, 163, 167, 135, 131, 115, 21, 191, 187, 183, 179, 95, 111, 79, 31, 47, 15, 106, 107, 105, 104, 110, 102, 98, 122, 90, 74, 42, 170
	};
	const int count = sizeof(order)/sizeof(order[0]);
	for (int k = 0; k < count; ++k) {
	    // each value is handed to U as a plain int
	    idx[k] = static_cast<int>(order[k]);
	}
    }


};

/**
 * Quadrature kernel specialization for braket<F, S, P, S>.
 *
 * eval() accumulates 30 terms per quadrature point into I, in the order
 * the terms are generated.  reorder() / index() expose the permutation
 * that moves generated slot k to slot index(k).
 */
template<>
struct impl<meta::braket<rysq::F, rysq::S, rysq::P, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 3;

    /** Default accumulation functor: adds a term to the addressed slot. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &term, double *slot) const {
	    *slot += term;
	}
    };

    /**
     * Fixed-size overload: runs the kernel over 3 quadrature points and
     * accumulates into I with the default update functor.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[30]) {
	const update accumulate = update();
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Pointer overload: runs the kernel over M quadrature points and
     * accumulates into I (at least 30 slots) with the default update functor.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel.  For every quadrature point a in [0, M) the recurrence
     * coefficients are built from A, B, rAi, rBk, rAB and t2[a], and 30
     * terms, each scaled by C[0][0]*W[a], are passed to update() together
     * with their slot I+0 .. I+29.  rij and rkl are not read here.
     *
     * @param update callable invoked as update(term, slot)
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double t = t2[a];
	    // C*W is formed first, so w*(term) keeps the left-to-right
	    // product order C*W*term for every contribution.
	    const double w = (C[0][0])*W[a];

	    const double B00 = 0.5*t;
	    const double B10 = recurrence::coefficient(1.0/A, B, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    // pow(x, n) is routed to the compile-time recurrence::pow<n>(x)
	    // for the duration of the term list only.
#define pow(x,y) recurrence::pow<y>((x))

	    const double Px = (B10 + pow(Cx,2));
	    const double Py = (B10 + pow(Cy,2));
	    const double Pz = (B10 + pow(Cz,2));
	    const double Qx = (Cx*Dx + B00);
	    const double Qy = (Cy*Dy + B00);
	    const double Qz = (Cz*Dz + B00);

	    update(w*(Cy*Dz*Px), I+0);
	    update(w*(Cz*Dy*Px), I+1);
	    update(w*(Cx*Dz*Py), I+2);
	    update(w*(Cz*Dx*Py), I+3);
	    update(w*(Cy*Dx*Pz), I+4);
	    update(w*(Cx*Dy*Pz), I+5);
	    update(w*(Cy*Cz*Qx), I+6);
	    update(w*(Py*Qx), I+7);
	    update(w*(Pz*Qx), I+8);
	    update(w*(Cx*Cz*Qy), I+9);
	    update(w*(Pz*Qy), I+10);
	    update(w*(Px*Qy), I+11);
	    update(w*(Cx*Cy*Qz), I+12);
	    update(w*(Px*Qz), I+13);
	    update(w*(Py*Qz), I+14);

	    const double f0 = (3*B10 + pow(Cx,2));
	    update(w*(Cx*Dy*f0), I+15);
	    update(w*(Cx*Dz*f0), I+16);

	    const double f1 = (Dz*Pz + 2*B00*Cz);
	    update(w*(Cx*f1), I+17);
	    update(w*(Cy*f1), I+18);

	    const double f10 = (3*B10 + pow(Cz,2));
	    update(w*(Cz*Dy*f10), I+19);
	    update(w*(Cz*Dx*f10), I+20);

	    const double f11 = (3*B10 + pow(Cy,2));
	    update(w*(Cy*Dz*f11), I+21);
	    update(w*(Cy*Dx*f11), I+22);

	    const double f3 = 3*B00*B10;
	    update(w*((Dx*pow(Cx,3) + 3*B00*pow(Cx,2) + 3*B10*Cx*Dx + f3)), I+23);
	    update(w*((3*B00*pow(Cy,2) + f3 + Dy*pow(Cy,3) + 3*B10*Cy*Dy)), I+24);
	    update(w*((3*B10*Cz*Dz + f3 + 3*B00*pow(Cz,2) + Dz*pow(Cz,3))), I+25);

	    const double f4 = (Dy*Py + 2*B00*Cy);
	    update(w*(Cz*f4), I+26);
	    update(w*(Cx*f4), I+27);

	    const double f6 = (Dx*Px + 2*B00*Cx);
	    update(w*(Cy*f6), I+28);
	    update(w*(Cz*f6), I+29);
#undef pow

	}

    }

    /**
     * Permute I in place: the value at generated position k is moved to
     * position target[k] (the same table index() exposes).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[30]) {
	const unsigned short target[30] = {
	    23, 14, 25, 6, 8, 17, 9, 5, 7, 19, 18, 13, 29, 24, 26, 10, 20, 27, 28, 12, 2, 21, 1, 0, 11, 22, 16, 15, 3, 4
	};
	double T[30];
	for (int k = 0; k < 30; ++k) {
	    T[k] = I[k];
	}
	for (int k = 0; k < 30; ++k) {
	    I[target[k]] = T[k];
	}
    }

//     Disabled variant returning the inverse permutation:
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
//
//
// 	const unsigned short index_[30] = { 23, 22, 20, 28, 29, 7, 3, 8, 4, 6, 15, 24, 19, 11, 1, 27, 26, 5, 10, 9, 16, 21, 25, 0, 13, 2, 14, 17, 18, 12 };
// 	if (index < 30) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Destination slot of generated term i.  Not range-checked: the caller
     * must pass 0 <= i < 30.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	    23, 14, 25, 6, 8, 17, 9, 5, 7, 19, 18, 13, 29, 24, 26, 10, 20, 27, 28, 12, 2, 21, 1, 0, 11, 22, 16, 15, 3, 4
	};
	return permutation[i];
    }

    /** Write all 30 permutation entries to idx[0] .. idx[29]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const unsigned short order[] = {
	    23, 14, 25, 6, 8, 17, 9, 5, 7, 19, 18, 13, 29, 24, 26, 10, 20, 27, 28, 12, 2, 21, 1, 0, 11, 22, 16, 15, 3, 4
	};
	const int count = sizeof(order)/sizeof(order[0]);
	for (int k = 0; k < count; ++k) {
	    // each value is handed to U as a plain int
	    idx[k] = static_cast<int>(order[k]);
	}
    }


};

/**
 * Quadrature kernel specialization for braket<S, S, S, S>.
 *
 * There is a single output slot; each quadrature point contributes
 * C[0][0]*W[a] to it.  The permutation exposed by reorder()/index() is
 * the identity on that one slot.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 1;

    /** Default accumulation functor: adds a term to the addressed slot. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &term, double *slot) const {
	    *slot += term;
	}
    };

    /**
     * Fixed-size overload: runs the kernel over 1 quadrature point and
     * accumulates into I with the default update functor.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[1]) {
	const update accumulate = update();
	eval<1>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Pointer overload: runs the kernel over M quadrature points and
     * accumulates into *I with the default update functor.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel.  Only C, W, I and update are read; the remaining
     * arguments exist so that all specializations share one signature.
     *
     * @param update callable invoked as update(term, slot)
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {
	    // the per-point factor is the constant 1, so the term is C*W
	    update((C[0][0])*W[a], I+0);
	}

    }

    /** Identity permutation on the single slot; I[0] keeps its value. */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[1]) {
	const double only = I[0];
	I[0] = only;
    }

//     Disabled variant returning the inverse permutation:
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
//
//
// 	const unsigned short index_[1] = { 0 };
// 	if (index < 1) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Destination slot of generated term i.  Not range-checked: the only
     * valid argument is i == 0.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	    0
	};
	return permutation[i];
    }

    /** Write the single permutation entry (0) to idx[0]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	idx[0] = 0;
    }


};

/**
 * Quadrature kernel specialization for braket<S, P, P, P>.
 *
 * eval() accumulates 27 terms per quadrature point into I, in the order
 * the terms are generated.  reorder() / index() expose the permutation
 * that moves generated slot k to slot index(k).
 */
template<>
struct impl<meta::braket<rysq::S, rysq::P, rysq::P, rysq::P> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulation functor: adds a term to the addressed slot. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &term, double *slot) const {
	    *slot += term;
	}
    };

    /**
     * Fixed-size overload: runs the kernel over 2 quadrature points and
     * accumulates into I with the default update functor.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[27]) {
	const update accumulate = update();
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Pointer overload: runs the kernel over M quadrature points and
     * accumulates into I (at least 27 slots) with the default update functor.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel.  For every quadrature point a in [0, M) the recurrence
     * coefficients are built from A, B, rAi, rBk, rAB and t2[a], shifted
     * by the components of rij and rkl, and 27 terms, each scaled by
     * C[0][0]*W[a], are passed to update() with their slot I+0 .. I+26.
     *
     * @param update callable invoked as update(term, slot)
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double t = t2[a];
	    // C*W is formed first, so w*(term) keeps the left-to-right
	    // product order C*W*term for every contribution.
	    const double w = (C[0][0])*W[a];

	    const double B00 = 0.5*t;
	    const double B01 = recurrence::coefficient(1.0/B, A, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    // pow(x, n) is routed to the compile-time recurrence::pow<n>(x)
	    // for the duration of the term list only.
#define pow(x,y) recurrence::pow<y>((x))

	    const double Ix = (Cx + Xij);
	    const double Iy = (Cy + Yij);
	    const double Iz = (Cz + Zij);
	    const double Kx = (Xkl + Dx);
	    const double Ky = (Ykl + Dy);
	    const double Kz = (Dz + Zkl);

	    update(w*(Dz*Iy*Kx), I+0);
	    update(w*(Dy*Iz*Kx), I+1);
	    update(w*(Dx*Iz*Ky), I+2);
	    update(w*(Dz*Ix*Ky), I+3);
	    update(w*(Dx*Iy*Kz), I+4);
	    update(w*(Dy*Ix*Kz), I+5);
	    update(w*((Ix*(B01 + Dx*Kx) + B00*(Xkl + 2*Dx))), I+6);
	    update(w*((B00*(Ykl + 2*Dy) + Iy*(B01 + Dy*Ky))), I+7);
	    update(w*((B00*(2*Dz + Zkl) + Iz*(Dz*Kz + B01))), I+8);

	    const double f10 = (Dz*Kz + B01);
	    update(w*(Ix*f10), I+9);
	    update(w*(Iy*f10), I+10);

	    const double f11 = (B01 + Dy*Ky);
	    update(w*(Iz*f11), I+11);
	    update(w*(Ix*f11), I+12);

	    const double f12 = (B00 + Iy*Ky);
	    update(w*(Dx*f12), I+13);
	    update(w*(Dz*f12), I+14);

	    const double f13 = (Dy*Iy + B00);
	    update(w*(Kx*f13), I+15);
	    update(w*(Kz*f13), I+16);

	    const double f14 = (Dx*Ix + B00);
	    update(w*(Ky*f14), I+17);
	    update(w*(Kz*f14), I+18);

	    const double f3 = (B00 + Ix*Kx);
	    update(w*(Dz*f3), I+19);
	    update(w*(Dy*f3), I+20);

	    const double f4 = (Iz*Kz + B00);
	    update(w*(Dx*f4), I+21);
	    update(w*(Dy*f4), I+22);

	    const double f5 = (B00 + Dz*Iz);
	    update(w*(Kx*f5), I+23);
	    update(w*(Ky*f5), I+24);

	    const double f7 = (B01 + Dx*Kx);
	    update(w*(Iy*f7), I+25);
	    update(w*(Iz*f7), I+26);
#undef pow

	}

    }

    /**
     * Permute I in place: the value at generated position k is moved to
     * position target[k] (the same table index() exposes).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[27]) {
	const unsigned short target[27] = {
	    7, 5, 11, 15, 19, 21, 0, 13, 26, 24, 25, 14, 12, 10, 16, 4, 22, 9, 18, 6, 3, 20, 23, 8, 17, 1, 2
	};
	double T[27];
	for (int k = 0; k < 27; ++k) {
	    T[k] = I[k];
	}
	for (int k = 0; k < 27; ++k) {
	    I[target[k]] = T[k];
	}
    }

//     Disabled variant returning the inverse permutation:
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
//
//
// 	const unsigned short index_[27] = { 6, 25, 26, 20, 15, 1, 19, 0, 23, 17, 13, 2, 12, 7, 11, 3, 14, 24, 18, 4, 21, 5, 16, 22, 9, 10, 8 };
// 	if (index < 27) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Destination slot of generated term i.  Not range-checked: the caller
     * must pass 0 <= i < 27.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	    7, 5, 11, 15, 19, 21, 0, 13, 26, 24, 25, 14, 12, 10, 16, 4, 22, 9, 18, 6, 3, 20, 23, 8, 17, 1, 2
	};
	return permutation[i];
    }

    /** Write all 27 permutation entries to idx[0] .. idx[26]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const unsigned short order[] = {
	    7, 5, 11, 15, 19, 21, 0, 13, 26, 24, 25, 14, 12, 10, 16, 4, 22, 9, 18, 6, 3, 20, 23, 8, 17, 1, 2
	};
	const int count = sizeof(order)/sizeof(order[0]);
	for (int k = 0; k < count; ++k) {
	    // each value is handed to U as a plain int
	    idx[k] = static_cast<int>(order[k]);
	}
    }


};

/**
 * Quadrature kernel specialization for braket<P, S, S, SP>.
 *
 * eval() accumulates 12 terms per quadrature point into I: the first
 * three are scaled by C[0][0], the remaining nine by C[1][0].
 * reorder() / index() expose the permutation that moves generated slot k
 * to slot index(k).
 */
template<>
struct impl<meta::braket<rysq::P, rysq::S, rysq::S, rysq::SP> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulation functor: adds a term to the addressed slot. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &term, double *slot) const {
	    *slot += term;
	}
    };

    /**
     * Fixed-size overload: runs the kernel over 2 quadrature points and
     * accumulates into I with the default update functor.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double (&I)[12]) {
	const update accumulate = update();
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Pointer overload: runs the kernel over M quadrature points and
     * accumulates into I (at least 12 slots) with the default update functor.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel.  For every quadrature point a in [0, M) the recurrence
     * coefficients are built from A, B, rAi, rBk, rAB and t2[a]; the D
     * coefficients are shifted by the components of rkl.  rij is not read.
     *
     * @param update callable invoked as update(term, slot)
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double t = t2[a];
	    // C*W is formed first, so w*(term) keeps the left-to-right
	    // product order C*W*term for every contribution.
	    const double w0 = (C[0][0])*W[a];
	    const double w1 = (C[1][0])*W[a];

	    const double B00 = 0.5*t;

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    const double Kx = (Xkl + Dx);
	    const double Ky = (Ykl + Dy);
	    const double Kz = (Dz + Zkl);

	    // terms scaled by C[0][0]
	    update(w0*(Cx), I+0);
	    update(w0*(Cy), I+1);
	    update(w0*(Cz), I+2);

	    // terms scaled by C[1][0]
	    update(w1*((B00 + Cx*Kx)), I+3);
	    update(w1*(Cy*Kx), I+4);
	    update(w1*(Cz*Kx), I+5);
	    update(w1*((B00 + Cy*Ky)), I+6);
	    update(w1*(Cz*Ky), I+7);
	    update(w1*(Cx*Ky), I+8);
	    update(w1*((B00 + Cz*Kz)), I+9);
	    update(w1*(Cx*Kz), I+10);
	    update(w1*(Cy*Kz), I+11);

	}

    }

    /**
     * Permute I in place: the value at generated position k is moved to
     * position target[k] (the same table index() exposes).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[12]) {
	const unsigned short target[12] = {
	    0, 1, 2, 3, 4, 5, 7, 8, 6, 11, 9, 10
	};
	double T[12];
	for (int k = 0; k < 12; ++k) {
	    T[k] = I[k];
	}
	for (int k = 0; k < 12; ++k) {
	    I[target[k]] = T[k];
	}
    }

//     Disabled variant returning the inverse permutation:
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
//
//
// 	const unsigned short index_[12] = { 0, 1, 2, 3, 4, 5, 8, 6, 7, 10, 11, 9 };
// 	if (index < 12) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Destination slot of generated term i.  Not range-checked: the caller
     * must pass 0 <= i < 12.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	    0, 1, 2, 3, 4, 5, 7, 8, 6, 11, 9, 10
	};
	return permutation[i];
    }

    /** Write all 12 permutation entries to idx[0] .. idx[11]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const unsigned short order[] = {
	    0, 1, 2, 3, 4, 5, 7, 8, 6, 11, 9, 10
	};
	const int count = sizeof(order)/sizeof(order[0]);
	for (int k = 0; k < count; ++k) {
	    // each value is handed to U as a plain int
	    idx[k] = static_cast<int>(order[k]);
	}
    }


};

template<>
struct impl<meta::braket<rysq::F, rysq::SP, rysq::P, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /** Default accumulation functor: adds a term to the addressed slot. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &term, double *slot) const {
	    *slot += term;
	}
    };

    /**
     * Fixed-size overload: runs the kernel over 3 quadrature points and
     * accumulates into the 120-element array I with the default update
     * functor.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double (&I)[120]) {
	const update accumulate = update();
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Pointer overload: runs the kernel over M quadrature points and
     * accumulates into I with the default update functor.  The kernel
     * addresses slots I+0 .. I+119.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel of this specialization.
     *
     * For every quadrature point a in [0, M) the recurrence coefficients
     * (B00, B10, Cx..Cz, Dx..Dz) are computed from A, B, rAi, rBk, rAB and
     * t2[a]; 120 terms are then formed from them and from the components
     * of rij, and each term is passed to update() together with its slot
     * I+0 .. I+119.  Every term is scaled by W[a] and by either C[0][0]
     * or C[0][1].  rkl is not read by this kernel.
     *
     * @tparam M     number of quadrature points processed
     * @param A,B    scalars entering the recurrence coefficients
     * @param rAi,rBk,rAB  indexable with [0..2]
     * @param rij    indexable with [0..2]; read as Xij, Yij, Zij
     * @param t2,W   indexable with [0..M-1]
     * @param C      1x2 coefficient block
     * @param I      base pointer of the 120 output slots
     * @param update callable invoked as update(term, slot)
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	// Components of rij, used below to shift Cx/Cy/Cz.
	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];



#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Recurrence coefficients for quadrature point a.
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // pow(x, n) is routed to the compile-time recurrence::pow<n>(x)
	    // until the matching #undef after the last term.
#define pow(x,y) recurrence::pow<y>((x))

	    // Intermediates shared by many of the terms below.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    // Terms in generated order; the f* temporaries are common
	    // subexpressions introduced right before their first use.
	    update((C[0][0])*W[a]*(Cy*Dz*Px), I+0);
	    update((C[0][0])*W[a]*(Cz*Dy*Px), I+1);
	    update((C[0][0])*W[a]*(Cx*Dz*Py), I+2);
	    update((C[0][0])*W[a]*(Cz*Dx*Py), I+3);
	    update((C[0][0])*W[a]*(Cx*Dy*Pz), I+4);
	    update((C[0][0])*W[a]*(Cy*Dx*Pz), I+5);
	    update((C[0][0])*W[a]*(Cy*Cz*Qx), I+6);
	    update((C[0][1])*W[a]*(Iy*Pz*Qx), I+7);
	    update((C[0][0])*W[a]*(Pz*Qx), I+8);
	    update((C[0][1])*W[a]*(Iz*Py*Qx), I+9);
	    update((C[0][0])*W[a]*(Py*Qx), I+10);
	    update((C[0][0])*W[a]*(Cx*Cz*Qy), I+11);
	    update((C[0][1])*W[a]*(Ix*Pz*Qy), I+12);
	    update((C[0][0])*W[a]*(Pz*Qy), I+13);
	    update((C[0][1])*W[a]*(Iz*Px*Qy), I+14);
	    update((C[0][0])*W[a]*(Px*Qy), I+15);
	    update((C[0][0])*W[a]*(Cx*Cy*Qz), I+16);
	    update((C[0][1])*W[a]*(Iy*Px*Qz), I+17);
	    update((C[0][0])*W[a]*(Px*Qz), I+18);
	    update((C[0][1])*W[a]*(Ix*Py*Qz), I+19);
	    update((C[0][0])*W[a]*(Py*Qz), I+20);
	    update((C[0][1])*W[a]*(Cy*Qz*(Px + Cx*Xij)), I+21);
	    update((C[0][1])*W[a]*(Dz*Py*(Px + Cx*Xij)), I+22);
	    update((C[0][1])*W[a]*(Dy*Pz*(Px + Cx*Xij)), I+23);
	    update((C[0][1])*W[a]*(Cz*Qy*(Px + Cx*Xij)), I+24);
	    update((C[0][1])*W[a]*(Cy*Px*(Dz*Zij + Qz)), I+25);
	    update((C[0][1])*W[a]*(Cx*Py*(Dz*Zij + Qz)), I+26);
	    update((C[0][1])*W[a]*(Cx*Qy*(Cz*Zij + Pz)), I+27);
	    update((C[0][1])*W[a]*(Cy*Qx*(Cz*Zij + Pz)), I+28);
	    update((C[0][1])*W[a]*(Dx*Py*(Cz*Zij + Pz)), I+29);
	    update((C[0][1])*W[a]*(Dy*Px*(Cz*Zij + Pz)), I+30);
	    double f0 = (3*B10 + pow(Cx,2));
	    update((C[0][1])*W[a]*(Cx*f0*(Dz*Zij + Qz)), I+31);
	    update((C[0][1])*W[a]*(Cx*Dy*Iz*f0), I+32);
	    update((C[0][0])*W[a]*(Cx*Dy*f0), I+33);
	    update((C[0][1])*W[a]*(Cx*Dz*Iy*f0), I+34);
	    update((C[0][0])*W[a]*(Cx*Dz*f0), I+35);
	    double f10 = (B10*(3*Cy + Yij) + Iy*pow(Cy,2));
	    update((C[0][1])*W[a]*(Cx*Dz*f10), I+36);
	    update((C[0][1])*W[a]*(Cz*Dx*f10), I+37);
	    update((C[0][1])*W[a]*(Qx*f10), I+38);
	    update((C[0][1])*W[a]*(Qz*f10), I+39);
	    double f12 = (3*pow(B10,2) + 3*B10*Cz*(2*Cz + Zij) + Iz*pow(Cz,3));
	    update((C[0][1])*W[a]*(Dy*f12), I+40);
	    update((C[0][1])*W[a]*(Dx*f12), I+41);
	    double f15 = (Dx*(Cx*Ix + B10) + B00*(Xij + 2*Cx));
	    update((C[0][1])*W[a]*(Cy*Cz*f15), I+42);
	    update((C[0][1])*W[a]*(Py*f15), I+43);
	    update((C[0][1])*W[a]*(Pz*f15), I+44);
	    double f17 = (Dy*Iy + B00);
	    update((C[0][1])*W[a]*(Cx*f0*f17), I+45);
	    update((C[0][1])*W[a]*(Cx*Pz*f17), I+46);
	    update((C[0][1])*W[a]*(Cz*Px*f17), I+47);
	    double f18 = (Cy*Iy + B10);
	    update((C[0][1])*W[a]*(Dz*Px*f18), I+48);
	    update((C[0][1])*W[a]*(Cz*Qx*f18), I+49);
	    update((C[0][1])*W[a]*(Cx*Qz*f18), I+50);
	    update((C[0][1])*W[a]*(Dx*Pz*f18), I+51);
	    double f19 = (3*pow(B10,2) + Iy*pow(Cy,3) + 3*B10*Cy*(Yij + 2*Cy));
	    update((C[0][1])*W[a]*(Dx*f19), I+52);
	    update((C[0][1])*W[a]*(Dz*f19), I+53);
	    double f2 = (Dz*Pz + 2*B00*Cz);
	    update((C[0][1])*W[a]*(Cx*Cy*(f2 + Qz*Zij)), I+54);
	    update((C[0][1])*W[a]*(Px*(f2 + Qz*Zij)), I+55);
	    update((C[0][1])*W[a]*(Py*(f2 + Qz*Zij)), I+56);
	    update((C[0][1])*W[a]*(f2*(Px + Cx*Xij)), I+57);
	    update((C[0][1])*W[a]*(Cx*Iy*f2), I+58);
	    update((C[0][0])*W[a]*(Cx*f2), I+59);
	    update((C[0][1])*W[a]*(Cy*Ix*f2), I+60);
	    update((C[0][0])*W[a]*(Cy*f2), I+61);
	    update((C[0][1])*W[a]*(f18*f2), I+62);
	    double f20 = (3*pow(B10,2) + 3*B10*Cx*(Xij + 2*Cx) + Ix*pow(Cx,3));
	    update((C[0][1])*W[a]*(Dz*f20), I+63);
	    update((C[0][1])*W[a]*(Dy*f20), I+64);
	    double f22 = (Dx*Px + 2*B00*Cx);
	    update((C[0][1])*W[a]*(f22*(Cz*Zij + Pz)), I+65);
	    update((C[0][1])*W[a]*(Cz*Iy*f22), I+66);
	    update((C[0][0])*W[a]*(Cz*f22), I+67);
	    update((C[0][1])*W[a]*(Cy*Iz*f22), I+68);
	    update((C[0][0])*W[a]*(Cy*f22), I+69);
	    update((C[0][1])*W[a]*(f18*f22), I+70);
	    double f23 = (Dx*Ix + B00);
	    update((C[0][1])*W[a]*(Cy*Pz*f23), I+71);
	    update((C[0][1])*W[a]*(Cz*Py*f23), I+72);
	    double f24 = (B10*(3*Cx + Xij) + Ix*pow(Cx,2));
	    update((C[0][1])*W[a]*(Cz*Dy*f24), I+73);
	    update((C[0][1])*W[a]*(Cy*Dz*f24), I+74);
	    update((C[0][1])*W[a]*(Qz*f24), I+75);
	    update((C[0][1])*W[a]*(Qy*f24), I+76);
	    double f3 = (Iz*pow(Cz,2) + B10*(3*Cz + Zij));
	    update((C[0][1])*W[a]*(Cx*Dy*f3), I+77);
	    update((C[0][1])*W[a]*(Cy*Dx*f3), I+78);
	    update((C[0][1])*W[a]*(Qx*f3), I+79);
	    update((C[0][1])*W[a]*(Qy*f3), I+80);
	    double f33 = (3*B10 + pow(Cz,2));
	    update((C[0][1])*W[a]*(Cz*f17*f33), I+81);
	    update((C[0][1])*W[a]*(Cz*f23*f33), I+82);
	    update((C[0][1])*W[a]*(Cz*Dx*Iy*f33), I+83);
	    update((C[0][0])*W[a]*(Cz*Dx*f33), I+84);
	    update((C[0][1])*W[a]*(Cz*Dy*Ix*f33), I+85);
	    update((C[0][0])*W[a]*(Cz*Dy*f33), I+86);
	    double f34 = (3*B10 + pow(Cy,2));
	    update((C[0][1])*W[a]*(Cy*f34*(Dz*Zij + Qz)), I+87);
	    update((C[0][1])*W[a]*(Cy*Dz*Ix*f34), I+88);
	    update((C[0][0])*W[a]*(Cy*Dz*f34), I+89);
	    update((C[0][1])*W[a]*(Cy*f23*f34), I+90);
	    update((C[0][1])*W[a]*(Cy*Dx*Iz*f34), I+91);
	    update((C[0][0])*W[a]*(Cy*Dx*f34), I+92);
	    double f4 = (B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10));
	    update((C[0][1])*W[a]*(Cx*Cz*f4), I+93);
	    update((C[0][1])*W[a]*(Px*f4), I+94);
	    update((C[0][1])*W[a]*(Pz*f4), I+95);
	    double f5 = (Dx*(B10*(3*Cx + Xij) + Ix*pow(Cx,2)) + 2*B00*Cx*Xij + 3*B00*Px);
	    update((C[0][1])*W[a]*(Cy*f5), I+96);
	    update((C[0][1])*W[a]*(Cz*f5), I+97);
	    double f6 = (Cy*Dy*(3*B10 + pow(Cy,2)) + 3*B00*Py);
	    update((C[0][1])*W[a]*(Cx*(f6 + Yij*(Dy*Py + 2*B00*Cy))), I+98);
	    update((C[0][1])*W[a]*(Cz*(f6 + Yij*(Dy*Py + 2*B00*Cy))), I+99);
	    update((C[0][1])*W[a]*(Iz*f6), I+100);
	    update((C[0][1])*W[a]*(Ix*f6), I+101);
	    update((C[0][0])*W[a]*(f6), I+102);
	    double f8 = 3*B00*B10;
	    update((C[0][1])*W[a]*((B00*pow(Cy,2)*(4*Cy + 3*Yij) + 3*Dy*pow(B10,2) + 4*Cy*f8 + Dy*pow(Cy,4) + Dy*Yij*pow(Cy,3) + f8*Yij + 3*B10*Cy*Dy*(Yij + 2*Cy))), I+103);
	    update((C[0][1])*W[a]*(Cy*(B00*Cz*(3*Cz + 2*Zij) + Dz*Iz*pow(Cz,2) + B10*Dz*(3*Cz + Zij) + f8)), I+104);
	    update((C[0][1])*W[a]*(Cx*(B00*Cz*(3*Cz + 2*Zij) + Dz*Iz*pow(Cz,2) + B10*Dz*(3*Cz + Zij) + f8)), I+105);
	    update((C[0][1])*W[a]*((f8*Zij + 3*Dz*pow(B10,2) + B00*pow(Cz,2)*(4*Cz + 3*Zij) + Dz*pow(Cz,4) + 3*B10*Cz*Dz*(2*Cz + Zij) + 4*Cz*f8 + Dz*Zij*pow(Cz,3))), I+106);
	    update((C[0][1])*W[a]*(Iy*(3*B10*Cz*Dz + f8 + 3*B00*pow(Cz,2) + Dz*pow(Cz,3))), I+107);
	    update((C[0][1])*W[a]*(Ix*(3*B10*Cz*Dz + f8 + 3*B00*pow(Cz,2) + Dz*pow(Cz,3))), I+108);
	    update((C[0][0])*W[a]*((3*B10*Cz*Dz + f8 + 3*B00*pow(Cz,2) + Dz*pow(Cz,3))), I+109);
	    update((C[0][1])*W[a]*(Iz*(Dx*pow(Cx,3) + 3*B00*pow(Cx,2) + 3*B10*Cx*Dx + f8)), I+110);
	    update((C[0][1])*W[a]*(Iy*(Dx*pow(Cx,3) + 3*B00*pow(Cx,2) + 3*B10*Cx*Dx + f8)), I+111);
	    update((C[0][0])*W[a]*((Dx*pow(Cx,3) + 3*B00*pow(Cx,2) + 3*B10*Cx*Dx + f8)), I+112);
	    update((C[0][1])*W[a]*((Dx*Xij*pow(Cx,3) + f8*Xij + 3*Dx*pow(B10,2) + Dx*pow(Cx,4) + 4*Cx*f8 + B00*pow(Cx,2)*(4*Cx + 3*Xij) + 3*B10*Cx*Dx*(Xij + 2*Cx))), I+113);
	    double f9 = (Dy*Py + 2*B00*Cy);
	    update((C[0][1])*W[a]*(f9*(Px + Cx*Xij)), I+114);
	    update((C[0][1])*W[a]*(f9*(Cz*Zij + Pz)), I+115);
	    update((C[0][1])*W[a]*(Cz*Ix*f9), I+116);
	    update((C[0][0])*W[a]*(Cz*f9), I+117);
	    update((C[0][1])*W[a]*(Cx*Iz*f9), I+118);
	    update((C[0][0])*W[a]*(Cx*f9), I+119);
#undef pow

	}

    }

    /**
     * Permute the 120 values of I in place.
     *
     * The value found at position i on entry is stored at position slot[i]
     * on return; slot[] lists the same sequence that index(int) returns.
     * A scratch copy is taken first so that no value is overwritten before
     * it has been moved.
     *
     * @param I  array of 120 values, rearranged in place
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[120]) {
	const unsigned short slot[120] = {
	    83, 44, 85, 6, 47, 8, 9, 27, 7, 35,
	    5, 49, 58, 48, 73, 43, 89, 104, 84, 96,
	    86, 99, 95, 57, 59, 113, 115, 79, 39, 36,
	    74, 110, 70, 40, 100, 80, 105, 26, 25, 106,
	    72, 32, 19, 15, 17, 60, 67, 64, 103, 29,
	    109, 28, 21, 101, 119, 114, 116, 97, 107, 87,
	    98, 88, 108, 90, 50, 34, 24, 4, 33, 3,
	    23, 18, 16, 54, 93, 94, 53, 77, 38, 37,
	    78, 62, 12, 22, 2, 52, 42, 111, 91, 81,
	    11, 31, 1, 69, 63, 68, 13, 14, 65, 66,
	    71, 51, 41, 61, 118, 117, 112, 102, 92, 82,
	    30, 20, 0, 10, 55, 76, 56, 46, 75, 45
	};
	double T[120];
	for (int i = 0; i < 120; ++i) {
	    T[i] = I[i];
	}
	// Scatter every saved value to its destination slot.
	for (int i = 0; i < 120; ++i) {
	    I[slot[i]] = T[i];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[120] = { 112, 92, 84, 69, 67, 10, 3, 8, 5, 6, 113, 90, 82, 96, 97, 43, 72, 44, 71, 42, 111, 52, 83, 70, 66, 38, 37, 7, 51, 49, 110, 91, 41, 68, 65, 9, 29, 79, 78, 28, 33, 102, 86, 15, 1, 119, 117, 4, 13, 11, 64, 101, 85, 76, 73, 114, 116, 23, 12, 24, 45, 103, 81, 94, 47, 98, 99, 46, 95, 93, 32, 100, 40, 14, 30, 118, 115, 77, 80, 27, 35, 89, 109, 0, 18, 2, 20, 59, 61, 16, 63, 88, 108, 74, 75, 22, 19, 57, 60, 21, 34, 53, 107, 48, 17, 36, 39, 58, 62, 50, 31, 87, 106, 25, 55, 26, 56, 105, 104, 54 };
// 	if (index < 120) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Look up one entry of the 120-entry reorder table.
     *
     * @param i  position of a value before reordering; not range-checked,
     *           so it must lie in [0, 120)
     * @return   the position reorder() stores that value at
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[] = {
	    83, 44, 85, 6, 47, 8, 9, 27, 7, 35,
	    5, 49, 58, 48, 73, 43, 89, 104, 84, 96,
	    86, 99, 95, 57, 59, 113, 115, 79, 39, 36,
	    74, 110, 70, 40, 100, 80, 105, 26, 25, 106,
	    72, 32, 19, 15, 17, 60, 67, 64, 103, 29,
	    109, 28, 21, 101, 119, 114, 116, 97, 107, 87,
	    98, 88, 108, 90, 50, 34, 24, 4, 33, 3,
	    23, 18, 16, 54, 93, 94, 53, 77, 38, 37,
	    78, 62, 12, 22, 2, 52, 42, 111, 91, 81,
	    11, 31, 1, 69, 63, 68, 13, 14, 65, 66,
	    71, 51, 41, 61, 118, 117, 112, 102, 92, 82,
	    30, 20, 0, 10, 55, 76, 56, 46, 75, 45
	};
	return slot[i];
    }

    /**
     * Write the complete 120-entry reorder table to idx[0..119].
     *
     * The values are the same sequence index(int) returns for
     * i = 0, 1, ..., 119.
     *
     * @tparam U   element type; each table entry is assigned to it as an int
     * @param idx  destination with room for at least 120 elements
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int slot[120] = {
	    83, 44, 85, 6, 47, 8, 9, 27, 7, 35,
	    5, 49, 58, 48, 73, 43, 89, 104, 84, 96,
	    86, 99, 95, 57, 59, 113, 115, 79, 39, 36,
	    74, 110, 70, 40, 100, 80, 105, 26, 25, 106,
	    72, 32, 19, 15, 17, 60, 67, 64, 103, 29,
	    109, 28, 21, 101, 119, 114, 116, 97, 107, 87,
	    98, 88, 108, 90, 50, 34, 24, 4, 33, 3,
	    23, 18, 16, 54, 93, 94, 53, 77, 38, 37,
	    78, 62, 12, 22, 2, 52, 42, 111, 91, 81,
	    11, 31, 1, 69, 63, 68, 13, 14, 65, 66,
	    71, 51, 41, 61, 118, 117, 112, 102, 92, 82,
	    30, 20, 0, 10, 55, 76, 56, 46, 75, 45
	};
	for (int i = 0; i < 120; ++i) {
	    idx[i] = slot[i];
	}
    }


};

/**
 * Quadrature kernel for the <P S | D S> braket (automatically generated).
 *
 * eval() adds, for each of M quadrature points, 18 weighted terms into
 * I[0..17].  reorder() and index() describe the permutation that moves the
 * terms from the order in which eval() emits them to the order produced by
 * reorder().
 */
template<>
struct impl<meta::braket<rysq::P, rysq::S, rysq::D, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    /// Point count used by the fixed-size eval() overload.
    static const int N = 2;

    /// Default accumulator: adds the contribution q to the slot Q points at.
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: evaluates N points into an 18-element array
     * using the default accumulator.
     *
     * A disabled variant of this signature in the generated source took
     * vector<3> for rAi, rBk, rAB, rij, rkl and vector<2> for t2 and W.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[18]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the generic overload below, with the default accumulator.
     *
     * @tparam M  number of points to process
     * @param  I  output with room for at least 18 values
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Generic evaluation: for every point a in [0, M) computes 18 terms and
     * hands each one to update(term, I + k), k = 0..17.
     *
     * @tparam M       number of points to process
     * @param A, B     scalars fed to the recurrence coefficients
     * @param rAi,rBk,rAB  indexable with [0..2]; inputs of
     *                 recurrence::coefficient<q>
     * @param rij,rkl  accepted for a uniform signature, not read here
     * @param t2, W    indexable with [0..M); W[a] scales every term of
     *                 point a (presumably Rys roots/weights - confirm
     *                 against the caller)
     * @param C        single coefficient multiplying every term
     * @param I        output with room for at least 18 values
     * @param update   callable invoked as update(value, destination)
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Per-point inputs, read once.
	    const double t = t2[a];
	    const double w = (C[0][0])*W[a];

	    const double B00 = 0.5*t;
	    const double B01 = recurrence::coefficient(1.0/B, A, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    // Shared sub-expressions; operand order kept as generated.
	    const double Qx = (Cx*Dx + B00);
	    const double Qy = (Cy*Dy + B00);
	    const double Qz = (Cz*Dz + B00);
	    const double Rx = (B01 + recurrence::pow<2>(Dx));
	    const double Ry = (B01 + recurrence::pow<2>(Dy));
	    const double Rz = (recurrence::pow<2>(Dz) + B01);

	    update(w*(Cz*Dx*Dy), I+0);
	    update(w*(Cx*Dy*Dz), I+1);
	    update(w*(Cy*Dx*Dz), I+2);
	    update(w*(Dz*Qx), I+3);
	    update(w*(Dy*Qx), I+4);
	    update(w*(Dx*Qy), I+5);
	    update(w*(Dz*Qy), I+6);
	    update(w*(Dx*Qz), I+7);
	    update(w*(Dy*Qz), I+8);
	    update(w*((2*B00*Dx + Cx*Rx)), I+9);
	    update(w*(Cy*Rx), I+10);
	    update(w*(Cz*Rx), I+11);
	    update(w*((2*B00*Dy + Cy*Ry)), I+12);
	    update(w*(Cz*Ry), I+13);
	    update(w*(Cx*Ry), I+14);
	    update(w*((2*B00*Dz + Cz*Rz)), I+15);
	    update(w*(Cx*Rz), I+16);
	    update(w*(Cy*Rz), I+17);

	}

    }

    /**
     * Permute I in place: the value at position i on entry ends up at
     * position index(i).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[18]) {
	double T[18];
	for (int i = 0; i < 18; ++i) {
	    T[i] = I[i];
	}
	for (int i = 0; i < 18; ++i) {
	    I[index(i)] = T[i];
	}
    }

// Disabled generator output, kept for reference (its table looks like the
// inverse of index(int)).
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
//
//
// 	const unsigned short index_[18] = { 9, 10, 11, 14, 12, 13, 16, 17, 15, 4, 5, 0, 3, 2, 7, 1, 6, 8 };
// 	if (index < 18) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Destination position of the i-th emitted term.
     * Not range-checked: i must lie in [0, 18).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[] = {
	    11, 15, 13, 12, 9, 10, 16, 14, 17,
	    0, 1, 2, 4, 5, 3, 8, 6, 7
	};
	return slot[i];
    }

    /// Write index(0), ..., index(17) to idx[0..17].
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int i = 0; i < 18; ++i) {
	    idx[i] = index(i);
	}
    }


};

/**
 * Quadrature kernel for the <S SP | S P> braket (automatically generated).
 *
 * eval() adds, for each of M quadrature points, 12 weighted terms into
 * I[0..11].  reorder() and index() describe the permutation that moves the
 * terms from the order in which eval() emits them to the order produced by
 * reorder().
 */
template<>
struct impl<meta::braket<rysq::S, rysq::SP, rysq::S, rysq::P> > {
    typedef void enable;
    static const bool value = true;
    /// Point count used by the fixed-size eval() overload.
    static const int N = 2;

    /// Default accumulator: adds the contribution q to the slot Q points at.
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: evaluates N points into a 12-element array
     * using the default accumulator.
     *
     * A disabled variant of this signature in the generated source took
     * vector<3> for rAi, rBk, rAB, rij, rkl and vector<2> for t2 and W.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double (&I)[12]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the generic overload below, with the default accumulator.
     *
     * @tparam M  number of points to process
     * @param  I  output with room for at least 12 values
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Generic evaluation: for every point a in [0, M) computes 12 terms and
     * hands each one to update(term, I + k), k = 0..11.
     *
     * @tparam M       number of points to process
     * @param A, B     scalars fed to the recurrence coefficients
     * @param rAi,rBk,rAB  indexable with [0..2]; inputs of
     *                 recurrence::coefficient<q>
     * @param rij,rkl  indexable with [0..2]; added to the C and D
     *                 coefficients respectively
     * @param t2, W    indexable with [0..M); W[a] scales every term of
     *                 point a (presumably Rys roots/weights - confirm
     *                 against the caller)
     * @param C        C[0][0] multiplies terms 3, 7 and 11; C[0][1]
     *                 multiplies all the others
     * @param I        output with room for at least 12 values
     * @param update   callable invoked as update(value, destination)
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Per-point inputs, read once.
	    const double t = t2[a];
	    const double w00 = (C[0][0])*W[a];
	    const double w01 = (C[0][1])*W[a];

	    const double B00 = 0.5*t;

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    // Shifted coefficients; operand order kept as generated.
	    const double Ix = (Cx + Xij);
	    const double Iy = (Cy + Yij);
	    const double Iz = (Cz + Zij);
	    const double Kx = (Xkl + Dx);
	    const double Ky = (Ykl + Dy);
	    const double Kz = (Dz + Zkl);

	    update(w01*((B00 + Ix*Kx)), I+0);
	    update(w01*(Iz*Kx), I+1);
	    update(w01*(Iy*Kx), I+2);
	    update(w00*(Kx), I+3);
	    update(w01*((B00 + Iy*Ky)), I+4);
	    update(w01*(Iz*Ky), I+5);
	    update(w01*(Ix*Ky), I+6);
	    update(w00*(Ky), I+7);
	    update(w01*((Iz*Kz + B00)), I+8);
	    update(w01*(Ix*Kz), I+9);
	    update(w01*(Iy*Kz), I+10);
	    update(w00*(Kz), I+11);

	}

    }

    /**
     * Permute I in place: the value at position i on entry ends up at
     * position index(i).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[12]) {
	double T[12];
	for (int i = 0; i < 12; ++i) {
	    T[i] = I[i];
	}
	for (int i = 0; i < 12; ++i) {
	    I[index(i)] = T[i];
	}
    }

// Disabled generator output, kept for reference (its table looks like the
// inverse of index(int)).
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
//
//
// 	const unsigned short index_[12] = { 3, 0, 2, 1, 7, 6, 4, 5, 11, 9, 10, 8 };
// 	if (index < 12) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Destination position of the i-th emitted term.
     * Not range-checked: i must lie in [0, 12).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[] = {
	    1, 3, 2, 0,
	    6, 7, 5, 4,
	    11, 9, 10, 8
	};
	return slot[i];
    }

    /// Write index(0), ..., index(11) to idx[0..11].
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int i = 0; i < 12; ++i) {
	    idx[i] = index(i);
	}
    }


};

/**
 * Quadrature kernel for the <F S | D S> braket (automatically generated).
 *
 * eval() adds, for each of M quadrature points, 60 weighted terms into
 * I[0..59].  reorder() and index() describe the permutation that moves the
 * terms from the order in which eval() emits them to the order produced by
 * reorder().
 */
template<>
struct impl<meta::braket<rysq::F, rysq::S, rysq::D, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    /// Point count used by the fixed-size eval() overload.
    static const int N = 3;

    /// Default accumulator: adds the contribution q to the slot Q points at.
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: evaluates N points into a 60-element array
     * using the default accumulator.
     *
     * A disabled variant of this signature in the generated source took
     * vector<3> for rAi, rBk, rAB, rij, rkl, t2 and W.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[60]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the generic overload below, with the default accumulator.
     *
     * @tparam M  number of points to process
     * @param  I  output with room for at least 60 values
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Generic evaluation: for every point a in [0, M) computes 60 terms and
     * hands each one to update(term, I + k), k = 0..59.
     *
     * @tparam M       number of points to process
     * @param A, B     scalars fed to the recurrence coefficients
     * @param rAi,rBk,rAB  indexable with [0..2]; inputs of
     *                 recurrence::coefficient<q>
     * @param rij,rkl  accepted for a uniform signature, not read here
     * @param t2, W    indexable with [0..M); W[a] scales every term of
     *                 point a (presumably Rys roots/weights - confirm
     *                 against the caller)
     * @param C        single coefficient multiplying every term
     * @param I        output with room for at least 60 values
     * @param update   callable invoked as update(value, destination)
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Per-point inputs, read once.
	    const double t = t2[a];
	    const double w = (C[0][0])*W[a];

	    const double B00 = 0.5*t;
	    const double B10 = recurrence::coefficient(1.0/A, B, t);
	    const double B01 = recurrence::coefficient(1.0/B, A, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    // Shared sub-expressions; operand order kept as generated.
	    const double Px = (B10 + recurrence::pow<2>(Cx));
	    const double Py = (B10 + recurrence::pow<2>(Cy));
	    const double Pz = (B10 + recurrence::pow<2>(Cz));
	    const double Qx = (Cx*Dx + B00);
	    const double Qy = (Cy*Dy + B00);
	    const double Qz = (Cz*Dz + B00);
	    const double Rx = (B01 + recurrence::pow<2>(Dx));
	    const double Ry = (B01 + recurrence::pow<2>(Dy));
	    const double Rz = (recurrence::pow<2>(Dz) + B01);

	    update(w*(Dy*Pz*Qx), I+0);
	    update(w*(Dz*Py*Qx), I+1);
	    update(w*(Dx*Pz*Qy), I+2);
	    update(w*(Cz*Qx*Qy), I+3);
	    update(w*(Dz*Px*Qy), I+4);
	    update(w*(Dx*Py*Qz), I+5);
	    update(w*(Cy*Qx*Qz), I+6);
	    update(w*(Dy*Px*Qz), I+7);
	    update(w*(Cx*Qy*Qz), I+8);
	    update(w*(Cy*Pz*Rx), I+9);
	    update(w*(Cz*Py*Rx), I+10);
	    update(w*(Cx*Pz*Ry), I+11);
	    update(w*(Cz*Px*Ry), I+12);
	    update(w*(Cy*Px*Rz), I+13);
	    update(w*(Cx*Py*Rz), I+14);

	    const double f0 = (3*B10 + recurrence::pow<2>(Cx));
	    update(w*((6*Cx*recurrence::pow<2>(B00) + Cx*Rx*f0 + 6*B00*Dx*Px)), I+15);
	    update(w*(Cx*Dy*Dz*f0), I+16);
	    update(w*(Cx*Rz*f0), I+17);
	    update(w*(Cx*Ry*f0), I+18);

	    const double f1 = (Dy*Py + 2*B00*Cy);
	    update(w*(Cz*Dx*f1), I+19);
	    update(w*(Cx*Dz*f1), I+20);
	    update(w*(Qx*f1), I+21);
	    update(w*(Qz*f1), I+22);

	    const double f11 = (2*B00*Dx + Cx*Rx);
	    update(w*(Cy*Cz*f11), I+23);
	    update(w*(Py*f11), I+24);
	    update(w*(Pz*f11), I+25);

	    const double f12 = (Dx*Px + 2*B00*Cx);
	    update(w*(Cz*Dy*f12), I+26);
	    update(w*(Cy*Dz*f12), I+27);
	    update(w*(Qz*f12), I+28);
	    update(w*(Qy*f12), I+29);

	    const double f14 = (Px*Rx + 2*recurrence::pow<2>(B00) + 4*B00*Cx*Dx);
	    update(w*(Cy*f14), I+30);
	    update(w*(Cz*f14), I+31);

	    const double f17 = (Cx*Dx*(3*B10 + recurrence::pow<2>(Cx)) + 3*B00*Px);
	    update(w*(Dz*f17), I+32);
	    update(w*(Dy*f17), I+33);

	    const double f19 = (3*B10 + recurrence::pow<2>(Cz));
	    update(w*((6*B00*Dz*Pz + 6*Cz*recurrence::pow<2>(B00) + Cz*Rz*f19)), I+34);
	    update(w*(Cz*Dx*Dy*f19), I+35);
	    update(w*(Cz*Ry*f19), I+36);
	    update(w*(Cz*Rx*f19), I+37);

	    const double f2 = (Dz*Pz + 2*B00*Cz);
	    update(w*(Cy*Dx*f2), I+38);
	    update(w*(Cx*Dy*f2), I+39);
	    update(w*(Qx*f2), I+40);
	    update(w*(Qy*f2), I+41);

	    const double f20 = (3*B10 + recurrence::pow<2>(Cy));
	    update(w*((6*Cy*recurrence::pow<2>(B00) + Cy*Ry*f20 + 6*B00*Dy*Py)), I+42);
	    update(w*(Cy*Dx*Dz*f20), I+43);
	    update(w*(Cy*Rz*f20), I+44);
	    update(w*(Cy*Rx*f20), I+45);

	    const double f21 = (2*recurrence::pow<2>(B00) + Pz*Rz + 4*B00*Cz*Dz);
	    update(w*(Cx*f21), I+46);
	    update(w*(Cy*f21), I+47);

	    const double f22 = (2*B00*Dy + Cy*Ry);
	    update(w*(Cx*Cz*f22), I+48);
	    update(w*(Pz*f22), I+49);
	    update(w*(Px*f22), I+50);

	    const double f3 = (4*B00*Cy*Dy + Py*Ry + 2*recurrence::pow<2>(B00));
	    update(w*(Cz*f3), I+51);
	    update(w*(Cx*f3), I+52);

	    const double f4 = (Cy*Dy*(3*B10 + recurrence::pow<2>(Cy)) + 3*B00*Py);
	    update(w*(Dx*f4), I+53);
	    update(w*(Dz*f4), I+54);

	    const double f7 = (2*B00*Dz + Cz*Rz);
	    update(w*(Cx*Cy*f7), I+55);
	    update(w*(Px*f7), I+56);
	    update(w*(Py*f7), I+57);

	    const double f8 = (3*B00*Pz + Cz*Dz*(3*B10 + recurrence::pow<2>(Cz)));
	    update(w*(Dx*f8), I+58);
	    update(w*(Dy*f8), I+59);

	}

    }

    /**
     * Permute I in place: the value at position i on entry ends up at
     * position index(i).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[60]) {
	double T[60];
	for (int i = 0; i < 60; ++i) {
	    T[i] = I[i];
	}
	for (int i = 0; i < 60; ++i) {
	    I[index(i)] = T[i];
	}
    }

// Disabled generator output, kept for reference (its table looks like the
// inverse of index(int)).
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
//
//
// 	const unsigned short index_[60] = { 15, 45, 37, 30, 31, 24, 10, 25, 9, 23, 18, 42, 36, 50, 12, 52, 51, 11, 49, 48, 17, 44, 34, 13, 56, 14, 57, 46, 47, 55, 33, 53, 35, 29, 26, 21, 19, 0, 2, 3, 32, 43, 58, 27, 28, 1, 5, 40, 38, 6, 16, 54, 59, 4, 7, 20, 22, 39, 41, 8 };
// 	if (index < 60) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Destination position of the i-th emitted term.
     * Not range-checked: i must lie in [0, 60).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[] = {
	    37, 45, 38, 39, 53, 46, 49, 54, 59, 8,
	    6, 17, 14, 23, 25, 0, 50, 20, 10, 36,
	    55, 35, 56, 9, 5, 7, 34, 43, 44, 33,
	    3, 4, 40, 30, 22, 32, 12, 2, 48, 57,
	    47, 58, 11, 41, 21, 1, 27, 28, 19, 18,
	    13, 16, 15, 31, 51, 29, 24, 26, 42, 52
	};
	return slot[i];
    }

    /// Write index(0), ..., index(59) to idx[0..59].
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int i = 0; i < 60; ++i) {
	    idx[i] = index(i);
	}
    }


};

/**
 * Quadrature kernel for the <S P | P SP> braket (automatically generated).
 *
 * eval() adds, for each of M quadrature points, 36 weighted terms into
 * I[0..35].  reorder() and index() describe the permutation that moves the
 * terms from the order in which eval() emits them to the order produced by
 * reorder().
 */
template<>
struct impl<meta::braket<rysq::S, rysq::P, rysq::P, rysq::SP> > {
    typedef void enable;
    static const bool value = true;
    /// Point count used by the fixed-size eval() overload.
    static const int N = 2;

    /// Default accumulator: adds the contribution q to the slot Q points at.
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: evaluates N points into a 36-element array
     * using the default accumulator.
     *
     * A disabled variant of this signature in the generated source took
     * vector<3> for rAi, rBk, rAB, rij, rkl and vector<2> for t2 and W.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double (&I)[36]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the generic overload below, with the default accumulator.
     *
     * @tparam M  number of points to process
     * @param  I  output with room for at least 36 values
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Generic evaluation: for every point a in [0, M) computes 36 terms and
     * hands each one to update(term, I + k), k = 0..35.
     *
     * @tparam M       number of points to process
     * @param A, B     scalars fed to the recurrence coefficients
     * @param rAi,rBk,rAB  indexable with [0..2]; inputs of
     *                 recurrence::coefficient<q>
     * @param rij,rkl  indexable with [0..2]; added to the C and D
     *                 coefficients respectively
     * @param t2, W    indexable with [0..M); W[a] scales every term of
     *                 point a (presumably Rys roots/weights - confirm
     *                 against the caller)
     * @param C        C[0][0] multiplies terms 0-5, 23, 26 and 33;
     *                 C[1][0] multiplies all the others
     * @param I        output with room for at least 36 values
     * @param update   callable invoked as update(value, destination)
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Per-point inputs, read once.
	    const double t = t2[a];
	    const double w00 = (C[0][0])*W[a];
	    const double w10 = (C[1][0])*W[a];

	    const double B00 = 0.5*t;
	    const double B01 = recurrence::coefficient(1.0/B, A, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    // Shifted coefficients; operand order kept as generated.
	    const double Ix = (Cx + Xij);
	    const double Iy = (Cy + Yij);
	    const double Iz = (Cz + Zij);
	    const double Kx = (Xkl + Dx);
	    const double Ky = (Ykl + Dy);
	    const double Kz = (Dz + Zkl);

	    update(w00*(Dz*Ix), I+0);
	    update(w00*(Dy*Ix), I+1);
	    update(w00*(Dz*Iy), I+2);
	    update(w00*(Dx*Iy), I+3);
	    update(w00*(Dy*Iz), I+4);
	    update(w00*(Dx*Iz), I+5);
	    update(w10*(Dz*Iy*Kx), I+6);
	    update(w10*(Dy*Iz*Kx), I+7);
	    update(w10*(Dz*Ix*Ky), I+8);
	    update(w10*(Dx*Iz*Ky), I+9);
	    update(w10*(Dy*Ix*Kz), I+10);
	    update(w10*(Dx*Iy*Kz), I+11);
	    update(w10*((Ix*(B01 + Dx*Kx) + B00*(Xkl + 2*Dx))), I+12);
	    update(w10*((B00*(Ykl + 2*Dy) + Iy*(B01 + Dy*Ky))), I+13);
	    update(w10*((B00*(2*Dz + Zkl) + Iz*(Dz*Kz + B01))), I+14);

	    const double f10 = (Dz*Kz + B01);
	    update(w10*(Ix*f10), I+15);
	    update(w10*(Iy*f10), I+16);

	    const double f11 = (B01 + Dy*Ky);
	    update(w10*(Ix*f11), I+17);
	    update(w10*(Iz*f11), I+18);

	    const double f12 = (B00 + Iy*Ky);
	    update(w10*(Dx*f12), I+19);
	    update(w10*(Dz*f12), I+20);

	    const double f13 = (Dy*Iy + B00);
	    update(w10*(Kz*f13), I+21);
	    update(w10*(Kx*f13), I+22);
	    update(w00*(f13), I+23);

	    const double f14 = (Dx*Ix + B00);
	    update(w10*(Kz*f14), I+24);
	    update(w10*(Ky*f14), I+25);
	    update(w00*(f14), I+26);

	    const double f3 = (B00 + Ix*Kx);
	    update(w10*(Dy*f3), I+27);
	    update(w10*(Dz*f3), I+28);

	    const double f4 = (Iz*Kz + B00);
	    update(w10*(Dx*f4), I+29);
	    update(w10*(Dy*f4), I+30);

	    const double f5 = (B00 + Dz*Iz);
	    update(w10*(Kx*f5), I+31);
	    update(w10*(Ky*f5), I+32);
	    update(w00*(f5), I+33);

	    const double f7 = (B01 + Dx*Kx);
	    update(w10*(Iy*f7), I+34);
	    update(w10*(Iz*f7), I+35);

	}

    }

    /**
     * Permute I in place: the value at position i on entry ends up at
     * position index(i).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[36]) {
	double T[36];
	for (int i = 0; i < 36; ++i) {
	    T[i] = I[i];
	}
	for (int i = 0; i < 36; ++i) {
	    I[index(i)] = T[i];
	}
    }

// Disabled generator output, kept for reference (its table looks like the
// inverse of index(int)).
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
//
//
// 	const unsigned short index_[36] = { 26, 3, 5, 1, 23, 4, 0, 2, 33, 12, 34, 35, 27, 22, 7, 28, 6, 31, 25, 19, 9, 17, 13, 18, 8, 20, 32, 24, 11, 29, 10, 21, 30, 15, 16, 14 };
// 	if (index < 36) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Destination position of the i-th emitted term.
     * Not range-checked: i must lie in [0, 36).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[] = {
	    6, 3, 7, 1, 5, 2, 16, 14, 24, 20, 30, 28,
	    9, 22, 35, 33, 34, 21, 23, 19, 25, 31, 13, 4,
	    27, 18, 0, 12, 15, 29, 32, 17, 26, 8, 10, 11
	};
	return slot[i];
    }

    /// Write index(0), ..., index(35) to idx[0..35].
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int i = 0; i < 36; ++i) {
	    idx[i] = index(i);
	}
    }


};

template<>
struct impl<meta::braket<rysq::D, rysq::S, rysq::S, rysq::D> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /// Default accumulator: adds the contribution q to the value Q points at.
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q = *Q + q;
	}
    };

    /**
     * Fixed-size entry point: evaluates 3 points into a 36-element array
     * using a default-constructed update accumulator.
     *
     * All arguments are forwarded unchanged to the generic eval<M> overload
     * that takes an explicit accumulator.  A disabled variant of this
     * signature in the generated source took vector<3> for rAi, rBk, rAB,
     * rij, rkl, t2 and W.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[36]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer-output entry point: evaluates M points into I using a
     * default-constructed update accumulator.
     *
     * All arguments are forwarded unchanged to the generic eval<M> overload
     * that takes an explicit accumulator.
     *
     * @tparam M  number of points to process
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * General kernel for this braket specialization.
     *
     * For every point a in [0, M) the loop derives the scalar terms B00, B10,
     * B01 and the per-axis terms Cx..Cz, Dx..Dz from t2[a], builds the
     * intermediates K*, P* and f*, and then calls update(value, I + n) exactly
     * once for each n in 0..35. Every value has the form
     * C[0][0] * W[a] * (polynomial in the intermediates).
     *
     * @tparam M      number of points processed (loop bound)
     * @tparam U      functor type, invoked as update(double, double*)
     * @param A, B    scalars entering the recurrence coefficients
     *                (NOTE(review): presumably the two exponent sums -- confirm)
     * @param rAi, rBk, rAB  indexed with [0], [1], [2]
     * @param rij     not read by this specialization
     * @param rkl     indexed with [0], [1], [2] (bound to Xkl, Ykl, Zkl)
     * @param t2, W   indexed with [a], a in [0, M)
     * @param C       coefficient array; only C[0][0] is read
     * @param I       output base pointer; I+0 .. I+35 are handed to update
     * @param update  functor that receives each computed value and its slot
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {


	// Components of rkl, added to Dx/Dy/Dz below to form Kx/Ky/Kz.
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Scalar recurrence terms for point a.
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // Per-axis terms built from rAi (with +B) and rBk (with -A).
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Local shorthand: pow(x,y) expands to the compile-time integer
	    // power recurrence::pow<y>(x); it is removed by the #undef below.
#define pow(x,y) recurrence::pow<y>((x))

 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));

	    // One update call per output slot I+0 .. I+35.
	    update((C[0][0])*W[a]*(Ky*Kz*Px), I+0);
	    update((C[0][0])*W[a]*(Kx*Kz*Py), I+1);
	    update((C[0][0])*W[a]*(Kx*Ky*Pz), I+2);
	    double f0 = (2*B00*Ky + Cy*(pow(Ky,2) + B01));
	    update((C[0][0])*W[a]*(Cz*f0), I+3);
	    update((C[0][0])*W[a]*(Cx*f0), I+4);
	    double f10 = (2*B00*Kx + Cx*(pow(Kx,2) + B01));
	    update((C[0][0])*W[a]*(Cy*f10), I+5);
	    update((C[0][0])*W[a]*(Cz*f10), I+6);
	    double f11 = (B00 + Cx*Kx);
	    update((C[0][0])*W[a]*(Cz*Ky*f11), I+7);
	    update((C[0][0])*W[a]*(Cy*Kz*f11), I+8);
	    double f12 = (B00 + Cy*Ky);
	    update((C[0][0])*W[a]*(Cz*Kx*f12), I+9);
	    update((C[0][0])*W[a]*(Cx*Kz*f12), I+10);
	    update((C[0][0])*W[a]*(f11*f12), I+11);
	    double f15 = (pow(Ky,2) + B01);
	    update((C[0][0])*W[a]*(Cx*Cz*f15), I+12);
	    update((C[0][0])*W[a]*(Pz*f15), I+13);
	    update((C[0][0])*W[a]*(Px*f15), I+14);
	    double f16 = (Kz*Pz + 2*B00*Cz);
	    update((C[0][0])*W[a]*(Kx*f16), I+15);
	    update((C[0][0])*W[a]*(Ky*f16), I+16);
	    double f18 = (B00 + Cz*Kz);
	    update((C[0][0])*W[a]*(Cy*Kx*f18), I+17);
	    update((C[0][0])*W[a]*(Cx*Ky*f18), I+18);
	    update((C[0][0])*W[a]*(f11*f18), I+19);
	    update((C[0][0])*W[a]*(f12*f18), I+20);
	    double f19 = (Ky*Py + 2*B00*Cy);
	    update((C[0][0])*W[a]*(Kx*f19), I+21);
	    update((C[0][0])*W[a]*(Kz*f19), I+22);
	    double f3 = (Kx*Px + 2*B00*Cx);
	    update((C[0][0])*W[a]*(Kz*f3), I+23);
	    update((C[0][0])*W[a]*(Ky*f3), I+24);
	    double f4 = (pow(Kz,2) + B01);
	    update((C[0][0])*W[a]*(Cx*Cy*f4), I+25);
	    update((C[0][0])*W[a]*(Px*f4), I+26);
	    update((C[0][0])*W[a]*(Py*f4), I+27);
	    double f6 = (2*B00*Kz + Cz*(pow(Kz,2) + B01));
	    update((C[0][0])*W[a]*(Cx*f6), I+28);
	    update((C[0][0])*W[a]*(Cy*f6), I+29);
	    double f7 = B01*B10;
	    double f14 = 2*pow(B00,2);
	    update((C[0][0])*W[a]*((f14 + B01*pow(Cy,2) + f7 + Py*pow(Ky,2) + 4*B00*Cy*Ky)), I+30);
	    update((C[0][0])*W[a]*((Pz*pow(Kz,2) + f14 + B01*pow(Cz,2) + f7 + 4*B00*Cz*Kz)), I+31);
	    update((C[0][0])*W[a]*((B01*pow(Cx,2) + f14 + Px*pow(Kx,2) + f7 + 4*B00*Cx*Kx)), I+32);
	    double f9 = (pow(Kx,2) + B01);
	    update((C[0][0])*W[a]*(Cy*Cz*f9), I+33);
	    update((C[0][0])*W[a]*(Py*f9), I+34);
	    update((C[0][0])*W[a]*(Pz*f9), I+35);
#undef pow

	}

    }

    /**
     * Permutes the 36 values of I in place: the value originally stored at
     * position i ends up at position target[i].
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[36]) {
	const unsigned short target[36] = {
	    30, 25, 20, 11, 9, 3, 4, 22, 27, 23,
	    33, 21, 10, 8, 6, 26, 32, 29, 34, 28,
	    35, 19, 31, 24, 18, 15, 12, 13, 16, 17,
	    7, 14, 0, 5, 1, 2
	};
	// Snapshot the input so that the scatter below never reads a slot
	// that has already been overwritten.
	double saved[36];
	for (int i = 0; i < 36; ++i) {
	    saved[i] = I[i];
	}
	for (int i = 0; i < 36; ++i) {
	    I[target[i]] = saved[i];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[36] = { 32, 34, 35, 5, 6, 33, 14, 30, 13, 4, 12, 3, 26, 27, 31, 25, 28, 29, 24, 21, 2, 11, 7, 9, 23, 1, 15, 8, 19, 17, 0, 22, 16, 10, 18, 20 };
// 	if (index < 36) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the reordered position of computed element i.
     * No range check is performed; i must lie in [0, 36).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short target[] = {
	    30, 25, 20, 11, 9, 3, 4, 22, 27, 23,
	    33, 21, 10, 8, 6, 26, 32, 29, 34, 28,
	    35, 19, 31, 24, 18, 15, 12, 13, 16, 17,
	    7, 14, 0, 5, 1, 2
	};
	return target[i];
    }

    /** Writes the 36 reordered positions, in order, to idx[0] .. idx[35]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int target[36] = {
	    30, 25, 20, 11, 9, 3, 4, 22, 27, 23,
	    33, 21, 10, 8, 6, 26, 32, 29, 34, 28,
	    35, 19, 31, 24, 18, 15, 12, 13, 16, 17,
	    7, 14, 0, 5, 1, 2
	};
	for (int k = 0; k < 36; ++k) {
	    *idx++ = target[k];
	}
    }


};

template<>
struct impl<meta::braket<rysq::F, rysq::S, rysq::SP, rysq::P> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /** Default accumulation functor: adds a contribution onto the target element. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &value, double *target) const {
	    target[0] = target[0] + value;
	}
    };

    /**
     * Convenience entry point for a fixed-size result buffer of 120 doubles.
     * Forwards to the general kernel with M fixed to 3 and a default-constructed
     * update functor.
     */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline void eval(double A, double B,
			    const RAI &rAi, const RBK &rBk, const RAB &rAB,
			    const RIJ &rij, const RKL &rkl,
			    const T2_ &t2, const W_ &W,
			    const double (&C)[2][1],
			    double (&I)[120]) {
	const update accumulate = update();
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Entry point for a caller-supplied point count M and a raw output pointer.
     * Forwards to the general kernel with a default-constructed update functor.
     */
    template<int M, class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline void eval(double A, double B,
			    const RAI &rAi, const RBK &rBk, const RAB &rAB,
			    const RIJ &rij, const RKL &rkl,
			    const T2_ &t2, const W_ &W,
			    const double (&C)[2][1],
			    double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * General kernel for this braket specialization.
     *
     * For every point a in [0, M) the loop derives the scalar terms B00, B10,
     * B01 and the per-axis terms Cx..Cz, Dx..Dz from t2[a], builds the
     * intermediates K*, P*, Q* and f*, and then calls update(value, I + n)
     * exactly once for each n in 0..119. Every value has the form
     * C[k][0] * W[a] * (polynomial in the intermediates) with k = 0 or 1.
     *
     * @tparam M      number of points processed (loop bound)
     * @tparam U      functor type, invoked as update(double, double*)
     * @param A, B    scalars entering the recurrence coefficients
     *                (NOTE(review): presumably the two exponent sums -- confirm)
     * @param rAi, rBk, rAB  indexed with [0], [1], [2]
     * @param rij     not read by this specialization
     * @param rkl     indexed with [0], [1], [2] (bound to Xkl, Ykl, Zkl)
     * @param t2, W   indexed with [a], a in [0, M)
     * @param C       coefficient array; C[0][0] and C[1][0] are read
     * @param I       output base pointer; I+0 .. I+119 are handed to update
     * @param update  functor that receives each computed value and its slot
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {


	// Components of rkl; they appear both in Kx/Ky/Kz and directly in
	// several of the expressions below.
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Scalar recurrence terms for point a.
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // Per-axis terms built from rAi (with +B) and rBk (with -A).
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Local shorthand: pow(x,y) expands to the compile-time integer
	    // power recurrence::pow<y>(x); it is removed by the #undef below.
#define pow(x,y) recurrence::pow<y>((x))

 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    // One update call per output slot I+0 .. I+119.
	    update((C[0][0])*W[a]*(Cy*Kz*Px), I+0);
	    update((C[0][0])*W[a]*(Cz*Ky*Px), I+1);
	    update((C[0][0])*W[a]*(Cz*Kx*Py), I+2);
	    update((C[0][0])*W[a]*(Cx*Kz*Py), I+3);
	    update((C[0][0])*W[a]*(Cx*Ky*Pz), I+4);
	    update((C[0][0])*W[a]*(Cy*Kx*Pz), I+5);
	    update((C[1][0])*W[a]*(Kz*Py*Qx), I+6);
	    update((C[1][0])*W[a]*(Ky*Pz*Qx), I+7);
	    update((C[1][0])*W[a]*(Kz*Px*Qy), I+8);
	    update((C[1][0])*W[a]*(Kx*Pz*Qy), I+9);
	    update((C[1][0])*W[a]*(Ky*Px*Qz), I+10);
	    update((C[1][0])*W[a]*(Kx*Py*Qz), I+11);
	    update((C[1][0])*W[a]*(Cx*Qz*(Cy*Ykl + Qy)), I+12);
	    update((C[0][0])*W[a]*(Cx*Cz*(Cy*Ykl + Qy)), I+13);
	    update((C[1][0])*W[a]*(Cz*Qx*(Cy*Ykl + Qy)), I+14);
	    update((C[1][0])*W[a]*(Dx*Pz*(Cy*Ykl + Qy)), I+15);
	    update((C[0][0])*W[a]*(Pz*(Cy*Ykl + Qy)), I+16);
	    update((C[1][0])*W[a]*(Dz*Px*(Cy*Ykl + Qy)), I+17);
	    update((C[0][0])*W[a]*(Px*(Cy*Ykl + Qy)), I+18);
	    update((C[1][0])*W[a]*(Cy*Qx*(Cz*Zkl + Qz)), I+19);
	    update((C[0][0])*W[a]*(Cx*Cy*(Cz*Zkl + Qz)), I+20);
	    update((C[1][0])*W[a]*(Cx*Qy*(Cz*Zkl + Qz)), I+21);
	    update((C[1][0])*W[a]*(Dx*Py*(Cz*Zkl + Qz)), I+22);
	    update((C[0][0])*W[a]*(Py*(Cz*Zkl + Qz)), I+23);
	    update((C[1][0])*W[a]*(Dy*Px*(Cz*Zkl + Qz)), I+24);
	    update((C[0][0])*W[a]*(Px*(Cz*Zkl + Qz)), I+25);
	    double f0 = (3*B10 + pow(Cx,2));
	    update((C[1][0])*W[a]*(Cx*Dz*Ky*f0), I+26);
	    update((C[0][0])*W[a]*(Cx*Ky*f0), I+27);
	    update((C[1][0])*W[a]*(Cx*Dy*Kz*f0), I+28);
	    update((C[0][0])*W[a]*(Cx*Kz*f0), I+29);
	    double f1 = (Cx*Kx*(3*B10 + pow(Cx,2)) + 3*B00*Px);
	    update((C[1][0])*W[a]*(Dz*f1), I+30);
	    update((C[1][0])*W[a]*(Dy*f1), I+31);
	    update((C[0][0])*W[a]*(f1), I+32);
	    double f10 = (2*B00*Cx*(Xkl + 2*Dx) + 2*pow(B00,2) + Px*(B01 + Dx*Kx));
	    update((C[1][0])*W[a]*(Cz*f10), I+33);
	    update((C[1][0])*W[a]*(Cy*f10), I+34);
	    double f11 = (B01 + Dx*Kx);
	    update((C[1][0])*W[a]*(Cz*Py*f11), I+35);
	    update((C[1][0])*W[a]*(Cy*Pz*f11), I+36);
	    double f14 = (Kx*Px + 2*B00*Cx);
	    update((C[1][0])*W[a]*(Qy*f14), I+37);
	    update((C[1][0])*W[a]*(Qz*f14), I+38);
	    update((C[1][0])*W[a]*(Cy*Dz*f14), I+39);
	    update((C[0][0])*W[a]*(Cy*f14), I+40);
	    update((C[1][0])*W[a]*(Cz*Dy*f14), I+41);
	    update((C[0][0])*W[a]*(Cz*f14), I+42);
	    double f16 = (Dy*Py + 2*B00*Cy);
	    update((C[1][0])*W[a]*(Qz*(f16 + Py*Ykl)), I+43);
	    update((C[1][0])*W[a]*(Qx*(f16 + Py*Ykl)), I+44);
	    update((C[1][0])*W[a]*(Cz*Dx*(f16 + Py*Ykl)), I+45);
	    update((C[0][0])*W[a]*(Cz*(f16 + Py*Ykl)), I+46);
	    update((C[1][0])*W[a]*(Cx*Dz*(f16 + Py*Ykl)), I+47);
	    update((C[0][0])*W[a]*(Cx*(f16 + Py*Ykl)), I+48);
	    update((C[1][0])*W[a]*(Cx*Kz*f16), I+49);
	    update((C[1][0])*W[a]*(Cz*Kx*f16), I+50);
	    update((C[1][0])*W[a]*(f16*(Cz*Zkl + Qz)), I+51);
	    double f17 = (2*pow(B00,2) + Pz*(Dz*Kz + B01) + 2*B00*Cz*(2*Dz + Zkl));
	    update((C[1][0])*W[a]*(Cx*f17), I+52);
	    update((C[1][0])*W[a]*(Cy*f17), I+53);
	    double f2 = (Dz*Pz + 2*B00*Cz);
	    update((C[1][0])*W[a]*(Cy*Dx*(Pz*Zkl + f2)), I+54);
	    update((C[1][0])*W[a]*(Cx*Dy*(Pz*Zkl + f2)), I+55);
	    update((C[0][0])*W[a]*(Cx*(Pz*Zkl + f2)), I+56);
	    update((C[1][0])*W[a]*(Qx*(Pz*Zkl + f2)), I+57);
	    update((C[1][0])*W[a]*(Qy*(Pz*Zkl + f2)), I+58);
	    update((C[0][0])*W[a]*(Cy*(Pz*Zkl + f2)), I+59);
	    update((C[1][0])*W[a]*(Cx*Ky*f2), I+60);
	    update((C[1][0])*W[a]*(Cy*Kx*f2), I+61);
	    update((C[1][0])*W[a]*(f2*(Cy*Ykl + Qy)), I+62);
	    double f21 = (Dx*Px + 2*B00*Cx);
	    update((C[1][0])*W[a]*(f21*(Cy*Ykl + Qy)), I+63);
	    update((C[1][0])*W[a]*(Cy*Kz*f21), I+64);
	    update((C[1][0])*W[a]*(f21*(Cz*Zkl + Qz)), I+65);
	    update((C[1][0])*W[a]*(Cz*Ky*f21), I+66);
	    double f22 = (Dz*Kz + B01);
	    update((C[1][0])*W[a]*(Cx*f0*f22), I+67);
	    update((C[1][0])*W[a]*(Cy*Px*f22), I+68);
	    update((C[1][0])*W[a]*(Cx*Py*f22), I+69);
	    double f23 = (B01 + Dy*Ky);
	    update((C[1][0])*W[a]*(Cx*Pz*f23), I+70);
	    update((C[1][0])*W[a]*(Cx*f0*f23), I+71);
	    update((C[1][0])*W[a]*(Cz*Px*f23), I+72);
	    double f25 = (B00 + Cx*Kx);
	    update((C[1][0])*W[a]*(Cz*Qy*f25), I+73);
	    update((C[1][0])*W[a]*(Cy*Qz*f25), I+74);
	    update((C[0][0])*W[a]*(Cy*Cz*f25), I+75);
	    update((C[1][0])*W[a]*(f16*f25), I+76);
	    update((C[1][0])*W[a]*(f2*f25), I+77);
	    update((C[1][0])*W[a]*(Dz*Py*f25), I+78);
	    update((C[0][0])*W[a]*(Py*f25), I+79);
	    update((C[1][0])*W[a]*(Dy*Pz*f25), I+80);
	    update((C[0][0])*W[a]*(Pz*f25), I+81);
	    double f3 = (Py*(B01 + Dy*Ky) + 2*pow(B00,2) + 2*B00*Cy*(Ykl + 2*Dy));
	    update((C[1][0])*W[a]*(Cz*f3), I+82);
	    update((C[1][0])*W[a]*(Cx*f3), I+83);
	    double f31 = (3*B10 + pow(Cz,2));
	    update((C[1][0])*W[a]*(Cz*f23*f31), I+84);
	    update((C[1][0])*W[a]*(Cz*f11*f31), I+85);
	    update((C[1][0])*W[a]*(Cz*Dy*Kx*f31), I+86);
	    update((C[0][0])*W[a]*(Cz*Kx*f31), I+87);
	    update((C[1][0])*W[a]*(Cz*Dx*Ky*f31), I+88);
	    update((C[0][0])*W[a]*(Cz*Ky*f31), I+89);
	    double f32 = (3*B10 + pow(Cy,2));
	    update((C[1][0])*W[a]*(Cy*Dx*Kz*f32), I+90);
	    update((C[0][0])*W[a]*(Cy*Kz*f32), I+91);
	    update((C[1][0])*W[a]*(Cy*f22*f32), I+92);
	    update((C[1][0])*W[a]*(Cy*f11*f32), I+93);
	    update((C[1][0])*W[a]*(Cy*Dz*Kx*f32), I+94);
	    update((C[0][0])*W[a]*(Cy*Kx*f32), I+95);
	    double f36 = (B01*Cz + B00*(2*Dz + Zkl) + Cz*Dz*Kz);
	    update((C[1][0])*W[a]*(Cx*Cy*f36), I+96);
	    update((C[1][0])*W[a]*(Px*f36), I+97);
	    update((C[1][0])*W[a]*(Py*f36), I+98);
	    double f4 = (Cy*Dy*Ky + B01*Cy + B00*(Ykl + 2*Dy));
	    update((C[1][0])*W[a]*(Cx*Cz*f4), I+99);
	    update((C[1][0])*W[a]*(Pz*f4), I+100);
	    update((C[1][0])*W[a]*(Px*f4), I+101);
	    double f5 = (3*B00*Py + Cy*Ky*(3*B10 + pow(Cy,2)));
	    update((C[1][0])*W[a]*(Dx*f5), I+102);
	    update((C[1][0])*W[a]*(Dz*f5), I+103);
	    update((C[0][0])*W[a]*(f5), I+104);
	    double f6 = (Cy*Dy*(3*B10 + pow(Cy,2)) + 3*B00*Py);
	    update((C[1][0])*W[a]*(Kx*f6), I+105);
	    update((C[1][0])*W[a]*(Kz*f6), I+106);
	    double f7 = 3*B00*B10;
	    update((C[1][0])*W[a]*((Dy*(2*f7 + Dy*pow(Cy,3) + 3*B10*Cy*Dy) + 6*Cy*pow(B00,2) + Ykl*(f7 + Dy*pow(Cy,3) + 3*B10*Cy*Dy) + B01*(3*B10*Cy + pow(Cy,3)) + 3*B00*pow(Cy,2)*(Ykl + 2*Dy))), I+107);
	    update((C[1][0])*W[a]*((Xkl*(Dx*pow(Cx,3) + 3*B10*Cx*Dx + f7) + Dx*(Dx*pow(Cx,3) + 3*B10*Cx*Dx + 2*f7) + 3*B00*pow(Cx,2)*(Xkl + 2*Dx) + 6*Cx*pow(B00,2) + B01*(3*B10*Cx + pow(Cx,3)))), I+108);
	    update((C[1][0])*W[a]*(Kz*(Dx*pow(Cx,3) + 3*B00*pow(Cx,2) + 3*B10*Cx*Dx + f7)), I+109);
	    update((C[1][0])*W[a]*(Ky*(Dx*pow(Cx,3) + 3*B00*pow(Cx,2) + 3*B10*Cx*Dx + f7)), I+110);
	    update((C[1][0])*W[a]*(Ky*(3*B10*Cz*Dz + f7 + 3*B00*pow(Cz,2) + Dz*pow(Cz,3))), I+111);
	    update((C[1][0])*W[a]*(Kx*(3*B10*Cz*Dz + f7 + 3*B00*pow(Cz,2) + Dz*pow(Cz,3))), I+112);
	    update((C[1][0])*W[a]*((Dz*(3*B10*Cz*Dz + 2*f7 + Dz*pow(Cz,3)) + 3*B00*pow(Cz,2)*(2*Dz + Zkl) + Zkl*(3*B10*Cz*Dz + f7 + Dz*pow(Cz,3)) + B01*(3*B10*Cz + pow(Cz,3)) + 6*Cz*pow(B00,2))), I+113);
	    update((C[1][0])*W[a]*(Dy*(Cz*Kz*f31 + f7 + 3*B00*pow(Cz,2))), I+114);
	    update((C[0][0])*W[a]*((Cz*Kz*f31 + f7 + 3*B00*pow(Cz,2))), I+115);
	    update((C[1][0])*W[a]*(Dx*(Cz*Kz*f31 + f7 + 3*B00*pow(Cz,2))), I+116);
	    double f8 = (B01*Cx + Cx*Dx*Kx + B00*(Xkl + 2*Dx));
	    update((C[1][0])*W[a]*(Cy*Cz*f8), I+117);
	    update((C[1][0])*W[a]*(Pz*f8), I+118);
	    update((C[1][0])*W[a]*(Py*f8), I+119);
#undef pow

	}

    }

    /**
     * Permutes the 120 values of I in place: the value originally stored at
     * position i ends up at position target[i].
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[120]) {
	const unsigned short target[120] = {
	    83, 44, 6, 85, 47, 8, 95, 57, 103, 28,
	    74, 36, 79, 49, 59, 58, 48, 73, 43, 99,
	    89, 109, 96, 86, 104, 84, 70, 40, 100, 80,
	    30, 20, 0, 14, 13, 16, 18, 23, 34, 33,
	    3, 24, 4, 76, 55, 56, 46, 75, 45, 105,
	    26, 106, 117, 118, 98, 107, 87, 97, 108, 88,
	    77, 38, 78, 53, 93, 94, 54, 110, 113, 115,
	    67, 60, 64, 29, 39, 9, 25, 37, 35, 5,
	    27, 7, 66, 65, 62, 12, 22, 2, 52, 42,
	    91, 81, 111, 11, 31, 1, 119, 114, 116, 69,
	    68, 63, 51, 71, 41, 21, 101, 61, 10, 90,
	    50, 72, 32, 112, 102, 82, 92, 19, 17, 15
	};
	// Snapshot the input so that the scatter below never reads a slot
	// that has already been overwritten.
	double saved[120];
	for (int i = 0; i < 120; ++i) {
	    saved[i] = I[i];
	}
	for (int i = 0; i < 120; ++i) {
	    I[target[i]] = saved[i];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[120] = { 32, 95, 87, 40, 42, 79, 2, 81, 5, 75, 108, 93, 85, 34, 33, 119, 35, 118, 36, 117, 31, 105, 86, 37, 41, 76, 50, 80, 9, 73, 30, 94, 112, 39, 38, 78, 11, 77, 61, 74, 27, 104, 89, 18, 1, 48, 46, 4, 16, 13, 110, 102, 88, 63, 66, 44, 45, 7, 15, 14, 71, 107, 84, 101, 72, 83, 82, 70, 100, 99, 26, 103, 111, 17, 10, 47, 43, 60, 62, 12, 29, 91, 115, 0, 25, 3, 23, 56, 59, 20, 109, 90, 116, 64, 65, 6, 22, 57, 54, 19, 28, 106, 114, 8, 24, 49, 51, 55, 58, 21, 67, 92, 113, 68, 97, 69, 98, 52, 53, 96 };
// 	if (index < 120) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the reordered position of computed element i.
     * No range check is performed; i must lie in [0, 120).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short target[] = {
	    83, 44, 6, 85, 47, 8, 95, 57, 103, 28,
	    74, 36, 79, 49, 59, 58, 48, 73, 43, 99,
	    89, 109, 96, 86, 104, 84, 70, 40, 100, 80,
	    30, 20, 0, 14, 13, 16, 18, 23, 34, 33,
	    3, 24, 4, 76, 55, 56, 46, 75, 45, 105,
	    26, 106, 117, 118, 98, 107, 87, 97, 108, 88,
	    77, 38, 78, 53, 93, 94, 54, 110, 113, 115,
	    67, 60, 64, 29, 39, 9, 25, 37, 35, 5,
	    27, 7, 66, 65, 62, 12, 22, 2, 52, 42,
	    91, 81, 111, 11, 31, 1, 119, 114, 116, 69,
	    68, 63, 51, 71, 41, 21, 101, 61, 10, 90,
	    50, 72, 32, 112, 102, 82, 92, 19, 17, 15
	};
	return target[i];
    }

    /** Writes the 120 reordered positions, in order, to idx[0] .. idx[119]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int target[120] = {
	    83, 44, 6, 85, 47, 8, 95, 57, 103, 28,
	    74, 36, 79, 49, 59, 58, 48, 73, 43, 99,
	    89, 109, 96, 86, 104, 84, 70, 40, 100, 80,
	    30, 20, 0, 14, 13, 16, 18, 23, 34, 33,
	    3, 24, 4, 76, 55, 56, 46, 75, 45, 105,
	    26, 106, 117, 118, 98, 107, 87, 97, 108, 88,
	    77, 38, 78, 53, 93, 94, 54, 110, 113, 115,
	    67, 60, 64, 29, 39, 9, 25, 37, 35, 5,
	    27, 7, 66, 65, 62, 12, 22, 2, 52, 42,
	    91, 81, 111, 11, 31, 1, 119, 114, 116, 69,
	    68, 63, 51, 71, 41, 21, 101, 61, 10, 90,
	    50, 72, 32, 112, 102, 82, 92, 19, 17, 15
	};
	for (int k = 0; k < 120; ++k) {
	    *idx++ = target[k];
	}
    }


};

template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::P, rysq::D> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 2;

    /** Default accumulation functor: adds a contribution onto the target element. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &value, double *target) const {
	    target[0] = target[0] + value;
	}
    };

    /**
     * Convenience entry point for a fixed-size result buffer of 18 doubles.
     * Forwards to the general kernel with M fixed to 2 and a default-constructed
     * update functor.
     */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline void eval(double A, double B,
			    const RAI &rAi, const RBK &rBk, const RAB &rAB,
			    const RIJ &rij, const RKL &rkl,
			    const T2_ &t2, const W_ &W,
			    const double (&C)[1][1],
			    double (&I)[18]) {
	const update accumulate = update();
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Entry point for a caller-supplied point count M and a raw output pointer.
     * Forwards to the general kernel with a default-constructed update functor.
     */
    template<int M, class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline void eval(double A, double B,
			    const RAI &rAi, const RBK &rBk, const RAB &rAB,
			    const RIJ &rij, const RKL &rkl,
			    const T2_ &t2, const W_ &W,
			    const double (&C)[1][1],
			    double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * General kernel for this braket specialization.
     *
     * For every point a in [0, M) the loop derives the scalar term B01 and the
     * per-axis terms Dx..Dz from t2[a], builds the intermediates K* and f*,
     * and then calls update(value, I + n) exactly once for each n in 0..17.
     * Every value has the form C[0][0] * W[a] * (polynomial in the
     * intermediates).
     *
     * @tparam M      number of points processed (loop bound)
     * @tparam U      functor type, invoked as update(double, double*)
     * @param A, B    scalars entering the recurrence coefficients
     *                (NOTE(review): presumably the two exponent sums -- confirm)
     * @param rAi     not read by this specialization
     * @param rBk, rAB  indexed with [0], [1], [2]
     * @param rij     not read by this specialization
     * @param rkl     indexed with [0], [1], [2] (bound to Xkl, Ykl, Zkl)
     * @param t2, W   indexed with [a], a in [0, M)
     * @param C       coefficient array; only C[0][0] is read
     * @param I       output base pointer; I+0 .. I+17 are handed to update
     * @param update  functor that receives each computed value and its slot
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {


	// Components of rkl; they appear both in Kx/Ky/Kz and directly in
	// some of the expressions below.
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Only B01 and the D terms are needed by this specialization.
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);


	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Local shorthand: pow(x,y) expands to the compile-time integer
	    // power recurrence::pow<y>(x); it is removed by the #undef below.
#define pow(x,y) recurrence::pow<y>((x))

 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);

	    // One update call per output slot I+0 .. I+17.
	    update((C[0][0])*W[a]*(Dz*Kx*Ky), I+0);
	    update((C[0][0])*W[a]*(Dy*Kx*Kz), I+1);
	    update((C[0][0])*W[a]*(Dx*Ky*Kz), I+2);
	    update((C[0][0])*W[a]*((B01*(2*Xkl + 3*Dx) + Dx*pow(Kx,2))), I+3);
	    update((C[0][0])*W[a]*((Dy*pow(Ky,2) + B01*(3*Dy + 2*Ykl))), I+4);
	    update((C[0][0])*W[a]*((B01*(2*Zkl + 3*Dz) + Dz*pow(Kz,2))), I+5);
	    double f2 = (pow(Kz,2) + B01);
	    update((C[0][0])*W[a]*(Dx*f2), I+6);
	    update((C[0][0])*W[a]*(Dy*f2), I+7);
	    double f3 = (B01 + Dx*Kx);
	    update((C[0][0])*W[a]*(Kz*f3), I+8);
	    update((C[0][0])*W[a]*(Ky*f3), I+9);
	    double f5 = (pow(Kx,2) + B01);
	    update((C[0][0])*W[a]*(Dy*f5), I+10);
	    update((C[0][0])*W[a]*(Dz*f5), I+11);
	    double f6 = (Dz*Kz + B01);
	    update((C[0][0])*W[a]*(Kx*f6), I+12);
	    update((C[0][0])*W[a]*(Ky*f6), I+13);
	    double f7 = (pow(Ky,2) + B01);
	    update((C[0][0])*W[a]*(Dz*f7), I+14);
	    update((C[0][0])*W[a]*(Dx*f7), I+15);
	    double f8 = (B01 + Dy*Ky);
	    update((C[0][0])*W[a]*(Kx*f8), I+16);
	    update((C[0][0])*W[a]*(Kz*f8), I+17);
#undef pow

	}

    }

    /**
     * Permutes the 18 values of I in place: the value originally stored at
     * position i ends up at position target[i].
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[18]) {
	const unsigned short target[18] = {
	    11, 13, 15, 0, 4, 8, 6, 7, 12, 9,
	    1, 2, 14, 17, 5, 3, 10, 16
	};
	// Snapshot the input so that the scatter below never reads a slot
	// that has already been overwritten.
	double saved[18];
	for (int i = 0; i < 18; ++i) {
	    saved[i] = I[i];
	}
	for (int i = 0; i < 18; ++i) {
	    I[target[i]] = saved[i];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[18] = { 3, 10, 11, 15, 4, 14, 6, 7, 5, 9, 16, 0, 8, 1, 12, 2, 17, 13 };
// 	if (index < 18) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the reordered position of computed element i.
     * No range check is performed; i must lie in [0, 18).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short target[] = {
	    11, 13, 15, 0, 4, 8, 6, 7, 12, 9,
	    1, 2, 14, 17, 5, 3, 10, 16
	};
	return target[i];
    }

    /** Writes the 18 reordered positions, in order, to idx[0] .. idx[17]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int target[18] = {
	    11, 13, 15, 0, 4, 8, 6, 7, 12, 9,
	    1, 2, 14, 17, 5, 3, 10, 16
	};
	for (int k = 0; k < 18; ++k) {
	    *idx++ = target[k];
	}
    }


};

template<>
struct impl<meta::braket<rysq::SP, rysq::P, rysq::SP, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 2;

    /** Default accumulation functor: adds a contribution onto the target element. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &value, double *target) const {
	    target[0] = target[0] + value;
	}
    };

    /**
     * Convenience entry point for a fixed-size result buffer of 48 doubles.
     * Forwards to the general kernel with M fixed to 2 and a default-constructed
     * update functor.
     */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline void eval(double A, double B,
			    const RAI &rAi, const RBK &rBk, const RAB &rAB,
			    const RIJ &rij, const RKL &rkl,
			    const T2_ &t2, const W_ &W,
			    const double (&C)[2][2],
			    double (&I)[48]) {
	const update accumulate = update();
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Entry point for a caller-supplied point count M and a raw output pointer.
     * Forwards to the general kernel with a default-constructed update functor.
     */
    template<int M, class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline void eval(double A, double B,
			    const RAI &rAi, const RBK &rBk, const RAB &rAB,
			    const RIJ &rij, const RKL &rkl,
			    const T2_ &t2, const W_ &W,
			    const double (&C)[2][2],
			    double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * General kernel for this braket specialization.
     *
     * For every point a in [0, M) the loop derives the scalar terms B00, B10
     * and the per-axis terms Cx..Cz, Dx..Dz from t2[a], builds the
     * intermediates I{x,y,z} and Q{x,y,z}, and then calls
     * update(value, I + n) exactly once for each n in 0..47. Every value has
     * the form C[i][j] * W[a] * (polynomial in the intermediates), with i and
     * j each 0 or 1.
     *
     * @tparam M      number of points processed (loop bound)
     * @tparam U      functor type, invoked as update(double, double*)
     * @param A, B    scalars entering the recurrence coefficients
     *                (NOTE(review): presumably the two exponent sums -- confirm)
     * @param rAi, rBk, rAB  indexed with [0], [1], [2]
     * @param rij     indexed with [0], [1], [2] (bound to Xij, Yij, Zij)
     * @param rkl     not read by this specialization
     * @param t2, W   indexed with [a], a in [0, M)
     * @param C       coefficient array; all four entries are read
     * @param I       output base pointer; I+0 .. I+47 are handed to update
     * @param update  functor that receives each computed value and its slot
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2],
	      double *I, const U &update) {

	// Components of rij; they appear both in Ix/Iy/Iz and directly in
	// several of the expressions below.
	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];



#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Scalar recurrence terms for point a.
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    // Per-axis terms built from rAi (with +B) and rBk (with -A).
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // The pow shorthand is defined here as in the sibling kernels, but
	    // no expression in this body uses it.
#define pow(x,y) recurrence::pow<y>((x))

 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    // One update call per output slot I+0 .. I+47.
	    update((C[1][1])*W[a]*(Dy*(Cx*Ix + B10)), I+0);
	    update((C[1][1])*W[a]*(Dz*(Cx*Ix + B10)), I+1);
	    update((C[0][1])*W[a]*((Cx*Ix + B10)), I+2);
	    update((C[1][0])*W[a]*(Dy*Ix), I+3);
	    update((C[1][1])*W[a]*(Cz*Dy*Ix), I+4);
	    update((C[0][1])*W[a]*(Cz*Ix), I+5);
	    update((C[1][0])*W[a]*(Dz*Ix), I+6);
	    update((C[1][1])*W[a]*(Cy*Dz*Ix), I+7);
	    update((C[0][1])*W[a]*(Cy*Ix), I+8);
	    update((C[0][0])*W[a]*(Ix), I+9);
	    update((C[1][1])*W[a]*(Dx*(Cy*Iy + B10)), I+10);
	    update((C[0][1])*W[a]*((Cy*Iy + B10)), I+11);
	    update((C[1][1])*W[a]*(Dz*(Cy*Iy + B10)), I+12);
	    update((C[1][1])*W[a]*(Cz*Dx*Iy), I+13);
	    update((C[0][1])*W[a]*(Cz*Iy), I+14);
	    update((C[0][1])*W[a]*(Cx*Iy), I+15);
	    update((C[1][1])*W[a]*(Cx*Dz*Iy), I+16);
	    update((C[1][0])*W[a]*(Dz*Iy), I+17);
	    update((C[1][0])*W[a]*(Dx*Iy), I+18);
	    update((C[0][0])*W[a]*(Iy), I+19);
	    update((C[1][1])*W[a]*(Dx*(B10 + Cz*Iz)), I+20);
	    update((C[0][1])*W[a]*((B10 + Cz*Iz)), I+21);
	    update((C[1][1])*W[a]*(Dy*(B10 + Cz*Iz)), I+22);
	    update((C[1][0])*W[a]*(Dx*Iz), I+23);
	    update((C[1][1])*W[a]*(Cy*Dx*Iz), I+24);
	    update((C[0][1])*W[a]*(Cy*Iz), I+25);
	    update((C[1][0])*W[a]*(Dy*Iz), I+26);
	    update((C[1][1])*W[a]*(Cx*Dy*Iz), I+27);
	    update((C[0][1])*W[a]*(Cx*Iz), I+28);
	    update((C[0][0])*W[a]*(Iz), I+29);
	    update((C[1][1])*W[a]*(Iz*Qx), I+30);
	    update((C[1][1])*W[a]*(Iy*Qx), I+31);
	    update((C[1][1])*W[a]*(Iz*Qy), I+32);
	    update((C[1][1])*W[a]*(Ix*Qy), I+33);
	    update((C[1][1])*W[a]*(Ix*Qz), I+34);
	    update((C[1][1])*W[a]*(Iy*Qz), I+35);
	    update((C[1][1])*W[a]*((Dx*(Cx*Ix + B10) + B00*(Xij + 2*Cx))), I+36);
	    update((C[1][1])*W[a]*(Cz*(Dx*Xij + Qx)), I+37);
	    update((C[1][1])*W[a]*(Cy*(Dx*Xij + Qx)), I+38);
	    update((C[1][0])*W[a]*((Dx*Xij + Qx)), I+39);
	    update((C[1][1])*W[a]*((B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10))), I+40);
	    update((C[1][1])*W[a]*(Cz*(Dy*Yij + Qy)), I+41);
	    update((C[1][1])*W[a]*(Cx*(Dy*Yij + Qy)), I+42);
	    update((C[1][0])*W[a]*((Dy*Yij + Qy)), I+43);
	    update((C[1][1])*W[a]*((Dz*(B10 + Cz*Iz) + B00*(2*Cz + Zij))), I+44);
	    update((C[1][1])*W[a]*(Cy*(Dz*Zij + Qz)), I+45);
	    update((C[1][1])*W[a]*(Cx*(Dz*Zij + Qz)), I+46);
	    update((C[1][0])*W[a]*((Dz*Zij + Qz)), I+47);
#undef pow

	}

    }

    /**
     * Permutes the 48 values of I in place: the value originally stored at
     * position i ends up at position target[i].
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[48]) {
	const unsigned short target[48] = {
	    25, 37, 1, 24, 27, 3, 36, 38, 2, 0,
	    18, 6, 42, 19, 7, 5, 41, 40, 16, 4,
	    23, 11, 35, 20, 22, 10, 32, 33, 9, 8,
	    21, 17, 34, 26, 39, 43, 13, 15, 14, 12,
	    30, 31, 29, 28, 47, 46, 45, 44
	};
	// Snapshot the input so that the scatter below never reads a slot
	// that has already been overwritten.
	double saved[48];
	for (int i = 0; i < 48; ++i) {
	    saved[i] = I[i];
	}
	for (int i = 0; i < 48; ++i) {
	    I[target[i]] = saved[i];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[48] = { 9, 2, 8, 5, 19, 15, 11, 14, 29, 28, 25, 21, 39, 36, 38, 37, 18, 31, 10, 13, 23, 30, 24, 20, 3, 0, 33, 4, 43, 42, 40, 41, 26, 27, 32, 22, 6, 1, 7, 34, 17, 16, 12, 35, 47, 46, 45, 44 };
// 	if (index < 48) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Looks up the reordered position of entry @p i.
     *
     * @param i position in the range 0..47; the value is not range-checked
     * @return the position that reorder() moves entry @p i to
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short target[48] = {
            25, 37, 1, 24, 27, 3, 36, 38, 2, 0, 18, 6,
            42, 19, 7, 5, 41, 40, 16, 4, 23, 11, 35, 20,
            22, 10, 32, 33, 9, 8, 21, 17, 34, 26, 39, 43,
            13, 15, 14, 12, 30, 31, 29, 28, 47, 46, 45, 44
        };
        return target[i];
    }

    /**
     * Writes the complete 48-entry reordering table to @p idx.
     *
     * @param idx destination with room for 48 elements; idx[k] receives the
     *            same value that index(int) returns for k
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        // kept as int so each store converts from int, as the literals did
        const int target[48] = {
            25, 37, 1, 24, 27, 3, 36, 38, 2, 0, 18, 6,
            42, 19, 7, 5, 41, 40, 16, 4, 23, 11, 35, 20,
            22, 10, 32, 33, 9, 8, 21, 17, 34, 26, 39, 43,
            13, 15, 14, 12, 30, 31, 29, 28, 47, 46, 45, 44
        };
        for (int k = 0; k < 48; ++k) {
            idx[k] = target[k];
        }
    }


};

/**
 * Kernel specialization for meta::braket<rysq::S, rysq::S, rysq::P, rysq::P>.
 *
 * eval() adds 9 weighted terms to the output buffer for every point it
 * visits; reorder() and index() expose the fixed permutation of those 9
 * output positions.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::P, rysq::P> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulator used by eval(): adds @p q to the value at @p Q. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &q, double *Q) const {
            *Q += q;
        }
    };

    /**
     * Evaluates 2 points into a fixed 9-element array using the default
     * update accumulator.
     */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double (&I)[9]) {
        eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Evaluates @p M points into the buffer @p I (at least 9 elements) using
     * the default update accumulator.
     */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core evaluation: for each point a in [0, M) computes 9 terms scaled by
     * C[0][0]*W[a] and hands each one to @p update together with its output
     * slot I+0 .. I+8.
     *
     * @param A, B    scalars entering the recurrence coefficients
     * @param rBk     3-component input for the D coefficients
     * @param rAB     3-component input for the D coefficients
     * @param rkl     3-component offset added to D to form K
     * @param t2, W   per-point inputs indexed by a (presumably squared roots
     *                and weights of the quadrature -- confirm with caller)
     * @param C       scaling coefficient
     * @param I       output buffer, written only through @p update
     * @param update  callable invoked as update(value, slot)
     *
     * rAi and rij are accepted for signature uniformity but are not read here.
     */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_,
             class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I, const U &update) {

        const double &Xkl = rkl[0];
        const double &Ykl = rkl[1];
        const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int a = 0; a < M; ++a) {
            const double t2a = t2[a];
            // common prefactor of every term at this point
            const double cw = C[0][0]*W[a];

            const double B01 = recurrence::coefficient(1.0/B, A, t2a);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2a);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2a);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2a);

            const double Kx = (Xkl + Dx);
            const double Ky = (Ykl + Dy);
            const double Kz = (Dz + Zkl);

            update(cw*(B01 + Dx*Kx), I + 0);
            update(cw*(Dy*Kx), I + 1);
            update(cw*(Dz*Kx), I + 2);
            update(cw*(B01 + Dy*Ky), I + 3);
            update(cw*(Dz*Ky), I + 4);
            update(cw*(Dx*Ky), I + 5);
            update(cw*(Dz*Kz + B01), I + 6);
            update(cw*(Dx*Kz), I + 7);
            update(cw*(Dy*Kz), I + 8);
        }
    }

    /**
     * Permutes the 9 entries of @p I in place: the value at position k on
     * entry ends up at position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[9]) {
        // snapshot first so no entry is overwritten before it is moved
        double saved[9];
        for (int k = 0; k < 9; ++k) {
            saved[k] = I[k];
        }
        for (int k = 0; k < 9; ++k) {
            I[index(k)] = saved[k];
        }
    }

    /**
     * Returns the reordered position of entry @p i (0..8, not range-checked).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short target[9] = {
            0, 1, 2, 4, 5, 3, 8, 6, 7
        };
        return target[i];
    }

    /** Writes the 9 values index(0) .. index(8) to idx[0] .. idx[8]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        for (int k = 0; k < 9; ++k) {
            idx[k] = index(k);
        }
    }


};

/**
 * Kernel specialization for meta::braket<rysq::S, rysq::D, rysq::S, rysq::P>.
 *
 * eval() adds 18 weighted terms to the output buffer for every point it
 * visits; reorder() and index() expose the fixed permutation of those 18
 * output positions.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::D, rysq::S, rysq::P> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulator used by eval(): adds @p q to the value at @p Q. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &q, double *Q) const {
            *Q += q;
        }
    };

    /**
     * Evaluates 2 points into a fixed 18-element array using the default
     * update accumulator.
     */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double (&I)[18]) {
        eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Evaluates @p M points into the buffer @p I (at least 18 elements) using
     * the default update accumulator.
     */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core evaluation: for each point a in [0, M) computes 18 terms scaled by
     * C[0][0]*W[a] and hands each one to @p update together with its output
     * slot I+0 .. I+17.
     *
     * @param A, B      scalars entering the recurrence coefficients
     * @param rAi, rBk  3-component inputs for the C and D coefficients
     * @param rAB       3-component input shared by the C and D coefficients
     * @param rij, rkl  3-component offsets added to C and D to form I and K
     * @param t2, W     per-point inputs indexed by a (presumably squared roots
     *                  and weights of the quadrature -- confirm with caller)
     * @param C         scaling coefficient
     * @param I         output buffer, written only through @p update
     * @param update    callable invoked as update(value, slot)
     */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_,
             class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I, const U &update) {

        const double &Xij = rij[0];
        const double &Yij = rij[1];
        const double &Zij = rij[2];

        const double &Xkl = rkl[0];
        const double &Ykl = rkl[1];
        const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int a = 0; a < M; ++a) {
            const double t2a = t2[a];
            // common prefactor of every term at this point
            const double cw = C[0][0]*W[a];

            const double B00 = 0.5*t2a;
            const double B10 = recurrence::coefficient(1.0/A, B, t2a);

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2a);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2a);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2a);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2a);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2a);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2a);

            const double Ix = (Cx + Xij);
            const double Iy = (Cy + Yij);
            const double Iz = (Cz + Zij);
            const double Kx = (Xkl + Dx);
            const double Ky = (Ykl + Dy);
            const double Kz = (Dz + Zkl);

            update(cw*(2*B00*Ix + Kx*(B10 + recurrence::pow<2>(Ix))), I + 0);
            update(cw*(Iy*Iz*Kx), I + 1);
            update(cw*(2*B00*Iy + Ky*(B10 + recurrence::pow<2>(Iy))), I + 2);
            update(cw*(Ix*Iz*Ky), I + 3);
            update(cw*(2*B00*Iz + Kz*(B10 + recurrence::pow<2>(Iz))), I + 4);
            update(cw*(Ix*Iy*Kz), I + 5);

            // shared sub-expressions, each feeding the two terms that follow
            const double f10 = (Iz*Kz + B00);
            update(cw*(Ix*f10), I + 6);
            update(cw*(Iy*f10), I + 7);

            const double f11 = (B10 + recurrence::pow<2>(Ix));
            update(cw*(Ky*f11), I + 8);
            update(cw*(Kz*f11), I + 9);

            const double f3 = (B00 + Ix*Kx);
            update(cw*(Iy*f3), I + 10);
            update(cw*(Iz*f3), I + 11);

            const double f7 = (B10 + recurrence::pow<2>(Iy));
            update(cw*(Kx*f7), I + 12);
            update(cw*(Kz*f7), I + 13);

            const double f8 = (B00 + Iy*Ky);
            update(cw*(Iz*f8), I + 14);
            update(cw*(Ix*f8), I + 15);

            const double f9 = (B10 + recurrence::pow<2>(Iz));
            update(cw*(Kx*f9), I + 16);
            update(cw*(Ky*f9), I + 17);
        }
    }

    /**
     * Permutes the 18 entries of @p I in place: the value at position k on
     * entry ends up at position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[18]) {
        // snapshot first so no entry is overwritten before it is moved
        double saved[18];
        for (int k = 0; k < 18; ++k) {
            saved[k] = I[k];
        }
        for (int k = 0; k < 18; ++k) {
            I[index(k)] = saved[k];
        }
    }

    /**
     * Returns the reordered position of entry @p i (0..17, not range-checked).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short target[18] = {
            0, 5, 7, 10, 14, 15, 16, 17, 6, 12, 3, 4, 1, 13, 11, 9, 2, 8
        };
        return target[i];
    }

    /** Writes the 18 values index(0) .. index(17) to idx[0] .. idx[17]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        for (int k = 0; k < 18; ++k) {
            idx[k] = index(k);
        }
    }


};

/**
 * Kernel specialization for meta::braket<rysq::S, rysq::S, rysq::P, rysq::SP>.
 *
 * eval() adds 12 weighted terms to the output buffer for every point it
 * visits: slots 0..2 are scaled by C[0][0], slots 3..11 by C[1][0].
 * reorder() and index() expose the fixed permutation of those 12 positions.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::P, rysq::SP> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulator used by eval(): adds @p q to the value at @p Q. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &q, double *Q) const {
            *Q += q;
        }
    };

    /**
     * Evaluates 2 points into a fixed 12-element array using the default
     * update accumulator.
     */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][1],
              double (&I)[12]) {
        eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Evaluates @p M points into the buffer @p I (at least 12 elements) using
     * the default update accumulator.
     */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][1],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core evaluation: for each point a in [0, M) computes 12 terms and hands
     * each one to @p update together with its output slot I+0 .. I+11.
     *
     * @param A, B    scalars entering the recurrence coefficients
     * @param rBk     3-component input for the D coefficients
     * @param rAB     3-component input for the D coefficients
     * @param rkl     3-component offset added to D to form K
     * @param t2, W   per-point inputs indexed by a (presumably squared roots
     *                and weights of the quadrature -- confirm with caller)
     * @param C       two scaling coefficients, C[0][0] and C[1][0]
     * @param I       output buffer, written only through @p update
     * @param update  callable invoked as update(value, slot)
     *
     * rAi and rij are accepted for signature uniformity but are not read here.
     */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_,
             class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][1],
              double *I, const U &update) {

        const double &Xkl = rkl[0];
        const double &Ykl = rkl[1];
        const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int a = 0; a < M; ++a) {
            const double t2a = t2[a];
            // prefactors of the two groups of terms at this point
            const double cw0 = C[0][0]*W[a];
            const double cw1 = C[1][0]*W[a];

            const double B01 = recurrence::coefficient(1.0/B, A, t2a);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2a);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2a);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2a);

            const double Kx = (Xkl + Dx);
            const double Ky = (Ykl + Dy);
            const double Kz = (Dz + Zkl);

            // first group: scaled by C[0][0]
            update(cw0*(Dx), I + 0);
            update(cw0*(Dy), I + 1);
            update(cw0*(Dz), I + 2);

            // second group: scaled by C[1][0]
            update(cw1*(B01 + Dx*Kx), I + 3);
            update(cw1*(Dy*Kx), I + 4);
            update(cw1*(Dz*Kx), I + 5);
            update(cw1*(B01 + Dy*Ky), I + 6);
            update(cw1*(Dz*Ky), I + 7);
            update(cw1*(Dx*Ky), I + 8);
            update(cw1*(Dz*Kz + B01), I + 9);
            update(cw1*(Dx*Kz), I + 10);
            update(cw1*(Dy*Kz), I + 11);
        }
    }

    /**
     * Permutes the 12 entries of @p I in place: the value at position k on
     * entry ends up at position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[12]) {
        // snapshot first so no entry is overwritten before it is moved
        double saved[12];
        for (int k = 0; k < 12; ++k) {
            saved[k] = I[k];
        }
        for (int k = 0; k < 12; ++k) {
            I[index(k)] = saved[k];
        }
    }

    /**
     * Returns the reordered position of entry @p i (0..11, not range-checked).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short target[12] = {
            0, 1, 2, 3, 4, 5, 7, 8, 6, 11, 9, 10
        };
        return target[i];
    }

    /** Writes the 12 values index(0) .. index(11) to idx[0] .. idx[11]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        for (int k = 0; k < 12; ++k) {
            idx[k] = index(k);
        }
    }


};

/**
 * Kernel specialization for meta::braket<rysq::P, rysq::P, rysq::P, rysq::P>.
 *
 * eval() adds 81 weighted terms to the output buffer for every point it
 * visits; reorder() and index() expose the fixed permutation of those 81
 * output positions.
 */
template<>
struct impl<meta::braket<rysq::P, rysq::P, rysq::P, rysq::P> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 3;

    /** Default accumulator used by eval(): adds @p q to the value at @p Q. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &q, double *Q) const {
            *Q += q;
        }
    };

    /**
     * Evaluates 3 points into a fixed 81-element array using the default
     * update accumulator.
     */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double (&I)[81]) {
        eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Evaluates @p M points into the buffer @p I (at least 81 elements) using
     * the default update accumulator.
     */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core evaluation: for each point a in [0, M) computes 81 terms scaled by
     * C[0][0]*W[a] and hands each one to @p update together with its output
     * slot I+0 .. I+80.
     *
     * @param A, B      scalars entering the recurrence coefficients
     * @param rAi, rBk  3-component inputs for the C and D coefficients
     * @param rAB       3-component input shared by the C and D coefficients
     * @param rij, rkl  3-component offsets added to C and D to form I and K
     * @param t2, W     per-point inputs indexed by a (presumably squared roots
     *                  and weights of the quadrature -- confirm with caller)
     * @param C         scaling coefficient
     * @param I         output buffer, written only through @p update
     * @param update    callable invoked as update(value, slot)
     *
     * The fNN temporaries are shared sub-expressions of the generated
     * formulas; each is defined right before its first use.
     */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_,
             class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I, const U &update) {

        const double &Xij = rij[0];
        const double &Yij = rij[1];
        const double &Zij = rij[2];

        const double &Xkl = rkl[0];
        const double &Ykl = rkl[1];
        const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int a = 0; a < M; ++a) {
            const double t2a = t2[a];
            // common prefactor of every term at this point
            const double cw = C[0][0]*W[a];

            const double B00 = 0.5*t2a;
            const double B10 = recurrence::coefficient(1.0/A, B, t2a);
            const double B01 = recurrence::coefficient(1.0/B, A, t2a);

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2a);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2a);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2a);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2a);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2a);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2a);

            const double Ix = (Cx + Xij);
            const double Iy = (Cy + Yij);
            const double Iz = (Cz + Zij);
            const double Kx = (Xkl + Dx);
            const double Ky = (Ykl + Dy);
            const double Kz = (Dz + Zkl);
            const double Qx = (Cx*Dx + B00);
            const double Qy = (Cy*Dy + B00);
            const double Qz = (Cz*Dz + B00);

            update(cw*(Iz*Ky*Qx), I + 0);
            update(cw*(Iy*Kz*Qx), I + 1);
            update(cw*(Ix*Kz*Qy), I + 2);
            update(cw*(Iz*Kx*Qy), I + 3);
            update(cw*(Ix*Ky*Qz), I + 4);
            update(cw*(Iy*Kx*Qz), I + 5);
            update(cw*(Cy*Kz*(Dx*Xij + Qx)), I + 6);
            update(cw*(Cz*Ky*(Dx*Xij + Qx)), I + 7);
            update(cw*(Dy*Iz*(Cx*Xkl + Qx)), I + 8);
            update(cw*(Dz*Iy*(Cx*Xkl + Qx)), I + 9);
            update(cw*((Cz*Zkl + Qz)*(Dx*Xij + Qx)), I + 10);
            update(cw*(Dy*Ix*(Cz*Zkl + Qz)), I + 11);
            update(cw*(Dx*Iy*(Cz*Zkl + Qz)), I + 12);

            const double f1 = (B00*(Ykl + 2*Dy) + Iy*(B01 + Dy*Ky));
            update(cw*(Cx*f1), I + 13);
            update(cw*(Cz*f1), I + 14);

            const double f10 = (B00 + Dz*Iz);
            update(cw*(f10*(Cx*Xkl + Qx)), I + 15);
            update(cw*(Cy*Kx*f10), I + 16);
            update(cw*(Cx*Ky*f10), I + 17);

            const double f12 = (Dx*(Cx*Ix + B10) + B00*(Xij + 2*Cx));
            update(cw*(Ky*f12), I + 18);
            update(cw*(Kz*f12), I + 19);

            const double f13 = (B01 + Dx*Kx);
            update(cw*(Cy*Iz*f13), I + 20);
            update(cw*(Cz*Iy*f13), I + 21);

            const double f14 = (Cy*Iy + B10);
            update(cw*(Dz*Kx*f14), I + 22);
            update(cw*(Dx*Kz*f14), I + 23);
            update(cw*(f13*f14), I + 24);

            const double f16 = (B00*(Yij + 2*Cy) + Ky*(Cy*Iy + B10));
            update(cw*(Dx*f16), I + 25);
            update(cw*(Dz*f16), I + 26);

            const double f18 = (B01*Cz + B00*(2*Dz + Zkl) + Cz*Dz*Kz);
            update(cw*(Cx*(Zij*(Dz*Kz + B01) + f18)), I + 27);
            update(cw*(Cy*(Zij*(Dz*Kz + B01) + f18)), I + 28);
            update(cw*(Ix*f18), I + 29);
            update(cw*(Iy*f18), I + 30);

            const double f19 = (B00 + Iy*Ky);
            update(cw*(Cx*Dz*f19), I + 31);
            update(cw*(Cz*Dx*f19), I + 32);
            update(cw*(Qx*f19), I + 33);
            update(cw*(Qz*f19), I + 34);

            const double f2 = (Cy*Dy*Ky + B01*Cy + B00*(Ykl + 2*Dy));
            update(cw*(Ix*f2), I + 35);
            update(cw*(Iz*f2), I + 36);

            const double f21 = (B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10));
            update(cw*(Kx*f21), I + 37);
            update(cw*(Kz*f21), I + 38);

            const double f23 = (Dz*Kz + B01);
            update(cw*(Cy*Ix*f23), I + 39);
            update(cw*(Cx*Iy*f23), I + 40);
            update(cw*(f14*f23), I + 41);

            const double f24 = (B01 + Dy*Ky);
            update(cw*(Cz*Ix*f24), I + 42);
            update(cw*(Cx*Iz*f24), I + 43);

            const double f26 = (B00 + Cy*Ky);
            update(cw*(f26*(Dx*Xij + Qx)), I + 44);
            update(cw*(Dx*Iz*f26), I + 45);
            update(cw*(Dz*Ix*f26), I + 46);
            update(cw*(f10*f26), I + 47);

            const double f15 = B01*B10;
            const double f27 = 2*recurrence::pow<2>(B00);
            update(cw*(B00*(Xij + 2*Cx)*(Xkl + 2*Dx) + f15 + f27 + B01*Cx*Ix + Dx*Kx*(Cx*Ix + B10)), I + 48);
            update(cw*(f15 + f27 + Dy*Ky*(Cy*Iy + B10) + B01*Cy*Iy + B00*(Yij + 2*Cy)*(Ykl + 2*Dy)), I + 49);
            update(cw*(Dz*Kz*(B10 + Cz*Iz) + f15 + f27 + B00*(2*Cz + Zij)*(2*Dz + Zkl) + B01*Cz*Iz), I + 50);

            const double f28 = (Cx*Ix + B10);
            update(cw*(Dz*Ky*f28), I + 51);
            update(cw*(Dy*Kz*f28), I + 52);
            update(cw*(f24*f28), I + 53);
            update(cw*(f23*f28), I + 54);

            const double f29 = (Dy*Iy + B00);
            update(cw*(f29*(Cx*Xkl + Qx)), I + 55);
            update(cw*(Cx*Kz*f29), I + 56);
            update(cw*(f29*(Cz*Zkl + Qz)), I + 57);
            update(cw*(Cz*Kx*f29), I + 58);

            const double f31 = (Dz*(B10 + Cz*Iz) + B00*(2*Cz + Zij));
            update(cw*(Dy*(f31 + Zkl*(B10 + Cz*Iz))), I + 59);
            update(cw*(Dx*(f31 + Zkl*(B10 + Cz*Iz))), I + 60);
            update(cw*(Kx*f31), I + 61);
            update(cw*(Ky*f31), I + 62);

            const double f34 = (B10 + Cz*Iz);
            update(cw*(Dx*Ky*f34), I + 63);
            update(cw*(Dy*Kx*f34), I + 64);
            update(cw*(f13*f34), I + 65);
            update(cw*(f24*f34), I + 66);

            const double f4 = (B01*Cx + Cx*Dx*Kx + B00*(Xkl + 2*Dx));
            update(cw*(Cz*(f4 + Xij*(B01 + Dx*Kx))), I + 67);
            update(cw*(Cy*(f4 + Xij*(B01 + Dx*Kx))), I + 68);
            update(cw*(Iz*f4), I + 69);
            update(cw*(Iy*f4), I + 70);

            const double f5 = (Kx*(Cx*Ix + B10) + B00*(Xij + 2*Cx));
            update(cw*(Dy*f5), I + 71);
            update(cw*(Dz*f5), I + 72);

            const double f7 = (B00 + Ix*Kx);
            update(cw*(Cy*Dz*f7), I + 73);
            update(cw*(Cz*Dy*f7), I + 74);
            update(cw*(Qz*f7), I + 75);
            update(cw*(Qy*f7), I + 76);

            const double f9 = (Iz*Kz + B00);
            update(cw*(Cy*Dx*f9), I + 77);
            update(cw*(Cx*Dy*f9), I + 78);
            update(cw*(Qx*f9), I + 79);
            update(cw*(Qy*f9), I + 80);
        }
    }

    /**
     * Permutes the 81 entries of @p I in place: the value at position k on
     * entry ends up at position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[81]) {
        // snapshot first so no entry is overwritten before it is moved
        double saved[81];
        for (int k = 0; k < 81; ++k) {
            saved[k] = I[k];
        }
        for (int k = 0; k < 81; ++k) {
            I[index(k)] = saved[k];
        }
    }

    /**
     * Returns the reordered position of entry @p i (0..80, not range-checked).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short target[81] = {
            33, 57, 64, 16, 47, 23, 55, 29, 15, 21, 56, 65, 59, 39, 41, 24,
            25, 51, 27, 54, 7, 5, 22, 58, 4, 31, 49, 78, 79, 74, 77, 48,
            32, 30, 50, 37, 43, 13, 67, 73, 75, 76, 38, 42, 28, 34, 46, 52,
            0, 40, 80, 45, 63, 36, 72, 12, 66, 68, 14, 71, 62, 26, 53, 35,
            17, 8, 44, 2, 1, 6, 3, 9, 18, 19, 11, 20, 10, 61, 69, 60, 70
        };
        return target[i];
    }

    /** Writes the 81 values index(0) .. index(80) to idx[0] .. idx[80]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        for (int k = 0; k < 81; ++k) {
            idx[k] = index(k);
        }
    }


};

template<>
struct impl<meta::braket<rysq::D, rysq::P, rysq::D, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /** Default accumulator used by eval(): adds @p q to the value at @p Q. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &q, double *Q) const {
            Q[0] += q;
        }
    };

    /**
     * Evaluates 3 points into a fixed 108-element array, accumulating with
     * the default update functor.
     */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double (&I)[108]) {
        const update accumulate = update();
        eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Evaluates @p M points into the buffer @p I, accumulating with the
     * default update functor.
     */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I) {
        const update accumulate = update();
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core evaluation: for each point a in [0, M) computes 108 terms scaled
     * by C[0][0]*W[a] and hands each one to @p update together with its
     * output slot I+0 .. I+107.
     *
     * @param A, B      scalars entering the recurrence coefficients
     * @param rAi, rBk  3-component inputs for the C and D coefficients
     * @param rAB       3-component input shared by the C and D coefficients
     * @param rij       3-component offset added to C to form I
     * @param t2, W     per-point inputs indexed by a (presumably squared roots
     *                  and weights of the quadrature -- confirm with caller)
     * @param C         scaling coefficient
     * @param I         output buffer, written only through @p update
     * @param update    callable invoked as update(value, slot)
     *
     * rkl is accepted for signature uniformity but is not read here.  The
     * fNN temporaries are shared sub-expressions of the generated formulas;
     * each is defined right before its first use.
     */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_,
             class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I, const U &update) {

        const double &Xij = rij[0];
        const double &Yij = rij[1];
        const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int a = 0; a < M; ++a) {
            const double t2a = t2[a];
            // common prefactor of every term at this point
            const double cw = C[0][0]*W[a];

            const double B00 = 0.5*t2a;
            const double B10 = recurrence::coefficient(1.0/A, B, t2a);
            const double B01 = recurrence::coefficient(1.0/B, A, t2a);

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2a);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2a);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2a);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2a);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2a);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2a);

// compile-time integer power, local to this loop body
#define pow(x,y) recurrence::pow<y>((x))

            const double Ix = (Cx + Xij);
            const double Iy = (Cy + Yij);
            const double Iz = (Cz + Zij);
            const double Px = (B10 + pow(Cx,2));
            const double Py = (B10 + pow(Cy,2));
            const double Pz = (B10 + pow(Cz,2));
            const double Qx = (Cx*Dx + B00);
            const double Qy = (Cy*Dy + B00);
            const double Qz = (Cz*Dz + B00);
            const double Rx = (B01 + pow(Dx,2));
            const double Ry = (B01 + pow(Dy,2));
            const double Rz = (pow(Dz,2) + B01);

            update(cw*(Iz*Qx*Qy), I + 0);
            update(cw*(Iy*Qx*Qz), I + 1);
            update(cw*(Ix*Qy*Qz), I + 2);
            update(cw*(Iz*Py*Rx), I + 3);
            update(cw*(Iy*Pz*Rx), I + 4);
            update(cw*(Ix*Pz*Ry), I + 5);
            update(cw*(Iz*Px*Ry), I + 6);
            update(cw*(Iy*Px*Rz), I + 7);
            update(cw*(Ix*Py*Rz), I + 8);
            update(cw*(6*B00*Dx*Px + Rx*(B10*(3*Cx + Xij) + Ix*pow(Cx,2)) + 4*B00*Cx*Dx*Xij + 2*pow(B00,2)*(3*Cx + Xij)), I + 9);
            update(cw*(Dy*Pz*(Dx*Xij + Qx)), I + 10);
            update(cw*(Cz*Qy*(Dx*Xij + Qx)), I + 11);
            update(cw*(Dz*Py*(Dx*Xij + Qx)), I + 12);
            update(cw*(Cy*Qz*(Dx*Xij + Qx)), I + 13);
            update(cw*(Cy*Rz*(Px + Cx*Xij)), I + 14);
            update(cw*(Dz*Qy*(Px + Cx*Xij)), I + 15);
            update(cw*(Dy*Qz*(Px + Cx*Xij)), I + 16);
            update(cw*(Cz*Ry*(Px + Cx*Xij)), I + 17);
            update(cw*(4*B00*Cy*Dy*Yij + 2*pow(B00,2)*(3*Cy + Yij) + Ry*(B10*(3*Cy + Yij) + Iy*pow(Cy,2)) + 6*B00*Dy*Py), I + 18);
            update(cw*(6*B00*Dz*Pz + 2*pow(B00,2)*(3*Cz + Zij) + 4*B00*Cz*Dz*Zij + Rz*(Iz*pow(Cz,2) + B10*(3*Cz + Zij))), I + 19);
            update(cw*(Dx*Qy*(Cz*Zij + Pz)), I + 20);
            update(cw*(Dx*Py*(Dz*Zij + Qz)), I + 21);
            update(cw*(Cy*Qx*(Dz*Zij + Qz)), I + 22);
            update(cw*(Dy*Px*(Dz*Zij + Qz)), I + 23);
            update(cw*(Cx*Qy*(Dz*Zij + Qz)), I + 24);
            update(cw*(Dy*Qx*(Cz*Zij + Pz)), I + 25);
            update(cw*(Cy*Rx*(Cz*Zij + Pz)), I + 26);
            update(cw*(Cx*Ry*(Cz*Zij + Pz)), I + 27);

            const double f0 = (Iz*Rz + 2*B00*Dz);
            update(cw*(Cx*Cy*f0), I + 28);
            update(cw*(Px*f0), I + 29);
            update(cw*(Py*f0), I + 30);

            const double f1 = (Dz*(Iz*pow(Cz,2) + B10*(3*Cz + Zij)) + 3*B00*Pz + 2*B00*Cz*Zij);
            update(cw*(Dx*f1), I + 31);
            update(cw*(Dy*f1), I + 32);

            const double f12 = (Dx*(Cx*Ix + B10) + B00*(Xij + 2*Cx));
            update(cw*(Cy*Dz*f12), I + 33);
            update(cw*(Cz*Dy*f12), I + 34);
            update(cw*(Qy*f12), I + 35);
            update(cw*(Qz*f12), I + 36);

            const double f15 = (Cy*Iy + B10);
            update(cw*(Dz*Qx*f15), I + 37);
            update(cw*(Dx*Qz*f15), I + 38);
            update(cw*(Cx*Rz*f15), I + 39);
            update(cw*(Cz*Rx*f15), I + 40);

            const double f16 = (B10*(3*Cx + Xij) + Ix*pow(Cx,2));
            update(cw*(Dy*Dz*f16), I + 41);
            update(cw*(Ry*f16), I + 42);
            update(cw*(Rz*f16), I + 43);

            const double f17 = (2*B00*Dx + Cx*Rx);
            update(cw*(Cy*Cz*(f17 + Rx*Xij)), I + 44);
            update(cw*(Pz*(f17 + Rx*Xij)), I + 45);
            update(cw*(Py*(f17 + Rx*Xij)), I + 46);
            update(cw*(f17*(Cz*Zij + Pz)), I + 47);
            update(cw*(Cz*Iy*f17), I + 48);
            update(cw*(Cy*Iz*f17), I + 49);
            update(cw*(f15*f17), I + 50);

            const double f2 = (4*B00*Cy*Dy + Py*Ry + 2*pow(B00,2));
            update(cw*(Cz*(Yij*(2*B00*Dy + Cy*Ry) + f2)), I + 51);
            update(cw*(Cx*(Yij*(2*B00*Dy + Cy*Ry) + f2)), I + 52);
            update(cw*(Ix*f2), I + 53);
            update(cw*(Iz*f2), I + 54);

            const double f20 = (2*B00*Dx*(Xij + 2*Cx) + Rx*(Cx*Ix + B10) + 2*pow(B00,2));
            update(cw*(Cy*f20), I + 55);
            update(cw*(Cz*f20), I + 56);

            const double f22 = (Px*Rx + 2*pow(B00,2) + 4*B00*Cx*Dx);
            update(cw*(Iz*f22), I + 57);
            update(cw*(Iy*f22), I + 58);

            const double f24 = (2*B00*Cy*Yij + 3*B00*Py + Dy*(B10*(3*Cy + Yij) + Iy*pow(Cy,2)));
            update(cw*(Dx*f24), I + 59);
            update(cw*(Dz*f24), I + 60);

            const double f26 = (Dy*Iy + B00);
            update(cw*(Dx*Pz*f26), I + 61);
            update(cw*(Cz*Qx*f26), I + 62);
            update(cw*(Dz*Px*f26), I + 63);
            update(cw*(Cx*Qz*f26), I + 64);

            const double f29 = (Dx*Px + 2*B00*Cx);
            update(cw*(f29*(Dz*Zij + Qz)), I + 65);
            update(cw*(Dz*Iy*f29), I + 66);
            update(cw*(Dy*Iz*f29), I + 67);
            update(cw*(f26*f29), I + 68);

            const double f3 = (Dy*Py + 2*B00*Cy);
            update(cw*(Cx*Dz*(Qy*Yij + f3)), I + 69);
            update(cw*(Qz*(Qy*Yij + f3)), I + 70);
            update(cw*(Qx*(Qy*Yij + f3)), I + 71);
            update(cw*(Cz*Dx*(Qy*Yij + f3)), I + 72);
            update(cw*(f3*(Dz*Zij + Qz)), I + 73);
            update(cw*(Dz*Ix*f3), I + 74);
            update(cw*(f3*(Dx*Xij + Qx)), I + 75);
            update(cw*(Dx*Iz*f3), I + 76);

            const double f30 = (2*B00*Dy + Iy*Ry);
            update(cw*(Cx*Cz*f30), I + 77);
            update(cw*(Pz*f30), I + 78);
            update(cw*(Px*f30), I + 79);

            const double f31 = (Iz*pow(Cz,2) + B10*(3*Cz + Zij));
            update(cw*(Dx*Dy*f31), I + 80);
            update(cw*(Rx*f31), I + 81);
            update(cw*(Ry*f31), I + 82);

            const double f32 = (Dx*(B10*(3*Cx + Xij) + Ix*pow(Cx,2)) + 2*B00*Cx*Xij + 3*B00*Px);
            update(cw*(Dz*f32), I + 83);
            update(cw*(Dy*f32), I + 84);

            const double f33 = (2*B00*Dy + Cy*Ry);
            update(cw*(f33*(Px + Cx*Xij)), I + 85);
            update(cw*(Cx*Iz*f33), I + 86);
            update(cw*(Cz*Ix*f33), I + 87);
            update(cw*(f33*(Cz*Zij + Pz)), I + 88);

            const double f39 = (Rz*(B10 + Cz*Iz) + 2*B00*Dz*(2*Cz + Zij) + 2*pow(B00,2));
            update(cw*(Cx*f39), I + 89);
            update(cw*(Cy*f39), I + 90);

            const double f40 = (2*pow(B00,2) + Pz*Rz + 4*B00*Cz*Dz);
            update(cw*(Ix*f40), I + 91);
            update(cw*(Iy*f40), I + 92);

            const double f5 = (2*B00*Dz + Cz*Rz);
            update(cw*(f5*(Px + Cx*Xij)), I + 93);
            update(cw*(Cx*Iy*f5), I + 94);
            update(cw*(Cy*Ix*f5), I + 95);
            update(cw*(f15*f5), I + 96);

            const double f6 = (B10*(3*Cy + Yij) + Iy*pow(Cy,2));
            update(cw*(Dx*Dz*f6), I + 97);
            update(cw*(Rx*f6), I + 98);
            update(cw*(Rz*f6), I + 99);

            const double f9 = (Dz*Pz + 2*B00*Cz);
            update(cw*(Cy*Dx*(f9 + Qz*Zij)), I + 100);
            update(cw*(Qx*(f9 + Qz*Zij)), I + 101);
            update(cw*(Qy*(f9 + Qz*Zij)), I + 102);
            update(cw*(Cx*Dy*(f9 + Qz*Zij)), I + 103);
            update(cw*(Dy*Ix*f9), I + 104);
            update(cw*(Dx*Iy*f9), I + 105);
            update(cw*(f9*(Dx*Xij + Qx)), I + 106);
            update(cw*(f26*f9), I + 107);
#undef pow

        }

    }

    /**
     * Permutes the 108 values of I in place, from the order in which eval()
     * stores them to the reordered layout: the value found at position k on
     * entry ends up at position destination[k].
     * @param I buffer of 108 integrals, rearranged in place
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[108]) {
	// destination[k] is the reordered slot of the value stored at slot k.
	const unsigned short destination[108] = {
	    69, 82, 95, 13, 8, 20, 30, 42, 37, 0, 56, 59,
	    73, 77, 39, 93, 94, 22, 25, 50, 71, 85, 87, 102,
	    105, 70, 17, 34, 51, 48, 49, 86, 104, 75, 58, 57,
	    76, 81, 83, 45, 11, 90, 18, 36, 5, 2, 1, 16,
	    10, 15, 9, 29, 27, 19, 31, 3, 4, 12, 6, 61,
	    97, 62, 64, 96, 100, 84, 78, 66, 60, 99, 101, 63,
	    65, 103, 91, 55, 67, 28, 26, 24, 68, 14, 32, 72,
	    54, 21, 33, 23, 35, 52, 53, 38, 44, 40, 46, 41,
	    47, 79, 7, 43, 89, 88, 107, 106, 92, 80, 74, 98
	};

	// Scatter from a snapshot so no value is overwritten before it is moved.
	double snapshot[108];
	for (int k = 0; k < 108; ++k) {
	    snapshot[k] = I[k];
	}
	for (int k = 0; k < 108; ++k) {
	    I[destination[k]] = snapshot[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[108] = { 9, 46, 45, 55, 56, 44, 58, 98, 4, 50, 48, 40, 57, 3, 81, 49, 47, 26, 42, 53, 5, 85, 17, 87, 79, 18, 78, 52, 77, 51, 6, 54, 82, 86, 27, 88, 43, 8, 91, 14, 93, 95, 7, 99, 92, 39, 94, 96, 29, 30, 19, 28, 89, 90, 84, 75, 10, 35, 34, 11, 68, 59, 61, 71, 62, 72, 67, 76, 80, 0, 25, 20, 83, 12, 106, 33, 36, 13, 66, 97, 105, 37, 1, 38, 65, 21, 31, 22, 101, 100, 41, 74, 104, 15, 16, 2, 63, 60, 107, 69, 64, 70, 23, 73, 32, 24, 103, 102 };
// 	if (index < 108) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Looks up the reordered position of the i-th value stored by eval();
     * this is the same permutation that reorder() applies.
     * @param i position in eval() order; must lie in [0, 108), no range check
     * @return position of that value in the reordered layout
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short destination[108] = {
	    69, 82, 95, 13, 8, 20, 30, 42, 37, 0, 56, 59,
	    73, 77, 39, 93, 94, 22, 25, 50, 71, 85, 87, 102,
	    105, 70, 17, 34, 51, 48, 49, 86, 104, 75, 58, 57,
	    76, 81, 83, 45, 11, 90, 18, 36, 5, 2, 1, 16,
	    10, 15, 9, 29, 27, 19, 31, 3, 4, 12, 6, 61,
	    97, 62, 64, 96, 100, 84, 78, 66, 60, 99, 101, 63,
	    65, 103, 91, 55, 67, 28, 26, 24, 68, 14, 32, 72,
	    54, 21, 33, 23, 35, 52, 53, 38, 44, 40, 46, 41,
	    47, 79, 7, 43, 89, 88, 107, 106, 92, 80, 74, 98
	};
	return destination[i];
    }

    /**
     * Writes the whole reordering permutation to idx: element k receives the
     * reordered position of the k-th value stored by eval().
     * @param idx output buffer; must have room for at least 108 elements
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int destination[108] = {
	    69, 82, 95, 13, 8, 20, 30, 42, 37, 0, 56, 59,
	    73, 77, 39, 93, 94, 22, 25, 50, 71, 85, 87, 102,
	    105, 70, 17, 34, 51, 48, 49, 86, 104, 75, 58, 57,
	    76, 81, 83, 45, 11, 90, 18, 36, 5, 2, 1, 16,
	    10, 15, 9, 29, 27, 19, 31, 3, 4, 12, 6, 61,
	    97, 62, 64, 96, 100, 84, 78, 66, 60, 99, 101, 63,
	    65, 103, 91, 55, 67, 28, 26, 24, 68, 14, 32, 72,
	    54, 21, 33, 23, 35, 52, 53, 38, 44, 40, 46, 41,
	    47, 79, 7, 43, 89, 88, 107, 106, 92, 80, 74, 98
	};
	for (int k = 0; k < 108; ++k) {
	    idx[k] = destination[k];
	}
    }


};

/**
 * Generated quadrature kernel specialization for the braket <S, S | D, SP>.
 *
 * eval() accumulates 24 values into I in the order in which the generated
 * code emits them; reorder() and index() describe the permutation from that
 * emission order to the reordered layout.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::D, rysq::SP> > {
    // NOTE(review): enable/value presumably mark that a generated kernel
    // exists for this braket -- confirm against the primary impl template.
    typedef void enable;
    static const bool value = true; 
    // Matches the loop count hard-coded in the fixed-size eval() overload
    // (eval<2>); presumably the number of quadrature points -- confirm.
    static const int N = 2;

    /** Default accumulation functor: adds q to the value Q points to. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: runs the kernel body for M = 2 points on a
     * 24-element buffer, accumulating with the default update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<2> &t2, const vector<2> &W,
    // 		     const double (&C)[2][1],
	      double (&I)[24]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Entry point for a raw output pointer and a caller-chosen point count M;
     * accumulates with the default update functor. The kernel body writes to
     * I+0 .. I+23, so I must address at least 24 doubles.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel body. For each point a in [0, M) it derives B01 and Dx, Dy, Dz
     * from t2[a] and hands 24 weighted products to update(value, I + k),
     * k = 0..23.
     *
     * @param A,B     scalars fed to recurrence::coefficient (B only as 1.0/B)
     * @param rBk,rAB vectors whose components 0..2 enter Dx, Dy, Dz
     * @param rkl     vector whose three components are read as Xkl, Ykl, Zkl
     * @param t2,W    per-point inputs indexed with [a]; W[a] scales every term
     * @param C       coefficients: C[0][0] scales terms 0-2, 8, 11 and 14,
     *                C[1][0] scales all remaining terms
     * @param I       output buffer, at least 24 doubles
     * @param update  functor invoked as update(value, destination)
     *
     * rAi and rij are accepted for a uniform signature but are not read here.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {


	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // B01 = 0.5/B * (1 - A*t2[a]), per recurrence::coefficient.
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);


	    // Dq = rBk[q] + A*rAB[q]*t2[a] (the -A argument flips the sign).
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Until the #undef below, pow(x,n) expands to the compile-time
	    // power recurrence::pow<n>(x).
#define pow(x,y) recurrence::pow<y>((x))

 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);
 	    double Rx = (B01 + pow(Dx,2));
 	    double Ry = (B01 + pow(Dy,2));
 	    double Rz = (pow(Dz,2) + B01);

	    // One update per output slot, in emission order (see reorder()).
	    update((C[0][0])*W[a]*(Dx*Dy), I+0);
	    update((C[0][0])*W[a]*(Dx*Dz), I+1);
	    update((C[0][0])*W[a]*(Dy*Dz), I+2);
	    update((C[1][0])*W[a]*(Dy*Dz*Kx), I+3);
	    update((C[1][0])*W[a]*(Dx*Dz*Ky), I+4);
	    update((C[1][0])*W[a]*(Dx*Dy*Kz), I+5);
	    update((C[1][0])*W[a]*(Kz*Rx), I+6);
	    update((C[1][0])*W[a]*(Ky*Rx), I+7);
	    update((C[0][0])*W[a]*(Rx), I+8);
	    update((C[1][0])*W[a]*(Kz*Ry), I+9);
	    update((C[1][0])*W[a]*(Kx*Ry), I+10);
	    update((C[0][0])*W[a]*(Ry), I+11);
	    update((C[1][0])*W[a]*(Kx*Rz), I+12);
	    update((C[1][0])*W[a]*(Ky*Rz), I+13);
	    update((C[0][0])*W[a]*(Rz), I+14);
	    update((C[1][0])*W[a]*((Kx*pow(Dx,2) + B01*(3*Dx + Xkl))), I+15);
	    update((C[1][0])*W[a]*(Dz*(Rx + Dx*Xkl)), I+16);
	    update((C[1][0])*W[a]*(Dy*(Rx + Dx*Xkl)), I+17);
	    update((C[1][0])*W[a]*(Dx*(Ry + Dy*Ykl)), I+18);
	    update((C[1][0])*W[a]*(Dz*(Ry + Dy*Ykl)), I+19);
	    update((C[1][0])*W[a]*((B01*(3*Dy + Ykl) + Ky*pow(Dy,2))), I+20);
	    update((C[1][0])*W[a]*((B01*(3*Dz + Zkl) + Kz*pow(Dz,2))), I+21);
	    // f5 is a subexpression shared by the last two terms.
	    double f5 = (Dz*Kz + B01);
	    update((C[1][0])*W[a]*(Dx*f5), I+22);
	    update((C[1][0])*W[a]*(Dy*f5), I+23);
#undef pow

	}

    }

    /**
     * Permutes I in place from eval() emission order to the reordered
     * layout. I is first copied to T so that no value is overwritten before
     * it has been moved; the value at emission slot k goes to slot index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[24]) {
	double T[24];
	for (int i = 0; i < 24; ++i) {
	    T[i] = I[i];
	}
	I[3] = T[0];
	I[4] = T[1];
	I[5] = T[2];
	I[11] = T[3];
	I[16] = T[4];
	I[21] = T[5];
	I[18] = T[6];
	I[12] = T[7];
	I[0] = T[8];
	I[19] = T[9];
	I[7] = T[10];
	I[1] = T[11];
	I[8] = T[12];
	I[14] = T[13];
	I[2] = T[14];
	I[6] = T[15];
	I[10] = T[16];
	I[9] = T[17];
	I[15] = T[18];
	I[17] = T[19];
	I[13] = T[20];
	I[20] = T[21];
	I[22] = T[22];
	I[23] = T[23];
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[24] = { 8, 11, 14, 0, 1, 2, 15, 10, 12, 17, 16, 3, 7, 20, 13, 18, 4, 19, 6, 9, 21, 5, 22, 23 };
// 	if (index < 24) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the reordered position of the value eval() stores at slot i
     * (the same permutation reorder() applies).
     * @param i emission slot; must lie in [0, 24), there is no range check
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    3, 4, 5, 11, 16, 21, 18, 12, 0, 19, 7, 1, 8, 14, 2, 6, 10, 9, 15, 17, 13, 20, 22, 23
	};
	return index[i];
    }

    /**
     * Writes the whole permutation to idx: element k receives the reordered
     * position of emission slot k. idx must have room for 24 elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 3;
	*idx++ = 4;
	*idx++ = 5;
	*idx++ = 11;
	*idx++ = 16;
	*idx++ = 21;
	*idx++ = 18;
	*idx++ = 12;
	*idx++ = 0;
	*idx++ = 19;
	*idx++ = 7;
	*idx++ = 1;
	*idx++ = 8;
	*idx++ = 14;
	*idx++ = 2;
	*idx++ = 6;
	*idx++ = 10;
	*idx++ = 9;
	*idx++ = 15;
	*idx++ = 17;
	*idx++ = 13;
	*idx++ = 20;
	*idx++ = 22;
	*idx++ = 23;
    }


};

/**
 * Generated quadrature kernel specialization for the braket <S, SP | D, S>.
 *
 * eval() accumulates 24 values into I in the order in which the generated
 * code emits them; reorder() and index() describe the permutation from that
 * emission order to the reordered layout.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::SP, rysq::D, rysq::S> > {
    // NOTE(review): enable/value presumably mark that a generated kernel
    // exists for this braket -- confirm against the primary impl template.
    typedef void enable;
    static const bool value = true; 
    // Matches the loop count hard-coded in the fixed-size eval() overload
    // (eval<2>); presumably the number of quadrature points -- confirm.
    static const int N = 2;

    /** Default accumulation functor: adds q to the value Q points to. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: runs the kernel body for M = 2 points on a
     * 24-element buffer, accumulating with the default update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<2> &t2, const vector<2> &W,
    // 		     const double (&C)[1][2],
	      double (&I)[24]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Entry point for a raw output pointer and a caller-chosen point count M;
     * accumulates with the default update functor. The kernel body writes to
     * I+0 .. I+23, so I must address at least 24 doubles.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel body. For each point a in [0, M) it derives B00, B01 and the
     * C/D recurrence coefficients from t2[a] and hands 24 weighted products
     * to update(value, I + k), k = 0..23.
     *
     * @param A,B     scalars fed to recurrence::coefficient
     * @param rAi,rBk,rAB vectors whose components 0..2 enter Cq and Dq
     * @param rij     vector whose three components are read as Xij, Yij, Zij
     * @param t2,W    per-point inputs indexed with [a]; W[a] scales every term
     * @param C       coefficients: C[0][0] scales terms 0-2, 9, 13 and 17,
     *                C[0][1] scales all remaining terms
     * @param I       output buffer, at least 24 doubles
     * @param update  functor invoked as update(value, destination)
     *
     * rkl is accepted for a uniform signature but is not read here.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];



#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];
	    // B01 = 0.5/B * (1 - A*t2[a]), per recurrence::coefficient.
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // Cq = rAi[q] - B*rAB[q]*t2[a].
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Dq = rBk[q] + A*rAB[q]*t2[a] (the -A argument flips the sign).
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Until the #undef below, pow(x,n) expands to the compile-time
	    // power recurrence::pow<n>(x).
#define pow(x,y) recurrence::pow<y>((x))

 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Rx = (B01 + pow(Dx,2));
 	    double Ry = (B01 + pow(Dy,2));
 	    double Rz = (pow(Dz,2) + B01);

	    // One update per output slot, in emission order (see reorder()).
	    update((C[0][0])*W[a]*(Dx*Dy), I+0);
	    update((C[0][0])*W[a]*(Dx*Dz), I+1);
	    update((C[0][0])*W[a]*(Dy*Dz), I+2);
	    update((C[0][1])*W[a]*(Dy*Dz*Ix), I+3);
	    update((C[0][1])*W[a]*(Dx*Dz*Iy), I+4);
	    update((C[0][1])*W[a]*(Dx*Dy*Iz), I+5);
	    update((C[0][1])*W[a]*((2*B00*Dx + Ix*Rx)), I+6);
	    update((C[0][1])*W[a]*(Iz*Rx), I+7);
	    update((C[0][1])*W[a]*(Iy*Rx), I+8);
	    update((C[0][0])*W[a]*(Rx), I+9);
	    update((C[0][1])*W[a]*((2*B00*Dy + Iy*Ry)), I+10);
	    update((C[0][1])*W[a]*(Ix*Ry), I+11);
	    update((C[0][1])*W[a]*(Iz*Ry), I+12);
	    update((C[0][0])*W[a]*(Ry), I+13);
	    update((C[0][1])*W[a]*((Iz*Rz + 2*B00*Dz)), I+14);
	    update((C[0][1])*W[a]*(Iy*Rz), I+15);
	    update((C[0][1])*W[a]*(Ix*Rz), I+16);
	    update((C[0][0])*W[a]*(Rz), I+17);
	    // f3, f6 and f7 are subexpressions each shared by two terms.
	    double f3 = (B00 + Dz*Iz);
	    update((C[0][1])*W[a]*(Dx*f3), I+18);
	    update((C[0][1])*W[a]*(Dy*f3), I+19);
	    double f6 = (Dy*Iy + B00);
	    update((C[0][1])*W[a]*(Dx*f6), I+20);
	    update((C[0][1])*W[a]*(Dz*f6), I+21);
	    double f7 = (Dx*Ix + B00);
	    update((C[0][1])*W[a]*(Dz*f7), I+22);
	    update((C[0][1])*W[a]*(Dy*f7), I+23);
#undef pow

	}

    }

    /**
     * Permutes I in place from eval() emission order to the reordered
     * layout. I is first copied to T so that no value is overwritten before
     * it has been moved; the value at emission slot k goes to slot index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[24]) {
	double T[24];
	for (int i = 0; i < 24; ++i) {
	    T[i] = I[i];
	}
	I[12] = T[0];
	I[16] = T[1];
	I[20] = T[2];
	I[21] = T[3];
	I[18] = T[4];
	I[15] = T[5];
	I[1] = T[6];
	I[3] = T[7];
	I[2] = T[8];
	I[0] = T[9];
	I[6] = T[10];
	I[5] = T[11];
	I[7] = T[12];
	I[4] = T[13];
	I[11] = T[14];
	I[10] = T[15];
	I[9] = T[16];
	I[8] = T[17];
	I[19] = T[18];
	I[23] = T[19];
	I[14] = T[20];
	I[22] = T[21];
	I[17] = T[22];
	I[13] = T[23];
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[24] = { 9, 6, 8, 7, 13, 11, 10, 12, 17, 16, 15, 14, 0, 23, 20, 5, 1, 22, 4, 18, 2, 3, 21, 19 };
// 	if (index < 24) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the reordered position of the value eval() stores at slot i
     * (the same permutation reorder() applies).
     * @param i emission slot; must lie in [0, 24), there is no range check
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    12, 16, 20, 21, 18, 15, 1, 3, 2, 0, 6, 5, 7, 4, 11, 10, 9, 8, 19, 23, 14, 22, 17, 13
	};
	return index[i];
    }

    /**
     * Writes the whole permutation to idx: element k receives the reordered
     * position of emission slot k. idx must have room for 24 elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 12;
	*idx++ = 16;
	*idx++ = 20;
	*idx++ = 21;
	*idx++ = 18;
	*idx++ = 15;
	*idx++ = 1;
	*idx++ = 3;
	*idx++ = 2;
	*idx++ = 0;
	*idx++ = 6;
	*idx++ = 5;
	*idx++ = 7;
	*idx++ = 4;
	*idx++ = 11;
	*idx++ = 10;
	*idx++ = 9;
	*idx++ = 8;
	*idx++ = 19;
	*idx++ = 23;
	*idx++ = 14;
	*idx++ = 22;
	*idx++ = 17;
	*idx++ = 13;
    }


};

/**
 * Generated quadrature kernel specialization for the braket <S, S | D, D>.
 *
 * eval() accumulates 36 values into I in the order in which the generated
 * code emits them; reorder() and index() describe the permutation from that
 * emission order to the reordered layout.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::D, rysq::D> > {
    // NOTE(review): enable/value presumably mark that a generated kernel
    // exists for this braket -- confirm against the primary impl template.
    typedef void enable;
    static const bool value = true; 
    // Matches the loop count hard-coded in the fixed-size eval() overload
    // (eval<3>); presumably the number of quadrature points -- confirm.
    static const int N = 3;

    /** Default accumulation functor: adds q to the value Q points to. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: runs the kernel body for M = 3 points on a
     * 36-element buffer, accumulating with the default update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<3> &t2, const vector<3> &W,
    // 		     const double (&C)[1][1],
	      double (&I)[36]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Entry point for a raw output pointer and a caller-chosen point count M;
     * accumulates with the default update functor. The kernel body writes to
     * I+0 .. I+35, so I must address at least 36 doubles.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel body. For each point a in [0, M) it derives B01 and Dx, Dy, Dz
     * from t2[a] and hands 36 weighted products to update(value, I + k),
     * k = 0..35.
     *
     * @param A,B     scalars fed to recurrence::coefficient (B only as 1.0/B)
     * @param rBk,rAB vectors whose components 0..2 enter Dx, Dy, Dz
     * @param rkl     vector whose three components are read as Xkl, Ykl, Zkl
     * @param t2,W    per-point inputs indexed with [a]; W[a] scales every term
     * @param C       single coefficient C[0][0], applied to every term
     * @param I       output buffer, at least 36 doubles
     * @param update  functor invoked as update(value, destination)
     *
     * rAi and rij are accepted for a uniform signature but are not read here.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {


	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // B01 = 0.5/B * (1 - A*t2[a]), per recurrence::coefficient.
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);


	    // Dq = rBk[q] + A*rAB[q]*t2[a] (the -A argument flips the sign).
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Until the #undef below, pow(x,n) expands to the compile-time
	    // power recurrence::pow<n>(x).
#define pow(x,y) recurrence::pow<y>((x))

 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);
 	    double Rx = (B01 + pow(Dx,2));
 	    double Ry = (B01 + pow(Dy,2));
 	    double Rz = (pow(Dz,2) + B01);

	    // One update per output slot, in emission order (see reorder()).
	    update((C[0][0])*W[a]*(Ky*Kz*Rx), I+0);
	    update((C[0][0])*W[a]*(Kx*Kz*Ry), I+1);
	    update((C[0][0])*W[a]*(Kx*Ky*Rz), I+2);
	    update((C[0][0])*W[a]*(Dy*Kz*(Rx + Dx*Xkl)), I+3);
	    update((C[0][0])*W[a]*(Dz*Ky*(Rx + Dx*Xkl)), I+4);
	    update((C[0][0])*W[a]*(Dy*Dz*(Rx + Xkl*(Xkl + 2*Dx))), I+5);
	    update((C[0][0])*W[a]*(Rz*(Rx + Xkl*(Xkl + 2*Dx))), I+6);
	    update((C[0][0])*W[a]*(Ry*(Rx + Xkl*(Xkl + 2*Dx))), I+7);
	    update((C[0][0])*W[a]*(Dz*Kx*(Ry + Dy*Ykl)), I+8);
	    update((C[0][0])*W[a]*(Dx*Dz*(Ykl*(Ykl + 2*Dy) + Ry)), I+9);
	    update((C[0][0])*W[a]*(Dx*Kz*(Ry + Dy*Ykl)), I+10);
	    update((C[0][0])*W[a]*(Rz*(Ykl*(Ykl + 2*Dy) + Ry)), I+11);
	    update((C[0][0])*W[a]*(Rx*(Ykl*(Ykl + 2*Dy) + Ry)), I+12);
	    update((C[0][0])*W[a]*((Rx + Dx*Xkl)*(Ry + Dy*Ykl)), I+13);
	    update((C[0][0])*W[a]*((Ry + Dy*Ykl)*(Rz + Dz*Zkl)), I+14);
	    update((C[0][0])*W[a]*((Rx + Dx*Xkl)*(Rz + Dz*Zkl)), I+15);
	    update((C[0][0])*W[a]*(Ry*(Rz + Zkl*(2*Dz + Zkl))), I+16);
	    update((C[0][0])*W[a]*(Rx*(Rz + Zkl*(2*Dz + Zkl))), I+17);
	    update((C[0][0])*W[a]*(Dx*Ky*(Rz + Dz*Zkl)), I+18);
	    update((C[0][0])*W[a]*(Dx*Dy*(Rz + Zkl*(2*Dz + Zkl))), I+19);
	    update((C[0][0])*W[a]*(Dy*Kx*(Rz + Dz*Zkl)), I+20);
	    // The f* temporaries below are subexpressions shared by the terms
	    // that immediately follow each of them.
	    double f13 = (Kx*pow(Dx,2) + B01*(3*Dx + Xkl));
	    update((C[0][0])*W[a]*(Kz*f13), I+21);
	    update((C[0][0])*W[a]*(Ky*f13), I+22);
	    double f16 = (B01*(2*Zkl + 3*Dz) + Dz*pow(Kz,2));
	    update((C[0][0])*W[a]*(Dx*f16), I+23);
	    update((C[0][0])*W[a]*(Dy*f16), I+24);
	    double f17 = (B01*(3*Dy + Ykl) + Ky*pow(Dy,2));
	    update((C[0][0])*W[a]*(Kx*f17), I+25);
	    update((C[0][0])*W[a]*(Kz*f17), I+26);
	    double f18 = (B01*(3*Dz + Zkl) + Kz*pow(Dz,2));
	    update((C[0][0])*W[a]*(Kx*f18), I+27);
	    update((C[0][0])*W[a]*(Ky*f18), I+28);
	    double f2 = 3*pow(B01,2);
	    update((C[0][0])*W[a]*((pow(Dy,2)*pow(Ky,2) + f2 + B01*(6*Dy*Ykl + pow(Ykl,2) + 6*pow(Dy,2)))), I+29);
	    update((C[0][0])*W[a]*((B01*(pow(Xkl,2) + 6*Dx*Xkl + 6*pow(Dx,2)) + pow(Dx,2)*pow(Kx,2) + f2)), I+30);
	    update((C[0][0])*W[a]*((B01*(6*pow(Dz,2) + 6*Dz*Zkl + pow(Zkl,2)) + f2 + pow(Dz,2)*pow(Kz,2))), I+31);
	    double f4 = (B01*(2*Xkl + 3*Dx) + Dx*pow(Kx,2));
	    update((C[0][0])*W[a]*(Dy*f4), I+32);
	    update((C[0][0])*W[a]*(Dz*f4), I+33);
	    double f9 = (Dy*pow(Ky,2) + B01*(3*Dy + 2*Ykl));
	    update((C[0][0])*W[a]*(Dz*f9), I+34);
	    update((C[0][0])*W[a]*(Dx*f9), I+35);
#undef pow

	}

    }

    /**
     * Permutes I in place from eval() emission order to the reordered
     * layout. I is first copied to T so that no value is overwritten before
     * it has been moved; the value at emission slot k goes to slot index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[36]) {
	double T[36];
	for (int i = 0; i < 36; ++i) {
	    T[i] = I[i];
	}
	I[30] = T[0];
	I[25] = T[1];
	I[20] = T[2];
	I[27] = T[3];
	I[22] = T[4];
	I[5] = T[5];
	I[2] = T[6];
	I[1] = T[7];
	I[23] = T[8];
	I[10] = T[9];
	I[33] = T[10];
	I[8] = T[11];
	I[6] = T[12];
	I[21] = T[13];
	I[35] = T[14];
	I[28] = T[15];
	I[13] = T[16];
	I[12] = T[17];
	I[34] = T[18];
	I[15] = T[19];
	I[29] = T[20];
	I[24] = T[21];
	I[18] = T[22];
	I[16] = T[23];
	I[17] = T[24];
	I[19] = T[25];
	I[31] = T[26];
	I[26] = T[27];
	I[32] = T[28];
	I[7] = T[29];
	I[0] = T[30];
	I[14] = T[31];
	I[3] = T[32];
	I[4] = T[33];
	I[11] = T[34];
	I[9] = T[35];
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[36] = { 30, 7, 6, 32, 33, 5, 12, 29, 11, 35, 9, 34, 17, 16, 31, 19, 23, 24, 22, 25, 2, 13, 4, 8, 21, 1, 27, 3, 15, 20, 0, 26, 28, 10, 18, 14 };
// 	if (index < 36) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the reordered position of the value eval() stores at slot i
     * (the same permutation reorder() applies).
     * @param i emission slot; must lie in [0, 36), there is no range check
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    30, 25, 20, 27, 22, 5, 2, 1, 23, 10, 33, 8, 6, 21, 35, 28, 13, 12, 34, 15, 29, 24, 18, 16, 17, 19, 31, 26, 32, 7, 0, 14, 3, 4, 11, 9
	};
	return index[i];
    }

    /**
     * Writes the whole permutation to idx: element k receives the reordered
     * position of emission slot k. idx must have room for 36 elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 30;
	*idx++ = 25;
	*idx++ = 20;
	*idx++ = 27;
	*idx++ = 22;
	*idx++ = 5;
	*idx++ = 2;
	*idx++ = 1;
	*idx++ = 23;
	*idx++ = 10;
	*idx++ = 33;
	*idx++ = 8;
	*idx++ = 6;
	*idx++ = 21;
	*idx++ = 35;
	*idx++ = 28;
	*idx++ = 13;
	*idx++ = 12;
	*idx++ = 34;
	*idx++ = 15;
	*idx++ = 29;
	*idx++ = 24;
	*idx++ = 18;
	*idx++ = 16;
	*idx++ = 17;
	*idx++ = 19;
	*idx++ = 31;
	*idx++ = 26;
	*idx++ = 32;
	*idx++ = 7;
	*idx++ = 0;
	*idx++ = 14;
	*idx++ = 3;
	*idx++ = 4;
	*idx++ = 11;
	*idx++ = 9;
    }


};

/**
 * Generated quadrature kernel specialization for the braket <S, F | S, P>.
 *
 * eval() accumulates 30 values into I in the order in which the generated
 * code emits them; reorder() and index() describe the permutation from that
 * emission order to the reordered layout.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::F, rysq::S, rysq::P> > {
    // NOTE(review): enable/value presumably mark that a generated kernel
    // exists for this braket -- confirm against the primary impl template.
    typedef void enable;
    static const bool value = true; 
    // Matches the loop count hard-coded in the fixed-size eval() overload
    // (eval<3>); presumably the number of quadrature points -- confirm.
    static const int N = 3;

    /** Default accumulation functor: adds q to the value Q points to. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: runs the kernel body for M = 3 points on a
     * 30-element buffer, accumulating with the default update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<3> &t2, const vector<3> &W,
    // 		     const double (&C)[1][1],
	      double (&I)[30]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Entry point for a raw output pointer and a caller-chosen point count M;
     * accumulates with the default update functor. The kernel body writes to
     * I+0 .. I+29, so I must address at least 30 doubles.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel body. For each point a in [0, M) it derives B00, B10 and the
     * C/D recurrence coefficients from t2[a] and hands 30 weighted products
     * to update(value, I + k), k = 0..29.
     *
     * @param A,B     scalars fed to recurrence::coefficient (A also as 1.0/A)
     * @param rAi,rBk,rAB vectors whose components 0..2 enter Cq and Dq
     * @param rij     vector whose three components are read as Xij, Yij, Zij
     * @param rkl     vector whose three components are read as Xkl, Ykl, Zkl
     * @param t2,W    per-point inputs indexed with [a]; W[a] scales every term
     * @param C       single coefficient C[0][0], applied to every term
     * @param I       output buffer, at least 30 doubles
     * @param update  functor invoked as update(value, destination)
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];
	    // B10 = 0.5/A * (1 - B*t2[a]), per recurrence::coefficient.
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    // Cq = rAi[q] - B*rAB[q]*t2[a].
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Dq = rBk[q] + A*rAB[q]*t2[a] (the -A argument flips the sign).
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Until the #undef below, pow(x,n) expands to the compile-time
	    // power recurrence::pow<n>(x).
#define pow(x,y) recurrence::pow<y>((x))

 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);

	    // One update per output slot, in emission order (see reorder()).
	    // The f* temporaries are subexpressions shared by several terms;
	    // some (f11, f12, f13, f15) are reused further down.
	    double f11 = (B10 + pow(Iy,2));
	    update((C[0][0])*W[a]*(Ix*Kz*f11), I+0);
	    update((C[0][0])*W[a]*(Iz*Kx*f11), I+1);
	    double f12 = (B00 + Iy*Ky);
	    update((C[0][0])*W[a]*(Ix*Iz*f12), I+2);
	    double f13 = (B10 + pow(Iz,2));
	    update((C[0][0])*W[a]*(Iy*Kx*f13), I+3);
	    update((C[0][0])*W[a]*(Ix*Ky*f13), I+4);
	    update((C[0][0])*W[a]*(f12*f13), I+5);
	    double f14 = (3*B10 + pow(Iz,2));
	    update((C[0][0])*W[a]*(Iz*Ky*f14), I+6);
	    update((C[0][0])*W[a]*(Iz*Kx*f14), I+7);
	    double f15 = (B10 + pow(Ix,2));
	    update((C[0][0])*W[a]*(Iy*Kz*f15), I+8);
	    update((C[0][0])*W[a]*(Iz*Ky*f15), I+9);
	    update((C[0][0])*W[a]*(f12*f15), I+10);
	    double f16 = (3*B10 + pow(Ix,2));
	    update((C[0][0])*W[a]*(Ix*Ky*f16), I+11);
	    update((C[0][0])*W[a]*(Ix*Kz*f16), I+12);
	    double f17 = (2*B00*Ix + Kx*(B10 + pow(Ix,2)));
	    update((C[0][0])*W[a]*(Iy*f17), I+13);
	    update((C[0][0])*W[a]*(Iz*f17), I+14);
	    double f18 = (2*B00*Iy + Ky*(B10 + pow(Iy,2)));
	    update((C[0][0])*W[a]*(Iz*f18), I+15);
	    update((C[0][0])*W[a]*(Ix*f18), I+16);
	    double f2 = 3*B00*B10;
	    update((C[0][0])*W[a]*((Ix*Kx*(3*B10 + pow(Ix,2)) + f2 + 3*B00*pow(Ix,2))), I+17);
	    update((C[0][0])*W[a]*((Iy*Ky*(3*B10 + pow(Iy,2)) + f2 + 3*B00*pow(Iy,2))), I+18);
	    update((C[0][0])*W[a]*((3*B00*pow(Iz,2) + f2 + Iz*Kz*(3*B10 + pow(Iz,2)))), I+19);
	    double f4 = (B00 + Ix*Kx);
	    update((C[0][0])*W[a]*(Iy*Iz*f4), I+20);
	    update((C[0][0])*W[a]*(f11*f4), I+21);
	    update((C[0][0])*W[a]*(f13*f4), I+22);
	    double f5 = (Iz*Kz + B00);
	    update((C[0][0])*W[a]*(Ix*Iy*f5), I+23);
	    update((C[0][0])*W[a]*(f15*f5), I+24);
	    update((C[0][0])*W[a]*(f11*f5), I+25);
	    double f7 = (3*B10 + pow(Iy,2));
	    update((C[0][0])*W[a]*(Iy*Kz*f7), I+26);
	    update((C[0][0])*W[a]*(Iy*Kx*f7), I+27);
	    double f8 = (2*B00*Iz + Kz*(B10 + pow(Iz,2)));
	    update((C[0][0])*W[a]*(Ix*f8), I+28);
	    update((C[0][0])*W[a]*(Iy*f8), I+29);
#undef pow

	}

    }

    /**
     * Permutes I in place from eval() emission order to the reordered
     * layout. I is first copied to T so that no value is overwritten before
     * it has been moved; the value at emission slot k goes to slot index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[30]) {
	double T[30];
	for (int i = 0; i < 30; ++i) {
	    T[i] = I[i];
	}
	I[25] = T[0];
	I[6] = T[1];
	I[19] = T[2];
	I[8] = T[3];
	I[17] = T[4];
	I[18] = T[5];
	I[12] = T[6];
	I[2] = T[7];
	I[23] = T[8];
	I[14] = T[9];
	I[13] = T[10];
	I[10] = T[11];
	I[20] = T[12];
	I[3] = T[13];
	I[4] = T[14];
	I[16] = T[15];
	I[15] = T[16];
	I[0] = T[17];
	I[11] = T[18];
	I[22] = T[19];
	I[9] = T[20];
	I[5] = T[21];
	I[7] = T[22];
	I[29] = T[23];
	I[24] = T[24];
	I[26] = T[25];
	I[21] = T[26];
	I[1] = T[27];
	I[27] = T[28];
	I[28] = T[29];
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[30] = { 17, 27, 7, 13, 14, 21, 1, 22, 3, 20, 11, 18, 6, 10, 9, 16, 15, 4, 5, 2, 12, 26, 19, 8, 24, 0, 25, 28, 29, 23 };
// 	if (index < 30) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the reordered position of the value eval() stores at slot i
     * (the same permutation reorder() applies).
     * @param i emission slot; must lie in [0, 30), there is no range check
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    25, 6, 19, 8, 17, 18, 12, 2, 23, 14, 13, 10, 20, 3, 4, 16, 15, 0, 11, 22, 9, 5, 7, 29, 24, 26, 21, 1, 27, 28
	};
	return index[i];
    }

    /**
     * Writes the whole permutation to idx: element k receives the reordered
     * position of emission slot k. idx must have room for 30 elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 25;
	*idx++ = 6;
	*idx++ = 19;
	*idx++ = 8;
	*idx++ = 17;
	*idx++ = 18;
	*idx++ = 12;
	*idx++ = 2;
	*idx++ = 23;
	*idx++ = 14;
	*idx++ = 13;
	*idx++ = 10;
	*idx++ = 20;
	*idx++ = 3;
	*idx++ = 4;
	*idx++ = 16;
	*idx++ = 15;
	*idx++ = 0;
	*idx++ = 11;
	*idx++ = 22;
	*idx++ = 9;
	*idx++ = 5;
	*idx++ = 7;
	*idx++ = 29;
	*idx++ = 24;
	*idx++ = 26;
	*idx++ = 21;
	*idx++ = 1;
	*idx++ = 27;
	*idx++ = 28;
    }


};

template<>
struct impl<meta::braket<rysq::F, rysq::S, rysq::S, rysq::P> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<3> &t2, const vector<3> &W,
    // 		     const double (&C)[1][1],
	      double (&I)[30]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel body.  For each of the M points `a` it derives the recurrence
     * coefficients from t2[a], forms 30 polynomial terms and passes
     * C[0][0]*W[a]*term together with the destination I+k (k = 0..29) to
     * `update`.
     *
     * @tparam M      number of (t2, W) entries processed
     * @tparam U      callable invoked as update(value, destination)
     * @param A, B    scalars entering the recurrence coefficients
     *                (presumably combined exponents - TODO confirm)
     * @param rAi, rBk, rAB  geometry terms, read at indices 0..2
     * @param rij     not referenced by this specialization
     * @param rkl     offsets added to the D coefficients, read at indices 0..2
     * @param t2, W   per-point inputs, read at indices 0..M-1 (presumably
     *                squared roots and weights - TODO confirm)
     * @param C       single coefficient C[0][0] scaling every term
     * @param I       output buffer; slots 0..29 are addressed
     * @param update  policy deciding how a value is merged into its slot
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	// Only the k-l offsets are needed here.
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double t = t2[a];
	    // Prefactor shared by all 30 terms of this point.
	    const double w = (C[0][0])*W[a];

	    const double B00 = 0.5*t;
	    const double B10 = recurrence::coefficient(1.0/A, B, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    const double Kx = (Xkl + Dx);
	    const double Ky = (Ykl + Dy);
	    const double Kz = (Dz + Zkl);

	    // Squares are needed several times; evaluate each once.
	    const double Cx2 = recurrence::pow<2>(Cx);
	    const double Cy2 = recurrence::pow<2>(Cy);
	    const double Cz2 = recurrence::pow<2>(Cz);

	    const double Px = (B10 + Cx2);
	    const double Py = (B10 + Cy2);
	    const double Pz = (B10 + Cz2);

	    const double Fx = (3*B10 + Cx2);
	    const double Fy = (3*B10 + Cy2);
	    const double Fz = (3*B10 + Cz2);

	    const double Gx = (B00 + Cx*Kx);
	    const double Gy = (B00 + Cy*Ky);
	    const double Gz = (B00 + Cz*Kz);

	    const double Hx = (Kx*Px + 2*B00*Cx);
	    const double Hy = (Ky*Py + 2*B00*Cy);
	    const double Hz = (Kz*Pz + 2*B00*Cz);

	    const double B00B10x3 = 3*B00*B10;

	    // Emission order below must stay in step with reorder()/index().
	    update(w*(Cy*Kz*Px), I+0);
	    update(w*(Cz*Ky*Px), I+1);
	    update(w*(Cx*Kz*Py), I+2);
	    update(w*(Cz*Kx*Py), I+3);
	    update(w*(Cy*Kx*Pz), I+4);
	    update(w*(Cx*Ky*Pz), I+5);
	    update(w*(Cx*Ky*Fx), I+6);
	    update(w*(Cx*Kz*Fx), I+7);
	    update(w*(Cx*Cz*Gy), I+8);
	    update(w*(Pz*Gy), I+9);
	    update(w*(Px*Gy), I+10);
	    update(w*(Cz*Ky*Fz), I+11);
	    update(w*(Cz*Kx*Fz), I+12);
	    update(w*(Cy*Kz*Fy), I+13);
	    update(w*(Cy*Kx*Fy), I+14);
	    update(w*(Cx*Hz), I+15);
	    update(w*(Cy*Hz), I+16);
	    update(w*(Cx*Cy*Gz), I+17);
	    update(w*(Px*Gz), I+18);
	    update(w*(Py*Gz), I+19);
	    update(w*((Cx*Kx*Fx + 3*B00*Cx2 + B00B10x3)), I+20);
	    update(w*((3*B00*Cy2 + Cy*Ky*Fy + B00B10x3)), I+21);
	    update(w*((B00B10x3 + 3*B00*Cz2 + Cz*Kz*Fz)), I+22);
	    update(w*(Cy*Hx), I+23);
	    update(w*(Cz*Hx), I+24);
	    update(w*(Cz*Hy), I+25);
	    update(w*(Cx*Hy), I+26);
	    update(w*(Cy*Cz*Gx), I+27);
	    update(w*(Py*Gx), I+28);
	    update(w*(Pz*Gx), I+29);

	}

    }

    /**
     * Permutes the 30 entries of I in place: the value originally stored at
     * position k ends up at position target[k].  A snapshot of I is taken
     * first so that no source value is overwritten before it has been moved.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[30]) {
	const unsigned short target[30] = {
	    23, 14, 25, 6, 8, 17, 10, 20, 19, 18,
	    13, 12, 2, 21, 1, 27, 28, 29, 24, 26,
	    0, 11, 22, 3, 4, 16, 15, 9, 5, 7
	};
	double snapshot[30];
	for (int k = 0; k < 30; ++k) {
	    snapshot[k] = I[k];
	}
	for (int k = 0; k < 30; ++k) {
	    I[target[k]] = snapshot[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[30] = { 20, 14, 12, 23, 24, 28, 3, 29, 4, 27, 6, 21, 11, 10, 1, 26, 25, 5, 9, 8, 7, 13, 22, 0, 18, 2, 19, 15, 16, 17 };
// 	if (index < 30) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Looks up entry i of the 30-element permutation table, i.e. the position
     * that reorder() assigns to the value found at position i.
     * No range check is performed; i must lie in [0, 30).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	    23, 14, 25, 6, 8, 17, 10, 20, 19, 18,
	    13, 12, 2, 21, 1, 27, 28, 29, 24, 26,
	    0, 11, 22, 3, 4, 16, 15, 9, 5, 7
	};
	return permutation[i];
    }

    /**
     * Writes the whole 30-element permutation table to idx[0..29], in the same
     * order as the single-entry index(int) lookup.
     * The caller must provide room for 30 elements of type U.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int permutation[30] = {
	    23, 14, 25, 6, 8, 17, 10, 20, 19, 18,
	    13, 12, 2, 21, 1, 27, 28, 29, 24, 26,
	    0, 11, 22, 3, 4, 16, 15, 9, 5, 7
	};
	for (int k = 0; k < 30; ++k) {
	    idx[k] = permutation[k];
	}
    }


};

/**
 * Quadrature kernel specialization for
 * meta::braket<rysq::F, rysq::P, rysq::S, rysq::S>.
 *
 * eval() produces 30 values per call.  reorder() and index() expose the
 * permutation between the order in which eval() emits those values and the
 * reordered layout (which of the two is canonical for callers is not visible
 * here - TODO confirm).
 */
template<>
struct impl<meta::braket<rysq::F, rysq::P, rysq::S, rysq::S> > {
    typedef void enable;             // marker, presumably consumed by dispatch code elsewhere
    static const bool value = true;  // presumably flags this braket as implemented
    static const int N = 3;          // point count used by the fixed-size eval()

    /** Default accumulation policy: adds the contribution q to the slot at Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: evaluates N points into a 30-element buffer
     * with the default `update` policy.
     * (A disabled earlier signature took vector<3> for every array argument.)
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[30]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer-output overload: evaluates M points into I with the default
     * `update` policy.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel body.  For each of the M points `a` it derives the recurrence
     * coefficients from t2[a], forms 30 polynomial terms and passes
     * C[0][0]*W[a]*term together with the destination I+k (k = 0..29) to
     * `update`.
     *
     * @tparam M      number of (t2, W) entries processed
     * @tparam U      callable invoked as update(value, destination)
     * @param A, B    scalars entering the recurrence coefficients
     *                (presumably combined exponents - TODO confirm)
     * @param rAi, rAB  geometry terms, read at indices 0..2
     * @param rBk, rkl  not referenced by this specialization
     * @param rij     offsets added to the C coefficients, read at indices 0..2
     * @param t2, W   per-point inputs, read at indices 0..M-1 (presumably
     *                squared roots and weights - TODO confirm)
     * @param C       single coefficient C[0][0] scaling every term
     * @param I       output buffer; slots 0..29 are addressed
     * @param update  policy deciding how a value is merged into its slot
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	// Only the i-j offsets are needed here.
	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double t = t2[a];
	    // Prefactor shared by all 30 terms of this point.
	    const double w = (C[0][0])*W[a];

	    const double B10 = recurrence::coefficient(1.0/A, B, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Ix = (Cx + Xij);
	    const double Iy = (Cy + Yij);
	    const double Iz = (Cz + Zij);

	    // Powers are needed several times; evaluate each once.
	    const double Cx2 = recurrence::pow<2>(Cx);
	    const double Cy2 = recurrence::pow<2>(Cy);
	    const double Cz2 = recurrence::pow<2>(Cz);
	    const double Cx3 = recurrence::pow<3>(Cx);
	    const double Cy3 = recurrence::pow<3>(Cy);
	    const double Cz3 = recurrence::pow<3>(Cz);

	    const double Px = (B10 + Cx2);
	    const double Py = (B10 + Cy2);
	    const double Pz = (B10 + Cz2);

	    const double Jx = (Px + Cx*Xij);
	    const double Jy = (Cy*Iy + B10);
	    const double Jz = (Cz*Zij + Pz);

	    const double Fx = (3*B10 + Cx2);
	    const double Fy = (3*B10 + Cy2);
	    const double Fz = (3*B10 + Cz2);

	    const double Lx = (B10*(3*Cx + Xij) + Ix*Cx2);
	    const double Ly = (B10*(3*Cy + Yij) + Iy*Cy2);
	    const double Lz = (Iz*Cz2 + B10*(3*Cz + Zij));

	    const double B10sq3 = 3*recurrence::pow<2>(B10);

	    // Emission order below must stay in step with index().
	    update(w*(Cy*Iz*Px), I+0);
	    update(w*(Cz*Iy*Px), I+1);
	    update(w*(Cx*Iz*Py), I+2);
	    update(w*(Cz*Ix*Py), I+3);
	    update(w*(Cy*Ix*Pz), I+4);
	    update(w*(Cx*Iy*Pz), I+5);
	    update(w*(Cy*Cz*Jx), I+6);
	    update(w*(Py*Jx), I+7);
	    update(w*(Pz*Jx), I+8);
	    update(w*(Cx*Cy*Jz), I+9);
	    update(w*(Px*Jz), I+10);
	    update(w*(Py*Jz), I+11);
	    update(w*(Cx*Iy*Fx), I+12);
	    update(w*(Cx*Iz*Fx), I+13);
	    update(w*(Cz*Ix*Fz), I+14);
	    update(w*(Cz*Iy*Fz), I+15);
	    update(w*(Cy*Ix*Fy), I+16);
	    update(w*(Cy*Iz*Fy), I+17);
	    update(w*(Cx*Lz), I+18);
	    update(w*(Cy*Lz), I+19);
	    update(w*(Cz*Ly), I+20);
	    update(w*(Cx*Ly), I+21);
	    update(w*(Cx*Cz*Jy), I+22);
	    update(w*(Px*Jy), I+23);
	    update(w*(Pz*Jy), I+24);
	    update(w*(Cy*Lx), I+25);
	    update(w*(Cz*Lx), I+26);
	    update(w*((3*B10*Cx*(Xij + 2*Cx) + Ix*Cx3 + B10sq3)), I+27);
	    update(w*((Iy*Cy3 + B10sq3 + 3*B10*Cy*(Yij + 2*Cy))), I+28);
	    update(w*((3*B10*Cz*(2*Cz + Zij) + B10sq3 + Iz*Cz3)), I+29);

	}

    }

    /**
     * Permutes the 30 entries of I in place: the value originally stored at
     * position k ends up at position index(k).  A snapshot is taken first so
     * that no source value is overwritten before it has been moved.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[30]) {
	double snapshot[30];
	for (int k = 0; k < 30; ++k) {
	    snapshot[k] = I[k];
	}
	for (int k = 0; k < 30; ++k) {
	    I[index(k)] = snapshot[k];
	}
    }

    // Disabled alternative kept for reference: size_t reorder(size_t index)
    // returned the inverse mapping (destination -> source), size_t(-1) when
    // index >= 30:
    //   { 27, 16, 14, 25, 26, 7, 3, 8, 4, 6, 12, 28, 15, 23, 1,
    //     21, 20, 5, 24, 22, 13, 17, 29, 0, 10, 2, 11, 18, 19, 9 }

    /**
     * Returns the destination position of the value eval() emits at slot i.
     * No range check is performed; i must lie in [0, 30).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	    23, 14, 25, 6, 8, 17, 9, 5, 7, 29,
	    24, 26, 10, 20, 2, 12, 1, 21, 27, 28,
	    16, 15, 19, 13, 18, 3, 4, 0, 11, 22
	};
	return permutation[i];
    }

    /**
     * Writes the whole permutation table to idx[0..29].
     * The caller must provide room for 30 elements of type U.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 30; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Quadrature kernel specialization for
 * meta::braket<rysq::S, rysq::P, rysq::S, rysq::S>.
 *
 * eval() produces 3 values per call, one each for the x, y and z terms.
 * The permutation exposed through reorder()/index() is the identity for this
 * braket, so reorder() has nothing to do.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::P, rysq::S, rysq::S> > {
    typedef void enable;             // marker, presumably consumed by dispatch code elsewhere
    static const bool value = true;  // presumably flags this braket as implemented
    static const int N = 1;          // point count used by the fixed-size eval()

    /** Default accumulation policy: adds the contribution q to the slot at Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: evaluates N points into a 3-element buffer
     * with the default `update` policy.
     * (A disabled earlier signature took vector<3> for the geometry arguments
     * and vector<1> for t2 and W.)
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[3]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer-output overload: evaluates M points into I with the default
     * `update` policy.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel body.  For each of the M points `a` it derives the C recurrence
     * coefficients from t2[a], shifts them by rij and passes
     * C[0][0]*W[a]*term together with the destination I+k (k = 0..2) to
     * `update`.
     *
     * @tparam M      number of (t2, W) entries processed
     * @tparam U      callable invoked as update(value, destination)
     * @param A       not referenced by this specialization
     * @param B       scalar entering the recurrence coefficients
     * @param rAi, rAB  geometry terms, read at indices 0..2
     * @param rBk, rkl  not referenced by this specialization
     * @param rij     offsets added to the C coefficients, read at indices 0..2
     * @param t2, W   per-point inputs, read at indices 0..M-1 (presumably
     *                squared roots and weights - TODO confirm)
     * @param C       single coefficient C[0][0] scaling every term
     * @param I       output buffer; slots 0..2 are addressed
     * @param update  policy deciding how a value is merged into its slot
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double t = t2[a];
	    // Prefactor shared by the three terms of this point.
	    const double w = (C[0][0])*W[a];

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Ix = (Cx + Xij);
	    const double Iy = (Cy + Yij);
	    const double Iz = (Cz + Zij);

	    update(w*(Ix), I+0);
	    update(w*(Iy), I+1);
	    update(w*(Iz), I+2);

	}

    }

    /**
     * In-place permutation of the 3 outputs.  For this braket every value
     * already sits in its final slot (index(k) == k), so nothing is moved;
     * the previous copy-out/copy-back through a temporary was a no-op.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[3]) {
	(void)I;  // identity permutation: intentionally left untouched
    }

    // Disabled alternative kept for reference: size_t reorder(size_t index)
    // returned the inverse mapping { 0, 1, 2 }, size_t(-1) when index >= 3.

    /**
     * Returns the destination position of the value eval() emits at slot i.
     * No range check is performed; i must lie in [0, 3).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	    0, 1, 2
	};
	return permutation[i];
    }

    /**
     * Writes the whole permutation table to idx[0..2].
     * The caller must provide room for 3 elements of type U.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 3; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Quadrature kernel specialization for
 * meta::braket<rysq::F, rysq::S, rysq::S, rysq::SP>.
 *
 * eval() produces 40 values per call, scaled by one of two coefficients
 * (C[0][0] or C[1][0]).  reorder() and index() expose the permutation
 * between the order in which eval() emits those values and the reordered
 * layout (which of the two is canonical for callers is not visible here -
 * TODO confirm).
 */
template<>
struct impl<meta::braket<rysq::F, rysq::S, rysq::S, rysq::SP> > {
    typedef void enable;             // marker, presumably consumed by dispatch code elsewhere
    static const bool value = true;  // presumably flags this braket as implemented
    static const int N = 3;          // point count used by the fixed-size eval()

    /** Default accumulation policy: adds the contribution q to the slot at Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: evaluates N points into a 40-element buffer
     * with the default `update` policy.
     * (A disabled earlier signature took vector<3> for every array argument.)
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double (&I)[40]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer-output overload: evaluates M points into I with the default
     * `update` policy.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel body.  For each of the M points `a` it derives the recurrence
     * coefficients from t2[a], forms 40 polynomial terms and passes
     * coefficient*W[a]*term together with the destination I+k (k = 0..39)
     * to `update`.  Terms that do not involve a K factor are scaled by
     * C[0][0], all others by C[1][0].
     *
     * @tparam M      number of (t2, W) entries processed
     * @tparam U      callable invoked as update(value, destination)
     * @param A, B    scalars entering the recurrence coefficients
     *                (presumably combined exponents - TODO confirm)
     * @param rAi, rBk, rAB  geometry terms, read at indices 0..2
     * @param rij     not referenced by this specialization
     * @param rkl     offsets added to the D coefficients, read at indices 0..2
     * @param t2, W   per-point inputs, read at indices 0..M-1 (presumably
     *                squared roots and weights - TODO confirm)
     * @param C       coefficients C[0][0] and C[1][0]
     * @param I       output buffer; slots 0..39 are addressed
     * @param update  policy deciding how a value is merged into its slot
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {

	// Only the k-l offsets are needed here.
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double t = t2[a];
	    // The two prefactors used by the terms of this point.
	    const double w0 = (C[0][0])*W[a];
	    const double w1 = (C[1][0])*W[a];

	    const double B00 = 0.5*t;
	    const double B10 = recurrence::coefficient(1.0/A, B, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    const double Kx = (Xkl + Dx);
	    const double Ky = (Ykl + Dy);
	    const double Kz = (Dz + Zkl);

	    // Squares are needed several times; evaluate each once.
	    const double Cx2 = recurrence::pow<2>(Cx);
	    const double Cy2 = recurrence::pow<2>(Cy);
	    const double Cz2 = recurrence::pow<2>(Cz);

	    const double Px = (B10 + Cx2);
	    const double Py = (B10 + Cy2);
	    const double Pz = (B10 + Cz2);

	    const double Fx = (3*B10 + Cx2);
	    const double Fy = (3*B10 + Cy2);
	    const double Fz = (3*B10 + Cz2);

	    const double Gx = (B00 + Cx*Kx);
	    const double Gy = (B00 + Cy*Ky);
	    const double Gz = (B00 + Cz*Kz);

	    const double Hx = (Kx*Px + 2*B00*Cx);
	    const double Hy = (Ky*Py + 2*B00*Cy);
	    const double Hz = (Kz*Pz + 2*B00*Cz);

	    const double B00B10x3 = 3*B00*B10;

	    // Emission order below must stay in step with index().
	    update(w0*(Cx*Cy*Cz), I+0);
	    update(w1*(Cz*Ky*Px), I+1);
	    update(w0*(Cz*Px), I+2);
	    update(w1*(Cy*Kz*Px), I+3);
	    update(w0*(Cy*Px), I+4);
	    update(w1*(Cx*Kz*Py), I+5);
	    update(w1*(Cz*Kx*Py), I+6);
	    update(w0*(Cz*Py), I+7);
	    update(w0*(Cx*Py), I+8);
	    update(w0*(Cy*Pz), I+9);
	    update(w1*(Cy*Kx*Pz), I+10);
	    update(w0*(Cx*Pz), I+11);
	    update(w1*(Cx*Ky*Pz), I+12);
	    update(w1*(Cx*Kz*Fx), I+13);
	    update(w1*(Cx*Ky*Fx), I+14);
	    update(w0*(Cx*Fx), I+15);
	    update(w1*(Cx*Cz*Gy), I+16);
	    update(w1*(Px*Gy), I+17);
	    update(w1*(Pz*Gy), I+18);
	    update(w1*(Cz*Ky*Fz), I+19);
	    update(w1*(Cz*Kx*Fz), I+20);
	    update(w0*(Cz*Fz), I+21);
	    update(w1*(Cy*Kx*Fy), I+22);
	    update(w0*(Cy*Fy), I+23);
	    update(w1*(Cy*Kz*Fy), I+24);
	    update(w1*(Cx*Hz), I+25);
	    update(w1*(Cy*Hz), I+26);
	    update(w1*(Cx*Cy*Gz), I+27);
	    update(w1*(Px*Gz), I+28);
	    update(w1*(Py*Gz), I+29);
	    update(w1*((Cx*Kx*Fx + 3*B00*Cx2 + B00B10x3)), I+30);
	    update(w1*((3*B00*Cy2 + Cy*Ky*Fy + B00B10x3)), I+31);
	    update(w1*((B00B10x3 + 3*B00*Cz2 + Cz*Kz*Fz)), I+32);
	    update(w1*(Cy*Hx), I+33);
	    update(w1*(Cz*Hx), I+34);
	    update(w1*(Cx*Hy), I+35);
	    update(w1*(Cz*Hy), I+36);
	    update(w1*(Cy*Cz*Gx), I+37);
	    update(w1*(Py*Gx), I+38);
	    update(w1*(Pz*Gx), I+39);

	}

    }

    /**
     * Permutes the 40 entries of I in place: the value originally stored at
     * position k ends up at position index(k).  A snapshot is taken first so
     * that no source value is overwritten before it has been moved.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[40]) {
	double snapshot[40];
	for (int k = 0; k < 40; ++k) {
	    snapshot[k] = I[k];
	}
	for (int k = 0; k < 40; ++k) {
	    I[index(k)] = snapshot[k];
	}
    }

    // Disabled alternative kept for reference: size_t reorder(size_t index)
    // returned the inverse mapping (destination -> source), size_t(-1) when
    // index >= 40:
    //   { 15, 23, 21, 4, 2, 8, 7, 11, 9, 0, 30, 22, 20, 33, 34, 38, 6, 39, 10, 37,
    //     14, 31, 19, 17, 1, 35, 36, 12, 18, 16, 13, 24, 32, 3, 28, 5, 29, 25, 26, 27 }

    /**
     * Returns the destination position of the value eval() emits at slot i.
     * No range check is performed; i must lie in [0, 40).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	    9, 24, 4, 33, 3, 35, 16, 6, 5, 8,
	    18, 7, 27, 30, 20, 0, 29, 23, 28, 22,
	    12, 2, 11, 1, 31, 37, 38, 39, 34, 36,
	    10, 21, 32, 13, 14, 25, 26, 19, 15, 17
	};
	return permutation[i];
    }

    /**
     * Writes the whole permutation table to idx[0..39].
     * The caller must provide room for 40 elements of type U.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 40; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Quadrature kernel specialization for
 * meta::braket<rysq::SP, rysq::S, rysq::S, rysq::P>.
 *
 * eval() produces 12 values per call, scaled by one of two coefficients
 * (C[0][0] or C[0][1]).  reorder() and index() expose the permutation
 * between the order in which eval() emits those values and the reordered
 * layout (which of the two is canonical for callers is not visible here -
 * TODO confirm).
 */
template<>
struct impl<meta::braket<rysq::SP, rysq::S, rysq::S, rysq::P> > {
    typedef void enable;             // marker, presumably consumed by dispatch code elsewhere
    static const bool value = true;  // presumably flags this braket as implemented
    static const int N = 2;          // point count used by the fixed-size eval()

    /** Default accumulation policy: adds the contribution q to the slot at Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: evaluates N points into a 12-element buffer
     * with the default `update` policy.
     * (A disabled earlier signature took vector<3> for the geometry arguments
     * and vector<2> for t2 and W.)
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double (&I)[12]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer-output overload: evaluates M points into I with the default
     * `update` policy.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel body.  For each of the M points `a` it derives the recurrence
     * coefficients from t2[a], forms 12 terms and passes
     * coefficient*W[a]*term together with the destination I+k (k = 0..11)
     * to `update`.  Terms consisting of a bare K factor are scaled by
     * C[0][0], all others by C[0][1].
     *
     * @tparam M      number of (t2, W) entries processed
     * @tparam U      callable invoked as update(value, destination)
     * @param A, B    scalars entering the recurrence coefficients
     *                (presumably combined exponents - TODO confirm)
     * @param rAi, rBk, rAB  geometry terms, read at indices 0..2
     * @param rij     not referenced by this specialization
     * @param rkl     offsets added to the D coefficients, read at indices 0..2
     * @param t2, W   per-point inputs, read at indices 0..M-1 (presumably
     *                squared roots and weights - TODO confirm)
     * @param C       coefficients C[0][0] and C[0][1]
     * @param I       output buffer; slots 0..11 are addressed
     * @param update  policy deciding how a value is merged into its slot
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	// Only the k-l offsets are needed here.
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double t = t2[a];
	    // The two prefactors used by the terms of this point.
	    const double w0 = (C[0][0])*W[a];
	    const double w1 = (C[0][1])*W[a];

	    const double B00 = 0.5*t;

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    const double Kx = (Xkl + Dx);
	    const double Ky = (Ykl + Dy);
	    const double Kz = (Dz + Zkl);

	    // Emission order below must stay in step with index().
	    update(w1*((B00 + Cx*Kx)), I+0);
	    update(w1*(Cz*Kx), I+1);
	    update(w1*(Cy*Kx), I+2);
	    update(w0*(Kx), I+3);
	    update(w1*((B00 + Cy*Ky)), I+4);
	    update(w1*(Cz*Ky), I+5);
	    update(w1*(Cx*Ky), I+6);
	    update(w0*(Ky), I+7);
	    update(w1*((B00 + Cz*Kz)), I+8);
	    update(w1*(Cx*Kz), I+9);
	    update(w1*(Cy*Kz), I+10);
	    update(w0*(Kz), I+11);

	}

    }

    /**
     * Permutes the 12 entries of I in place: the value originally stored at
     * position k ends up at position index(k).  A snapshot is taken first so
     * that no source value is overwritten before it has been moved.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[12]) {
	double snapshot[12];
	for (int k = 0; k < 12; ++k) {
	    snapshot[k] = I[k];
	}
	for (int k = 0; k < 12; ++k) {
	    I[index(k)] = snapshot[k];
	}
    }

    // Disabled alternative kept for reference: size_t reorder(size_t index)
    // returned the inverse mapping (destination -> source), size_t(-1) when
    // index >= 12:
    //   { 3, 0, 2, 1, 7, 6, 4, 5, 11, 9, 10, 8 }

    /**
     * Returns the destination position of the value eval() emits at slot i.
     * No range check is performed; i must lie in [0, 12).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	    1, 3, 2, 0, 6, 7, 5, 4, 11, 9, 10, 8
	};
	return permutation[i];
    }

    /**
     * Writes the whole permutation table to idx[0..11].
     * The caller must provide room for 12 elements of type U.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 12; ++k) {
	    idx[k] = index(k);
	}
    }


};

template<>
struct impl<meta::braket<rysq::SP, rysq::SP, rysq::SP, rysq::SP> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /** Default accumulation policy: adds the contribution q to the slot at Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q = *Q + q;
	}
    };

    /**
     * Fixed-size entry point: evaluates 3 points into a 256-element buffer
     * using a default-constructed `update` functor.
     * (A disabled earlier signature took vector<3> for every array argument.)
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[4][4],
	      double (&I)[256]) {
	eval<3>(A, B,
		rAi, rBk, rAB,
		rij, rkl,
		t2, W, C,
		I, update());
    }

    /**
     * Pointer-output convenience overload: evaluates M points into I using a
     * default-constructed `update` functor.  Every argument is passed through
     * unchanged to the overload that takes an explicit update policy.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[4][4],
	      double *I) {
	eval<M>(A, B,
		rAi, rBk, rAB,
		rij, rkl,
		t2, W, C,
		I, update());
    }

    /**
     * Evaluates 256 generated terms for each of M quadrature points and hands
     * every term to the caller-supplied functor.
     *
     * For each a in [0, M) the body computes a set of recurrence coefficients
     * from t2[a] and then performs exactly one call
     * update(value, I + k) for every k in 0..255, where value has the form
     * C[p][q] * W[a] * polynomial(coefficients, rij, rkl).
     *
     * @tparam M     number of loop iterations; t2[a] and W[a] are read for
     *               a = 0 .. M-1.
     * @param A,B    scalars; both 1.0/A and 1.0/B are formed, so neither may
     *               be zero.
     * @param rAi,rBk,rAB  indexable objects read at indices 0, 1 and 2.
     * @param rij,rkl      indexable objects read at indices 0, 1 and 2.
     * @param t2,W   indexable objects read at index a.
     * @param C      4x4 table of prefactors selected per term.
     * @param I      base pointer; offsets 0..255 are passed to update, so it
     *               must address at least 256 doubles if update dereferences
     *               its second argument.
     * @param update callable invoked as update(double, double*).
     *               NOTE(review): presumably accumulates (*Q += q) like the
     *               default `update` functor of the sibling specializations
     *               -- confirm for custom functors.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[4][4],
	      double *I, const U &update) {

	// Cartesian components of rij, named per axis for the generated terms.
	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	// Cartesian components of rkl.
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Scalar recurrence coefficients for this point:
	    //   B10 = 0.5*(1/A)*(1 - B*t2[a]),  B01 = 0.5*(1/B)*(1 - A*t2[a]).
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // Per-axis coefficients: Cq = rAi[q] - B*rAB[q]*t2[a].
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Per-axis coefficients: Dq = rBk[q] + A*rAB[q]*t2[a]
	    // (the helper subtracts B*rAB*t2 and is called with B = -A).
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Within the generated terms pow(x,n) expands to the compile-time
	    // recurrence::pow<n>(x); the macro is removed again after the last term.
#define pow(x,y) recurrence::pow<y>((x))

	    // Frequently used sums/products shared by many terms below.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);
 	    double Px = (B10 + pow(Cx,2));
 	    double Qy = (Cy*Dy + B00);
 	    double Ry = (B01 + pow(Dy,2));

	    // Generated terms: offsets I+0 .. I+255 are each updated once per
	    // iteration. The fNN locals are common subexpressions declared just
	    // before their first use.
	    update((C[0][0])*W[a]*(1), I+0);
	    update((C[0][1])*W[a]*(Cx), I+1);
	    update((C[0][1])*W[a]*(Cy), I+2);
	    update((C[0][1])*W[a]*(Cz), I+3);
	    update((C[1][1])*W[a]*(Cy*Dx), I+4);
	    update((C[1][1])*W[a]*(Cz*Dx), I+5);
	    update((C[1][0])*W[a]*(Dx), I+6);
	    update((C[1][1])*W[a]*(Cz*Dy), I+7);
	    update((C[1][1])*W[a]*(Cx*Dy), I+8);
	    update((C[1][0])*W[a]*(Dy), I+9);
	    update((C[1][1])*W[a]*(Cy*Dz), I+10);
	    update((C[1][1])*W[a]*(Cx*Dz), I+11);
	    update((C[1][0])*W[a]*(Dz), I+12);
	    update((C[1][3])*W[a]*(Dz*(Cx*Ix + B10)), I+13);
	    update((C[1][3])*W[a]*(Dy*(Cx*Ix + B10)), I+14);
	    update((C[0][3])*W[a]*((Cx*Ix + B10)), I+15);
	    update((C[1][2])*W[a]*(Dy*Ix), I+16);
	    update((C[1][3])*W[a]*(Cz*Dy*Ix), I+17);
	    update((C[0][3])*W[a]*(Cz*Ix), I+18);
	    update((C[1][2])*W[a]*(Dz*Ix), I+19);
	    update((C[1][3])*W[a]*(Cy*Dz*Ix), I+20);
	    update((C[0][3])*W[a]*(Cy*Ix), I+21);
	    update((C[0][2])*W[a]*(Ix), I+22);
	    update((C[1][3])*W[a]*(Dz*(Cy*Iy + B10)), I+23);
	    update((C[0][3])*W[a]*((Cy*Iy + B10)), I+24);
	    update((C[1][3])*W[a]*(Dx*(Cy*Iy + B10)), I+25);
	    update((C[1][2])*W[a]*(Dz*Iy), I+26);
	    update((C[1][2])*W[a]*(Dx*Iy), I+27);
	    update((C[1][3])*W[a]*(Cz*Dx*Iy), I+28);
	    update((C[0][3])*W[a]*(Cz*Iy), I+29);
	    update((C[1][3])*W[a]*(Cx*Dz*Iy), I+30);
	    update((C[0][3])*W[a]*(Cx*Iy), I+31);
	    update((C[0][2])*W[a]*(Iy), I+32);
	    update((C[1][3])*W[a]*(Dy*(B10 + Cz*Iz)), I+33);
	    update((C[1][3])*W[a]*(Dx*(B10 + Cz*Iz)), I+34);
	    update((C[0][3])*W[a]*((B10 + Cz*Iz)), I+35);
	    update((C[1][2])*W[a]*(Dy*Iz), I+36);
	    update((C[1][2])*W[a]*(Dx*Iz), I+37);
	    update((C[1][3])*W[a]*(Cy*Dx*Iz), I+38);
	    update((C[0][3])*W[a]*(Cy*Iz), I+39);
	    update((C[1][3])*W[a]*(Cx*Dy*Iz), I+40);
	    update((C[0][3])*W[a]*(Cx*Iz), I+41);
	    update((C[0][2])*W[a]*(Iz), I+42);
	    update((C[3][1])*W[a]*(Cy*Dz*Kx), I+43);
	    update((C[3][2])*W[a]*(Dz*Iy*Kx), I+44);
	    update((C[2][3])*W[a]*(Kx*(B10 + Cz*Iz)), I+45);
	    update((C[3][3])*W[a]*(Dy*Kx*(B10 + Cz*Iz)), I+46);
	    update((C[2][3])*W[a]*(Kx*(Cy*Iy + B10)), I+47);
	    update((C[3][3])*W[a]*(Dz*Kx*(Cy*Iy + B10)), I+48);
	    update((C[2][3])*W[a]*(Cy*Iz*Kx), I+49);
	    update((C[2][1])*W[a]*(Cy*Kx), I+50);
	    update((C[2][2])*W[a]*(Iy*Kx), I+51);
	    update((C[2][2])*W[a]*(Iz*Kx), I+52);
	    update((C[3][2])*W[a]*(Dy*Iz*Kx), I+53);
	    update((C[3][0])*W[a]*(Dy*Kx), I+54);
	    update((C[3][0])*W[a]*(Dz*Kx), I+55);
	    update((C[2][3])*W[a]*(Cz*Iy*Kx), I+56);
	    update((C[2][1])*W[a]*(Cz*Kx), I+57);
	    update((C[3][1])*W[a]*(Cz*Dy*Kx), I+58);
	    update((C[2][0])*W[a]*(Kx), I+59);
	    update((C[2][3])*W[a]*(Ky*(B10 + Cz*Iz)), I+60);
	    update((C[3][3])*W[a]*(Dx*Ky*(B10 + Cz*Iz)), I+61);
	    update((C[2][3])*W[a]*(Ky*(Cx*Ix + B10)), I+62);
	    update((C[3][3])*W[a]*(Dz*Ky*(Cx*Ix + B10)), I+63);
	    update((C[3][2])*W[a]*(Dx*Iz*Ky), I+64);
	    update((C[2][2])*W[a]*(Iz*Ky), I+65);
	    update((C[3][0])*W[a]*(Dx*Ky), I+66);
	    update((C[2][2])*W[a]*(Ix*Ky), I+67);
	    update((C[2][3])*W[a]*(Cz*Ix*Ky), I+68);
	    update((C[2][1])*W[a]*(Cz*Ky), I+69);
	    update((C[3][1])*W[a]*(Cz*Dx*Ky), I+70);
	    update((C[3][2])*W[a]*(Dz*Ix*Ky), I+71);
	    update((C[3][0])*W[a]*(Dz*Ky), I+72);
	    update((C[2][3])*W[a]*(Cx*Iz*Ky), I+73);
	    update((C[2][1])*W[a]*(Cx*Ky), I+74);
	    update((C[3][1])*W[a]*(Cx*Dz*Ky), I+75);
	    update((C[2][0])*W[a]*(Ky), I+76);
	    update((C[2][3])*W[a]*(Kz*(Cx*Ix + B10)), I+77);
	    update((C[3][3])*W[a]*(Dy*Kz*(Cx*Ix + B10)), I+78);
	    update((C[2][3])*W[a]*(Kz*(Cy*Iy + B10)), I+79);
	    update((C[3][3])*W[a]*(Dx*Kz*(Cy*Iy + B10)), I+80);
	    update((C[3][1])*W[a]*(Cy*Dx*Kz), I+81);
	    update((C[2][3])*W[a]*(Cx*Iy*Kz), I+82);
	    update((C[3][2])*W[a]*(Dx*Iy*Kz), I+83);
	    update((C[3][2])*W[a]*(Dy*Ix*Kz), I+84);
	    update((C[2][3])*W[a]*(Cy*Ix*Kz), I+85);
	    update((C[2][1])*W[a]*(Cy*Kz), I+86);
	    update((C[2][2])*W[a]*(Ix*Kz), I+87);
	    update((C[2][2])*W[a]*(Iy*Kz), I+88);
	    update((C[3][0])*W[a]*(Dx*Kz), I+89);
	    update((C[3][0])*W[a]*(Dy*Kz), I+90);
	    update((C[2][1])*W[a]*(Cx*Kz), I+91);
	    update((C[3][1])*W[a]*(Cx*Dy*Kz), I+92);
	    update((C[2][0])*W[a]*(Kz), I+93);
	    update((C[3][3])*W[a]*(Iz*Kx*Qy), I+94);
	    update((C[3][1])*W[a]*(Kx*Qy), I+95);
	    update((C[1][3])*W[a]*(Iz*Qy), I+96);
	    update((C[1][3])*W[a]*(Ix*Qy), I+97);
	    update((C[3][3])*W[a]*(Ix*Kz*Qy), I+98);
	    update((C[3][1])*W[a]*(Kz*Qy), I+99);
	    update((C[1][1])*W[a]*(Qy), I+100);
	    double f13 = (Dz*Kz + B01);
	    update((C[3][3])*W[a]*(f13*(Cx*Ix + B10)), I+101);
	    update((C[3][3])*W[a]*(f13*(Cy*Iy + B10)), I+102);
	    update((C[3][3])*W[a]*(Cy*Ix*f13), I+103);
	    update((C[3][3])*W[a]*(Cx*Iy*f13), I+104);
	    update((C[3][1])*W[a]*(Cx*f13), I+105);
	    update((C[3][1])*W[a]*(Cy*f13), I+106);
	    update((C[3][2])*W[a]*(Ix*f13), I+107);
	    update((C[3][2])*W[a]*(Iy*f13), I+108);
	    update((C[3][0])*W[a]*(f13), I+109);
	    double f20 = Cz*Dz;
	    update((C[3][3])*W[a]*(Iy*Kx*(f20 + B00)), I+110);
	    update((C[3][1])*W[a]*(Kx*(f20 + B00)), I+111);
	    update((C[1][3])*W[a]*(Ix*(f20 + B00)), I+112);
	    update((C[3][3])*W[a]*(Ix*Ky*(f20 + B00)), I+113);
	    update((C[3][1])*W[a]*(Ky*(f20 + B00)), I+114);
	    update((C[1][3])*W[a]*(Iy*(f20 + B00)), I+115);
	    update((C[1][1])*W[a]*((f20 + B00)), I+116);
	    double f26 = Cz*Zkl;
	    update((C[3][1])*W[a]*(Dx*(f20 + f26 + B00)), I+117);
	    update((C[3][3])*W[a]*(Dx*Iy*(f20 + f26 + B00)), I+118);
	    update((C[2][3])*W[a]*(Iy*(f20 + f26 + B00)), I+119);
	    update((C[3][1])*W[a]*(Dy*(f20 + f26 + B00)), I+120);
	    update((C[3][3])*W[a]*(Dy*Ix*(f20 + f26 + B00)), I+121);
	    update((C[2][3])*W[a]*(Ix*(f20 + f26 + B00)), I+122);
	    update((C[2][1])*W[a]*((f20 + f26 + B00)), I+123);
	    double f33 = Dy*Yij;
	    update((C[3][3])*W[a]*((f33 + Qy)*(f20 + f26 + B00)), I+124);
	    update((C[3][3])*W[a]*(Cz*Kx*(f33 + Qy)), I+125);
	    update((C[1][3])*W[a]*(Cz*(f33 + Qy)), I+126);
	    update((C[3][2])*W[a]*(Kz*(f33 + Qy)), I+127);
	    update((C[3][3])*W[a]*(Cx*Kz*(f33 + Qy)), I+128);
	    update((C[1][3])*W[a]*(Cx*(f33 + Qy)), I+129);
	    update((C[1][2])*W[a]*((f33 + Qy)), I+130);
	    update((C[3][2])*W[a]*(Kx*(f33 + Qy)), I+131);
	    double f38 = Cx*Xkl;
	    double f4 = Cx*Dx;
	    update((C[3][3])*W[a]*(Dz*Iy*(f38 + B00 + f4)), I+132);
	    update((C[3][3])*W[a]*((f33 + Qy)*(f38 + B00 + f4)), I+133);
	    update((C[3][3])*W[a]*(Dy*Iz*(f38 + B00 + f4)), I+134);
	    update((C[3][1])*W[a]*(Dy*(f38 + B00 + f4)), I+135);
	    update((C[3][1])*W[a]*(Dz*(f38 + B00 + f4)), I+136);
	    update((C[3][1])*W[a]*(Ky*(B00 + f4)), I+137);
	    update((C[3][3])*W[a]*(Iz*Ky*(B00 + f4)), I+138);
	    update((C[1][3])*W[a]*(Iz*(B00 + f4)), I+139);
	    update((C[2][3])*W[a]*(Iz*(f38 + B00 + f4)), I+140);
	    update((C[2][1])*W[a]*((f38 + B00 + f4)), I+141);
	    update((C[2][3])*W[a]*(Iy*(f38 + B00 + f4)), I+142);
	    update((C[1][3])*W[a]*(Iy*(B00 + f4)), I+143);
	    update((C[3][3])*W[a]*(Iy*Kz*(B00 + f4)), I+144);
	    update((C[3][1])*W[a]*(Kz*(B00 + f4)), I+145);
	    update((C[1][1])*W[a]*((B00 + f4)), I+146);
	    double f48 = (B01 + Dx*Kx);
	    update((C[3][3])*W[a]*(Cz*Iy*f48), I+147);
	    update((C[3][3])*W[a]*(f48*(B10 + Cz*Iz)), I+148);
	    update((C[3][3])*W[a]*(f48*(Cy*Iy + B10)), I+149);
	    update((C[3][3])*W[a]*(Cy*Iz*f48), I+150);
	    update((C[3][1])*W[a]*(Cy*f48), I+151);
	    update((C[3][1])*W[a]*(Cz*f48), I+152);
	    update((C[3][2])*W[a]*(Iy*f48), I+153);
	    update((C[3][2])*W[a]*(Iz*f48), I+154);
	    update((C[3][0])*W[a]*(f48), I+155);
	    double f49 = Dz*Zij;
	    update((C[3][3])*W[a]*(Qy*(f49 + f20 + f26 + Zij*Zkl + B00)), I+156);
	    update((C[3][3])*W[a]*((B00 + f4)*(f49 + f20 + f26 + Zij*Zkl + B00)), I+157);
	    update((C[3][2])*W[a]*(Dx*(f49 + f20 + f26 + Zij*Zkl + B00)), I+158);
	    update((C[3][3])*W[a]*(Cy*Dx*(f49 + f20 + f26 + Zij*Zkl + B00)), I+159);
	    update((C[2][3])*W[a]*(Cy*(f49 + f20 + f26 + Zij*Zkl + B00)), I+160);
	    update((C[3][2])*W[a]*(Dy*(f49 + f20 + f26 + Zij*Zkl + B00)), I+161);
	    update((C[3][3])*W[a]*(Cx*Dy*(f49 + f20 + f26 + Zij*Zkl + B00)), I+162);
	    update((C[2][3])*W[a]*(Cx*(f49 + f20 + f26 + Zij*Zkl + B00)), I+163);
	    update((C[2][2])*W[a]*((f49 + f20 + f26 + Zij*Zkl + B00)), I+164);
	    update((C[3][3])*W[a]*((f38 + B00 + f4)*(f49 + f20 + B00)), I+165);
	    update((C[3][2])*W[a]*(Kx*(f49 + f20 + B00)), I+166);
	    update((C[3][3])*W[a]*(Cy*Kx*(f49 + f20 + B00)), I+167);
	    update((C[1][3])*W[a]*(Cy*(f49 + f20 + B00)), I+168);
	    update((C[3][2])*W[a]*(Ky*(f49 + f20 + B00)), I+169);
	    update((C[3][3])*W[a]*(Cx*Ky*(f49 + f20 + B00)), I+170);
	    update((C[1][3])*W[a]*(Cx*(f49 + f20 + B00)), I+171);
	    update((C[1][2])*W[a]*((f49 + f20 + B00)), I+172);
	    double f51 = Cy*Ykl;
	    update((C[3][3])*W[a]*((f20 + B00)*(f51 + f33 + Yij*Ykl + Qy)), I+173);
	    update((C[3][3])*W[a]*((B00 + f4)*(f51 + f33 + Yij*Ykl + Qy)), I+174);
	    update((C[3][2])*W[a]*(Dx*(f51 + f33 + Yij*Ykl + Qy)), I+175);
	    update((C[3][3])*W[a]*(Cz*Dx*(f51 + f33 + Yij*Ykl + Qy)), I+176);
	    update((C[2][3])*W[a]*(Cz*(f51 + f33 + Yij*Ykl + Qy)), I+177);
	    update((C[3][2])*W[a]*(Dz*(f51 + f33 + Yij*Ykl + Qy)), I+178);
	    update((C[3][3])*W[a]*(Cx*Dz*(f51 + f33 + Yij*Ykl + Qy)), I+179);
	    update((C[2][3])*W[a]*(Cx*(f51 + f33 + Yij*Ykl + Qy)), I+180);
	    update((C[2][2])*W[a]*((f51 + f33 + Yij*Ykl + Qy)), I+181);
	    update((C[3][3])*W[a]*((f51 + Qy)*(f49 + f20 + B00)), I+182);
	    update((C[3][3])*W[a]*(Dz*Ix*(f51 + Qy)), I+183);
	    update((C[3][1])*W[a]*(Dz*(f51 + Qy)), I+184);
	    update((C[3][3])*W[a]*(Dx*Iz*(f51 + Qy)), I+185);
	    update((C[3][1])*W[a]*(Dx*(f51 + Qy)), I+186);
	    update((C[2][3])*W[a]*(Iz*(f51 + Qy)), I+187);
	    update((C[2][1])*W[a]*((f51 + Qy)), I+188);
	    update((C[2][3])*W[a]*(Ix*(f51 + Qy)), I+189);
	    double f54 = 2*B00*Dz;
	    double f21 = Cz*pow(Dz,2);
	    double f3 = B00*Zkl;
	    update((C[3][3])*W[a]*(Cy*(f54 + f21 + Zij*pow(Dz,2) + f3 + Zkl*(f49 + f20) + B01*Iz)), I+190);
	    update((C[3][3])*W[a]*(Cx*(f54 + f21 + Zij*pow(Dz,2) + f3 + Zkl*(f49 + f20) + B01*Iz)), I+191);
	    update((C[3][2])*W[a]*((f54 + f21 + Zij*pow(Dz,2) + f3 + Zkl*(f49 + f20) + B01*Iz)), I+192);
	    double f56 = (B01 + Dy*Ky);
	    update((C[3][3])*W[a]*(Cz*Ix*f56), I+193);
	    update((C[3][3])*W[a]*(f56*(B10 + Cz*Iz)), I+194);
	    update((C[3][3])*W[a]*(f56*(Cx*Ix + B10)), I+195);
	    update((C[3][3])*W[a]*(Cx*Iz*f56), I+196);
	    update((C[3][1])*W[a]*(Cx*f56), I+197);
	    update((C[3][1])*W[a]*(Cz*f56), I+198);
	    update((C[3][2])*W[a]*(Ix*f56), I+199);
	    update((C[3][2])*W[a]*(Iz*f56), I+200);
	    update((C[3][0])*W[a]*(f56), I+201);
	    double f12 = Cy*Dy;
	    double f59 = Dy*pow(Cy,2);
	    update((C[3][3])*W[a]*(Dz*(f59 + B00*(Yij + 2*Cy) + B10*Ky + Yij*(f12 + f51) + Ykl*pow(Cy,2))), I+202);
	    update((C[3][3])*W[a]*(Dx*(f59 + B00*(Yij + 2*Cy) + B10*Ky + Yij*(f12 + f51) + Ykl*pow(Cy,2))), I+203);
	    update((C[2][3])*W[a]*((f59 + B00*(Yij + 2*Cy) + B10*Ky + Yij*(f12 + f51) + Ykl*pow(Cy,2))), I+204);
	    double f45 = B10*Dy;
	    double f41 = 2*B00*Cy;
	    update((C[3][3])*W[a]*(Kz*(f41 + f45 + f59 + Yij*(f12 + B00))), I+205);
	    update((C[3][3])*W[a]*(Kx*(f41 + f45 + f59 + Yij*(f12 + B00))), I+206);
	    update((C[1][3])*W[a]*((f41 + f45 + f59 + Yij*(f12 + B00))), I+207);
	    double f6 = 2*B00*Cz;
	    double f10 = Cz*Dz*Zij;
	    double f46 = Dz*pow(Cz,2);
	    update((C[3][3])*W[a]*(Dy*(Zkl*pow(Cz,2) + f10 + f46 + Zij*(f26 + B00) + B10*Kz + f6)), I+208);
	    update((C[3][3])*W[a]*(Dx*(Zkl*pow(Cz,2) + f10 + f46 + Zij*(f26 + B00) + B10*Kz + f6)), I+209);
	    update((C[2][3])*W[a]*((Zkl*pow(Cz,2) + f10 + f46 + Zij*(f26 + B00) + B10*Kz + f6)), I+210);
	    double f63 = Cx*pow(Dx,2);
	    double f58 = B01*Cx;
	    double f17 = 2*B00*Dx;
	    update((C[3][3])*W[a]*(Iy*(f17 + f63 + f58 + Xkl*(B00 + f4))), I+211);
	    update((C[3][3])*W[a]*(Iz*(f17 + f63 + f58 + Xkl*(B00 + f4))), I+212);
	    update((C[3][1])*W[a]*((f17 + f63 + f58 + Xkl*(B00 + f4))), I+213);
	    double f55 = B10*Dz;
	    double f64 = B00*Zij;
	    update((C[3][3])*W[a]*(Ky*(f10 + f64 + f46 + f55 + f6)), I+214);
	    update((C[3][3])*W[a]*(Kx*(f10 + f64 + f46 + f55 + f6)), I+215);
	    update((C[1][3])*W[a]*((f10 + f64 + f46 + f55 + f6)), I+216);
	    double f68 = Cy*pow(Dy,2);
	    double f5 = B01*B10;
	    double f35 = B01*Cy;
	    double f11 = B00*Ykl;
	    update((C[3][3])*W[a]*((Cy*(f68 + 2*f11) + 2*B00*(f33 + B00 + 2*f12) + Yij*(f11 + f68 + f35) + B01*pow(Cy,2) + f12*Yij*Ykl + f59*Ykl + f5 + f45*(Ykl + Dy))), I+217);
	    double f44 = 2*B00*Dy;
	    update((C[3][3])*W[a]*(Cz*(f11 + f68 + f44 + f35 + Ry*Yij + Ykl*(f12 + f33))), I+218);
	    update((C[3][3])*W[a]*(Cx*(f11 + f68 + f44 + f35 + Ry*Yij + Ykl*(f12 + f33))), I+219);
	    update((C[3][2])*W[a]*((f11 + f68 + f44 + f35 + Ry*Yij + Ykl*(f12 + f33))), I+220);
	    update((C[3][3])*W[a]*(Iz*(f11 + f68 + f44 + f35 + f12*Ykl)), I+221);
	    update((C[3][3])*W[a]*(Ix*(f11 + f68 + f44 + f35 + f12*Ykl)), I+222);
	    update((C[3][1])*W[a]*((f11 + f68 + f44 + f35 + f12*Ykl)), I+223);
	    double f1 = Cx*Dx*Xij;
	    double f2 = B00*Xij;
	    double f34 = B10*Dx;
	    double f7 = Dx*pow(Cx,2);
	    update((C[3][3])*W[a]*((f58*(Cx + Xij) + f1*Xkl + f5 + f63*Xij + Dx*(2*f2 + Dx*Px) + Xkl*(f34 + f2 + f7) + 2*B00*(f38 + B00 + 2*f4))), I+224);
	    double f67 = 2*B00*Cx;
	    update((C[3][3])*W[a]*(Dz*(f67 + f34 + Px*Xkl + f1 + f2 + f7 + f38*Xij)), I+225);
	    update((C[3][3])*W[a]*(Dy*(f67 + f34 + Px*Xkl + f1 + f2 + f7 + f38*Xij)), I+226);
	    update((C[2][3])*W[a]*((f67 + f34 + Px*Xkl + f1 + f2 + f7 + f38*Xij)), I+227);
	    update((C[3][3])*W[a]*(Ky*(f67 + f34 + f1 + f2 + f7)), I+228);
	    update((C[3][3])*W[a]*(Kz*(f67 + f34 + f1 + f2 + f7)), I+229);
	    update((C[1][3])*W[a]*((f67 + f34 + f1 + f2 + f7)), I+230);
	    double f76 = Dx*Xij;
	    update((C[3][3])*W[a]*(Cy*(Xij*pow(Dx,2) + f17 + f63 + B01*Ix + Xkl*(f76 + B00 + f4))), I+231);
	    update((C[3][3])*W[a]*(Cz*(Xij*pow(Dx,2) + f17 + f63 + B01*Ix + Xkl*(f76 + B00 + f4))), I+232);
	    update((C[3][2])*W[a]*((Xij*pow(Dx,2) + f17 + f63 + B01*Ix + Xkl*(f76 + B00 + f4))), I+233);
	    update((C[3][3])*W[a]*((f20 + B00)*(f76 + f38 + Xij*Xkl + B00 + f4)), I+234);
	    update((C[3][3])*W[a]*(Qy*(f76 + f38 + Xij*Xkl + B00 + f4)), I+235);
	    update((C[3][2])*W[a]*(Dy*(f76 + f38 + Xij*Xkl + B00 + f4)), I+236);
	    update((C[3][3])*W[a]*(Cz*Dy*(f76 + f38 + Xij*Xkl + B00 + f4)), I+237);
	    update((C[2][3])*W[a]*(Cz*(f76 + f38 + Xij*Xkl + B00 + f4)), I+238);
	    update((C[3][3])*W[a]*(Cy*Dz*(f76 + f38 + Xij*Xkl + B00 + f4)), I+239);
	    update((C[2][3])*W[a]*(Cy*(f76 + f38 + Xij*Xkl + B00 + f4)), I+240);
	    update((C[2][2])*W[a]*((f76 + f38 + Xij*Xkl + B00 + f4)), I+241);
	    update((C[3][2])*W[a]*(Dz*(f76 + f38 + Xij*Xkl + B00 + f4)), I+242);
	    update((C[3][3])*W[a]*((f20 + f26 + B00)*(f76 + B00 + f4)), I+243);
	    update((C[3][3])*W[a]*((f51 + Qy)*(f76 + B00 + f4)), I+244);
	    update((C[1][3])*W[a]*(Cz*(f76 + B00 + f4)), I+245);
	    update((C[3][3])*W[a]*(Cz*Ky*(f76 + B00 + f4)), I+246);
	    update((C[3][2])*W[a]*(Ky*(f76 + B00 + f4)), I+247);
	    update((C[3][2])*W[a]*(Kz*(f76 + B00 + f4)), I+248);
	    update((C[3][3])*W[a]*(Cy*Kz*(f76 + B00 + f4)), I+249);
	    update((C[1][3])*W[a]*(Cy*(f76 + B00 + f4)), I+250);
	    update((C[1][2])*W[a]*((f76 + B00 + f4)), I+251);
	    double f77 = B01*Cz;
	    update((C[3][3])*W[a]*((2*B00*(f49 + B00) + Zkl*(f10 + f46) + f3*Zij + f21*Zij + f5 + 2*Cz*f3 + pow(Cz,2)*pow(Dz,2) + 2*Dz*f6 + f77*(Cz + Zij) + f55*(Dz + Zkl))), I+252);
	    update((C[3][3])*W[a]*(Iy*(f77 + f54 + f21 + f3 + f20*Zkl)), I+253);
	    update((C[3][3])*W[a]*(Ix*(f77 + f54 + f21 + f3 + f20*Zkl)), I+254);
	    update((C[3][1])*W[a]*((f77 + f54 + f21 + f3 + f20*Zkl)), I+255);
#undef pow

	}

    }

    /**
     * Permutes the 256 values of I in place.
     *
     * The value found at position k on entry is stored at position
     * destination[k] on return. A snapshot of I is taken first so that every
     * store reads the original contents.
     *
     * @param I  array of 256 values, rearranged in place.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[256]) {
	// destination[k]: where the entry-time value of I[k] ends up.
	// Sixteen source positions per row.
	const unsigned short destination[256] = {
	    0, 1, 2, 3, 18, 19, 16, 35, 33, 32, 50, 49, 48, 53, 37, 5,
	    36, 39, 7, 52, 54, 6, 4, 58, 10, 26, 56, 24, 27, 11, 57, 9,
	    8, 47, 31, 15, 44, 28, 30, 14, 45, 13, 12, 114, 120, 79, 111, 74,
	    122, 78, 66, 72, 76, 108, 96, 112, 75, 67, 99, 64, 143, 159, 133, 181,
	    156, 140, 144, 132, 135, 131, 147, 180, 176, 141, 129, 177, 128, 197, 229, 202,
	    218, 210, 201, 216, 228, 198, 194, 196, 200, 208, 224, 193, 225, 192, 110, 98,
	    46, 38, 230, 226, 34, 245, 250, 246, 249, 241, 242, 244, 248, 240, 123, 115,
	    55, 183, 179, 59, 51, 211, 219, 203, 227, 231, 199, 195, 235, 107, 43, 232,
	    233, 41, 40, 104, 121, 105, 109, 97, 113, 145, 157, 29, 77, 65, 73, 25,
	    217, 209, 17, 91, 95, 90, 94, 82, 83, 88, 92, 80, 238, 221, 220, 222,
	    206, 236, 237, 205, 204, 125, 124, 126, 62, 188, 189, 61, 60, 187, 153, 152,
	    155, 139, 184, 185, 137, 136, 190, 182, 178, 158, 146, 142, 130, 134, 254, 253,
	    252, 167, 175, 165, 173, 161, 163, 164, 172, 160, 186, 154, 138, 234, 106, 42,
	    239, 223, 207, 89, 93, 81, 191, 127, 63, 170, 171, 169, 168, 174, 166, 162,
	    85, 117, 101, 69, 149, 213, 21, 86, 87, 84, 119, 102, 100, 103, 71, 118,
	    70, 68, 116, 215, 150, 23, 151, 148, 212, 214, 22, 20, 255, 251, 247, 243
	};

	// Snapshot of the incoming values.
	double saved[256];
	for (int k = 0; k < 256; ++k) {
	    saved[k] = I[k];
	}

	// Scatter in ascending source order, the same order as the
	// original sequence of assignments.
	for (int k = 0; k < 256; ++k) {
	    I[destination[k]] = saved[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[256] = { 0, 1, 2, 3, 22, 15, 21, 18, 32, 31, 24, 29, 42, 41, 39, 35, 6, 146, 4, 5, 251, 230, 250, 245, 27, 143, 25, 28, 37, 139, 38, 34, 9, 8, 100, 7, 16, 14, 97, 17, 130, 129, 207, 126, 36, 40, 96, 33, 12, 11, 10, 116, 19, 13, 20, 112, 26, 30, 23, 115, 172, 171, 168, 216, 59, 141, 50, 57, 241, 227, 240, 238, 51, 142, 47, 56, 52, 140, 49, 45, 155, 213, 151, 152, 233, 224, 231, 232, 153, 211, 149, 147, 154, 212, 150, 148, 54, 135, 95, 58, 236, 226, 235, 237, 131, 133, 206, 125, 53, 134, 94, 46, 55, 136, 43, 111, 242, 225, 239, 234, 44, 132, 48, 110, 166, 165, 167, 215, 76, 74, 188, 69, 67, 62, 189, 68, 181, 180, 204, 177, 65, 73, 187, 60, 66, 137, 186, 70, 247, 228, 244, 246, 175, 174, 203, 176, 64, 138, 185, 61, 201, 197, 223, 198, 199, 195, 222, 193, 220, 219, 217, 218, 200, 196, 221, 194, 72, 75, 184, 114, 71, 63, 183, 113, 178, 179, 202, 173, 169, 170, 182, 214, 93, 91, 86, 123, 87, 77, 85, 122, 88, 82, 79, 119, 164, 163, 160, 210, 89, 145, 81, 117, 248, 229, 249, 243, 83, 144, 80, 118, 158, 157, 159, 209, 90, 92, 99, 120, 84, 78, 98, 121, 127, 128, 205, 124, 161, 162, 156, 208, 109, 105, 106, 255, 107, 101, 103, 254, 108, 104, 102, 253, 192, 191, 190, 252 };
// 	if (index < 256) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the table entry for position i.
     *
     * The table holds the same sequence of destinations that reorder() uses
     * for source positions 0..255.
     *
     * @param i  position to look up; no range check is performed, so the
     *           caller must pass a value in [0, 255].
     * @return   the table entry at i.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	// Sixteen entries per row; row r covers positions 16*r .. 16*r + 15.
	const unsigned short destination[] = {
	    0, 1, 2, 3, 18, 19, 16, 35, 33, 32, 50, 49, 48, 53, 37, 5,
	    36, 39, 7, 52, 54, 6, 4, 58, 10, 26, 56, 24, 27, 11, 57, 9,
	    8, 47, 31, 15, 44, 28, 30, 14, 45, 13, 12, 114, 120, 79, 111, 74,
	    122, 78, 66, 72, 76, 108, 96, 112, 75, 67, 99, 64, 143, 159, 133, 181,
	    156, 140, 144, 132, 135, 131, 147, 180, 176, 141, 129, 177, 128, 197, 229, 202,
	    218, 210, 201, 216, 228, 198, 194, 196, 200, 208, 224, 193, 225, 192, 110, 98,
	    46, 38, 230, 226, 34, 245, 250, 246, 249, 241, 242, 244, 248, 240, 123, 115,
	    55, 183, 179, 59, 51, 211, 219, 203, 227, 231, 199, 195, 235, 107, 43, 232,
	    233, 41, 40, 104, 121, 105, 109, 97, 113, 145, 157, 29, 77, 65, 73, 25,
	    217, 209, 17, 91, 95, 90, 94, 82, 83, 88, 92, 80, 238, 221, 220, 222,
	    206, 236, 237, 205, 204, 125, 124, 126, 62, 188, 189, 61, 60, 187, 153, 152,
	    155, 139, 184, 185, 137, 136, 190, 182, 178, 158, 146, 142, 130, 134, 254, 253,
	    252, 167, 175, 165, 173, 161, 163, 164, 172, 160, 186, 154, 138, 234, 106, 42,
	    239, 223, 207, 89, 93, 81, 191, 127, 63, 170, 171, 169, 168, 174, 166, 162,
	    85, 117, 101, 69, 149, 213, 21, 86, 87, 84, 119, 102, 100, 103, 71, 118,
	    70, 68, 116, 215, 150, 23, 151, 148, 212, 214, 22, 20, 255, 251, 247, 243
	};
	return destination[i];
    }

    /**
     * Writes the full 256-entry table to idx[0] .. idx[255].
     *
     * The values written are the same sequence returned by index(int) for
     * positions 0..255.
     *
     * @tparam U   element type of the output; each entry is assigned from an
     *             int value.
     * @param idx  output buffer with room for at least 256 elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	// Kept as int so each store assigns an int, as the literals did.
	// Sixteen entries per row.
	const int destination[] = {
	    0, 1, 2, 3, 18, 19, 16, 35, 33, 32, 50, 49, 48, 53, 37, 5,
	    36, 39, 7, 52, 54, 6, 4, 58, 10, 26, 56, 24, 27, 11, 57, 9,
	    8, 47, 31, 15, 44, 28, 30, 14, 45, 13, 12, 114, 120, 79, 111, 74,
	    122, 78, 66, 72, 76, 108, 96, 112, 75, 67, 99, 64, 143, 159, 133, 181,
	    156, 140, 144, 132, 135, 131, 147, 180, 176, 141, 129, 177, 128, 197, 229, 202,
	    218, 210, 201, 216, 228, 198, 194, 196, 200, 208, 224, 193, 225, 192, 110, 98,
	    46, 38, 230, 226, 34, 245, 250, 246, 249, 241, 242, 244, 248, 240, 123, 115,
	    55, 183, 179, 59, 51, 211, 219, 203, 227, 231, 199, 195, 235, 107, 43, 232,
	    233, 41, 40, 104, 121, 105, 109, 97, 113, 145, 157, 29, 77, 65, 73, 25,
	    217, 209, 17, 91, 95, 90, 94, 82, 83, 88, 92, 80, 238, 221, 220, 222,
	    206, 236, 237, 205, 204, 125, 124, 126, 62, 188, 189, 61, 60, 187, 153, 152,
	    155, 139, 184, 185, 137, 136, 190, 182, 178, 158, 146, 142, 130, 134, 254, 253,
	    252, 167, 175, 165, 173, 161, 163, 164, 172, 160, 186, 154, 138, 234, 106, 42,
	    239, 223, 207, 89, 93, 81, 191, 127, 63, 170, 171, 169, 168, 174, 166, 162,
	    85, 117, 101, 69, 149, 213, 21, 86, 87, 84, 119, 102, 100, 103, 71, 118,
	    70, 68, 116, 215, 150, 23, 151, 148, 212, 214, 22, 20, 255, 251, 247, 243
	};
	const int count = sizeof(destination)/sizeof(destination[0]);

	// Ascending order, one store per entry.
	for (int k = 0; k < count; ++k) {
	    idx[k] = destination[k];
	}
    }


};

/**
 * Generated kernel for meta::braket<SP, S, S, SP>.
 *
 * eval() produces 16 terms per quadrature point; reorder() and the index()
 * overloads expose the fixed permutation between the order in which eval()
 * emits the terms and the reordered layout.
 */
template<>
struct impl<meta::braket<rysq::SP, rysq::S, rysq::S, rysq::SP> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default term handler: adds the term q to the slot Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: evaluates two quadrature points and
     * accumulates the 16 terms into I with the default update functor.
     */
    template<
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2],
	      double (&I)[16]) {
	update accumulate;
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Pointer-based entry point for M quadrature points, using the default
     * update functor.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2],
	      double *I) {
	update accumulate;
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel: for each a in [0, M) computes 16 terms and passes term k
     * to update(term, I + k).
     *
     * @param A,B          scalars fed to the recurrence coefficients.
     * @param rAi,rBk,rAB  indexable objects read at indices 0..2.
     * @param rij          not read by this specialization.
     * @param rkl          indexable object read at indices 0..2.
     * @param t2,W         indexable objects read at index a.
     * @param C            2x2 table of prefactors.
     * @param I            base pointer; offsets 0..15 are handed to update.
     * @param update       callable invoked as update(double, double*).
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2],
	      double *I, const U &update) {

	const double &xkl = rkl[0];
	const double &ykl = rkl[1];
	const double &zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double root = t2[a];
	    const double weight = W[a];
	    const double b00 = 0.5*root;

	    // cq = rAi[q] - B*rAB[q]*root
	    const double cx = recurrence::coefficient<0>(rAi, B, rAB, root);
	    const double cy = recurrence::coefficient<1>(rAi, B, rAB, root);
	    const double cz = recurrence::coefficient<2>(rAi, B, rAB, root);

	    // kq = rkl[q] + (rBk[q] + A*rAB[q]*root)
	    const double kx = xkl + recurrence::coefficient<0>(rBk, -A, rAB, root);
	    const double ky = ykl + recurrence::coefficient<1>(rBk, -A, rAB, root);
	    const double kz = recurrence::coefficient<2>(rBk, -A, rAB, root) + zkl;

	    // Prefactor times weight, shared by the terms below.
	    const double w01 = C[0][1]*weight;
	    const double w10 = C[1][0]*weight;
	    const double w11 = C[1][1]*weight;

	    update(C[0][0]*weight, I + 0);
	    update(w01*cx, I + 1);
	    update(w01*cy, I + 2);
	    update(w01*cz, I + 3);

	    update(w11*(b00 + cx*kx), I + 4);
	    update(w11*(cy*kx), I + 5);
	    update(w11*(cz*kx), I + 6);
	    update(w10*kx, I + 7);

	    update(w11*(b00 + cy*ky), I + 8);
	    update(w11*(cz*ky), I + 9);
	    update(w11*(cx*ky), I + 10);
	    update(w10*ky, I + 11);

	    update(w11*(b00 + cz*kz), I + 12);
	    update(w11*(cx*kz), I + 13);
	    update(w11*(cy*kz), I + 14);
	    update(w10*kz, I + 15);
	}

    }

    /**
     * Permutes the 16 values of I in place: the value found at position k
     * on entry ends up at position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[16]) {
	// Snapshot first so every store reads the original contents.
	double saved[16];
	for (int k = 0; k < 16; ++k) {
	    saved[k] = I[k];
	}
	for (int k = 0; k < 16; ++k) {
	    I[index(k)] = saved[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
//
//
// 	const unsigned short index_[16] = { 0, 1, 2, 3, 7, 4, 5, 6, 11, 10, 8, 9, 15, 13, 14, 12 };
// 	if (index < 16) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the reordered position for emitted position i.
     * No range check is performed; i must lie in [0, 15].
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	// Four entries per row.
	const unsigned short destination[] = {
	    0, 1, 2, 3,
	    5, 6, 7, 4,
	    10, 11, 9, 8,
	    15, 13, 14, 12
	};
	return destination[i];
    }

    /**
     * Writes index(0) .. index(15) to idx[0] .. idx[15].
     *
     * @param idx  output buffer with room for at least 16 elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 16; ++k) {
	    idx[k] = index(k);
	}
    }


};

template<>
struct impl<meta::braket<rysq::P, rysq::P, rysq::SP, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 2;

    /** Default accumulation functor handed to eval(): adds a term into its output slot. */
    struct update {
	/** Adds @a q to the double that @a Q points at. */
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    double &slot = *Q;
	    slot = slot + q;
	}
    };

    /**
     * Fixed-size entry point: forwards to the generic kernel below with
     * M = 2 points and a default-constructed update functor.  That functor
     * adds into I, so I is not cleared by this call.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<2> &t2, const vector<2> &W,
    // 		     const double (&C)[2][1],
	      double (&I)[36]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the generic kernel below, using the default update functor.
     * @tparam M  number of points (loop iterations) to process.
     * @param I   output pointer; the kernel touches I[0] .. I[35].
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Generic kernel.  For every point a in [0, M) it builds the recurrence
     * coefficients from t2[a] and hands 36 terms of the form
     * C[i][0] * W[a] * (polynomial in those coefficients) to
     * update(term, I + k), k = 0 .. 35.
     *
     * @tparam M       number of points (loop iterations).
     * @param A, B     scalars entering the B10, Cq and Dq coefficients.
     * @param rAi,rBk,rAB  indexed with [0..2]; inputs to the Cq / Dq coefficients.
     * @param rij      indexed with [0..2]; read as Xij, Yij, Zij.
     * @param rkl      not read by this kernel.
     * @param t2, W    indexed with [a]; per-point abscissa-like value and factor.
     * @param C        2x1 coefficient table; C[0][0] and C[1][0] are both used.
     * @param I        output pointer, I[0] .. I[35].
     * @param update   callable invoked as update(double, double*).
     *
     * NOTE(review): t2 / W are presumably the squared Rys roots and weights, and
     * the 36 slots presumably the 3*3*4*1 functions of a (P,P|SP,S) quartet in
     * generation order (see reorder()/index()) -- confirm against the caller.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];



	// One iteration per point; the pragmas below are compiler hints only.
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // B00 = t2/2,  B10 = (1 - B*t2) / (2*A)
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    // Cq = rAi[q] - B*rAB[q]*t2
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Dq = rBk[q] + A*rAB[q]*t2  (the helper's B argument is passed as -A)
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // pow(x,n) maps to the compile-time power recurrence::pow<n>(x) until the
	    // matching #undef; no expression in this kernel actually uses it.
#define pow(x,y) recurrence::pow<y>((x))

	    // Sub-expressions shared by several output terms.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    // One update() call per output slot k (second argument is I+k).
	    update((C[1][0])*W[a]*(Dz*(Cx*Ix + B10)), I+0);
	    update((C[0][0])*W[a]*((Cx*Ix + B10)), I+1);
	    update((C[1][0])*W[a]*(Dy*(Cx*Ix + B10)), I+2);
	    update((C[1][0])*W[a]*(Cz*Dy*Ix), I+3);
	    update((C[0][0])*W[a]*(Cz*Ix), I+4);
	    update((C[1][0])*W[a]*(Cy*Dz*Ix), I+5);
	    update((C[0][0])*W[a]*(Cy*Ix), I+6);
	    update((C[1][0])*W[a]*(Dz*(Cy*Iy + B10)), I+7);
	    update((C[1][0])*W[a]*(Dx*(Cy*Iy + B10)), I+8);
	    update((C[0][0])*W[a]*((Cy*Iy + B10)), I+9);
	    update((C[1][0])*W[a]*(Cx*Dz*Iy), I+10);
	    update((C[0][0])*W[a]*(Cx*Iy), I+11);
	    update((C[1][0])*W[a]*(Cz*Dx*Iy), I+12);
	    update((C[0][0])*W[a]*(Cz*Iy), I+13);
	    update((C[1][0])*W[a]*(Dy*(B10 + Cz*Iz)), I+14);
	    update((C[1][0])*W[a]*(Dx*(B10 + Cz*Iz)), I+15);
	    update((C[0][0])*W[a]*((B10 + Cz*Iz)), I+16);
	    update((C[1][0])*W[a]*(Cx*Dy*Iz), I+17);
	    update((C[1][0])*W[a]*(Cy*Dx*Iz), I+18);
	    update((C[0][0])*W[a]*(Cy*Iz), I+19);
	    update((C[0][0])*W[a]*(Cx*Iz), I+20);
	    update((C[1][0])*W[a]*(Iy*Qx), I+21);
	    update((C[1][0])*W[a]*(Iz*Qx), I+22);
	    update((C[1][0])*W[a]*(Ix*Qy), I+23);
	    update((C[1][0])*W[a]*(Iz*Qy), I+24);
	    update((C[1][0])*W[a]*(Ix*Qz), I+25);
	    update((C[1][0])*W[a]*(Iy*Qz), I+26);
	    update((C[1][0])*W[a]*(Cy*(Dx*Xij + Qx)), I+27);
	    update((C[1][0])*W[a]*(Cz*(Dx*Xij + Qx)), I+28);
	    update((C[1][0])*W[a]*((Dx*(Cx*Ix + B10) + B00*(Xij + 2*Cx))), I+29);
	    update((C[1][0])*W[a]*((B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10))), I+30);
	    update((C[1][0])*W[a]*(Cx*(Dy*Yij + Qy)), I+31);
	    update((C[1][0])*W[a]*(Cz*(Dy*Yij + Qy)), I+32);
	    update((C[1][0])*W[a]*((Dz*(B10 + Cz*Iz) + B00*(2*Cz + Zij))), I+33);
	    update((C[1][0])*W[a]*(Cx*(Dz*Zij + Qz)), I+34);
	    update((C[1][0])*W[a]*(Cy*(Dz*Zij + Qz)), I+35);
#undef pow

	}

    }

    /**
     * Permutes the 36 entries of @a I in place: the value found at I[k] on
     * entry ends up at I[destination[k]].  The mapping is the one that
     * index() reports.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[36]) {
	const unsigned short destination[36] = {
	    27, 0, 18, 20, 2, 28, 1, 31, 13, 4, 30, 3,
	    14, 5, 26, 17, 8, 24, 16, 7, 6, 12, 15, 19,
	    25, 29, 32, 10, 11, 9, 22, 21, 23, 35, 33, 34
	};
	// Snapshot first so no source value is overwritten before it is moved.
	double snapshot[36];
	for (int k = 0; k < 36; ++k) {
	    snapshot[k] = I[k];
	}
	for (int k = 0; k < 36; ++k) {
	    I[destination[k]] = snapshot[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[36] = { 1, 6, 4, 11, 9, 13, 20, 19, 16, 29, 27, 28, 21, 8, 12, 22, 18, 15, 2, 23, 3, 31, 30, 32, 17, 24, 14, 0, 5, 25, 10, 7, 26, 34, 35, 33 };
// 	if (index < 36) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns entry @a i of the 36-element permutation applied by reorder().
     * @a i is not range-checked; it must lie in [0, 36).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short destination[36] = {
	    27, 0, 18, 20, 2, 28, 1, 31, 13, 4, 30, 3,
	    14, 5, 26, 17, 8, 24, 16, 7, 6, 12, 15, 19,
	    25, 29, 32, 10, 11, 9, 22, 21, 23, 35, 33, 34
	};
	const unsigned short slot = destination[i];
	return slot;
    }

    /**
     * Writes the 36 permutation entries (the values index(int) returns for
     * 0..35) to idx[0] .. idx[35].  The caller supplies storage for at least
     * 36 elements of @a U.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int destination[36] = {
	    27, 0, 18, 20, 2, 28, 1, 31, 13, 4, 30, 3,
	    14, 5, 26, 17, 8, 24, 16, 7, 6, 12, 15, 19,
	    25, 29, 32, 10, 11, 9, 22, 21, 23, 35, 33, 34
	};
	for (int k = 0; k < 36; ++k) {
	    idx[k] = destination[k];
	}
    }


};

template<>
struct impl<meta::braket<rysq::D, rysq::P, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 2;

    /** Default accumulation functor handed to eval(): adds a term into its output slot. */
    struct update {
	/** Adds @a q to the double that @a Q points at. */
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    double &slot = *Q;
	    slot = slot + q;
	}
    };

    /**
     * Fixed-size entry point: forwards to the generic kernel below with
     * M = 2 points and a default-constructed update functor.  That functor
     * adds into I, so I is not cleared by this call.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<2> &t2, const vector<2> &W,
    // 		     const double (&C)[1][1],
	      double (&I)[18]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the generic kernel below, using the default update functor.
     * @tparam M  number of points (loop iterations) to process.
     * @param I   output pointer; the kernel touches I[0] .. I[17].
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Generic kernel.  For every point a in [0, M) it builds the recurrence
     * coefficients from t2[a] and hands 18 terms of the form
     * C[0][0] * W[a] * (polynomial in those coefficients) to
     * update(term, I + k), k = 0 .. 17.
     *
     * @tparam M       number of points (loop iterations).
     * @param A, B     scalars entering the B10 and Cq coefficients.
     * @param rAi,rAB  indexed with [0..2]; inputs to the Cq coefficients.
     * @param rBk,rkl  not read by this kernel.
     * @param rij      indexed with [0..2]; read as Xij, Yij, Zij.
     * @param t2, W    indexed with [a]; per-point abscissa-like value and factor.
     * @param C        1x1 coefficient table; only C[0][0] exists.
     * @param I        output pointer, I[0] .. I[17].
     * @param update   callable invoked as update(double, double*).
     *
     * NOTE(review): t2 / W are presumably the squared Rys roots and weights, and
     * the 18 slots presumably the 6*3 functions of a (D,P|S,S) quartet in
     * generation order (see reorder()/index()) -- confirm against the caller.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];



	// One iteration per point; the pragmas below are compiler hints only.
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // B10 = (1 - B*t2) / (2*A)
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    // Cq = rAi[q] - B*rAB[q]*t2
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);


	    // pow(x,n) maps to the compile-time power recurrence::pow<n>(x) until the
	    // matching #undef below.
#define pow(x,y) recurrence::pow<y>((x))

	    // Sub-expressions shared by several output terms.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));

	    // One update() call per output slot k (second argument is I+k).
	    update((C[0][0])*W[a]*(Cy*Cz*Ix), I+0);
	    update((C[0][0])*W[a]*(Cx*Cz*Iy), I+1);
	    update((C[0][0])*W[a]*(Cx*Cy*Iz), I+2);
	    update((C[0][0])*W[a]*(Iy*Px), I+3);
	    update((C[0][0])*W[a]*(Iz*Px), I+4);
	    update((C[0][0])*W[a]*(Ix*Py), I+5);
	    update((C[0][0])*W[a]*(Iz*Py), I+6);
	    update((C[0][0])*W[a]*(Ix*Pz), I+7);
	    update((C[0][0])*W[a]*(Iy*Pz), I+8);
	    update((C[0][0])*W[a]*((B10*(3*Cx + Xij) + Ix*pow(Cx,2))), I+9);
	    update((C[0][0])*W[a]*(Cz*(Px + Cx*Xij)), I+10);
	    update((C[0][0])*W[a]*(Cy*(Px + Cx*Xij)), I+11);
	    update((C[0][0])*W[a]*(Cx*(Py + Cy*Yij)), I+12);
	    update((C[0][0])*W[a]*(Cz*(Py + Cy*Yij)), I+13);
	    update((C[0][0])*W[a]*((B10*(3*Cy + Yij) + Iy*pow(Cy,2))), I+14);
	    update((C[0][0])*W[a]*((Iz*pow(Cz,2) + B10*(3*Cz + Zij))), I+15);
	    double f2 = (B10 + Cz*Iz);
	    update((C[0][0])*W[a]*(Cx*f2), I+16);
	    update((C[0][0])*W[a]*(Cy*f2), I+17);
#undef pow

	}

    }

    /**
     * Permutes the 18 entries of @a I in place: the value found at I[k] on
     * entry ends up at I[destination[k]].  The mapping is the one that
     * index() reports.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[18]) {
	const unsigned short destination[18] = {
	    5, 10, 15, 6, 12, 1, 13, 2, 8,
	    0, 4, 3, 9, 11, 7, 14, 16, 17
	};
	// Snapshot first so no source value is overwritten before it is moved.
	double snapshot[18];
	for (int k = 0; k < 18; ++k) {
	    snapshot[k] = I[k];
	}
	for (int k = 0; k < 18; ++k) {
	    I[destination[k]] = snapshot[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[18] = { 9, 5, 7, 11, 10, 0, 3, 14, 8, 12, 1, 13, 4, 6, 15, 2, 16, 17 };
// 	if (index < 18) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns entry @a i of the 18-element permutation applied by reorder().
     * @a i is not range-checked; it must lie in [0, 18).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short destination[18] = {
	    5, 10, 15, 6, 12, 1, 13, 2, 8,
	    0, 4, 3, 9, 11, 7, 14, 16, 17
	};
	const unsigned short slot = destination[i];
	return slot;
    }

    /**
     * Writes the 18 permutation entries (the values index(int) returns for
     * 0..17) to idx[0] .. idx[17].  The caller supplies storage for at least
     * 18 elements of @a U.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int destination[18] = {
	    5, 10, 15, 6, 12, 1, 13, 2, 8,
	    0, 4, 3, 9, 11, 7, 14, 16, 17
	};
	for (int k = 0; k < 18; ++k) {
	    idx[k] = destination[k];
	}
    }


};

template<>
struct impl<meta::braket<rysq::D, rysq::S, rysq::SP, rysq::P> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /** Default accumulation functor handed to eval(): adds a term into its output slot. */
    struct update {
	/** Adds @a q to the double that @a Q points at. */
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    double &slot = *Q;
	    slot = slot + q;
	}
    };

    /**
     * Fixed-size entry point: forwards to the generic kernel below with
     * M = 3 points and a default-constructed update functor.  That functor
     * adds into I, so I is not cleared by this call.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<3> &t2, const vector<3> &W,
    // 		     const double (&C)[2][1],
	      double (&I)[72]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the generic kernel below, using the default update functor.
     * @tparam M  number of points (loop iterations) to process.
     * @param I   output pointer; the kernel touches I[0] .. I[71].
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Generic kernel.  For every point a in [0, M) it builds the recurrence
     * coefficients from t2[a] and hands 72 terms of the form
     * C[i][0] * W[a] * (polynomial in those coefficients) to
     * update(term, I + k), k = 0 .. 71.
     *
     * @tparam M       number of points (loop iterations).
     * @param A, B     scalars entering the B10, B01, Cq and Dq coefficients.
     * @param rAi,rBk,rAB  indexed with [0..2]; inputs to the Cq / Dq coefficients.
     * @param rij      not read by this kernel.
     * @param rkl      indexed with [0..2]; read as Xkl, Ykl, Zkl.
     * @param t2, W    indexed with [a]; per-point abscissa-like value and factor.
     * @param C        2x1 coefficient table; C[0][0] and C[1][0] are both used.
     * @param I        output pointer, I[0] .. I[71].
     * @param update   callable invoked as update(double, double*).
     *
     * NOTE(review): t2 / W are presumably the squared Rys roots and weights, and
     * the 72 slots presumably the 6*1*4*3 functions of a (D,S|SP,P) quartet in
     * generation order (see reorder()/index()) -- confirm against the caller.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {


	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


	// One iteration per point; the pragmas below are compiler hints only.
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // B00 = t2/2,  B10 = (1 - B*t2) / (2*A),  B01 = (1 - A*t2) / (2*B)
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // Cq = rAi[q] - B*rAB[q]*t2
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Dq = rBk[q] + A*rAB[q]*t2  (the helper's B argument is passed as -A)
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // pow(x,n) maps to the compile-time power recurrence::pow<n>(x) until the
	    // matching #undef below.
#define pow(x,y) recurrence::pow<y>((x))

	    // Sub-expressions shared by several output terms.
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    // One update() call per output slot k (second argument is I+k); the
	    // fNN temporaries are further shared factors introduced where first needed.
	    update((C[0][0])*W[a]*(Cy*Cz*Kx), I+0);
	    update((C[0][0])*W[a]*(Cx*Cz*Ky), I+1);
	    update((C[0][0])*W[a]*(Cx*Cy*Kz), I+2);
	    update((C[1][0])*W[a]*(Dy*Kz*Px), I+3);
	    update((C[0][0])*W[a]*(Kz*Px), I+4);
	    update((C[1][0])*W[a]*(Dz*Ky*Px), I+5);
	    update((C[0][0])*W[a]*(Ky*Px), I+6);
	    update((C[1][0])*W[a]*(Dx*Kz*Py), I+7);
	    update((C[0][0])*W[a]*(Kz*Py), I+8);
	    update((C[1][0])*W[a]*(Dz*Kx*Py), I+9);
	    update((C[0][0])*W[a]*(Kx*Py), I+10);
	    update((C[1][0])*W[a]*(Dy*Kx*Pz), I+11);
	    update((C[0][0])*W[a]*(Kx*Pz), I+12);
	    update((C[1][0])*W[a]*(Dx*Ky*Pz), I+13);
	    update((C[0][0])*W[a]*(Ky*Pz), I+14);
	    update((C[1][0])*W[a]*(Cy*Kz*Qx), I+15);
	    update((C[1][0])*W[a]*(Cz*Ky*Qx), I+16);
	    update((C[1][0])*W[a]*(Cx*Kz*Qy), I+17);
	    update((C[1][0])*W[a]*(Cz*Kx*Qy), I+18);
	    update((C[1][0])*W[a]*(Cx*Ky*Qz), I+19);
	    update((C[1][0])*W[a]*(Cy*Kx*Qz), I+20);
	    update((C[1][0])*W[a]*(Qy*(Cz*Zkl + Qz)), I+21);
	    update((C[1][0])*W[a]*(Cy*Dx*(Cz*Zkl + Qz)), I+22);
	    update((C[0][0])*W[a]*(Cy*(Cz*Zkl + Qz)), I+23);
	    update((C[1][0])*W[a]*(Cx*Dy*(Cz*Zkl + Qz)), I+24);
	    update((C[0][0])*W[a]*(Cx*(Cz*Zkl + Qz)), I+25);
	    update((C[1][0])*W[a]*(Qx*(Cz*Zkl + Qz)), I+26);
	    double f0 = (Dy*Py + 2*B00*Cy);
	    update((C[1][0])*W[a]*(Dx*(f0 + Py*Ykl)), I+27);
	    update((C[1][0])*W[a]*(Dz*(f0 + Py*Ykl)), I+28);
	    update((C[0][0])*W[a]*((f0 + Py*Ykl)), I+29);
	    update((C[1][0])*W[a]*(Kx*f0), I+30);
	    update((C[1][0])*W[a]*(Kz*f0), I+31);
	    double f1 = (Dz*Pz + 2*B00*Cz);
	    update((C[1][0])*W[a]*(Dy*(Pz*Zkl + f1)), I+32);
	    update((C[1][0])*W[a]*(Dx*(Pz*Zkl + f1)), I+33);
	    update((C[0][0])*W[a]*((Pz*Zkl + f1)), I+34);
	    update((C[1][0])*W[a]*(Kx*f1), I+35);
	    update((C[1][0])*W[a]*(Ky*f1), I+36);
	    double f13 = (Dx*Px + 2*B00*Cx);
	    update((C[1][0])*W[a]*(Ky*f13), I+37);
	    update((C[1][0])*W[a]*(Kz*f13), I+38);
	    double f15 = (Dz*Kz + B01);
	    update((C[1][0])*W[a]*(Cx*Cy*f15), I+39);
	    update((C[1][0])*W[a]*(Px*f15), I+40);
	    update((C[1][0])*W[a]*(Py*f15), I+41);
	    double f16 = (B00 + Cx*Kx);
	    update((C[1][0])*W[a]*(Qy*f16), I+42);
	    update((C[1][0])*W[a]*(Qz*f16), I+43);
	    update((C[1][0])*W[a]*(Cy*Dz*f16), I+44);
	    update((C[0][0])*W[a]*(Cy*f16), I+45);
	    update((C[1][0])*W[a]*(Cz*Dy*f16), I+46);
	    update((C[0][0])*W[a]*(Cz*f16), I+47);
	    double f17 = (B00 + Cy*Ky);
	    update((C[1][0])*W[a]*(Qx*f17), I+48);
	    update((C[1][0])*W[a]*(Qz*f17), I+49);
	    update((C[1][0])*W[a]*(Cz*Dx*f17), I+50);
	    update((C[0][0])*W[a]*(Cz*f17), I+51);
	    update((C[1][0])*W[a]*(Cx*Dz*f17), I+52);
	    update((C[0][0])*W[a]*(Cx*f17), I+53);
	    double f10 = B01*B10;
	    double f19 = 2*pow(B00,2);
	    update((C[1][0])*W[a]*((B01*pow(Cx,2) + f10 + f19 + 2*B00*Cx*(Xkl + 2*Dx) + Dx*Kx*Px)), I+54);
	    update((C[1][0])*W[a]*((f10 + f19 + B01*pow(Cz,2) + Dz*Kz*Pz + 2*B00*Cz*(2*Dz + Zkl))), I+55);
	    update((C[1][0])*W[a]*((f10 + f19 + Dy*Ky*Py + B01*pow(Cy,2) + 2*B00*Cy*(Ykl + 2*Dy))), I+56);
	    double f2 = (Cy*Dy*Ky + B01*Cy + B00*(Ykl + 2*Dy));
	    update((C[1][0])*W[a]*(Cx*f2), I+57);
	    update((C[1][0])*W[a]*(Cz*f2), I+58);
	    double f21 = (B01 + Dy*Ky);
	    update((C[1][0])*W[a]*(Cx*Cz*f21), I+59);
	    update((C[1][0])*W[a]*(Px*f21), I+60);
	    update((C[1][0])*W[a]*(Pz*f21), I+61);
	    double f4 = (B01*Cx + Cx*Dx*Kx + B00*(Xkl + 2*Dx));
	    update((C[1][0])*W[a]*(Cz*f4), I+62);
	    update((C[1][0])*W[a]*(Cy*f4), I+63);
	    double f6 = (B01*Cz + B00*(2*Dz + Zkl) + Cz*Dz*Kz);
	    update((C[1][0])*W[a]*(Cx*f6), I+64);
	    update((C[1][0])*W[a]*(Cy*f6), I+65);
	    double f7 = (B01 + Dx*Kx);
	    update((C[1][0])*W[a]*(Cy*Cz*f7), I+66);
	    update((C[1][0])*W[a]*(Pz*f7), I+67);
	    update((C[1][0])*W[a]*(Py*f7), I+68);
	    double f9 = (Kx*Px + 2*B00*Cx);
	    update((C[1][0])*W[a]*(Dz*f9), I+69);
	    update((C[1][0])*W[a]*(Dy*f9), I+70);
	    update((C[0][0])*W[a]*(f9), I+71);
#undef pow

	}

    }

    /**
     * Permutes the 72 entries of @a I in place: the value found at I[k] on
     * entry ends up at I[destination[k]].  The mapping is the one that
     * index() reports.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[72]) {
	const unsigned short destination[72] = {
	    5, 28, 51, 60, 48, 42, 24, 55, 49, 19, 1, 14,
	    2, 32, 26, 57, 34, 63, 17, 46, 23, 65, 59, 53,
	    64, 52, 58, 31, 43, 25, 13, 61, 62, 56, 50, 20,
	    44, 30, 54, 69, 66, 67, 15, 22, 21, 3, 16, 4,
	    33, 47, 35, 29, 45, 27, 6, 68, 37, 39, 41, 40,
	    36, 38, 10, 9, 70, 71, 11, 8, 7, 18, 12, 0
	};
	// Snapshot first so no source value is overwritten before it is moved.
	double snapshot[72];
	for (int k = 0; k < 72; ++k) {
	    snapshot[k] = I[k];
	}
	for (int k = 0; k < 72; ++k) {
	    I[destination[k]] = snapshot[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[72] = { 71, 10, 12, 45, 47, 0, 54, 68, 67, 63, 62, 66, 70, 30, 11, 42, 46, 18, 69, 9, 35, 44, 43, 20, 6, 29, 14, 53, 1, 51, 37, 27, 13, 48, 16, 50, 60, 56, 61, 57, 59, 58, 5, 28, 36, 52, 19, 49, 4, 8, 34, 2, 25, 23, 38, 7, 33, 15, 26, 22, 3, 31, 32, 17, 24, 21, 40, 41, 55, 39, 64, 65 };
// 	if (index < 72) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns entry @a i of the 72-element permutation applied by reorder().
     * @a i is not range-checked; it must lie in [0, 72).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short destination[72] = {
	    5, 28, 51, 60, 48, 42, 24, 55, 49, 19, 1, 14,
	    2, 32, 26, 57, 34, 63, 17, 46, 23, 65, 59, 53,
	    64, 52, 58, 31, 43, 25, 13, 61, 62, 56, 50, 20,
	    44, 30, 54, 69, 66, 67, 15, 22, 21, 3, 16, 4,
	    33, 47, 35, 29, 45, 27, 6, 68, 37, 39, 41, 40,
	    36, 38, 10, 9, 70, 71, 11, 8, 7, 18, 12, 0
	};
	const unsigned short slot = destination[i];
	return slot;
    }

    /**
     * Writes the 72 permutation entries (the values index(int) returns for
     * 0..71) to idx[0] .. idx[71].  The caller supplies storage for at least
     * 72 elements of @a U.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int destination[72] = {
	    5, 28, 51, 60, 48, 42, 24, 55, 49, 19, 1, 14,
	    2, 32, 26, 57, 34, 63, 17, 46, 23, 65, 59, 53,
	    64, 52, 58, 31, 43, 25, 13, 61, 62, 56, 50, 20,
	    44, 30, 54, 69, 66, 67, 15, 22, 21, 3, 16, 4,
	    33, 47, 35, 29, 45, 27, 6, 68, 37, 39, 41, 40,
	    36, 38, 10, 9, 70, 71, 11, 8, 7, 18, 12, 0
	};
	for (int k = 0; k < 72; ++k) {
	    idx[k] = destination[k];
	}
    }


};

template<>
struct impl<meta::braket<rysq::S, rysq::P, rysq::S, rysq::F> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /** Default accumulation functor handed to eval(): adds a term into its output slot. */
    struct update {
	/** Adds @a q to the double that @a Q points at. */
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    double &slot = *Q;
	    slot = slot + q;
	}
    };

    /**
     * Fixed-size entry point: forwards to the generic kernel below with
     * M = 3 points and a default-constructed update functor.  That functor
     * adds into I, so I is not cleared by this call.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<3> &t2, const vector<3> &W,
    // 		     const double (&C)[1][1],
	      double (&I)[30]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the generic kernel below, using the default update functor.
     * @tparam M  number of points (loop iterations) to process.
     * @param I   output pointer; the kernel touches I[0] .. I[29].
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Generic kernel.  For every point a in [0, M) it builds the recurrence
     * coefficients from t2[a] and hands 30 terms of the form
     * C[0][0] * W[a] * (polynomial in those coefficients) to
     * update(term, I + k), k = 0 .. 29.
     *
     * @tparam M       number of points (loop iterations).
     * @param A, B     scalars entering the B01, Cq and Dq coefficients.
     * @param rAi,rBk,rAB  indexed with [0..2]; inputs to the Cq / Dq coefficients.
     * @param rij      indexed with [0..2]; read as Xij, Yij, Zij.
     * @param rkl      indexed with [0..2]; read as Xkl, Ykl, Zkl.
     * @param t2, W    indexed with [a]; per-point abscissa-like value and factor.
     * @param C        1x1 coefficient table; only C[0][0] exists.
     * @param I        output pointer, I[0] .. I[29].
     * @param update   callable invoked as update(double, double*).
     *
     * NOTE(review): t2 / W are presumably the squared Rys roots and weights, and
     * the 30 slots presumably the 3*10 functions of a (S,P|S,F) quartet in
     * generation order (see reorder()/index()) -- confirm against the caller.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


	// One iteration per point; the pragmas below are compiler hints only.
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // B00 = t2/2,  B01 = (1 - A*t2) / (2*B)
	    double B00 = 0.5*t2[a];
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // Cq = rAi[q] - B*rAB[q]*t2
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Dq = rBk[q] + A*rAB[q]*t2  (the helper's B argument is passed as -A)
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // pow(x,n) maps to the compile-time power recurrence::pow<n>(x) until the
	    // matching #undef below.
#define pow(x,y) recurrence::pow<y>((x))

	    // Sub-expressions shared by several output terms.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);

	    // One update() call per output slot k (second argument is I+k); the
	    // fNN temporaries are further shared factors introduced where first needed.
	    double f1 = (Iz*(pow(Kz,2) + B01) + 2*B00*Kz);
	    update((C[0][0])*W[a]*(Kx*f1), I+0);
	    update((C[0][0])*W[a]*(Ky*f1), I+1);
	    double f10 = (pow(Kz,2) + B01);
	    update((C[0][0])*W[a]*(Iy*Kx*f10), I+2);
	    update((C[0][0])*W[a]*(Ix*Ky*f10), I+3);
	    double f11 = (2*B00*Ky + Iy*(pow(Ky,2) + B01));
	    update((C[0][0])*W[a]*(Kx*f11), I+4);
	    update((C[0][0])*W[a]*(Kz*f11), I+5);
	    double f14 = (pow(Kx,2) + B01);
	    update((C[0][0])*W[a]*(Iy*Kz*f14), I+6);
	    update((C[0][0])*W[a]*(Iz*Ky*f14), I+7);
	    double f15 = (pow(Kz,2) + 3*B01);
	    update((C[0][0])*W[a]*(Iy*Kz*f15), I+8);
	    update((C[0][0])*W[a]*(Ix*Kz*f15), I+9);
	    double f16 = (B00 + Iy*Ky);
	    update((C[0][0])*W[a]*(Kx*Kz*f16), I+10);
	    update((C[0][0])*W[a]*(f14*f16), I+11);
	    update((C[0][0])*W[a]*(f10*f16), I+12);
	    double f17 = (3*B01 + pow(Kx,2));
	    update((C[0][0])*W[a]*(Iz*Kx*f17), I+13);
	    update((C[0][0])*W[a]*(Iy*Kx*f17), I+14);
	    double f18 = (Ix*(pow(Kx,2) + B01) + 2*B00*Kx);
	    update((C[0][0])*W[a]*(Kz*f18), I+15);
	    update((C[0][0])*W[a]*(Ky*f18), I+16);
	    double f4 = (B00 + Ix*Kx);
	    update((C[0][0])*W[a]*(Ky*Kz*f4), I+17);
	    update((C[0][0])*W[a]*(f10*f4), I+18);
	    double f5 = (pow(Ky,2) + B01);
	    update((C[0][0])*W[a]*(Ix*Kz*f5), I+19);
	    update((C[0][0])*W[a]*(Iz*Kx*f5), I+20);
	    update((C[0][0])*W[a]*(f4*f5), I+21);
	    double f6 = (pow(Ky,2) + 3*B01);
	    update((C[0][0])*W[a]*(Ix*Ky*f6), I+22);
	    update((C[0][0])*W[a]*(Iz*Ky*f6), I+23);
	    double f7 = 3*B00*B01;
	    update((C[0][0])*W[a]*((f7 + Kx*(Ix*(3*B01 + pow(Kx,2)) + 3*B00*Kx))), I+24);
	    update((C[0][0])*W[a]*((f7 + Ky*(3*B00*Ky + Iy*(pow(Ky,2) + 3*B01)))), I+25);
	    update((C[0][0])*W[a]*((f7 + Kz*(3*B00*Kz + Iz*(pow(Kz,2) + 3*B01)))), I+26);
	    double f8 = (Iz*Kz + B00);
	    update((C[0][0])*W[a]*(Kx*Ky*f8), I+27);
	    update((C[0][0])*W[a]*(f14*f8), I+28);
	    update((C[0][0])*W[a]*(f5*f8), I+29);
#undef pow

	}

    }

    /**
     * Permutes the 30 entries of @a I in place: the value found at I[k] on
     * entry ends up at I[destination[k]].  The mapping is the one that
     * index() reports.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[30]) {
	const unsigned short destination[30] = {
	    23, 26, 22, 24, 16, 19, 13, 11, 7, 6,
	    28, 10, 25, 2, 1, 12, 9, 27, 21, 18,
	    17, 15, 3, 5, 0, 4, 8, 29, 14, 20
	};
	// Snapshot first so no source value is overwritten before it is moved.
	double snapshot[30];
	for (int k = 0; k < 30; ++k) {
	    snapshot[k] = I[k];
	}
	for (int k = 0; k < 30; ++k) {
	    I[destination[k]] = snapshot[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[30] = { 24, 14, 13, 22, 25, 23, 9, 8, 26, 16, 11, 7, 15, 6, 28, 21, 4, 20, 19, 5, 29, 18, 2, 0, 3, 12, 1, 17, 10, 27 };
// 	if (index < 30) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns entry @a i of the 30-element permutation applied by reorder().
     * @a i is not range-checked; it must lie in [0, 30).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short destination[30] = {
	    23, 26, 22, 24, 16, 19, 13, 11, 7, 6,
	    28, 10, 25, 2, 1, 12, 9, 27, 21, 18,
	    17, 15, 3, 5, 0, 4, 8, 29, 14, 20
	};
	const unsigned short slot = destination[i];
	return slot;
    }

    /**
     * Writes the 30 permutation entries (the values index(int) returns for
     * 0..29) to idx[0] .. idx[29].  The caller supplies storage for at least
     * 30 elements of @a U.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int destination[30] = {
	    23, 26, 22, 24, 16, 19, 13, 11, 7, 6,
	    28, 10, 25, 2, 1, 12, 9, 27, 21, 18,
	    17, 15, 3, 5, 0, 4, 8, 29, 14, 20
	};
	for (int k = 0; k < 30; ++k) {
	    idx[k] = destination[k];
	}
    }


};

template<>
struct impl<meta::braket<rysq::SP, rysq::P, rysq::S, rysq::P> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 2;

    /** Default accumulation functor handed to eval(): adds a term into its output slot. */
    struct update {
	/** Adds @a q to the double that @a Q points at. */
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    double &slot = *Q;
	    slot = slot + q;
	}
    };

    /**
     * Fixed-size entry point: forwards to the generic kernel below with
     * M = 2 points and a default-constructed update functor.  That functor
     * adds into I, so I is not cleared by this call.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<2> &t2, const vector<2> &W,
    // 		     const double (&C)[1][2],
	      double (&I)[36]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the generic kernel below, using the default update functor.
     * @tparam M  number of points (loop iterations) to process.
     * @param I   output pointer; the kernel touches I[0] .. I[35].
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Generic kernel.  For every point a in [0, M) it builds the recurrence
     * coefficients from t2[a] and hands 36 terms of the form
     * C[0][j] * W[a] * (polynomial in those coefficients) to
     * update(term, I + k), k = 0 .. 35.
     *
     * @tparam M       number of points (loop iterations).
     * @param A, B     scalars entering the B10, Cq and Dq coefficients.
     * @param rAi,rBk,rAB  indexed with [0..2]; inputs to the Cq / Dq coefficients.
     * @param rij      indexed with [0..2]; read as Xij, Yij, Zij.
     * @param rkl      indexed with [0..2]; read as Xkl, Ykl, Zkl.
     * @param t2, W    indexed with [a]; per-point abscissa-like value and factor.
     * @param C        1x2 coefficient table; C[0][0] and C[0][1] are both used.
     * @param I        output pointer, I[0] .. I[35].
     * @param update   callable invoked as update(double, double*).
     *
     * NOTE(review): t2 / W are presumably the squared Rys roots and weights, and
     * the 36 slots presumably the 4*3*1*3 functions of a (SP,P|S,P) quartet in
     * generation order (see reorder()/index()) -- confirm against the caller.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


	// One iteration per point; the pragmas below are compiler hints only.
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // B00 = t2/2,  B10 = (1 - B*t2) / (2*A)
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    // Cq = rAi[q] - B*rAB[q]*t2
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Dq = rBk[q] + A*rAB[q]*t2  (the helper's B argument is passed as -A)
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // pow(x,n) maps to the compile-time power recurrence::pow<n>(x) until the
	    // matching #undef; no expression in this kernel actually uses it.
#define pow(x,y) recurrence::pow<y>((x))

	    // Sub-expressions shared by several output terms.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);

	    // One update() call per output slot k (second argument is I+k); the
	    // fNN temporaries are further shared factors introduced where first needed.
	    update((C[0][1])*W[a]*(Cy*Iz*Kx), I+0);
	    update((C[0][0])*W[a]*(Iz*Kx), I+1);
	    update((C[0][1])*W[a]*(Cz*Iy*Kx), I+2);
	    update((C[0][0])*W[a]*(Iy*Kx), I+3);
	    update((C[0][1])*W[a]*(Cz*Ix*Ky), I+4);
	    update((C[0][1])*W[a]*(Cx*Iz*Ky), I+5);
	    update((C[0][0])*W[a]*(Iz*Ky), I+6);
	    update((C[0][0])*W[a]*(Ix*Ky), I+7);
	    update((C[0][1])*W[a]*(Ix*(B00 + Cz*Kz)), I+8);
	    update((C[0][1])*W[a]*(Iy*(B00 + Cz*Kz)), I+9);
	    update((C[0][1])*W[a]*(Cx*Iy*Kz), I+10);
	    update((C[0][1])*W[a]*(Cy*Ix*Kz), I+11);
	    update((C[0][0])*W[a]*(Ix*Kz), I+12);
	    update((C[0][0])*W[a]*(Iy*Kz), I+13);
	    update((C[0][1])*W[a]*((Kx*(Cx*Ix + B10) + B00*(Xij + 2*Cx))), I+14);
	    update((C[0][1])*W[a]*((B00*(Yij + 2*Cy) + Ky*(Cy*Iy + B10))), I+15);
	    update((C[0][1])*W[a]*((Kz*(B10 + Cz*Iz) + B00*(2*Cz + Zij))), I+16);
	    double f10 = (B00 + Cy*Ky);
	    update((C[0][1])*W[a]*(Ix*f10), I+17);
	    update((C[0][1])*W[a]*(Iz*f10), I+18);
	    double f11 = (Cx*Ix + B10);
	    update((C[0][1])*W[a]*(Ky*f11), I+19);
	    update((C[0][1])*W[a]*(Kz*f11), I+20);
	    double f13 = (B10 + Cz*Iz);
	    update((C[0][1])*W[a]*(Kx*f13), I+21);
	    update((C[0][1])*W[a]*(Ky*f13), I+22);
	    double f14 = (Cy*Iy + B10);
	    update((C[0][1])*W[a]*(Kx*f14), I+23);
	    update((C[0][1])*W[a]*(Kz*f14), I+24);
	    double f3 = (B00 + Ix*Kx);
	    update((C[0][1])*W[a]*(Cz*f3), I+25);
	    update((C[0][1])*W[a]*(Cy*f3), I+26);
	    update((C[0][0])*W[a]*(f3), I+27);
	    double f4 = (Iz*Kz + B00);
	    update((C[0][1])*W[a]*(Cx*f4), I+28);
	    update((C[0][1])*W[a]*(Cy*f4), I+29);
	    update((C[0][0])*W[a]*(f4), I+30);
	    double f7 = (B00 + Iy*Ky);
	    update((C[0][1])*W[a]*(Cz*f7), I+31);
	    update((C[0][1])*W[a]*(Cx*f7), I+32);
	    update((C[0][0])*W[a]*(f7), I+33);
	    double f9 = (B00 + Cx*Kx);
	    update((C[0][1])*W[a]*(Iz*f9), I+34);
	    update((C[0][1])*W[a]*(Iy*f9), I+35);
#undef pow

	}

    }

    /**
     * Permutes the 36 entries of I in place: the value originally stored at
     * position k ends up at position dest[k].
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[36]) {
	// dest[k] is the slot that receives the original I[k].
	const unsigned short dest[36] = {
	    10,  8,  7,  4, 15, 21, 20, 12, 27, 31, 29, 26,
	    24, 28,  1, 18, 35, 14, 22, 13, 25, 11, 23,  6,
	    30,  3,  2,  0, 33, 34, 32, 19, 17, 16,  9,  5
	};
	// Snapshot first so the scatter below never reads an overwritten slot.
	double T[36];
	for (int k = 0; k < 36; ++k) {
	    T[k] = I[k];
	}
	for (int k = 0; k < 36; ++k) {
	    I[dest[k]] = T[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[36] = { 27, 14, 26, 25, 3, 35, 23, 2, 1, 34, 0, 21, 7, 19, 17, 4, 33, 32, 15, 31, 6, 5, 18, 22, 12, 20, 11, 8, 13, 10, 24, 9, 30, 28, 29, 16 };
// 	if (index < 36) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the destination slot associated with source position i
     * (the same permutation that reorder() applies).
     * No range check is performed: i must lie in [0, 36).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[36] = {
	    10,  8,  7,  4, 15, 21, 20, 12, 27, 31, 29, 26,
	    24, 28,  1, 18, 35, 14, 22, 13, 25, 11, 23,  6,
	    30,  3,  2,  0, 33, 34, 32, 19, 17, 16,  9,  5
	};
	return slot[i];
    }

    /**
     * Writes the full 36-entry permutation table to idx[0..35];
     * idx must point to storage for at least 36 elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int slot[36] = {
	    10,  8,  7,  4, 15, 21, 20, 12, 27, 31, 29, 26,
	    24, 28,  1, 18, 35, 14, 22, 13, 25, 11, 23,  6,
	    30,  3,  2,  0, 33, 34, 32, 19, 17, 16,  9,  5
	};
	for (int k = 0; k < 36; ++k) {
	    idx[k] = slot[k];
	}
    }


};

/**
 * Kernel specialization for meta::braket<S, P, SP, S>.
 *
 * eval() adds 12 weighted terms per (t2, W) entry to an output buffer;
 * index()/reorder() describe and apply a permutation of those 12 slots.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::P, rysq::SP, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 2;

    /** Default accumulator: adds q to the value Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /**
     * Fixed-size entry point: processes N (= 2) entries of t2/W and
     * accumulates into I with the default update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double (&I)[12]) {
	double *out = I;
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, out, update());
    }

    /** Same as the core kernel below, using the default update functor. */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel.  For every a in [0, M) it builds the recurrence
     * coefficients from t2[a] and hands 12 terms to update(term, I + k).
     * Terms k = 3, 7, 11 are scaled by C[0][0]*W[a], all others by
     * C[1][0]*W[a].  rkl is not read by this specialization.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double root = t2[a];
	    // Per-point weights for the two rows of C.
	    const double w0 = C[0][0]*W[a];
	    const double w1 = C[1][0]*W[a];

	    const double B00 = 0.5*root;

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, root);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, root);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, root);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, root);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, root);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, root);

	    const double Ix = Cx + rij[0];
	    const double Iy = Cy + rij[1];
	    const double Iz = Cz + rij[2];

	    update(w1*(Dx*Ix + B00), I + 0);
	    update(w1*(Dz*Ix), I + 1);
	    update(w1*(Dy*Ix), I + 2);
	    update(w0*(Ix), I + 3);
	    update(w1*(Dy*Iy + B00), I + 4);
	    update(w1*(Dz*Iy), I + 5);
	    update(w1*(Dx*Iy), I + 6);
	    update(w0*(Iy), I + 7);
	    update(w1*(B00 + Dz*Iz), I + 8);
	    update(w1*(Dx*Iz), I + 9);
	    update(w1*(Dy*Iz), I + 10);
	    update(w0*(Iz), I + 11);

	}

    }

    /**
     * Permutes I in place: the value originally at position k is moved to
     * position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[12]) {
	double T[12];
	for (int k = 0; k < 12; ++k) {
	    T[k] = I[k];
	}
	for (int k = 0; k < 12; ++k) {
	    I[index(k)] = T[k];
	}
    }

    /** Destination slot for source position i; i must lie in [0, 12). */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[12] = {
	    3, 9, 6, 0, 7, 10, 4, 1, 11, 5, 8, 2
	};
	return slot[i];
    }

    /** Writes the 12-entry permutation table to idx[0..11]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 12; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Kernel specialization for meta::braket<S, F, S, S>.
 *
 * eval() adds 10 weighted terms per (t2, W) entry to an output buffer;
 * index()/reorder() describe and apply a permutation of those 10 slots.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::F, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 2;

    /** Default accumulator: adds q to the value Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /**
     * Fixed-size entry point: processes N (= 2) entries of t2/W and
     * accumulates into I with the default update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[10]) {
	double *out = I;
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, out, update());
    }

    /** Same as the core kernel below, using the default update functor. */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel.  For every a in [0, M) it builds the recurrence
     * coefficients from t2[a] and hands 10 terms, each scaled by
     * C[0][0]*W[a], to update(term, I + k).  1.0/A is evaluated; rBk and
     * rkl are not read by this specialization.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double root = t2[a];
	    const double w = C[0][0]*W[a];

	    const double B10 = recurrence::coefficient(1.0/A, B, root);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, root);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, root);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, root);

	    const double Ix = Cx + rij[0];
	    const double Iy = Cy + rij[1];
	    const double Iz = Cz + rij[2];

	    // Squares are formed once and shared by the terms below.
	    const double Ix2 = recurrence::pow<2>(Ix);
	    const double Iy2 = recurrence::pow<2>(Iy);
	    const double Iz2 = recurrence::pow<2>(Iz);

	    update(w*(Ix*(3*B10 + Ix2)), I + 0);
	    update(w*(Iy*(3*B10 + Iy2)), I + 1);
	    update(w*(Iz*(3*B10 + Iz2)), I + 2);
	    update(w*(Ix*Iy*Iz), I + 3);

	    const double Px = B10 + Ix2;
	    update(w*(Iy*Px), I + 4);
	    update(w*(Iz*Px), I + 5);

	    const double Py = B10 + Iy2;
	    update(w*(Iz*Py), I + 6);
	    update(w*(Ix*Py), I + 7);

	    const double Pz = B10 + Iz2;
	    update(w*(Ix*Pz), I + 8);
	    update(w*(Iy*Pz), I + 9);

	}

    }

    /**
     * Permutes I in place: the value originally at position k is moved to
     * position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[10]) {
	double T[10];
	for (int k = 0; k < 10; ++k) {
	    T[k] = I[k];
	}
	for (int k = 0; k < 10; ++k) {
	    I[index(k)] = T[k];
	}
    }

    /** Destination slot for source position i; i must lie in [0, 10). */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[10] = {
	    0, 1, 2, 9, 3, 4, 6, 5, 7, 8
	};
	return slot[i];
    }

    /** Writes the 10-entry permutation table to idx[0..9]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 10; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Kernel specialization for meta::braket<S, P, P, S>.
 *
 * eval() adds 9 weighted terms per (t2, W) entry to an output buffer;
 * index()/reorder() describe and apply a permutation of those 9 slots.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::P, rysq::P, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 2;

    /** Default accumulator: adds q to the value Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /**
     * Fixed-size entry point: processes N (= 2) entries of t2/W and
     * accumulates into I with the default update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[9]) {
	double *out = I;
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, out, update());
    }

    /** Same as the core kernel below, using the default update functor. */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel.  For every a in [0, M) it builds the recurrence
     * coefficients from t2[a] and hands 9 terms, each scaled by
     * C[0][0]*W[a], to update(term, I + k).  rkl is not read by this
     * specialization.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double root = t2[a];
	    const double w = C[0][0]*W[a];

	    const double B00 = 0.5*root;

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, root);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, root);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, root);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, root);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, root);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, root);

	    const double Ix = Cx + rij[0];
	    const double Iy = Cy + rij[1];
	    const double Iz = Cz + rij[2];

	    update(w*(Dx*Ix + B00), I + 0);
	    update(w*(Dy*Ix), I + 1);
	    update(w*(Dz*Ix), I + 2);
	    update(w*(Dy*Iy + B00), I + 3);
	    update(w*(Dx*Iy), I + 4);
	    update(w*(Dz*Iy), I + 5);
	    update(w*(B00 + Dz*Iz), I + 6);
	    update(w*(Dx*Iz), I + 7);
	    update(w*(Dy*Iz), I + 8);

	}

    }

    /**
     * Permutes I in place: the value originally at position k is moved to
     * position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[9]) {
	double T[9];
	for (int k = 0; k < 9; ++k) {
	    T[k] = I[k];
	}
	for (int k = 0; k < 9; ++k) {
	    I[index(k)] = T[k];
	}
    }

    /** Destination slot for source position i; i must lie in [0, 9). */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[9] = {
	    0, 3, 6, 4, 1, 7, 8, 2, 5
	};
	return slot[i];
    }

    /** Writes the 9-entry permutation table to idx[0..8]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 9; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Kernel specialization for meta::braket<S, D, SP, S>.
 *
 * eval() adds 24 weighted terms per (t2, W) entry to an output buffer;
 * index()/reorder() describe and apply a permutation of those 24 slots.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::D, rysq::SP, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 2;

    /** Default accumulator: adds q to the value Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /**
     * Fixed-size entry point: processes N (= 2) entries of t2/W and
     * accumulates into I with the default update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double (&I)[24]) {
	double *out = I;
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, out, update());
    }

    /** Same as the core kernel below, using the default update functor. */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel.  For every a in [0, M) it builds the recurrence
     * coefficients from t2[a] and hands 24 terms to update(term, I + k).
     * Terms k = 3, 7, 9, 13, 16, 17 are scaled by C[0][0]*W[a], all
     * others by C[1][0]*W[a].  1.0/A is evaluated; rkl is not read by
     * this specialization.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double root = t2[a];
	    // Per-point weights for the two rows of C.
	    const double w0 = C[0][0]*W[a];
	    const double w1 = C[1][0]*W[a];

	    const double B00 = 0.5*root;
	    const double B10 = recurrence::coefficient(1.0/A, B, root);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, root);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, root);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, root);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, root);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, root);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, root);

	    const double Ix = Cx + rij[0];
	    const double Iy = Cy + rij[1];
	    const double Iz = Cz + rij[2];

	    // B10 + I^2 per axis, shared by four terms each.
	    const double Px = B10 + recurrence::pow<2>(Ix);
	    const double Py = B10 + recurrence::pow<2>(Iy);
	    const double Pz = B10 + recurrence::pow<2>(Iz);

	    update(w1*(2*B00*Ix + Dx*Px), I + 0);
	    update(w1*(Dy*Px), I + 1);
	    update(w1*(Dz*Px), I + 2);
	    update(w0*(Px), I + 3);
	    update(w1*(2*B00*Iy + Dy*Py), I + 4);
	    update(w1*(Dz*Py), I + 5);
	    update(w1*(Dx*Py), I + 6);
	    update(w0*(Py), I + 7);
	    update(w1*(Dz*Ix*Iy), I + 8);
	    update(w0*(Ix*Iy), I + 9);
	    update(w1*(2*B00*Iz + Dz*Pz), I + 10);
	    update(w1*(Dy*Pz), I + 11);
	    update(w1*(Dx*Pz), I + 12);
	    update(w0*(Pz), I + 13);
	    update(w1*(Dy*Ix*Iz), I + 14);
	    update(w1*(Dx*Iy*Iz), I + 15);
	    update(w0*(Iy*Iz), I + 16);
	    update(w0*(Ix*Iz), I + 17);

	    const double Qy = Dy*Iy + B00;
	    update(w1*(Ix*Qy), I + 18);
	    update(w1*(Iz*Qy), I + 19);

	    const double Qz = B00 + Dz*Iz;
	    update(w1*(Ix*Qz), I + 20);
	    update(w1*(Iy*Qz), I + 21);

	    const double Qx = Dx*Ix + B00;
	    update(w1*(Iy*Qx), I + 22);
	    update(w1*(Iz*Qx), I + 23);

	}

    }

    /**
     * Permutes I in place: the value originally at position k is moved to
     * position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[24]) {
	double T[24];
	for (int k = 0; k < 24; ++k) {
	    T[k] = I[k];
	}
	for (int k = 0; k < 24; ++k) {
	    I[index(k)] = T[k];
	}
    }

    /** Destination slot for source position i; i must lie in [0, 24). */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[24] = {
	     6, 12, 18,  0, 13, 19,  7,  1, 21,  3, 20, 14,
	     8,  2, 16, 11,  5,  4, 15, 17, 22, 23,  9, 10
	};
	return slot[i];
    }

    /** Writes the 24-entry permutation table to idx[0..23]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 24; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Kernel specialization for meta::braket<D, S, S, S>.
 *
 * eval() adds 6 weighted terms per (t2, W) entry to an output buffer;
 * index()/reorder() describe and apply a permutation of those 6 slots.
 */
template<>
struct impl<meta::braket<rysq::D, rysq::S, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 2;

    /** Default accumulator: adds q to the value Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /**
     * Fixed-size entry point: processes N (= 2) entries of t2/W and
     * accumulates into I with the default update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[6]) {
	double *out = I;
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, out, update());
    }

    /** Same as the core kernel below, using the default update functor. */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel.  For every a in [0, M) it builds the recurrence
     * coefficients from t2[a] and hands 6 terms, each scaled by
     * C[0][0]*W[a], to update(term, I + k).  1.0/A is evaluated; rBk, rij
     * and rkl are not read by this specialization.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double root = t2[a];
	    const double w = C[0][0]*W[a];

	    const double B10 = recurrence::coefficient(1.0/A, B, root);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, root);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, root);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, root);

	    const double Px = B10 + recurrence::pow<2>(Cx);
	    const double Py = B10 + recurrence::pow<2>(Cy);
	    const double Pz = B10 + recurrence::pow<2>(Cz);

	    update(w*(Cx*Cy), I + 0);
	    update(w*(Cx*Cz), I + 1);
	    update(w*(Cy*Cz), I + 2);
	    update(w*(Px), I + 3);
	    update(w*(Py), I + 4);
	    update(w*(Pz), I + 5);

	}

    }

    /**
     * Permutes I in place: the value originally at position k is moved to
     * position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[6]) {
	double T[6];
	for (int k = 0; k < 6; ++k) {
	    T[k] = I[k];
	}
	for (int k = 0; k < 6; ++k) {
	    I[index(k)] = T[k];
	}
    }

    /** Destination slot for source position i; i must lie in [0, 6). */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[6] = {
	    3, 4, 5, 0, 1, 2
	};
	return slot[i];
    }

    /** Writes the 6-entry permutation table to idx[0..5]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 6; ++k) {
	    idx[k] = index(k);
	}
    }


};

template<>
struct impl<meta::braket<rysq::F, rysq::S, rysq::P, rysq::P> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /** Default accumulator: adds the contribution q to the value Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /**
     * Fixed-size entry point: processes N (= 3) entries of t2/W and
     * accumulates the 90 terms into I with the default update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[90]) {
	// Decay the array reference so the pointer-based kernel is selected.
	double *out = I;
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, out, update());
    }

    /**
     * Pointer-based entry point: forwards to the core kernel for M entries
     * of t2/W, supplying the default update functor.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel of this specialization.
     *
     * For every a in [0, M) it derives the recurrence coefficients from
     * t2[a] and passes 90 terms, each scaled by C[0][0]*W[a], to
     * update(term, I + k) for k = 0..89, in increasing k.
     *
     * @tparam M      number of t2/W entries processed by the loop
     * @tparam U      callable invoked as update(double, double*)
     * @param A, B    scalars entering the coefficients; 1.0/A and 1.0/B
     *                are evaluated
     * @param rAi, rBk, rAB  indexable with 0..2; inputs of
     *                recurrence::coefficient<q>
     * @param rij     not read by this specialization
     * @param rkl     indexable with 0..2; added to the D coefficients
     * @param t2, W   indexable with a in [0, M)
     * @param C       single coefficient C[0][0]
     * @param I       output pointer; slots I[0..89] are handed to update
     * @param update  functor that receives each term and its target slot
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {


	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


	// Compiler hints only; they do not change what the loop computes.
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Scalar recurrence coefficients for this entry.
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // Per-axis coefficients built from rAi (with B) ...
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // ... and from rBk (with -A).
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Local function-like macro: pow(x, n) expands to the compile-time
	    // recurrence::pow<n>(x); it is undefined again after the last term.
#define pow(x,y) recurrence::pow<y>((x))

	    // Per-axis combinations reused by many of the terms below.
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    // Each fNN temporary below holds a subexpression shared by the
	    // update calls that follow it.
	    update((C[0][0])*W[a]*(Ky*Pz*Qx), I+0);
	    update((C[0][0])*W[a]*(Kz*Py*Qx), I+1);
	    update((C[0][0])*W[a]*(Kz*Px*Qy), I+2);
	    update((C[0][0])*W[a]*(Kx*Pz*Qy), I+3);
	    update((C[0][0])*W[a]*(Ky*Px*Qz), I+4);
	    update((C[0][0])*W[a]*(Kx*Py*Qz), I+5);
	    update((C[0][0])*W[a]*(Cx*Qz*(Cy*Ykl + Qy)), I+6);
	    update((C[0][0])*W[a]*(Dz*Px*(Cy*Ykl + Qy)), I+7);
	    update((C[0][0])*W[a]*(Cz*Qx*(Cy*Ykl + Qy)), I+8);
	    update((C[0][0])*W[a]*(Dx*Pz*(Cy*Ykl + Qy)), I+9);
	    update((C[0][0])*W[a]*(Dx*Py*(Cz*Zkl + Qz)), I+10);
	    update((C[0][0])*W[a]*(Cy*Qx*(Cz*Zkl + Qz)), I+11);
	    update((C[0][0])*W[a]*(Dy*Px*(Cz*Zkl + Qz)), I+12);
	    update((C[0][0])*W[a]*(Cx*Qy*(Cz*Zkl + Qz)), I+13);
	    double f0 = (3*B10 + pow(Cx,2));
	    update((C[0][0])*W[a]*((3*B00*Px*(Xkl + 2*Dx) + Cx*f0*(B01 + Dx*Kx) + 6*Cx*pow(B00,2))), I+14);
	    update((C[0][0])*W[a]*(Cx*Dz*Ky*f0), I+15);
	    update((C[0][0])*W[a]*(Cx*Dy*Kz*f0), I+16);
	    double f1 = (Cx*Kx*(3*B10 + pow(Cx,2)) + 3*B00*Px);
	    update((C[0][0])*W[a]*(Dz*f1), I+17);
	    update((C[0][0])*W[a]*(Dy*f1), I+18);
	    double f10 = (Cz*Kz*(3*B10 + pow(Cz,2)) + 3*B00*Pz);
	    update((C[0][0])*W[a]*(Dx*f10), I+19);
	    update((C[0][0])*W[a]*(Dy*f10), I+20);
	    double f11 = (B01*Cz + B00*(2*Dz + Zkl) + Cz*Dz*Kz);
	    update((C[0][0])*W[a]*(Cx*Cy*f11), I+21);
	    update((C[0][0])*W[a]*(Px*f11), I+22);
	    update((C[0][0])*W[a]*(Py*f11), I+23);
	    double f12 = (B01 + Dx*Kx);
	    update((C[0][0])*W[a]*(Cy*Pz*f12), I+24);
	    update((C[0][0])*W[a]*(Cz*Py*f12), I+25);
	    double f14 = (3*B00*Pz + Cz*Dz*(3*B10 + pow(Cz,2)));
	    update((C[0][0])*W[a]*(Kx*f14), I+26);
	    update((C[0][0])*W[a]*(Ky*f14), I+27);
	    double f15 = (Kx*Px + 2*B00*Cx);
	    update((C[0][0])*W[a]*(Cy*Dz*f15), I+28);
	    update((C[0][0])*W[a]*(Cz*Dy*f15), I+29);
	    update((C[0][0])*W[a]*(Qy*f15), I+30);
	    update((C[0][0])*W[a]*(Qz*f15), I+31);
	    double f16 = (Dy*Py + 2*B00*Cy);
	    update((C[0][0])*W[a]*(Cx*Dz*(f16 + Py*Ykl)), I+32);
	    update((C[0][0])*W[a]*(Cz*Dx*(f16 + Py*Ykl)), I+33);
	    update((C[0][0])*W[a]*(Qz*(f16 + Py*Ykl)), I+34);
	    update((C[0][0])*W[a]*(Qx*(f16 + Py*Ykl)), I+35);
	    update((C[0][0])*W[a]*(Cx*Kz*f16), I+36);
	    update((C[0][0])*W[a]*(f16*(Cz*Zkl + Qz)), I+37);
	    update((C[0][0])*W[a]*(Cz*Kx*f16), I+38);
	    double f17 = (2*pow(B00,2) + Pz*(Dz*Kz + B01) + 2*B00*Cz*(2*Dz + Zkl));
	    update((C[0][0])*W[a]*(Cx*f17), I+39);
	    update((C[0][0])*W[a]*(Cy*f17), I+40);
	    double f2 = (Dz*Pz + 2*B00*Cz);
	    update((C[0][0])*W[a]*(Cx*Dy*(Pz*Zkl + f2)), I+41);
	    update((C[0][0])*W[a]*(Cy*Dx*(Pz*Zkl + f2)), I+42);
	    update((C[0][0])*W[a]*(Qx*(Pz*Zkl + f2)), I+43);
	    update((C[0][0])*W[a]*(Qy*(Pz*Zkl + f2)), I+44);
	    update((C[0][0])*W[a]*(Cx*Ky*f2), I+45);
	    update((C[0][0])*W[a]*(f2*(Cy*Ykl + Qy)), I+46);
	    update((C[0][0])*W[a]*(Cy*Kx*f2), I+47);
	    double f20 = (Dx*Px + 2*B00*Cx);
	    update((C[0][0])*W[a]*(Cy*Kz*f20), I+48);
	    update((C[0][0])*W[a]*(f20*(Cy*Ykl + Qy)), I+49);
	    update((C[0][0])*W[a]*(Cz*Ky*f20), I+50);
	    update((C[0][0])*W[a]*(f20*(Cz*Zkl + Qz)), I+51);
	    double f22 = (Dz*Kz + B01);
	    update((C[0][0])*W[a]*(Cy*Px*f22), I+52);
	    update((C[0][0])*W[a]*(Cx*f0*f22), I+53);
	    update((C[0][0])*W[a]*(Cx*Py*f22), I+54);
	    double f23 = (B00 + Cx*Kx);
	    update((C[0][0])*W[a]*(Cy*Qz*f23), I+55);
	    update((C[0][0])*W[a]*(Dz*Py*f23), I+56);
	    update((C[0][0])*W[a]*(Cz*Qy*f23), I+57);
	    update((C[0][0])*W[a]*(Dy*Pz*f23), I+58);
	    update((C[0][0])*W[a]*(f16*f23), I+59);
	    update((C[0][0])*W[a]*(f2*f23), I+60);
	    double f29 = (Cx*Dx*(3*B10 + pow(Cx,2)) + 3*B00*Px);
	    update((C[0][0])*W[a]*(Ky*f29), I+61);
	    update((C[0][0])*W[a]*(Kz*f29), I+62);
	    double f3 = (Py*(B01 + Dy*Ky) + 2*pow(B00,2) + 2*B00*Cy*(Ykl + 2*Dy));
	    update((C[0][0])*W[a]*(Cz*f3), I+63);
	    update((C[0][0])*W[a]*(Cx*f3), I+64);
	    double f30 = (B01 + Dy*Ky);
	    update((C[0][0])*W[a]*(Cz*Px*f30), I+65);
	    update((C[0][0])*W[a]*(Cx*Pz*f30), I+66);
	    update((C[0][0])*W[a]*(Cx*f0*f30), I+67);
	    double f31 = (3*B10 + pow(Cz,2));
	    update((C[0][0])*W[a]*((Cz*f31*(Dz*Kz + B01) + 3*B00*Pz*(2*Dz + Zkl) + 6*Cz*pow(B00,2))), I+68);
	    update((C[0][0])*W[a]*(Cz*f30*f31), I+69);
	    update((C[0][0])*W[a]*(Cz*Dx*Ky*f31), I+70);
	    update((C[0][0])*W[a]*(Cz*f12*f31), I+71);
	    update((C[0][0])*W[a]*(Cz*Dy*Kx*f31), I+72);
	    double f32 = (3*B10 + pow(Cy,2));
	    update((C[0][0])*W[a]*((6*Cy*pow(B00,2) + 3*B00*Py*(Ykl + 2*Dy) + Cy*f32*(B01 + Dy*Ky))), I+73);
	    update((C[0][0])*W[a]*(Cy*Dx*Kz*f32), I+74);
	    update((C[0][0])*W[a]*(Cy*Dz*Kx*f32), I+75);
	    update((C[0][0])*W[a]*(Cy*f22*f32), I+76);
	    update((C[0][0])*W[a]*(Cy*f12*f32), I+77);
	    double f4 = (Cy*Dy*Ky + B01*Cy + B00*(Ykl + 2*Dy));
	    update((C[0][0])*W[a]*(Cx*Cz*f4), I+78);
	    update((C[0][0])*W[a]*(Pz*f4), I+79);
	    update((C[0][0])*W[a]*(Px*f4), I+80);
	    double f5 = (3*B00*Py + Cy*Ky*(3*B10 + pow(Cy,2)));
	    update((C[0][0])*W[a]*(Dx*f5), I+81);
	    update((C[0][0])*W[a]*(Dz*f5), I+82);
	    double f6 = (Cy*Dy*(3*B10 + pow(Cy,2)) + 3*B00*Py);
	    update((C[0][0])*W[a]*(Kx*f6), I+83);
	    update((C[0][0])*W[a]*(Kz*f6), I+84);
	    double f7 = (B01*Cx + Cx*Dx*Kx + B00*(Xkl + 2*Dx));
	    update((C[0][0])*W[a]*(Cy*Cz*f7), I+85);
	    update((C[0][0])*W[a]*(Py*f7), I+86);
	    update((C[0][0])*W[a]*(Pz*f7), I+87);
	    double f9 = (2*B00*Cx*(Xkl + 2*Dx) + 2*pow(B00,2) + Px*(B01 + Dx*Kx));
	    update((C[0][0])*W[a]*(Cy*f9), I+88);
	    update((C[0][0])*W[a]*(Cz*f9), I+89);
#undef pow

	}

    }

    /**
     * Permutes the 90 values of I in place.
     * The value found at position k on entry is stored at position
     * target[k] on return.
     * @param I array of 90 values, overwritten with the permuted order
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[90]) {
        // Destination slot for each incoming position.
        const unsigned short target[90] = {
            37, 65, 73, 18, 54, 26, 59, 53, 39, 38,
            66, 69, 74, 79,  0, 50, 70, 20, 10, 62,
            72, 89, 84, 86,  8,  6, 22, 52, 23, 14,
            13, 24, 55, 36, 56, 35, 75, 76, 16, 87,
            88, 77, 68, 67, 78, 57, 58, 28, 63, 33,
            34, 64, 83, 80, 85, 29, 25, 19, 17, 15,
            27, 30, 60, 46, 45, 44, 47, 40, 82, 42,
            32,  2, 12, 41, 61, 21, 81,  1, 49, 48,
            43, 31, 51, 11, 71,  9,  5,  7,  3,  4
        };

        // Snapshot the input first so that no value is overwritten
        // before it has been moved.
        double saved[90];
        for (int k = 0; k != 90; ++k) {
            saved[k] = I[k];
        }
        for (int k = 0; k != 90; ++k) {
            I[target[k]] = saved[k];
        }
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[90] = { 14, 77, 71, 88, 89, 86, 25, 87, 24, 85, 18, 83, 72, 30, 29, 59, 38, 58, 3, 57, 17, 75, 26, 28, 31, 56, 5, 60, 47, 55, 61, 81, 70, 49, 50, 35, 33, 0, 9, 8, 67, 73, 69, 80, 65, 64, 63, 66, 79, 78, 15, 82, 27, 7, 4, 32, 34, 45, 46, 6, 62, 74, 19, 48, 51, 1, 10, 43, 42, 11, 16, 84, 20, 2, 12, 36, 37, 41, 44, 13, 53, 76, 68, 52, 22, 54, 23, 39, 40, 21 };
// 	if (index < 90) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Looks up entry i of the 90-entry permutation table.
     * @param i table position; not range-checked, must lie in [0, 90)
     * @return the table entry at position i
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short order[] = {
            37, 65, 73, 18, 54, 26, 59, 53, 39, 38,
            66, 69, 74, 79,  0, 50, 70, 20, 10, 62,
            72, 89, 84, 86,  8,  6, 22, 52, 23, 14,
            13, 24, 55, 36, 56, 35, 75, 76, 16, 87,
            88, 77, 68, 67, 78, 57, 58, 28, 63, 33,
            34, 64, 83, 80, 85, 29, 25, 19, 17, 15,
            27, 30, 60, 46, 45, 44, 47, 40, 82, 42,
            32,  2, 12, 41, 61, 21, 81,  1, 49, 48,
            43, 31, 51, 11, 71,  9,  5,  7,  3,  4
        };
        const int entry = order[i];
        return entry;
    }

    /**
     * Writes the complete 90-entry permutation table to idx[0..89].
     * @param idx destination; must provide room for 90 elements
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        // Held as int so each store converts from int, as a literal would.
        const int order[90] = {
            37, 65, 73, 18, 54, 26, 59, 53, 39, 38,
            66, 69, 74, 79,  0, 50, 70, 20, 10, 62,
            72, 89, 84, 86,  8,  6, 22, 52, 23, 14,
            13, 24, 55, 36, 56, 35, 75, 76, 16, 87,
            88, 77, 68, 67, 78, 57, 58, 28, 63, 33,
            34, 64, 83, 80, 85, 29, 25, 19, 17, 15,
            27, 30, 60, 46, 45, 44, 47, 40, 82, 42,
            32,  2, 12, 41, 61, 21, 81,  1, 49, 48,
            43, 31, 51, 11, 71,  9,  5,  7,  3,  4
        };
        for (int k = 0; k < 90; ++k) {
            *idx++ = order[k];
        }
    }


};

/**
 * Specialization of impl for meta::braket<P, P, S, SP>.
 *
 * eval() accumulates 36 values: slots 0-8 are scaled by C[0][0] and
 * slots 9-35 by C[1][0].  reorder() and index() expose the permutation
 * that maps the slot order used by eval() onto the final ordering.
 */
template<>
struct impl<meta::braket<rysq::P, rysq::P, rysq::S, rysq::SP> > {
    typedef void enable;
    static const bool value = true;
    /** Loop count used by the fixed-size eval() overload. */
    static const int N = 2;

    /** Default accumulation policy: adds q onto the element Q points at. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &q, double *Q) const {
            Q[0] += q;
        }
    };

    /**
     * Fixed-size entry point: runs the kernel over N points and adds the
     * results onto the 36-element array I.
     */
    template<
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][1],
              double (&I)[36]) {
        eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Runs the kernel over M points and adds the results onto I[0..35]
     * using the default update policy.
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][1],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel proper.  For each point a in [0, M) it evaluates 36 polynomial
     * terms from the recurrence coefficients and hands each one, scaled by
     * C[.][0]*W[a], to update(term, I + slot).
     *
     * @param A, B      scalars fed to the recurrence coefficients
     * @param rAi, rBk  3-component inputs for the C* and D* coefficients
     * @param rAB       3-component input shared by both coefficient sets
     * @param rij, rkl  3-component offsets; elements [0..2] are read
     * @param t2, W     per-point values, indexed by a
     *                  (presumably quadrature roots and weights -- confirm)
     * @param C         scale factors, C[0][0] for slots 0-8, C[1][0] otherwise
     * @param I         output base pointer, 36 slots are addressed
     * @param update    callable invoked as update(value, pointer)
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_,
        class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][1],
              double *I, const U &update) {

        const double &Xij = rij[0], &Yij = rij[1], &Zij = rij[2];
        const double &Xkl = rkl[0], &Ykl = rkl[1], &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int a = 0; a < M; ++a) {
            const double t = t2[a];
            // Scale factors for this point, one per row of C.
            const double w0 = C[0][0]*W[a];
            const double w1 = C[1][0]*W[a];

            const double B00 = 0.5*t;
            const double B10 = recurrence::coefficient(1.0/A, B, t);

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

            const double Ix = Cx + Xij;
            const double Iy = Cy + Yij;
            const double Iz = Cz + Zij;
            const double Kx = Xkl + Dx;
            const double Ky = Ykl + Dy;
            const double Kz = Dz + Zkl;

            // Sub-terms that recur in several slots.
            const double xx = Cx*Ix + B10;
            const double yy = Cy*Iy + B10;
            const double zz = B10 + Cz*Iz;

            update(w0*xx, I+0);
            update(w0*(Cz*Ix), I+1);
            update(w0*(Cy*Ix), I+2);
            update(w0*yy, I+3);
            update(w0*(Cz*Iy), I+4);
            update(w0*(Cx*Iy), I+5);
            update(w0*zz, I+6);
            update(w0*(Cx*Iz), I+7);
            update(w0*(Cy*Iz), I+8);

            update(w1*(Kx*yy), I+9);
            update(w1*(Cy*Iz*Kx), I+10);
            update(w1*(Kx*zz), I+11);
            update(w1*(Cz*Iy*Kx), I+12);
            update(w1*(Ky*zz), I+13);
            update(w1*(Ky*xx), I+14);
            update(w1*(Cx*Iz*Ky), I+15);
            update(w1*(Cz*Ix*Ky), I+16);
            update(w1*(Kz*yy), I+17);
            update(w1*(Kz*xx), I+18);
            update(w1*(Cx*Iy*Kz), I+19);
            update(w1*(Cy*Ix*Kz), I+20);
            update(w1*(Kx*xx + B00*(Xij + 2*Cx)), I+21);
            update(w1*(B00*(Yij + 2*Cy) + Ky*yy), I+22);
            update(w1*(Kz*zz + B00*(2*Cz + Zij)), I+23);

            const double cxkx = B00 + Cx*Kx;
            update(w1*(Iy*cxkx), I+24);
            update(w1*(Iz*cxkx), I+25);

            const double cyky = B00 + Cy*Ky;
            update(w1*(Ix*cyky), I+26);
            update(w1*(Iz*cyky), I+27);

            const double czkz = B00 + Cz*Kz;
            update(w1*(Ix*czkz), I+28);
            update(w1*(Iy*czkz), I+29);

            const double ixkx = B00 + Ix*Kx;
            update(w1*(Cy*ixkx), I+30);
            update(w1*(Cz*ixkx), I+31);

            const double izkz = Iz*Kz + B00;
            update(w1*(Cx*izkz), I+32);
            update(w1*(Cy*izkz), I+33);

            const double iyky = B00 + Iy*Ky;
            update(w1*(Cx*iyky), I+34);
            update(w1*(Cz*iyky), I+35);
        }
    }

    /**
     * Permutes I in place: the value found in slot k on entry is stored
     * at position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[36]) {
        double saved[36];
        for (int k = 0; k != 36; ++k) {
            saved[k] = I[k];
        }
        for (int k = 0; k != 36; ++k) {
            I[index(k)] = saved[k];
        }
    }

    /**
     * Final position of the value eval() writes to slot i.
     * @param i slot number; not range-checked, must lie in [0, 36)
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short order[] = {
             0,  2,  1,  4,  5,  3,  8,  6,  7,
            13, 16, 17, 14, 26, 18, 24, 20, 31,
            27, 30, 28,  9, 22, 35, 12, 15, 19,
            25, 29, 32, 10, 11, 33, 34, 21, 23
        };
        return order[i];
    }

    /** Writes index(0) .. index(35), in that order, to idx[0..35]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        for (int k = 0; k < 36; ++k) {
            *idx++ = index(k);
        }
    }

};

/**
 * Specialization of impl for meta::braket<D, S, P, S>.
 *
 * eval() accumulates 18 values, all scaled by C[0][0].  reorder() and
 * index() expose the permutation that maps the slot order used by eval()
 * onto the final ordering.
 */
template<>
struct impl<meta::braket<rysq::D, rysq::S, rysq::P, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    /** Loop count used by the fixed-size eval() overload. */
    static const int N = 2;

    /** Default accumulation policy: adds q onto the element Q points at. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &q, double *Q) const {
            Q[0] += q;
        }
    };

    /**
     * Fixed-size entry point: runs the kernel over N points and adds the
     * results onto the 18-element array I.
     */
    template<
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double (&I)[18]) {
        eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Runs the kernel over M points and adds the results onto I[0..17]
     * using the default update policy.
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel proper.  For each point a in [0, M) it evaluates 18 polynomial
     * terms from the recurrence coefficients and hands each one, scaled by
     * C[0][0]*W[a], to update(term, I + slot).
     *
     * @param A, B      scalars fed to the recurrence coefficients
     * @param rAi, rBk  3-component inputs for the C* and D* coefficients
     * @param rAB       3-component input shared by both coefficient sets
     * @param rij, rkl  accepted for a uniform signature; not read here
     * @param t2, W     per-point values, indexed by a
     *                  (presumably quadrature roots and weights -- confirm)
     * @param C         single scale factor C[0][0]
     * @param I         output base pointer, 18 slots are addressed
     * @param update    callable invoked as update(value, pointer)
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_,
        class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I, const U &update) {

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int a = 0; a < M; ++a) {
            const double t = t2[a];
            // Common scale factor for every slot at this point.
            const double w = C[0][0]*W[a];

            const double B00 = 0.5*t;
            const double B10 = recurrence::coefficient(1.0/A, B, t);

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

            const double Px = B10 + recurrence::pow<2>(Cx);
            const double Py = B10 + recurrence::pow<2>(Cy);
            const double Pz = B10 + recurrence::pow<2>(Cz);
            const double Qx = Cx*Dx + B00;
            const double Qy = Cy*Dy + B00;
            const double Qz = Cz*Dz + B00;

            update(w*(Cy*Cz*Dx), I+0);
            update(w*(Cx*Cz*Dy), I+1);
            update(w*(Cx*Cy*Dz), I+2);
            update(w*(Dx*Px + 2*B00*Cx), I+3);
            update(w*(Dy*Px), I+4);
            update(w*(Dz*Px), I+5);
            update(w*(Dy*Py + 2*B00*Cy), I+6);
            update(w*(Dx*Py), I+7);
            update(w*(Dz*Py), I+8);
            update(w*(Dz*Pz + 2*B00*Cz), I+9);
            update(w*(Dx*Pz), I+10);
            update(w*(Dy*Pz), I+11);
            update(w*(Cy*Qx), I+12);
            update(w*(Cz*Qx), I+13);
            update(w*(Cz*Qy), I+14);
            update(w*(Cx*Qy), I+15);
            update(w*(Cx*Qz), I+16);
            update(w*(Cy*Qz), I+17);
        }
    }

    /**
     * Permutes I in place: the value found in slot k on entry is stored
     * at position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[18]) {
        double saved[18];
        for (int k = 0; k != 18; ++k) {
            saved[k] = I[k];
        }
        for (int k = 0; k != 18; ++k) {
            I[index(k)] = saved[k];
        }
    }

    /**
     * Final position of the value eval() writes to slot i.
     * @param i slot number; not range-checked, must lie in [0, 18)
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short order[] = {
             5, 10, 15,  0,  6, 12,
             7,  1, 13, 14,  2,  8,
             3,  4, 11,  9, 16, 17
        };
        return order[i];
    }

    /** Writes index(0) .. index(17), in that order, to idx[0..17]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        for (int k = 0; k < 18; ++k) {
            *idx++ = index(k);
        }
    }

};

/**
 * Specialization of impl for meta::braket<P, S, S, S>.
 *
 * eval() accumulates 3 values, all scaled by C[0][0].  The permutation
 * reported by reorder() and index() is the identity for this case.
 */
template<>
struct impl<meta::braket<rysq::P, rysq::S, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    /** Loop count used by the fixed-size eval() overload. */
    static const int N = 1;

    /** Default accumulation policy: adds q onto the element Q points at. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &q, double *Q) const {
            Q[0] += q;
        }
    };

    /**
     * Fixed-size entry point: runs the kernel over N points and adds the
     * results onto the 3-element array I.
     */
    template<
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double (&I)[3]) {
        eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Runs the kernel over M points and adds the results onto I[0..2]
     * using the default update policy.
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel proper.  For each point a in [0, M) it passes the three
     * recurrence coefficients built from rAi, B and rAB, scaled by
     * C[0][0]*W[a], to update(term, I + slot).
     *
     * @param A, rBk, rij, rkl  accepted for a uniform signature; not read here
     * @param B, rAi, rAB       inputs to the recurrence coefficients
     * @param t2, W             per-point values, indexed by a
     *                          (presumably quadrature roots and weights -- confirm)
     * @param C                 single scale factor C[0][0]
     * @param I                 output base pointer, 3 slots are addressed
     * @param update            callable invoked as update(value, pointer)
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_,
        class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I, const U &update) {

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int a = 0; a < M; ++a) {
            const double t = t2[a];
            // Common scale factor for every slot at this point.
            const double w = C[0][0]*W[a];

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

            update(w*Cx, I+0);
            update(w*Cy, I+1);
            update(w*Cz, I+2);
        }
    }

    /**
     * Permutes I in place: the value found in slot k on entry is stored
     * at position index(k).  With the identity table this leaves I as is.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[3]) {
        double saved[3];
        for (int k = 0; k != 3; ++k) {
            saved[k] = I[k];
        }
        for (int k = 0; k != 3; ++k) {
            I[index(k)] = saved[k];
        }
    }

    /**
     * Final position of the value eval() writes to slot i.
     * @param i slot number; not range-checked, must lie in [0, 3)
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short order[] = { 0, 1, 2 };
        return order[i];
    }

    /** Writes index(0) .. index(2), in that order, to idx[0..2]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        for (int k = 0; k < 3; ++k) {
            *idx++ = index(k);
        }
    }

};

/**
 * Specialization of impl for meta::braket<SP, S, SP, S>.
 *
 * eval() accumulates 16 values; each slot is scaled by one of the four
 * entries of the 2x2 table C.  reorder() and index() expose the
 * permutation that maps the slot order used by eval() onto the final
 * ordering.
 */
template<>
struct impl<meta::braket<rysq::SP, rysq::S, rysq::SP, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    /** Loop count used by the fixed-size eval() overload. */
    static const int N = 2;

    /** Default accumulation policy: adds q onto the element Q points at. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &q, double *Q) const {
            Q[0] += q;
        }
    };

    /**
     * Fixed-size entry point: runs the kernel over N points and adds the
     * results onto the 16-element array I.
     */
    template<
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][2],
              double (&I)[16]) {
        eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Runs the kernel over M points and adds the results onto I[0..15]
     * using the default update policy.
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][2],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel proper.  For each point a in [0, M) it evaluates 16 terms from
     * the recurrence coefficients and hands each one, scaled by the
     * matching C[i][j]*W[a], to update(term, I + slot).
     *
     * @param A, B      scalars fed to the recurrence coefficients
     * @param rAi, rBk  3-component inputs for the C* and D* coefficients
     * @param rAB       3-component input shared by both coefficient sets
     * @param rij, rkl  accepted for a uniform signature; not read here
     * @param t2, W     per-point values, indexed by a
     *                  (presumably quadrature roots and weights -- confirm)
     * @param C         2x2 table of scale factors
     * @param I         output base pointer, 16 slots are addressed
     * @param update    callable invoked as update(value, pointer)
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_,
        class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][2],
              double *I, const U &update) {

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int a = 0; a < M; ++a) {
            const double t = t2[a];
            // Scale factors for this point, one per entry of C.
            const double w00 = C[0][0]*W[a];
            const double w01 = C[0][1]*W[a];
            const double w10 = C[1][0]*W[a];
            const double w11 = C[1][1]*W[a];

            const double B00 = 0.5*t;

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

            const double Qx = Cx*Dx + B00;
            const double Qy = Cy*Dy + B00;
            const double Qz = Cz*Dz + B00;

            update(w00*(1), I+0);   // constant term: the scale factor alone
            update(w01*Cx, I+1);
            update(w01*Cy, I+2);
            update(w01*Cz, I+3);
            update(w11*(Cy*Dx), I+4);
            update(w11*(Cz*Dx), I+5);
            update(w10*Dx, I+6);
            update(w11*(Cz*Dy), I+7);
            update(w11*(Cx*Dy), I+8);
            update(w10*Dy, I+9);
            update(w11*(Cx*Dz), I+10);
            update(w11*(Cy*Dz), I+11);
            update(w10*Dz, I+12);
            update(w11*Qx, I+13);
            update(w11*Qy, I+14);
            update(w11*Qz, I+15);
        }
    }

    /**
     * Permutes I in place: the value found in slot k on entry is stored
     * at position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[16]) {
        double saved[16];
        for (int k = 0; k != 16; ++k) {
            saved[k] = I[k];
        }
        for (int k = 0; k != 16; ++k) {
            I[index(k)] = saved[k];
        }
    }

    /**
     * Final position of the value eval() writes to slot i.
     * @param i slot number; not range-checked, must lie in [0, 16)
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short order[] = {
             0,  1,  2,  3,  6,  7,  4, 11,
             9,  8, 13, 14, 12,  5, 10, 15
        };
        return order[i];
    }

    /** Writes index(0) .. index(15), in that order, to idx[0..15]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        for (int k = 0; k < 16; ++k) {
            *idx++ = index(k);
        }
    }

};

/**
 * Specialization of impl for meta::braket<SP, SP, SP, S>.
 *
 * eval() accumulates 64 values; each slot is scaled by one of the eight
 * entries of the 2x4 table C.  reorder() and index() expose the
 * permutation that maps the slot order used by eval() onto the final
 * ordering.
 */
template<>
struct impl<meta::braket<rysq::SP, rysq::SP, rysq::SP, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    /** Loop count used by the fixed-size eval() overload. */
    static const int N = 2;

    /** Default accumulation policy: adds q onto the element Q points at. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &q, double *Q) const {
            Q[0] += q;
        }
    };

    /**
     * Fixed-size entry point: runs the kernel over N points and adds the
     * results onto the 64-element array I.
     */
    template<
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][4],
              double (&I)[64]) {
        eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Runs the kernel over M points and adds the results onto I[0..63]
     * using the default update policy.
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][4],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel proper.  For each point a in [0, M) it evaluates 64 polynomial
     * terms from the recurrence coefficients and hands each one, scaled by
     * the matching C[i][j]*W[a], to update(term, I + slot).
     *
     * @param A, B      scalars fed to the recurrence coefficients
     * @param rAi, rBk  3-component inputs for the C* and D* coefficients
     * @param rAB       3-component input shared by both coefficient sets
     * @param rij       3-component offset; elements [0..2] are read
     * @param rkl       accepted for a uniform signature; not read here
     * @param t2, W     per-point values, indexed by a
     *                  (presumably quadrature roots and weights -- confirm)
     * @param C         2x4 table of scale factors
     * @param I         output base pointer, 64 slots are addressed
     * @param update    callable invoked as update(value, pointer)
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_,
        class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][4],
              double *I, const U &update) {

        const double &Xij = rij[0], &Yij = rij[1], &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int a = 0; a < M; ++a) {
            const double t = t2[a];
            // Scale factors for this point, one per entry of C.
            const double w00 = C[0][0]*W[a];
            const double w01 = C[0][1]*W[a];
            const double w02 = C[0][2]*W[a];
            const double w03 = C[0][3]*W[a];
            const double w10 = C[1][0]*W[a];
            const double w11 = C[1][1]*W[a];
            const double w12 = C[1][2]*W[a];
            const double w13 = C[1][3]*W[a];

            const double B00 = 0.5*t;
            const double B10 = recurrence::coefficient(1.0/A, B, t);

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

            const double Ix = Cx + Xij;
            const double Iy = Cy + Yij;
            const double Iz = Cz + Zij;
            const double Px = B10 + recurrence::pow<2>(Cx);
            const double Py = B10 + recurrence::pow<2>(Cy);
            const double Pz = B10 + recurrence::pow<2>(Cz);
            const double Qy = Cy*Dy + B00;

            // Sub-terms that recur in several slots.
            const double xx = Cx*Ix + B10;
            const double yy = Cy*Iy + B10;
            const double zz = B10 + Cz*Iz;

            update(w00*(1), I+0);   // constant term: the scale factor alone
            update(w01*Cx, I+1);
            update(w01*Cy, I+2);
            update(w01*Cz, I+3);
            update(w11*(Cy*Dx), I+4);
            update(w11*(Cz*Dx), I+5);
            update(w10*Dx, I+6);
            update(w11*(Cx*Dy), I+7);
            update(w11*(Cz*Dy), I+8);
            update(w10*Dy, I+9);
            update(w11*(Cx*Dz), I+10);
            update(w11*(Cy*Dz), I+11);
            update(w10*Dz, I+12);

            update(w13*(Dy*xx), I+13);
            update(w03*xx, I+14);
            update(w13*(Dz*xx), I+15);
            update(w12*(Dy*Ix), I+16);
            update(w13*(Cz*Dy*Ix), I+17);
            update(w03*(Cz*Ix), I+18);
            update(w03*(Cy*Ix), I+19);
            update(w13*(Cy*Dz*Ix), I+20);
            update(w12*(Dz*Ix), I+21);
            update(w02*Ix, I+22);

            update(w13*(Dz*yy), I+23);
            update(w13*(Dx*yy), I+24);
            update(w03*yy, I+25);
            update(w13*(Cx*Dz*Iy), I+26);
            update(w12*(Dx*Iy), I+27);
            update(w13*(Cz*Dx*Iy), I+28);
            update(w03*(Cz*Iy), I+29);
            update(w12*(Dz*Iy), I+30);
            update(w03*(Cx*Iy), I+31);
            update(w02*Iy, I+32);

            update(w13*(Dy*zz), I+33);
            update(w13*(Dx*zz), I+34);
            update(w03*zz, I+35);
            update(w12*(Dy*Iz), I+36);
            update(w12*(Dx*Iz), I+37);
            update(w13*(Cx*Dy*Iz), I+38);
            update(w03*(Cx*Iz), I+39);
            update(w03*(Cy*Iz), I+40);
            update(w13*(Cy*Dx*Iz), I+41);
            update(w02*Iz, I+42);

            update(w13*(Iz*Qy), I+43);
            update(w13*(Ix*Qy), I+44);
            update(w11*Qy, I+45);
            const double qy_shift = Dy*Yij + Qy;
            update(w13*(Cz*qy_shift), I+46);
            update(w13*(Cx*qy_shift), I+47);
            update(w12*qy_shift, I+48);

            const double czdz = Cz*Dz;
            const double qz_shift = B00 + Dz*Zij + czdz;
            const double qz = B00 + czdz;
            update(w13*(Dz*Pz + B00*(2*Cz + Zij) + czdz*Zij), I+49);
            update(w13*(Cy*qz_shift), I+50);
            update(w13*(Cx*qz_shift), I+51);
            update(w12*qz_shift, I+52);
            update(w13*(Iy*qz), I+53);
            update(w13*(Ix*qz), I+54);
            update(w11*qz, I+55);

            const double cxdx = Cx*Dx;
            const double qx_shift = B00 + cxdx + Dx*Xij;
            const double qx = B00 + cxdx;
            update(w13*(cxdx*Xij + Dx*Px + B00*(Xij + 2*Cx)), I+56);
            update(w13*(Cy*qx_shift), I+57);
            update(w13*(Cz*qx_shift), I+58);
            update(w12*qx_shift, I+59);
            update(w13*(Iz*qx), I+60);
            update(w11*qx, I+61);
            update(w13*(Iy*qx), I+62);

            const double cydy = Cy*Dy;
            update(w13*(B00*(Yij + 2*Cy) + cydy*Yij + Dy*Py), I+63);
        }
    }

    /**
     * Permutes I in place: the value found in slot k on entry is stored
     * at position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[64]) {
        double saved[64];
        for (int k = 0; k != 64; ++k) {
            saved[k] = I[k];
        }
        for (int k = 0; k != 64; ++k) {
            I[index(k)] = saved[k];
        }
    }

    /**
     * Final position of the value eval() writes to slot i.
     * @param i slot number; not range-checked, must lie in [0, 64)
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short order[] = {
             0,  1,  2,  3, 18, 19, 16, 33,
            35, 32, 49, 50, 48, 37,  5, 53,
            36, 39,  7,  6, 54, 52,  4, 58,
            26, 10, 57, 24, 27, 11, 56,  9,
             8, 47, 31, 15, 44, 28, 45, 13,
            14, 30, 12, 46, 38, 34, 43, 41,
            40, 63, 62, 61, 60, 59, 55, 51,
            21, 22, 23, 20, 29, 17, 25, 42
        };
        return order[i];
    }

    /** Writes index(0) .. index(63), in that order, to idx[0..63]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        for (int k = 0; k < 64; ++k) {
            *idx++ = index(k);
        }
    }

};

/**
 * Quadrature kernel specialization for
 * meta::braket<rysq::D, rysq::SP, rysq::SP, rysq::S>.
 *
 * eval() accumulates 96 values into I in the order the expressions were
 * generated; index() and reorder() describe the permutation that moves
 * each value from that evaluation order to its final slot.
 */
template<>
struct impl<meta::braket<rysq::D, rysq::SP, rysq::SP, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    /** Loop count used by the fixed-size eval() overload. */
    static const int N = 3;

    /** Default accumulator: adds the contribution q to *Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /** Fixed-size entry point: N loop iterations, default accumulator. */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2],
	      double (&I)[96]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Pointer entry point: M loop iterations, default accumulator. */
    template<int M,
	     class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel proper.  For every a in [0, M) the recurrence coefficients are
     * built from t2[a], and each of the 96 terms, scaled by W[a] and an
     * entry of C, is handed to update(term, I + k).
     * rkl is accepted for interface uniformity but not read here.
     */
    template<int M,
	     class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // coefficients that depend only on t2[a]
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // integer powers go through the compile-time recurrence::pow<>
#define pow(x,y) recurrence::pow<y>((x))

	    // shared one-dimensional intermediates
	    double Ix = (Cx + Xij);
	    double Iy = (Cy + Yij);
	    double Iz = (Cz + Zij);
	    double Px = (B10 + pow(Cx,2));
	    double Py = (B10 + pow(Cy,2));
	    double Pz = (B10 + pow(Cz,2));
	    double Qx = (Cx*Dx + B00);
	    double Qy = (Cy*Dy + B00);
	    double Qz = (Cz*Dz + B00);

	    // contributions, in evaluation order
	    update((C[0][0])*W[a]*(Cx*Cy), I+0);
	    update((C[0][0])*W[a]*(Cy*Cz), I+1);
	    update((C[0][0])*W[a]*(Cx*Cz), I+2);
	    update((C[1][0])*W[a]*(Cy*Cz*Dx), I+3);
	    update((C[1][0])*W[a]*(Cx*Cz*Dy), I+4);
	    update((C[1][0])*W[a]*(Cx*Cy*Dz), I+5);
	    update((C[1][1])*W[a]*(Cy*Dz*(Cx*Ix + B10)), I+6);
	    update((C[0][1])*W[a]*(Cy*Cz*Ix), I+7);
	    update((C[0][1])*W[a]*(Cz*(Cx*Ix + B10)), I+8);
	    update((C[1][1])*W[a]*(Cz*Dy*(Cx*Ix + B10)), I+9);
	    update((C[0][1])*W[a]*(Cy*(Cx*Ix + B10)), I+10);
	    update((C[0][1])*W[a]*(Cx*Cz*Iy), I+11);
	    update((C[1][1])*W[a]*(Cz*Dx*(Cy*Iy + B10)), I+12);
	    update((C[0][1])*W[a]*(Cz*(Cy*Iy + B10)), I+13);
	    update((C[1][1])*W[a]*(Cx*Dz*(Cy*Iy + B10)), I+14);
	    update((C[0][1])*W[a]*(Cx*(Cy*Iy + B10)), I+15);
	    update((C[1][1])*W[a]*(Cy*Dx*(B10 + Cz*Iz)), I+16);
	    update((C[0][1])*W[a]*(Cy*(B10 + Cz*Iz)), I+17);
	    update((C[1][1])*W[a]*(Cx*Dy*(B10 + Cz*Iz)), I+18);
	    update((C[0][1])*W[a]*(Cx*(B10 + Cz*Iz)), I+19);
	    update((C[0][1])*W[a]*(Cx*Cy*Iz), I+20);
	    update((C[1][1])*W[a]*(Iz*(Dx*Px + 2*B00*Cx)), I+21);
	    update((C[1][1])*W[a]*(Iy*(Dx*Px + 2*B00*Cx)), I+22);
	    update((C[1][1])*W[a]*(Dz*Iy*Px), I+23);
	    update((C[1][0])*W[a]*((Dx*Px + 2*B00*Cx)), I+24);
	    update((C[1][0])*W[a]*(Dz*Px), I+25);
	    update((C[1][0])*W[a]*(Dy*Px), I+26);
	    update((C[1][1])*W[a]*(Dy*Iz*Px), I+27);
	    update((C[0][1])*W[a]*(Iz*Px), I+28);
	    update((C[0][0])*W[a]*(Px), I+29);
	    update((C[0][1])*W[a]*(Iy*Px), I+30);
	    update((C[1][1])*W[a]*(Iz*(Dy*Py + 2*B00*Cy)), I+31);
	    update((C[1][1])*W[a]*(Ix*(Dy*Py + 2*B00*Cy)), I+32);
	    update((C[1][0])*W[a]*((Dy*Py + 2*B00*Cy)), I+33);
	    update((C[0][1])*W[a]*(Iz*Py), I+34);
	    update((C[1][1])*W[a]*(Dx*Iz*Py), I+35);
	    update((C[1][0])*W[a]*(Dx*Py), I+36);
	    update((C[1][0])*W[a]*(Dz*Py), I+37);
	    update((C[1][1])*W[a]*(Dz*Ix*Py), I+38);
	    update((C[0][1])*W[a]*(Ix*Py), I+39);
	    update((C[0][0])*W[a]*(Py), I+40);
	    update((C[1][1])*W[a]*(Iy*(Dz*Pz + 2*B00*Cz)), I+41);
	    update((C[1][1])*W[a]*(Ix*(Dz*Pz + 2*B00*Cz)), I+42);
	    update((C[1][0])*W[a]*((Dz*Pz + 2*B00*Cz)), I+43);
	    update((C[0][1])*W[a]*(Iy*Pz), I+44);
	    update((C[1][1])*W[a]*(Dx*Iy*Pz), I+45);
	    update((C[1][0])*W[a]*(Dx*Pz), I+46);
	    update((C[1][0])*W[a]*(Dy*Pz), I+47);
	    update((C[1][1])*W[a]*(Dy*Ix*Pz), I+48);
	    update((C[0][1])*W[a]*(Ix*Pz), I+49);
	    update((C[0][0])*W[a]*(Pz), I+50);
	    update((C[1][1])*W[a]*(Qx*(Cy*Iy + B10)), I+51);
	    update((C[1][1])*W[a]*(Cy*Iz*Qx), I+52);
	    update((C[1][1])*W[a]*(Qx*(B10 + Cz*Iz)), I+53);
	    update((C[1][1])*W[a]*(Cz*Iy*Qx), I+54);
	    update((C[1][0])*W[a]*(Cy*Qx), I+55);
	    update((C[1][0])*W[a]*(Cz*Qx), I+56);
	    update((C[1][1])*W[a]*(Qy*(Cx*Ix + B10)), I+57);
	    update((C[1][1])*W[a]*(Qy*(B10 + Cz*Iz)), I+58);
	    update((C[1][1])*W[a]*(Cz*Ix*Qy), I+59);
	    update((C[1][1])*W[a]*(Cx*Iz*Qy), I+60);
	    update((C[1][0])*W[a]*(Cx*Qy), I+61);
	    update((C[1][0])*W[a]*(Cz*Qy), I+62);
	    update((C[1][1])*W[a]*(Qz*(Cx*Ix + B10)), I+63);
	    update((C[1][1])*W[a]*(Qz*(Cy*Iy + B10)), I+64);
	    update((C[1][1])*W[a]*(Cy*Ix*Qz), I+65);
	    update((C[1][1])*W[a]*(Cx*Iy*Qz), I+66);
	    update((C[1][0])*W[a]*(Cx*Qz), I+67);
	    update((C[1][0])*W[a]*(Cy*Qz), I+68);
	    update((C[1][1])*W[a]*(Cz*(Dx*(Cx*Ix + B10) + B00*(Xij + 2*Cx))), I+69);
	    update((C[1][1])*W[a]*(Cy*(Dx*(Cx*Ix + B10) + B00*(Xij + 2*Cx))), I+70);
	    update((C[1][1])*W[a]*((Dx*(B10*(3*Cx + Xij) + Ix*pow(Cx,2)) + 2*B00*Cx*Xij + 3*B00*Px)), I+71);
	    update((C[1][1])*W[a]*(Dz*(B10*(3*Cx + Xij) + Ix*pow(Cx,2))), I+72);
	    update((C[1][1])*W[a]*(Dy*(B10*(3*Cx + Xij) + Ix*pow(Cx,2))), I+73);
	    update((C[0][1])*W[a]*((B10*(3*Cx + Xij) + Ix*pow(Cx,2))), I+74);
	    update((C[1][1])*W[a]*(Cz*(B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10))), I+75);
	    update((C[1][1])*W[a]*(Cx*(B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10))), I+76);
	    update((C[1][1])*W[a]*((2*B00*Cy*Yij + 3*B00*Py + Dy*(B10*(3*Cy + Yij) + Iy*pow(Cy,2)))), I+77);
	    update((C[1][1])*W[a]*(Dx*(B10*(3*Cy + Yij) + Iy*pow(Cy,2))), I+78);
	    update((C[0][1])*W[a]*((B10*(3*Cy + Yij) + Iy*pow(Cy,2))), I+79);
	    update((C[1][1])*W[a]*(Dz*(B10*(3*Cy + Yij) + Iy*pow(Cy,2))), I+80);
	    update((C[1][1])*W[a]*(Py*(Dz*Zij + Qz)), I+81);
	    update((C[1][1])*W[a]*(Cx*Cy*(Dz*Zij + Qz)), I+82);
	    update((C[1][1])*W[a]*(Px*(Dz*Zij + Qz)), I+83);
	    update((C[1][1])*W[a]*(Cx*(Dz*(B10 + Cz*Iz) + B00*(2*Cz + Zij))), I+84);
	    update((C[1][1])*W[a]*(Cy*(Dz*(B10 + Cz*Iz) + B00*(2*Cz + Zij))), I+85);
	    update((C[1][1])*W[a]*((Dz*(Iz*pow(Cz,2) + B10*(3*Cz + Zij)) + 3*B00*Pz + 2*B00*Cz*Zij)), I+86);
	    update((C[1][1])*W[a]*(Dx*(Iz*pow(Cz,2) + B10*(3*Cz + Zij))), I+87);
	    update((C[0][1])*W[a]*((Iz*pow(Cz,2) + B10*(3*Cz + Zij))), I+88);
	    update((C[1][1])*W[a]*(Dy*(Iz*pow(Cz,2) + B10*(3*Cz + Zij))), I+89);
	    double f10 = (Dy*Iy + B00);
	    update((C[1][1])*W[a]*(Px*f10), I+90);
	    update((C[1][1])*W[a]*(Pz*f10), I+91);
	    update((C[1][1])*W[a]*(Cx*Cz*f10), I+92);
	    double f15 = (Dx*Ix + B00);
	    update((C[1][1])*W[a]*(Cy*Cz*f15), I+93);
	    update((C[1][1])*W[a]*(Pz*f15), I+94);
	    update((C[1][1])*W[a]*(Py*f15), I+95);
#undef pow

	}

    }

    /**
     * Permutes I in place from evaluation order to storage order: the value
     * found at position k is moved to position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[96]) {
	double T[96];
	for (int k = 0; k < 96; ++k) {
	    T[k] = I[k];
	}
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int k = 0; k < 96; ++k) {
	    I[index(k)] = T[k];
	}
    }

    /**
     * Storage slot of the value computed at evaluation position i.
     * No range check is performed; i must lie in [0, 96).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[] = {
	    3, 5, 4, 29, 52, 75, 81, 11, 10, 58, 9, 16, 41, 17, 87, 15,
	    47, 23, 70, 22, 21, 42, 36, 84, 24, 72, 48, 66, 18, 0, 12, 67,
	    55, 49, 19, 43, 25, 73, 79, 7, 1, 86, 80, 74, 14, 38, 26, 50,
	    56, 8, 2, 39, 45, 46, 40, 27, 28, 57, 71, 59, 69, 51, 53, 82,
	    89, 83, 88, 76, 77, 34, 33, 30, 78, 54, 6, 65, 63, 61, 37, 13,
	    85, 91, 93, 90, 94, 95, 92, 44, 20, 68, 60, 62, 64, 35, 32, 31
	};
	return slot[i];
    }

    /** Writes the full 96-entry permutation to idx[0..95]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 96; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Quadrature kernel specialization for
 * meta::braket<rysq::S, rysq::S, rysq::S, rysq::D>.
 *
 * eval() accumulates 6 values into I in generation order; index() and
 * reorder() describe the permutation to the final slot of each value.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::S, rysq::D> > {
    typedef void enable;
    static const bool value = true;
    /** Loop count used by the fixed-size eval() overload. */
    static const int N = 2;

    /** Default accumulator: adds the contribution q to *Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /** Fixed-size entry point: N loop iterations, default accumulator. */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[6]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Pointer entry point: M loop iterations, default accumulator. */
    template<int M,
	     class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel proper.  For every a in [0, M) the six terms, scaled by
     * C[0][0] and W[a], are handed to update(term, I + k).
     * rAi and rij are accepted for interface uniformity but not read here.
     */
    template<int M,
	     class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // coefficients that depend only on t2[a]
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // integer powers go through the compile-time recurrence::pow<>
#define pow(x,y) recurrence::pow<y>((x))

	    double Kx = (Xkl + Dx);
	    double Ky = (Ykl + Dy);
	    double Kz = (Dz + Zkl);

	    // contributions, in evaluation order
	    update((C[0][0])*W[a]*((pow(Kx,2) + B01)), I+0);
	    update((C[0][0])*W[a]*((pow(Ky,2) + B01)), I+1);
	    update((C[0][0])*W[a]*(Kx*Ky), I+2);
	    update((C[0][0])*W[a]*((pow(Kz,2) + B01)), I+3);
	    update((C[0][0])*W[a]*(Kx*Kz), I+4);
	    update((C[0][0])*W[a]*(Ky*Kz), I+5);
#undef pow

	}

    }

    /**
     * Permutes I in place from evaluation order to storage order: the value
     * found at position k is moved to position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[6]) {
	double T[6];
	for (int k = 0; k < 6; ++k) {
	    T[k] = I[k];
	}
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int k = 0; k < 6; ++k) {
	    I[index(k)] = T[k];
	}
    }

    /**
     * Storage slot of the value computed at evaluation position i.
     * No range check is performed; i must lie in [0, 6).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[] = {
	    0, 1, 3, 2, 4, 5
	};
	return slot[i];
    }

    /** Writes the full 6-entry permutation to idx[0..5]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 6; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Quadrature kernel specialization for
 * meta::braket<rysq::P, rysq::S, rysq::S, rysq::D>.
 *
 * eval() accumulates 18 values into I in generation order; index() and
 * reorder() describe the permutation to the final slot of each value.
 */
template<>
struct impl<meta::braket<rysq::P, rysq::S, rysq::S, rysq::D> > {
    typedef void enable;
    static const bool value = true;
    /** Loop count used by the fixed-size eval() overload. */
    static const int N = 2;

    /** Default accumulator: adds the contribution q to *Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /** Fixed-size entry point: N loop iterations, default accumulator. */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[18]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Pointer entry point: M loop iterations, default accumulator. */
    template<int M,
	     class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel proper.  For every a in [0, M) the 18 terms, scaled by
     * C[0][0] and W[a], are handed to update(term, I + k).
     * rij is accepted for interface uniformity but not read here.
     */
    template<int M,
	     class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // coefficients that depend only on t2[a]
	    double B00 = 0.5*t2[a];
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // integer powers go through the compile-time recurrence::pow<>
#define pow(x,y) recurrence::pow<y>((x))

	    double Kx = (Xkl + Dx);
	    double Ky = (Ykl + Dy);
	    double Kz = (Dz + Zkl);

	    // contributions, in evaluation order
	    update((C[0][0])*W[a]*((2*B00*Kx + Cx*(pow(Kx,2) + B01))), I+0);
	    update((C[0][0])*W[a]*((2*B00*Ky + Cy*(pow(Ky,2) + B01))), I+1);
	    update((C[0][0])*W[a]*(Cz*Kx*Ky), I+2);
	    update((C[0][0])*W[a]*((2*B00*Kz + Cz*(pow(Kz,2) + B01))), I+3);
	    update((C[0][0])*W[a]*(Cx*Ky*Kz), I+4);
	    update((C[0][0])*W[a]*(Cy*Kx*Kz), I+5);
	    double f2 = (pow(Kz,2) + B01);
	    update((C[0][0])*W[a]*(Cx*f2), I+6);
	    update((C[0][0])*W[a]*(Cy*f2), I+7);
	    double f4 = (pow(Kx,2) + B01);
	    update((C[0][0])*W[a]*(Cy*f4), I+8);
	    update((C[0][0])*W[a]*(Cz*f4), I+9);
	    double f5 = (B00 + Cy*Ky);
	    update((C[0][0])*W[a]*(Kx*f5), I+10);
	    update((C[0][0])*W[a]*(Kz*f5), I+11);
	    double f6 = (B00 + Cx*Kx);
	    update((C[0][0])*W[a]*(Kz*f6), I+12);
	    update((C[0][0])*W[a]*(Ky*f6), I+13);
	    double f7 = (pow(Ky,2) + B01);
	    update((C[0][0])*W[a]*(Cz*f7), I+14);
	    update((C[0][0])*W[a]*(Cx*f7), I+15);
	    double f8 = (B00 + Cz*Kz);
	    update((C[0][0])*W[a]*(Kx*f8), I+16);
	    update((C[0][0])*W[a]*(Ky*f8), I+17);
#undef pow

	}

    }

    /**
     * Permutes I in place from evaluation order to storage order: the value
     * found at position k is moved to position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[18]) {
	double T[18];
	for (int k = 0; k < 18; ++k) {
	    T[k] = I[k];
	}
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int k = 0; k < 18; ++k) {
	    I[index(k)] = T[k];
	}
    }

    /**
     * Storage slot of the value computed at evaluation position i.
     * No range check is performed; i must lie in [0, 18).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[] = {
	    0, 4, 11, 8, 15, 13, 6, 7, 1,
	    2, 10, 16, 12, 9, 5, 3, 14, 17
	};
	return slot[i];
    }

    /** Writes the full 18-entry permutation to idx[0..17]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 18; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Quadrature kernel specialization for
 * meta::braket<rysq::SP, rysq::S, rysq::D, rysq::S>.
 *
 * eval() accumulates 24 values into I in generation order; index() and
 * reorder() describe the permutation to the final slot of each value.
 */
template<>
struct impl<meta::braket<rysq::SP, rysq::S, rysq::D, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    /** Loop count used by the fixed-size eval() overload. */
    static const int N = 2;

    /** Default accumulator: adds the contribution q to *Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /** Fixed-size entry point: N loop iterations, default accumulator. */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double (&I)[24]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Pointer entry point: M loop iterations, default accumulator. */
    template<int M,
	     class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel proper.  For every a in [0, M) the 24 terms, scaled by W[a]
     * and either C[0][0] or C[0][1], are handed to update(term, I + k).
     * rij and rkl are accepted for interface uniformity but not read here.
     */
    template<int M,
	     class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // coefficients that depend only on t2[a]
	    double B00 = 0.5*t2[a];
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // integer powers go through the compile-time recurrence::pow<>
#define pow(x,y) recurrence::pow<y>((x))

	    // shared one-dimensional intermediates
	    double Qx = (Cx*Dx + B00);
	    double Qy = (Cy*Dy + B00);
	    double Qz = (Cz*Dz + B00);
	    double Rx = (B01 + pow(Dx,2));
	    double Ry = (B01 + pow(Dy,2));
	    double Rz = (pow(Dz,2) + B01);

	    // contributions, in evaluation order
	    update((C[0][0])*W[a]*(Dx*Dy), I+0);
	    update((C[0][1])*W[a]*(Cz*Dx*Dy), I+1);
	    update((C[0][1])*W[a]*(Cy*Dx*Dz), I+2);
	    update((C[0][0])*W[a]*(Dx*Dz), I+3);
	    update((C[0][1])*W[a]*(Cx*Dy*Dz), I+4);
	    update((C[0][0])*W[a]*(Dy*Dz), I+5);
	    update((C[0][1])*W[a]*(Dz*Qx), I+6);
	    update((C[0][1])*W[a]*(Dy*Qx), I+7);
	    update((C[0][1])*W[a]*(Dx*Qy), I+8);
	    update((C[0][1])*W[a]*(Dz*Qy), I+9);
	    update((C[0][1])*W[a]*(Dx*Qz), I+10);
	    update((C[0][1])*W[a]*(Dy*Qz), I+11);
	    update((C[0][1])*W[a]*((2*B00*Dx + Cx*Rx)), I+12);
	    update((C[0][1])*W[a]*(Cz*Rx), I+13);
	    update((C[0][1])*W[a]*(Cy*Rx), I+14);
	    update((C[0][0])*W[a]*(Rx), I+15);
	    update((C[0][1])*W[a]*((2*B00*Dy + Cy*Ry)), I+16);
	    update((C[0][1])*W[a]*(Cz*Ry), I+17);
	    update((C[0][1])*W[a]*(Cx*Ry), I+18);
	    update((C[0][0])*W[a]*(Ry), I+19);
	    update((C[0][1])*W[a]*((2*B00*Dz + Cz*Rz)), I+20);
	    update((C[0][1])*W[a]*(Cy*Rz), I+21);
	    update((C[0][1])*W[a]*(Cx*Rz), I+22);
	    update((C[0][0])*W[a]*(Rz), I+23);
#undef pow

	}

    }

    /**
     * Permutes I in place from evaluation order to storage order: the value
     * found at position k is moved to position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[24]) {
	double T[24];
	for (int k = 0; k < 24; ++k) {
	    T[k] = I[k];
	}
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int k = 0; k < 24; ++k) {
	    I[index(k)] = T[k];
	}
    }

    /**
     * Storage slot of the value computed at evaluation position i.
     * No range check is performed; i must lie in [0, 24).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[] = {
	    12, 15, 18, 16, 21, 20, 17, 13, 14, 22, 19, 23,
	    1, 3, 2, 0, 6, 7, 5, 4, 11, 10, 9, 8
	};
	return slot[i];
    }

    /** Writes the full 24-entry permutation to idx[0..23]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 24; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Quadrature kernel specialization for
 * meta::braket<rysq::P, rysq::P, rysq::S, rysq::P>.
 *
 * eval() accumulates 27 values into I in generation order; index() and
 * reorder() describe the permutation to the final slot of each value.
 */
template<>
struct impl<meta::braket<rysq::P, rysq::P, rysq::S, rysq::P> > {
    typedef void enable;
    static const bool value = true;
    /** Loop count used by the fixed-size eval() overload. */
    static const int N = 2;

    /** Default accumulator: adds the contribution q to *Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /** Fixed-size entry point: N loop iterations, default accumulator. */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[27]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Pointer entry point: M loop iterations, default accumulator. */
    template<int M,
	     class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel proper.  For every a in [0, M) the 27 terms, scaled by
     * C[0][0] and W[a], are handed to update(term, I + k).
     */
    template<int M,
	     class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // coefficients that depend only on t2[a]
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // integer powers go through the compile-time recurrence::pow<>
#define pow(x,y) recurrence::pow<y>((x))

	    // shared one-dimensional intermediates
	    double Ix = (Cx + Xij);
	    double Iy = (Cy + Yij);
	    double Iz = (Cz + Zij);
	    double Kx = (Xkl + Dx);
	    double Ky = (Ykl + Dy);
	    double Kz = (Dz + Zkl);

	    // contributions, in evaluation order
	    update((C[0][0])*W[a]*(Cy*Iz*Kx), I+0);
	    update((C[0][0])*W[a]*(Cz*Iy*Kx), I+1);
	    update((C[0][0])*W[a]*(Cz*Ix*Ky), I+2);
	    update((C[0][0])*W[a]*(Cx*Iz*Ky), I+3);
	    update((C[0][0])*W[a]*(Cy*Ix*Kz), I+4);
	    update((C[0][0])*W[a]*(Cx*Iy*Kz), I+5);
	    update((C[0][0])*W[a]*((Kx*(Cx*Ix + B10) + B00*(Xij + 2*Cx))), I+6);
	    update((C[0][0])*W[a]*((B00*(Yij + 2*Cy) + Ky*(Cy*Iy + B10))), I+7);
	    update((C[0][0])*W[a]*((Kz*(B10 + Cz*Iz) + B00*(2*Cz + Zij))), I+8);
	    double f10 = (B00 + Cx*Kx);
	    update((C[0][0])*W[a]*(Iz*f10), I+9);
	    update((C[0][0])*W[a]*(Iy*f10), I+10);
	    double f11 = (B00 + Cy*Ky);
	    update((C[0][0])*W[a]*(Ix*f11), I+11);
	    update((C[0][0])*W[a]*(Iz*f11), I+12);
	    double f12 = (Cx*Ix + B10);
	    update((C[0][0])*W[a]*(Ky*f12), I+13);
	    update((C[0][0])*W[a]*(Kz*f12), I+14);
	    double f13 = (B10 + Cz*Iz);
	    update((C[0][0])*W[a]*(Kx*f13), I+15);
	    update((C[0][0])*W[a]*(Ky*f13), I+16);
	    double f14 = (B00 + Cz*Kz);
	    update((C[0][0])*W[a]*(Ix*f14), I+17);
	    update((C[0][0])*W[a]*(Iy*f14), I+18);
	    double f3 = (B00 + Ix*Kx);
	    update((C[0][0])*W[a]*(Cy*f3), I+19);
	    update((C[0][0])*W[a]*(Cz*f3), I+20);
	    double f4 = (Iz*Kz + B00);
	    update((C[0][0])*W[a]*(Cx*f4), I+21);
	    update((C[0][0])*W[a]*(Cy*f4), I+22);
	    double f6 = (Cy*Iy + B10);
	    update((C[0][0])*W[a]*(Kx*f6), I+23);
	    update((C[0][0])*W[a]*(Kz*f6), I+24);
	    double f8 = (B00 + Iy*Ky);
	    update((C[0][0])*W[a]*(Cz*f8), I+25);
	    update((C[0][0])*W[a]*(Cx*f8), I+26);
#undef pow

	}

    }

    /**
     * Permutes I in place from evaluation order to storage order: the value
     * found at position k is moved to position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[27]) {
	double T[27];
	for (int k = 0; k < 27; ++k) {
	    T[k] = I[k];
	}
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int k = 0; k < 27; ++k) {
	    I[index(k)] = T[k];
	}
    }

    /**
     * Storage slot of the value computed at evaluation position i.
     * No range check is performed; i must lie in [0, 27).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[] = {
	    7, 5, 11, 15, 19, 21, 0, 13, 26,
	    6, 3, 10, 16, 9, 18, 8, 17, 20,
	    23, 1, 2, 24, 25, 4, 22, 14, 12
	};
	return slot[i];
    }

    /** Writes the full 27-entry permutation to idx[0..26]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 27; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Quadrature kernel specialization for the braket <SP, P | P, P>.
 *
 * eval() accumulates 108 values into the output buffer; reorder() and the
 * two index() overloads all encode the same fixed permutation of those 108
 * slots.
 *
 * @warning Automatically generated: the slot numbers used by eval() and the
 * three permutation tables must stay in step with each other.
 */
template<>
struct impl<meta::braket<rysq::SP, rysq::P, rysq::P, rysq::P> > {
    typedef void enable;
    static const bool value = true; 
    // Loop count used by the array-reference eval() overload (it calls eval<3>).
    // NOTE(review): presumably the number of quadrature roots -- confirm.
    static const int N = 3;

    /** Default accumulator: adds the contribution @a q to the value at @a Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Convenience overload for a fixed-size output array of 108 doubles.
     * Forwards to the general overload with M = 3 loop iterations and the
     * default accumulator.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<3> &t2, const vector<3> &W,
    // 		     const double (&C)[1][2],
	      double (&I)[108]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Overload for a raw output pointer with a caller-chosen iteration
     * count @a M; uses the default accumulator.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * General kernel.  For each a in [0, M) it derives the recurrence
     * coefficients from t2[a] and passes 108 weighted terms to @a update,
     * one per slot I+0 .. I+107.
     *
     * @tparam M      number of loop iterations (entries of t2 and W read)
     * @param A, B    scalars entering the recurrence coefficients
     * @param rAi, rBk, rAB  indexable with [0..2]; inputs to the per-axis
     *                coefficients C* and D*
     * @param rij, rkl       indexable with [0..2]; read as Xij..Zij, Xkl..Zkl
     * @param t2, W   indexable with [0..M-1]
     *                (NOTE(review): presumably squared roots and weights --
     *                confirm against the caller)
     * @param C       prefactors; each term is scaled by C[0][0] or C[0][1]
     * @param I       output buffer, at least 108 doubles
     * @param update  callable invoked as update(value, slot)
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	// Components of rij and rkl, named per axis.
	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Scalar coefficients: B00 = t2/2; coefficient(A1, B, t2) evaluates
	    // 0.5*A1*(1 - B*t2).
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // Per-axis coefficients: coefficient<q>(r, B, rAB, t2) evaluates
	    // r[q] - B*rAB[q]*t2.
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Same form with rBk and -A.
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

// Local helper macro mapping pow(x, n) onto the compile-time
// recurrence::pow<n>; used below for pow(B00,2) and removed again after
// the last term.
#define pow(x,y) recurrence::pow<y>((x))

	    // Intermediates shared by many of the terms below.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    // The 108 terms.  Each is prefactor * W[a] * (polynomial in the
	    // intermediates); the f* temporaries hold subexpressions reused by
	    // the terms that immediately follow their definition.  Slot numbers
	    // follow generator order; reorder()/index() give the permutation.
	    update((C[0][0])*W[a]*(Dy*Iz*Kx), I+0);
	    update((C[0][0])*W[a]*(Dz*Iy*Kx), I+1);
	    update((C[0][0])*W[a]*(Dz*Ix*Ky), I+2);
	    update((C[0][0])*W[a]*(Dx*Iz*Ky), I+3);
	    update((C[0][0])*W[a]*(Dy*Ix*Kz), I+4);
	    update((C[0][0])*W[a]*(Dx*Iy*Kz), I+5);
	    update((C[0][1])*W[a]*(Iy*Kz*Qx), I+6);
	    update((C[0][1])*W[a]*(Iz*Ky*Qx), I+7);
	    update((C[0][1])*W[a]*(Ix*Kz*Qy), I+8);
	    update((C[0][1])*W[a]*(Iz*Kx*Qy), I+9);
	    update((C[0][1])*W[a]*(Ix*Ky*Qz), I+10);
	    update((C[0][1])*W[a]*(Iy*Kx*Qz), I+11);
	    update((C[0][1])*W[a]*(Cy*Kz*(Dx*Xij + Qx)), I+12);
	    update((C[0][0])*W[a]*(Kz*(Dx*Xij + Qx)), I+13);
	    update((C[0][1])*W[a]*(Cz*Ky*(Dx*Xij + Qx)), I+14);
	    update((C[0][0])*W[a]*(Ky*(Dx*Xij + Qx)), I+15);
	    update((C[0][1])*W[a]*(Dy*Ix*(Cz*Zkl + Qz)), I+16);
	    update((C[0][1])*W[a]*((Cz*Zkl + Qz)*(Dx*Xij + Qx)), I+17);
	    update((C[0][1])*W[a]*(Dx*Iy*(Cz*Zkl + Qz)), I+18);
	    double f11 = (B00 + Iy*Ky);
	    update((C[0][1])*W[a]*(Cx*Dz*f11), I+19);
	    update((C[0][0])*W[a]*(Dz*f11), I+20);
	    update((C[0][1])*W[a]*(Qx*f11), I+21);
	    update((C[0][1])*W[a]*(Cz*Dx*f11), I+22);
	    update((C[0][0])*W[a]*(Dx*f11), I+23);
	    update((C[0][1])*W[a]*(Qz*f11), I+24);
	    double f12 = (B00 + Cy*Ky);
	    update((C[0][1])*W[a]*(Dz*Ix*f12), I+25);
	    update((C[0][1])*W[a]*(Dx*Iz*f12), I+26);
	    update((C[0][1])*W[a]*(f12*(Dx*Xij + Qx)), I+27);
	    double f13 = (Dy*Iy + B00);
	    update((C[0][1])*W[a]*(f13*(Cz*Zkl + Qz)), I+28);
	    update((C[0][1])*W[a]*(Cx*Kz*f13), I+29);
	    update((C[0][1])*W[a]*(Cz*Kx*f13), I+30);
	    update((C[0][0])*W[a]*(Kx*f13), I+31);
	    update((C[0][0])*W[a]*(Kz*f13), I+32);
	    double f14 = (Dz*(B10 + Cz*Iz) + B00*(2*Cz + Zij));
	    update((C[0][1])*W[a]*(Dy*(f14 + Zkl*(B10 + Cz*Iz))), I+33);
	    update((C[0][1])*W[a]*(Dx*(f14 + Zkl*(B10 + Cz*Iz))), I+34);
	    update((C[0][1])*W[a]*(Kx*f14), I+35);
	    update((C[0][1])*W[a]*(Ky*f14), I+36);
	    double f15 = (Cx*Ix + B10);
	    update((C[0][1])*W[a]*(Dy*Kz*f15), I+37);
	    update((C[0][1])*W[a]*(Dz*Ky*f15), I+38);
	    double f16 = (B00*(2*Dz + Zkl) + Iz*(Dz*Kz + B01));
	    update((C[0][1])*W[a]*(Cx*f16), I+39);
	    update((C[0][1])*W[a]*(Cy*f16), I+40);
	    update((C[0][0])*W[a]*(f16), I+41);
	    double f18 = (Kx*(Cx*Ix + B10) + B00*(Xij + 2*Cx));
	    update((C[0][1])*W[a]*(Dy*f18), I+42);
	    update((C[0][1])*W[a]*(Dz*f18), I+43);
	    double f2 = (Dz*Kz + B01);
	    update((C[0][1])*W[a]*(Cy*Ix*f2), I+44);
	    update((C[0][1])*W[a]*(f15*f2), I+45);
	    update((C[0][0])*W[a]*(Ix*f2), I+46);
	    update((C[0][1])*W[a]*(Cx*Iy*f2), I+47);
	    update((C[0][0])*W[a]*(Iy*f2), I+48);
	    double f20 = (Dx*(Cx*Ix + B10) + B00*(Xij + 2*Cx));
	    update((C[0][1])*W[a]*(Ky*f20), I+49);
	    update((C[0][1])*W[a]*(Kz*f20), I+50);
	    double f21 = (B01 + Dx*Kx);
	    update((C[0][1])*W[a]*(Cy*Iz*f21), I+51);
	    update((C[0][0])*W[a]*(Iz*f21), I+52);
	    update((C[0][1])*W[a]*(Cz*Iy*f21), I+53);
	    update((C[0][0])*W[a]*(Iy*f21), I+54);
	    double f22 = (Cy*Iy + B10);
	    update((C[0][1])*W[a]*(Dx*Kz*f22), I+55);
	    update((C[0][1])*W[a]*(Dz*Kx*f22), I+56);
	    update((C[0][1])*W[a]*(f2*f22), I+57);
	    update((C[0][1])*W[a]*(f21*f22), I+58);
	    double f23 = (B00*(Yij + 2*Cy) + Ky*(Cy*Iy + B10));
	    update((C[0][1])*W[a]*(Dx*f23), I+59);
	    update((C[0][1])*W[a]*(Dz*f23), I+60);
	    double f24 = (B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10));
	    update((C[0][1])*W[a]*(Kx*f24), I+61);
	    update((C[0][1])*W[a]*(Kz*f24), I+62);
	    double f26 = (B01 + Dy*Ky);
	    update((C[0][1])*W[a]*(Cx*Iz*f26), I+63);
	    update((C[0][0])*W[a]*(Iz*f26), I+64);
	    update((C[0][1])*W[a]*(f15*f26), I+65);
	    update((C[0][1])*W[a]*(Cz*Ix*f26), I+66);
	    update((C[0][0])*W[a]*(Ix*f26), I+67);
	    double f28 = (Cy*Dy*Ky + B01*Cy + B00*(Ykl + 2*Dy));
	    update((C[0][1])*W[a]*(Ix*f28), I+68);
	    update((C[0][1])*W[a]*(Iz*f28), I+69);
	    double f29 = (B01*Cx + Cx*Dx*Kx + B00*(Xkl + 2*Dx));
	    update((C[0][1])*W[a]*(Cz*(f29 + Xij*(B01 + Dx*Kx))), I+70);
	    update((C[0][0])*W[a]*((f29 + Xij*(B01 + Dx*Kx))), I+71);
	    update((C[0][1])*W[a]*(Cy*(f29 + Xij*(B01 + Dx*Kx))), I+72);
	    update((C[0][1])*W[a]*(Iz*f29), I+73);
	    update((C[0][1])*W[a]*(Iy*f29), I+74);
	    double f3 = (B00 + Cx*Kx);
	    update((C[0][1])*W[a]*(Cy*Dz*(Kx*Xij + f3)), I+75);
	    update((C[0][0])*W[a]*(Dz*(Kx*Xij + f3)), I+76);
	    update((C[0][1])*W[a]*(Qy*(Kx*Xij + f3)), I+77);
	    update((C[0][1])*W[a]*(Cz*Dy*(Kx*Xij + f3)), I+78);
	    update((C[0][0])*W[a]*(Dy*(Kx*Xij + f3)), I+79);
	    update((C[0][1])*W[a]*(Qz*(Kx*Xij + f3)), I+80);
	    update((C[0][1])*W[a]*(Dy*Iz*f3), I+81);
	    update((C[0][1])*W[a]*(Dz*Iy*f3), I+82);
	    update((C[0][1])*W[a]*(f13*f3), I+83);
	    double f30 = (B01*Cz + B00*(2*Dz + Zkl) + Cz*Dz*Kz);
	    update((C[0][1])*W[a]*(Ix*f30), I+84);
	    update((C[0][1])*W[a]*(Iy*f30), I+85);
	    double f1 = B01*B10;
	    double f31 = 2*pow(B00,2);
	    update((C[0][1])*W[a]*((B00*(Xij + 2*Cx)*(Xkl + 2*Dx) + f31 + B01*Cx*Ix + f1 + Dx*Kx*(Cx*Ix + B10))), I+86);
	    update((C[0][1])*W[a]*((f31 + Dy*Ky*(Cy*Iy + B10) + B01*Cy*Iy + B00*(Yij + 2*Cy)*(Ykl + 2*Dy) + f1)), I+87);
	    update((C[0][1])*W[a]*((Dz*Kz*(B10 + Cz*Iz) + f31 + B00*(2*Cz + Zij)*(2*Dz + Zkl) + f1 + B01*Cz*Iz)), I+88);
	    double f36 = (B10 + Cz*Iz);
	    update((C[0][1])*W[a]*(Dx*Ky*f36), I+89);
	    update((C[0][1])*W[a]*(Dy*Kx*f36), I+90);
	    update((C[0][1])*W[a]*(f21*f36), I+91);
	    update((C[0][1])*W[a]*(f26*f36), I+92);
	    double f4 = (B00*(Ykl + 2*Dy) + Iy*(B01 + Dy*Ky));
	    update((C[0][1])*W[a]*(Cz*f4), I+93);
	    update((C[0][1])*W[a]*(Cx*f4), I+94);
	    update((C[0][0])*W[a]*(f4), I+95);
	    double f7 = (B00 + Dz*Iz);
	    update((C[0][1])*W[a]*(Cy*Dx*(Iz*Zkl + f7)), I+96);
	    update((C[0][1])*W[a]*(Qx*(Iz*Zkl + f7)), I+97);
	    update((C[0][1])*W[a]*(Cx*Dy*(Iz*Zkl + f7)), I+98);
	    update((C[0][0])*W[a]*(Dy*(Iz*Zkl + f7)), I+99);
	    update((C[0][1])*W[a]*(Qy*(Iz*Zkl + f7)), I+100);
	    update((C[0][0])*W[a]*(Dx*(Iz*Zkl + f7)), I+101);
	    update((C[0][1])*W[a]*(Cy*Kx*f7), I+102);
	    update((C[0][0])*W[a]*(Kx*f7), I+103);
	    update((C[0][1])*W[a]*(Cx*Ky*f7), I+104);
	    update((C[0][0])*W[a]*(Ky*f7), I+105);
	    update((C[0][1])*W[a]*(f12*f7), I+106);
	    update((C[0][1])*W[a]*(f3*f7), I+107);
#undef pow

	}

    }

    /**
     * Permutes the 108 entries of @a I in place: the entry found at
     * position s on entry is stored at position index(s).  A full copy of
     * the input is taken first so every assignment reads original data.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[108]) {
	double T[108];
	for (int i = 0; i < 108; ++i) {
	    T[i] = I[i];
	}
	I[20] = T[0];
	I[28] = T[1];
	I[60] = T[2];
	I[44] = T[3];
	I[84] = T[4];
	I[76] = T[5];
	I[77] = T[6];
	I[45] = T[7];
	I[86] = T[8];
	I[22] = T[9];
	I[63] = T[10];
	I[31] = T[11];
	I[74] = T[12];
	I[72] = T[13];
	I[39] = T[14];
	I[36] = T[15];
	I[87] = T[16];
	I[75] = T[17];
	I[79] = T[18];
	I[65] = T[19];
	I[64] = T[20];
	I[41] = T[21];
	I[43] = T[22];
	I[40] = T[23];
	I[67] = T[24];
	I[62] = T[25];
	I[46] = T[26];
	I[38] = T[27];
	I[91] = T[28];
	I[89] = T[29];
	I[19] = T[30];
	I[16] = T[31];
	I[88] = T[32];
	I[95] = T[33];
	I[83] = T[34];
	I[35] = T[35];
	I[71] = T[36];
	I[85] = T[37];
	I[61] = T[38];
	I[105] = T[39];
	I[106] = T[40];
	I[104] = T[41];
	I[13] = T[42];
	I[25] = T[43];
	I[98] = T[44];
	I[97] = T[45];
	I[96] = T[46];
	I[101] = T[47];
	I[100] = T[48];
	I[37] = T[49];
	I[73] = T[50];
	I[10] = T[51];
	I[8] = T[52];
	I[7] = T[53];
	I[4] = T[54];
	I[78] = T[55];
	I[30] = T[56];
	I[102] = T[57];
	I[6] = T[58];
	I[42] = T[59];
	I[66] = T[60];
	I[18] = T[61];
	I[90] = T[62];
	I[57] = T[63];
	I[56] = T[64];
	I[49] = T[65];
	I[51] = T[66];
	I[48] = T[67];
	I[50] = T[68];
	I[58] = T[69];
	I[3] = T[70];
	I[0] = T[71];
	I[2] = T[72];
	I[9] = T[73];
	I[5] = T[74];
	I[26] = T[75];
	I[24] = T[76];
	I[14] = T[77];
	I[15] = T[78];
	I[12] = T[79];
	I[27] = T[80];
	I[21] = T[81];
	I[29] = T[82];
	I[17] = T[83];
	I[99] = T[84];
	I[103] = T[85];
	I[1] = T[86];
	I[54] = T[87];
	I[107] = T[88];
	I[47] = T[89];
	I[23] = T[90];
	I[11] = T[91];
	I[59] = T[92];
	I[55] = T[93];
	I[53] = T[94];
	I[52] = T[95];
	I[82] = T[96];
	I[81] = T[97];
	I[93] = T[98];
	I[92] = T[99];
	I[94] = T[100];
	I[80] = T[101];
	I[34] = T[102];
	I[32] = T[103];
	I[69] = T[104];
	I[68] = T[105];
	I[70] = T[106];
	I[33] = T[107];
    }

// Disabled variant kept by the generator: index_ is the inverse of the
// table used by index(int) below (destination slot -> source slot).
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[108] = { 71, 86, 72, 70, 54, 74, 58, 53, 52, 73, 51, 91, 79, 42, 77, 78, 31, 83, 61, 30, 0, 81, 9, 90, 76, 43, 75, 80, 1, 82, 56, 11, 103, 107, 102, 35, 15, 49, 27, 14, 23, 21, 59, 22, 3, 7, 26, 89, 67, 65, 68, 66, 95, 94, 87, 93, 64, 63, 69, 92, 2, 38, 25, 10, 20, 19, 60, 24, 105, 104, 106, 36, 13, 50, 12, 17, 5, 6, 55, 18, 101, 97, 96, 34, 4, 37, 8, 16, 32, 29, 62, 28, 99, 98, 100, 33, 46, 45, 44, 84, 48, 47, 57, 85, 41, 39, 40, 88 };
// 	if (index < 108) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the destination slot that reorder() assigns to source slot
     * @a i.  No range check is performed; @a i must lie in [0, 108).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    20, 28, 60, 44, 84, 76, 77, 45, 86, 22, 63, 31, 74, 72, 39, 36, 87, 75, 79, 65, 64, 41, 43, 40, 67, 62, 46, 38, 91, 89, 19, 16, 88, 95, 83, 35, 71, 85, 61, 105, 106, 104, 13, 25, 98, 97, 96, 101, 100, 37, 73, 10, 8, 7, 4, 78, 30, 102, 6, 42, 66, 18, 90, 57, 56, 49, 51, 48, 50, 58, 3, 0, 2, 9, 5, 26, 24, 14, 15, 12, 27, 21, 29, 17, 99, 103, 1, 54, 107, 47, 23, 11, 59, 55, 53, 52, 82, 81, 93, 92, 94, 80, 34, 32, 69, 68, 70, 33
	};
	return index[i];
    }

    /**
     * Writes the whole 108-entry permutation table to @a idx, in the same
     * order as the table in index(int); 108 consecutive elements are written.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 20;
	*idx++ = 28;
	*idx++ = 60;
	*idx++ = 44;
	*idx++ = 84;
	*idx++ = 76;
	*idx++ = 77;
	*idx++ = 45;
	*idx++ = 86;
	*idx++ = 22;
	*idx++ = 63;
	*idx++ = 31;
	*idx++ = 74;
	*idx++ = 72;
	*idx++ = 39;
	*idx++ = 36;
	*idx++ = 87;
	*idx++ = 75;
	*idx++ = 79;
	*idx++ = 65;
	*idx++ = 64;
	*idx++ = 41;
	*idx++ = 43;
	*idx++ = 40;
	*idx++ = 67;
	*idx++ = 62;
	*idx++ = 46;
	*idx++ = 38;
	*idx++ = 91;
	*idx++ = 89;
	*idx++ = 19;
	*idx++ = 16;
	*idx++ = 88;
	*idx++ = 95;
	*idx++ = 83;
	*idx++ = 35;
	*idx++ = 71;
	*idx++ = 85;
	*idx++ = 61;
	*idx++ = 105;
	*idx++ = 106;
	*idx++ = 104;
	*idx++ = 13;
	*idx++ = 25;
	*idx++ = 98;
	*idx++ = 97;
	*idx++ = 96;
	*idx++ = 101;
	*idx++ = 100;
	*idx++ = 37;
	*idx++ = 73;
	*idx++ = 10;
	*idx++ = 8;
	*idx++ = 7;
	*idx++ = 4;
	*idx++ = 78;
	*idx++ = 30;
	*idx++ = 102;
	*idx++ = 6;
	*idx++ = 42;
	*idx++ = 66;
	*idx++ = 18;
	*idx++ = 90;
	*idx++ = 57;
	*idx++ = 56;
	*idx++ = 49;
	*idx++ = 51;
	*idx++ = 48;
	*idx++ = 50;
	*idx++ = 58;
	*idx++ = 3;
	*idx++ = 0;
	*idx++ = 2;
	*idx++ = 9;
	*idx++ = 5;
	*idx++ = 26;
	*idx++ = 24;
	*idx++ = 14;
	*idx++ = 15;
	*idx++ = 12;
	*idx++ = 27;
	*idx++ = 21;
	*idx++ = 29;
	*idx++ = 17;
	*idx++ = 99;
	*idx++ = 103;
	*idx++ = 1;
	*idx++ = 54;
	*idx++ = 107;
	*idx++ = 47;
	*idx++ = 23;
	*idx++ = 11;
	*idx++ = 59;
	*idx++ = 55;
	*idx++ = 53;
	*idx++ = 52;
	*idx++ = 82;
	*idx++ = 81;
	*idx++ = 93;
	*idx++ = 92;
	*idx++ = 94;
	*idx++ = 80;
	*idx++ = 34;
	*idx++ = 32;
	*idx++ = 69;
	*idx++ = 68;
	*idx++ = 70;
	*idx++ = 33;
    }


};

/**
 * Quadrature kernel specialization for the braket <S, P | SP, P>.
 *
 * eval() accumulates 36 values into the output buffer; reorder() and the
 * two index() overloads all encode the same fixed permutation of those 36
 * slots.
 *
 * @warning Derived from automatically generated code: the slot numbers in
 * eval() and the permutation tables must stay in step with each other.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::P, rysq::SP, rysq::P> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulator: adds the contribution @a q to the value at @a Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /**
     * Convenience overload for a fixed-size output array of 36 doubles:
     * runs the general kernel with M = 2 iterations and the default
     * accumulator.
     */
    template<
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double (&I)[36]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Overload for a raw output pointer and a caller-chosen iteration
     * count @a M; uses the default accumulator.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * General kernel.  For each of the @a M iterations it derives the
     * recurrence coefficients from t2[r] and hands 36 weighted terms to
     * @a update, one per slot I+0 .. I+35.
     *
     * @param C       prefactors; each term is scaled by C[0][0] or C[1][0]
     * @param I       output buffer, at least 36 doubles
     * @param update  callable invoked as update(value, slot)
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {

	// Components of rij and rkl, named per axis.
	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int r = 0; r < M; ++r) {

	    const double t = t2[r];

	    // Scalar recurrence coefficients.
	    const double B00 = 0.5*t;
	    const double B01 = recurrence::coefficient(1.0/B, A, t);

	    // Per-axis coefficients built from rAi (with B) and rBk (with -A).
	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

// Generator-emitted helper macro; this kernel has no pow() term, but the
// define/undef pair is kept so preprocessor state is unchanged.
#define pow(x,y) recurrence::pow<y>((x))

	    const double Ix = (Cx + Xij);
	    const double Iy = (Cy + Yij);
	    const double Iz = (Cz + Zij);
	    const double Kx = (Xkl + Dx);
	    const double Ky = (Ykl + Dy);
	    const double Kz = (Dz + Zkl);

	    // Prefactor times W[r], computed once per iteration for each of
	    // the two prefactors in C.
	    const double w0 = (C[0][0])*W[r];
	    const double w1 = (C[1][0])*W[r];

	    update(w1*(Dz*Iy*Kx), I+0);
	    update(w0*(Iy*Kx), I+1);
	    update(w1*(Dy*Iz*Kx), I+2);
	    update(w0*(Iz*Kx), I+3);
	    update(w1*(Dx*Iz*Ky), I+4);
	    update(w0*(Iz*Ky), I+5);
	    update(w1*(Dz*Ix*Ky), I+6);
	    update(w0*(Ix*Ky), I+7);
	    update(w1*(Dy*Ix*Kz), I+8);
	    update(w1*(Dx*Iy*Kz), I+9);
	    update(w0*(Iy*Kz), I+10);
	    update(w0*(Ix*Kz), I+11);
	    update(w1*((Ix*(B01 + Dx*Kx) + B00*(Xkl + 2*Dx))), I+12);
	    update(w1*((B00*(Ykl + 2*Dy) + Iy*(B01 + Dy*Ky))), I+13);
	    update(w1*((B00*(2*Dz + Zkl) + Iz*(Dz*Kz + B01))), I+14);

	    // Shared subexpressions, each defined just before its first use.
	    const double DKz = (Dz*Kz + B01);
	    update(w1*(Ix*DKz), I+15);
	    update(w1*(Iy*DKz), I+16);
	    const double DKy = (B01 + Dy*Ky);
	    update(w1*(Ix*DKy), I+17);
	    update(w1*(Iz*DKy), I+18);
	    const double IKy = (B00 + Iy*Ky);
	    update(w1*(Dx*IKy), I+19);
	    update(w1*(Dz*IKy), I+20);
	    update(w0*(IKy), I+21);
	    const double DIy = (Dy*Iy + B00);
	    update(w1*(Kx*DIy), I+22);
	    update(w1*(Kz*DIy), I+23);
	    const double DIx = (Dx*Ix + B00);
	    update(w1*(Ky*DIx), I+24);
	    update(w1*(Kz*DIx), I+25);
	    const double IKx = (B00 + Ix*Kx);
	    update(w1*(Dz*IKx), I+26);
	    update(w1*(Dy*IKx), I+27);
	    update(w0*(IKx), I+28);
	    const double IKz = (Iz*Kz + B00);
	    update(w1*(Dy*IKz), I+29);
	    update(w1*(Dx*IKz), I+30);
	    update(w0*(IKz), I+31);
	    const double DIz = (B00 + Dz*Iz);
	    update(w1*(Kx*DIz), I+32);
	    update(w1*(Ky*DIz), I+33);
	    const double DKx = (B01 + Dx*Kx);
	    update(w1*(Iz*DKx), I+34);
	    update(w1*(Iy*DKx), I+35);
#undef pow

	}

    }

    /**
     * Permutes the 36 entries of @a I in place: the entry found at position
     * s on entry ends up at position index(s).  The assignments are listed
     * by destination slot.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[36]) {
	// Snapshot the input so that no source entry is overwritten before use.
	double src[36];
	for (int k = 0; k < 36; ++k) {
	    src[k] = I[k];
	}
	I[0] = src[28];
	I[1] = src[1];
	I[2] = src[3];
	I[3] = src[12];
	I[4] = src[35];
	I[5] = src[34];
	I[6] = src[27];
	I[7] = src[22];
	I[8] = src[2];
	I[9] = src[26];
	I[10] = src[0];
	I[11] = src[32];
	I[12] = src[7];
	I[13] = src[21];
	I[14] = src[5];
	I[15] = src[24];
	I[16] = src[19];
	I[17] = src[4];
	I[18] = src[17];
	I[19] = src[13];
	I[20] = src[18];
	I[21] = src[6];
	I[22] = src[20];
	I[23] = src[33];
	I[24] = src[11];
	I[25] = src[10];
	I[26] = src[31];
	I[27] = src[25];
	I[28] = src[9];
	I[29] = src[30];
	I[30] = src[8];
	I[31] = src[23];
	I[32] = src[29];
	I[33] = src[15];
	I[34] = src[16];
	I[35] = src[14];
    }

    /**
     * Returns the destination slot that reorder() assigns to source slot
     * @a i.  No range check is performed; @a i must lie in [0, 36).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short destination[36] = {
	    10, 1, 8, 2, 17, 14, 21, 12, 30, 28, 25, 24,
	    3, 19, 35, 33, 34, 18, 20, 16, 22, 13, 7, 31,
	    15, 27, 9, 6, 0, 32, 29, 26, 11, 23, 5, 4
	};
	return destination[i];
    }

    /**
     * Writes the whole 36-entry permutation table to @a idx, so that idx[s]
     * holds the value index(s) returns.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	idx[0] = 10;
	idx[1] = 1;
	idx[2] = 8;
	idx[3] = 2;
	idx[4] = 17;
	idx[5] = 14;
	idx[6] = 21;
	idx[7] = 12;
	idx[8] = 30;
	idx[9] = 28;
	idx[10] = 25;
	idx[11] = 24;
	idx[12] = 3;
	idx[13] = 19;
	idx[14] = 35;
	idx[15] = 33;
	idx[16] = 34;
	idx[17] = 18;
	idx[18] = 20;
	idx[19] = 16;
	idx[20] = 22;
	idx[21] = 13;
	idx[22] = 7;
	idx[23] = 31;
	idx[24] = 15;
	idx[25] = 27;
	idx[26] = 9;
	idx[27] = 6;
	idx[28] = 0;
	idx[29] = 32;
	idx[30] = 29;
	idx[31] = 26;
	idx[32] = 11;
	idx[33] = 23;
	idx[34] = 5;
	idx[35] = 4;
    }


};

/**
 * Quadrature kernel specialization for the braket <P, S | P, SP>.
 *
 * eval() accumulates 36 values into the output buffer; reorder() and the
 * two index() overloads all encode the same fixed permutation of those 36
 * slots.
 *
 * @warning Derived from automatically generated code: the slot numbers in
 * eval() and the permutation tables must stay in step with each other.
 */
template<>
struct impl<meta::braket<rysq::P, rysq::S, rysq::P, rysq::SP> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulator: adds the contribution @a q to the value at @a Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /**
     * Convenience overload for a fixed-size output array of 36 doubles:
     * runs the general kernel with M = 2 iterations and the default
     * accumulator.
     */
    template<
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double (&I)[36]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Overload for a raw output pointer and a caller-chosen iteration
     * count @a M; uses the default accumulator.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * General kernel.  For each of the @a M iterations it derives the
     * recurrence coefficients from t2[r] and hands 36 weighted terms to
     * @a update, one per slot I+0 .. I+35.  @a rij is accepted for
     * signature uniformity but not read by this specialization.
     *
     * @param C       prefactors; each term is scaled by C[0][0] or C[1][0]
     * @param I       output buffer, at least 36 doubles
     * @param update  callable invoked as update(value, slot)
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {

	// Components of rkl, named per axis.
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int r = 0; r < M; ++r) {

	    const double t = t2[r];

	    // Scalar recurrence coefficients.
	    const double B00 = 0.5*t;
	    const double B01 = recurrence::coefficient(1.0/B, A, t);

	    // Per-axis coefficients built from rAi (with B) and rBk (with -A).
	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

// Generator-emitted helper macro; this kernel has no pow() term, but the
// define/undef pair is kept so preprocessor state is unchanged.
#define pow(x,y) recurrence::pow<y>((x))

	    const double Kx = (Xkl + Dx);
	    const double Ky = (Ykl + Dy);
	    const double Kz = (Dz + Zkl);
	    const double Qx = (Cx*Dx + B00);
	    const double Qy = (Cy*Dy + B00);
	    const double Qz = (Cz*Dz + B00);

	    // Prefactor times W[r], computed once per iteration for each of
	    // the two prefactors in C.
	    const double w0 = (C[0][0])*W[r];
	    const double w1 = (C[1][0])*W[r];

	    update(w0*(Cz*Dx), I+0);
	    update(w0*(Cy*Dx), I+1);
	    update(w0*(Cz*Dy), I+2);
	    update(w0*(Cx*Dy), I+3);
	    update(w0*(Cy*Dz), I+4);
	    update(w0*(Cx*Dz), I+5);
	    update(w1*(Cz*Dy*Kx), I+6);
	    update(w1*(Cy*Dz*Kx), I+7);
	    update(w1*(Cx*Dz*Ky), I+8);
	    update(w1*(Cz*Dx*Ky), I+9);
	    update(w1*(Cy*Dx*Kz), I+10);
	    update(w1*(Cx*Dy*Kz), I+11);
	    update(w1*(Kz*Qx), I+12);
	    update(w1*(Ky*Qx), I+13);
	    update(w0*(Qx), I+14);
	    update(w1*(Kz*Qy), I+15);
	    update(w1*(Kx*Qy), I+16);
	    update(w0*(Qy), I+17);
	    update(w1*(Kx*Qz), I+18);
	    update(w1*(Ky*Qz), I+19);
	    update(w0*(Qz), I+20);
	    update(w1*((B01*Cx + Cx*Dx*Kx + B00*(Xkl + 2*Dx))), I+21);
	    update(w1*((Cy*Dy*Ky + B01*Cy + B00*(Ykl + 2*Dy))), I+22);
	    update(w1*(Dz*(Cy*Ykl + Qy)), I+23);
	    update(w1*(Dx*(Cy*Ykl + Qy)), I+24);
	    update(w1*((B01*Cz + B00*(2*Dz + Zkl) + Cz*Dz*Kz)), I+25);
	    update(w1*(Dx*(Cz*Zkl + Qz)), I+26);
	    update(w1*(Dy*(Cz*Zkl + Qz)), I+27);

	    // Shared subexpressions, each defined just before its first use.
	    const double DKy = (B01 + Dy*Ky);
	    update(w1*(Cx*DKy), I+28);
	    update(w1*(Cz*DKy), I+29);
	    const double DKx = (B01 + Dx*Kx);
	    update(w1*(Cy*DKx), I+30);
	    update(w1*(Cz*DKx), I+31);
	    const double DKz = (Dz*Kz + B01);
	    update(w1*(Cx*DKz), I+32);
	    update(w1*(Cy*DKz), I+33);
	    const double CKx = (B00 + Cx*Kx);
	    update(w1*(Dy*CKx), I+34);
	    update(w1*(Dz*CKx), I+35);
#undef pow

	}

    }

    /**
     * Permutes the 36 entries of @a I in place: the entry found at position
     * s on entry ends up at position index(s).  The assignments are listed
     * by destination slot.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[36]) {
	// Snapshot the input so that no source entry is overwritten before use.
	double src[36];
	for (int k = 0; k < 36; ++k) {
	    src[k] = I[k];
	}
	I[0] = src[14];
	I[1] = src[1];
	I[2] = src[0];
	I[3] = src[3];
	I[4] = src[17];
	I[5] = src[2];
	I[6] = src[5];
	I[7] = src[4];
	I[8] = src[20];
	I[9] = src[21];
	I[10] = src[30];
	I[11] = src[31];
	I[12] = src[34];
	I[13] = src[16];
	I[14] = src[6];
	I[15] = src[35];
	I[16] = src[7];
	I[17] = src[18];
	I[18] = src[13];
	I[19] = src[24];
	I[20] = src[9];
	I[21] = src[28];
	I[22] = src[22];
	I[23] = src[29];
	I[24] = src[8];
	I[25] = src[23];
	I[26] = src[19];
	I[27] = src[12];
	I[28] = src[10];
	I[29] = src[26];
	I[30] = src[11];
	I[31] = src[15];
	I[32] = src[27];
	I[33] = src[32];
	I[34] = src[33];
	I[35] = src[25];
    }

    /**
     * Returns the destination slot that reorder() assigns to source slot
     * @a i.  No range check is performed; @a i must lie in [0, 36).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short destination[36] = {
	    2, 1, 5, 3, 7, 6, 14, 16, 24, 20, 28, 30,
	    27, 18, 0, 31, 13, 4, 17, 26, 8, 9, 22, 25,
	    19, 35, 29, 32, 21, 23, 10, 11, 33, 34, 12, 15
	};
	return destination[i];
    }

    /**
     * Writes the whole 36-entry permutation table to @a idx, so that idx[s]
     * holds the value index(s) returns.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	idx[0] = 2;
	idx[1] = 1;
	idx[2] = 5;
	idx[3] = 3;
	idx[4] = 7;
	idx[5] = 6;
	idx[6] = 14;
	idx[7] = 16;
	idx[8] = 24;
	idx[9] = 20;
	idx[10] = 28;
	idx[11] = 30;
	idx[12] = 27;
	idx[13] = 18;
	idx[14] = 0;
	idx[15] = 31;
	idx[16] = 13;
	idx[17] = 4;
	idx[18] = 17;
	idx[19] = 26;
	idx[20] = 8;
	idx[21] = 9;
	idx[22] = 22;
	idx[23] = 25;
	idx[24] = 19;
	idx[25] = 35;
	idx[26] = 29;
	idx[27] = 32;
	idx[28] = 21;
	idx[29] = 23;
	idx[30] = 10;
	idx[31] = 11;
	idx[32] = 33;
	idx[33] = 34;
	idx[34] = 12;
	idx[35] = 15;
    }


};

/**
 * Quadrature kernel specialization for the braket <P, S | SP, S>.
 *
 * eval() accumulates 12 values into the output buffer; reorder() and the
 * two index() overloads all encode the same fixed permutation of those 12
 * slots.
 *
 * @warning Derived from automatically generated code: the slot numbers in
 * eval() and the permutation tables must stay in step with each other.
 */
template<>
struct impl<meta::braket<rysq::P, rysq::S, rysq::SP, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulator: adds the contribution @a q to the value at @a Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /**
     * Convenience overload for a fixed-size output array of 12 doubles:
     * runs the general kernel with M = 2 iterations and the default
     * accumulator.
     */
    template<
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double (&I)[12]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Overload for a raw output pointer and a caller-chosen iteration
     * count @a M; uses the default accumulator.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * General kernel.  For each of the @a M iterations it derives the
     * recurrence coefficients from t2[r] and hands 12 weighted terms to
     * @a update, one per slot I+0 .. I+11.  @a rij and @a rkl are accepted
     * for signature uniformity but not read by this specialization.
     *
     * @param C       prefactors; each term is scaled by C[0][0] or C[1][0]
     * @param I       output buffer, at least 12 doubles
     * @param update  callable invoked as update(value, slot)
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int r = 0; r < M; ++r) {

	    const double t = t2[r];

	    const double B00 = 0.5*t;

	    // Per-axis coefficients built from rAi (with B) and rBk (with -A).
	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

// Generator-emitted helper macro; this kernel has no pow() term, but the
// define/undef pair is kept so preprocessor state is unchanged.
#define pow(x,y) recurrence::pow<y>((x))

	    const double Qx = (Cx*Dx + B00);
	    const double Qy = (Cy*Dy + B00);
	    const double Qz = (Cz*Dz + B00);

	    // Prefactor times W[r], computed once per iteration for each of
	    // the two prefactors in C.
	    const double w0 = (C[0][0])*W[r];
	    const double w1 = (C[1][0])*W[r];

	    update(w0*(Cx), I+0);
	    update(w0*(Cy), I+1);
	    update(w0*(Cz), I+2);
	    update(w1*(Cy*Dx), I+3);
	    update(w1*(Cz*Dx), I+4);
	    update(w1*(Cz*Dy), I+5);
	    update(w1*(Cx*Dy), I+6);
	    update(w1*(Cx*Dz), I+7);
	    update(w1*(Cy*Dz), I+8);
	    update(w1*(Qx), I+9);
	    update(w1*(Qy), I+10);
	    update(w1*(Qz), I+11);
#undef pow

	}

    }

    /**
     * Permutes the 12 entries of @a I in place: the entry found at position
     * s on entry ends up at position index(s).  The assignments are listed
     * by destination slot.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[12]) {
	// Snapshot the input so that no source entry is overwritten before use.
	double src[12];
	for (int k = 0; k < 12; ++k) {
	    src[k] = I[k];
	}
	I[0] = src[0];
	I[1] = src[1];
	I[2] = src[2];
	I[3] = src[9];
	I[4] = src[3];
	I[5] = src[4];
	I[6] = src[6];
	I[7] = src[10];
	I[8] = src[5];
	I[9] = src[7];
	I[10] = src[8];
	I[11] = src[11];
    }

    /**
     * Returns the destination slot that reorder() assigns to source slot
     * @a i.  No range check is performed; @a i must lie in [0, 12).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short destination[12] = {
	    0, 1, 2, 4, 5, 8, 6, 9, 10, 3, 7, 11
	};
	return destination[i];
    }

    /**
     * Writes the whole 12-entry permutation table to @a idx, so that idx[s]
     * holds the value index(s) returns.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	idx[0] = 0;
	idx[1] = 1;
	idx[2] = 2;
	idx[3] = 4;
	idx[4] = 5;
	idx[5] = 8;
	idx[6] = 6;
	idx[7] = 9;
	idx[8] = 10;
	idx[9] = 3;
	idx[10] = 7;
	idx[11] = 11;
    }


};

/**
 * Quadrature kernel specialization for the braket <SP S | S F>.
 *
 * eval() loops over M quadrature points.  For each point a it derives the
 * recurrence coefficients from t2[a] and accumulates 40 weighted polynomial
 * terms into I[0..39] through the update functor.  The terms are written in
 * an evaluation order that groups terms sharing the factors f0..f15;
 * reorder() and index() describe the permutation from that order to the
 * final storage order.
 */
template<>
struct impl<meta::braket<rysq::SP, rysq::S, rysq::S, rysq::F> > {
    typedef void enable;
    static const bool value = true; 
    // Same value as the M = 3 that the fixed-size eval() overload passes on;
    // presumably the number of quadrature roots for this braket -- TODO confirm.
    static const int N = 3;

    /** Default accumulator used when the caller supplies none: *Q += q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: runs the kernel over 3 quadrature points and
     * adds the 40 results into I, using the default update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<3> &t2, const vector<3> &W,
    // 		     const double (&C)[1][2],
	      double (&I)[40]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer entry point: same as the general overload below, with the
     * default update functor.  I must point to at least 40 doubles.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel.
     *
     * @tparam M      number of quadrature points to loop over
     * @param A, B    scalars entering the recurrence coefficients
     *                (presumably combined bra/ket exponents -- TODO confirm)
     * @param rAi, rBk, rAB  indexable with [0..2]; used to form Cx..Cz and Dx..Dz
     * @param rij     not used by this specialization
     * @param rkl     indexable with [0..2]; added to Dx..Dz to form Kx..Kz
     * @param t2, W   per-point values, indexed 0..M-1; W[a] scales every term
     * @param C       C[0][0] scales the 10 terms built only from Kx/Ky/Kz and
     *                B01; C[0][1] scales the 30 terms that also involve
     *                Cx/Cy/Cz or B00
     * @param I       output, at least 40 doubles, filled in evaluation order
     * @param update  functor invoked as update(value, I + k) for each term
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {


	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];
	    // Three-argument overload: 0.5*(1/B)*(1 - A*t2[a]).
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // C_q = rAi[q] - B*rAB[q]*t2[a]
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // D_q = rBk[q] + A*rAB[q]*t2[a]  (note the -A passed as the scalar)
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

// Within this loop body pow(x,y) expands to the compile-time integer power
// recurrence::pow<y>(x); the macro is removed again by the #undef below.
#define pow(x,y) recurrence::pow<y>((x))

	    // K_q = D_q + rkl[q]
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);

	    // 40 terms in evaluation order; each fN is a factor shared by the
	    // update() calls that follow its definition.
	    update((C[0][0])*W[a]*(Kx*Ky*Kz), I+0);
	    double f0 = (2*B00*Ky + Cy*(pow(Ky,2) + B01));
	    update((C[0][1])*W[a]*(Kx*f0), I+1);
	    update((C[0][1])*W[a]*(Kz*f0), I+2);
	    double f10 = (pow(Kz,2) + 3*B01);
	    update((C[0][1])*W[a]*(Cy*Kz*f10), I+3);
	    update((C[0][1])*W[a]*(Cx*Kz*f10), I+4);
	    update((C[0][0])*W[a]*(Kz*f10), I+5);
	    double f11 = (B00 + Cy*Ky);
	    update((C[0][1])*W[a]*(Kx*Kz*f11), I+6);
	    double f12 = (B00 + Cx*Kx);
	    update((C[0][1])*W[a]*(Ky*Kz*f12), I+7);
	    double f13 = (3*B01 + pow(Kx,2));
	    update((C[0][1])*W[a]*(Cz*Kx*f13), I+8);
	    update((C[0][1])*W[a]*(Cy*Kx*f13), I+9);
	    update((C[0][0])*W[a]*(Kx*f13), I+10);
	    double f14 = (2*B00*Kx + Cx*(pow(Kx,2) + B01));
	    update((C[0][1])*W[a]*(Kz*f14), I+11);
	    update((C[0][1])*W[a]*(Ky*f14), I+12);
	    double f15 = (B00 + Cz*Kz);
	    update((C[0][1])*W[a]*(Kx*Ky*f15), I+13);
	    double f3 = (pow(Ky,2) + B01);
	    update((C[0][1])*W[a]*(Cx*Kz*f3), I+14);
	    update((C[0][0])*W[a]*(Kz*f3), I+15);
	    update((C[0][1])*W[a]*(f15*f3), I+16);
	    update((C[0][1])*W[a]*(f12*f3), I+17);
	    update((C[0][0])*W[a]*(Kx*f3), I+18);
	    update((C[0][1])*W[a]*(Cz*Kx*f3), I+19);
	    double f4 = (pow(Ky,2) + 3*B01);
	    update((C[0][1])*W[a]*(Cz*Ky*f4), I+20);
	    update((C[0][1])*W[a]*(Cx*Ky*f4), I+21);
	    update((C[0][0])*W[a]*(Ky*f4), I+22);
	    double f5 = 3*B00*B01;
	    update((C[0][1])*W[a]*((3*B00*pow(Kx,2) + 3*B01*Cx*Kx + f5 + Cx*pow(Kx,3))), I+23);
	    update((C[0][1])*W[a]*((f5 + 3*B00*pow(Kz,2) + 3*B01*Cz*Kz + Cz*pow(Kz,3))), I+24);
	    update((C[0][1])*W[a]*((3*B01*Cy*Ky + Cy*pow(Ky,3) + 3*B00*pow(Ky,2) + f5)), I+25);
	    double f6 = (pow(Kz,2) + B01);
	    update((C[0][1])*W[a]*(Cy*Kx*f6), I+26);
	    update((C[0][1])*W[a]*(f12*f6), I+27);
	    update((C[0][0])*W[a]*(Kx*f6), I+28);
	    update((C[0][1])*W[a]*(Cx*Ky*f6), I+29);
	    update((C[0][0])*W[a]*(Ky*f6), I+30);
	    update((C[0][1])*W[a]*(f11*f6), I+31);
	    double f7 = (2*B00*Kz + Cz*(pow(Kz,2) + B01));
	    update((C[0][1])*W[a]*(Kx*f7), I+32);
	    update((C[0][1])*W[a]*(Ky*f7), I+33);
	    double f9 = (pow(Kx,2) + B01);
	    update((C[0][1])*W[a]*(f15*f9), I+34);
	    update((C[0][1])*W[a]*(f11*f9), I+35);
	    update((C[0][0])*W[a]*(Kz*f9), I+36);
	    update((C[0][1])*W[a]*(Cy*Kz*f9), I+37);
	    update((C[0][0])*W[a]*(Ky*f9), I+38);
	    update((C[0][1])*W[a]*(Cz*Ky*f9), I+39);
#undef pow

	}

    }

    /**
     * Permutes I in place from evaluation order to final order: the value
     * eval() wrote at slot i ends up at slot index(i).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[40]) {
	// Work from a copy so no slot is read after it has been overwritten.
	double T[40];
	for (int i = 0; i < 40; ++i) {
	    T[i] = I[i];
	}
	I[36] = T[0];
	I[22] = T[1];
	I[26] = T[2];
	I[10] = T[3];
	I[9] = T[4];
	I[8] = T[5];
	I[38] = T[6];
	I[37] = T[7];
	I[3] = T[8];
	I[2] = T[9];
	I[0] = T[10];
	I[17] = T[11];
	I[13] = T[12];
	I[39] = T[13];
	I[25] = T[14];
	I[24] = T[15];
	I[27] = T[16];
	I[21] = T[17];
	I[20] = T[18];
	I[23] = T[19];
	I[7] = T[20];
	I[5] = T[21];
	I[4] = T[22];
	I[1] = T[23];
	I[11] = T[24];
	I[6] = T[25];
	I[30] = T[26];
	I[29] = T[27];
	I[28] = T[28];
	I[33] = T[29];
	I[32] = T[30];
	I[34] = T[31];
	I[31] = T[32];
	I[35] = T[33];
	I[19] = T[34];
	I[14] = T[35];
	I[16] = T[36];
	I[18] = T[37];
	I[12] = T[38];
	I[15] = T[39];
    }

// Disabled alternative: index_ is the inverse table (final slot ->
// evaluation-order slot) of the one used by index() below.
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[40] = { 10, 23, 9, 8, 22, 21, 25, 20, 5, 4, 3, 24, 38, 12, 35, 39, 36, 11, 37, 34, 18, 17, 1, 19, 15, 14, 2, 16, 28, 27, 26, 32, 30, 29, 31, 33, 0, 7, 6, 13 };
// 	if (index < 40) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the final slot for evaluation-order slot i (same mapping as
     * reorder()).  i is not range-checked; it must be in [0, 40).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    36, 22, 26, 10, 9, 8, 38, 37, 3, 2, 0, 17, 13, 39, 25, 24, 27, 21, 20, 23, 7, 5, 4, 1, 11, 6, 30, 29, 28, 33, 32, 34, 31, 35, 19, 14, 16, 18, 12, 15
	};
	return index[i];
    }

    /**
     * Writes the same 40-entry table as index(int) into idx[0..39].
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 36;
	*idx++ = 22;
	*idx++ = 26;
	*idx++ = 10;
	*idx++ = 9;
	*idx++ = 8;
	*idx++ = 38;
	*idx++ = 37;
	*idx++ = 3;
	*idx++ = 2;
	*idx++ = 0;
	*idx++ = 17;
	*idx++ = 13;
	*idx++ = 39;
	*idx++ = 25;
	*idx++ = 24;
	*idx++ = 27;
	*idx++ = 21;
	*idx++ = 20;
	*idx++ = 23;
	*idx++ = 7;
	*idx++ = 5;
	*idx++ = 4;
	*idx++ = 1;
	*idx++ = 11;
	*idx++ = 6;
	*idx++ = 30;
	*idx++ = 29;
	*idx++ = 28;
	*idx++ = 33;
	*idx++ = 32;
	*idx++ = 34;
	*idx++ = 31;
	*idx++ = 35;
	*idx++ = 19;
	*idx++ = 14;
	*idx++ = 16;
	*idx++ = 18;
	*idx++ = 12;
	*idx++ = 15;
    }


};

/**
 * Quadrature kernel specialization for the braket <S D | S SP>.
 *
 * eval() loops over M quadrature points.  For each point a it derives the
 * recurrence coefficients from t2[a] and accumulates 24 weighted polynomial
 * terms into I[0..23] through the update functor.  reorder() and index()
 * describe the permutation from the evaluation order used by eval() to the
 * final storage order.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::D, rysq::S, rysq::SP> > {
    typedef void enable;
    static const bool value = true; 
    // Same value as the M = 2 that the fixed-size eval() overload passes on;
    // presumably the number of quadrature roots for this braket -- TODO confirm.
    static const int N = 2;

    /** Default accumulator used when the caller supplies none: *Q += q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: runs the kernel over 2 quadrature points and
     * adds the 24 results into I, using the default update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<2> &t2, const vector<2> &W,
    // 		     const double (&C)[2][1],
	      double (&I)[24]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer entry point: same as the general overload below, with the
     * default update functor.  I must point to at least 24 doubles.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel.
     *
     * @tparam M      number of quadrature points to loop over
     * @param A, B    scalars entering the recurrence coefficients
     *                (presumably combined bra/ket exponents -- TODO confirm)
     * @param rAi, rBk, rAB  indexable with [0..2]; used to form Cx..Cz and Dx..Dz
     * @param rij     indexable with [0..2]; added to Cx..Cz to form Ix..Iz
     * @param rkl     indexable with [0..2]; added to Dx..Dz to form Kx..Kz
     * @param t2, W   per-point values, indexed 0..M-1; W[a] scales every term
     * @param C       C[0][0] scales terms 0..5 (built from Ix/Iy/Iz and B10
     *                only); C[1][0] scales terms 6..23 (which also involve
     *                Kx/Ky/Kz or B00)
     * @param I       output, at least 24 doubles, filled in evaluation order
     * @param update  functor invoked as update(value, I + k) for each term
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];
	    // Three-argument overload: 0.5*(1/A)*(1 - B*t2[a]).
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    // C_q = rAi[q] - B*rAB[q]*t2[a]
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // D_q = rBk[q] + A*rAB[q]*t2[a]  (note the -A passed as the scalar)
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

// Within this loop body pow(x,y) expands to the compile-time integer power
// recurrence::pow<y>(x); the macro is removed again by the #undef below.
#define pow(x,y) recurrence::pow<y>((x))

	    // I_q = C_q + rij[q],  K_q = D_q + rkl[q]
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);

	    // 24 terms in evaluation order; f11, f4 and f9 are factors shared
	    // by the two update() calls that follow each definition.
	    update((C[0][0])*W[a]*((B10 + pow(Ix,2))), I+0);
	    update((C[0][0])*W[a]*((B10 + pow(Iy,2))), I+1);
	    update((C[0][0])*W[a]*(Ix*Iy), I+2);
	    update((C[0][0])*W[a]*((B10 + pow(Iz,2))), I+3);
	    update((C[0][0])*W[a]*(Iy*Iz), I+4);
	    update((C[0][0])*W[a]*(Ix*Iz), I+5);
	    update((C[1][0])*W[a]*((2*B00*Ix + Kx*(B10 + pow(Ix,2)))), I+6);
	    update((C[1][0])*W[a]*(Kx*(B10 + pow(Iy,2))), I+7);
	    update((C[1][0])*W[a]*(Kx*(B10 + pow(Iz,2))), I+8);
	    update((C[1][0])*W[a]*(Iy*Iz*Kx), I+9);
	    update((C[1][0])*W[a]*((2*B00*Iy + Ky*(B10 + pow(Iy,2)))), I+10);
	    update((C[1][0])*W[a]*(Ky*(B10 + pow(Ix,2))), I+11);
	    update((C[1][0])*W[a]*(Ky*(B10 + pow(Iz,2))), I+12);
	    update((C[1][0])*W[a]*(Ix*Iz*Ky), I+13);
	    update((C[1][0])*W[a]*((2*B00*Iz + Kz*(B10 + pow(Iz,2)))), I+14);
	    update((C[1][0])*W[a]*(Kz*(B10 + pow(Ix,2))), I+15);
	    update((C[1][0])*W[a]*(Kz*(B10 + pow(Iy,2))), I+16);
	    update((C[1][0])*W[a]*(Ix*Iy*Kz), I+17);
	    double f11 = (Iz*Kz + B00);
	    update((C[1][0])*W[a]*(Ix*f11), I+18);
	    update((C[1][0])*W[a]*(Iy*f11), I+19);
	    double f4 = (B00 + Ix*Kx);
	    update((C[1][0])*W[a]*(Iy*f4), I+20);
	    update((C[1][0])*W[a]*(Iz*f4), I+21);
	    double f9 = (B00 + Iy*Ky);
	    update((C[1][0])*W[a]*(Ix*f9), I+22);
	    update((C[1][0])*W[a]*(Iz*f9), I+23);
#undef pow

	}

    }

    /**
     * Permutes I in place from evaluation order to final order: the value
     * eval() wrote at slot i ends up at slot index(i).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[24]) {
	// Work from a copy so no slot is read after it has been overwritten.
	double T[24];
	for (int i = 0; i < 24; ++i) {
	    T[i] = I[i];
	}
	I[0] = T[0];
	I[1] = T[1];
	I[3] = T[2];
	I[2] = T[3];
	I[5] = T[4];
	I[4] = T[5];
	I[6] = T[6];
	I[7] = T[7];
	I[8] = T[8];
	I[11] = T[9];
	I[13] = T[10];
	I[12] = T[11];
	I[14] = T[12];
	I[16] = T[13];
	I[20] = T[14];
	I[18] = T[15];
	I[19] = T[16];
	I[21] = T[17];
	I[22] = T[18];
	I[23] = T[19];
	I[9] = T[20];
	I[10] = T[21];
	I[15] = T[22];
	I[17] = T[23];
    }

// Disabled alternative: index_ is the inverse table (final slot ->
// evaluation-order slot) of the one used by index() below.
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[24] = { 0, 1, 3, 2, 5, 4, 6, 7, 8, 20, 21, 9, 11, 10, 12, 22, 13, 23, 15, 16, 14, 17, 18, 19 };
// 	if (index < 24) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the final slot for evaluation-order slot i (same mapping as
     * reorder()).  i is not range-checked; it must be in [0, 24).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    0, 1, 3, 2, 5, 4, 6, 7, 8, 11, 13, 12, 14, 16, 20, 18, 19, 21, 22, 23, 9, 10, 15, 17
	};
	return index[i];
    }

    /**
     * Writes the same 24-entry table as index(int) into idx[0..23].
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 0;
	*idx++ = 1;
	*idx++ = 3;
	*idx++ = 2;
	*idx++ = 5;
	*idx++ = 4;
	*idx++ = 6;
	*idx++ = 7;
	*idx++ = 8;
	*idx++ = 11;
	*idx++ = 13;
	*idx++ = 12;
	*idx++ = 14;
	*idx++ = 16;
	*idx++ = 20;
	*idx++ = 18;
	*idx++ = 19;
	*idx++ = 21;
	*idx++ = 22;
	*idx++ = 23;
	*idx++ = 9;
	*idx++ = 10;
	*idx++ = 15;
	*idx++ = 17;
    }


};

/**
 * Quadrature kernel specialization for the braket <S SP | F S>.
 *
 * eval() loops over M quadrature points.  For each point a it derives the
 * recurrence coefficients from t2[a] and accumulates 40 weighted polynomial
 * terms into I[0..39] through the update functor.  The terms are written in
 * an evaluation order that groups terms sharing the factors f0..f15;
 * reorder() and index() describe the permutation from that order to the
 * final storage order.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::SP, rysq::F, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    // Same value as the M = 3 that the fixed-size eval() overload passes on;
    // presumably the number of quadrature roots for this braket -- TODO confirm.
    static const int N = 3;

    /** Default accumulator used when the caller supplies none: *Q += q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: runs the kernel over 3 quadrature points and
     * adds the 40 results into I, using the default update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<3> &t2, const vector<3> &W,
    // 		     const double (&C)[1][2],
	      double (&I)[40]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer entry point: same as the general overload below, with the
     * default update functor.  I must point to at least 40 doubles.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel.
     *
     * @tparam M      number of quadrature points to loop over
     * @param A, B    scalars entering the recurrence coefficients
     *                (presumably combined bra/ket exponents -- TODO confirm)
     * @param rAi, rBk, rAB  indexable with [0..2]; used to form Cx..Cz and Dx..Dz
     * @param rij     indexable with [0..2]; added to Cx..Cz to form Ix..Iz
     * @param rkl     not used by this specialization
     * @param t2, W   per-point values, indexed 0..M-1; W[a] scales every term
     * @param C       C[0][0] scales the 10 terms built only from Dx/Dy/Dz and
     *                B01; C[0][1] scales the 30 terms that also involve
     *                Ix/Iy/Iz or B00
     * @param I       output, at least 40 doubles, filled in evaluation order
     * @param update  functor invoked as update(value, I + k) for each term
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];



#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];
	    // Three-argument overload: 0.5*(1/B)*(1 - A*t2[a]).
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // C_q = rAi[q] - B*rAB[q]*t2[a]
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // D_q = rBk[q] + A*rAB[q]*t2[a]  (note the -A passed as the scalar)
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

// Within this loop body pow(x,y) expands to the compile-time integer power
// recurrence::pow<y>(x); the macro is removed again by the #undef below.
#define pow(x,y) recurrence::pow<y>((x))

	    // I_q = C_q + rij[q],  R_q = B01 + D_q^2
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Rx = (B01 + pow(Dx,2));
 	    double Ry = (B01 + pow(Dy,2));
 	    double Rz = (pow(Dz,2) + B01);

	    // 40 terms in evaluation order; each fN is a factor shared by the
	    // update() calls that follow its definition.
	    update((C[0][0])*W[a]*(Dx*Dy*Dz), I+0);
	    update((C[0][0])*W[a]*(Dz*Rx), I+1);
	    update((C[0][1])*W[a]*(Dz*Iy*Rx), I+2);
	    update((C[0][0])*W[a]*(Dy*Rx), I+3);
	    update((C[0][1])*W[a]*(Dy*Iz*Rx), I+4);
	    update((C[0][1])*W[a]*(Dz*Ix*Ry), I+5);
	    update((C[0][0])*W[a]*(Dz*Ry), I+6);
	    update((C[0][0])*W[a]*(Dx*Ry), I+7);
	    update((C[0][1])*W[a]*(Dx*Iz*Ry), I+8);
	    update((C[0][1])*W[a]*(Dx*Iy*Rz), I+9);
	    update((C[0][0])*W[a]*(Dx*Rz), I+10);
	    update((C[0][1])*W[a]*(Dy*Ix*Rz), I+11);
	    update((C[0][0])*W[a]*(Dy*Rz), I+12);
	    double f0 = (Iz*Rz + 2*B00*Dz);
	    update((C[0][1])*W[a]*(Dx*f0), I+13);
	    update((C[0][1])*W[a]*(Dy*f0), I+14);
	    double f10 = (2*B00*Dx + Ix*Rx);
	    update((C[0][1])*W[a]*(Dz*f10), I+15);
	    update((C[0][1])*W[a]*(Dy*f10), I+16);
	    double f11 = (3*B01 + pow(Dy,2));
	    update((C[0][1])*W[a]*(Dy*Iz*f11), I+17);
	    update((C[0][1])*W[a]*(Dy*Ix*f11), I+18);
	    update((C[0][0])*W[a]*(Dy*f11), I+19);
	    double f12 = (Dy*Iy + B00);
	    update((C[0][1])*W[a]*(Dx*Dz*f12), I+20);
	    update((C[0][1])*W[a]*(Rz*f12), I+21);
	    update((C[0][1])*W[a]*(Rx*f12), I+22);
	    double f13 = (Dx*Ix + B00);
	    update((C[0][1])*W[a]*(Dy*Dz*f13), I+23);
	    update((C[0][1])*W[a]*(Rz*f13), I+24);
	    update((C[0][1])*W[a]*(Ry*f13), I+25);
	    double f15 = (2*B00*Dy + Iy*Ry);
	    update((C[0][1])*W[a]*(Dx*f15), I+26);
	    update((C[0][1])*W[a]*(Dz*f15), I+27);
	    double f2 = (B00 + Dz*Iz);
	    update((C[0][1])*W[a]*(Dx*Dy*f2), I+28);
	    update((C[0][1])*W[a]*(Rx*f2), I+29);
	    update((C[0][1])*W[a]*(Ry*f2), I+30);
	    double f3 = (3*B01 + pow(Dx,2));
	    update((C[0][1])*W[a]*(Dx*Iz*f3), I+31);
	    update((C[0][1])*W[a]*(Dx*Iy*f3), I+32);
	    update((C[0][0])*W[a]*(Dx*f3), I+33);
	    double f5 = (pow(Dz,2) + 3*B01);
	    update((C[0][1])*W[a]*(Dz*Iy*f5), I+34);
	    update((C[0][1])*W[a]*(Dz*Ix*f5), I+35);
	    update((C[0][0])*W[a]*(Dz*f5), I+36);
	    double f7 = 3*B00*B01;
	    update((C[0][1])*W[a]*((Dy*(Iy*f11 + 3*B00*Dy) + f7)), I+37);
	    update((C[0][1])*W[a]*((f7 + Dz*(Iz*f5 + 3*B00*Dz))), I+38);
	    update((C[0][1])*W[a]*((Dx*(3*B00*Dx + Ix*f3) + f7)), I+39);
#undef pow

	}

    }

    /**
     * Permutes I in place from evaluation order to final order: the value
     * eval() wrote at slot i ends up at slot index(i).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[40]) {
	// Work from a copy so no slot is read after it has been overwritten.
	double T[40];
	for (int i = 0; i < 40; ++i) {
	    T[i] = I[i];
	}
	I[36] = T[0];
	I[16] = T[1];
	I[18] = T[2];
	I[12] = T[3];
	I[15] = T[4];
	I[25] = T[5];
	I[24] = T[6];
	I[20] = T[7];
	I[23] = T[8];
	I[30] = T[9];
	I[28] = T[10];
	I[33] = T[11];
	I[32] = T[12];
	I[31] = T[13];
	I[35] = T[14];
	I[17] = T[15];
	I[13] = T[16];
	I[7] = T[17];
	I[5] = T[18];
	I[4] = T[19];
	I[38] = T[20];
	I[34] = T[21];
	I[14] = T[22];
	I[37] = T[23];
	I[29] = T[24];
	I[21] = T[25];
	I[22] = T[26];
	I[26] = T[27];
	I[39] = T[28];
	I[19] = T[29];
	I[27] = T[30];
	I[3] = T[31];
	I[2] = T[32];
	I[0] = T[33];
	I[10] = T[34];
	I[9] = T[35];
	I[8] = T[36];
	I[6] = T[37];
	I[11] = T[38];
	I[1] = T[39];
    }

// Disabled alternative: index_ is the inverse table (final slot ->
// evaluation-order slot) of the one used by index() below.
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[40] = { 33, 39, 32, 31, 19, 18, 37, 17, 36, 35, 34, 38, 3, 16, 22, 4, 1, 15, 2, 29, 7, 25, 26, 8, 6, 5, 27, 30, 10, 24, 9, 13, 12, 11, 21, 14, 0, 23, 20, 28 };
// 	if (index < 40) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the final slot for evaluation-order slot i (same mapping as
     * reorder()).  i is not range-checked; it must be in [0, 40).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    36, 16, 18, 12, 15, 25, 24, 20, 23, 30, 28, 33, 32, 31, 35, 17, 13, 7, 5, 4, 38, 34, 14, 37, 29, 21, 22, 26, 39, 19, 27, 3, 2, 0, 10, 9, 8, 6, 11, 1
	};
	return index[i];
    }

    /**
     * Writes the same 40-entry table as index(int) into idx[0..39].
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 36;
	*idx++ = 16;
	*idx++ = 18;
	*idx++ = 12;
	*idx++ = 15;
	*idx++ = 25;
	*idx++ = 24;
	*idx++ = 20;
	*idx++ = 23;
	*idx++ = 30;
	*idx++ = 28;
	*idx++ = 33;
	*idx++ = 32;
	*idx++ = 31;
	*idx++ = 35;
	*idx++ = 17;
	*idx++ = 13;
	*idx++ = 7;
	*idx++ = 5;
	*idx++ = 4;
	*idx++ = 38;
	*idx++ = 34;
	*idx++ = 14;
	*idx++ = 37;
	*idx++ = 29;
	*idx++ = 21;
	*idx++ = 22;
	*idx++ = 26;
	*idx++ = 39;
	*idx++ = 19;
	*idx++ = 27;
	*idx++ = 3;
	*idx++ = 2;
	*idx++ = 0;
	*idx++ = 10;
	*idx++ = 9;
	*idx++ = 8;
	*idx++ = 6;
	*idx++ = 11;
	*idx++ = 1;
    }


};

/**
 * Quadrature kernel specialization for the braket <S SP | P S>.
 *
 * For every quadrature point the kernel forms twelve weighted products of
 * the factors Ix/Iy/Iz, Dx/Dy/Dz and B00 and accumulates them into I[0..11]
 * through the update functor.  reorder() and index() map the evaluation
 * order used by eval() onto the final storage order.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::SP, rysq::P, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulation policy: add the contribution q to *Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /**
     * Fixed-size entry point: two quadrature points, default accumulator,
     * results added into the 12-element array I.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double (&I)[12]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer entry point: M quadrature points, default accumulator.
     * I must point to at least 12 doubles.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel.
     *
     * @tparam M      number of quadrature points to loop over
     * @param A, B    scalars entering the recurrence coefficients
     * @param rAi, rBk, rAB  indexable with [0..2]; used to form the C and D factors
     * @param rij     indexable with [0..2]; added to the C factors to form Ix..Iz
     * @param rkl     not used by this specialization
     * @param t2, W   per-point values, indexed 0..M-1
     * @param C       C[0][0] scales terms 0..2, C[0][1] scales terms 3..11
     * @param I       output, at least 12 doubles, filled in evaluation order
     * @param update  functor invoked as update(value, I + k) for each term
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double B00 = 0.5*t2[a];

	    // I_q = (rAi[q] - B*rAB[q]*t2[a]) + rij[q]
	    const double Ix = recurrence::coefficient<0>(rAi, B, rAB, t2[a]) + Xij;
	    const double Iy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]) + Yij;
	    const double Iz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]) + Zij;

	    // D_q = rBk[q] + A*rAB[q]*t2[a]
	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Weighted contraction coefficients for this quadrature point.
	    const double w0 = C[0][0]*W[a];
	    const double w1 = C[0][1]*W[a];

	    update(w0*Dx, I + 0);
	    update(w0*Dy, I + 1);
	    update(w0*Dz, I + 2);

	    update(w1*(Dx*Ix + B00), I + 3);
	    update(w1*(Dy*Ix), I + 4);
	    update(w1*(Dz*Ix), I + 5);

	    update(w1*(Dy*Iy + B00), I + 6);
	    update(w1*(Dz*Iy), I + 7);
	    update(w1*(Dx*Iy), I + 8);

	    update(w1*(B00 + Dz*Iz), I + 9);
	    update(w1*(Dx*Iz), I + 10);
	    update(w1*(Dy*Iz), I + 11);
	}
    }

    /**
     * Permutes I in place from evaluation order to final order.  Written as
     * a gather: each destination slot names the evaluation-order slot it is
     * read from.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[12]) {
	// Snapshot first so the in-place permutation never reads a slot that
	// has already been overwritten.
	double src[12];
	for (int k = 0; k != 12; ++k) {
	    src[k] = I[k];
	}
	I[0] = src[0];
	I[1] = src[3];
	I[2] = src[8];
	I[3] = src[10];
	I[4] = src[1];
	I[5] = src[4];
	I[6] = src[6];
	I[7] = src[11];
	I[8] = src[2];
	I[9] = src[5];
	I[10] = src[7];
	I[11] = src[9];
    }

    /**
     * Returns the final slot for evaluation-order slot i.
     * i is not range-checked; it must be in [0, 12).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short destination[12] = {
	    0, 4, 8,
	    1, 5, 9,
	    6, 10, 2,
	    11, 3, 7
	};
	return destination[i];
    }

    /** Writes the same 12-entry table as index(int) into idx[0..11]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	idx[0] = 0;
	idx[1] = 4;
	idx[2] = 8;
	idx[3] = 1;
	idx[4] = 5;
	idx[5] = 9;
	idx[6] = 6;
	idx[7] = 10;
	idx[8] = 2;
	idx[9] = 11;
	idx[10] = 3;
	idx[11] = 7;
    }

};

/**
 * Quadrature kernel specialization for the braket <S SP | SP S>.
 *
 * For every quadrature point the kernel forms sixteen weighted products of
 * the factors Ix/Iy/Iz, Dx/Dy/Dz and B00 and accumulates them into I[0..15]
 * through the update functor.  reorder() and index() map the evaluation
 * order used by eval() onto the final storage order.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::SP, rysq::SP, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulation policy: add the contribution q to *Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /**
     * Fixed-size entry point: two quadrature points, default accumulator,
     * results added into the 16-element array I.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2],
	      double (&I)[16]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer entry point: M quadrature points, default accumulator.
     * I must point to at least 16 doubles.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel.
     *
     * @tparam M      number of quadrature points to loop over
     * @param A, B    scalars entering the recurrence coefficients
     * @param rAi, rBk, rAB  indexable with [0..2]; used to form the C and D factors
     * @param rij     indexable with [0..2]; added to the C factors to form Ix..Iz
     * @param rkl     not used by this specialization
     * @param t2, W   per-point values, indexed 0..M-1
     * @param C       C[0][0] scales the constant term, C[1][0] the pure-D
     *                terms, C[0][1] the pure-I terms, C[1][1] the mixed terms
     * @param I       output, at least 16 doubles, filled in evaluation order
     * @param update  functor invoked as update(value, I + k) for each term
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double B00 = 0.5*t2[a];

	    // I_q = (rAi[q] - B*rAB[q]*t2[a]) + rij[q]
	    const double Ix = recurrence::coefficient<0>(rAi, B, rAB, t2[a]) + Xij;
	    const double Iy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]) + Yij;
	    const double Iz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]) + Zij;

	    // D_q = rBk[q] + A*rAB[q]*t2[a]
	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Weighted contraction coefficients for this quadrature point.
	    const double w00 = C[0][0]*W[a];
	    const double w10 = C[1][0]*W[a];
	    const double w01 = C[0][1]*W[a];
	    const double w11 = C[1][1]*W[a];

	    // The constant term: the weight multiplied by 1.
	    update(w00, I + 0);

	    update(w10*Dx, I + 1);
	    update(w10*Dy, I + 2);
	    update(w10*Dz, I + 3);

	    update(w11*(Dx*Ix + B00), I + 4);
	    update(w11*(Dz*Ix), I + 5);
	    update(w11*(Dy*Ix), I + 6);
	    update(w01*Ix, I + 7);

	    update(w11*(Dy*Iy + B00), I + 8);
	    update(w11*(Dz*Iy), I + 9);
	    update(w11*(Dx*Iy), I + 10);
	    update(w01*Iy, I + 11);

	    update(w11*(B00 + Dz*Iz), I + 12);
	    update(w11*(Dx*Iz), I + 13);
	    update(w11*(Dy*Iz), I + 14);
	    update(w01*Iz, I + 15);
	}
    }

    /**
     * Permutes I in place from evaluation order to final order.  Written as
     * a gather: each destination slot names the evaluation-order slot it is
     * read from.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[16]) {
	// Snapshot first so the in-place permutation never reads a slot that
	// has already been overwritten.
	double src[16];
	for (int k = 0; k != 16; ++k) {
	    src[k] = I[k];
	}
	I[0] = src[0];
	I[1] = src[7];
	I[2] = src[11];
	I[3] = src[15];
	I[4] = src[1];
	I[5] = src[4];
	I[6] = src[10];
	I[7] = src[13];
	I[8] = src[2];
	I[9] = src[6];
	I[10] = src[8];
	I[11] = src[14];
	I[12] = src[3];
	I[13] = src[5];
	I[14] = src[9];
	I[15] = src[12];
    }

    /**
     * Returns the final slot for evaluation-order slot i.
     * i is not range-checked; it must be in [0, 16).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short destination[16] = {
	    0, 4, 8, 12,
	    5, 13, 9, 1,
	    10, 14, 6, 2,
	    15, 7, 11, 3
	};
	return destination[i];
    }

    /** Writes the same 16-entry table as index(int) into idx[0..15]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	idx[0] = 0;
	idx[1] = 4;
	idx[2] = 8;
	idx[3] = 12;
	idx[4] = 5;
	idx[5] = 13;
	idx[6] = 9;
	idx[7] = 1;
	idx[8] = 10;
	idx[9] = 14;
	idx[10] = 6;
	idx[11] = 2;
	idx[12] = 15;
	idx[13] = 7;
	idx[14] = 11;
	idx[15] = 3;
    }

};

/**
 * Quadrature kernel specialization for the braket <S S | P S>.
 *
 * For every quadrature point the kernel accumulates the three weighted
 * factors Dx, Dy, Dz into I[0..2] through the update functor.  The
 * evaluation order already equals the final order, so reorder() and
 * index() describe the identity permutation.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::P, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 1;

    /** Default accumulation policy: add the contribution q to *Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /**
     * Fixed-size entry point: one quadrature point, default accumulator,
     * results added into the 3-element array I.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[3]) {
	eval<1>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer entry point: M quadrature points, default accumulator.
     * I must point to at least 3 doubles.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel.
     *
     * @tparam M      number of quadrature points to loop over
     * @param A       scalar entering the D factors
     * @param B, rAi, rij, rkl  not used by this specialization
     * @param rBk, rAB  indexable with [0..2]; used to form Dx..Dz
     * @param t2, W   per-point values, indexed 0..M-1
     * @param C       C[0][0] scales all three terms
     * @param I       output, at least 3 doubles
     * @param update  functor invoked as update(value, I + k) for each term
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // D_q = rBk[q] + A*rAB[q]*t2[a]
	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Weighted contraction coefficient for this quadrature point.
	    const double w = C[0][0]*W[a];

	    update(w*Dx, I + 0);
	    update(w*Dy, I + 1);
	    update(w*Dz, I + 2);
	}
    }

    /**
     * Permutes I in place from evaluation order to final order.  For this
     * braket the permutation is the identity, so every value is written
     * back to the slot it came from.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[3]) {
	double src[3];
	for (int k = 0; k != 3; ++k) {
	    src[k] = I[k];
	}
	for (int k = 0; k != 3; ++k) {
	    I[k] = src[k];
	}
    }

    /**
     * Returns the final slot for evaluation-order slot i.
     * i is not range-checked; it must be in [0, 3).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short destination[3] = { 0, 1, 2 };
	return destination[i];
    }

    /** Writes the same 3-entry table as index(int) into idx[0..2]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	idx[0] = 0;
	idx[1] = 1;
	idx[2] = 2;
    }

};

/**
 * impl specialization for meta::braket<rysq::P, rysq::S, rysq::P, rysq::P>.
 *
 * eval() adds 27 weighted polynomial terms to I for every entry of t2/W;
 * reorder() and index() share one permutation table that maps the order in
 * which eval() emits those terms onto a second ordering.
 * NOTE(review): the second ordering is presumably the layout expected by
 * callers -- confirm against the code generator (this file is marked as
 * automatically generated).
 */
template<>
struct impl<meta::braket<rysq::P, rysq::S, rysq::P, rysq::P> > {
    typedef void enable;
    static const bool value = true; 
    // Same value as the M = 2 used by the array-reference eval() overload.
    static const int N = 2;

    /** Default accumulation functor: adds q to *Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: takes I as an array of exactly 27 doubles and
     * evaluates with M = 2 and a default-constructed update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<2> &t2, const vector<2> &W,
    // 		     const double (&C)[1][1],
	      double (&I)[27]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer-output overload with caller-chosen M; forwards to the full
     * overload with a default-constructed update functor.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel: for each a in [0, M) derives the recurrence coefficients
     * from t2[a] and passes 27 products C[0][0]*W[a]*(term) to update(),
     * targeting I+0 .. I+26.
     *
     * @tparam M     number of t2/W entries processed
     * @param A,B    scalars entering the recurrence coefficients
     * @param rAi,rBk,rAB  objects indexed with [0..2] by recurrence::coefficient<q>
     * @param rij    not read by this specialization
     * @param rkl    indexed with [0..2] (bound to Xkl, Ykl, Zkl)
     * @param t2,W   indexed with [0..M-1]
     * @param C      common scale factor C[0][0]
     * @param I      output; must be valid for 27 doubles
     * @param update callable invoked as update(value, double *target)
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {


	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // B00 = t2/2, B01 = (1 - A*t2)/(2*B)
	    double B00 = 0.5*t2[a];
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // Cq = rAi[q] - B*rAB[q]*t2
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Dq = rBk[q] + A*rAB[q]*t2 (the factor is passed as -A)
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Local shorthand pow(x,y) -> recurrence::pow<y>(x); it is not
	    // referenced by the expressions of this specialization.
#define pow(x,y) recurrence::pow<y>((x))

	    // Shared sub-expressions reused by several of the terms below.
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    // Terms are emitted in generator order; index()/reorder() give
	    // the position each one maps to.
	    update((C[0][0])*W[a]*(Cz*Dy*Kx), I+0);
	    update((C[0][0])*W[a]*(Cy*Dz*Kx), I+1);
	    update((C[0][0])*W[a]*(Cz*Dx*Ky), I+2);
	    update((C[0][0])*W[a]*(Cx*Dz*Ky), I+3);
	    update((C[0][0])*W[a]*(Cy*Dx*Kz), I+4);
	    update((C[0][0])*W[a]*(Cx*Dy*Kz), I+5);
	    update((C[0][0])*W[a]*(Ky*Qx), I+6);
	    update((C[0][0])*W[a]*(Kz*Qx), I+7);
	    update((C[0][0])*W[a]*(Kx*Qy), I+8);
	    update((C[0][0])*W[a]*(Kz*Qy), I+9);
	    update((C[0][0])*W[a]*(Kx*Qz), I+10);
	    update((C[0][0])*W[a]*(Ky*Qz), I+11);
	    update((C[0][0])*W[a]*((B01*Cx + Cx*Dx*Kx + B00*(Xkl + 2*Dx))), I+12);
	    update((C[0][0])*W[a]*((Cy*Dy*Ky + B01*Cy + B00*(Ykl + 2*Dy))), I+13);
	    update((C[0][0])*W[a]*((B01*Cz + B00*(2*Dz + Zkl) + Cz*Dz*Kz)), I+14);
	    update((C[0][0])*W[a]*(Dx*(Cz*Zkl + Qz)), I+15);
	    update((C[0][0])*W[a]*(Dy*(Cz*Zkl + Qz)), I+16);
	    double f10 = (B01 + Dy*Ky);
	    update((C[0][0])*W[a]*(Cz*f10), I+17);
	    update((C[0][0])*W[a]*(Cx*f10), I+18);
	    double f2 = (B01 + Dx*Kx);
	    update((C[0][0])*W[a]*(Cy*f2), I+19);
	    update((C[0][0])*W[a]*(Cz*f2), I+20);
	    double f5 = (Dz*Kz + B01);
	    update((C[0][0])*W[a]*(Cx*f5), I+21);
	    update((C[0][0])*W[a]*(Cy*f5), I+22);
	    double f6 = (B00 + Cy*Ky);
	    update((C[0][0])*W[a]*(Dx*f6), I+23);
	    update((C[0][0])*W[a]*(Dz*f6), I+24);
	    double f8 = (B00 + Cx*Kx);
	    update((C[0][0])*W[a]*(Dy*f8), I+25);
	    update((C[0][0])*W[a]*(Dz*f8), I+26);
#undef pow

	}

    }

    /**
     * Permutes I in place: the value at position i moves to position
     * index(i). A temporary copy is taken first so no value is overwritten
     * before it has been moved.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[27]) {
	double T[27];
	for (int i = 0; i < 27; ++i) {
	    T[i] = I[i];
	}
	I[5] = T[0];
	I[7] = T[1];
	I[11] = T[2];
	I[15] = T[3];
	I[19] = T[4];
	I[21] = T[5];
	I[9] = T[6];
	I[18] = T[7];
	I[4] = T[8];
	I[22] = T[9];
	I[8] = T[10];
	I[17] = T[11];
	I[0] = T[12];
	I[13] = T[13];
	I[26] = T[14];
	I[20] = T[15];
	I[23] = T[16];
	I[14] = T[17];
	I[12] = T[18];
	I[1] = T[19];
	I[2] = T[20];
	I[24] = T[21];
	I[25] = T[22];
	I[10] = T[23];
	I[16] = T[24];
	I[3] = T[25];
	I[6] = T[26];
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[27] = { 12, 19, 20, 25, 8, 0, 26, 1, 10, 6, 23, 2, 18, 13, 17, 3, 24, 11, 7, 4, 15, 5, 9, 16, 21, 22, 14 };
// 	if (index < 27) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the destination position of the i-th term emitted by eval()
     * (the same table reorder() applies). No range check: i must be in
     * [0, 27).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    5, 7, 11, 15, 19, 21, 9, 18, 4, 22, 8, 17, 0, 13, 26, 20, 23, 14, 12, 1, 2, 24, 25, 10, 16, 3, 6
	};
	return index[i];
    }

    /**
     * Writes the whole 27-entry permutation table to idx[0..26].
     * idx must point to storage for at least 27 elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 5;
	*idx++ = 7;
	*idx++ = 11;
	*idx++ = 15;
	*idx++ = 19;
	*idx++ = 21;
	*idx++ = 9;
	*idx++ = 18;
	*idx++ = 4;
	*idx++ = 22;
	*idx++ = 8;
	*idx++ = 17;
	*idx++ = 0;
	*idx++ = 13;
	*idx++ = 26;
	*idx++ = 20;
	*idx++ = 23;
	*idx++ = 14;
	*idx++ = 12;
	*idx++ = 1;
	*idx++ = 2;
	*idx++ = 24;
	*idx++ = 25;
	*idx++ = 10;
	*idx++ = 16;
	*idx++ = 3;
	*idx++ = 6;
    }


};

/**
 * impl specialization for meta::braket<rysq::S, rysq::D, rysq::D, rysq::S>.
 *
 * eval() adds 36 weighted polynomial terms to I for every entry of t2/W;
 * reorder() and index() share one permutation table that maps the order in
 * which eval() emits those terms onto a second ordering.
 * NOTE(review): the second ordering is presumably the layout expected by
 * callers -- confirm against the code generator (this file is marked as
 * automatically generated).
 */
template<>
struct impl<meta::braket<rysq::S, rysq::D, rysq::D, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    // Same value as the M = 3 used by the array-reference eval() overload.
    static const int N = 3;

    /** Default accumulation functor: adds q to *Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: takes I as an array of exactly 36 doubles and
     * evaluates with M = 3 and a default-constructed update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<3> &t2, const vector<3> &W,
    // 		     const double (&C)[1][1],
	      double (&I)[36]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer-output overload with caller-chosen M; forwards to the full
     * overload with a default-constructed update functor.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel: for each a in [0, M) derives the recurrence coefficients
     * from t2[a] and passes 36 products C[0][0]*W[a]*(term) to update(),
     * targeting I+0 .. I+35.
     *
     * @tparam M     number of t2/W entries processed
     * @param A,B    scalars entering the recurrence coefficients
     * @param rAi,rBk,rAB  objects indexed with [0..2] by recurrence::coefficient<q>
     * @param rij    indexed with [0..2] (bound to Xij, Yij, Zij)
     * @param rkl    not read by this specialization
     * @param t2,W   indexed with [0..M-1]
     * @param C      common scale factor C[0][0]
     * @param I      output; must be valid for 36 doubles
     * @param update callable invoked as update(value, double *target)
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];



#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // B00 = t2/2, B10 = (1 - B*t2)/(2*A), B01 = (1 - A*t2)/(2*B)
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // Cq = rAi[q] - B*rAB[q]*t2
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Dq = rBk[q] + A*rAB[q]*t2 (the factor is passed as -A)
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Local shorthand pow(x,y) -> recurrence::pow<y>(x), i.e. x
	    // multiplied y times; undefined again after the formulas.
#define pow(x,y) recurrence::pow<y>((x))

	    // Shared sub-expressions reused by several of the terms below.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Rx = (B01 + pow(Dx,2));
 	    double Ry = (B01 + pow(Dy,2));
 	    double Rz = (pow(Dz,2) + B01);

	    // Terms are emitted in generator order; index()/reorder() give
	    // the position each one maps to.
	    update((C[0][0])*W[a]*(Iy*Iz*Rx), I+0);
	    update((C[0][0])*W[a]*(Ix*Iz*Ry), I+1);
	    update((C[0][0])*W[a]*(Ix*Iy*Rz), I+2);
	    double f1 = (Iz*Rz + 2*B00*Dz);
	    update((C[0][0])*W[a]*(Ix*f1), I+3);
	    update((C[0][0])*W[a]*(Iy*f1), I+4);
	    double f10 = (Dx*Ix + B00);
	    update((C[0][0])*W[a]*(Dy*Iz*f10), I+5);
	    update((C[0][0])*W[a]*(Dz*Iy*f10), I+6);
	    double f11 = (B10 + pow(Iy,2));
	    update((C[0][0])*W[a]*(Dx*Dz*f11), I+7);
	    update((C[0][0])*W[a]*(Rx*f11), I+8);
	    update((C[0][0])*W[a]*(Rz*f11), I+9);
	    double f12 = (B10 + pow(Iz,2));
	    update((C[0][0])*W[a]*(Dx*Dy*f12), I+10);
	    update((C[0][0])*W[a]*(Rx*f12), I+11);
	    update((C[0][0])*W[a]*(Ry*f12), I+12);
	    double f14 = (2*B00*Dx + Ix*Rx);
	    update((C[0][0])*W[a]*(Iy*f14), I+13);
	    update((C[0][0])*W[a]*(Iz*f14), I+14);
	    double f15 = (Dy*Iy + B00);
	    update((C[0][0])*W[a]*(Dx*Iz*f15), I+15);
	    update((C[0][0])*W[a]*(Dz*Ix*f15), I+16);
	    update((C[0][0])*W[a]*(f10*f15), I+17);
	    double f16 = (B00 + Dz*Iz);
	    update((C[0][0])*W[a]*(Dx*Iy*f16), I+18);
	    update((C[0][0])*W[a]*(Dy*Ix*f16), I+19);
	    update((C[0][0])*W[a]*(f10*f16), I+20);
	    update((C[0][0])*W[a]*(f15*f16), I+21);
	    double f17 = (B10 + pow(Ix,2));
	    update((C[0][0])*W[a]*(Dy*Dz*f17), I+22);
	    update((C[0][0])*W[a]*(Ry*f17), I+23);
	    update((C[0][0])*W[a]*(Rz*f17), I+24);
	    double f19 = (2*B00*Dy + Iy*Ry);
	    update((C[0][0])*W[a]*(Iz*f19), I+25);
	    update((C[0][0])*W[a]*(Ix*f19), I+26);
	    double f2 = (2*B00*Iz + Dz*(B10 + pow(Iz,2)));
	    update((C[0][0])*W[a]*(Dx*f2), I+27);
	    update((C[0][0])*W[a]*(Dy*f2), I+28);
	    double f3 = (2*B00*Ix + Dx*(B10 + pow(Ix,2)));
	    update((C[0][0])*W[a]*(Dz*f3), I+29);
	    update((C[0][0])*W[a]*(Dy*f3), I+30);
	    double f13 = 2*pow(B00,2);
	    double f7 = B01*B10;
	    update((C[0][0])*W[a]*((f13 + B01*pow(Ix,2) + f7 + pow(Dx,2)*(B10 + pow(Ix,2)) + 4*B00*Dx*Ix)), I+31);
	    update((C[0][0])*W[a]*((f13 + pow(Dy,2)*(B10 + pow(Iy,2)) + B01*pow(Iy,2) + f7 + 4*B00*Dy*Iy)), I+32);
	    update((C[0][0])*W[a]*((f13 + B01*pow(Iz,2) + pow(Dz,2)*(B10 + pow(Iz,2)) + 4*B00*Dz*Iz + f7)), I+33);
	    double f8 = (2*B00*Iy + Dy*(B10 + pow(Iy,2)));
	    update((C[0][0])*W[a]*(Dx*f8), I+34);
	    update((C[0][0])*W[a]*(Dz*f8), I+35);
#undef pow

	}

    }

    /**
     * Permutes I in place: the value at position i moves to position
     * index(i). A temporary copy is taken first so no value is overwritten
     * before it has been moved.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[36]) {
	double T[36];
	for (int i = 0; i < 36; ++i) {
	    T[i] = I[i];
	}
	I[5] = T[0];
	I[10] = T[1];
	I[15] = T[2];
	I[16] = T[3];
	I[17] = T[4];
	I[22] = T[5];
	I[27] = T[6];
	I[25] = T[7];
	I[1] = T[8];
	I[13] = T[9];
	I[20] = T[10];
	I[2] = T[11];
	I[8] = T[12];
	I[3] = T[13];
	I[4] = T[14];
	I[23] = T[15];
	I[33] = T[16];
	I[21] = T[17];
	I[29] = T[18];
	I[34] = T[19];
	I[28] = T[20];
	I[35] = T[21];
	I[30] = T[22];
	I[6] = T[23];
	I[12] = T[24];
	I[11] = T[25];
	I[9] = T[26];
	I[26] = T[27];
	I[32] = T[28];
	I[24] = T[29];
	I[18] = T[30];
	I[0] = T[31];
	I[7] = T[32];
	I[14] = T[33];
	I[19] = T[34];
	I[31] = T[35];
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[36] = { 31, 8, 11, 13, 14, 0, 23, 32, 12, 26, 1, 25, 24, 9, 33, 2, 3, 4, 30, 34, 10, 17, 5, 15, 29, 7, 27, 6, 20, 18, 22, 35, 28, 16, 19, 21 };
// 	if (index < 36) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the destination position of the i-th term emitted by eval()
     * (the same table reorder() applies). No range check: i must be in
     * [0, 36).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    5, 10, 15, 16, 17, 22, 27, 25, 1, 13, 20, 2, 8, 3, 4, 23, 33, 21, 29, 34, 28, 35, 30, 6, 12, 11, 9, 26, 32, 24, 18, 0, 7, 14, 19, 31
	};
	return index[i];
    }

    /**
     * Writes the whole 36-entry permutation table to idx[0..35].
     * idx must point to storage for at least 36 elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 5;
	*idx++ = 10;
	*idx++ = 15;
	*idx++ = 16;
	*idx++ = 17;
	*idx++ = 22;
	*idx++ = 27;
	*idx++ = 25;
	*idx++ = 1;
	*idx++ = 13;
	*idx++ = 20;
	*idx++ = 2;
	*idx++ = 8;
	*idx++ = 3;
	*idx++ = 4;
	*idx++ = 23;
	*idx++ = 33;
	*idx++ = 21;
	*idx++ = 29;
	*idx++ = 34;
	*idx++ = 28;
	*idx++ = 35;
	*idx++ = 30;
	*idx++ = 6;
	*idx++ = 12;
	*idx++ = 11;
	*idx++ = 9;
	*idx++ = 26;
	*idx++ = 32;
	*idx++ = 24;
	*idx++ = 18;
	*idx++ = 0;
	*idx++ = 7;
	*idx++ = 14;
	*idx++ = 19;
	*idx++ = 31;
    }


};

/**
 * impl specialization for meta::braket<rysq::S, rysq::P, rysq::S, rysq::D>.
 *
 * eval() adds 18 weighted polynomial terms to I for every entry of t2/W;
 * reorder() and index() share one permutation table that maps the order in
 * which eval() emits those terms onto a second ordering.
 * NOTE(review): the second ordering is presumably the layout expected by
 * callers -- confirm against the code generator (this file is marked as
 * automatically generated).
 */
template<>
struct impl<meta::braket<rysq::S, rysq::P, rysq::S, rysq::D> > {
    typedef void enable;
    static const bool value = true; 
    // Same value as the M = 2 used by the array-reference eval() overload.
    static const int N = 2;

    /** Default accumulation functor: adds q to *Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: takes I as an array of exactly 18 doubles and
     * evaluates with M = 2 and a default-constructed update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<2> &t2, const vector<2> &W,
    // 		     const double (&C)[1][1],
	      double (&I)[18]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer-output overload with caller-chosen M; forwards to the full
     * overload with a default-constructed update functor.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel: for each a in [0, M) derives the recurrence coefficients
     * from t2[a] and passes 18 products C[0][0]*W[a]*(term) to update(),
     * targeting I+0 .. I+17.
     *
     * @tparam M     number of t2/W entries processed
     * @param A,B    scalars entering the recurrence coefficients
     * @param rAi,rBk,rAB  objects indexed with [0..2] by recurrence::coefficient<q>
     * @param rij    indexed with [0..2] (bound to Xij, Yij, Zij)
     * @param rkl    indexed with [0..2] (bound to Xkl, Ykl, Zkl)
     * @param t2,W   indexed with [0..M-1]
     * @param C      common scale factor C[0][0]
     * @param I      output; must be valid for 18 doubles
     * @param update callable invoked as update(value, double *target)
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // B00 = t2/2, B01 = (1 - A*t2)/(2*B)
	    double B00 = 0.5*t2[a];
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // Cq = rAi[q] - B*rAB[q]*t2
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Dq = rBk[q] + A*rAB[q]*t2 (the factor is passed as -A)
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Local shorthand pow(x,y) -> recurrence::pow<y>(x), i.e. x
	    // multiplied y times; undefined again after the formulas.
#define pow(x,y) recurrence::pow<y>((x))

	    // Shared sub-expressions reused by several of the terms below.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);

	    // Terms are emitted in generator order; index()/reorder() give
	    // the position each one maps to.
	    update((C[0][0])*W[a]*((Ix*(pow(Kx,2) + B01) + 2*B00*Kx)), I+0);
	    update((C[0][0])*W[a]*((2*B00*Ky + Iy*(pow(Ky,2) + B01))), I+1);
	    update((C[0][0])*W[a]*(Iz*Kx*Ky), I+2);
	    update((C[0][0])*W[a]*((Iz*(pow(Kz,2) + B01) + 2*B00*Kz)), I+3);
	    update((C[0][0])*W[a]*(Ix*Ky*Kz), I+4);
	    update((C[0][0])*W[a]*(Iy*Kx*Kz), I+5);
	    double f10 = (pow(Kx,2) + B01);
	    update((C[0][0])*W[a]*(Iy*f10), I+6);
	    update((C[0][0])*W[a]*(Iz*f10), I+7);
	    double f11 = (pow(Ky,2) + B01);
	    update((C[0][0])*W[a]*(Iz*f11), I+8);
	    update((C[0][0])*W[a]*(Ix*f11), I+9);
	    double f3 = (B00 + Ix*Kx);
	    update((C[0][0])*W[a]*(Kz*f3), I+10);
	    update((C[0][0])*W[a]*(Ky*f3), I+11);
	    double f5 = (Iz*Kz + B00);
	    update((C[0][0])*W[a]*(Kx*f5), I+12);
	    update((C[0][0])*W[a]*(Ky*f5), I+13);
	    double f6 = (pow(Kz,2) + B01);
	    update((C[0][0])*W[a]*(Ix*f6), I+14);
	    update((C[0][0])*W[a]*(Iy*f6), I+15);
	    double f8 = (B00 + Iy*Ky);
	    update((C[0][0])*W[a]*(Kx*f8), I+16);
	    update((C[0][0])*W[a]*(Kz*f8), I+17);
#undef pow

	}

    }

    /**
     * Permutes I in place: the value at position i moves to position
     * index(i). A temporary copy is taken first so no value is overwritten
     * before it has been moved.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[18]) {
	double T[18];
	for (int i = 0; i < 18; ++i) {
	    T[i] = I[i];
	}
	I[0] = T[0];
	I[4] = T[1];
	I[11] = T[2];
	I[8] = T[3];
	I[15] = T[4];
	I[13] = T[5];
	I[1] = T[6];
	I[2] = T[7];
	I[5] = T[8];
	I[3] = T[9];
	I[12] = T[10];
	I[9] = T[11];
	I[14] = T[12];
	I[17] = T[13];
	I[6] = T[14];
	I[7] = T[15];
	I[10] = T[16];
	I[16] = T[17];
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[18] = { 0, 6, 7, 9, 1, 8, 14, 15, 3, 11, 16, 2, 10, 5, 12, 4, 17, 13 };
// 	if (index < 18) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the destination position of the i-th term emitted by eval()
     * (the same table reorder() applies). No range check: i must be in
     * [0, 18).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    0, 4, 11, 8, 15, 13, 1, 2, 5, 3, 12, 9, 14, 17, 6, 7, 10, 16
	};
	return index[i];
    }

    /**
     * Writes the whole 18-entry permutation table to idx[0..17].
     * idx must point to storage for at least 18 elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 0;
	*idx++ = 4;
	*idx++ = 11;
	*idx++ = 8;
	*idx++ = 15;
	*idx++ = 13;
	*idx++ = 1;
	*idx++ = 2;
	*idx++ = 5;
	*idx++ = 3;
	*idx++ = 12;
	*idx++ = 9;
	*idx++ = 14;
	*idx++ = 17;
	*idx++ = 6;
	*idx++ = 7;
	*idx++ = 10;
	*idx++ = 16;
    }


};

/**
 * impl specialization for meta::braket<rysq::P, rysq::SP, rysq::S, rysq::S>.
 *
 * eval() adds 12 weighted polynomial terms to I for every entry of t2/W:
 * three scaled by C[0][0] and nine scaled by C[0][1]. reorder() and index()
 * share one permutation table that maps the order in which eval() emits
 * those terms onto a second ordering.
 * NOTE(review): the second ordering is presumably the layout expected by
 * callers, and the two C entries presumably belong to the two parts of the
 * SP shell -- confirm against the code generator (this file is marked as
 * automatically generated).
 */
template<>
struct impl<meta::braket<rysq::P, rysq::SP, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    // Same value as the M = 2 used by the array-reference eval() overload.
    static const int N = 2;

    /** Default accumulation functor: adds q to *Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: takes I as an array of exactly 12 doubles and
     * evaluates with M = 2 and a default-constructed update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<2> &t2, const vector<2> &W,
    // 		     const double (&C)[1][2],
	      double (&I)[12]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer-output overload with caller-chosen M; forwards to the full
     * overload with a default-constructed update functor.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel: for each a in [0, M) derives the recurrence coefficients
     * from t2[a] and passes 12 weighted terms to update(), targeting
     * I+0 .. I+11. Terms 0-2 are scaled by C[0][0]*W[a], terms 3-11 by
     * C[0][1]*W[a].
     *
     * @tparam M     number of t2/W entries processed
     * @param A,B    scalars entering the recurrence coefficients
     * @param rAi,rAB  objects indexed with [0..2] by recurrence::coefficient<q>
     * @param rBk,rkl  not read by this specialization
     * @param rij    indexed with [0..2] (bound to Xij, Yij, Zij)
     * @param t2,W   indexed with [0..M-1]
     * @param C      scale factors C[0][0] and C[0][1]
     * @param I      output; must be valid for 12 doubles
     * @param update callable invoked as update(value, double *target)
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];



#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // B10 = (1 - B*t2)/(2*A)
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    // Cq = rAi[q] - B*rAB[q]*t2
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);


	    // Local shorthand pow(x,y) -> recurrence::pow<y>(x); it is not
	    // referenced by the expressions of this specialization.
#define pow(x,y) recurrence::pow<y>((x))

	    // Shared sub-expressions reused by several of the terms below.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);

	    // Terms are emitted in generator order; index()/reorder() give
	    // the position each one maps to.
	    update((C[0][0])*W[a]*(Cx), I+0);
	    update((C[0][0])*W[a]*(Cy), I+1);
	    update((C[0][0])*W[a]*(Cz), I+2);
	    update((C[0][1])*W[a]*((Cx*Ix + B10)), I+3);
	    update((C[0][1])*W[a]*(Cy*Ix), I+4);
	    update((C[0][1])*W[a]*(Cz*Ix), I+5);
	    update((C[0][1])*W[a]*((Cy*Iy + B10)), I+6);
	    update((C[0][1])*W[a]*(Cz*Iy), I+7);
	    update((C[0][1])*W[a]*(Cx*Iy), I+8);
	    update((C[0][1])*W[a]*((B10 + Cz*Iz)), I+9);
	    update((C[0][1])*W[a]*(Cx*Iz), I+10);
	    update((C[0][1])*W[a]*(Cy*Iz), I+11);
#undef pow

	}

    }

    /**
     * Permutes I in place: the value at position i moves to position
     * index(i). A temporary copy is taken first so no value is overwritten
     * before it has been moved.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[12]) {
	double T[12];
	for (int i = 0; i < 12; ++i) {
	    T[i] = I[i];
	}
	I[0] = T[0];
	I[1] = T[1];
	I[2] = T[2];
	I[3] = T[3];
	I[4] = T[4];
	I[5] = T[5];
	I[7] = T[6];
	I[8] = T[7];
	I[6] = T[8];
	I[11] = T[9];
	I[9] = T[10];
	I[10] = T[11];
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[12] = { 0, 1, 2, 3, 4, 5, 8, 6, 7, 10, 11, 9 };
// 	if (index < 12) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the destination position of the i-th term emitted by eval()
     * (the same table reorder() applies). No range check: i must be in
     * [0, 12).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    0, 1, 2, 3, 4, 5, 7, 8, 6, 11, 9, 10
	};
	return index[i];
    }

    /**
     * Writes the whole 12-entry permutation table to idx[0..11].
     * idx must point to storage for at least 12 elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 0;
	*idx++ = 1;
	*idx++ = 2;
	*idx++ = 3;
	*idx++ = 4;
	*idx++ = 5;
	*idx++ = 7;
	*idx++ = 8;
	*idx++ = 6;
	*idx++ = 11;
	*idx++ = 9;
	*idx++ = 10;
    }


};

/**
 * impl specialization for meta::braket<rysq::S, rysq::P, rysq::S, rysq::P>.
 *
 * eval() adds 9 weighted polynomial terms to I for every entry of t2/W;
 * reorder() and index() share one permutation table that maps the order in
 * which eval() emits those terms onto a second ordering.
 * NOTE(review): the second ordering is presumably the layout expected by
 * callers -- confirm against the code generator (this file is marked as
 * automatically generated).
 */
template<>
struct impl<meta::braket<rysq::S, rysq::P, rysq::S, rysq::P> > {
    typedef void enable;
    static const bool value = true; 
    // Same value as the M = 2 used by the array-reference eval() overload.
    static const int N = 2;

    /** Default accumulation functor: adds q to *Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: takes I as an array of exactly 9 doubles and
     * evaluates with M = 2 and a default-constructed update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<2> &t2, const vector<2> &W,
    // 		     const double (&C)[1][1],
	      double (&I)[9]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer-output overload with caller-chosen M; forwards to the full
     * overload with a default-constructed update functor.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel: for each a in [0, M) derives the recurrence coefficients
     * from t2[a] and passes 9 products C[0][0]*W[a]*(term) to update(),
     * targeting I+0 .. I+8.
     *
     * @tparam M     number of t2/W entries processed
     * @param A,B    scalars entering the recurrence coefficients
     * @param rAi,rBk,rAB  objects indexed with [0..2] by recurrence::coefficient<q>
     * @param rij    indexed with [0..2] (bound to Xij, Yij, Zij)
     * @param rkl    indexed with [0..2] (bound to Xkl, Ykl, Zkl)
     * @param t2,W   indexed with [0..M-1]
     * @param C      common scale factor C[0][0]
     * @param I      output; must be valid for 9 doubles
     * @param update callable invoked as update(value, double *target)
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // B00 = t2/2
	    double B00 = 0.5*t2[a];

	    // Cq = rAi[q] - B*rAB[q]*t2
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Dq = rBk[q] + A*rAB[q]*t2 (the factor is passed as -A)
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Local shorthand pow(x,y) -> recurrence::pow<y>(x); it is not
	    // referenced by the expressions of this specialization.
#define pow(x,y) recurrence::pow<y>((x))

	    // Shared sub-expressions reused by several of the terms below.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);

	    // Terms are emitted in generator order; index()/reorder() give
	    // the position each one maps to.
	    update((C[0][0])*W[a]*((B00 + Ix*Kx)), I+0);
	    update((C[0][0])*W[a]*(Iy*Kx), I+1);
	    update((C[0][0])*W[a]*(Iz*Kx), I+2);
	    update((C[0][0])*W[a]*((B00 + Iy*Ky)), I+3);
	    update((C[0][0])*W[a]*(Iz*Ky), I+4);
	    update((C[0][0])*W[a]*(Ix*Ky), I+5);
	    update((C[0][0])*W[a]*((Iz*Kz + B00)), I+6);
	    update((C[0][0])*W[a]*(Ix*Kz), I+7);
	    update((C[0][0])*W[a]*(Iy*Kz), I+8);
#undef pow

	}

    }

    /**
     * Permutes I in place: the value at position i moves to position
     * index(i). A temporary copy is taken first so no value is overwritten
     * before it has been moved.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[9]) {
	double T[9];
	for (int i = 0; i < 9; ++i) {
	    T[i] = I[i];
	}
	I[0] = T[0];
	I[1] = T[1];
	I[2] = T[2];
	I[4] = T[3];
	I[5] = T[4];
	I[3] = T[5];
	I[8] = T[6];
	I[6] = T[7];
	I[7] = T[8];
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[9] = { 0, 1, 2, 5, 3, 4, 7, 8, 6 };
// 	if (index < 9) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the destination position of the i-th term emitted by eval()
     * (the same table reorder() applies). No range check: i must be in
     * [0, 9).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    0, 1, 2, 4, 5, 3, 8, 6, 7
	};
	return index[i];
    }

    /**
     * Writes the whole 9-entry permutation table to idx[0..8].
     * idx must point to storage for at least 9 elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 0;
	*idx++ = 1;
	*idx++ = 2;
	*idx++ = 4;
	*idx++ = 5;
	*idx++ = 3;
	*idx++ = 8;
	*idx++ = 6;
	*idx++ = 7;
    }


};

template<>
struct impl<meta::braket<rysq::S, rysq::F, rysq::S, rysq::SP> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<3> &t2, const vector<3> &W,
    // 		     const double (&C)[2][1],
	      double (&I)[40]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

#define pow(x,y) recurrence::pow<y>((x))

 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);

	    update((C[0][0])*W[a]*(Ix*Iy*Iz), I+0);
	    double f11 = (B10 + pow(Iy,2));
	    update((C[1][0])*W[a]*(Ix*Kz*f11), I+1);
	    update((C[1][0])*W[a]*(Iz*Kx*f11), I+2);
	    update((C[0][0])*W[a]*(Iz*f11), I+3);
	    update((C[0][0])*W[a]*(Ix*f11), I+4);
	    double f12 = (B00 + Iy*Ky);
	    update((C[1][0])*W[a]*(Ix*Iz*f12), I+5);
	    double f13 = (B10 + pow(Iz,2));
	    update((C[1][0])*W[a]*(f12*f13), I+6);
	    update((C[0][0])*W[a]*(Iy*f13), I+7);
	    update((C[1][0])*W[a]*(Iy*Kx*f13), I+8);
	    update((C[0][0])*W[a]*(Ix*f13), I+9);
	    update((C[1][0])*W[a]*(Ix*Ky*f13), I+10);
	    double f14 = (3*B10 + pow(Iz,2));
	    update((C[1][0])*W[a]*(Iz*Ky*f14), I+11);
	    update((C[1][0])*W[a]*(Iz*Kx*f14), I+12);
	    update((C[0][0])*W[a]*(Iz*f14), I+13);
	    double f15 = (B10 + pow(Ix,2));
	    update((C[1][0])*W[a]*(Iz*Ky*f15), I+14);
	    update((C[0][0])*W[a]*(Iz*f15), I+15);
	    update((C[1][0])*W[a]*(f12*f15), I+16);
	    update((C[1][0])*W[a]*(Iy*Kz*f15), I+17);
	    update((C[0][0])*W[a]*(Iy*f15), I+18);
	    double f16 = (3*B10 + pow(Ix,2));
	    update((C[1][0])*W[a]*(Ix*Kz*f16), I+19);
	    update((C[1][0])*W[a]*(Ix*Ky*f16), I+20);
	    update((C[0][0])*W[a]*(Ix*f16), I+21);
	    double f17 = (2*B00*Ix + Kx*(B10 + pow(Ix,2)));
	    update((C[1][0])*W[a]*(Iy*f17), I+22);
	    update((C[1][0])*W[a]*(Iz*f17), I+23);
	    double f18 = (2*B00*Iy + Ky*(B10 + pow(Iy,2)));
	    update((C[1][0])*W[a]*(Ix*f18), I+24);
	    update((C[1][0])*W[a]*(Iz*f18), I+25);
	    double f2 = 3*B00*B10;
	    update((C[1][0])*W[a]*((Ix*Kx*(3*B10 + pow(Ix,2)) + f2 + 3*B00*pow(Ix,2))), I+26);
	    update((C[1][0])*W[a]*((Iy*Ky*(3*B10 + pow(Iy,2)) + f2 + 3*B00*pow(Iy,2))), I+27);
	    update((C[1][0])*W[a]*((3*B00*pow(Iz,2) + f2 + Iz*Kz*(3*B10 + pow(Iz,2)))), I+28);
	    double f4 = (B00 + Ix*Kx);
	    update((C[1][0])*W[a]*(Iy*Iz*f4), I+29);
	    update((C[1][0])*W[a]*(f11*f4), I+30);
	    update((C[1][0])*W[a]*(f13*f4), I+31);
	    double f5 = (Iz*Kz + B00);
	    update((C[1][0])*W[a]*(Ix*Iy*f5), I+32);
	    update((C[1][0])*W[a]*(f15*f5), I+33);
	    update((C[1][0])*W[a]*(f11*f5), I+34);
	    double f7 = (3*B10 + pow(Iy,2));
	    update((C[1][0])*W[a]*(Iy*Kx*f7), I+35);
	    update((C[0][0])*W[a]*(Iy*f7), I+36);
	    update((C[1][0])*W[a]*(Iy*Kz*f7), I+37);
	    double f8 = (2*B00*Iz + Kz*(B10 + pow(Iz,2)));
	    update((C[1][0])*W[a]*(Ix*f8), I+38);
	    update((C[1][0])*W[a]*(Iy*f8), I+39);
#undef pow

	}

    }

    BOOST_GPU_ENABLED static
    void reorder(double (&I)[40]) {
	double T[40];
	for (int i = 0; i < 40; ++i) {
	    T[i] = I[i];
	}
	I[9] = T[0];
	I[35] = T[1];
	I[16] = T[2];
	I[6] = T[3];
	I[5] = T[4];
	I[29] = T[5];
	I[28] = T[6];
	I[8] = T[7];
	I[18] = T[8];
	I[7] = T[9];
	I[27] = T[10];
	I[22] = T[11];
	I[12] = T[12];
	I[2] = T[13];
	I[24] = T[14];
	I[4] = T[15];
	I[23] = T[16];
	I[33] = T[17];
	I[3] = T[18];
	I[30] = T[19];
	I[20] = T[20];
	I[0] = T[21];
	I[13] = T[22];
	I[14] = T[23];
	I[25] = T[24];
	I[26] = T[25];
	I[10] = T[26];
	I[21] = T[27];
	I[32] = T[28];
	I[19] = T[29];
	I[15] = T[30];
	I[17] = T[31];
	I[39] = T[32];
	I[34] = T[33];
	I[36] = T[34];
	I[11] = T[35];
	I[1] = T[36];
	I[31] = T[37];
	I[37] = T[38];
	I[38] = T[39];
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[40] = { 21, 36, 13, 18, 15, 4, 3, 9, 7, 0, 26, 35, 12, 22, 23, 30, 2, 31, 8, 29, 20, 27, 11, 16, 14, 24, 25, 10, 6, 5, 19, 37, 28, 17, 33, 1, 34, 38, 39, 32 };
// 	if (index < 40) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Destination position of element i under reorder().
     * @param i source position, expected in [0, 40); not range-checked.
     * @return position that reorder() moves element i to.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short dest[] = {
            9, 35, 16, 6, 5, 29, 28, 8, 18, 7,
            27, 22, 12, 2, 24, 4, 23, 33, 3, 30,
            20, 0, 13, 14, 25, 26, 10, 21, 32, 19,
            15, 17, 39, 34, 36, 11, 1, 31, 37, 38
        };
        return dest[i];
    }

    /**
     * Write the full 40-entry destination table into idx[0..39].
     * Entry k is the position reorder() moves element k to.
     * @param idx output buffer with room for at least 40 elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        const int dest[40] = {
            9, 35, 16, 6, 5, 29, 28, 8, 18, 7,
            27, 22, 12, 2, 24, 4, 23, 33, 3, 30,
            20, 0, 13, 14, 25, 26, 10, 21, 32, 19,
            15, 17, 39, 34, 36, 11, 1, 31, 37, 38
        };
        for (int k = 0; k < 40; ++k) {
            idx[k] = dest[k];
        }
    }


};

/**
 * Quadrature kernel specialization for
 * meta::braket<rysq::F, rysq::S, rysq::S, rysq::S>.
 *
 * eval() accumulates 10 values per call. reorder() and index() describe the
 * permutation from the order in which eval() writes them to the stored order.
 */
template<>
struct impl<meta::braket<rysq::F, rysq::S, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    /// Matches the point count the fixed-size eval() overload hands to the kernel.
    static const int N = 2;

    /** Default accumulator: adds the contribution q to *Q. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &q, double *Q) const {
            *Q += q;
        }
    };

    /**
     * Fixed-size entry point: runs the kernel over 2 points of (t2, W) and
     * accumulates into the 10-element array I using the default update.
     * (An earlier, now disabled, signature took vector<3> geometry
     * arguments and vector<2> t2/W arguments.)
     */
    template<
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double (&I)[10]) {
        eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the kernel below, with the default update functor.
     * @tparam M number of (t2, W) points to process.
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel: for each a in [0, M) forms the recurrence coefficients from
     * (A, B, rAi, rAB, t2[a]) and passes 10 products, each scaled by
     * C[0][0]*W[a], to update(value, I+k) for k = 0..9.
     * rBk, rij and rkl are accepted for interface uniformity but not read.
     *
     * @param I      output buffer with at least 10 entries.
     * @param update callable invoked as update(double, double*).
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_,
        class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I, const U &update) {

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int a = 0; a < M; ++a) {

            const double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

            // Shared second-order terms.
            const double Px = (B10 + recurrence::pow<2>(Cx));
            const double Py = (B10 + recurrence::pow<2>(Cy));
            const double Pz = (B10 + recurrence::pow<2>(Cz));

            update(C[0][0]*W[a]*(3*B10*Cx + recurrence::pow<3>(Cx)), I+0);
            update(C[0][0]*W[a]*(3*B10*Cy + recurrence::pow<3>(Cy)), I+1);
            update(C[0][0]*W[a]*(Cx*Cy*Cz), I+2);
            update(C[0][0]*W[a]*(3*B10*Cz + recurrence::pow<3>(Cz)), I+3);
            update(C[0][0]*W[a]*(Cy*Px), I+4);
            update(C[0][0]*W[a]*(Cz*Px), I+5);
            update(C[0][0]*W[a]*(Cx*Py), I+6);
            update(C[0][0]*W[a]*(Cz*Py), I+7);
            update(C[0][0]*W[a]*(Cx*Pz), I+8);
            update(C[0][0]*W[a]*(Cy*Pz), I+9);

        }

    }

    /**
     * Permute the 10 accumulated values in place: the value at position k
     * before the call is stored at position dest[k] afterwards.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[10]) {
        const unsigned short dest[10] = {
            0, 1, 9, 2, 3, 4, 5, 6, 7, 8
        };
        double T[10];
        for (int k = 0; k < 10; ++k) {
            T[k] = I[k];
        }
        for (int k = 0; k < 10; ++k) {
            I[dest[k]] = T[k];
        }
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
//
//
// 	const unsigned short index_[10] = { 0, 1, 3, 4, 5, 6, 7, 8, 9, 2 };
// 	if (index < 10) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Destination position of element i under reorder().
     * @param i source position, expected in [0, 10); not range-checked.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short dest[] = {
            0, 1, 9, 2, 3, 4, 5, 6, 7, 8
        };
        return dest[i];
    }

    /** Write the 10-entry destination table of reorder() into idx[0..9]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        const int dest[10] = {
            0, 1, 9, 2, 3, 4, 5, 6, 7, 8
        };
        for (int k = 0; k < 10; ++k) {
            idx[k] = dest[k];
        }
    }


};

/**
 * Quadrature kernel specialization for
 * meta::braket<rysq::D, rysq::S, rysq::S, rysq::SP>.
 *
 * eval() accumulates 24 values per call. reorder() and index() describe the
 * permutation from the order in which eval() writes them to the stored order.
 */
template<>
struct impl<meta::braket<rysq::D, rysq::S, rysq::S, rysq::SP> > {
    typedef void enable;
    static const bool value = true;
    /// Matches the point count the fixed-size eval() overload hands to the kernel.
    static const int N = 2;

    /** Default accumulator: adds the contribution q to *Q. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &q, double *Q) const {
            *Q += q;
        }
    };

    /**
     * Fixed-size entry point: runs the kernel over 2 points of (t2, W) and
     * accumulates into the 24-element array I using the default update.
     * (An earlier, now disabled, signature took vector<3> geometry
     * arguments and vector<2> t2/W arguments.)
     */
    template<
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][1],
              double (&I)[24]) {
        eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the kernel below, with the default update functor.
     * @tparam M number of (t2, W) points to process.
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][1],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel: for each a in [0, M) forms the recurrence coefficients from
     * (A, B, rAi, rBk, rAB, rkl, t2[a]) and passes 24 products to
     * update(value, I+k) for k = 0..23.
     * Outputs 0-2, 9, 12 and 16 are scaled by C[0][0]*W[a]; all others by
     * C[1][0]*W[a]. rij is accepted for interface uniformity but not read.
     *
     * @param I      output buffer with at least 24 entries.
     * @param update callable invoked as update(double, double*).
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_,
        class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][1],
              double *I, const U &update) {

        const double &Xkl = rkl[0];
        const double &Ykl = rkl[1];
        const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int a = 0; a < M; ++a) {

            const double B00 = 0.5*t2[a];
            const double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

            // Shared first- and second-order terms.
            const double Kx = (Xkl + Dx);
            const double Ky = (Ykl + Dy);
            const double Kz = (Dz + Zkl);
            const double Px = (B10 + recurrence::pow<2>(Cx));
            const double Py = (B10 + recurrence::pow<2>(Cy));
            const double Pz = (B10 + recurrence::pow<2>(Cz));

            update(C[0][0]*W[a]*(Cx*Cy), I+0);
            update(C[0][0]*W[a]*(Cx*Cz), I+1);
            update(C[0][0]*W[a]*(Cy*Cz), I+2);
            update(C[1][0]*W[a]*(Cy*Cz*Kx), I+3);
            update(C[1][0]*W[a]*(Cx*Cz*Ky), I+4);
            update(C[1][0]*W[a]*(Cx*Cy*Kz), I+5);
            update(C[1][0]*W[a]*(Kx*Px + 2*B00*Cx), I+6);
            update(C[1][0]*W[a]*(Kz*Px), I+7);
            update(C[1][0]*W[a]*(Ky*Px), I+8);
            update(C[0][0]*W[a]*(Px), I+9);
            update(C[1][0]*W[a]*(Ky*Py + 2*B00*Cy), I+10);
            update(C[1][0]*W[a]*(Kx*Py), I+11);
            update(C[0][0]*W[a]*(Py), I+12);
            update(C[1][0]*W[a]*(Kz*Py), I+13);
            update(C[1][0]*W[a]*(Kz*Pz + 2*B00*Cz), I+14);
            update(C[1][0]*W[a]*(Kx*Pz), I+15);
            update(C[0][0]*W[a]*(Pz), I+16);
            update(C[1][0]*W[a]*(Ky*Pz), I+17);
            const double f5 = (B00 + Cx*Kx);
            update(C[1][0]*W[a]*(Cy*f5), I+18);
            update(C[1][0]*W[a]*(Cz*f5), I+19);
            const double f7 = (B00 + Cy*Ky);
            update(C[1][0]*W[a]*(Cx*f7), I+20);
            update(C[1][0]*W[a]*(Cz*f7), I+21);
            const double f9 = (B00 + Cz*Kz);
            update(C[1][0]*W[a]*(Cx*f9), I+22);
            update(C[1][0]*W[a]*(Cy*f9), I+23);

        }

    }

    /**
     * Permute the 24 accumulated values in place: the value at position k
     * before the call is stored at position dest[k] afterwards.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[24]) {
        const unsigned short dest[24] = {
            3, 4, 5, 11, 16, 21, 6, 18, 12, 0, 13, 7,
            1, 19, 20, 8, 2, 14, 9, 10, 15, 17, 22, 23
        };
        double T[24];
        for (int k = 0; k < 24; ++k) {
            T[k] = I[k];
        }
        for (int k = 0; k < 24; ++k) {
            I[dest[k]] = T[k];
        }
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
//
//
// 	const unsigned short index_[24] = { 9, 12, 16, 0, 1, 2, 6, 11, 15, 18, 19, 3, 8, 10, 17, 20, 4, 21, 7, 13, 14, 5, 22, 23 };
// 	if (index < 24) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Destination position of element i under reorder().
     * @param i source position, expected in [0, 24); not range-checked.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short dest[] = {
            3, 4, 5, 11, 16, 21, 6, 18, 12, 0, 13, 7,
            1, 19, 20, 8, 2, 14, 9, 10, 15, 17, 22, 23
        };
        return dest[i];
    }

    /** Write the 24-entry destination table of reorder() into idx[0..23]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        const int dest[24] = {
            3, 4, 5, 11, 16, 21, 6, 18, 12, 0, 13, 7,
            1, 19, 20, 8, 2, 14, 9, 10, 15, 17, 22, 23
        };
        for (int k = 0; k < 24; ++k) {
            idx[k] = dest[k];
        }
    }


};

/**
 * Quadrature kernel specialization for
 * meta::braket<rysq::S, rysq::SP, rysq::P, rysq::P>.
 *
 * eval() accumulates 36 values per call. reorder() and index() describe the
 * permutation from the order in which eval() writes them to the stored order.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::SP, rysq::P, rysq::P> > {
    typedef void enable;
    static const bool value = true;
    /// Matches the point count the fixed-size eval() overload hands to the kernel.
    static const int N = 2;

    /** Default accumulator: adds the contribution q to *Q. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &q, double *Q) const {
            *Q += q;
        }
    };

    /**
     * Fixed-size entry point: runs the kernel over 2 points of (t2, W) and
     * accumulates into the 36-element array I using the default update.
     * (An earlier, now disabled, signature took vector<3> geometry
     * arguments and vector<2> t2/W arguments.)
     */
    template<
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][2],
              double (&I)[36]) {
        eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the kernel below, with the default update functor.
     * @tparam M number of (t2, W) points to process.
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][2],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel: for each a in [0, M) forms the recurrence coefficients from
     * (A, B, rAi, rBk, rAB, rij, rkl, t2[a]) and passes 36 products to
     * update(value, I+k) for k = 0..35.
     * Outputs 2, 4, 6, 9, 11, 13, 17, 18 and 20 are scaled by C[0][0]*W[a];
     * all others by C[0][1]*W[a].
     *
     * @param I      output buffer with at least 36 entries.
     * @param update callable invoked as update(double, double*).
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_,
        class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][2],
              double *I, const U &update) {

        const double &Xij = rij[0];
        const double &Yij = rij[1];
        const double &Zij = rij[2];

        const double &Xkl = rkl[0];
        const double &Ykl = rkl[1];
        const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int a = 0; a < M; ++a) {

            const double B00 = 0.5*t2[a];
            const double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

            // Shared first-order terms.
            const double Ix = (Cx + Xij);
            const double Iy = (Cy + Yij);
            const double Iz = (Cz + Zij);
            const double Kx = (Xkl + Dx);
            const double Ky = (Ykl + Dy);
            const double Kz = (Dz + Zkl);

            update(C[0][1]*W[a]*(Iz*(B01 + Dx*Kx)), I+0);
            update(C[0][1]*W[a]*(Iy*(B01 + Dx*Kx)), I+1);
            update(C[0][0]*W[a]*(B01 + Dx*Kx), I+2);
            update(C[0][1]*W[a]*(Dz*Iy*Kx), I+3);
            update(C[0][0]*W[a]*(Dz*Kx), I+4);
            update(C[0][1]*W[a]*(Dy*Iz*Kx), I+5);
            update(C[0][0]*W[a]*(Dy*Kx), I+6);
            update(C[0][1]*W[a]*(Ix*(B01 + Dy*Ky)), I+7);
            update(C[0][1]*W[a]*(Iz*(B01 + Dy*Ky)), I+8);
            update(C[0][0]*W[a]*(B01 + Dy*Ky), I+9);
            update(C[0][1]*W[a]*(Dz*Ix*Ky), I+10);
            update(C[0][0]*W[a]*(Dz*Ky), I+11);
            update(C[0][1]*W[a]*(Dx*Iz*Ky), I+12);
            update(C[0][0]*W[a]*(Dx*Ky), I+13);
            update(C[0][1]*W[a]*(Dx*Iy*Kz), I+14);
            update(C[0][1]*W[a]*(Iy*(Dz*Kz + B01)), I+15);
            update(C[0][1]*W[a]*(Ix*(Dz*Kz + B01)), I+16);
            update(C[0][0]*W[a]*(Dz*Kz + B01), I+17);
            update(C[0][0]*W[a]*(Dx*Kz), I+18);
            update(C[0][1]*W[a]*(Dy*Ix*Kz), I+19);
            update(C[0][0]*W[a]*(Dy*Kz), I+20);
            update(C[0][1]*W[a]*(Ix*(B01 + Dx*Kx) + B00*(Xkl + 2*Dx)), I+21);
            update(C[0][1]*W[a]*(B00*(Ykl + 2*Dy) + Iy*(B01 + Dy*Ky)), I+22);
            update(C[0][1]*W[a]*(B00*(2*Dz + Zkl) + Iz*(Dz*Kz + B01)), I+23);
            const double f13 = (B00 + Iy*Ky);
            update(C[0][1]*W[a]*(Dx*f13), I+24);
            update(C[0][1]*W[a]*(Dz*f13), I+25);
            const double f14 = (Dy*Iy + B00);
            update(C[0][1]*W[a]*(Kz*f14), I+26);
            update(C[0][1]*W[a]*(Kx*f14), I+27);
            const double f15 = (Dx*Ix + B00);
            update(C[0][1]*W[a]*(Kz*f15), I+28);
            update(C[0][1]*W[a]*(Ky*f15), I+29);
            const double f4 = (B00 + Ix*Kx);
            update(C[0][1]*W[a]*(Dz*f4), I+30);
            update(C[0][1]*W[a]*(Dy*f4), I+31);
            const double f5 = (Iz*Kz + B00);
            update(C[0][1]*W[a]*(Dx*f5), I+32);
            update(C[0][1]*W[a]*(Dy*f5), I+33);
            const double f6 = (B00 + Dz*Iz);
            update(C[0][1]*W[a]*(Kx*f6), I+34);
            update(C[0][1]*W[a]*(Ky*f6), I+35);

        }

    }

    /**
     * Permute the 36 accumulated values in place: the value at position k
     * before the call is stored at position dest[k] afterwards.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[36]) {
        const unsigned short dest[36] = {
            3, 2, 0, 10, 8, 7, 4, 17, 19, 16, 21, 20,
            15, 12, 26, 34, 33, 32, 24, 29, 28, 1, 18, 35,
            14, 22, 30, 6, 25, 13, 9, 5, 27, 31, 11, 23
        };
        double T[36];
        for (int k = 0; k < 36; ++k) {
            T[k] = I[k];
        }
        for (int k = 0; k < 36; ++k) {
            I[dest[k]] = T[k];
        }
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
//
//
// 	const unsigned short index_[36] = { 2, 21, 1, 0, 6, 31, 27, 5, 4, 30, 3, 34, 13, 29, 24, 12, 9, 7, 22, 8, 11, 10, 25, 35, 18, 28, 14, 32, 20, 19, 26, 33, 17, 16, 15, 23 };
// 	if (index < 36) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Destination position of element i under reorder().
     * @param i source position, expected in [0, 36); not range-checked.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short dest[] = {
            3, 2, 0, 10, 8, 7, 4, 17, 19, 16, 21, 20,
            15, 12, 26, 34, 33, 32, 24, 29, 28, 1, 18, 35,
            14, 22, 30, 6, 25, 13, 9, 5, 27, 31, 11, 23
        };
        return dest[i];
    }

    /** Write the 36-entry destination table of reorder() into idx[0..35]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        const int dest[36] = {
            3, 2, 0, 10, 8, 7, 4, 17, 19, 16, 21, 20,
            15, 12, 26, 34, 33, 32, 24, 29, 28, 1, 18, 35,
            14, 22, 30, 6, 25, 13, 9, 5, 27, 31, 11, 23
        };
        for (int k = 0; k < 36; ++k) {
            idx[k] = dest[k];
        }
    }


};

/**
 * Quadrature kernel specialization for
 * meta::braket<rysq::S, rysq::S, rysq::S, rysq::F>.
 *
 * eval() accumulates 10 values per call. reorder() and index() describe the
 * permutation from the order in which eval() writes them to the stored order.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::S, rysq::F> > {
    typedef void enable;
    static const bool value = true;
    /// Matches the point count the fixed-size eval() overload hands to the kernel.
    static const int N = 2;

    /** Default accumulator: adds the contribution q to *Q. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &q, double *Q) const {
            *Q += q;
        }
    };

    /**
     * Fixed-size entry point: runs the kernel over 2 points of (t2, W) and
     * accumulates into the 10-element array I using the default update.
     * (An earlier, now disabled, signature took vector<3> geometry
     * arguments and vector<2> t2/W arguments.)
     */
    template<
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double (&I)[10]) {
        eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the kernel below, with the default update functor.
     * @tparam M number of (t2, W) points to process.
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel: for each a in [0, M) forms the recurrence coefficients from
     * (A, B, rBk, rAB, rkl, t2[a]) and passes 10 products, each scaled by
     * C[0][0]*W[a], to update(value, I+k) for k = 0..9.
     * rAi and rij are accepted for interface uniformity but not read.
     *
     * @param I      output buffer with at least 10 entries.
     * @param update callable invoked as update(double, double*).
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_,
        class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I, const U &update) {

        const double &Xkl = rkl[0];
        const double &Ykl = rkl[1];
        const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int a = 0; a < M; ++a) {

            const double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

            // Shared first-order terms.
            const double Kx = (Xkl + Dx);
            const double Ky = (Ykl + Dy);
            const double Kz = (Dz + Zkl);

            update(C[0][0]*W[a]*(Kx*(3*B01 + recurrence::pow<2>(Kx))), I+0);
            update(C[0][0]*W[a]*(Ky*(recurrence::pow<2>(Ky) + 3*B01)), I+1);
            update(C[0][0]*W[a]*(Kz*(recurrence::pow<2>(Kz) + 3*B01)), I+2);
            update(C[0][0]*W[a]*(Kx*Ky*Kz), I+3);
            const double f1 = (recurrence::pow<2>(Kx) + B01);
            update(C[0][0]*W[a]*(Ky*f1), I+4);
            update(C[0][0]*W[a]*(Kz*f1), I+5);
            const double f2 = (recurrence::pow<2>(Kz) + B01);
            update(C[0][0]*W[a]*(Kx*f2), I+6);
            update(C[0][0]*W[a]*(Ky*f2), I+7);
            const double f5 = (recurrence::pow<2>(Ky) + B01);
            update(C[0][0]*W[a]*(Kz*f5), I+8);
            update(C[0][0]*W[a]*(Kx*f5), I+9);

        }

    }

    /**
     * Permute the 10 accumulated values in place: the value at position k
     * before the call is stored at position dest[k] afterwards.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[10]) {
        const unsigned short dest[10] = {
            0, 1, 2, 9, 3, 4, 7, 8, 6, 5
        };
        double T[10];
        for (int k = 0; k < 10; ++k) {
            T[k] = I[k];
        }
        for (int k = 0; k < 10; ++k) {
            I[dest[k]] = T[k];
        }
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
//
//
// 	const unsigned short index_[10] = { 0, 1, 2, 4, 5, 9, 8, 6, 7, 3 };
// 	if (index < 10) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Destination position of element i under reorder().
     * @param i source position, expected in [0, 10); not range-checked.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short dest[] = {
            0, 1, 2, 9, 3, 4, 7, 8, 6, 5
        };
        return dest[i];
    }

    /** Write the 10-entry destination table of reorder() into idx[0..9]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        const int dest[10] = {
            0, 1, 2, 9, 3, 4, 7, 8, 6, 5
        };
        for (int k = 0; k < 10; ++k) {
            idx[k] = dest[k];
        }
    }


};

/**
 * Quadrature kernel specialization for
 * meta::braket<rysq::P, rysq::S, rysq::S, rysq::F>.
 *
 * eval() accumulates 30 values per call. reorder() and index() describe the
 * permutation from the order in which eval() writes them to the stored order.
 */
template<>
struct impl<meta::braket<rysq::P, rysq::S, rysq::S, rysq::F> > {
    typedef void enable;
    static const bool value = true;
    /// Matches the point count the fixed-size eval() overload hands to the kernel.
    static const int N = 3;

    /** Default accumulator: adds the contribution q to *Q. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &q, double *Q) const {
            *Q += q;
        }
    };

    /**
     * Fixed-size entry point: runs the kernel over 3 points of (t2, W) and
     * accumulates into the 30-element array I using the default update.
     * (An earlier, now disabled, signature took vector<3> geometry
     * arguments and vector<3> t2/W arguments.)
     */
    template<
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double (&I)[30]) {
        eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the kernel below, with the default update functor.
     * @tparam M number of (t2, W) points to process.
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel: for each a in [0, M) forms the recurrence coefficients from
     * (A, B, rAi, rBk, rAB, rkl, t2[a]) and passes 30 products, each scaled
     * by C[0][0]*W[a], to update(value, I+k) for k = 0..29.
     * rij is accepted for interface uniformity but not read.
     *
     * @param I      output buffer with at least 30 entries.
     * @param update callable invoked as update(double, double*).
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_,
        class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I, const U &update) {

        const double &Xkl = rkl[0];
        const double &Ykl = rkl[1];
        const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int a = 0; a < M; ++a) {

            const double B00 = 0.5*t2[a];
            const double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

            // Shared first-order terms.
            const double Kx = (Xkl + Dx);
            const double Ky = (Ykl + Dy);
            const double Kz = (Dz + Zkl);

            const double f0 = (2*B00*Ky + Cy*(recurrence::pow<2>(Ky) + B01));
            update(C[0][0]*W[a]*(Kx*f0), I+0);
            update(C[0][0]*W[a]*(Kz*f0), I+1);
            const double f10 = (recurrence::pow<2>(Kz) + 3*B01);
            update(C[0][0]*W[a]*(Cy*Kz*f10), I+2);
            update(C[0][0]*W[a]*(Cx*Kz*f10), I+3);
            const double f11 = (B00 + Cy*Ky);
            update(C[0][0]*W[a]*(Kx*Kz*f11), I+4);
            const double f12 = (B00 + Cx*Kx);
            update(C[0][0]*W[a]*(Ky*Kz*f12), I+5);
            const double f13 = (3*B01 + recurrence::pow<2>(Kx));
            update(C[0][0]*W[a]*(Cz*Kx*f13), I+6);
            update(C[0][0]*W[a]*(Cy*Kx*f13), I+7);
            const double f14 = (2*B00*Kx + Cx*(recurrence::pow<2>(Kx) + B01));
            update(C[0][0]*W[a]*(Kz*f14), I+8);
            update(C[0][0]*W[a]*(Ky*f14), I+9);
            const double f15 = (B00 + Cz*Kz);
            update(C[0][0]*W[a]*(Kx*Ky*f15), I+10);
            const double f3 = (recurrence::pow<2>(Ky) + B01);
            update(C[0][0]*W[a]*(Cx*Kz*f3), I+11);
            update(C[0][0]*W[a]*(Cz*Kx*f3), I+12);
            update(C[0][0]*W[a]*(f15*f3), I+13);
            update(C[0][0]*W[a]*(f12*f3), I+14);
            const double f4 = (recurrence::pow<2>(Ky) + 3*B01);
            update(C[0][0]*W[a]*(Cx*Ky*f4), I+15);
            update(C[0][0]*W[a]*(Cz*Ky*f4), I+16);
            const double f5 = 3*B00*B01;
            update(C[0][0]*W[a]*(3*B01*Cy*Ky + Cy*recurrence::pow<3>(Ky) + 3*B00*recurrence::pow<2>(Ky) + f5), I+17);
            update(C[0][0]*W[a]*(f5 + 3*B00*recurrence::pow<2>(Kz) + 3*B01*Cz*Kz + Cz*recurrence::pow<3>(Kz)), I+18);
            update(C[0][0]*W[a]*(3*B00*recurrence::pow<2>(Kx) + 3*B01*Cx*Kx + f5 + Cx*recurrence::pow<3>(Kx)), I+19);
            const double f6 = (recurrence::pow<2>(Kz) + B01);
            update(C[0][0]*W[a]*(Cy*Kx*f6), I+20);
            update(C[0][0]*W[a]*(Cx*Ky*f6), I+21);
            update(C[0][0]*W[a]*(f12*f6), I+22);
            update(C[0][0]*W[a]*(f11*f6), I+23);
            const double f7 = (2*B00*Kz + Cz*(recurrence::pow<2>(Kz) + B01));
            update(C[0][0]*W[a]*(Kx*f7), I+24);
            update(C[0][0]*W[a]*(Ky*f7), I+25);
            const double f9 = (recurrence::pow<2>(Kx) + B01);
            update(C[0][0]*W[a]*(Cy*Kz*f9), I+26);
            update(C[0][0]*W[a]*(Cz*Ky*f9), I+27);
            update(C[0][0]*W[a]*(f11*f9), I+28);
            update(C[0][0]*W[a]*(f15*f9), I+29);

        }

    }

    /**
     * Permute the 30 accumulated values in place: the value at position k
     * before the call is stored at position dest[k] afterwards.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[30]) {
        const unsigned short dest[30] = {
            16, 19, 7, 6, 28, 27, 2, 1, 12, 9,
            29, 18, 17, 20, 15, 3, 5, 4, 8, 0,
            22, 24, 21, 25, 23, 26, 13, 11, 10, 14
        };
        double T[30];
        for (int k = 0; k < 30; ++k) {
            T[k] = I[k];
        }
        for (int k = 0; k < 30; ++k) {
            I[dest[k]] = T[k];
        }
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
//
//
// 	const unsigned short index_[30] = { 19, 7, 6, 15, 17, 16, 3, 2, 18, 9, 28, 27, 8, 26, 29, 14, 0, 12, 11, 1, 13, 22, 20, 24, 21, 23, 25, 5, 4, 10 };
// 	if (index < 30) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Destination position of element i under reorder().
     * @param i source position, expected in [0, 30); not range-checked.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short dest[] = {
            16, 19, 7, 6, 28, 27, 2, 1, 12, 9,
            29, 18, 17, 20, 15, 3, 5, 4, 8, 0,
            22, 24, 21, 25, 23, 26, 13, 11, 10, 14
        };
        return dest[i];
    }

    /** Write the 30-entry destination table of reorder() into idx[0..29]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        const int dest[30] = {
            16, 19, 7, 6, 28, 27, 2, 1, 12, 9,
            29, 18, 17, 20, 15, 3, 5, 4, 8, 0,
            22, 24, 21, 25, 23, 26, 13, 11, 10, 14
        };
        for (int k = 0; k < 30; ++k) {
            idx[k] = dest[k];
        }
    }


};

template<>
struct impl<meta::braket<rysq::D, rysq::S, rysq::SP, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 2;

    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<2> &t2, const vector<2> &W,
    // 		     const double (&C)[2][1],
	      double (&I)[24]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {




#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

#define pow(x,y) recurrence::pow<y>((x))

 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    update((C[0][0])*W[a]*(Cx*Cy), I+0);
	    update((C[0][0])*W[a]*(Cx*Cz), I+1);
	    update((C[0][0])*W[a]*(Cy*Cz), I+2);
	    update((C[1][0])*W[a]*(Cy*Cz*Dx), I+3);
	    update((C[1][0])*W[a]*(Cx*Cz*Dy), I+4);
	    update((C[1][0])*W[a]*(Cx*Cy*Dz), I+5);
	    update((C[1][0])*W[a]*((Dx*Px + 2*B00*Cx)), I+6);
	    update((C[1][0])*W[a]*(Dz*Px), I+7);
	    update((C[1][0])*W[a]*(Dy*Px), I+8);
	    update((C[0][0])*W[a]*(Px), I+9);
	    update((C[1][0])*W[a]*((Dy*Py + 2*B00*Cy)), I+10);
	    update((C[1][0])*W[a]*(Dx*Py), I+11);
	    update((C[0][0])*W[a]*(Py), I+12);
	    update((C[1][0])*W[a]*(Dz*Py), I+13);
	    update((C[1][0])*W[a]*((Dz*Pz + 2*B00*Cz)), I+14);
	    update((C[1][0])*W[a]*(Dx*Pz), I+15);
	    update((C[0][0])*W[a]*(Pz), I+16);
	    update((C[1][0])*W[a]*(Dy*Pz), I+17);
	    update((C[1][0])*W[a]*(Cy*Qx), I+18);
	    update((C[1][0])*W[a]*(Cz*Qx), I+19);
	    update((C[1][0])*W[a]*(Cx*Qy), I+20);
	    update((C[1][0])*W[a]*(Cz*Qy), I+21);
	    update((C[1][0])*W[a]*(Cx*Qz), I+22);
	    update((C[1][0])*W[a]*(Cy*Qz), I+23);
#undef pow

	}

    }

    BOOST_GPU_ENABLED static
    void reorder(double (&I)[24]) {
	double T[24];
	for (int i = 0; i < 24; ++i) {
	    T[i] = I[i];
	}
	I[3] = T[0];
	I[4] = T[1];
	I[5] = T[2];
	I[11] = T[3];
	I[16] = T[4];
	I[21] = T[5];
	I[6] = T[6];
	I[18] = T[7];
	I[12] = T[8];
	I[0] = T[9];
	I[13] = T[10];
	I[7] = T[11];
	I[1] = T[12];
	I[19] = T[13];
	I[20] = T[14];
	I[8] = T[15];
	I[2] = T[16];
	I[14] = T[17];
	I[9] = T[18];
	I[10] = T[19];
	I[15] = T[20];
	I[17] = T[21];
	I[22] = T[22];
	I[23] = T[23];
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[24] = { 9, 12, 16, 0, 1, 2, 6, 11, 15, 18, 19, 3, 8, 10, 17, 20, 4, 21, 7, 13, 14, 5, 22, 23 };
// 	if (index < 24) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Looks up entry @p i of the 24-entry permutation table.
     * No range check is performed; @p i must lie in [0, 24).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot_of[] = {
	    3, 4, 5, 11, 16, 21, 6, 18, 12, 0, 13, 7,
	    1, 19, 20, 8, 2, 14, 9, 10, 15, 17, 22, 23
	};
	return slot_of[i];
    }

    /**
     * Stores the 24 permutation-table entries, in order, through @p idx.
     * The caller must provide room for 24 elements of type U.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int entries[24] = {
	    3, 4, 5, 11, 16, 21, 6, 18, 12, 0, 13, 7,
	    1, 19, 20, 8, 2, 14, 9, 10, 15, 17, 22, 23
	};
	for (int k = 0; k < 24; ++k) {
	    *idx++ = static_cast<int>(entries[k]);
	}
    }


};

template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::F, rysq::P> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /** Default accumulation functor: adds a contribution onto its target slot. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q = *Q + q;
	}
    };

    /**
     * Entry point for a full 30-element result array.  Forwards every
     * argument unchanged to the functor-taking overload with M = 3 and a
     * default-constructed @c update functor.
     *
     * An earlier, non-templated form of this signature took
     * vector<3> arguments for rAi, rBk, rAB, rij, rkl, t2 and W.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[30]) {
	const update accumulate = update();
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Pointer-output entry point with a caller-chosen loop bound @p M.
     * Forwards every argument unchanged to the functor-taking overload,
     * supplying a default-constructed @c update functor.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Kernel body of this specialization.  For each a in [0, M) it builds
     * the recurrence intermediates from t2[a] and passes 30 terms, each
     * scaled by C[0][0]*W[a], to @p update together with the target slot
     * I+0 .. I+29.
     *
     * @tparam M   loop bound; t2 and W are indexed with 0 .. M-1.
     * @tparam U   callable invoked as update(value, slot_pointer).
     * @param A,B  scalars fed to recurrence::coefficient
     *             (presumably exponent sums -- confirm against caller).
     * @param rBk,rAB  read at indices 0..2 to form Dx, Dy, Dz.
     * @param rkl  read at indices 0..2 (Xkl, Ykl, Zkl).
     * @param rAi,rij  not read by this specialization.
     * @param C    1x1 coefficient table; only C[0][0] exists and is used.
     * @param I    base pointer of the 30 output slots.
     * @param update  receives every (term, slot) pair.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {


	// Components of rkl, named for use in the expressions below.
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


	// Compiler-specific loop hints; they do not change what is computed.
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Scalar recurrence coefficient: 0.5*(1/B)*(1 - A*t2[a]).
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);


	    // Per-axis coefficients: rBk[q] + A*rAB[q]*t2[a] (note the -A argument).
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

// Route pow(x, n) to the compile-time-unrolled recurrence::pow<n> until the #undef.
#define pow(x,y) recurrence::pow<y>((x))

	    // Intermediates reused by several of the terms below.
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);
 	    double Rx = (B01 + pow(Dx,2));
 	    double Ry = (B01 + pow(Dy,2));
 	    double Rz = (pow(Dz,2) + B01);

	    // One update per output slot; the f* locals are shared subexpressions.
	    update((C[0][0])*W[a]*(Dz*Ky*Rx), I+0);
	    update((C[0][0])*W[a]*(Dy*Kz*Rx), I+1);
	    update((C[0][0])*W[a]*(Dz*Kx*Ry), I+2);
	    update((C[0][0])*W[a]*(Dx*Kz*Ry), I+3);
	    update((C[0][0])*W[a]*(Dy*Kx*Rz), I+4);
	    update((C[0][0])*W[a]*(Dx*Ky*Rz), I+5);
	    update((C[0][0])*W[a]*(Dy*Dz*(Rx + Dx*Xkl)), I+6);
	    update((C[0][0])*W[a]*(Ry*(Rx + Dx*Xkl)), I+7);
	    update((C[0][0])*W[a]*(Rz*(Rx + Dx*Xkl)), I+8);
	    update((C[0][0])*W[a]*(Dx*Dz*(Ry + Dy*Ykl)), I+9);
	    update((C[0][0])*W[a]*(Rx*(Ry + Dy*Ykl)), I+10);
	    update((C[0][0])*W[a]*(Rz*(Ry + Dy*Ykl)), I+11);
	    double f1 = 3*pow(B01,2);
	    update((C[0][0])*W[a]*((Kx*pow(Dx,3) + 3*B01*Dx*(Xkl + 2*Dx) + f1)), I+12);
	    update((C[0][0])*W[a]*((Ky*pow(Dy,3) + f1 + 3*B01*Dy*(Ykl + 2*Dy))), I+13);
	    update((C[0][0])*W[a]*((Kz*pow(Dz,3) + 3*B01*Dz*(2*Dz + Zkl) + f1)), I+14);
	    double f10 = (3*B01 + pow(Dx,2));
	    update((C[0][0])*W[a]*(Dx*Ky*f10), I+15);
	    update((C[0][0])*W[a]*(Dx*Kz*f10), I+16);
	    double f12 = (Kx*pow(Dx,2) + B01*(3*Dx + Xkl));
	    update((C[0][0])*W[a]*(Dy*f12), I+17);
	    update((C[0][0])*W[a]*(Dz*f12), I+18);
	    double f14 = (B01*(3*Dy + Ykl) + Ky*pow(Dy,2));
	    update((C[0][0])*W[a]*(Dz*f14), I+19);
	    update((C[0][0])*W[a]*(Dx*f14), I+20);
	    double f15 = (B01*(3*Dz + Zkl) + Kz*pow(Dz,2));
	    update((C[0][0])*W[a]*(Dx*f15), I+21);
	    update((C[0][0])*W[a]*(Dy*f15), I+22);
	    double f4 = (pow(Dz,2) + 3*B01);
	    update((C[0][0])*W[a]*(Dz*Kx*f4), I+23);
	    update((C[0][0])*W[a]*(Dz*Ky*f4), I+24);
	    double f6 = (3*B01 + pow(Dy,2));
	    update((C[0][0])*W[a]*(Dy*Kx*f6), I+25);
	    update((C[0][0])*W[a]*(Dy*Kz*f6), I+26);
	    double f7 = (Dz*Kz + B01);
	    update((C[0][0])*W[a]*(Dx*Dy*f7), I+27);
	    update((C[0][0])*W[a]*(Rx*f7), I+28);
	    update((C[0][0])*W[a]*(Ry*f7), I+29);
#undef pow

	}

    }

    /**
     * Permutes the 30 values of @p I in place.  The value stored at
     * position k on entry is moved to position target[k].
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[30]) {
	const unsigned short target[30] = {
	    14, 23, 6, 25, 8, 17, 9, 5, 7, 19,
	    13, 18, 0, 11, 22, 10, 20, 3, 4, 16,
	    15, 27, 28, 2, 12, 1, 21, 29, 24, 26
	};
	// Snapshot the input so no slot is overwritten before it is read.
	double saved[30];
	for (int k = 0; k < 30; ++k) {
	    saved[k] = I[k];
	}
	for (int k = 0; k < 30; ++k) {
	    I[target[k]] = saved[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[30] = { 12, 25, 23, 17, 18, 7, 2, 8, 4, 6, 15, 13, 24, 10, 0, 20, 19, 5, 11, 9, 16, 26, 14, 1, 28, 3, 29, 21, 22, 27 };
// 	if (index < 30) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Looks up entry @p i of the 30-entry permutation table.
     * No range check is performed; @p i must lie in [0, 30).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot_of[] = {
	    14, 23, 6, 25, 8, 17, 9, 5, 7, 19,
	    13, 18, 0, 11, 22, 10, 20, 3, 4, 16,
	    15, 27, 28, 2, 12, 1, 21, 29, 24, 26
	};
	return slot_of[i];
    }

    /**
     * Stores the 30 permutation-table entries, in order, through @p idx.
     * The caller must provide room for 30 elements of type U.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int entries[30] = {
	    14, 23, 6, 25, 8, 17, 9, 5, 7, 19,
	    13, 18, 0, 11, 22, 10, 20, 3, 4, 16,
	    15, 27, 28, 2, 12, 1, 21, 29, 24, 26
	};
	for (int k = 0; k < 30; ++k) {
	    *idx++ = static_cast<int>(entries[k]);
	}
    }


};

template<>
struct impl<meta::braket<rysq::F, rysq::S, rysq::SP, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /** Default accumulation functor: adds a contribution onto its target slot. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q = *Q + q;
	}
    };

    /**
     * Entry point for a full 40-element result array.  Forwards every
     * argument unchanged to the functor-taking overload with M = 3 and a
     * default-constructed @c update functor.
     *
     * An earlier, non-templated form of this signature took
     * vector<3> arguments for rAi, rBk, rAB, rij, rkl, t2 and W.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double (&I)[40]) {
	const update accumulate = update();
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Pointer-output entry point with a caller-chosen loop bound @p M.
     * Forwards every argument unchanged to the functor-taking overload,
     * supplying a default-constructed @c update functor.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Kernel body of this specialization.  For each a in [0, M) it builds
     * the recurrence intermediates from t2[a] and passes 40 terms to
     * @p update together with the target slot I+0 .. I+39.  Each term is
     * scaled by W[a] and by either C[0][0] or C[1][0].
     *
     * @tparam M   loop bound; t2 and W are indexed with 0 .. M-1.
     * @tparam U   callable invoked as update(value, slot_pointer).
     * @param A,B  scalars fed to recurrence::coefficient
     *             (presumably exponent sums -- confirm against caller).
     * @param rAi,rBk,rAB  read at indices 0..2 to form Cx..Cz and Dx..Dz.
     * @param rij,rkl  not read by this specialization.
     * @param C    2x1 coefficient table; C[0][0] and C[1][0] are used.
     * @param I    base pointer of the 40 output slots.
     * @param update  receives every (term, slot) pair.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {




	// Compiler-specific loop hints; they do not change what is computed.
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Scalar recurrence coefficients for this a.
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    // Per-axis coefficients: rAi[q] - B*rAB[q]*t2[a].
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Per-axis coefficients: rBk[q] + A*rAB[q]*t2[a] (note the -A argument).
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

// Route pow(x, n) to the compile-time-unrolled recurrence::pow<n> until the #undef.
#define pow(x,y) recurrence::pow<y>((x))

	    // Intermediates reused by several of the terms below.
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    // One update per output slot; the f* locals are shared subexpressions.
	    update((C[0][0])*W[a]*(Cx*Cy*Cz), I+0);
	    update((C[1][0])*W[a]*(Cz*Dy*Px), I+1);
	    update((C[0][0])*W[a]*(Cz*Px), I+2);
	    update((C[1][0])*W[a]*(Cy*Dz*Px), I+3);
	    update((C[0][0])*W[a]*(Cy*Px), I+4);
	    update((C[1][0])*W[a]*(Cx*Dz*Py), I+5);
	    update((C[1][0])*W[a]*(Cz*Dx*Py), I+6);
	    update((C[0][0])*W[a]*(Cz*Py), I+7);
	    update((C[0][0])*W[a]*(Cx*Py), I+8);
	    update((C[0][0])*W[a]*(Cy*Pz), I+9);
	    update((C[1][0])*W[a]*(Cy*Dx*Pz), I+10);
	    update((C[0][0])*W[a]*(Cx*Pz), I+11);
	    update((C[1][0])*W[a]*(Cx*Dy*Pz), I+12);
	    update((C[1][0])*W[a]*(Cy*Cz*Qx), I+13);
	    update((C[1][0])*W[a]*(Py*Qx), I+14);
	    update((C[1][0])*W[a]*(Pz*Qx), I+15);
	    update((C[1][0])*W[a]*(Cx*Cz*Qy), I+16);
	    update((C[1][0])*W[a]*(Px*Qy), I+17);
	    update((C[1][0])*W[a]*(Pz*Qy), I+18);
	    update((C[1][0])*W[a]*(Cx*Cy*Qz), I+19);
	    update((C[1][0])*W[a]*(Px*Qz), I+20);
	    update((C[1][0])*W[a]*(Py*Qz), I+21);
	    double f0 = (3*B10 + pow(Cx,2));
	    update((C[1][0])*W[a]*(Cx*Dz*f0), I+22);
	    update((C[1][0])*W[a]*(Cx*Dy*f0), I+23);
	    update((C[0][0])*W[a]*(Cx*f0), I+24);
	    double f1 = (Dz*Pz + 2*B00*Cz);
	    update((C[1][0])*W[a]*(Cx*f1), I+25);
	    update((C[1][0])*W[a]*(Cy*f1), I+26);
	    double f10 = (3*B10 + pow(Cz,2));
	    update((C[1][0])*W[a]*(Cz*Dy*f10), I+27);
	    update((C[1][0])*W[a]*(Cz*Dx*f10), I+28);
	    update((C[0][0])*W[a]*(Cz*f10), I+29);
	    double f11 = (3*B10 + pow(Cy,2));
	    update((C[1][0])*W[a]*(Cy*Dx*f11), I+30);
	    update((C[0][0])*W[a]*(Cy*f11), I+31);
	    update((C[1][0])*W[a]*(Cy*Dz*f11), I+32);
	    double f3 = 3*B00*B10;
	    update((C[1][0])*W[a]*((Dx*pow(Cx,3) + 3*B00*pow(Cx,2) + 3*B10*Cx*Dx + f3)), I+33);
	    update((C[1][0])*W[a]*((3*B00*pow(Cy,2) + f3 + Dy*pow(Cy,3) + 3*B10*Cy*Dy)), I+34);
	    update((C[1][0])*W[a]*((3*B10*Cz*Dz + f3 + 3*B00*pow(Cz,2) + Dz*pow(Cz,3))), I+35);
	    double f4 = (Dy*Py + 2*B00*Cy);
	    update((C[1][0])*W[a]*(Cx*f4), I+36);
	    update((C[1][0])*W[a]*(Cz*f4), I+37);
	    double f6 = (Dx*Px + 2*B00*Cx);
	    update((C[1][0])*W[a]*(Cy*f6), I+38);
	    update((C[1][0])*W[a]*(Cz*f6), I+39);
#undef pow

	}

    }

    /**
     * Permutes the 40 values of @p I in place.  The value stored at
     * position k on entry is moved to position target[k].
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[40]) {
	const unsigned short target[40] = {
	    9, 24, 4, 33, 3, 35, 16, 6, 5, 8,
	    18, 7, 27, 19, 15, 17, 29, 23, 28, 39,
	    34, 36, 30, 20, 0, 37, 38, 22, 12, 2,
	    11, 1, 31, 10, 21, 32, 25, 26, 13, 14
	};
	// Snapshot the input so no slot is overwritten before it is read.
	double saved[40];
	for (int k = 0; k < 40; ++k) {
	    saved[k] = I[k];
	}
	for (int k = 0; k < 40; ++k) {
	    I[target[k]] = saved[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[40] = { 24, 31, 29, 4, 2, 8, 7, 11, 9, 0, 33, 30, 28, 38, 39, 14, 6, 15, 10, 13, 23, 34, 27, 17, 1, 36, 37, 12, 18, 16, 22, 32, 35, 3, 20, 5, 21, 25, 26, 19 };
// 	if (index < 40) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Looks up entry @p i of the 40-entry permutation table.
     * No range check is performed; @p i must lie in [0, 40).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot_of[] = {
	    9, 24, 4, 33, 3, 35, 16, 6, 5, 8,
	    18, 7, 27, 19, 15, 17, 29, 23, 28, 39,
	    34, 36, 30, 20, 0, 37, 38, 22, 12, 2,
	    11, 1, 31, 10, 21, 32, 25, 26, 13, 14
	};
	return slot_of[i];
    }

    /**
     * Stores the 40 permutation-table entries, in order, through @p idx.
     * The caller must provide room for 40 elements of type U.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int entries[40] = {
	    9, 24, 4, 33, 3, 35, 16, 6, 5, 8,
	    18, 7, 27, 19, 15, 17, 29, 23, 28, 39,
	    34, 36, 30, 20, 0, 37, 38, 22, 12, 2,
	    11, 1, 31, 10, 21, 32, 25, 26, 13, 14
	};
	for (int k = 0; k < 40; ++k) {
	    *idx++ = static_cast<int>(entries[k]);
	}
    }


};

template<>
struct impl<meta::braket<rysq::F, rysq::SP, rysq::SP, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /** Default accumulation functor: adds a contribution onto its target slot. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q = *Q + q;
	}
    };

    /**
     * Entry point for a full 160-element result array.  Forwards every
     * argument unchanged to the functor-taking overload with M = 3 and a
     * default-constructed @c update functor.
     *
     * An earlier, non-templated form of this signature took
     * vector<3> arguments for rAi, rBk, rAB, rij, rkl, t2 and W.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2],
	      double (&I)[160]) {
	const update accumulate = update();
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Pointer-output entry point with a caller-chosen loop bound @p M.
     * Forwards every argument unchanged to the functor-taking overload,
     * supplying a default-constructed @c update functor.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Kernel body of this specialization.  For each a in [0, M) it builds
     * the recurrence intermediates from t2[a] and passes 160 terms to
     * @p update together with the target slot I+0 .. I+159.  Each term is
     * scaled by W[a] and by one of C[0][0], C[0][1], C[1][0], C[1][1].
     *
     * @tparam M   loop bound; t2 and W are indexed with 0 .. M-1.
     * @tparam U   callable invoked as update(value, slot_pointer).
     * @param A,B  scalars fed to recurrence::coefficient
     *             (presumably exponent sums -- confirm against caller).
     * @param rAi,rBk,rAB  read at indices 0..2 to form Cx..Cz and Dx..Dz.
     * @param rij  read at indices 0..2 (Xij, Yij, Zij).
     * @param rkl  not read by this specialization.
     * @param C    2x2 coefficient table; all four entries are used.
     * @param I    base pointer of the 160 output slots.
     * @param update  receives every (term, slot) pair.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2],
	      double *I, const U &update) {

	// Components of rij, named for use in the expressions below.
	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];



	// Compiler-specific loop hints; they do not change what is computed.
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Scalar recurrence coefficients for this a.
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    // Per-axis coefficients: rAi[q] - B*rAB[q]*t2[a].
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Per-axis coefficients: rBk[q] + A*rAB[q]*t2[a] (note the -A argument).
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

// Route pow(x, n) to the compile-time-unrolled recurrence::pow<n> until the #undef.
#define pow(x,y) recurrence::pow<y>((x))

	    // Intermediates reused by several of the terms below.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    // One update per output slot; the f* locals are shared subexpressions.
	    update((C[0][0])*W[a]*(Cx*Cy*Cz), I+0);
	    update((C[1][0])*W[a]*(Cz*Dy*Px), I+1);
	    update((C[0][1])*W[a]*(Cz*Iy*Px), I+2);
	    update((C[0][0])*W[a]*(Cz*Px), I+3);
	    update((C[0][1])*W[a]*(Cy*Iz*Px), I+4);
	    update((C[0][0])*W[a]*(Cy*Px), I+5);
	    update((C[1][0])*W[a]*(Cy*Dz*Px), I+6);
	    update((C[1][0])*W[a]*(Cx*Dz*Py), I+7);
	    update((C[1][0])*W[a]*(Cz*Dx*Py), I+8);
	    update((C[0][1])*W[a]*(Cz*Ix*Py), I+9);
	    update((C[0][0])*W[a]*(Cz*Py), I+10);
	    update((C[0][1])*W[a]*(Cx*Iz*Py), I+11);
	    update((C[0][0])*W[a]*(Cx*Py), I+12);
	    update((C[1][0])*W[a]*(Cx*Dy*Pz), I+13);
	    update((C[0][1])*W[a]*(Cx*Iy*Pz), I+14);
	    update((C[0][0])*W[a]*(Cx*Pz), I+15);
	    update((C[0][1])*W[a]*(Cy*Ix*Pz), I+16);
	    update((C[0][0])*W[a]*(Cy*Pz), I+17);
	    update((C[1][0])*W[a]*(Cy*Dx*Pz), I+18);
	    update((C[1][0])*W[a]*(Cy*Cz*Qx), I+19);
	    update((C[1][1])*W[a]*(Iy*Pz*Qx), I+20);
	    update((C[1][0])*W[a]*(Pz*Qx), I+21);
	    update((C[1][0])*W[a]*(Py*Qx), I+22);
	    update((C[1][1])*W[a]*(Iz*Py*Qx), I+23);
	    update((C[1][0])*W[a]*(Cx*Cz*Qy), I+24);
	    update((C[1][1])*W[a]*(Iz*Px*Qy), I+25);
	    update((C[1][0])*W[a]*(Px*Qy), I+26);
	    update((C[1][0])*W[a]*(Pz*Qy), I+27);
	    update((C[1][1])*W[a]*(Ix*Pz*Qy), I+28);
	    update((C[1][1])*W[a]*(Ix*Py*Qz), I+29);
	    update((C[1][0])*W[a]*(Cx*Cy*Qz), I+30);
	    update((C[1][1])*W[a]*(Iy*Px*Qz), I+31);
	    update((C[1][0])*W[a]*(Px*Qz), I+32);
	    update((C[1][0])*W[a]*(Py*Qz), I+33);
	    update((C[1][1])*W[a]*(Cy*Qz*(Px + Cx*Xij)), I+34);
	    update((C[0][1])*W[a]*(Cy*Cz*(Px + Cx*Xij)), I+35);
	    update((C[1][1])*W[a]*(Cz*Qy*(Px + Cx*Xij)), I+36);
	    update((C[1][1])*W[a]*(Cz*Py*(Dx*Xij + Qx)), I+37);
	    update((C[1][1])*W[a]*(Cy*Pz*(Dx*Xij + Qx)), I+38);
	    update((C[1][1])*W[a]*(Dy*Pz*(Px + Cx*Xij)), I+39);
	    update((C[0][1])*W[a]*(Pz*(Px + Cx*Xij)), I+40);
	    update((C[0][1])*W[a]*(Py*(Px + Cx*Xij)), I+41);
	    update((C[1][1])*W[a]*(Dz*Py*(Px + Cx*Xij)), I+42);
	    update((C[1][1])*W[a]*(Cz*Px*(Dy*Yij + Qy)), I+43);
	    update((C[1][1])*W[a]*(Cx*Pz*(Dy*Yij + Qy)), I+44);
	    update((C[1][1])*W[a]*(Cx*Qy*(Cz*Zij + Pz)), I+45);
	    update((C[1][1])*W[a]*(Dx*Py*(Cz*Zij + Pz)), I+46);
	    update((C[1][1])*W[a]*(Cy*Qx*(Cz*Zij + Pz)), I+47);
	    update((C[0][1])*W[a]*(Cx*Cy*(Cz*Zij + Pz)), I+48);
	    update((C[1][1])*W[a]*(Dy*Px*(Cz*Zij + Pz)), I+49);
	    update((C[0][1])*W[a]*(Px*(Cz*Zij + Pz)), I+50);
	    update((C[0][1])*W[a]*(Py*(Cz*Zij + Pz)), I+51);
	    double f0 = (3*B10 + pow(Cx,2));
	    update((C[1][1])*W[a]*(Cx*f0*(Dy*Yij + Qy)), I+52);
	    update((C[1][1])*W[a]*(Cx*Dz*Iy*f0), I+53);
	    update((C[1][0])*W[a]*(Cx*Dz*f0), I+54);
	    update((C[1][0])*W[a]*(Cx*Dy*f0), I+55);
	    update((C[1][1])*W[a]*(Cx*Dy*Iz*f0), I+56);
	    update((C[0][1])*W[a]*(Cx*Iz*f0), I+57);
	    update((C[0][0])*W[a]*(Cx*f0), I+58);
	    update((C[0][1])*W[a]*(Cx*Iy*f0), I+59);
	    double f12 = (B00 + Dz*Iz);
	    update((C[1][1])*W[a]*(Cx*f0*f12), I+60);
	    update((C[1][1])*W[a]*(Cy*Px*f12), I+61);
	    update((C[1][1])*W[a]*(Cx*Py*f12), I+62);
	    double f2 = (Dz*Pz + 2*B00*Cz);
	    update((C[1][1])*W[a]*(Cx*Cy*(f2 + Qz*Zij)), I+63);
	    update((C[1][1])*W[a]*(Px*(f2 + Qz*Zij)), I+64);
	    update((C[1][1])*W[a]*(Py*(f2 + Qz*Zij)), I+65);
	    update((C[1][1])*W[a]*(Cy*Ix*f2), I+66);
	    update((C[1][1])*W[a]*(f2*(Px + Cx*Xij)), I+67);
	    update((C[1][1])*W[a]*(Cx*Iy*f2), I+68);
	    update((C[1][0])*W[a]*(Cx*f2), I+69);
	    update((C[1][0])*W[a]*(Cy*f2), I+70);
	    double f20 = (3*B10 + pow(Cz,2));
	    update((C[1][1])*W[a]*(Cz*f20*(Dy*Yij + Qy)), I+71);
	    update((C[1][1])*W[a]*(Cz*f20*(Dx*Xij + Qx)), I+72);
	    update((C[1][1])*W[a]*(Cz*Dy*Ix*f20), I+73);
	    update((C[1][0])*W[a]*(Cz*Dy*f20), I+74);
	    update((C[1][0])*W[a]*(Cz*Dx*f20), I+75);
	    update((C[1][1])*W[a]*(Cz*Dx*Iy*f20), I+76);
	    update((C[0][1])*W[a]*(Cz*Iy*f20), I+77);
	    update((C[0][0])*W[a]*(Cz*f20), I+78);
	    update((C[0][1])*W[a]*(Cz*Ix*f20), I+79);
	    double f21 = (3*pow(B10,2) + Iy*pow(Cy,3) + 3*B10*Cy*(Yij + 2*Cy));
	    update((C[1][1])*W[a]*(Dz*f21), I+80);
	    update((C[1][1])*W[a]*(Dx*f21), I+81);
	    update((C[0][1])*W[a]*(f21), I+82);
	    double f23 = (Cy*Iy + B10);
	    update((C[1][1])*W[a]*(Cz*Qx*f23), I+83);
	    update((C[0][1])*W[a]*(Cx*Cz*f23), I+84);
	    update((C[1][1])*W[a]*(Cx*Qz*f23), I+85);
	    update((C[1][1])*W[a]*(Dx*Pz*f23), I+86);
	    update((C[0][1])*W[a]*(Pz*f23), I+87);
	    update((C[1][1])*W[a]*(Dz*Px*f23), I+88);
	    update((C[0][1])*W[a]*(Px*f23), I+89);
	    update((C[1][1])*W[a]*(f2*f23), I+90);
	    double f24 = (B10*(3*Cx + Xij) + Ix*pow(Cx,2));
	    update((C[1][1])*W[a]*(Cy*Dz*f24), I+91);
	    update((C[1][1])*W[a]*(Qy*f24), I+92);
	    update((C[1][1])*W[a]*(Cz*Dy*f24), I+93);
	    update((C[0][1])*W[a]*(Cz*f24), I+94);
	    update((C[0][1])*W[a]*(Cy*f24), I+95);
	    update((C[1][1])*W[a]*(Qz*f24), I+96);
	    double f25 = (B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10));
	    update((C[1][1])*W[a]*(Cx*Cz*f25), I+97);
	    update((C[1][1])*W[a]*(Pz*f25), I+98);
	    update((C[1][1])*W[a]*(Px*f25), I+99);
	    double f28 = (2*B00*Cy*Yij + 3*B00*Py + Dy*(B10*(3*Cy + Yij) + Iy*pow(Cy,2)));
	    update((C[1][1])*W[a]*(Cx*f28), I+100);
	    update((C[1][1])*W[a]*(Cz*f28), I+101);
	    double f29 = (B10*(3*Cy + Yij) + Iy*pow(Cy,2));
	    update((C[1][1])*W[a]*(Qx*f29), I+102);
	    update((C[1][1])*W[a]*(Cx*Dz*f29), I+103);
	    update((C[0][1])*W[a]*(Cx*f29), I+104);
	    update((C[1][1])*W[a]*(Qz*f29), I+105);
	    update((C[0][1])*W[a]*(Cz*f29), I+106);
	    update((C[1][1])*W[a]*(Cz*Dx*f29), I+107);
	    double f3 = (3*B00*Pz + Cz*Dz*(3*B10 + pow(Cz,2)));
	    update((C[1][1])*W[a]*(Cy*(f3 + Zij*(Dz*Pz + 2*B00*Cz))), I+108);
	    update((C[1][1])*W[a]*(Cx*(f3 + Zij*(Dz*Pz + 2*B00*Cz))), I+109);
	    update((C[1][1])*W[a]*(Ix*f3), I+110);
	    update((C[1][1])*W[a]*(Iy*f3), I+111);
	    update((C[1][0])*W[a]*(f3), I+112);
	    double f30 = (Cy*Dy*(3*B10 + pow(Cy,2)) + 3*B00*Py);
	    update((C[1][1])*W[a]*(Ix*f30), I+113);
	    update((C[1][1])*W[a]*(Iz*f30), I+114);
	    update((C[1][0])*W[a]*(f30), I+115);
	    double f32 = (Dy*Py + 2*B00*Cy);
	    update((C[1][1])*W[a]*(f32*(Cz*Zij + Pz)), I+116);
	    update((C[1][1])*W[a]*(f32*(Px + Cx*Xij)), I+117);
	    update((C[1][1])*W[a]*(Cz*Ix*f32), I+118);
	    update((C[1][0])*W[a]*(Cz*f32), I+119);
	    update((C[1][1])*W[a]*(Cx*Iz*f32), I+120);
	    update((C[1][0])*W[a]*(Cx*f32), I+121);
	    double f33 = 3*pow(B10,2);
	    double f31 = 3*B00*B10;
	    update((C[1][1])*W[a]*((B00*pow(Cy,2)*(4*Cy + 3*Yij) + f31*Yij + Dy*f33 + Dy*pow(Cy,4) + Dy*Yij*pow(Cy,3) + 4*Cy*f31 + 3*B10*Cy*Dy*(Yij + 2*Cy))), I+122);
	    update((C[1][1])*W[a]*((Dx*f33 + Dx*Xij*pow(Cx,3) + f31*Xij + Dx*pow(Cx,4) + 4*Cx*f31 + B00*pow(Cx,2)*(4*Cx + 3*Xij) + 3*B10*Cx*Dx*(Xij + 2*Cx))), I+123);
	    update((C[1][1])*W[a]*((Dz*f33 + B00*pow(Cz,2)*(4*Cz + 3*Zij) + Dz*pow(Cz,4) + 3*B10*Cz*Dz*(2*Cz + Zij) + 4*Cz*f31 + Dz*Zij*pow(Cz,3) + f31*Zij)), I+124);
	    double f36 = (Iz*pow(Cz,2) + B10*(3*Cz + Zij));
	    update((C[1][1])*W[a]*(Qx*f36), I+125);
	    update((C[1][1])*W[a]*(Cy*Dx*f36), I+126);
	    update((C[0][1])*W[a]*(Cy*f36), I+127);
	    update((C[1][1])*W[a]*(Qy*f36), I+128);
	    update((C[0][1])*W[a]*(Cx*f36), I+129);
	    update((C[1][1])*W[a]*(Cx*Dy*f36), I+130);
	    double f4 = (3*pow(B10,2) + 3*B10*Cx*(Xij + 2*Cx) + Ix*pow(Cx,3));
	    update((C[1][1])*W[a]*(Dz*f4), I+131);
	    update((C[1][1])*W[a]*(Dy*f4), I+132);
	    update((C[0][1])*W[a]*(f4), I+133);
	    double f5 = (Dx*Px + 2*B00*Cx);
	    update((C[1][1])*W[a]*(Cy*Cz*(Qx*Xij + f5)), I+134);
	    update((C[1][1])*W[a]*(Py*(Qx*Xij + f5)), I+135);
	    update((C[1][1])*W[a]*(Pz*(Qx*Xij + f5)), I+136);
	    update((C[1][1])*W[a]*(f5*(Cz*Zij + Pz)), I+137);
	    update((C[1][1])*W[a]*(f23*f5), I+138);
	    update((C[1][1])*W[a]*(Cy*Iz*f5), I+139);
	    update((C[1][0])*W[a]*(Cy*f5), I+140);
	    update((C[1][0])*W[a]*(Cz*f5), I+141);
	    update((C[1][1])*W[a]*(Cz*Iy*f5), I+142);
	    double f7 = (3*pow(B10,2) + 3*B10*Cz*(2*Cz + Zij) + Iz*pow(Cz,3));
	    update((C[1][1])*W[a]*(Dx*f7), I+143);
	    update((C[1][1])*W[a]*(Dy*f7), I+144);
	    update((C[0][1])*W[a]*(f7), I+145);
	    double f8 = (Cx*Dx*(3*B10 + pow(Cx,2)) + 3*B00*Px);
	    update((C[1][1])*W[a]*(Cz*(Xij*(Dx*Px + 2*B00*Cx) + f8)), I+146);
	    update((C[1][1])*W[a]*(Cy*(Xij*(Dx*Px + 2*B00*Cx) + f8)), I+147);
	    update((C[1][1])*W[a]*(Iz*f8), I+148);
	    update((C[1][1])*W[a]*(Iy*f8), I+149);
	    update((C[1][0])*W[a]*(f8), I+150);
	    double f9 = (3*B10 + pow(Cy,2));
	    update((C[1][1])*W[a]*(Cy*f9*(Dx*Xij + Qx)), I+151);
	    update((C[1][1])*W[a]*(Cy*f12*f9), I+152);
	    update((C[1][1])*W[a]*(Cy*Dx*Iz*f9), I+153);
	    update((C[1][0])*W[a]*(Cy*Dx*f9), I+154);
	    update((C[1][0])*W[a]*(Cy*Dz*f9), I+155);
	    update((C[1][1])*W[a]*(Cy*Dz*Ix*f9), I+156);
	    update((C[0][1])*W[a]*(Cy*Ix*f9), I+157);
	    update((C[0][0])*W[a]*(Cy*f9), I+158);
	    update((C[0][1])*W[a]*(Cy*Iz*f9), I+159);
#undef pow

	}

    }

    /**
     * Permutes the 160 values of @p I in place.  The value stored at
     * position k on entry is moved to position target[k].
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[160]) {
	const unsigned short target[160] = {
	    9, 84, 24, 4, 33, 3, 123, 125, 46, 16,
	    6, 35, 5, 87, 27, 7, 18, 8, 48, 49,
	    67, 47, 45, 75, 89, 113, 83, 88, 98, 136,
	    129, 144, 124, 126, 139, 19, 99, 56, 58, 97,
	    17, 15, 135, 104, 107, 119, 76, 79, 39, 114,
	    34, 36, 100, 140, 120, 80, 110, 30, 0, 20,
	    150, 153, 155, 159, 154, 156, 138, 137, 147, 127,
	    128, 102, 52, 92, 82, 42, 62, 22, 2, 12,
	    141, 61, 21, 69, 29, 149, 68, 28, 143, 23,
	    148, 133, 93, 94, 14, 13, 134, 109, 108, 103,
	    105, 106, 65, 145, 25, 146, 26, 66, 158, 157,
	    132, 142, 122, 91, 111, 81, 116, 95, 96, 86,
	    115, 85, 101, 50, 152, 77, 78, 38, 118, 37,
	    117, 130, 90, 10, 59, 55, 57, 74, 63, 73,
	    43, 44, 64, 72, 112, 32, 54, 53, 70, 60,
	    40, 51, 151, 71, 41, 121, 131, 11, 1, 31
	};
	// Snapshot the input so no slot is overwritten before it is read.
	double saved[160];
	for (int k = 0; k < 160; ++k) {
	    saved[k] = I[k];
	}
	for (int k = 0; k < 160; ++k) {
	    I[target[k]] = saved[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[160] = { 58, 158, 78, 5, 3, 12, 10, 15, 17, 0, 133, 157, 79, 95, 94, 41, 9, 40, 16, 35, 59, 82, 77, 89, 2, 104, 106, 14, 87, 84, 57, 159, 145, 4, 50, 11, 51, 129, 127, 48, 150, 154, 75, 140, 141, 22, 8, 21, 18, 19, 123, 151, 72, 147, 146, 135, 37, 136, 38, 134, 149, 81, 76, 138, 142, 102, 107, 20, 86, 83, 148, 153, 143, 139, 137, 23, 46, 125, 126, 47, 55, 115, 74, 26, 1, 121, 119, 13, 27, 24, 132, 113, 73, 92, 93, 117, 118, 39, 28, 36, 52, 122, 71, 99, 43, 100, 101, 44, 98, 97, 56, 114, 144, 25, 49, 120, 116, 130, 128, 45, 54, 155, 112, 6, 32, 7, 33, 69, 70, 30, 131, 156, 110, 91, 96, 42, 29, 67, 66, 34, 53, 80, 111, 88, 31, 103, 105, 68, 90, 85, 60, 152, 124, 61, 64, 62, 65, 109, 108, 63 };
// 	if (index < 160) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Looks up entry @p i of the 160-entry permutation table.
     * No range check is performed; @p i must lie in [0, 160).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot_of[] = {
	    9, 84, 24, 4, 33, 3, 123, 125, 46, 16,
	    6, 35, 5, 87, 27, 7, 18, 8, 48, 49,
	    67, 47, 45, 75, 89, 113, 83, 88, 98, 136,
	    129, 144, 124, 126, 139, 19, 99, 56, 58, 97,
	    17, 15, 135, 104, 107, 119, 76, 79, 39, 114,
	    34, 36, 100, 140, 120, 80, 110, 30, 0, 20,
	    150, 153, 155, 159, 154, 156, 138, 137, 147, 127,
	    128, 102, 52, 92, 82, 42, 62, 22, 2, 12,
	    141, 61, 21, 69, 29, 149, 68, 28, 143, 23,
	    148, 133, 93, 94, 14, 13, 134, 109, 108, 103,
	    105, 106, 65, 145, 25, 146, 26, 66, 158, 157,
	    132, 142, 122, 91, 111, 81, 116, 95, 96, 86,
	    115, 85, 101, 50, 152, 77, 78, 38, 118, 37,
	    117, 130, 90, 10, 59, 55, 57, 74, 63, 73,
	    43, 44, 64, 72, 112, 32, 54, 53, 70, 60,
	    40, 51, 151, 71, 41, 121, 131, 11, 1, 31
	};
	return slot_of[i];
    }

    /**
     * Stores the 160 permutation-table entries, in order, through @p idx.
     * The caller must provide room for 160 elements of type U.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int entries[160] = {
	    9, 84, 24, 4, 33, 3, 123, 125, 46, 16,
	    6, 35, 5, 87, 27, 7, 18, 8, 48, 49,
	    67, 47, 45, 75, 89, 113, 83, 88, 98, 136,
	    129, 144, 124, 126, 139, 19, 99, 56, 58, 97,
	    17, 15, 135, 104, 107, 119, 76, 79, 39, 114,
	    34, 36, 100, 140, 120, 80, 110, 30, 0, 20,
	    150, 153, 155, 159, 154, 156, 138, 137, 147, 127,
	    128, 102, 52, 92, 82, 42, 62, 22, 2, 12,
	    141, 61, 21, 69, 29, 149, 68, 28, 143, 23,
	    148, 133, 93, 94, 14, 13, 134, 109, 108, 103,
	    105, 106, 65, 145, 25, 146, 26, 66, 158, 157,
	    132, 142, 122, 91, 111, 81, 116, 95, 96, 86,
	    115, 85, 101, 50, 152, 77, 78, 38, 118, 37,
	    117, 130, 90, 10, 59, 55, 57, 74, 63, 73,
	    43, 44, 64, 72, 112, 32, 54, 53, 70, 60,
	    40, 51, 151, 71, 41, 121, 131, 11, 1, 31
	};
	for (int k = 0; k < 160; ++k) {
	    *idx++ = static_cast<int>(entries[k]);
	}
    }


};

/**
 * Quadrature kernel specialization for meta::braket<SP, S, S, D>.
 *
 * Accumulates 24 values into I in "evaluation order"; reorder() / index()
 * describe the permutation from evaluation order to the final slot order.
 * NOTE(review): N presumably is the number of quadrature roots -- the
 * fixed-size eval overload runs exactly eval<2>; confirm against callers.
 */
template<>
struct impl<meta::braket<rysq::SP, rysq::S, rysq::S, rysq::D> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulator: adds the contribution q onto the value at Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /** Fixed-size entry point: M = 2 evaluation with the default accumulator. */
    template<
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double (&I)[24]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Pointer entry point: M loop iterations with the default accumulator. */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core evaluation. For each a in [0, M) the 24 terms are built from
     * t2[a], W[a] and the recurrence coefficients, then handed to
     * update(term, I + slot) in slot order 0..23.
     * Terms are scaled by either C[0][0]*W[a] or C[0][1]*W[a].
     * rij is accepted for interface uniformity but not read here.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double t = t2[a];

	    // Weights shared by every term of this iteration.
	    const double ws = C[0][0]*W[a];
	    const double wp = C[0][1]*W[a];

	    const double B00 = 0.5*t;
	    const double B01 = recurrence::coefficient(1.0/B, A, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    const double Kx = Xkl + Dx;
	    const double Ky = Ykl + Dy;
	    const double Kz = Dz + Zkl;

	    // Sub-expressions reused by several terms.
	    const double Gx = recurrence::pow<2>(Kx) + B01;
	    const double Gy = recurrence::pow<2>(Ky) + B01;
	    const double Gz = recurrence::pow<2>(Kz) + B01;
	    const double Hx = B00 + Cx*Kx;
	    const double Hy = B00 + Cy*Ky;
	    const double Hz = B00 + Cz*Kz;

	    update(wp*(2*B00*Kx + Cx*Gx), I+0);
	    update(wp*(2*B00*Ky + Cy*Gy), I+1);
	    update(wp*(Cz*Kx*Ky), I+2);
	    update(ws*(Kx*Ky), I+3);
	    update(wp*(2*B00*Kz + Cz*Gz), I+4);
	    update(wp*(Cy*Kx*Kz), I+5);
	    update(ws*(Kx*Kz), I+6);
	    update(wp*(Cx*Ky*Kz), I+7);
	    update(ws*(Ky*Kz), I+8);
	    update(wp*(Cy*Gz), I+9);
	    update(wp*(Cx*Gz), I+10);
	    update(ws*(Gz), I+11);
	    update(wp*(Cz*Gx), I+12);
	    update(wp*(Cy*Gx), I+13);
	    update(ws*(Gx), I+14);
	    update(wp*(Kx*Hy), I+15);
	    update(wp*(Kz*Hy), I+16);
	    update(wp*(Kz*Hx), I+17);
	    update(wp*(Ky*Hx), I+18);
	    update(wp*(Cx*Gy), I+19);
	    update(wp*(Cz*Gy), I+20);
	    update(ws*(Gy), I+21);
	    update(wp*(Kx*Hz), I+22);
	    update(wp*(Ky*Hz), I+23);

	}

    }

    /**
     * Permutes I in place from evaluation order to final order:
     * the value at evaluation slot k ends up at slot index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[24]) {
	double scratch[24];
	for (int k = 0; k < 24; ++k) {
	    scratch[k] = I[k];
	}
	for (int k = 0; k < 24; ++k) {
	    I[index(k)] = scratch[k];
	}
    }

    /** Final slot of evaluation slot i. No range check: i must be in [0, 24). */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short table[] = {
	    1, 6, 15, 12, 11, 18, 16, 21, 20, 10, 9, 8, 3, 2, 0, 14, 22, 17, 13, 5, 7, 4, 19, 23
	};
	return table[i];
    }

    /** Writes the whole 24-entry permutation into idx[0..23]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 24; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Quadrature kernel specialization for meta::braket<D, SP, S, S>.
 *
 * Accumulates 24 values into I in "evaluation order"; reorder() / index()
 * describe the permutation from evaluation order to the final slot order.
 * NOTE(review): N presumably is the number of quadrature roots -- the
 * fixed-size eval overload runs exactly eval<2>; confirm against callers.
 */
template<>
struct impl<meta::braket<rysq::D, rysq::SP, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulator: adds the contribution q onto the value at Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /** Fixed-size entry point: M = 2 evaluation with the default accumulator. */
    template<
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double (&I)[24]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Pointer entry point: M loop iterations with the default accumulator. */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core evaluation. For each a in [0, M) the 24 terms are built from
     * t2[a], W[a] and the recurrence coefficients, then handed to
     * update(term, I + slot) in slot order 0..23.
     * Terms are scaled by either C[0][0]*W[a] or C[0][1]*W[a].
     * rBk and rkl are accepted for interface uniformity but not read here.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double t = t2[a];

	    // Weights shared by every term of this iteration.
	    const double ws = C[0][0]*W[a];
	    const double wp = C[0][1]*W[a];

	    const double B10 = recurrence::coefficient(1.0/A, B, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Ix = Cx + Xij;
	    const double Iy = Cy + Yij;
	    const double Iz = Cz + Zij;

	    // Sub-expressions reused by several terms.
	    const double Cx2 = recurrence::pow<2>(Cx);
	    const double Cy2 = recurrence::pow<2>(Cy);
	    const double Cz2 = recurrence::pow<2>(Cz);
	    const double Px = B10 + Cx2;
	    const double Py = B10 + Cy2;
	    const double Pz = B10 + Cz2;
	    const double Jx = Cx*Ix + B10;
	    const double Jy = Cy*Iy + B10;
	    const double Jz = B10 + Cz*Iz;

	    update(ws*(Cx*Cy), I+0);
	    update(ws*(Cx*Cz), I+1);
	    update(ws*(Cy*Cz), I+2);
	    update(wp*(Cy*Cz*Ix), I+3);
	    update(wp*(Cy*Jx), I+4);
	    update(wp*(Cz*Jx), I+5);
	    update(wp*(Cx*Cz*Iy), I+6);
	    update(wp*(Cz*Jy), I+7);
	    update(wp*(Cx*Jy), I+8);
	    update(wp*(Cy*Jz), I+9);
	    update(wp*(Cx*Jz), I+10);
	    update(wp*(Cx*Cy*Iz), I+11);
	    update(wp*(Iy*Px), I+12);
	    update(ws*(Px), I+13);
	    update(wp*(Iz*Px), I+14);
	    update(wp*(Iz*Py), I+15);
	    update(wp*(Ix*Py), I+16);
	    update(ws*(Py), I+17);
	    update(wp*(Iy*Pz), I+18);
	    update(wp*(Ix*Pz), I+19);
	    update(ws*(Pz), I+20);
	    update(wp*(B10*(3*Cx + Xij) + Ix*Cx2), I+21);
	    update(wp*(B10*(3*Cy + Yij) + Iy*Cy2), I+22);
	    update(wp*(Iz*Cz2 + B10*(3*Cz + Zij)), I+23);

	}

    }

    /**
     * Permutes I in place from evaluation order to final order:
     * the value at evaluation slot k ends up at slot index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[24]) {
	double scratch[24];
	for (int k = 0; k < 24; ++k) {
	    scratch[k] = I[k];
	}
	for (int k = 0; k < 24; ++k) {
	    I[index(k)] = scratch[k];
	}
    }

    /** Final slot of evaluation slot i. No range check: i must be in [0, 24). */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short table[] = {
	    3, 4, 5, 11, 9, 10, 16, 17, 15, 23, 22, 21, 12, 0, 18, 19, 7, 1, 14, 8, 2, 6, 13, 20
	};
	return table[i];
    }

    /** Writes the whole 24-entry permutation into idx[0..23]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 24; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Quadrature kernel specialization for meta::braket<SP, S, P, P>.
 *
 * Accumulates 36 values into I in "evaluation order"; reorder() / index()
 * describe the permutation from evaluation order to the final slot order.
 * NOTE(review): N presumably is the number of quadrature roots -- the
 * fixed-size eval overload runs exactly eval<2>; confirm against callers.
 */
template<>
struct impl<meta::braket<rysq::SP, rysq::S, rysq::P, rysq::P> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulator: adds the contribution q onto the value at Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /** Fixed-size entry point: M = 2 evaluation with the default accumulator. */
    template<
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double (&I)[36]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Pointer entry point: M loop iterations with the default accumulator. */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core evaluation. For each a in [0, M) the 36 terms are built from
     * t2[a], W[a] and the recurrence coefficients, then handed to
     * update(term, I + slot) in slot order 0..35.
     * Terms are scaled by either C[0][0]*W[a] or C[0][1]*W[a].
     * rij is accepted for interface uniformity but not read here.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double t = t2[a];

	    // Weights shared by every term of this iteration.
	    const double ws = C[0][0]*W[a];
	    const double wp = C[0][1]*W[a];

	    const double B00 = 0.5*t;
	    const double B01 = recurrence::coefficient(1.0/B, A, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    const double Kx = Xkl + Dx;
	    const double Ky = Ykl + Dy;
	    const double Kz = Dz + Zkl;
	    const double Qx = Cx*Dx + B00;
	    const double Qy = Cy*Dy + B00;
	    const double Qz = Cz*Dz + B00;

	    // Sub-expressions reused by several terms.
	    const double Sx = B01 + Dx*Kx;
	    const double Sy = B01 + Dy*Ky;
	    const double Sz = Dz*Kz + B01;
	    const double Tz = Cz*Zkl + Qz;
	    const double Hx = B00 + Cx*Kx;
	    const double Hy = B00 + Cy*Ky;

	    update(wp*(Cy*Sx), I+0);
	    update(ws*(Sx), I+1);
	    update(wp*(Cz*Sx), I+2);
	    update(wp*(Cy*Dz*Kx), I+3);
	    update(ws*(Dz*Kx), I+4);
	    update(wp*(Cz*Dy*Kx), I+5);
	    update(ws*(Dy*Kx), I+6);
	    update(wp*(Cx*Sy), I+7);
	    update(ws*(Sy), I+8);
	    update(wp*(Cz*Sy), I+9);
	    update(wp*(Cz*Dx*Ky), I+10);
	    update(wp*(Cx*Dz*Ky), I+11);
	    update(ws*(Dz*Ky), I+12);
	    update(ws*(Dx*Ky), I+13);
	    update(wp*(Cx*Dy*Kz), I+14);
	    update(wp*(Cy*Sz), I+15);
	    update(wp*(Cx*Sz), I+16);
	    update(ws*(Sz), I+17);
	    update(wp*(Cy*Dx*Kz), I+18);
	    update(ws*(Dx*Kz), I+19);
	    update(ws*(Dy*Kz), I+20);
	    update(wp*(Ky*Qx), I+21);
	    update(wp*(Kz*Qx), I+22);
	    update(wp*(Kx*Qy), I+23);
	    update(wp*(Kz*Qy), I+24);
	    update(wp*(Kx*Qz), I+25);
	    update(wp*(Ky*Qz), I+26);
	    update(wp*(B01*Cx + Cx*Dx*Kx + B00*(Xkl + 2*Dx)), I+27);
	    update(wp*(Cy*Dy*Ky + B01*Cy + B00*(Ykl + 2*Dy)), I+28);
	    update(wp*(Dx*Tz), I+29);
	    update(wp*(Dy*Tz), I+30);
	    update(wp*(B01*Cz + B00*(2*Dz + Zkl) + Cz*Dz*Kz), I+31);
	    update(wp*(Dx*Hy), I+32);
	    update(wp*(Dz*Hy), I+33);
	    update(wp*(Dz*Hx), I+34);
	    update(wp*(Dy*Hx), I+35);

	}

    }

    /**
     * Permutes I in place from evaluation order to final order:
     * the value at evaluation slot k ends up at slot index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[36]) {
	double scratch[36];
	for (int k = 0; k < 36; ++k) {
	    scratch[k] = I[k];
	}
	for (int k = 0; k < 36; ++k) {
	    I[index(k)] = scratch[k];
	}
    }

    /** Final slot of evaluation slot i. No range check: i must be in [0, 36). */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short table[] = {
	    2, 0, 3, 10, 8, 7, 4, 17, 16, 19, 15, 21, 20, 12, 29, 34, 33, 32, 26, 24, 28, 13, 25, 6, 30, 11, 23, 1, 18, 27, 31, 35, 14, 22, 9, 5
	};
	return table[i];
    }

    /** Writes the whole 36-entry permutation into idx[0..35]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 36; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Quadrature kernel specialization for meta::braket<F, SP, S, S>.
 *
 * Accumulates 40 values into I in "evaluation order"; reorder() / index()
 * describe the permutation from evaluation order to the final slot order.
 * NOTE(review): N presumably is the number of quadrature roots -- the
 * fixed-size eval overload runs exactly eval<3>; confirm against callers.
 */
template<>
struct impl<meta::braket<rysq::F, rysq::SP, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 3;

    /** Default accumulator: adds the contribution q onto the value at Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /** Fixed-size entry point: M = 3 evaluation with the default accumulator. */
    template<
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double (&I)[40]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Pointer entry point: M loop iterations with the default accumulator. */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core evaluation. For each a in [0, M) the 40 terms are built from
     * t2[a], W[a] and the recurrence coefficients, then handed to
     * update(term, I + slot) in slot order 0..39.
     * Terms are scaled by either C[0][0]*W[a] or C[0][1]*W[a].
     * rBk and rkl are accepted for interface uniformity but not read here.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double t = t2[a];

	    // Weights shared by every term of this iteration.
	    const double ws = C[0][0]*W[a];
	    const double wp = C[0][1]*W[a];

	    const double B10 = recurrence::coefficient(1.0/A, B, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Ix = Cx + Xij;
	    const double Iy = Cy + Yij;
	    const double Iz = Cz + Zij;

	    // Sub-expressions reused by several terms.
	    const double Cx2 = recurrence::pow<2>(Cx);
	    const double Cy2 = recurrence::pow<2>(Cy);
	    const double Cz2 = recurrence::pow<2>(Cz);
	    const double Px = B10 + Cx2;
	    const double Py = B10 + Cy2;
	    const double Pz = B10 + Cz2;
	    const double Ux = Px + Cx*Xij;
	    const double Uz = Cz*Zij + Pz;
	    const double Fx = 3*B10 + Cx2;
	    const double Fy = 3*B10 + Cy2;
	    const double Fz = 3*B10 + Cz2;
	    const double Vx = B10*(3*Cx + Xij) + Ix*Cx2;
	    const double Vy = B10*(3*Cy + Yij) + Iy*Cy2;
	    const double Vz = Iz*Cz2 + B10*(3*Cz + Zij);
	    const double Jy = Cy*Iy + B10;
	    const double B10sq3 = 3*recurrence::pow<2>(B10);

	    update(ws*(Cx*Cy*Cz), I+0);
	    update(wp*(Cz*Iy*Px), I+1);
	    update(ws*(Cz*Px), I+2);
	    update(ws*(Cy*Px), I+3);
	    update(wp*(Cy*Iz*Px), I+4);
	    update(wp*(Cz*Ix*Py), I+5);
	    update(ws*(Cz*Py), I+6);
	    update(wp*(Cx*Iz*Py), I+7);
	    update(ws*(Cx*Py), I+8);
	    update(wp*(Cy*Ix*Pz), I+9);
	    update(ws*(Cy*Pz), I+10);
	    update(ws*(Cx*Pz), I+11);
	    update(wp*(Cx*Iy*Pz), I+12);
	    update(wp*(Cy*Cz*Ux), I+13);
	    update(wp*(Py*Ux), I+14);
	    update(wp*(Pz*Ux), I+15);
	    update(wp*(Cx*Cy*Uz), I+16);
	    update(wp*(Px*Uz), I+17);
	    update(wp*(Py*Uz), I+18);
	    update(wp*(Cx*Iz*Fx), I+19);
	    update(wp*(Cx*Iy*Fx), I+20);
	    update(ws*(Cx*Fx), I+21);
	    update(wp*(Cz*Iy*Fz), I+22);
	    update(wp*(Cz*Ix*Fz), I+23);
	    update(ws*(Cz*Fz), I+24);
	    update(wp*(Cy*Iz*Fy), I+25);
	    update(ws*(Cy*Fy), I+26);
	    update(wp*(Cy*Ix*Fy), I+27);
	    update(wp*(Cx*Vz), I+28);
	    update(wp*(Cy*Vz), I+29);
	    update(wp*(Cx*Vy), I+30);
	    update(wp*(Cz*Vy), I+31);
	    update(wp*(Cx*Cz*Jy), I+32);
	    update(wp*(Px*Jy), I+33);
	    update(wp*(Pz*Jy), I+34);
	    update(wp*(Cy*Vx), I+35);
	    update(wp*(Cz*Vx), I+36);
	    update(wp*(3*B10*Cx*(Xij + 2*Cx) + Ix*recurrence::pow<3>(Cx) + B10sq3), I+37);
	    update(wp*(Iy*recurrence::pow<3>(Cy) + B10sq3 + 3*B10*Cy*(Yij + 2*Cy)), I+38);
	    update(wp*(3*B10*Cz*(2*Cz + Zij) + B10sq3 + Iz*recurrence::pow<3>(Cz)), I+39);

	}

    }

    /**
     * Permutes I in place from evaluation order to final order:
     * the value at evaluation slot k ends up at slot index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[40]) {
	double scratch[40];
	for (int k = 0; k < 40; ++k) {
	    scratch[k] = I[k];
	}
	for (int k = 0; k < 40; ++k) {
	    I[index(k)] = scratch[k];
	}
    }

    /** Final slot of evaluation slot i. No range check: i must be in [0, 40). */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short table[] = {
	    9, 24, 4, 3, 33, 16, 6, 35, 5, 18, 8, 7, 27, 19, 15, 17, 39, 34, 36, 30, 20, 0, 22, 12, 2, 31, 1, 11, 37, 38, 25, 26, 29, 23, 28, 13, 14, 10, 21, 32
	};
	return table[i];
    }

    /** Writes the whole 40-entry permutation into idx[0..39]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 40; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Quadrature kernel specialization for meta::braket<S, S, D, P>.
 *
 * Accumulates 18 values into I in "evaluation order"; reorder() / index()
 * describe the permutation from evaluation order to the final slot order.
 * NOTE(review): N presumably is the number of quadrature roots -- the
 * fixed-size eval overload runs exactly eval<2>; confirm against callers.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::D, rysq::P> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulator: adds the contribution q onto the value at Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /** Fixed-size entry point: M = 2 evaluation with the default accumulator. */
    template<
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[18]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Pointer entry point: M loop iterations with the default accumulator. */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core evaluation. For each a in [0, M) the 18 terms are built from
     * t2[a], W[a] and the recurrence coefficients, then handed to
     * update(term, I + slot) in slot order 0..17.
     * Every term is scaled by C[0][0]*W[a].
     * rAi and rij are accepted for interface uniformity but not read here.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double t = t2[a];

	    // Single weight shared by every term of this iteration.
	    const double w = C[0][0]*W[a];

	    const double B01 = recurrence::coefficient(1.0/B, A, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    const double Kx = Xkl + Dx;
	    const double Ky = Ykl + Dy;
	    const double Kz = Dz + Zkl;

	    // Sub-expressions reused by several terms.
	    const double Dx2 = recurrence::pow<2>(Dx);
	    const double Dy2 = recurrence::pow<2>(Dy);
	    const double Dz2 = recurrence::pow<2>(Dz);
	    const double Rx = B01 + Dx2;
	    const double Ry = B01 + Dy2;
	    const double Rz = Dz2 + B01;
	    const double Ux = Rx + Dx*Xkl;
	    const double Sy = B01 + Dy*Ky;
	    const double Sz = Dz*Kz + B01;

	    update(w*(Dy*Dz*Kx), I+0);
	    update(w*(Dx*Dz*Ky), I+1);
	    update(w*(Dx*Dy*Kz), I+2);
	    update(w*(Ky*Rx), I+3);
	    update(w*(Kz*Rx), I+4);
	    update(w*(Kx*Ry), I+5);
	    update(w*(Kz*Ry), I+6);
	    update(w*(Kx*Rz), I+7);
	    update(w*(Ky*Rz), I+8);
	    update(w*(Kx*Dx2 + B01*(3*Dx + Xkl)), I+9);
	    update(w*(Dz*Ux), I+10);
	    update(w*(Dy*Ux), I+11);
	    update(w*(B01*(3*Dy + Ykl) + Ky*Dy2), I+12);
	    update(w*(B01*(3*Dz + Zkl) + Kz*Dz2), I+13);
	    update(w*(Dz*Sy), I+14);
	    update(w*(Dx*Sy), I+15);
	    update(w*(Dx*Sz), I+16);
	    update(w*(Dy*Sz), I+17);

	}

    }

    /**
     * Permutes I in place from evaluation order to final order:
     * the value at evaluation slot k ends up at slot index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[18]) {
	double scratch[18];
	for (int k = 0; k < 18; ++k) {
	    scratch[k] = I[k];
	}
	for (int k = 0; k < 18; ++k) {
	    I[index(k)] = scratch[k];
	}
    }

    /** Final slot of evaluation slot i. No range check: i must be in [0, 18). */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short table[] = {
	    5, 10, 15, 6, 12, 1, 13, 2, 8, 0, 4, 3, 7, 14, 11, 9, 16, 17
	};
	return table[i];
    }

    /** Writes the whole 18-entry permutation into idx[0..17]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 18; ++k) {
	    idx[k] = index(k);
	}
    }


};

template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::F, rysq::SP> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<3> &t2, const vector<3> &W,
    // 		     const double (&C)[2][1],
	      double (&I)[40]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel: loops over M points and hands 40 values to `update`.
     *
     * For every a in [0, M) the loop body computes
     *   B01 = 0.5*(1/B)*(1 - A*t2[a])
     *   Dq  = rBk[q] + A*rAB[q]*t2[a]      (q = 0,1,2 -> x,y,z)
     *   Kq  = Dq + rkl[q]
     *   Rq  = B01 + Dq^2
     * and then calls update(value, I + k) once for each k = 0..39, where
     * value is W[a] times C[0][0] or C[1][0] times a polynomial in the
     * quantities above.
     *
     * @tparam M      number of points processed (loop trip count).
     * @tparam U      type of the functor called as update(value, I + k).
     * @param A, B    scalars entering the recurrence coefficients.
     * @param rAi     not read by this specialization.
     * @param rBk     indexed with 0..2; used to build Dx, Dy, Dz.
     * @param rAB     indexed with 0..2; used to build Dx, Dy, Dz.
     * @param rij     not read by this specialization.
     * @param rkl     indexed with 0..2; supplies Xkl, Ykl, Zkl.
     * @param t2, W   indexed with 0..M-1.  NOTE(review): presumably the
     *                quadrature roots and weights -- confirm with caller.
     * @param C       2x1 coefficient table; C[0][0] and C[1][0] are used.
     * @param I       output buffer; I+0 .. I+39 are passed to update.
     * @param update  functor receiving (value, destination pointer).
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {


	// Components of rkl, bound once outside the loop.
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


	// Compiler-specific loop hints: vectorization pragmas for the Intel
	// compiler, unrolling pragma when compiled by nvcc.
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // B01 = 0.5*(1/B)*(1 - A*t2[a])
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);


	    // Dq = rBk[q] - (-A)*rAB[q]*t2[a]
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Local macro: pow(x, n) in the generated expressions expands to the
	    // compile-time recurrence::pow<n>(x).  Removed again by the #undef
	    // at the end of the loop body.
#define pow(x,y) recurrence::pow<y>((x))

	    // Shared sub-expressions reused by several terms below.
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);
 	    double Rx = (B01 + pow(Dx,2));
 	    double Ry = (B01 + pow(Dy,2));
 	    double Rz = (pow(Dz,2) + B01);

	    // One update per output slot; the f* temporaries are further shared
	    // factors introduced right before their first use.
	    update((C[0][0])*W[a]*(Dx*Dy*Dz), I+0);
	    update((C[1][0])*W[a]*(Dz*Ky*Rx), I+1);
	    update((C[0][0])*W[a]*(Dz*Rx), I+2);
	    update((C[1][0])*W[a]*(Dy*Kz*Rx), I+3);
	    update((C[0][0])*W[a]*(Dy*Rx), I+4);
	    update((C[1][0])*W[a]*(Dx*Kz*Ry), I+5);
	    update((C[1][0])*W[a]*(Dz*Kx*Ry), I+6);
	    update((C[0][0])*W[a]*(Dz*Ry), I+7);
	    update((C[0][0])*W[a]*(Dx*Ry), I+8);
	    update((C[0][0])*W[a]*(Dy*Rz), I+9);
	    update((C[1][0])*W[a]*(Dy*Kx*Rz), I+10);
	    update((C[0][0])*W[a]*(Dx*Rz), I+11);
	    update((C[1][0])*W[a]*(Dx*Ky*Rz), I+12);
	    update((C[1][0])*W[a]*(Dy*Dz*(Rx + Dx*Xkl)), I+13);
	    update((C[1][0])*W[a]*(Rz*(Rx + Dx*Xkl)), I+14);
	    update((C[1][0])*W[a]*(Ry*(Rx + Dx*Xkl)), I+15);
	    update((C[1][0])*W[a]*(Rx*(Ry + Dy*Ykl)), I+16);
	    update((C[1][0])*W[a]*(Rz*(Ry + Dy*Ykl)), I+17);
	    update((C[1][0])*W[a]*(Dx*Dz*(Ry + Dy*Ykl)), I+18);
	    double f1 = 3*pow(B01,2);
	    update((C[1][0])*W[a]*((Ky*pow(Dy,3) + f1 + 3*B01*Dy*(Ykl + 2*Dy))), I+19);
	    update((C[1][0])*W[a]*((Kx*pow(Dx,3) + 3*B01*Dx*(Xkl + 2*Dx) + f1)), I+20);
	    update((C[1][0])*W[a]*((Kz*pow(Dz,3) + 3*B01*Dz*(2*Dz + Zkl) + f1)), I+21);
	    double f10 = (3*B01 + pow(Dx,2));
	    update((C[1][0])*W[a]*(Dx*Kz*f10), I+22);
	    update((C[1][0])*W[a]*(Dx*Ky*f10), I+23);
	    update((C[0][0])*W[a]*(Dx*f10), I+24);
	    double f12 = (Kx*pow(Dx,2) + B01*(3*Dx + Xkl));
	    update((C[1][0])*W[a]*(Dz*f12), I+25);
	    update((C[1][0])*W[a]*(Dy*f12), I+26);
	    double f14 = (B01*(3*Dy + Ykl) + Ky*pow(Dy,2));
	    update((C[1][0])*W[a]*(Dx*f14), I+27);
	    update((C[1][0])*W[a]*(Dz*f14), I+28);
	    double f15 = (B01*(3*Dz + Zkl) + Kz*pow(Dz,2));
	    update((C[1][0])*W[a]*(Dx*f15), I+29);
	    update((C[1][0])*W[a]*(Dy*f15), I+30);
	    double f4 = (pow(Dz,2) + 3*B01);
	    update((C[1][0])*W[a]*(Dz*Ky*f4), I+31);
	    update((C[1][0])*W[a]*(Dz*Kx*f4), I+32);
	    update((C[0][0])*W[a]*(Dz*f4), I+33);
	    double f6 = (3*B01 + pow(Dy,2));
	    update((C[1][0])*W[a]*(Dy*Kx*f6), I+34);
	    update((C[0][0])*W[a]*(Dy*f6), I+35);
	    update((C[1][0])*W[a]*(Dy*Kz*f6), I+36);
	    double f7 = (Dz*Kz + B01);
	    update((C[1][0])*W[a]*(Dx*Dy*f7), I+37);
	    update((C[1][0])*W[a]*(Rx*f7), I+38);
	    update((C[1][0])*W[a]*(Ry*f7), I+39);
#undef pow

	}

    }

    /**
     * Permutes the 40 entries of I in place.
     *
     * The value found at position k on entry is stored at position
     * destination[k] on return; the table holds the same numbers that
     * index(int) returns.  A full copy of I is taken first so that every
     * source value is read before any slot is overwritten.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[40]) {
	const unsigned short destination[40] = {
	    9, 24, 4, 33, 3, 35, 16, 6, 5, 8,
	    18, 7, 27, 19, 17, 15, 23, 28, 29, 21,
	    10, 32, 30, 20, 0, 14, 13, 25, 26, 37,
	    38, 22, 12, 2, 11, 1, 31, 39, 34, 36
	};
	double snapshot[40];
	for (int k = 0; k < 40; ++k) {
	    snapshot[k] = I[k];
	}
	for (int k = 0; k < 40; ++k) {
	    I[destination[k]] = snapshot[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[40] = { 24, 35, 33, 4, 2, 8, 7, 11, 9, 0, 20, 34, 32, 26, 25, 15, 6, 14, 10, 13, 23, 19, 31, 16, 1, 27, 28, 12, 17, 18, 22, 36, 21, 3, 38, 5, 39, 29, 30, 37 };
// 	if (index < 40) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Looks up one entry of the 40-element permutation table: the result
     * for i is the position to which reorder() moves the entry at i.
     * No range check is performed; i must lie in [0, 40).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[40] = {
	    9, 24, 4, 33, 3, 35, 16, 6, 5, 8,
	    18, 7, 27, 19, 17, 15, 23, 28, 29, 21,
	    10, 32, 30, 20, 0, 14, 13, 25, 26, 37,
	    38, 22, 12, 2, 11, 1, 31, 39, 34, 36
	};
	return permutation[i];
    }

    /**
     * Writes the complete 40-entry permutation table to idx[0..39]
     * (the same values, in the same order, that index(int) returns).
     * The caller must provide room for 40 elements of type U.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	idx[0] = 9;
	idx[1] = 24;
	idx[2] = 4;
	idx[3] = 33;
	idx[4] = 3;
	idx[5] = 35;
	idx[6] = 16;
	idx[7] = 6;
	idx[8] = 5;
	idx[9] = 8;
	idx[10] = 18;
	idx[11] = 7;
	idx[12] = 27;
	idx[13] = 19;
	idx[14] = 17;
	idx[15] = 15;
	idx[16] = 23;
	idx[17] = 28;
	idx[18] = 29;
	idx[19] = 21;
	idx[20] = 10;
	idx[21] = 32;
	idx[22] = 30;
	idx[23] = 20;
	idx[24] = 0;
	idx[25] = 14;
	idx[26] = 13;
	idx[27] = 25;
	idx[28] = 26;
	idx[29] = 37;
	idx[30] = 38;
	idx[31] = 22;
	idx[32] = 12;
	idx[33] = 2;
	idx[34] = 11;
	idx[35] = 1;
	idx[36] = 31;
	idx[37] = 39;
	idx[38] = 34;
	idx[39] = 36;
    }


};

/**
 * Generated specialization of impl for braket<P, S, SP, P>.
 *
 * Provides three eval overloads (fixed-size array, raw pointer, raw
 * pointer plus custom update functor) that accumulate 36 values, and
 * helpers (reorder / index) describing a fixed permutation of those
 * 36 values.
 */
template<>
struct impl<meta::braket<rysq::P, rysq::S, rysq::SP, rysq::P> > {
    // NOTE(review): `enable` and `value` look like markers that let callers
    // detect that this specialization exists -- confirm against the primary
    // template.
    typedef void enable;
    static const bool value = true; 
    // Same count that the fixed-size eval overload passes as M (eval<2>).
    static const int N = 2;

    /** Default functor: adds the contribution q to the slot Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: processes two points into the 36-element
     * array I using the default update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<2> &t2, const vector<2> &W,
    // 		     const double (&C)[2][1],
	      double (&I)[36]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer overload: processes M points into I using the default
     * update functor.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel: loops over M points and hands 36 values to `update`.
     *
     * For every a in [0, M) the loop body computes
     *   B00 = 0.5*t2[a]
     *   B01 = 0.5*(1/B)*(1 - A*t2[a])
     *   Cq  = rAi[q] - B*rAB[q]*t2[a]      (q = 0,1,2 -> x,y,z)
     *   Dq  = rBk[q] + A*rAB[q]*t2[a]
     *   Kq  = Dq + rkl[q]
     *   Qq  = Cq*Dq + B00
     * and then calls update(value, I + k) once for each k = 0..35, where
     * value is W[a] times C[0][0] or C[1][0] times a polynomial in the
     * quantities above.
     *
     * @tparam M      number of points processed (loop trip count).
     * @tparam U      type of the functor called as update(value, I + k).
     * @param rij     not read by this specialization.
     * @param t2, W   indexed with 0..M-1.  NOTE(review): presumably the
     *                quadrature roots and weights -- confirm with caller.
     * @param C       2x1 coefficient table; C[0][0] and C[1][0] are used.
     * @param I       output buffer; I+0 .. I+35 are passed to update.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {


	// Components of rkl, bound once outside the loop.
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


	// Compiler-specific loop hints: vectorization pragmas for the Intel
	// compiler, unrolling pragma when compiled by nvcc.
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];
	    // B01 = 0.5*(1/B)*(1 - A*t2[a])
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // Cq = rAi[q] - B*rAB[q]*t2[a]
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Dq = rBk[q] - (-A)*rAB[q]*t2[a]
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Local macro mapping pow(x, n) to recurrence::pow<n>(x); no
	    // expression in this specialization actually uses it.
#define pow(x,y) recurrence::pow<y>((x))

	    // Shared sub-expressions reused by several terms below.
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    // One update per output slot; the f* temporaries are further shared
	    // factors introduced right before their first use.
	    update((C[1][0])*W[a]*(Cy*Dz*Kx), I+0);
	    update((C[0][0])*W[a]*(Cy*Kx), I+1);
	    update((C[1][0])*W[a]*(Cz*Dy*Kx), I+2);
	    update((C[0][0])*W[a]*(Cz*Kx), I+3);
	    update((C[1][0])*W[a]*(Cz*Dx*Ky), I+4);
	    update((C[0][0])*W[a]*(Cz*Ky), I+5);
	    update((C[1][0])*W[a]*(Cx*Dz*Ky), I+6);
	    update((C[0][0])*W[a]*(Cx*Ky), I+7);
	    update((C[1][0])*W[a]*(Cx*Dy*Kz), I+8);
	    update((C[1][0])*W[a]*(Cy*Dx*Kz), I+9);
	    update((C[0][0])*W[a]*(Cy*Kz), I+10);
	    update((C[0][0])*W[a]*(Cx*Kz), I+11);
	    update((C[1][0])*W[a]*(Ky*Qx), I+12);
	    update((C[1][0])*W[a]*(Kz*Qx), I+13);
	    update((C[1][0])*W[a]*(Kx*Qy), I+14);
	    update((C[1][0])*W[a]*(Kz*Qy), I+15);
	    update((C[1][0])*W[a]*(Kx*Qz), I+16);
	    update((C[1][0])*W[a]*(Ky*Qz), I+17);
	    update((C[1][0])*W[a]*((B01*Cx + Cx*Dx*Kx + B00*(Xkl + 2*Dx))), I+18);
	    update((C[1][0])*W[a]*((Cy*Dy*Ky + B01*Cy + B00*(Ykl + 2*Dy))), I+19);
	    update((C[1][0])*W[a]*((B01*Cz + B00*(2*Dz + Zkl) + Cz*Dz*Kz)), I+20);
	    update((C[1][0])*W[a]*(Dx*(Cz*Zkl + Qz)), I+21);
	    update((C[1][0])*W[a]*(Dy*(Cz*Zkl + Qz)), I+22);
	    update((C[0][0])*W[a]*((Cz*Zkl + Qz)), I+23);
	    double f10 = (B01 + Dy*Ky);
	    update((C[1][0])*W[a]*(Cx*f10), I+24);
	    update((C[1][0])*W[a]*(Cz*f10), I+25);
	    double f2 = (B01 + Dx*Kx);
	    update((C[1][0])*W[a]*(Cz*f2), I+26);
	    update((C[1][0])*W[a]*(Cy*f2), I+27);
	    double f5 = (Dz*Kz + B01);
	    update((C[1][0])*W[a]*(Cx*f5), I+28);
	    update((C[1][0])*W[a]*(Cy*f5), I+29);
	    double f6 = (B00 + Cy*Ky);
	    update((C[1][0])*W[a]*(Dx*f6), I+30);
	    update((C[1][0])*W[a]*(Dz*f6), I+31);
	    update((C[0][0])*W[a]*(f6), I+32);
	    double f8 = (B00 + Cx*Kx);
	    update((C[1][0])*W[a]*(Dz*f8), I+33);
	    update((C[1][0])*W[a]*(Dy*f8), I+34);
	    update((C[0][0])*W[a]*(f8), I+35);
#undef pow

	}

    }

    /**
     * Permutes the 36 entries of I in place: the value found at position
     * k on entry is stored at position index(k) on return.  I is copied
     * to T first so every source value is read before any slot is
     * overwritten.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[36]) {
	double T[36];
	for (int i = 0; i < 36; ++i) {
	    T[i] = I[i];
	}
	I[10] = T[0];
	I[1] = T[1];
	I[8] = T[2];
	I[2] = T[3];
	I[17] = T[4];
	I[14] = T[5];
	I[21] = T[6];
	I[12] = T[7];
	I[30] = T[8];
	I[28] = T[9];
	I[25] = T[10];
	I[24] = T[11];
	I[15] = T[12];
	I[27] = T[13];
	I[7] = T[14];
	I[31] = T[15];
	I[11] = T[16];
	I[23] = T[17];
	I[3] = T[18];
	I[19] = T[19];
	I[35] = T[20];
	I[29] = T[21];
	I[32] = T[22];
	I[26] = T[23];
	I[18] = T[24];
	I[20] = T[25];
	I[5] = T[26];
	I[4] = T[27];
	I[33] = T[28];
	I[34] = T[29];
	I[16] = T[30];
	I[22] = T[31];
	I[13] = T[32];
	I[9] = T[33];
	I[6] = T[34];
	I[0] = T[35];
    }

    // Disabled variant kept by the generator: its table is the inverse of the
    // permutation used by reorder() above, with a range check.
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[36] = { 35, 1, 3, 18, 27, 26, 34, 14, 2, 33, 0, 16, 7, 32, 5, 12, 30, 4, 24, 19, 25, 6, 31, 17, 11, 10, 23, 13, 9, 21, 8, 15, 22, 28, 29, 20 };
// 	if (index < 36) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the position to which reorder() moves the entry at i.
     * No range check is performed; i must lie in [0, 36).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    10, 1, 8, 2, 17, 14, 21, 12, 30, 28, 25, 24, 15, 27, 7, 31, 11, 23, 3, 19, 35, 29, 32, 26, 18, 20, 5, 4, 33, 34, 16, 22, 13, 9, 6, 0
	};
	return index[i];
    }

    /**
     * Writes the same 36-entry table that index(int) returns to
     * idx[0..35]; the caller must provide room for 36 elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 10;
	*idx++ = 1;
	*idx++ = 8;
	*idx++ = 2;
	*idx++ = 17;
	*idx++ = 14;
	*idx++ = 21;
	*idx++ = 12;
	*idx++ = 30;
	*idx++ = 28;
	*idx++ = 25;
	*idx++ = 24;
	*idx++ = 15;
	*idx++ = 27;
	*idx++ = 7;
	*idx++ = 31;
	*idx++ = 11;
	*idx++ = 23;
	*idx++ = 3;
	*idx++ = 19;
	*idx++ = 35;
	*idx++ = 29;
	*idx++ = 32;
	*idx++ = 26;
	*idx++ = 18;
	*idx++ = 20;
	*idx++ = 5;
	*idx++ = 4;
	*idx++ = 33;
	*idx++ = 34;
	*idx++ = 16;
	*idx++ = 22;
	*idx++ = 13;
	*idx++ = 9;
	*idx++ = 6;
	*idx++ = 0;
    }


};

/**
 * Generated specialization of impl for braket<S, SP, S, SP>.
 *
 * Provides three eval overloads (fixed-size array, raw pointer, raw
 * pointer plus custom update functor) that accumulate 16 values, and
 * helpers (reorder / index) describing a fixed permutation of those
 * 16 values.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::SP, rysq::S, rysq::SP> > {
    // NOTE(review): `enable` and `value` look like markers that let callers
    // detect that this specialization exists -- confirm against the primary
    // template.
    typedef void enable;
    static const bool value = true; 
    // Same count that the fixed-size eval overload passes as M (eval<2>).
    static const int N = 2;

    /** Default functor: adds the contribution q to the slot Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: processes two points into the 16-element
     * array I using the default update functor.
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<2> &t2, const vector<2> &W,
    // 		     const double (&C)[2][2],
	      double (&I)[16]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer overload: processes M points into I using the default
     * update functor.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel: loops over M points and hands 16 values to `update`.
     *
     * For every a in [0, M) the loop body computes
     *   B00 = 0.5*t2[a]
     *   Cq  = rAi[q] - B*rAB[q]*t2[a]      (q = 0,1,2 -> x,y,z)
     *   Dq  = rBk[q] + A*rAB[q]*t2[a]
     *   Iq  = Cq + rij[q]
     *   Kq  = Dq + rkl[q]
     * and then calls update(value, I + k) once for each k = 0..15, where
     * value is W[a] times one entry of C times a polynomial in the
     * quantities above.
     *
     * @tparam M      number of points processed (loop trip count).
     * @tparam U      type of the functor called as update(value, I + k).
     * @param t2, W   indexed with 0..M-1.  NOTE(review): presumably the
     *                quadrature roots and weights -- confirm with caller.
     * @param C       2x2 coefficient table; all four entries are used.
     * @param I       output buffer; I+0 .. I+15 are passed to update.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2],
	      double *I, const U &update) {

	// Components of rij and rkl, bound once outside the loop.
	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


	// Compiler-specific loop hints: vectorization pragmas for the Intel
	// compiler, unrolling pragma when compiled by nvcc.
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];

	    // Cq = rAi[q] - B*rAB[q]*t2[a]
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Dq = rBk[q] - (-A)*rAB[q]*t2[a]
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Local macro mapping pow(x, n) to recurrence::pow<n>(x); no
	    // expression in this specialization actually uses it.
#define pow(x,y) recurrence::pow<y>((x))

	    // Shared sub-expressions reused by several terms below.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);

	    // One update per output slot.
	    update((C[0][0])*W[a]*(1), I+0);
	    update((C[0][1])*W[a]*(Ix), I+1);
	    update((C[0][1])*W[a]*(Iy), I+2);
	    update((C[0][1])*W[a]*(Iz), I+3);
	    update((C[1][1])*W[a]*((B00 + Ix*Kx)), I+4);
	    update((C[1][1])*W[a]*(Iy*Kx), I+5);
	    update((C[1][1])*W[a]*(Iz*Kx), I+6);
	    update((C[1][0])*W[a]*(Kx), I+7);
	    update((C[1][1])*W[a]*((B00 + Iy*Ky)), I+8);
	    update((C[1][1])*W[a]*(Iz*Ky), I+9);
	    update((C[1][1])*W[a]*(Ix*Ky), I+10);
	    update((C[1][0])*W[a]*(Ky), I+11);
	    update((C[1][1])*W[a]*((Iz*Kz + B00)), I+12);
	    update((C[1][1])*W[a]*(Ix*Kz), I+13);
	    update((C[1][1])*W[a]*(Iy*Kz), I+14);
	    update((C[1][0])*W[a]*(Kz), I+15);
#undef pow

	}

    }

    /**
     * Permutes the 16 entries of I in place: the value found at position
     * k on entry is stored at position index(k) on return.  I is copied
     * to T first so every source value is read before any slot is
     * overwritten.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[16]) {
	double T[16];
	for (int i = 0; i < 16; ++i) {
	    T[i] = I[i];
	}
	I[0] = T[0];
	I[1] = T[1];
	I[2] = T[2];
	I[3] = T[3];
	I[5] = T[4];
	I[6] = T[5];
	I[7] = T[6];
	I[4] = T[7];
	I[10] = T[8];
	I[11] = T[9];
	I[9] = T[10];
	I[8] = T[11];
	I[15] = T[12];
	I[13] = T[13];
	I[14] = T[14];
	I[12] = T[15];
    }

    // Disabled variant kept by the generator: its table is the inverse of the
    // permutation used by reorder() above, with a range check.
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[16] = { 0, 1, 2, 3, 7, 4, 5, 6, 11, 10, 8, 9, 15, 13, 14, 12 };
// 	if (index < 16) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the position to which reorder() moves the entry at i.
     * No range check is performed; i must lie in [0, 16).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    0, 1, 2, 3, 5, 6, 7, 4, 10, 11, 9, 8, 15, 13, 14, 12
	};
	return index[i];
    }

    /**
     * Writes the same 16-entry table that index(int) returns to
     * idx[0..15]; the caller must provide room for 16 elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 0;
	*idx++ = 1;
	*idx++ = 2;
	*idx++ = 3;
	*idx++ = 5;
	*idx++ = 6;
	*idx++ = 7;
	*idx++ = 4;
	*idx++ = 10;
	*idx++ = 11;
	*idx++ = 9;
	*idx++ = 8;
	*idx++ = 15;
	*idx++ = 13;
	*idx++ = 14;
	*idx++ = 12;
    }


};

template<>
struct impl<meta::braket<rysq::SP, rysq::P, rysq::SP, rysq::P> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /**
     * Default accumulation functor used by the eval overloads that take
     * no explicit functor: adds the contribution q to the slot Q points at.
     */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q = *Q + q;
	}
    };

    /**
     * Fixed-size entry point: processes three points into the
     * 144-element array I.
     *
     * Builds the default `update` functor (which adds each contribution
     * to the value already stored) and forwards every argument,
     * unchanged, to the functor-taking eval<3> overload.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL, class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2], double (&I)[144]) {
	const update accumulate = update();
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Convenience overload writing through a raw output pointer.
     *
     * Builds the default `update` functor (which adds each contribution
     * to the value already stored) and forwards every argument,
     * unchanged, to the functor-taking eval<M> overload.
     *
     * @tparam M  number of points handed on to the kernel.
     * @param  I  output buffer passed straight through to the kernel.
     */
    template<int M, class RAI, class RBK, class RAB,
	     class RIJ, class RKL, class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2], double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel: loops over M points and hands 144 values to `update`.
     *
     * For every a in [0, M) the loop body computes
     *   B00 = 0.5*t2[a]
     *   B10 = 0.5*(1/A)*(1 - B*t2[a])
     *   B01 = 0.5*(1/B)*(1 - A*t2[a])
     *   Cq  = rAi[q] - B*rAB[q]*t2[a]      (q = 0,1,2 -> x,y,z)
     *   Dq  = rBk[q] + A*rAB[q]*t2[a]
     *   Iq  = Cq + rij[q]
     *   Kq  = Dq + rkl[q]
     *   Qq  = Cq*Dq + B00
     * and then calls update(value, I + k) once for each k = 0..143, where
     * value is W[a] times one entry of C times a polynomial in the
     * quantities above.
     *
     * @tparam M      number of points processed (loop trip count).
     * @tparam U      type of the functor called as update(value, I + k).
     * @param t2, W   indexed with 0..M-1.  NOTE(review): presumably the
     *                quadrature roots and weights -- confirm with caller.
     * @param C       2x2 coefficient table; all four entries are used.
     * @param I       output buffer; I+0 .. I+143 are passed to update.
     * @param update  functor receiving (value, destination pointer).
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][2],
	      double *I, const U &update) {

	// Components of rij and rkl, bound once outside the loop.
	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


	// Compiler-specific loop hints: vectorization pragmas for the Intel
	// compiler, unrolling pragma when compiled by nvcc.
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];
	    // B10 = 0.5*(1/A)*(1 - B*t2[a]),  B01 = 0.5*(1/B)*(1 - A*t2[a])
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // Cq = rAi[q] - B*rAB[q]*t2[a]
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Dq = rBk[q] - (-A)*rAB[q]*t2[a]
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Local macro: pow(x, n) in the generated expressions expands to the
	    // compile-time recurrence::pow<n>(x).  Removed again by the #undef
	    // at the end of the loop body.
#define pow(x,y) recurrence::pow<y>((x))

	    // Shared sub-expressions reused by several terms below.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    // One update per output slot; the f* temporaries are further shared
	    // factors introduced right before their first use.
	    update((C[1][1])*W[a]*(Cz*Kx*(Dy*Iy + B00)), I+0);
	    update((C[1][0])*W[a]*(Kx*(Dy*Iy + B00)), I+1);
	    update((C[1][1])*W[a]*(Cy*Kx*(B00 + Dz*Iz)), I+2);
	    update((C[1][0])*W[a]*(Kx*(B00 + Dz*Iz)), I+3);
	    update((C[0][1])*W[a]*(Cy*Iz*Kx), I+4);
	    update((C[1][0])*W[a]*(Dz*Iy*Kx), I+5);
	    update((C[0][0])*W[a]*(Iz*Kx), I+6);
	    update((C[1][0])*W[a]*(Dy*Iz*Kx), I+7);
	    update((C[0][0])*W[a]*(Iy*Kx), I+8);
	    update((C[0][1])*W[a]*(Cz*Iy*Kx), I+9);
	    update((C[1][1])*W[a]*((B00 + Cy*Ky)*(Dx*Ix + B00)), I+10);
	    update((C[1][1])*W[a]*(Cz*Dx*(B00 + Iy*Ky)), I+11);
	    update((C[1][1])*W[a]*(Cz*Ky*(Dx*Ix + B00)), I+12);
	    update((C[1][0])*W[a]*(Ky*(Dx*Ix + B00)), I+13);
	    update((C[1][1])*W[a]*(Dx*Iz*(B00 + Cy*Ky)), I+14);
	    update((C[0][1])*W[a]*(Iz*(B00 + Cy*Ky)), I+15);
	    update((C[0][1])*W[a]*(Ix*(B00 + Cy*Ky)), I+16);
	    update((C[1][1])*W[a]*(Dz*Ix*(B00 + Cy*Ky)), I+17);
	    update((C[1][1])*W[a]*(Cx*Dz*(B00 + Iy*Ky)), I+18);
	    update((C[1][0])*W[a]*(Dz*(B00 + Iy*Ky)), I+19);
	    update((C[1][0])*W[a]*(Dx*(B00 + Iy*Ky)), I+20);
	    update((C[0][1])*W[a]*(Cx*(B00 + Iy*Ky)), I+21);
	    update((C[0][0])*W[a]*((B00 + Iy*Ky)), I+22);
	    update((C[0][1])*W[a]*(Cz*(B00 + Iy*Ky)), I+23);
	    update((C[1][1])*W[a]*((B00 + Cy*Ky)*(B00 + Dz*Iz)), I+24);
	    update((C[1][1])*W[a]*(Cx*Ky*(B00 + Dz*Iz)), I+25);
	    update((C[1][0])*W[a]*(Ky*(B00 + Dz*Iz)), I+26);
	    update((C[1][0])*W[a]*(Dz*Ix*Ky), I+27);
	    update((C[1][0])*W[a]*(Dx*Iz*Ky), I+28);
	    update((C[0][1])*W[a]*(Cx*Iz*Ky), I+29);
	    update((C[0][0])*W[a]*(Iz*Ky), I+30);
	    update((C[0][1])*W[a]*(Cz*Ix*Ky), I+31);
	    update((C[0][0])*W[a]*(Ix*Ky), I+32);
	    update((C[1][1])*W[a]*((B00 + Cz*Kz)*(Dx*Ix + B00)), I+33);
	    update((C[1][1])*W[a]*((B00 + Cz*Kz)*(Dy*Iy + B00)), I+34);
	    update((C[1][1])*W[a]*(Dx*Iy*(B00 + Cz*Kz)), I+35);
	    update((C[1][1])*W[a]*(Dy*Ix*(B00 + Cz*Kz)), I+36);
	    update((C[0][1])*W[a]*(Ix*(B00 + Cz*Kz)), I+37);
	    update((C[0][1])*W[a]*(Iy*(B00 + Cz*Kz)), I+38);
	    update((C[1][0])*W[a]*(Kz*(Dx*Ix + B00)), I+39);
	    update((C[1][1])*W[a]*(Cy*Kz*(Dx*Ix + B00)), I+40);
	    update((C[1][1])*W[a]*(Cx*Kz*(Dy*Iy + B00)), I+41);
	    update((C[1][0])*W[a]*(Kz*(Dy*Iy + B00)), I+42);
	    update((C[1][1])*W[a]*(Cx*Dy*(Iz*Kz + B00)), I+43);
	    update((C[1][0])*W[a]*(Dy*(Iz*Kz + B00)), I+44);
	    update((C[1][1])*W[a]*(Cy*Dx*(Iz*Kz + B00)), I+45);
	    update((C[1][0])*W[a]*(Dx*(Iz*Kz + B00)), I+46);
	    update((C[0][1])*W[a]*(Cx*(Iz*Kz + B00)), I+47);
	    update((C[0][0])*W[a]*((Iz*Kz + B00)), I+48);
	    update((C[0][1])*W[a]*(Cy*(Iz*Kz + B00)), I+49);
	    update((C[1][0])*W[a]*(Dx*Iy*Kz), I+50);
	    update((C[0][1])*W[a]*(Cx*Iy*Kz), I+51);
	    update((C[1][0])*W[a]*(Dy*Ix*Kz), I+52);
	    update((C[0][0])*W[a]*(Iy*Kz), I+53);
	    update((C[0][0])*W[a]*(Ix*Kz), I+54);
	    update((C[0][1])*W[a]*(Cy*Ix*Kz), I+55);
	    update((C[1][1])*W[a]*(Iz*Ky*Qx), I+56);
	    update((C[1][1])*W[a]*(Iy*Kz*Qx), I+57);
	    update((C[1][1])*W[a]*(Qx*(B00 + Iy*Ky)), I+58);
	    update((C[1][1])*W[a]*(Qx*(Iz*Kz + B00)), I+59);
	    update((C[1][1])*W[a]*(Ix*Kz*Qy), I+60);
	    update((C[1][1])*W[a]*(Iz*Kx*Qy), I+61);
	    update((C[1][1])*W[a]*(Qy*(Iz*Kz + B00)), I+62);
	    update((C[1][1])*W[a]*(Iy*Kx*Qz), I+63);
	    update((C[1][1])*W[a]*(Qz*(B00 + Iy*Ky)), I+64);
	    update((C[1][1])*W[a]*(Ix*Ky*Qz), I+65);
	    update((C[1][1])*W[a]*(Dz*(Kx*(Cx*Ix + B10) + B00*(Xij + 2*Cx))), I+66);
	    update((C[1][1])*W[a]*(Dy*(Kx*(Cx*Ix + B10) + B00*(Xij + 2*Cx))), I+67);
	    update((C[1][1])*W[a]*(Ky*(Dx*(Cx*Ix + B10) + B00*(Xij + 2*Cx))), I+68);
	    update((C[0][1])*W[a]*((Kx*(Cx*Ix + B10) + B00*(Xij + 2*Cx))), I+69);
	    update((C[1][1])*W[a]*(Kz*(Dx*(Cx*Ix + B10) + B00*(Xij + 2*Cx))), I+70);
	    update((C[1][1])*W[a]*(Iz*(B01*Cx + Cx*Dx*Kx + B00*(Xkl + 2*Dx))), I+71);
	    update((C[1][1])*W[a]*(Iy*(B01*Cx + Cx*Dx*Kx + B00*(Xkl + 2*Dx))), I+72);
	    update((C[1][1])*W[a]*(Cz*(Ix*(B01 + Dx*Kx) + B00*(Xkl + 2*Dx))), I+73);
	    update((C[1][0])*W[a]*((Ix*(B01 + Dx*Kx) + B00*(Xkl + 2*Dx))), I+74);
	    update((C[1][1])*W[a]*(Cy*(Ix*(B01 + Dx*Kx) + B00*(Xkl + 2*Dx))), I+75);
	    update((C[1][1])*W[a]*(Kz*(B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10))), I+76);
	    update((C[1][1])*W[a]*(Kx*(B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10))), I+77);
	    update((C[1][1])*W[a]*(Dz*(B00*(Yij + 2*Cy) + Ky*(Cy*Iy + B10))), I+78);
	    update((C[0][1])*W[a]*((B00*(Yij + 2*Cy) + Ky*(Cy*Iy + B10))), I+79);
	    update((C[1][1])*W[a]*(Dx*(B00*(Yij + 2*Cy) + Ky*(Cy*Iy + B10))), I+80);
	    update((C[1][1])*W[a]*(Iz*(Cy*Dy*Ky + B01*Cy + B00*(Ykl + 2*Dy))), I+81);
	    update((C[1][1])*W[a]*(Ix*(Cy*Dy*Ky + B01*Cy + B00*(Ykl + 2*Dy))), I+82);
	    update((C[1][1])*W[a]*(Dy*(Kz*(B10 + Cz*Iz) + B00*(2*Cz + Zij))), I+83);
	    update((C[0][1])*W[a]*((Kz*(B10 + Cz*Iz) + B00*(2*Cz + Zij))), I+84);
	    update((C[1][1])*W[a]*(Dx*(Kz*(B10 + Cz*Iz) + B00*(2*Cz + Zij))), I+85);
	    update((C[1][1])*W[a]*(Kx*(Dz*(B10 + Cz*Iz) + B00*(2*Cz + Zij))), I+86);
	    update((C[1][1])*W[a]*(Ky*(Dz*(B10 + Cz*Iz) + B00*(2*Cz + Zij))), I+87);
	    update((C[1][1])*W[a]*(Cx*(B00*(2*Dz + Zkl) + Iz*(Dz*Kz + B01))), I+88);
	    update((C[1][1])*W[a]*(Cy*(B00*(2*Dz + Zkl) + Iz*(Dz*Kz + B01))), I+89);
	    update((C[1][0])*W[a]*((B00*(2*Dz + Zkl) + Iz*(Dz*Kz + B01))), I+90);
	    update((C[1][1])*W[a]*(Iy*(B01*Cz + B00*(2*Dz + Zkl) + Cz*Dz*Kz)), I+91);
	    update((C[1][1])*W[a]*(Ix*(B01*Cz + B00*(2*Dz + Zkl) + Cz*Dz*Kz)), I+92);
	    double f14 = (Cx*Ix + B10);
	    update((C[1][1])*W[a]*(Dz*Ky*f14), I+93);
	    update((C[0][1])*W[a]*(Ky*f14), I+94);
	    update((C[1][1])*W[a]*(Dy*Kz*f14), I+95);
	    update((C[0][1])*W[a]*(Kz*f14), I+96);
	    double f2 = (B00 + Cx*Kx);
	    update((C[1][1])*W[a]*(f2*(B00 + Dz*Iz)), I+97);
	    update((C[1][1])*W[a]*(Qz*(Kx*Xij + f2)), I+98);
	    update((C[1][1])*W[a]*(Qy*(Kx*Xij + f2)), I+99);
	    update((C[1][0])*W[a]*(Dy*(Kx*Xij + f2)), I+100);
	    update((C[1][1])*W[a]*(Cz*Dy*(Kx*Xij + f2)), I+101);
	    update((C[0][1])*W[a]*(Cz*(Kx*Xij + f2)), I+102);
	    update((C[1][0])*W[a]*(Dz*(Kx*Xij + f2)), I+103);
	    update((C[1][1])*W[a]*(Cy*Dz*(Kx*Xij + f2)), I+104);
	    update((C[0][1])*W[a]*(Cy*(Kx*Xij + f2)), I+105);
	    update((C[0][0])*W[a]*((Kx*Xij + f2)), I+106);
	    update((C[1][1])*W[a]*(f2*(Dy*Iy + B00)), I+107);
	    update((C[1][1])*W[a]*(Dz*Iy*f2), I+108);
	    update((C[0][1])*W[a]*(Iy*f2), I+109);
	    update((C[1][1])*W[a]*(Dy*Iz*f2), I+110);
	    update((C[0][1])*W[a]*(Iz*f2), I+111);
	    double f21 = (B01 + Dx*Kx);
	    update((C[1][1])*W[a]*(Cy*Iz*f21), I+112);
	    update((C[1][1])*W[a]*(Cz*Iy*f21), I+113);
	    update((C[1][0])*W[a]*(Iy*f21), I+114);
	    update((C[1][0])*W[a]*(Iz*f21), I+115);
	    double f22 = (Cy*Iy + B10);
	    update((C[1][1])*W[a]*(Dx*Kz*f22), I+116);
	    update((C[1][1])*W[a]*(Dz*Kx*f22), I+117);
	    update((C[0][1])*W[a]*(Kx*f22), I+118);
	    update((C[1][1])*W[a]*(f21*f22), I+119);
	    update((C[0][1])*W[a]*(Kz*f22), I+120);
	    double f26 = (B01 + Dy*Ky);
	    update((C[1][1])*W[a]*(Cx*Iz*f26), I+121);
	    update((C[1][1])*W[a]*(f14*f26), I+122);
	    update((C[1][0])*W[a]*(Iz*f26), I+123);
	    update((C[1][0])*W[a]*(Ix*f26), I+124);
	    update((C[1][1])*W[a]*(Cz*Ix*f26), I+125);
	    double f3 = (B00*(Ykl + 2*Dy) + Iy*(B01 + Dy*Ky));
	    update((C[1][1])*W[a]*(Cz*f3), I+126);
	    update((C[1][1])*W[a]*(Cx*f3), I+127);
	    update((C[1][0])*W[a]*(f3), I+128);
	    double f30 = (Dz*Kz + B01);
	    update((C[1][1])*W[a]*(Cx*Iy*f30), I+129);
	    update((C[1][1])*W[a]*(Cy*Ix*f30), I+130);
	    update((C[1][0])*W[a]*(Ix*f30), I+131);
	    update((C[1][1])*W[a]*(f14*f30), I+132);
	    update((C[1][0])*W[a]*(Iy*f30), I+133);
	    update((C[1][1])*W[a]*(f22*f30), I+134);
	    double f1 = B01*B10;
	    double f32 = 2*pow(B00,2);
	    update((C[1][1])*W[a]*((f32 + Dy*Ky*(Cy*Iy + B10) + B01*Cy*Iy + B00*(Yij + 2*Cy)*(Ykl + 2*Dy) + f1)), I+135);
	    update((C[1][1])*W[a]*((Dz*Kz*(B10 + Cz*Iz) + f32 + B00*(2*Cz + Zij)*(2*Dz + Zkl) + f1 + B01*Cz*Iz)), I+136);
	    update((C[1][1])*W[a]*((B00*(Xij + 2*Cx)*(Xkl + 2*Dx) + f32 + B01*Cx*Ix + f1 + Dx*Kx*(Cx*Ix + B10))), I+137);
	    double f37 = (B10 + Cz*Iz);
	    update((C[1][1])*W[a]*(Dy*Kx*f37), I+138);
	    update((C[0][1])*W[a]*(Kx*f37), I+139);
	    update((C[1][1])*W[a]*(f21*f37), I+140);
	    update((C[1][1])*W[a]*(Dx*Ky*f37), I+141);
	    update((C[0][1])*W[a]*(Ky*f37), I+142);
	    update((C[1][1])*W[a]*(f26*f37), I+143);
#undef pow

	}

    }

    /**
     * Permutes the 144 entries of I in place.
     *
     * The value found at position k on entry is stored at position
     * destination[k] on return; the table holds the same numbers that
     * index(int) returns.  A full copy of I is taken first so that every
     * source value is read before any slot is overwritten.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[144]) {
	const unsigned short destination[144] = {
	    31, 28, 46, 44, 10, 40, 8, 32, 4, 7, 62, 67,
	    63, 60, 70, 58, 50, 86, 89, 88, 64, 53, 52, 55,
	    94, 93, 92, 84, 68, 57, 56, 51, 48, 111, 127, 115,
	    123, 99, 103, 108, 110, 125, 124, 129, 128, 118, 116, 105,
	    104, 106, 112, 101, 120, 100, 96, 98, 69, 113, 65, 117,
	    122, 34, 130, 43, 91, 87, 37, 25, 61, 1, 109, 21,
	    17, 15, 12, 14, 126, 30, 90, 54, 66, 82, 74, 131,
	    107, 119, 47, 95, 141, 142, 140, 139, 135, 85, 49, 121,
	    97, 45, 39, 26, 24, 27, 3, 36, 38, 2, 0, 29,
	    41, 5, 33, 9, 22, 19, 16, 20, 114, 42, 6, 18,
	    102, 81, 73, 80, 72, 75, 79, 77, 76, 137, 134, 132,
	    133, 136, 138, 78, 143, 13, 35, 11, 23, 71, 59, 83
	};
	double snapshot[144];
	for (int k = 0; k < 144; ++k) {
	    snapshot[k] = I[k];
	}
	for (int k = 0; k < 144; ++k) {
	    I[destination[k]] = snapshot[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[144] = { 106, 69, 105, 102, 8, 109, 118, 9, 6, 111, 4, 139, 74, 137, 75, 73, 114, 72, 119, 113, 115, 71, 112, 140, 100, 67, 99, 101, 1, 107, 77, 0, 7, 110, 61, 138, 103, 66, 104, 98, 5, 108, 117, 63, 3, 97, 2, 86, 32, 94, 16, 31, 22, 21, 79, 23, 30, 29, 15, 142, 13, 68, 10, 12, 20, 58, 80, 11, 28, 56, 14, 141, 124, 122, 82, 125, 128, 127, 135, 126, 123, 121, 81, 143, 27, 93, 17, 65, 19, 18, 78, 64, 26, 25, 24, 87, 54, 96, 55, 37, 53, 51, 120, 38, 48, 47, 49, 84, 39, 70, 40, 33, 50, 57, 116, 35, 46, 59, 45, 85, 52, 95, 60, 36, 42, 41, 76, 34, 44, 43, 62, 83, 131, 132, 130, 92, 133, 129, 134, 91, 90, 88, 89, 136 };
// 	if (index < 144) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Look up a single entry of this kernel's 144-entry index table.
     *
     * Entry i is the slot that reorder() assigns the i-th evaluated value to
     * (e.g. entry 72 is 17, matching `I[17] = T[72]` in reorder()).
     *
     * @param i table position; no range check is performed, so the caller
     *          must pass a value in [0, 144).
     * @return the table entry at position i.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short dest[144] = {
            31, 28, 46, 44, 10, 40, 8, 32, 4, 7, 62, 67,
            63, 60, 70, 58, 50, 86, 89, 88, 64, 53, 52, 55,
            94, 93, 92, 84, 68, 57, 56, 51, 48, 111, 127, 115,
            123, 99, 103, 108, 110, 125, 124, 129, 128, 118, 116, 105,
            104, 106, 112, 101, 120, 100, 96, 98, 69, 113, 65, 117,
            122, 34, 130, 43, 91, 87, 37, 25, 61, 1, 109, 21,
            17, 15, 12, 14, 126, 30, 90, 54, 66, 82, 74, 131,
            107, 119, 47, 95, 141, 142, 140, 139, 135, 85, 49, 121,
            97, 45, 39, 26, 24, 27, 3, 36, 38, 2, 0, 29,
            41, 5, 33, 9, 22, 19, 16, 20, 114, 42, 6, 18,
            102, 81, 73, 80, 72, 75, 79, 77, 76, 137, 134, 132,
            133, 136, 138, 78, 143, 13, 35, 11, 23, 71, 59, 83
        };
        return dest[i];
    }

    /**
     * Write the complete 144-entry index table into a caller-supplied buffer.
     *
     * The values and their order are the same as those returned by
     * index(int) for positions 0..143.
     *
     * @param idx output buffer; must have room for at least 144 elements of
     *            a type assignable from int.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        const int dest[144] = {
            31, 28, 46, 44, 10, 40, 8, 32, 4, 7, 62, 67,
            63, 60, 70, 58, 50, 86, 89, 88, 64, 53, 52, 55,
            94, 93, 92, 84, 68, 57, 56, 51, 48, 111, 127, 115,
            123, 99, 103, 108, 110, 125, 124, 129, 128, 118, 116, 105,
            104, 106, 112, 101, 120, 100, 96, 98, 69, 113, 65, 117,
            122, 34, 130, 43, 91, 87, 37, 25, 61, 1, 109, 21,
            17, 15, 12, 14, 126, 30, 90, 54, 66, 82, 74, 131,
            107, 119, 47, 95, 141, 142, 140, 139, 135, 85, 49, 121,
            97, 45, 39, 26, 24, 27, 3, 36, 38, 2, 0, 29,
            41, 5, 33, 9, 22, 19, 16, 20, 114, 42, 6, 18,
            102, 81, 73, 80, 72, 75, 79, 77, 76, 137, 134, 132,
            133, 136, 138, 78, 143, 13, 35, 11, 23, 71, 59, 83
        };
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int k = 0; k < 144; ++k) {
            idx[k] = dest[k];
        }
    }


};

/**
 * Quadrature kernel specialization for braket<SP, S, F, S>.
 *
 * eval() accumulates 40 values into I in the kernel's internal order;
 * reorder() and the index() overloads describe where each of those
 * values belongs in the destination order.
 */
template<>
struct impl<meta::braket<rysq::SP, rysq::S, rysq::F, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 3;

    /** Default accumulation policy: add the contribution to the target. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &q, double *Q) const {
            *Q += q;
        }
    };

    /**
     * Accumulate N (== 3) quadrature points into a fixed-size buffer using
     * the default additive update.
     */
    template<
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][2],
              double (&I)[40]) {
        eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Accumulate M quadrature points into I (at least 40 doubles) using the
     * default additive update.
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][2],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core evaluation: for each a in [0, M) build the recurrence
     * coefficients from t2[a] and hand 40 weighted products to `update`.
     *
     * @param A,B     scalars fed to recurrence::coefficient
     * @param rAi,rBk,rAB  3-component vectors indexed 0..2
     * @param rij,rkl not referenced by this specialization
     * @param t2,W    indexed by a in [0, M); W[a] scales every contribution
     * @param C       C[0][0] and C[0][1] scale the individual contributions
     * @param I       base of the 40 accumulators (I+0 .. I+39)
     * @param update  functor called as update(value, target)
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_,
        class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][2],
              double *I, const U &update) {

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int a = 0; a < M; ++a) {
            const double t = t2[a];

            const double B00 = 0.5*t;
            const double B01 = recurrence::coefficient(1.0/B, A, t);

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

            const double Qx = Cx*Dx + B00;
            const double Qy = Cy*Dy + B00;
            const double Qz = Cz*Dz + B00;
            const double Rx = B01 + recurrence::pow<2>(Dx);
            const double Ry = B01 + recurrence::pow<2>(Dy);
            const double Rz = recurrence::pow<2>(Dz) + B01;

            // Contraction coefficient times quadrature weight, shared by
            // every contribution of this iteration.
            const double w0 = C[0][0]*W[a];
            const double w1 = C[0][1]*W[a];

            update(w0*(Dx*Dy*Dz), I+0);
            update(w1*(Dy*Dz*Qx), I+1);
            update(w1*(Dx*Dz*Qy), I+2);
            update(w1*(Dx*Dy*Qz), I+3);
            update(w1*(Qz*Rx), I+4);
            update(w1*(Qy*Rx), I+5);
            update(w0*(Dz*Rx), I+6);
            update(w1*(Cy*Dz*Rx), I+7);
            update(w0*(Dy*Rx), I+8);
            update(w1*(Cz*Dy*Rx), I+9);
            update(w1*(Cx*Dz*Ry), I+10);
            update(w0*(Dz*Ry), I+11);
            update(w1*(Qz*Ry), I+12);
            update(w1*(Qx*Ry), I+13);
            update(w0*(Dx*Ry), I+14);
            update(w1*(Cz*Dx*Ry), I+15);
            update(w1*(Cy*Dx*Rz), I+16);
            update(w1*(Qx*Rz), I+17);
            update(w0*(Dx*Rz), I+18);
            update(w1*(Cx*Dy*Rz), I+19);
            update(w0*(Dy*Rz), I+20);
            update(w1*(Qy*Rz), I+21);

            const double f0 = 3*B01 + recurrence::pow<2>(Dx);
            update(w1*(Cz*Dx*f0), I+22);
            update(w1*(Cy*Dx*f0), I+23);
            update(w0*(Dx*f0), I+24);

            const double f10 = 3*B00*B01;
            update(w1*(3*B01*Cy*Dy + f10 + Cy*recurrence::pow<3>(Dy) + 3*B00*recurrence::pow<2>(Dy)), I+25);
            update(w1*(f10 + 3*B00*recurrence::pow<2>(Dz) + 3*B01*Cz*Dz + Cz*recurrence::pow<3>(Dz)), I+26);
            update(w1*(f10 + Cx*recurrence::pow<3>(Dx) + 3*B00*recurrence::pow<2>(Dx) + 3*B01*Cx*Dx), I+27);

            const double f12 = 2*B00*Dy + Cy*Ry;
            update(w1*(Dx*f12), I+28);
            update(w1*(Dz*f12), I+29);

            const double f2 = 2*B00*Dz + Cz*Rz;
            update(w1*(Dx*f2), I+30);
            update(w1*(Dy*f2), I+31);

            const double f3 = recurrence::pow<2>(Dz) + 3*B01;
            update(w1*(Cy*Dz*f3), I+32);
            update(w1*(Cx*Dz*f3), I+33);
            update(w0*(Dz*f3), I+34);

            const double f5 = 3*B01 + recurrence::pow<2>(Dy);
            update(w1*(Cz*Dy*f5), I+35);
            update(w1*(Cx*Dy*f5), I+36);
            update(w0*(Dy*f5), I+37);

            const double f7 = 2*B00*Dx + Cx*Rx;
            update(w1*(Dz*f7), I+38);
            update(w1*(Dy*f7), I+39);
        }
    }

    /**
     * Permute the 40 values in I in place: the value at position k moves to
     * position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[40]) {
        double scratch[40];
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int k = 0; k < 40; ++k) {
            scratch[k] = I[k];
        }
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int k = 0; k < 40; ++k) {
            I[index(k)] = scratch[k];
        }
    }

    /**
     * Destination slot of the i-th evaluated value.
     * No range check is performed; i must lie in [0, 40).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short dest[40] = {
            36, 37, 38, 39, 19, 14, 16, 18, 12, 15,
            25, 24, 27, 21, 20, 23, 30, 29, 28, 33,
            32, 34, 3, 2, 0, 6, 11, 1, 22, 26,
            31, 35, 10, 9, 8, 7, 5, 4, 17, 13
        };
        return dest[i];
    }

    /**
     * Write all 40 destination slots, in evaluation order, to idx
     * (which must have room for 40 elements).
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int k = 0; k < 40; ++k) {
            idx[k] = index(k);
        }
    }


};

/**
 * Quadrature kernel specialization for braket<S, F, SP, S>.
 *
 * eval() accumulates 40 values into I in the kernel's internal order;
 * reorder() and the index() overloads describe where each of those
 * values belongs in the destination order.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::F, rysq::SP, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 3;

    /** Default accumulation policy: add the contribution to the target. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &q, double *Q) const {
            *Q += q;
        }
    };

    /**
     * Accumulate N (== 3) quadrature points into a fixed-size buffer using
     * the default additive update.
     */
    template<
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][1],
              double (&I)[40]) {
        eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Accumulate M quadrature points into I (at least 40 doubles) using the
     * default additive update.
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][1],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core evaluation: for each a in [0, M) build the recurrence
     * coefficients from t2[a] and hand 40 weighted products to `update`.
     *
     * @param A,B     scalars fed to recurrence::coefficient
     * @param rAi,rBk,rAB  3-component vectors indexed 0..2
     * @param rij     3-component vector added to the C coefficients
     * @param rkl     not referenced by this specialization
     * @param t2,W    indexed by a in [0, M); W[a] scales every contribution
     * @param C       C[0][0] and C[1][0] scale the individual contributions
     * @param I       base of the 40 accumulators (I+0 .. I+39)
     * @param update  functor called as update(value, target)
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_,
        class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][1],
              double *I, const U &update) {

        const double &Xij = rij[0];
        const double &Yij = rij[1];
        const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int a = 0; a < M; ++a) {
            const double t = t2[a];

            const double B00 = 0.5*t;
            const double B10 = recurrence::coefficient(1.0/A, B, t);

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

            const double Ix = Cx + Xij;
            const double Iy = Cy + Yij;
            const double Iz = Cz + Zij;

            // Contraction coefficient times quadrature weight, shared by
            // every contribution of this iteration.
            const double w0 = C[0][0]*W[a];
            const double w1 = C[1][0]*W[a];

            update(w0*(Ix*Iy*Iz), I+0);

            const double f1 = 2*B00*Iz + Dz*(B10 + recurrence::pow<2>(Iz));
            update(w1*(Ix*f1), I+1);
            update(w1*(Iy*f1), I+2);

            const double f10 = B10 + recurrence::pow<2>(Iy);
            update(w1*(Dz*Ix*f10), I+3);
            update(w1*(Dx*Iz*f10), I+4);
            update(w0*(Iz*f10), I+5);
            update(w0*(Ix*f10), I+6);

            const double f11 = B10 + recurrence::pow<2>(Iz);
            update(w0*(Iy*f11), I+7);
            update(w1*(Dx*Iy*f11), I+8);
            update(w0*(Ix*f11), I+9);
            update(w1*(Dy*Ix*f11), I+10);

            const double f12 = Dy*Iy + B00;
            update(w1*(Ix*Iz*f12), I+11);
            update(w1*(f11*f12), I+12);

            const double f13 = 3*B10 + recurrence::pow<2>(Iz);
            update(w1*(Dy*Iz*f13), I+13);
            update(w1*(Dx*Iz*f13), I+14);
            update(w0*(Iz*f13), I+15);

            const double f14 = B10 + recurrence::pow<2>(Ix);
            update(w1*(Dy*Iz*f14), I+16);
            update(w0*(Iz*f14), I+17);
            update(w1*(f12*f14), I+18);
            update(w1*(Dz*Iy*f14), I+19);
            update(w0*(Iy*f14), I+20);

            const double f15 = 3*B10 + recurrence::pow<2>(Ix);
            update(w1*(Dz*Ix*f15), I+21);
            update(w1*(Dy*Ix*f15), I+22);
            update(w0*(Ix*f15), I+23);

            const double f2 = 2*B00*Ix + Dx*(B10 + recurrence::pow<2>(Ix));
            update(w1*(Iy*f2), I+24);
            update(w1*(Iz*f2), I+25);

            const double f3 = 3*B00*B10;
            update(w1*(3*B10*Dx*Ix + f3 + 3*B00*recurrence::pow<2>(Ix) + Dx*recurrence::pow<3>(Ix)), I+26);
            update(w1*(Dy*recurrence::pow<3>(Iy) + f3 + 3*B00*recurrence::pow<2>(Iy) + 3*B10*Dy*Iy), I+27);
            update(w1*(Dz*recurrence::pow<3>(Iz) + 3*B00*recurrence::pow<2>(Iz) + 3*B10*Dz*Iz + f3), I+28);

            const double f4 = B00 + Dz*Iz;
            update(w1*(Ix*Iy*f4), I+29);
            update(w1*(f14*f4), I+30);
            update(w1*(f10*f4), I+31);

            const double f6 = 3*B10 + recurrence::pow<2>(Iy);
            update(w1*(Dx*Iy*f6), I+32);
            update(w0*(Iy*f6), I+33);
            update(w1*(Dz*Iy*f6), I+34);

            const double f7 = 2*B00*Iy + Dy*(B10 + recurrence::pow<2>(Iy));
            update(w1*(Ix*f7), I+35);
            update(w1*(Iz*f7), I+36);

            const double f9 = Dx*Ix + B00;
            update(w1*(Iy*Iz*f9), I+37);
            update(w1*(f10*f9), I+38);
            update(w1*(f11*f9), I+39);
        }
    }

    /**
     * Permute the 40 values in I in place: the value at position k moves to
     * position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[40]) {
        double scratch[40];
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int k = 0; k < 40; ++k) {
            scratch[k] = I[k];
        }
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int k = 0; k < 40; ++k) {
            I[index(k)] = scratch[k];
        }
    }

    /**
     * Destination slot of the i-th evaluated value.
     * No range check is performed; i must lie in [0, 40).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short dest[40] = {
            9, 37, 38, 35, 16, 6, 5, 8, 18, 7,
            27, 29, 28, 22, 12, 2, 24, 4, 23, 33,
            3, 30, 20, 0, 13, 14, 10, 21, 32, 39,
            34, 36, 11, 1, 31, 25, 26, 19, 15, 17
        };
        return dest[i];
    }

    /**
     * Write all 40 destination slots, in evaluation order, to idx
     * (which must have room for 40 elements).
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int k = 0; k < 40; ++k) {
            idx[k] = index(k);
        }
    }


};

/**
 * Quadrature kernel specialization for braket<SP, SP, S, S>.
 *
 * eval() accumulates 16 values into I in the kernel's internal order;
 * reorder() and the index() overloads describe where each of those
 * values belongs in the destination order.
 */
template<>
struct impl<meta::braket<rysq::SP, rysq::SP, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulation policy: add the contribution to the target. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &q, double *Q) const {
            *Q += q;
        }
    };

    /**
     * Accumulate N (== 2) quadrature points into a fixed-size buffer using
     * the default additive update.
     */
    template<
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][4],
              double (&I)[16]) {
        eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Accumulate M quadrature points into I (at least 16 doubles) using the
     * default additive update.
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][4],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core evaluation: for each a in [0, M) build the recurrence
     * coefficients from t2[a] and hand 16 weighted products to `update`.
     *
     * @param A,B     scalars fed to recurrence::coefficient
     * @param rAi,rAB 3-component vectors indexed 0..2
     * @param rij     3-component vector added to the C coefficients
     * @param rBk,rkl not referenced by this specialization
     * @param t2,W    indexed by a in [0, M); W[a] scales every contribution
     * @param C       C[0][0..3] scale the individual contributions
     * @param I       base of the 16 accumulators (I+0 .. I+15)
     * @param update  functor called as update(value, target)
     */
    template<int M,
        class RAI, class RBK,
        class RAB,
        class RIJ, class RKL,
        class T2_, class W_,
        class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk,
              const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][4],
              double *I, const U &update) {

        const double &Xij = rij[0];
        const double &Yij = rij[1];
        const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int a = 0; a < M; ++a) {
            const double t = t2[a];

            const double B10 = recurrence::coefficient(1.0/A, B, t);

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

            const double Ix = Cx + Xij;
            const double Iy = Cy + Yij;
            const double Iz = Cz + Zij;

            // Contraction coefficient times quadrature weight, one per
            // column of C.
            const double w0 = C[0][0]*W[a];
            const double w1 = C[0][1]*W[a];
            const double w2 = C[0][2]*W[a];
            const double w3 = C[0][3]*W[a];

            update(w0*(1), I+0);
            update(w1*(Cx), I+1);
            update(w1*(Cy), I+2);
            update(w1*(Cz), I+3);
            update(w3*(Cx*Ix + B10), I+4);
            update(w3*(Cy*Ix), I+5);
            update(w3*(Cz*Ix), I+6);
            update(w2*(Ix), I+7);
            update(w3*(Cy*Iy + B10), I+8);
            update(w3*(Cz*Iy), I+9);
            update(w3*(Cx*Iy), I+10);
            update(w2*(Iy), I+11);
            update(w3*(B10 + Cz*Iz), I+12);
            update(w3*(Cx*Iz), I+13);
            update(w3*(Cy*Iz), I+14);
            update(w2*(Iz), I+15);
        }
    }

    /**
     * Permute the 16 values in I in place: the value at position k moves to
     * position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[16]) {
        double scratch[16];
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int k = 0; k < 16; ++k) {
            scratch[k] = I[k];
        }
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int k = 0; k < 16; ++k) {
            I[index(k)] = scratch[k];
        }
    }

    /**
     * Destination slot of the i-th evaluated value.
     * No range check is performed; i must lie in [0, 16).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short dest[16] = {
            0, 1, 2, 3,
            5, 6, 7, 4,
            10, 11, 9, 8,
            15, 13, 14, 12
        };
        return dest[i];
    }

    /**
     * Write all 16 destination slots, in evaluation order, to idx
     * (which must have room for 16 elements).
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int k = 0; k < 16; ++k) {
            idx[k] = index(k);
        }
    }


};

template<>
struct impl<meta::braket<rysq::F, rysq::P, rysq::P, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<3> &t2, const vector<3> &W,
    // 		     const double (&C)[1][1],
	      double (&I)[90]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];



#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

#define pow(x,y) recurrence::pow<y>((x))

 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    update((C[0][0])*W[a]*(Iz*Py*Qx), I+0);
	    update((C[0][0])*W[a]*(Iy*Pz*Qx), I+1);
	    update((C[0][0])*W[a]*(Iz*Px*Qy), I+2);
	    update((C[0][0])*W[a]*(Ix*Pz*Qy), I+3);
	    update((C[0][0])*W[a]*(Ix*Py*Qz), I+4);
	    update((C[0][0])*W[a]*(Iy*Px*Qz), I+5);
	    update((C[0][0])*W[a]*(Dz*Py*(Px + Cx*Xij)), I+6);
	    update((C[0][0])*W[a]*(Cy*Qz*(Px + Cx*Xij)), I+7);
	    update((C[0][0])*W[a]*(Cz*Qy*(Px + Cx*Xij)), I+8);
	    update((C[0][0])*W[a]*(Dy*Pz*(Px + Cx*Xij)), I+9);
	    update((C[0][0])*W[a]*(Cy*Pz*(Dx*Xij + Qx)), I+10);
	    update((C[0][0])*W[a]*(Cz*Py*(Dx*Xij + Qx)), I+11);
	    update((C[0][0])*W[a]*(Cx*Qy*(Cz*Zij + Pz)), I+12);
	    update((C[0][0])*W[a]*(Dy*Px*(Cz*Zij + Pz)), I+13);
	    update((C[0][0])*W[a]*(Cy*Qx*(Cz*Zij + Pz)), I+14);
	    update((C[0][0])*W[a]*(Dx*Py*(Cz*Zij + Pz)), I+15);
	    double f0 = (3*B10 + pow(Cx,2));
	    update((C[0][0])*W[a]*((3*B00*Px*Xij + 4*B00*Cx*f0 + Dx*(3*pow(B10,2) + 3*B10*Cx*(Xij + 2*Cx) + Ix*pow(Cx,3)))), I+16);
	    update((C[0][0])*W[a]*(Cx*Dy*Iz*f0), I+17);
	    update((C[0][0])*W[a]*(Cx*Dz*Iy*f0), I+18);
	    double f10 = (3*B00*Pz + Cz*Dz*(3*B10 + pow(Cz,2)));
	    update((C[0][0])*W[a]*(Ix*f10), I+19);
	    update((C[0][0])*W[a]*(Iy*f10), I+20);
	    double f11 = (3*pow(B10,2) + 3*B10*Cz*(2*Cz + Zij) + Iz*pow(Cz,3));
	    update((C[0][0])*W[a]*(Dx*f11), I+21);
	    update((C[0][0])*W[a]*(Dy*f11), I+22);
	    double f12 = (B00 + Dz*Iz);
	    update((C[0][0])*W[a]*(Cy*Px*f12), I+23);
	    update((C[0][0])*W[a]*(Cx*f0*f12), I+24);
	    update((C[0][0])*W[a]*(Cx*Py*f12), I+25);
	    double f14 = (Dx*(Cx*Ix + B10) + B00*(Xij + 2*Cx));
	    update((C[0][0])*W[a]*(Cy*Cz*f14), I+26);
	    update((C[0][0])*W[a]*(Py*f14), I+27);
	    update((C[0][0])*W[a]*(Pz*f14), I+28);
	    double f16 = (Dy*Iy + B00);
	    update((C[0][0])*W[a]*(Cz*Px*f16), I+29);
	    update((C[0][0])*W[a]*(Cx*Pz*f16), I+30);
	    update((C[0][0])*W[a]*(Cx*f0*f16), I+31);
	    double f17 = (Cy*Iy + B10);
	    update((C[0][0])*W[a]*(Dz*Px*f17), I+32);
	    update((C[0][0])*W[a]*(Cx*Qz*f17), I+33);
	    update((C[0][0])*W[a]*(Cz*Qx*f17), I+34);
	    update((C[0][0])*W[a]*(Dx*Pz*f17), I+35);
	    double f18 = (3*pow(B10,2) + 3*B10*Cx*(Xij + 2*Cx) + Ix*pow(Cx,3));
	    update((C[0][0])*W[a]*(Dy*f18), I+36);
	    update((C[0][0])*W[a]*(Dz*f18), I+37);
	    double f2 = (Dz*Pz + 2*B00*Cz);
	    update((C[0][0])*W[a]*(Cx*Cy*(f2 + Qz*Zij)), I+38);
	    update((C[0][0])*W[a]*(Py*(f2 + Qz*Zij)), I+39);
	    update((C[0][0])*W[a]*(Px*(f2 + Qz*Zij)), I+40);
	    update((C[0][0])*W[a]*(Cy*Ix*f2), I+41);
	    update((C[0][0])*W[a]*(f2*(Px + Cx*Xij)), I+42);
	    update((C[0][0])*W[a]*(Cx*Iy*f2), I+43);
	    update((C[0][0])*W[a]*(f17*f2), I+44);
	    double f20 = (Dx*Px + 2*B00*Cx);
	    update((C[0][0])*W[a]*(f20*(Cz*Zij + Pz)), I+45);
	    update((C[0][0])*W[a]*(Cy*Iz*f20), I+46);
	    update((C[0][0])*W[a]*(Cz*Iy*f20), I+47);
	    update((C[0][0])*W[a]*(f17*f20), I+48);
	    double f22 = (B10*(3*Cx + Xij) + Ix*pow(Cx,2));
	    update((C[0][0])*W[a]*(Cy*Dz*f22), I+49);
	    update((C[0][0])*W[a]*(Cz*Dy*f22), I+50);
	    update((C[0][0])*W[a]*(Qz*f22), I+51);
	    update((C[0][0])*W[a]*(Qy*f22), I+52);
	    double f28 = (Cx*Dx*(3*B10 + pow(Cx,2)) + 3*B00*Px);
	    update((C[0][0])*W[a]*(Iz*f28), I+53);
	    update((C[0][0])*W[a]*(Iy*f28), I+54);
	    double f3 = (Iz*pow(Cz,2) + B10*(3*Cz + Zij));
	    update((C[0][0])*W[a]*(Cx*Dy*f3), I+55);
	    update((C[0][0])*W[a]*(Cy*Dx*f3), I+56);
	    update((C[0][0])*W[a]*(Qy*f3), I+57);
	    update((C[0][0])*W[a]*(Qx*f3), I+58);
	    double f31 = (3*B10 + pow(Cz,2));
	    update((C[0][0])*W[a]*((3*B00*Pz*Zij + Dz*(3*pow(B10,2) + 3*B10*Cz*(2*Cz + Zij) + Iz*pow(Cz,3)) + 4*B00*Cz*f31)), I+59);
	    update((C[0][0])*W[a]*(Cz*f31*(Dx*Xij + Qx)), I+60);
	    update((C[0][0])*W[a]*(Cz*f16*f31), I+61);
	    update((C[0][0])*W[a]*(Cz*Dy*Ix*f31), I+62);
	    update((C[0][0])*W[a]*(Cz*Dx*Iy*f31), I+63);
	    double f32 = (3*B10 + pow(Cy,2));
	    update((C[0][0])*W[a]*((3*B00*Py*Yij + Dy*(3*pow(B10,2) + Iy*pow(Cy,3) + 3*B10*Cy*(Yij + 2*Cy)) + 4*B00*Cy*f32)), I+64);
	    update((C[0][0])*W[a]*(Cy*f32*(Dx*Xij + Qx)), I+65);
	    update((C[0][0])*W[a]*(Cy*Dz*Ix*f32), I+66);
	    update((C[0][0])*W[a]*(Cy*Dx*Iz*f32), I+67);
	    update((C[0][0])*W[a]*(Cy*f12*f32), I+68);
	    double f35 = (3*pow(B10,2) + Iy*pow(Cy,3) + 3*B10*Cy*(Yij + 2*Cy));
	    update((C[0][0])*W[a]*(Dx*f35), I+69);
	    update((C[0][0])*W[a]*(Dz*f35), I+70);
	    double f4 = (Dz*(Iz*pow(Cz,2) + B10*(3*Cz + Zij)) + 3*B00*Pz + 2*B00*Cz*Zij);
	    update((C[0][0])*W[a]*(Cx*f4), I+71);
	    update((C[0][0])*W[a]*(Cy*f4), I+72);
	    double f5 = (B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10));
	    update((C[0][0])*W[a]*(Cx*Cz*f5), I+73);
	    update((C[0][0])*W[a]*(Px*f5), I+74);
	    update((C[0][0])*W[a]*(Pz*f5), I+75);
	    double f6 = (Dx*(B10*(3*Cx + Xij) + Ix*pow(Cx,2)) + 2*B00*Cx*Xij + 3*B00*Px);
	    update((C[0][0])*W[a]*(Cy*f6), I+76);
	    update((C[0][0])*W[a]*(Cz*f6), I+77);
	    double f7 = (Cy*Dy*(3*B10 + pow(Cy,2)) + 3*B00*Py);
	    update((C[0][0])*W[a]*(Cx*(f7 + Yij*(Dy*Py + 2*B00*Cy))), I+78);
	    update((C[0][0])*W[a]*(Cz*(f7 + Yij*(Dy*Py + 2*B00*Cy))), I+79);
	    update((C[0][0])*W[a]*(Ix*f7), I+80);
	    update((C[0][0])*W[a]*(Iz*f7), I+81);
	    double f8 = (Dy*Py + 2*B00*Cy);
	    update((C[0][0])*W[a]*(f8*(Cz*Zij + Pz)), I+82);
	    update((C[0][0])*W[a]*(f8*(Px + Cx*Xij)), I+83);
	    update((C[0][0])*W[a]*(Cx*Iz*f8), I+84);
	    update((C[0][0])*W[a]*(Cz*Ix*f8), I+85);
	    double f9 = (B10*(3*Cy + Yij) + Iy*pow(Cy,2));
	    update((C[0][0])*W[a]*(Cx*Dz*f9), I+86);
	    update((C[0][0])*W[a]*(Cz*Dx*f9), I+87);
	    update((C[0][0])*W[a]*(Qx*f9), I+88);
	    update((C[0][0])*W[a]*(Qz*f9), I+89);
#undef pow

	}

    }

    BOOST_GPU_ENABLED static
    void reorder(double (&I)[90]) {
	double T[90];
	for (int i = 0; i < 90; ++i) {
	    T[i] = I[i];
	}
	I[25] = T[0];
	I[17] = T[1];
	I[53] = T[2];
	I[38] = T[3];
	I[66] = T[4];
	I[74] = T[5];
	I[65] = T[6];
	I[69] = T[7];
	I[39] = T[8];
	I[37] = T[9];
	I[8] = T[10];
	I[6] = T[11];
	I[59] = T[12];
	I[54] = T[13];
	I[29] = T[14];
	I[26] = T[15];
	I[0] = T[16];
	I[50] = T[17];
	I[70] = T[18];
	I[62] = T[19];
	I[72] = T[20];
	I[22] = T[21];
	I[52] = T[22];
	I[83] = T[23];
	I[80] = T[24];
	I[85] = T[25];
	I[9] = T[26];
	I[5] = T[27];
	I[7] = T[28];
	I[44] = T[29];
	I[47] = T[30];
	I[40] = T[31];
	I[73] = T[32];
	I[79] = T[33];
	I[19] = T[34];
	I[18] = T[35];
	I[30] = T[36];
	I[60] = T[37];
	I[89] = T[38];
	I[86] = T[39];
	I[84] = T[40];
	I[68] = T[41];
	I[67] = T[42];
	I[77] = T[43];
	I[78] = T[44];
	I[24] = T[45];
	I[23] = T[46];
	I[14] = T[47];
	I[13] = T[48];
	I[63] = T[49];
	I[34] = T[50];
	I[64] = T[51];
	I[33] = T[52];
	I[20] = T[53];
	I[10] = T[54];
	I[57] = T[55];
	I[28] = T[56];
	I[58] = T[57];
	I[27] = T[58];
	I[82] = T[59];
	I[2] = T[60];
	I[42] = T[61];
	I[32] = T[62];
	I[12] = T[63];
	I[41] = T[64];
	I[1] = T[65];
	I[61] = T[66];
	I[21] = T[67];
	I[81] = T[68];
	I[11] = T[69];
	I[71] = T[70];
	I[87] = T[71];
	I[88] = T[72];
	I[49] = T[73];
	I[43] = T[74];
	I[48] = T[75];
	I[3] = T[76];
	I[4] = T[77];
	I[45] = T[78];
	I[46] = T[79];
	I[31] = T[80];
	I[51] = T[81];
	I[56] = T[82];
	I[35] = T[83];
	I[55] = T[84];
	I[36] = T[85];
	I[75] = T[86];
	I[16] = T[87];
	I[15] = T[88];
	I[76] = T[89];
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[90] = { 16, 65, 60, 76, 77, 27, 11, 28, 10, 26, 54, 69, 63, 48, 47, 88, 87, 1, 35, 34, 53, 67, 21, 46, 45, 0, 15, 58, 56, 14, 36, 80, 62, 52, 50, 83, 85, 9, 3, 8, 31, 64, 61, 74, 29, 78, 79, 30, 75, 73, 17, 81, 22, 2, 13, 84, 82, 55, 57, 12, 37, 66, 19, 49, 51, 6, 4, 42, 41, 7, 18, 70, 20, 32, 5, 86, 89, 43, 44, 33, 24, 68, 59, 23, 40, 25, 39, 71, 72, 38 };
// 	if (index < 90) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Looks up where the i-th generated value is placed after reordering.
     * No range check is performed: i must satisfy 0 <= i < 90.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short order[] = {
	    25, 17, 53, 38, 66, 74, 65, 69, 39, 37,
	    8, 6, 59, 54, 29, 26, 0, 50, 70, 62,
	    72, 22, 52, 83, 80, 85, 9, 5, 7, 44,
	    47, 40, 73, 79, 19, 18, 30, 60, 89, 86,
	    84, 68, 67, 77, 78, 24, 23, 14, 13, 63,
	    34, 64, 33, 20, 10, 57, 28, 58, 27, 82,
	    2, 42, 32, 12, 41, 1, 61, 21, 81, 11,
	    71, 87, 88, 49, 43, 48, 3, 4, 45, 46,
	    31, 51, 56, 35, 55, 36, 75, 16, 15, 76
	};
	return order[i];
    }

    /**
     * Writes the 90 reordered positions, in generated order, to
     * idx[0] .. idx[89]. The caller must supply room for 90 elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int order[] = {
	    25, 17, 53, 38, 66, 74, 65, 69, 39, 37,
	    8, 6, 59, 54, 29, 26, 0, 50, 70, 62,
	    72, 22, 52, 83, 80, 85, 9, 5, 7, 44,
	    47, 40, 73, 79, 19, 18, 30, 60, 89, 86,
	    84, 68, 67, 77, 78, 24, 23, 14, 13, 63,
	    34, 64, 33, 20, 10, 57, 28, 58, 27, 82,
	    2, 42, 32, 12, 41, 1, 61, 21, 81, 11,
	    71, 87, 88, 49, 43, 48, 3, 4, 45, 46,
	    31, 51, 56, 35, 55, 36, 75, 16, 15, 76
	};
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int k = 0; k < 90; ++k) {
	    *idx++ = order[k];
	}
    }


};

/**
 * Quadrature kernel specialization for the braket <SP, F | S, S>.
 *
 * eval() accumulates 40 values into I[0..39] in generated order;
 * reorder() then moves entry k to position index(k).
 * @warning AUTOMATICALLY GENERATED
 */
template<>
struct impl<meta::braket<rysq::SP, rysq::F, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    /// Point count the fixed-size eval() overload passes on as M.
    static const int N = 3;

    /// Default accumulation policy: adds the contribution q into *Q.
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: runs the generic kernel over N points with
     * the default `update` policy, accumulating into the 40-element I.
     */
    template<
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double (&I)[40]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer entry point: same as the generic kernel with the default
     * `update` policy; M is the number of points of t2/W to process.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Generic kernel. For each point a in [0, M) it derives B10 and the
     * Cx/Cy/Cz recurrence coefficients from t2[a], then hands 40 products,
     * each scaled by W[a] and by C[0][0] or C[0][1], to `update` together
     * with the destination I+k (k = 0..39, generated order).
     *
     * rBk and rkl are accepted for signature uniformity but not read here.
     * The arithmetic below is generated; statement order is kept as emitted.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	// Components of rij, added to the C coefficients below.
	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Within this loop body pow(x,n) means the compile-time
	    // integer power recurrence::pow<n>(x), not the math library pow.
#define pow(x,y) recurrence::pow<y>((x))

	    double Ix = (Cx + Xij);
	    double Iy = (Cy + Yij);
	    double Iz = (Cz + Zij);

	    update((C[0][0])*W[a]*(Ix*Iy*Iz), I+0);
	    double f1 = (B10*(3*Cx + 2*Xij) + Cx*pow(Ix,2));
	    update((C[0][1])*W[a]*(Iz*f1), I+1);
	    update((C[0][1])*W[a]*(Iy*f1), I+2);
	    double f10 = (B10 + pow(Iz,2));
	    update((C[0][1])*W[a]*(Cy*Ix*f10), I+3);
	    update((C[0][0])*W[a]*(Ix*f10), I+4);
	    update((C[0][1])*W[a]*(Cx*Iy*f10), I+5);
	    update((C[0][0])*W[a]*(Iy*f10), I+6);
	    double f11 = 3*pow(B10,2);
	    update((C[0][1])*W[a]*((f11 + 3*B10*Ix*(Xij + 2*Cx) + Cx*pow(Ix,3))), I+7);
	    update((C[0][1])*W[a]*((f11 + Cz*pow(Iz,3) + 3*B10*Iz*(2*Cz + Zij))), I+8);
	    update((C[0][1])*W[a]*((f11 + Cy*pow(Iy,3) + 3*B10*Iy*(Yij + 2*Cy))), I+9);
	    double f12 = (Cx*Ix + B10);
	    update((C[0][1])*W[a]*(Iy*Iz*f12), I+10);
	    update((C[0][1])*W[a]*(f10*f12), I+11);
	    double f13 = (3*B10 + pow(Iz,2));
	    update((C[0][1])*W[a]*(Cy*Iz*f13), I+12);
	    update((C[0][1])*W[a]*(Cx*Iz*f13), I+13);
	    update((C[0][0])*W[a]*(Iz*f13), I+14);
	    double f14 = (3*B10 + pow(Ix,2));
	    update((C[0][1])*W[a]*(Cz*Ix*f14), I+15);
	    update((C[0][1])*W[a]*(Cy*Ix*f14), I+16);
	    update((C[0][0])*W[a]*(Ix*f14), I+17);
	    double f15 = (B10 + Cz*Iz);
	    update((C[0][1])*W[a]*(Ix*Iy*f15), I+18);
	    double f3 = (3*B10 + pow(Iy,2));
	    update((C[0][1])*W[a]*(Cz*Iy*f3), I+19);
	    update((C[0][1])*W[a]*(Cx*Iy*f3), I+20);
	    update((C[0][0])*W[a]*(Iy*f3), I+21);
	    double f4 = (B10*(3*Cz + 2*Zij) + Cz*pow(Iz,2));
	    update((C[0][1])*W[a]*(Ix*f4), I+22);
	    update((C[0][1])*W[a]*(Iy*f4), I+23);
	    double f5 = (Cy*Iy + B10);
	    update((C[0][1])*W[a]*(Ix*Iz*f5), I+24);
	    update((C[0][1])*W[a]*(f10*f5), I+25);
	    double f6 = (Cy*pow(Iy,2) + B10*(3*Cy + 2*Yij));
	    update((C[0][1])*W[a]*(Ix*f6), I+26);
	    update((C[0][1])*W[a]*(Iz*f6), I+27);
	    double f8 = (B10 + pow(Iy,2));
	    update((C[0][1])*W[a]*(Cx*Iz*f8), I+28);
	    update((C[0][0])*W[a]*(Iz*f8), I+29);
	    update((C[0][1])*W[a]*(f15*f8), I+30);
	    update((C[0][1])*W[a]*(f12*f8), I+31);
	    update((C[0][0])*W[a]*(Ix*f8), I+32);
	    update((C[0][1])*W[a]*(Cz*Ix*f8), I+33);
	    double f9 = (B10 + pow(Ix,2));
	    update((C[0][1])*W[a]*(f15*f9), I+34);
	    update((C[0][1])*W[a]*(f5*f9), I+35);
	    update((C[0][0])*W[a]*(Iz*f9), I+36);
	    update((C[0][1])*W[a]*(Cy*Iz*f9), I+37);
	    update((C[0][0])*W[a]*(Iy*f9), I+38);
	    update((C[0][1])*W[a]*(Cz*Iy*f9), I+39);
#undef pow

	}

    }

    /**
     * Permutes I in place: the value at generated position k ends up at
     * position index(k). A scratch copy is taken first so that no source
     * entry is overwritten before it has been moved.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[40]) {
	double T[40];
	for (int i = 0; i < 40; ++i) {
	    T[i] = I[i];
	}
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int i = 0; i < 40; ++i) {
	    I[index(i)] = T[i];
	}
    }

    /**
     * Destination position of the i-th generated value.
     * No range check is performed: i must satisfy 0 <= i < 40.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short order[] = {
	    36, 17, 13, 30, 28, 33, 32, 1, 11, 6,
	    37, 29, 10, 9, 8, 3, 2, 0, 39, 7,
	    5, 4, 31, 35, 38, 34, 22, 26, 25, 24,
	    27, 21, 20, 23, 19, 14, 16, 18, 12, 15
	};
	return order[i];
    }

    /// Writes index(0) .. index(39) to idx[0] .. idx[39].
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int k = 0; k < 40; ++k) {
	    *idx++ = index(k);
	}
    }


};

template<>
struct impl<meta::braket<rysq::F, rysq::S, rysq::F, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 4;

    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<4> &t2, const vector<4> &W,
    // 		     const double (&C)[1][1],
	      double (&I)[100]) {
	eval<4>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {




#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

#define pow(x,y) recurrence::pow<y>((x))

 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);
 	    double Rx = (B01 + pow(Dx,2));
 	    double Ry = (B01 + pow(Dy,2));
 	    double Rz = (pow(Dz,2) + B01);

	    update((C[0][0])*W[a]*(Qx*Qy*Qz), I+0);
	    update((C[0][0])*W[a]*(Pz*Qy*Rx), I+1);
	    update((C[0][0])*W[a]*(Py*Qz*Rx), I+2);
	    update((C[0][0])*W[a]*(Pz*Qx*Ry), I+3);
	    update((C[0][0])*W[a]*(Px*Qz*Ry), I+4);
	    update((C[0][0])*W[a]*(Py*Qx*Rz), I+5);
	    update((C[0][0])*W[a]*(Px*Qy*Rz), I+6);
	    double f0 = (4*B00*Cy*Dy + Py*Ry + 2*pow(B00,2));
	    update((C[0][0])*W[a]*(Cz*Dx*f0), I+7);
	    update((C[0][0])*W[a]*(Cx*Dz*f0), I+8);
	    update((C[0][0])*W[a]*(Qx*f0), I+9);
	    update((C[0][0])*W[a]*(Qz*f0), I+10);
	    double f1 = (3*B10 + pow(Cx,2));
	    update((C[0][0])*W[a]*(Cx*Dy*Rz*f1), I+11);
	    update((C[0][0])*W[a]*(Cx*Dz*Ry*f1), I+12);
	    double f11 = (2*B00*Dz + Cz*Rz);
	    update((C[0][0])*W[a]*(Cy*Qx*f11), I+13);
	    update((C[0][0])*W[a]*(Cx*Qy*f11), I+14);
	    update((C[0][0])*W[a]*(Dy*Px*f11), I+15);
	    update((C[0][0])*W[a]*(Dx*Py*f11), I+16);
	    double f12 = (pow(Dz,2) + 3*B01);
	    update((C[0][0])*W[a]*(Cx*Dz*f1*f12), I+17);
	    update((C[0][0])*W[a]*(Cx*Dz*Py*f12), I+18);
	    update((C[0][0])*W[a]*(Cy*Dz*Px*f12), I+19);
	    double f13 = (6*Cy*pow(B00,2) + Cy*Ry*(3*B10 + pow(Cy,2)) + 6*B00*Dy*Py);
	    update((C[0][0])*W[a]*(Dx*f13), I+20);
	    update((C[0][0])*W[a]*(Dz*f13), I+21);
	    double f15 = (3*B01 + pow(Dy,2));
	    update((C[0][0])*W[a]*(Cx*Dy*f1*f15), I+22);
	    update((C[0][0])*W[a]*(Cx*Dy*Pz*f15), I+23);
	    update((C[0][0])*W[a]*(Cz*Dy*Px*f15), I+24);
	    double f2 = (6*Dz*pow(B00,2) + Dz*Pz*(pow(Dz,2) + 3*B01) + 6*B00*Cz*Rz);
	    update((C[0][0])*W[a]*(Cx*f2), I+25);
	    update((C[0][0])*W[a]*(Cy*f2), I+26);
	    double f16 = 9*B00*B01*B10;
	    double f21 = (3*B01 + pow(Dx,2));
	    double f19 = 6*pow(B00,3);
	    update((C[0][0])*W[a]*((18*Cx*Dx*pow(B00,2) + f16 + f19 + Cx*Dx*f1*f21 + 9*B00*(B01*pow(Cx,2) + Px*pow(Dx,2)))), I+27);
	    update((C[0][0])*W[a]*(Cy*Dx*Pz*f21), I+28);
	    update((C[0][0])*W[a]*(Cz*Dx*Py*f21), I+29);
	    double f22 = (3*B00*Pz + Cz*Dz*(3*B10 + pow(Cz,2)));
	    update((C[0][0])*W[a]*(Dx*Dy*f22), I+30);
	    update((C[0][0])*W[a]*(Rx*f22), I+31);
	    update((C[0][0])*W[a]*(Ry*f22), I+32);
	    double f23 = (Dy*Py + 2*B00*Cy);
	    update((C[0][0])*W[a]*(Cz*Rx*f23), I+33);
	    update((C[0][0])*W[a]*(Dz*Qx*f23), I+34);
	    update((C[0][0])*W[a]*(Dx*Qz*f23), I+35);
	    update((C[0][0])*W[a]*(Cx*Rz*f23), I+36);
	    update((C[0][0])*W[a]*(f11*f23), I+37);
	    double f25 = (2*B00*Dx + Cx*Rx);
	    update((C[0][0])*W[a]*(Cy*Qz*f25), I+38);
	    update((C[0][0])*W[a]*(Cz*Qy*f25), I+39);
	    update((C[0][0])*W[a]*(Dy*Pz*f25), I+40);
	    update((C[0][0])*W[a]*(Dz*Py*f25), I+41);
	    update((C[0][0])*W[a]*(f23*f25), I+42);
	    double f26 = (3*B00*Rx + Cx*Dx*(3*B01 + pow(Dx,2)));
	    update((C[0][0])*W[a]*(Cy*Cz*f26), I+43);
	    update((C[0][0])*W[a]*(Py*f26), I+44);
	    update((C[0][0])*W[a]*(Pz*f26), I+45);
	    double f27 = (Dx*Px + 2*B00*Cx);
	    update((C[0][0])*W[a]*(Cy*Rz*f27), I+46);
	    update((C[0][0])*W[a]*(Cz*Ry*f27), I+47);
	    update((C[0][0])*W[a]*(Dz*Qy*f27), I+48);
	    update((C[0][0])*W[a]*(Dy*Qz*f27), I+49);
	    update((C[0][0])*W[a]*(f11*f27), I+50);
	    double f28 = (Cy*Dy*(3*B01 + pow(Dy,2)) + 3*B00*Ry);
	    update((C[0][0])*W[a]*(Cx*Cz*f28), I+51);
	    update((C[0][0])*W[a]*(Pz*f28), I+52);
	    update((C[0][0])*W[a]*(Px*f28), I+53);
	    double f29 = (6*Cx*pow(B00,2) + Cx*Rx*(3*B10 + pow(Cx,2)) + 6*B00*Dx*Px);
	    update((C[0][0])*W[a]*(Dz*f29), I+54);
	    update((C[0][0])*W[a]*(Dy*f29), I+55);
	    double f3 = (Dz*Pz + 2*B00*Cz);
	    update((C[0][0])*W[a]*(Cx*Ry*f3), I+56);
	    update((C[0][0])*W[a]*(Cy*Rx*f3), I+57);
	    update((C[0][0])*W[a]*(Dy*Qx*f3), I+58);
	    update((C[0][0])*W[a]*(Dx*Qy*f3), I+59);
	    update((C[0][0])*W[a]*(f25*f3), I+60);
	    double f32 = (Cx*Dx*(3*B10 + pow(Cx,2)) + 3*B00*Px);
	    update((C[0][0])*W[a]*(Dy*Dz*f32), I+61);
	    update((C[0][0])*W[a]*(Ry*f32), I+62);
	    update((C[0][0])*W[a]*(Rz*f32), I+63);
	    double f33 = (3*B10 + pow(Cz,2));
	    update((C[0][0])*W[a]*((Cz*Dz*f12*f33 + f16 + f19 + 9*B00*(B01*pow(Cz,2) + Pz*pow(Dz,2)) + 18*Cz*Dz*pow(B00,2))), I+64);
	    update((C[0][0])*W[a]*(Cz*Dx*Ry*f33), I+65);
	    update((C[0][0])*W[a]*(Cz*Dy*Rx*f33), I+66);
	    update((C[0][0])*W[a]*(Cz*Dy*f15*f33), I+67);
	    update((C[0][0])*W[a]*(Cz*Dx*f21*f33), I+68);
	    double f34 = (3*B10 + pow(Cy,2));
	    update((C[0][0])*W[a]*((f16 + f19 + Cy*Dy*f15*f34 + 9*B00*(B01*pow(Cy,2) + Py*pow(Dy,2)) + 18*Cy*Dy*pow(B00,2))), I+69);
	    update((C[0][0])*W[a]*(Cy*Dx*Rz*f34), I+70);
	    update((C[0][0])*W[a]*(Cy*Dx*f21*f34), I+71);
	    update((C[0][0])*W[a]*(Cy*Dz*Rx*f34), I+72);
	    update((C[0][0])*W[a]*(Cy*Dz*f12*f34), I+73);
	    double f35 = (2*pow(B00,2) + Pz*Rz + 4*B00*Cz*Dz);
	    update((C[0][0])*W[a]*(Cy*Dx*f35), I+74);
	    update((C[0][0])*W[a]*(Cx*Dy*f35), I+75);
	    update((C[0][0])*W[a]*(Qx*f35), I+76);
	    update((C[0][0])*W[a]*(Qy*f35), I+77);
	    double f36 = (6*B00*Dz*Pz + Cz*Rz*(3*B10 + pow(Cz,2)) + 6*Cz*pow(B00,2));
	    update((C[0][0])*W[a]*(Dx*f36), I+78);
	    update((C[0][0])*W[a]*(Dy*f36), I+79);
	    double f4 = (2*B00*Dy + Cy*Ry);
	    update((C[0][0])*W[a]*(Cx*Qz*f4), I+80);
	    update((C[0][0])*W[a]*(Cz*Qx*f4), I+81);
	    update((C[0][0])*W[a]*(Dx*Pz*f4), I+82);
	    update((C[0][0])*W[a]*(Dz*Px*f4), I+83);
	    update((C[0][0])*W[a]*(f27*f4), I+84);
	    update((C[0][0])*W[a]*(f3*f4), I+85);
	    double f5 = (Dx*Px*(3*B01 + pow(Dx,2)) + 6*Dx*pow(B00,2) + 6*B00*Cx*Rx);
	    update((C[0][0])*W[a]*(Cy*f5), I+86);
	    update((C[0][0])*W[a]*(Cz*f5), I+87);
	    double f6 = (Px*Rx + 2*pow(B00,2) + 4*B00*Cx*Dx);
	    update((C[0][0])*W[a]*(Cz*Dy*f6), I+88);
	    update((C[0][0])*W[a]*(Cy*Dz*f6), I+89);
	    update((C[0][0])*W[a]*(Qz*f6), I+90);
	    update((C[0][0])*W[a]*(Qy*f6), I+91);
	    double f7 = (6*B00*Cy*Ry + 6*Dy*pow(B00,2) + Dy*Py*(3*B01 + pow(Dy,2)));
	    update((C[0][0])*W[a]*(Cz*f7), I+92);
	    update((C[0][0])*W[a]*(Cx*f7), I+93);
	    double f8 = (Cy*Dy*(3*B10 + pow(Cy,2)) + 3*B00*Py);
	    update((C[0][0])*W[a]*(Dx*Dz*f8), I+94);
	    update((C[0][0])*W[a]*(Rx*f8), I+95);
	    update((C[0][0])*W[a]*(Rz*f8), I+96);
	    double f9 = (Cz*Dz*(pow(Dz,2) + 3*B01) + 3*B00*Rz);
	    update((C[0][0])*W[a]*(Cx*Cy*f9), I+97);
	    update((C[0][0])*W[a]*(Px*f9), I+98);
	    update((C[0][0])*W[a]*(Py*f9), I+99);
#undef pow

	}

    }

    BOOST_GPU_ENABLED static
    void reorder(double (&I)[100]) {
	double T[100];
	for (int i = 0; i < 100; ++i) {
	    T[i] = I[i];
	}
	I[99] = T[0];
	I[38] = T[1];
	I[46] = T[2];
	I[57] = T[3];
	I[64] = T[4];
	I[75] = T[5];
	I[83] = T[6];
	I[56] = T[7];
	I[65] = T[8];
	I[55] = T[9];
	I[66] = T[10];
	I[80] = T[11];
	I[60] = T[12];
	I[79] = T[13];
	I[89] = T[14];
	I[84] = T[15];
	I[76] = T[16];
	I[20] = T[17];
	I[25] = T[18];
	I[23] = T[19];
	I[51] = T[20];
	I[61] = T[21];
	I[10] = T[22];
	I[17] = T[23];
	I[14] = T[24];
	I[27] = T[25];
	I[28] = T[26];
	I[0] = T[27];
	I[8] = T[28];
	I[6] = T[29];
	I[92] = T[30];
	I[42] = T[31];
	I[62] = T[32];
	I[36] = T[33];
	I[95] = T[34];
	I[96] = T[35];
	I[85] = T[36];
	I[86] = T[37];
	I[49] = T[38];
	I[39] = T[39];
	I[37] = T[40];
	I[45] = T[41];
	I[35] = T[42];
	I[9] = T[43];
	I[5] = T[44];
	I[7] = T[45];
	I[73] = T[46];
	I[54] = T[47];
	I[93] = T[48];
	I[94] = T[49];
	I[74] = T[50];
	I[19] = T[51];
	I[18] = T[52];
	I[13] = T[53];
	I[40] = T[54];
	I[30] = T[55];
	I[67] = T[56];
	I[48] = T[57];
	I[97] = T[58];
	I[98] = T[59];
	I[47] = T[60];
	I[90] = T[61];
	I[50] = T[62];
	I[70] = T[63];
	I[22] = T[64];
	I[52] = T[65];
	I[32] = T[66];
	I[12] = T[67];
	I[2] = T[68];
	I[11] = T[69];
	I[71] = T[70];
	I[1] = T[71];
	I[41] = T[72];
	I[21] = T[73];
	I[78] = T[74];
	I[87] = T[75];
	I[77] = T[76];
	I[88] = T[77];
	I[72] = T[78];
	I[82] = T[79];
	I[69] = T[80];
	I[59] = T[81];
	I[58] = T[82];
	I[63] = T[83];
	I[53] = T[84];
	I[68] = T[85];
	I[3] = T[86];
	I[4] = T[87];
	I[34] = T[88];
	I[43] = T[89];
	I[44] = T[90];
	I[33] = T[91];
	I[16] = T[92];
	I[15] = T[93];
	I[91] = T[94];
	I[31] = T[95];
	I[81] = T[96];
	I[29] = T[97];
	I[24] = T[98];
	I[26] = T[99];
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[100] = { 27, 71, 68, 86, 87, 44, 29, 45, 28, 43, 22, 69, 67, 53, 24, 93, 92, 23, 52, 51, 17, 73, 64, 19, 98, 18, 99, 25, 26, 97, 55, 95, 66, 91, 88, 42, 33, 40, 1, 39, 54, 72, 31, 89, 90, 41, 2, 60, 57, 38, 62, 20, 65, 84, 47, 9, 7, 3, 82, 81, 12, 21, 32, 83, 4, 8, 10, 56, 85, 80, 63, 70, 78, 46, 50, 5, 16, 76, 74, 13, 11, 96, 79, 6, 15, 36, 37, 75, 77, 14, 61, 94, 30, 48, 49, 34, 35, 58, 59, 0 };
// 	if (index < 100) return index_[index];
// 	return size_t(-1);
//     }

    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    99, 38, 46, 57, 64, 75, 83, 56, 65, 55, 66, 80, 60, 79, 89, 84, 76, 20, 25, 23, 51, 61, 10, 17, 14, 27, 28, 0, 8, 6, 92, 42, 62, 36, 95, 96, 85, 86, 49, 39, 37, 45, 35, 9, 5, 7, 73, 54, 93, 94, 74, 19, 18, 13, 40, 30, 67, 48, 97, 98, 47, 90, 50, 70, 22, 52, 32, 12, 2, 11, 71, 1, 41, 21, 78, 87, 77, 88, 72, 82, 69, 59, 58, 63, 53, 68, 3, 4, 34, 43, 44, 33, 16, 15, 91, 31, 81, 29, 24, 26
	};
	return index[i];
    }

    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 99;
	*idx++ = 38;
	*idx++ = 46;
	*idx++ = 57;
	*idx++ = 64;
	*idx++ = 75;
	*idx++ = 83;
	*idx++ = 56;
	*idx++ = 65;
	*idx++ = 55;
	*idx++ = 66;
	*idx++ = 80;
	*idx++ = 60;
	*idx++ = 79;
	*idx++ = 89;
	*idx++ = 84;
	*idx++ = 76;
	*idx++ = 20;
	*idx++ = 25;
	*idx++ = 23;
	*idx++ = 51;
	*idx++ = 61;
	*idx++ = 10;
	*idx++ = 17;
	*idx++ = 14;
	*idx++ = 27;
	*idx++ = 28;
	*idx++ = 0;
	*idx++ = 8;
	*idx++ = 6;
	*idx++ = 92;
	*idx++ = 42;
	*idx++ = 62;
	*idx++ = 36;
	*idx++ = 95;
	*idx++ = 96;
	*idx++ = 85;
	*idx++ = 86;
	*idx++ = 49;
	*idx++ = 39;
	*idx++ = 37;
	*idx++ = 45;
	*idx++ = 35;
	*idx++ = 9;
	*idx++ = 5;
	*idx++ = 7;
	*idx++ = 73;
	*idx++ = 54;
	*idx++ = 93;
	*idx++ = 94;
	*idx++ = 74;
	*idx++ = 19;
	*idx++ = 18;
	*idx++ = 13;
	*idx++ = 40;
	*idx++ = 30;
	*idx++ = 67;
	*idx++ = 48;
	*idx++ = 97;
	*idx++ = 98;
	*idx++ = 47;
	*idx++ = 90;
	*idx++ = 50;
	*idx++ = 70;
	*idx++ = 22;
	*idx++ = 52;
	*idx++ = 32;
	*idx++ = 12;
	*idx++ = 2;
	*idx++ = 11;
	*idx++ = 71;
	*idx++ = 1;
	*idx++ = 41;
	*idx++ = 21;
	*idx++ = 78;
	*idx++ = 87;
	*idx++ = 77;
	*idx++ = 88;
	*idx++ = 72;
	*idx++ = 82;
	*idx++ = 69;
	*idx++ = 59;
	*idx++ = 58;
	*idx++ = 63;
	*idx++ = 53;
	*idx++ = 68;
	*idx++ = 3;
	*idx++ = 4;
	*idx++ = 34;
	*idx++ = 43;
	*idx++ = 44;
	*idx++ = 33;
	*idx++ = 16;
	*idx++ = 15;
	*idx++ = 91;
	*idx++ = 31;
	*idx++ = 81;
	*idx++ = 29;
	*idx++ = 24;
	*idx++ = 26;
    }


};

/**
 * Quadrature kernel specialization for the braket <P, F | S, S>.
 *
 * eval() accumulates 30 values into I[0..29] in generated order;
 * reorder() then moves entry k to position index(k).
 * @warning AUTOMATICALLY GENERATED
 */
template<>
struct impl<meta::braket<rysq::P, rysq::F, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    /// Point count the fixed-size eval() overload passes on as M.
    static const int N = 3;

    /// Default accumulation policy: adds the contribution q into *Q.
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: runs the generic kernel over N points with
     * the default `update` policy, accumulating into the 30-element I.
     */
    template<
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[30]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer entry point: same as the generic kernel with the default
     * `update` policy; M is the number of points of t2/W to process.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Generic kernel. For each point a in [0, M) it derives B10 and the
     * Cx/Cy/Cz recurrence coefficients from t2[a], then hands 30 products,
     * each scaled by C[0][0]*W[a], to `update` together with the
     * destination I+k (k = 0..29, generated order).
     *
     * rBk and rkl are accepted for signature uniformity but not read here.
     * The arithmetic below is generated; statement order is kept as emitted.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	// Components of rij, added to the C coefficients below.
	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Within this loop body pow(x,n) means the compile-time
	    // integer power recurrence::pow<n>(x), not the math library pow.
#define pow(x,y) recurrence::pow<y>((x))

	    double Ix = (Cx + Xij);
	    double Iy = (Cy + Yij);
	    double Iz = (Cz + Zij);

	    double f1 = (B10*(3*Cx + 2*Xij) + Cx*pow(Ix,2));
	    update((C[0][0])*W[a]*(Iz*f1), I+0);
	    update((C[0][0])*W[a]*(Iy*f1), I+1);
	    double f10 = (B10 + pow(Iz,2));
	    update((C[0][0])*W[a]*(Cy*Ix*f10), I+2);
	    update((C[0][0])*W[a]*(Cx*Iy*f10), I+3);
	    double f11 = 3*pow(B10,2);
	    update((C[0][0])*W[a]*((f11 + Cy*pow(Iy,3) + 3*B10*Iy*(Yij + 2*Cy))), I+4);
	    update((C[0][0])*W[a]*((f11 + Cz*pow(Iz,3) + 3*B10*Iz*(2*Cz + Zij))), I+5);
	    update((C[0][0])*W[a]*((f11 + 3*B10*Ix*(Xij + 2*Cx) + Cx*pow(Ix,3))), I+6);
	    double f12 = (Cx*Ix + B10);
	    update((C[0][0])*W[a]*(Iy*Iz*f12), I+7);
	    update((C[0][0])*W[a]*(f10*f12), I+8);
	    double f13 = (3*B10 + pow(Iz,2));
	    update((C[0][0])*W[a]*(Cy*Iz*f13), I+9);
	    update((C[0][0])*W[a]*(Cx*Iz*f13), I+10);
	    double f14 = (3*B10 + pow(Ix,2));
	    update((C[0][0])*W[a]*(Cz*Ix*f14), I+11);
	    update((C[0][0])*W[a]*(Cy*Ix*f14), I+12);
	    double f15 = (B10 + Cz*Iz);
	    update((C[0][0])*W[a]*(Ix*Iy*f15), I+13);
	    double f3 = (3*B10 + pow(Iy,2));
	    update((C[0][0])*W[a]*(Cx*Iy*f3), I+14);
	    update((C[0][0])*W[a]*(Cz*Iy*f3), I+15);
	    double f4 = (B10*(3*Cz + 2*Zij) + Cz*pow(Iz,2));
	    update((C[0][0])*W[a]*(Ix*f4), I+16);
	    update((C[0][0])*W[a]*(Iy*f4), I+17);
	    double f5 = (Cy*Iy + B10);
	    update((C[0][0])*W[a]*(Ix*Iz*f5), I+18);
	    update((C[0][0])*W[a]*(f10*f5), I+19);
	    double f6 = (Cy*pow(Iy,2) + B10*(3*Cy + 2*Yij));
	    update((C[0][0])*W[a]*(Ix*f6), I+20);
	    update((C[0][0])*W[a]*(Iz*f6), I+21);
	    double f8 = (B10 + pow(Iy,2));
	    update((C[0][0])*W[a]*(Cx*Iz*f8), I+22);
	    update((C[0][0])*W[a]*(Cz*Ix*f8), I+23);
	    update((C[0][0])*W[a]*(f15*f8), I+24);
	    update((C[0][0])*W[a]*(f12*f8), I+25);
	    double f9 = (B10 + pow(Ix,2));
	    update((C[0][0])*W[a]*(Cy*Iz*f9), I+26);
	    update((C[0][0])*W[a]*(Cz*Iy*f9), I+27);
	    update((C[0][0])*W[a]*(f5*f9), I+28);
	    update((C[0][0])*W[a]*(f15*f9), I+29);
#undef pow

	}

    }

    /**
     * Permutes I in place: the value at generated position k ends up at
     * position index(k). A scratch copy is taken first so that no source
     * entry is overwritten before it has been moved.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[30]) {
	double T[30];
	for (int i = 0; i < 30; ++i) {
	    T[i] = I[i];
	}
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int i = 0; i < 30; ++i) {
	    I[index(i)] = T[i];
	}
    }

    /**
     * Destination position of the i-th generated value.
     * No range check is performed: i must satisfy 0 <= i < 30.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short order[] = {
	    12, 9, 22, 24, 4, 8, 0, 27, 21, 7,
	    6, 2, 1, 29, 3, 5, 23, 26, 28, 25,
	    16, 19, 18, 17, 20, 15, 13, 11, 10, 14
	};
	return order[i];
    }

    /// Writes index(0) .. index(29) to idx[0] .. idx[29].
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int k = 0; k < 30; ++k) {
	    *idx++ = index(k);
	}
    }


};

template<>
struct impl<meta::braket<rysq::P, rysq::SP, rysq::S, rysq::P> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 2;

    /// Default accumulation policy: adds the contribution q to the value
    /// stored at Q.
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q = *Q + q;
	}
    };

    /**
     * Fixed-size entry point: forwards every argument to the generic
     * eval<2> overload together with a default `update` accumulator,
     * writing into the 36-element array I.
     */
    template<
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double (&I)[36]) {
	const update accumulate = update();
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Pointer entry point: forwards every argument to the generic eval<M>
     * overload together with a default `update` accumulator. M is the
     * number of points of t2/W the generic overload iterates over.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Kernel of this specialization: emits 36 weighted terms per loop
     * iteration into I[0..35].
     *
     * For each a in [0, M) the code derives B00, B10 and the C*/D*
     * recurrence coefficients from t2[a], forms I* = C* + rij[*] and
     * K* = D* + rkl[*], and passes every product to
     * `update(value, I + k)`. Each term is scaled by W[a] and by either
     * C[0][0] or C[0][1].
     *
     * @tparam M  number of iterations, i.e. how many entries of t2 and W
     *            are read
     * @tparam U  functor type callable as update(double, double*)
     * @param I   output buffer; slots I+0 .. I+35 are handed to `update`
     * @param update  decides how a term is combined with its slot
     *
     * NOTE(review): t2 and W are presumably the quadrature roots and
     * weights -- confirm against the caller.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	// Component aliases of rij; they do not change across iterations.
	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	// Component aliases of rkl.
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Scalar recurrence coefficients for this iteration.
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    // Per-axis coefficients: rAi[q] - B*rAB[q]*t2[a].
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Per-axis coefficients: rBk[q] + A*rAB[q]*t2[a] (B passed as -A).
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Generator shorthand for recurrence::pow; the expressions in this
	    // specialization do not use it. Removed again by the #undef below.
#define pow(x,y) recurrence::pow<y>((x))

 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);

	    // Terms are emitted in generator order; reorder()/index() describe
	    // the permutation applied to these slots afterwards.
	    update((C[0][1])*W[a]*(Cy*Iz*Kx), I+0);
	    update((C[0][0])*W[a]*(Cy*Kx), I+1);
	    update((C[0][1])*W[a]*(Cz*Iy*Kx), I+2);
	    update((C[0][0])*W[a]*(Cz*Kx), I+3);
	    update((C[0][1])*W[a]*(Cz*Ix*Ky), I+4);
	    update((C[0][0])*W[a]*(Cz*Ky), I+5);
	    update((C[0][1])*W[a]*(Cx*Iz*Ky), I+6);
	    update((C[0][0])*W[a]*(Cx*Ky), I+7);
	    update((C[0][1])*W[a]*(Ix*(B00 + Cz*Kz)), I+8);
	    update((C[0][1])*W[a]*(Iy*(B00 + Cz*Kz)), I+9);
	    update((C[0][0])*W[a]*((B00 + Cz*Kz)), I+10);
	    update((C[0][1])*W[a]*(Cx*Iy*Kz), I+11);
	    update((C[0][1])*W[a]*(Cy*Ix*Kz), I+12);
	    update((C[0][0])*W[a]*(Cy*Kz), I+13);
	    update((C[0][0])*W[a]*(Cx*Kz), I+14);
	    update((C[0][1])*W[a]*((Kx*(Cx*Ix + B10) + B00*(Xij + 2*Cx))), I+15);
	    update((C[0][1])*W[a]*((B00*(Yij + 2*Cy) + Ky*(Cy*Iy + B10))), I+16);
	    update((C[0][1])*W[a]*((Kz*(B10 + Cz*Iz) + B00*(2*Cz + Zij))), I+17);
	    // f* temporaries hold sub-expressions shared by the terms that follow.
	    double f10 = (B00 + Cx*Kx);
	    update((C[0][1])*W[a]*(Iz*f10), I+18);
	    update((C[0][1])*W[a]*(Iy*f10), I+19);
	    update((C[0][0])*W[a]*(f10), I+20);
	    double f11 = (B00 + Cy*Ky);
	    update((C[0][1])*W[a]*(Ix*f11), I+21);
	    update((C[0][1])*W[a]*(Iz*f11), I+22);
	    update((C[0][0])*W[a]*(f11), I+23);
	    double f12 = (Cx*Ix + B10);
	    update((C[0][1])*W[a]*(Ky*f12), I+24);
	    update((C[0][1])*W[a]*(Kz*f12), I+25);
	    double f14 = (B10 + Cz*Iz);
	    update((C[0][1])*W[a]*(Kx*f14), I+26);
	    update((C[0][1])*W[a]*(Ky*f14), I+27);
	    double f3 = (B00 + Ix*Kx);
	    update((C[0][1])*W[a]*(Cz*f3), I+28);
	    update((C[0][1])*W[a]*(Cy*f3), I+29);
	    double f4 = (Iz*Kz + B00);
	    update((C[0][1])*W[a]*(Cx*f4), I+30);
	    update((C[0][1])*W[a]*(Cy*f4), I+31);
	    double f6 = (Cy*Iy + B10);
	    update((C[0][1])*W[a]*(Kx*f6), I+32);
	    update((C[0][1])*W[a]*(Kz*f6), I+33);
	    double f8 = (B00 + Iy*Ky);
	    update((C[0][1])*W[a]*(Cx*f8), I+34);
	    update((C[0][1])*W[a]*(Cz*f8), I+35);
#undef pow

	}

    }

    /**
     * Permutes the 36 entries of I in place: the value originally stored
     * at position i ends up at position dest[i].
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[36]) {
	// Destination slot for each source position.
	const unsigned short dest[36] = {
	    10, 1, 8, 2, 17, 14, 21, 12, 29, 32, 26, 30,
	    28, 25, 24, 3, 19, 35, 9, 6, 0, 16, 22, 13,
	    15, 27, 11, 23, 5, 4, 33, 34, 7, 31, 18, 20
	};
	// Snapshot first so the scatter below never reads an overwritten slot.
	double T[36];
	for (int i = 0; i < 36; ++i) {
	    T[i] = I[i];
	}
	for (int i = 0; i < 36; ++i) {
	    I[dest[i]] = T[i];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[36] = { 20, 1, 3, 15, 29, 28, 19, 32, 2, 18, 0, 26, 7, 23, 5, 24, 21, 4, 34, 16, 35, 6, 22, 27, 14, 13, 10, 25, 12, 8, 11, 33, 9, 30, 31, 17 };
// 	if (index < 36) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the reordered position of generated entry i.
     *
     * @param i  position in generator order, valid range [0, 36)
     * @return   the destination position, or -1 when i is out of range
     *           (previously an out-of-range i read past the table)
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short table[] = {
	    10, 1, 8, 2, 17, 14, 21, 12, 29, 32, 26, 30, 28, 25, 24, 3, 19, 35, 9, 6, 0, 16, 22, 13, 15, 27, 11, 23, 5, 4, 33, 34, 7, 31, 18, 20
	};
	const int count = int(sizeof(table)/sizeof(table[0]));
	// Guard mirrors the range check of the commented-out reorder(size_t).
	if (i < 0 || i >= count) return -1;
	return table[i];
    }

    /**
     * Writes the complete 36-entry reordering table to idx[0..35].
     * Entry k receives the destination position of generated entry k.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int target[36] = {
	    10, 1, 8, 2, 17, 14, 21, 12, 29, 32, 26, 30,
	    28, 25, 24, 3, 19, 35, 9, 6, 0, 16, 22, 13,
	    15, 27, 11, 23, 5, 4, 33, 34, 7, 31, 18, 20
	};
	for (int k = 0; k < 36; ++k) {
	    idx[k] = target[k];
	}
    }


};

template<>
struct impl<meta::braket<rysq::F, rysq::P, rysq::SP, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /** Default combiner used by eval: adds a term onto its output slot. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] = Q[0] + q;
	}
    };

    /**
     * Fixed-size entry point: runs the kernel with M = 3 iterations and
     * the default accumulating `update` functor, writing into the
     * 120-element array I.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL, class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double (&I)[120]) {
	double *const out = I;	// the kernel overload takes a plain pointer
	const update accumulate = update();
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, out, accumulate);
    }

    /**
     * Convenience overload without an explicit update functor.
     * Forwards every argument unchanged to the functor-taking overload and
     * supplies a default-constructed `update`, which adds each term onto
     * the slot of I it is given.
     */
    template<int M, class RAI, class RBK, class RAB,
	     class RIJ, class RKL, class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Kernel of this specialization: emits 120 weighted terms per loop
     * iteration into I[0..119].
     *
     * For each a in [0, M) the code derives B00, B10 and the C*/D*
     * recurrence coefficients from t2[a], forms the shared intermediates
     * I* = C* + rij[*], P* = B10 + C*^2 and Q* = C* * D* + B00, and passes
     * every product to `update(value, I + k)`. Each term is scaled by W[a]
     * and by either C[0][0] or C[1][0].
     *
     * rkl is accepted for signature uniformity but is not read here.
     *
     * @tparam M  number of iterations, i.e. how many entries of t2 and W
     *            are read
     * @tparam U  functor type callable as update(double, double*)
     * @param I   output buffer; slots I+0 .. I+119 are handed to `update`
     * @param update  decides how a term is combined with its slot
     *
     * NOTE(review): t2 and W are presumably the quadrature roots and
     * weights -- confirm against the caller.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {

	// Component aliases of rij; they do not change across iterations.
	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];



#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Scalar recurrence coefficients for this iteration.
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    // Per-axis coefficients: rAi[q] - B*rAB[q]*t2[a].
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Per-axis coefficients: rBk[q] + A*rAB[q]*t2[a] (B passed as -A).
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Shorthand so the generated expressions can use an integer-power
	    // helper resolved at compile time; removed again by the #undef below.
#define pow(x,y) recurrence::pow<y>((x))

 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    // Terms are emitted in generator order; reorder()/index() describe
	    // the permutation applied to these slots afterwards.
	    update((C[0][0])*W[a]*(Cy*Iz*Px), I+0);
	    update((C[0][0])*W[a]*(Cz*Iy*Px), I+1);
	    update((C[0][0])*W[a]*(Cx*Iz*Py), I+2);
	    update((C[0][0])*W[a]*(Cz*Ix*Py), I+3);
	    update((C[0][0])*W[a]*(Cx*Iy*Pz), I+4);
	    update((C[0][0])*W[a]*(Cy*Ix*Pz), I+5);
	    update((C[1][0])*W[a]*(Iz*Py*Qx), I+6);
	    update((C[1][0])*W[a]*(Iy*Pz*Qx), I+7);
	    update((C[1][0])*W[a]*(Iz*Px*Qy), I+8);
	    update((C[1][0])*W[a]*(Ix*Pz*Qy), I+9);
	    update((C[1][0])*W[a]*(Ix*Py*Qz), I+10);
	    update((C[1][0])*W[a]*(Iy*Px*Qz), I+11);
	    update((C[1][0])*W[a]*(Cz*Py*(Dx*Xij + Qx)), I+12);
	    update((C[1][0])*W[a]*(Cy*Pz*(Dx*Xij + Qx)), I+13);
	    update((C[1][0])*W[a]*(Cz*Qy*(Px + Cx*Xij)), I+14);
	    update((C[1][0])*W[a]*(Cy*Qz*(Px + Cx*Xij)), I+15);
	    update((C[0][0])*W[a]*(Cy*Cz*(Px + Cx*Xij)), I+16);
	    update((C[1][0])*W[a]*(Dz*Py*(Px + Cx*Xij)), I+17);
	    update((C[0][0])*W[a]*(Py*(Px + Cx*Xij)), I+18);
	    update((C[1][0])*W[a]*(Dy*Pz*(Px + Cx*Xij)), I+19);
	    update((C[0][0])*W[a]*(Pz*(Px + Cx*Xij)), I+20);
	    update((C[1][0])*W[a]*(Cy*Qx*(Cz*Zij + Pz)), I+21);
	    update((C[0][0])*W[a]*(Cx*Cy*(Cz*Zij + Pz)), I+22);
	    update((C[1][0])*W[a]*(Cx*Qy*(Cz*Zij + Pz)), I+23);
	    update((C[1][0])*W[a]*(Dx*Py*(Cz*Zij + Pz)), I+24);
	    update((C[0][0])*W[a]*(Py*(Cz*Zij + Pz)), I+25);
	    update((C[1][0])*W[a]*(Dy*Px*(Cz*Zij + Pz)), I+26);
	    update((C[0][0])*W[a]*(Px*(Cz*Zij + Pz)), I+27);
	    // f* temporaries hold sub-expressions shared by the terms that follow.
	    double f0 = (3*B10 + pow(Cx,2));
	    update((C[1][0])*W[a]*(Cx*Dy*Iz*f0), I+28);
	    update((C[0][0])*W[a]*(Cx*Iz*f0), I+29);
	    update((C[1][0])*W[a]*(Cx*Dz*Iy*f0), I+30);
	    update((C[0][0])*W[a]*(Cx*Iy*f0), I+31);
	    double f10 = (3*B00*Pz + Cz*Dz*(3*B10 + pow(Cz,2)));
	    update((C[1][0])*W[a]*(Ix*f10), I+32);
	    update((C[1][0])*W[a]*(Iy*f10), I+33);
	    double f15 = (Dy*Iy + B00);
	    update((C[1][0])*W[a]*(Cx*Pz*f15), I+34);
	    update((C[1][0])*W[a]*(Cx*f0*f15), I+35);
	    update((C[1][0])*W[a]*(Cz*Px*f15), I+36);
	    double f16 = (3*pow(B10,2) + 3*B10*Cz*(2*Cz + Zij) + Iz*pow(Cz,3));
	    update((C[1][0])*W[a]*(Dy*f16), I+37);
	    update((C[1][0])*W[a]*(Dx*f16), I+38);
	    update((C[0][0])*W[a]*(f16), I+39);
	    double f17 = (Cy*Iy + B10);
	    update((C[1][0])*W[a]*(Cx*Qz*f17), I+40);
	    update((C[1][0])*W[a]*(Cz*Qx*f17), I+41);
	    update((C[0][0])*W[a]*(Cx*Cz*f17), I+42);
	    update((C[1][0])*W[a]*(Dx*Pz*f17), I+43);
	    update((C[0][0])*W[a]*(Pz*f17), I+44);
	    update((C[1][0])*W[a]*(Dz*Px*f17), I+45);
	    update((C[0][0])*W[a]*(Px*f17), I+46);
	    double f18 = (3*pow(B10,2) + Iy*pow(Cy,3) + 3*B10*Cy*(Yij + 2*Cy));
	    update((C[1][0])*W[a]*(Dz*f18), I+47);
	    update((C[1][0])*W[a]*(Dx*f18), I+48);
	    update((C[0][0])*W[a]*(f18), I+49);
	    double f19 = (B10*(3*Cx + Xij) + Ix*pow(Cx,2));
	    update((C[1][0])*W[a]*(Qy*f19), I+50);
	    update((C[1][0])*W[a]*(Qz*f19), I+51);
	    update((C[1][0])*W[a]*(Cy*Dz*f19), I+52);
	    update((C[0][0])*W[a]*(Cy*f19), I+53);
	    update((C[1][0])*W[a]*(Cz*Dy*f19), I+54);
	    update((C[0][0])*W[a]*(Cz*f19), I+55);
	    double f2 = (Dz*Pz + 2*B00*Cz);
	    update((C[1][0])*W[a]*(Cx*Cy*(f2 + Qz*Zij)), I+56);
	    update((C[1][0])*W[a]*(Px*(f2 + Qz*Zij)), I+57);
	    update((C[1][0])*W[a]*(Py*(f2 + Qz*Zij)), I+58);
	    update((C[1][0])*W[a]*(Cy*Ix*f2), I+59);
	    update((C[1][0])*W[a]*(Cx*Iy*f2), I+60);
	    update((C[1][0])*W[a]*(f2*(Px + Cx*Xij)), I+61);
	    update((C[1][0])*W[a]*(f17*f2), I+62);
	    double f21 = (Dx*(B10*(3*Cx + Xij) + Ix*pow(Cx,2)) + 2*B00*Cx*Xij + 3*B00*Px);
	    update((C[1][0])*W[a]*(Cy*f21), I+63);
	    update((C[1][0])*W[a]*(Cz*f21), I+64);
	    double f24 = 3*pow(B10,2);
	    update((C[1][0])*W[a]*((Dx*(f24 + 3*B10*Cx*(Xij + 2*Cx) + Ix*pow(Cx,3)) + 3*B00*Px*Xij + 4*B00*Cx*f0)), I+65);
	    update((C[1][0])*W[a]*(Dz*(f24 + 3*B10*Cx*(Xij + 2*Cx) + Ix*pow(Cx,3))), I+66);
	    update((C[1][0])*W[a]*(Dy*(f24 + 3*B10*Cx*(Xij + 2*Cx) + Ix*pow(Cx,3))), I+67);
	    update((C[0][0])*W[a]*((f24 + 3*B10*Cx*(Xij + 2*Cx) + Ix*pow(Cx,3))), I+68);
	    double f28 = (Cx*Dx*(3*B10 + pow(Cx,2)) + 3*B00*Px);
	    update((C[1][0])*W[a]*(Iz*f28), I+69);
	    update((C[1][0])*W[a]*(Iy*f28), I+70);
	    double f3 = (Dz*(Iz*pow(Cz,2) + B10*(3*Cz + Zij)) + 3*B00*Pz + 2*B00*Cz*Zij);
	    update((C[1][0])*W[a]*(Cx*f3), I+71);
	    update((C[1][0])*W[a]*(Cy*f3), I+72);
	    double f32 = (3*B10 + pow(Cz,2));
	    update((C[1][0])*W[a]*((Dz*(f24 + 3*B10*Cz*(2*Cz + Zij) + Iz*pow(Cz,3)) + 3*B00*Pz*Zij + 4*B00*Cz*f32)), I+73);
	    update((C[1][0])*W[a]*(Cz*f32*(Dx*Xij + Qx)), I+74);
	    update((C[1][0])*W[a]*(Cz*f15*f32), I+75);
	    update((C[1][0])*W[a]*(Cz*Dx*Iy*f32), I+76);
	    update((C[0][0])*W[a]*(Cz*Iy*f32), I+77);
	    update((C[1][0])*W[a]*(Cz*Dy*Ix*f32), I+78);
	    update((C[0][0])*W[a]*(Cz*Ix*f32), I+79);
	    double f33 = (3*B10 + pow(Cy,2));
	    update((C[1][0])*W[a]*((Dy*(f24 + Iy*pow(Cy,3) + 3*B10*Cy*(Yij + 2*Cy)) + 3*B00*Py*Yij + 4*B00*Cy*f33)), I+80);
	    update((C[1][0])*W[a]*(Cy*f33*(Dx*Xij + Qx)), I+81);
	    update((C[1][0])*W[a]*(Cy*Dx*Iz*f33), I+82);
	    update((C[0][0])*W[a]*(Cy*Iz*f33), I+83);
	    update((C[1][0])*W[a]*(Cy*Dz*Ix*f33), I+84);
	    update((C[0][0])*W[a]*(Cy*Ix*f33), I+85);
	    double f34 = (Iz*pow(Cz,2) + B10*(3*Cz + Zij));
	    update((C[1][0])*W[a]*(Qy*f34), I+86);
	    update((C[1][0])*W[a]*(Qx*f34), I+87);
	    update((C[1][0])*W[a]*(Cx*Dy*f34), I+88);
	    update((C[0][0])*W[a]*(Cx*f34), I+89);
	    update((C[1][0])*W[a]*(Cy*Dx*f34), I+90);
	    update((C[0][0])*W[a]*(Cy*f34), I+91);
	    double f4 = (B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10));
	    update((C[1][0])*W[a]*(Cx*Cz*f4), I+92);
	    update((C[1][0])*W[a]*(Px*f4), I+93);
	    update((C[1][0])*W[a]*(Pz*f4), I+94);
	    double f5 = (Dx*Px + 2*B00*Cx);
	    update((C[1][0])*W[a]*(Cy*Cz*(Qx*Xij + f5)), I+95);
	    update((C[1][0])*W[a]*(Pz*(Qx*Xij + f5)), I+96);
	    update((C[1][0])*W[a]*(Py*(Qx*Xij + f5)), I+97);
	    update((C[1][0])*W[a]*(Cy*Iz*f5), I+98);
	    update((C[1][0])*W[a]*(Cz*Iy*f5), I+99);
	    update((C[1][0])*W[a]*(f5*(Cz*Zij + Pz)), I+100);
	    update((C[1][0])*W[a]*(f17*f5), I+101);
	    double f6 = (B00 + Dz*Iz);
	    update((C[1][0])*W[a]*(Cy*f33*f6), I+102);
	    update((C[1][0])*W[a]*(Cy*Px*f6), I+103);
	    update((C[1][0])*W[a]*(Cx*f0*f6), I+104);
	    update((C[1][0])*W[a]*(Cx*Py*f6), I+105);
	    double f7 = (Cy*Dy*(3*B10 + pow(Cy,2)) + 3*B00*Py);
	    update((C[1][0])*W[a]*(Cz*(f7 + Yij*(Dy*Py + 2*B00*Cy))), I+106);
	    update((C[1][0])*W[a]*(Cx*(f7 + Yij*(Dy*Py + 2*B00*Cy))), I+107);
	    update((C[1][0])*W[a]*(Ix*f7), I+108);
	    update((C[1][0])*W[a]*(Iz*f7), I+109);
	    double f8 = (Dy*Py + 2*B00*Cy);
	    update((C[1][0])*W[a]*(Cx*Iz*f8), I+110);
	    update((C[1][0])*W[a]*(f8*(Px + Cx*Xij)), I+111);
	    update((C[1][0])*W[a]*(f8*(Cz*Zij + Pz)), I+112);
	    update((C[1][0])*W[a]*(Cz*Ix*f8), I+113);
	    double f9 = (B10*(3*Cy + Yij) + Iy*pow(Cy,2));
	    update((C[1][0])*W[a]*(Qz*f9), I+114);
	    update((C[1][0])*W[a]*(Qx*f9), I+115);
	    update((C[1][0])*W[a]*(Cz*Dx*f9), I+116);
	    update((C[0][0])*W[a]*(Cz*f9), I+117);
	    update((C[1][0])*W[a]*(Cx*Dz*f9), I+118);
	    update((C[0][0])*W[a]*(Cx*f9), I+119);
#undef pow

	}

    }

    /**
     * Permutes the 120 entries of I in place: the value originally stored
     * at position i ends up at position dest[i].
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[120]) {
	// Destination slot for each source position.
	const unsigned short dest[120] = {
	    23, 14, 25, 6, 17, 8, 55, 47, 83, 68, 96, 104,
	    36, 38, 69, 99, 9, 95, 5, 67, 7, 59, 29, 89,
	    56, 26, 84, 24, 80, 20, 100, 10, 92, 102, 77, 70,
	    74, 82, 52, 22, 109, 49, 19, 48, 18, 103, 13, 101,
	    41, 11, 63, 94, 93, 3, 64, 4, 119, 114, 116, 98,
	    107, 97, 108, 33, 34, 30, 90, 60, 0, 50, 40, 117,
	    118, 112, 32, 72, 42, 12, 62, 2, 71, 31, 51, 21,
	    91, 1, 88, 57, 87, 27, 58, 28, 79, 73, 78, 39,
	    37, 35, 53, 44, 54, 43, 111, 113, 110, 115, 76, 75,
	    61, 81, 85, 65, 86, 66, 106, 45, 46, 16, 105, 15
	};
	// Snapshot first so the scatter below never reads an overwritten slot.
	double T[120];
	for (int i = 0; i < 120; ++i) {
	    T[i] = I[i];
	}
	for (int i = 0; i < 120; ++i) {
	    I[dest[i]] = T[i];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[120] = { 68, 85, 79, 53, 55, 18, 3, 20, 5, 16, 31, 49, 77, 46, 1, 119, 117, 4, 44, 42, 29, 83, 39, 0, 27, 2, 25, 89, 91, 22, 65, 81, 74, 63, 64, 97, 12, 96, 13, 95, 70, 48, 76, 101, 99, 115, 116, 7, 43, 41, 69, 82, 38, 98, 100, 6, 24, 87, 90, 21, 67, 108, 78, 50, 54, 111, 113, 19, 9, 14, 35, 80, 75, 93, 36, 107, 106, 34, 94, 92, 28, 109, 37, 8, 26, 110, 112, 88, 86, 23, 66, 84, 32, 52, 51, 17, 10, 61, 59, 15, 30, 47, 33, 45, 11, 118, 114, 60, 62, 40, 104, 102, 73, 103, 57, 105, 58, 71, 72, 56 };
// 	if (index < 120) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the reordered position of generated entry i.
     *
     * @param i  position in generator order, valid range [0, 120)
     * @return   the destination position, or -1 when i is out of range
     *           (previously an out-of-range i read past the table)
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short table[] = {
	    23, 14, 25, 6, 17, 8, 55, 47, 83, 68, 96, 104, 36, 38, 69, 99, 9, 95, 5, 67, 7, 59, 29, 89, 56, 26, 84, 24, 80, 20, 100, 10, 92, 102, 77, 70, 74, 82, 52, 22, 109, 49, 19, 48, 18, 103, 13, 101, 41, 11, 63, 94, 93, 3, 64, 4, 119, 114, 116, 98, 107, 97, 108, 33, 34, 30, 90, 60, 0, 50, 40, 117, 118, 112, 32, 72, 42, 12, 62, 2, 71, 31, 51, 21, 91, 1, 88, 57, 87, 27, 58, 28, 79, 73, 78, 39, 37, 35, 53, 44, 54, 43, 111, 113, 110, 115, 76, 75, 61, 81, 85, 65, 86, 66, 106, 45, 46, 16, 105, 15
	};
	const int count = int(sizeof(table)/sizeof(table[0]));
	// Guard mirrors the range check of the commented-out reorder(size_t).
	if (i < 0 || i >= count) return -1;
	return table[i];
    }

    /**
     * Writes the complete 120-entry reordering table to idx[0..119].
     * Entry k receives the destination position of generated entry k.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int target[120] = {
	    23, 14, 25, 6, 17, 8, 55, 47, 83, 68, 96, 104,
	    36, 38, 69, 99, 9, 95, 5, 67, 7, 59, 29, 89,
	    56, 26, 84, 24, 80, 20, 100, 10, 92, 102, 77, 70,
	    74, 82, 52, 22, 109, 49, 19, 48, 18, 103, 13, 101,
	    41, 11, 63, 94, 93, 3, 64, 4, 119, 114, 116, 98,
	    107, 97, 108, 33, 34, 30, 90, 60, 0, 50, 40, 117,
	    118, 112, 32, 72, 42, 12, 62, 2, 71, 31, 51, 21,
	    91, 1, 88, 57, 87, 27, 58, 28, 79, 73, 78, 39,
	    37, 35, 53, 44, 54, 43, 111, 113, 110, 115, 76, 75,
	    61, 81, 85, 65, 86, 66, 106, 45, 46, 16, 105, 15
	};
	for (int k = 0; k < 120; ++k) {
	    idx[k] = target[k];
	}
    }


};

template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::SP, rysq::SP> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 2;

    /** Default combiner used by eval: adds a term onto its output slot. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] = Q[0] + q;
	}
    };

    /**
     * Fixed-size entry point: runs the kernel with M = 2 iterations and
     * the default accumulating `update` functor, writing into the
     * 16-element array I.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL, class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[4][1],
	      double (&I)[16]) {
	double *const out = I;	// the kernel overload takes a plain pointer
	const update accumulate = update();
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, out, accumulate);
    }

    /**
     * Convenience overload without an explicit update functor.
     * Forwards every argument unchanged to the functor-taking overload and
     * supplies a default-constructed `update`, which adds each term onto
     * the slot of I it is given.
     */
    template<int M, class RAI, class RBK, class RAB,
	     class RIJ, class RKL, class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[4][1],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Kernel of this specialization: emits 16 weighted terms per loop
     * iteration into I[0..15].
     *
     * For each a in [0, M) the code derives B01 and the D* recurrence
     * coefficients from t2[a], forms K* = D* + rkl[*], and passes every
     * product to `update(value, I + k)`. Each term is scaled by W[a] and
     * by one of C[0][0], C[1][0], C[2][0], C[3][0].
     *
     * rAi and rij are accepted for signature uniformity but are not read
     * here.
     *
     * @tparam M  number of iterations, i.e. how many entries of t2 and W
     *            are read
     * @tparam U  functor type callable as update(double, double*)
     * @param I   output buffer; slots I+0 .. I+15 are handed to `update`
     * @param update  decides how a term is combined with its slot
     *
     * NOTE(review): t2 and W are presumably the quadrature roots and
     * weights -- confirm against the caller.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[4][1],
	      double *I, const U &update) {


	// Component aliases of rkl; they do not change across iterations.
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Evaluates 0.5*(1.0/B)*(1.0 - A*t2[a]).
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);


	    // Per-axis coefficients: rBk[q] + A*rAB[q]*t2[a] (B passed as -A).
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Generator shorthand for recurrence::pow; the expressions in this
	    // specialization do not use it. Removed again by the #undef below.
#define pow(x,y) recurrence::pow<y>((x))

 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);

	    // Terms are emitted in generator order; reorder()/index() describe
	    // the permutation applied to these slots afterwards.
	    update((C[0][0])*W[a]*(1), I+0);
	    update((C[1][0])*W[a]*(Dx), I+1);
	    update((C[1][0])*W[a]*(Dy), I+2);
	    update((C[1][0])*W[a]*(Dz), I+3);
	    update((C[3][0])*W[a]*((B01 + Dx*Kx)), I+4);
	    update((C[3][0])*W[a]*(Dy*Kx), I+5);
	    update((C[3][0])*W[a]*(Dz*Kx), I+6);
	    update((C[2][0])*W[a]*(Kx), I+7);
	    update((C[3][0])*W[a]*((B01 + Dy*Ky)), I+8);
	    update((C[3][0])*W[a]*(Dz*Ky), I+9);
	    update((C[3][0])*W[a]*(Dx*Ky), I+10);
	    update((C[2][0])*W[a]*(Ky), I+11);
	    update((C[3][0])*W[a]*((Dz*Kz + B01)), I+12);
	    update((C[3][0])*W[a]*(Dx*Kz), I+13);
	    update((C[3][0])*W[a]*(Dy*Kz), I+14);
	    update((C[2][0])*W[a]*(Kz), I+15);
#undef pow

	}

    }

    /**
     * Permutes the 16 entries of I in place: the value originally stored
     * at position i ends up at position dest[i].
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[16]) {
	// Destination slot for each source position.
	const unsigned short dest[16] = {
	    0, 1, 2, 3, 5, 6, 7, 4,
	    10, 11, 9, 8, 15, 13, 14, 12
	};
	// Snapshot first so the scatter below never reads an overwritten slot.
	double T[16];
	for (int i = 0; i < 16; ++i) {
	    T[i] = I[i];
	}
	for (int i = 0; i < 16; ++i) {
	    I[dest[i]] = T[i];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[16] = { 0, 1, 2, 3, 7, 4, 5, 6, 11, 10, 8, 9, 15, 13, 14, 12 };
// 	if (index < 16) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the reordered position of generated entry i.
     *
     * @param i  position in generator order, valid range [0, 16)
     * @return   the destination position, or -1 when i is out of range
     *           (previously an out-of-range i read past the table)
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short table[] = {
	    0, 1, 2, 3, 5, 6, 7, 4, 10, 11, 9, 8, 15, 13, 14, 12
	};
	const int count = int(sizeof(table)/sizeof(table[0]));
	// Guard mirrors the range check of the commented-out reorder(size_t).
	if (i < 0 || i >= count) return -1;
	return table[i];
    }

    /**
     * Writes the complete 16-entry reordering table to idx[0..15].
     * Entry k receives the destination position of generated entry k.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int target[16] = {
	    0, 1, 2, 3, 5, 6, 7, 4,
	    10, 11, 9, 8, 15, 13, 14, 12
	};
	for (int k = 0; k < 16; ++k) {
	    idx[k] = target[k];
	}
    }


};

template<>
struct impl<meta::braket<rysq::D, rysq::S, rysq::S, rysq::P> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 2;

    /** Default combiner used by eval: adds a term onto its output slot. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] = Q[0] + q;
	}
    };

    /**
     * Fixed-size entry point: runs the kernel with M = 2 iterations and
     * the default accumulating `update` functor, writing into the
     * 18-element array I.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL, class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[18]) {
	double *const out = I;	// the kernel overload takes a plain pointer
	const update accumulate = update();
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, out, accumulate);
    }

    /**
     * Convenience overload without an explicit update functor.
     * Forwards every argument unchanged to the functor-taking overload and
     * supplies a default-constructed `update`, which adds each term onto
     * the slot of I it is given.
     */
    template<int M, class RAI, class RBK, class RAB,
	     class RIJ, class RKL, class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Kernel of this specialization: emits 18 weighted terms per loop
     * iteration into I[0..17].
     *
     * For each a in [0, M) the code derives B00, B10 and the C*/D*
     * recurrence coefficients from t2[a], forms K* = D* + rkl[*] and
     * P* = B10 + C*^2, and passes every product to
     * `update(value, I + k)`. Every term is scaled by C[0][0]*W[a].
     *
     * rij is accepted for signature uniformity but is not read here.
     *
     * @tparam M  number of iterations, i.e. how many entries of t2 and W
     *            are read
     * @tparam U  functor type callable as update(double, double*)
     * @param I   output buffer; slots I+0 .. I+17 are handed to `update`
     * @param update  decides how a term is combined with its slot
     *
     * NOTE(review): t2 and W are presumably the quadrature roots and
     * weights -- confirm against the caller.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {


	// Component aliases of rkl; they do not change across iterations.
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Scalar recurrence coefficients for this iteration.
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    // Per-axis coefficients: rAi[q] - B*rAB[q]*t2[a].
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Per-axis coefficients: rBk[q] + A*rAB[q]*t2[a] (B passed as -A).
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Shorthand so the generated expressions can use an integer-power
	    // helper resolved at compile time; removed again by the #undef below.
#define pow(x,y) recurrence::pow<y>((x))

 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));

	    // Terms are emitted in generator order; reorder()/index() describe
	    // the permutation applied to these slots afterwards.
	    update((C[0][0])*W[a]*(Cy*Cz*Kx), I+0);
	    update((C[0][0])*W[a]*(Cx*Cz*Ky), I+1);
	    update((C[0][0])*W[a]*(Cx*Cy*Kz), I+2);
	    update((C[0][0])*W[a]*((Kx*Px + 2*B00*Cx)), I+3);
	    update((C[0][0])*W[a]*(Ky*Px), I+4);
	    update((C[0][0])*W[a]*(Kz*Px), I+5);
	    update((C[0][0])*W[a]*((Ky*Py + 2*B00*Cy)), I+6);
	    update((C[0][0])*W[a]*(Kx*Py), I+7);
	    update((C[0][0])*W[a]*(Kz*Py), I+8);
	    update((C[0][0])*W[a]*((Kz*Pz + 2*B00*Cz)), I+9);
	    update((C[0][0])*W[a]*(Kx*Pz), I+10);
	    update((C[0][0])*W[a]*(Ky*Pz), I+11);
	    // f* temporaries hold sub-expressions shared by the terms that follow.
	    double f4 = (B00 + Cx*Kx);
	    update((C[0][0])*W[a]*(Cy*f4), I+12);
	    update((C[0][0])*W[a]*(Cz*f4), I+13);
	    double f6 = (B00 + Cy*Ky);
	    update((C[0][0])*W[a]*(Cz*f6), I+14);
	    update((C[0][0])*W[a]*(Cx*f6), I+15);
	    double f8 = (B00 + Cz*Kz);
	    update((C[0][0])*W[a]*(Cx*f8), I+16);
	    update((C[0][0])*W[a]*(Cy*f8), I+17);
#undef pow

	}

    }

    /**
     * Permutes the 18 entries of I in place: the value originally stored
     * at position i ends up at position dest[i].
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[18]) {
	// Destination slot for each source position.
	const unsigned short dest[18] = {
	    5, 10, 15, 0, 6, 12, 7, 1, 13,
	    14, 2, 8, 3, 4, 11, 9, 16, 17
	};
	// Snapshot first so the scatter below never reads an overwritten slot.
	double T[18];
	for (int i = 0; i < 18; ++i) {
	    T[i] = I[i];
	}
	for (int i = 0; i < 18; ++i) {
	    I[dest[i]] = T[i];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[18] = { 3, 7, 10, 12, 13, 0, 4, 6, 11, 15, 1, 14, 5, 8, 9, 2, 16, 17 };
// 	if (index < 18) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the reordered position of generated entry i.
     *
     * @param i  position in generator order, valid range [0, 18)
     * @return   the destination position, or -1 when i is out of range
     *           (previously an out-of-range i read past the table)
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short table[] = {
	    5, 10, 15, 0, 6, 12, 7, 1, 13, 14, 2, 8, 3, 4, 11, 9, 16, 17
	};
	const int count = int(sizeof(table)/sizeof(table[0]));
	// Guard mirrors the range check of the commented-out reorder(size_t).
	if (i < 0 || i >= count) return -1;
	return table[i];
    }

    /**
     * Writes the complete 18-entry reordering table to idx[0..17].
     * Entry k receives the destination position of generated entry k.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int target[18] = {
	    5, 10, 15, 0, 6, 12, 7, 1, 13,
	    14, 2, 8, 3, 4, 11, 9, 16, 17
	};
	for (int k = 0; k < 18; ++k) {
	    idx[k] = target[k];
	}
    }


};

template<>
struct impl<meta::braket<rysq::D, rysq::S, rysq::SP, rysq::SP> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /** Default combiner used by eval: adds a term onto its output slot. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] = Q[0] + q;
	}
    };

    /**
     * Fixed-size entry point: runs the kernel with M = 3 iterations and
     * the default accumulating `update` functor, writing into the
     * 96-element array I.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL, class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[4][1],
	      double (&I)[96]) {
	double *const out = I;	// the kernel overload takes a plain pointer
	const update accumulate = update();
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, out, accumulate);
    }

    /**
     * Convenience overload without an explicit update functor.
     * Forwards every argument unchanged to the functor-taking overload and
     * supplies a default-constructed `update`, which adds each term onto
     * the slot of I it is given.
     */
    template<int M, class RAI, class RBK, class RAB,
	     class RIJ, class RKL, class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[4][1],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Kernel proper: for every a in [0, M) derives the recurrence
     * coefficients from t2[a] and hands 96 weighted products to
     * update(value, I + k), k = 0..95.
     *
     * @tparam M  number of loop iterations; t2 and W are read at 0..M-1
     *            (presumably one quadrature root/weight pair per index).
     * @tparam U  callable invoked as update(double, double*).
     * @param A       scalar entering B10, B01 and (negated) the D coefficients.
     * @param B       scalar entering B10, B01 and the C coefficients.
     * @param rAi     indexable at 0..2; feeds Cx, Cy, Cz.
     * @param rBk     indexable at 0..2; feeds Dx, Dy, Dz.
     * @param rAB     indexable at 0..2; feeds both C and D coefficients.
     * @param rij     not read by this specialization.
     * @param rkl     indexable at 0..2; read as Xkl, Ykl, Zkl.
     * @param t2      per-iteration value from which all coefficients derive.
     * @param W       per-iteration factor applied to every emitted value.
     * @param C       4x1 prefactor table; each term uses one of C[0..3][0].
     * @param I       base of at least 96 doubles; slot k receives term k.
     * @param update  sink called once per term with (value, I + k).
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[4][1],
	      double *I, const U &update) {


	// Components of rkl; combined with the D coefficients below to form Kx/Ky/Kz.
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


	// Compiler-specific hints for the loop that follows (Intel and CUDA builds only).
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Scalar recurrence coefficients for this iteration.
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // Per-axis coefficients built from rAi (with B) ...
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // ... and from rBk (with -A).
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Route pow(x,n) to the compile-time recurrence::pow<n>(x);
	    // the macro is removed again by the #undef at the end of the loop body.
#define pow(x,y) recurrence::pow<y>((x))

	    // Shared per-axis building blocks:
	    //   K = D + rkl component, P = B10 + C^2, Q = C*D + B00.
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    // Each update() call passes one term, scaled by a C[n][0] entry and
	    // W[a], together with its destination slot I+k.
	    update((C[0][0])*W[a]*(Cx*Cy), I+0);
	    update((C[0][0])*W[a]*(Cy*Cz), I+1);
	    update((C[0][0])*W[a]*(Cx*Cz), I+2);
	    update((C[1][0])*W[a]*(Cy*Cz*Dx), I+3);
	    update((C[1][0])*W[a]*(Cx*Cz*Dy), I+4);
	    update((C[1][0])*W[a]*(Cx*Cy*Dz), I+5);
	    update((C[2][0])*W[a]*(Cy*Cz*Kx), I+6);
	    update((C[2][0])*W[a]*(Cx*Cz*Ky), I+7);
	    update((C[2][0])*W[a]*(Cx*Cy*Kz), I+8);
	    update((C[2][0])*W[a]*(Ky*Px), I+9);
	    update((C[3][0])*W[a]*(Dz*Ky*Px), I+10);
	    update((C[1][0])*W[a]*(Dz*Px), I+11);
	    update((C[2][0])*W[a]*(Kz*Px), I+12);
	    update((C[3][0])*W[a]*(Dy*Kz*Px), I+13);
	    update((C[1][0])*W[a]*(Dy*Px), I+14);
	    update((C[0][0])*W[a]*(Px), I+15);
	    update((C[3][0])*W[a]*(Dx*Kz*Py), I+16);
	    update((C[2][0])*W[a]*(Kz*Py), I+17);
	    update((C[2][0])*W[a]*(Kx*Py), I+18);
	    update((C[3][0])*W[a]*(Dz*Kx*Py), I+19);
	    update((C[1][0])*W[a]*(Dz*Py), I+20);
	    update((C[0][0])*W[a]*(Py), I+21);
	    update((C[1][0])*W[a]*(Dx*Py), I+22);
	    update((C[1][0])*W[a]*(Dy*Pz), I+23);
	    update((C[3][0])*W[a]*(Dy*Kx*Pz), I+24);
	    update((C[2][0])*W[a]*(Kx*Pz), I+25);
	    update((C[3][0])*W[a]*(Dx*Ky*Pz), I+26);
	    update((C[1][0])*W[a]*(Dx*Pz), I+27);
	    update((C[0][0])*W[a]*(Pz), I+28);
	    update((C[2][0])*W[a]*(Ky*Pz), I+29);
	    update((C[3][0])*W[a]*(Cy*Kz*Qx), I+30);
	    update((C[1][0])*W[a]*(Cy*Qx), I+31);
	    update((C[1][0])*W[a]*(Cz*Qx), I+32);
	    update((C[3][0])*W[a]*(Cz*Ky*Qx), I+33);
	    update((C[3][0])*W[a]*(Cz*Kx*Qy), I+34);
	    update((C[1][0])*W[a]*(Cz*Qy), I+35);
	    update((C[1][0])*W[a]*(Cx*Qy), I+36);
	    update((C[3][0])*W[a]*(Cx*Kz*Qy), I+37);
	    update((C[3][0])*W[a]*(Cy*Kx*Qz), I+38);
	    update((C[3][0])*W[a]*(Cx*Ky*Qz), I+39);
	    update((C[1][0])*W[a]*(Cx*Qz), I+40);
	    update((C[1][0])*W[a]*(Cy*Qz), I+41);
	    update((C[3][0])*W[a]*(Cy*Dx*(Cz*Zkl + Qz)), I+42);
	    update((C[3][0])*W[a]*(Cx*Dy*(Cz*Zkl + Qz)), I+43);
	    update((C[2][0])*W[a]*(Cx*(Cz*Zkl + Qz)), I+44);
	    update((C[3][0])*W[a]*(Qx*(Cz*Zkl + Qz)), I+45);
	    update((C[3][0])*W[a]*(Qy*(Cz*Zkl + Qz)), I+46);
	    update((C[2][0])*W[a]*(Cy*(Cz*Zkl + Qz)), I+47);
	    // From here on, terms share the intermediates f*, each declared
	    // immediately before its first use.
	    double f11 = (B01 + Dx*Kx);
	    update((C[3][0])*W[a]*(Pz*f11), I+48);
	    update((C[3][0])*W[a]*(Py*f11), I+49);
	    update((C[3][0])*W[a]*(Cy*Cz*f11), I+50);
	    double f2 = (Cy*Dy*Ky + B01*Cy + B00*(Ykl + 2*Dy));
	    update((C[3][0])*W[a]*(Cx*f2), I+51);
	    update((C[3][0])*W[a]*(Cz*f2), I+52);
	    double f20 = (Dz*Kz + B01);
	    update((C[3][0])*W[a]*(Px*f20), I+53);
	    update((C[3][0])*W[a]*(Py*f20), I+54);
	    update((C[3][0])*W[a]*(Cx*Cy*f20), I+55);
	    double f10 = 2*B00*Cx;
	    double f21 = Dx*pow(Cx,2);
	    update((C[3][0])*W[a]*(Dz*(f10 + f21 + B10*Kx + Xkl*pow(Cx,2))), I+56);
	    update((C[3][0])*W[a]*(Dy*(f10 + f21 + B10*Kx + Xkl*pow(Cx,2))), I+57);
	    update((C[2][0])*W[a]*((f10 + f21 + B10*Kx + Xkl*pow(Cx,2))), I+58);
	    double f22 = (B00 + Cx*Kx);
	    update((C[3][0])*W[a]*(Qy*f22), I+59);
	    update((C[3][0])*W[a]*(Qz*f22), I+60);
	    update((C[3][0])*W[a]*(Cz*Dy*f22), I+61);
	    update((C[2][0])*W[a]*(Cz*f22), I+62);
	    update((C[2][0])*W[a]*(Cy*f22), I+63);
	    update((C[3][0])*W[a]*(Cy*Dz*f22), I+64);
	    double f23 = (B00 + Cy*Ky);
	    update((C[3][0])*W[a]*(Cx*Dz*f23), I+65);
	    update((C[3][0])*W[a]*(Cz*Dx*f23), I+66);
	    update((C[2][0])*W[a]*(Cz*f23), I+67);
	    update((C[3][0])*W[a]*(Qx*f23), I+68);
	    update((C[2][0])*W[a]*(Cx*f23), I+69);
	    update((C[3][0])*W[a]*(Qz*f23), I+70);
	    double f29 = (B01 + Dy*Ky);
	    update((C[3][0])*W[a]*(Cx*Cz*f29), I+71);
	    update((C[3][0])*W[a]*(Px*f29), I+72);
	    update((C[3][0])*W[a]*(Pz*f29), I+73);
	    double f27 = 2*pow(B00,2);
	    double f31 = B10*Dx;
	    update((C[3][0])*W[a]*((f27 + Xkl*(f10 + f21 + f31) + B01*Px + Dx*(2*f10 + Dx*Px))), I+74);
	    update((C[3][0])*W[a]*(Kz*(f10 + f21 + f31)), I+75);
	    update((C[3][0])*W[a]*(Ky*(f10 + f21 + f31)), I+76);
	    update((C[1][0])*W[a]*((f10 + f21 + f31)), I+77);
	    double f35 = (B01*Cz + B00*(2*Dz + Zkl) + Cz*Dz*Kz);
	    update((C[3][0])*W[a]*(Cx*f35), I+78);
	    update((C[3][0])*W[a]*(Cy*f35), I+79);
	    double f4 = (B01*Cx + Cx*Dx*Kx + B00*(Xkl + 2*Dx));
	    update((C[3][0])*W[a]*(Cy*f4), I+80);
	    update((C[3][0])*W[a]*(Cz*f4), I+81);
	    double f33 = Dy*pow(Cy,2);
	    double f5 = 2*B00*Cy;
	    update((C[3][0])*W[a]*(Dx*(f33 + B10*Ky + Ykl*pow(Cy,2) + f5)), I+82);
	    update((C[3][0])*W[a]*(Dz*(f33 + B10*Ky + Ykl*pow(Cy,2) + f5)), I+83);
	    update((C[2][0])*W[a]*((f33 + B10*Ky + Ykl*pow(Cy,2) + f5)), I+84);
	    double f7 = B10*Dy;
	    update((C[3][0])*W[a]*((f27 + B01*Py + Dy*(2*f5 + Dy*Py) + Ykl*(f33 + f5 + f7))), I+85);
	    update((C[3][0])*W[a]*(Kx*(f33 + f5 + f7)), I+86);
	    update((C[3][0])*W[a]*(Kz*(f33 + f5 + f7)), I+87);
	    update((C[1][0])*W[a]*((f33 + f5 + f7)), I+88);
	    double f25 = B10*Dz;
	    double f19 = 2*B00*Cz;
	    double f8 = Dz*pow(Cz,2);
	    update((C[3][0])*W[a]*((f27 + Dz*(Dz*Pz + 2*f19) + B01*Pz + Zkl*(f19 + f25 + f8))), I+89);
	    update((C[3][0])*W[a]*(Dy*(Zkl*pow(Cz,2) + f19 + B10*Kz + f8)), I+90);
	    update((C[3][0])*W[a]*(Dx*(Zkl*pow(Cz,2) + f19 + B10*Kz + f8)), I+91);
	    update((C[2][0])*W[a]*((Zkl*pow(Cz,2) + f19 + B10*Kz + f8)), I+92);
	    update((C[3][0])*W[a]*(Ky*(f19 + f25 + f8)), I+93);
	    update((C[3][0])*W[a]*(Kx*(f19 + f25 + f8)), I+94);
	    update((C[1][0])*W[a]*((f19 + f25 + f8)), I+95);
#undef pow

	}

    }

    /**
     * Permutes the 96 values of I in place: the value found in slot k on
     * entry is stored in slot dest[k] on return.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[96]) {
	// dest[k]: slot that receives the value originally held in I[k].
	const unsigned short dest[96] = {
	     3,  5,  4, 11, 16, 21, 29, 52, 75, 48, 66, 18,
	    72, 84, 12,  0, 79, 73, 25, 43, 19,  1,  7, 14,
	    38, 26, 56,  8,  2, 50, 81,  9, 10, 58, 41, 17,
	    15, 87, 47, 70, 22, 23, 83, 88, 76, 82, 89, 77,
	    32, 31, 35, 63, 65, 90, 91, 93, 42, 36, 24, 39,
	    46, 40, 28, 27, 45, 69, 59, 53, 57, 51, 71, 64,
	    60, 62, 30, 78, 54,  6, 94, 95, 33, 34, 55, 67,
	    49, 61, 37, 85, 13, 92, 86, 80, 74, 68, 44, 20
	};

	// Snapshot first so the scatter below never reads an overwritten slot.
	double snapshot[96];
	for (int k = 0; k < 96; ++k) {
	    snapshot[k] = I[k];
	}
	for (int k = 0; k < 96; ++k) {
	    I[dest[k]] = snapshot[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[96] = { 15, 21, 28, 0, 2, 1, 77, 22, 27, 31, 32, 3, 14, 88, 23, 36, 4, 35, 11, 20, 95, 5, 40, 41, 58, 18, 25, 63, 62, 6, 74, 49, 48, 80, 81, 50, 57, 86, 24, 59, 61, 34, 56, 19, 94, 64, 60, 38, 9, 84, 29, 69, 7, 67, 76, 82, 26, 68, 33, 66, 72, 85, 73, 51, 71, 52, 10, 83, 93, 65, 39, 70, 12, 17, 92, 8, 44, 47, 75, 16, 91, 30, 45, 42, 13, 87, 90, 37, 43, 46, 53, 54, 89, 55, 78, 79 };
// 	if (index < 96) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the entry of the 96-element position table at i.
     * No range check is performed; i must lie in [0, 96).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short position[96] = {
	     3,  5,  4, 11, 16, 21, 29, 52, 75, 48, 66, 18,
	    72, 84, 12,  0, 79, 73, 25, 43, 19,  1,  7, 14,
	    38, 26, 56,  8,  2, 50, 81,  9, 10, 58, 41, 17,
	    15, 87, 47, 70, 22, 23, 83, 88, 76, 82, 89, 77,
	    32, 31, 35, 63, 65, 90, 91, 93, 42, 36, 24, 39,
	    46, 40, 28, 27, 45, 69, 59, 53, 57, 51, 71, 64,
	    60, 62, 30, 78, 54,  6, 94, 95, 33, 34, 55, 67,
	    49, 61, 37, 85, 13, 92, 86, 80, 74, 68, 44, 20
	};
	return position[i];
    }

    /**
     * Writes the 96-element position table to idx[0..95].
     * idx must point to at least 96 assignable elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int position[96] = {
	     3,  5,  4, 11, 16, 21, 29, 52, 75, 48, 66, 18,
	    72, 84, 12,  0, 79, 73, 25, 43, 19,  1,  7, 14,
	    38, 26, 56,  8,  2, 50, 81,  9, 10, 58, 41, 17,
	    15, 87, 47, 70, 22, 23, 83, 88, 76, 82, 89, 77,
	    32, 31, 35, 63, 65, 90, 91, 93, 42, 36, 24, 39,
	    46, 40, 28, 27, 45, 69, 59, 53, 57, 51, 71, 64,
	    60, 62, 30, 78, 54,  6, 94, 95, 33, 34, 55, 67,
	    49, 61, 37, 85, 13, 92, 86, 80, 74, 68, 44, 20
	};
	for (int k = 0; k < 96; ++k) {
	    idx[k] = position[k];
	}
    }


};

/**
 * Generated kernel for the meta::braket<D, P, P, S> specialization.
 *
 * eval() emits 54 values in the order the expressions were generated;
 * reorder() and both index() overloads apply or expose the permutation
 * from that order to the final layout.
 */
template<>
struct impl<meta::braket<rysq::D, rysq::P, rysq::P, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    /// Iteration count used by the fixed-size eval() overload.
    static const int N = 3;

    /** Default sink: adds a contribution to the slot it is handed. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /** Runs the kernel for N iterations into a 54-element buffer, accumulating. */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[54]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Runs the kernel for M iterations into I, accumulating with the default sink. */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel proper: for each r in [0, M) passes 54 products, each scaled
     * by C[0][0]*W[r], to update(value, I + k) for k = 0..53.
     * rkl is not read by this specialization.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int r = 0; r < M; ++r) {
	    const double t = t2[r];
	    // Prefactor common to every term of this iteration.
	    const double w = (C[0][0])*W[r];

	    const double B00 = 0.5*t;
	    const double B10 = recurrence::coefficient(1.0/A, B, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    const double Cx2 = recurrence::pow<2>(Cx);
	    const double Cy2 = recurrence::pow<2>(Cy);
	    const double Cz2 = recurrence::pow<2>(Cz);

	    // Per-axis building blocks: I = C + rij, P = B10 + C^2, Q = C*D + B00.
	    const double Ix = (Cx + Xij);
	    const double Iy = (Cy + Yij);
	    const double Iz = (Cz + Zij);
	    const double Px = (B10 + Cx2);
	    const double Py = (B10 + Cy2);
	    const double Pz = (B10 + Cz2);
	    const double Qx = (Cx*Dx + B00);
	    const double Qy = (Cy*Dy + B00);
	    const double Qz = (Cz*Dz + B00);

	    update(w*(Dy*Iz*Px), I+0);
	    update(w*(Dz*Iy*Px), I+1);
	    update(w*(Dx*Iz*Py), I+2);
	    update(w*(Dz*Ix*Py), I+3);
	    update(w*(Dy*Ix*Pz), I+4);
	    update(w*(Dx*Iy*Pz), I+5);
	    update(w*(Cy*Iz*Qx), I+6);
	    update(w*(Cz*Iy*Qx), I+7);
	    update(w*(Cz*Ix*Qy), I+8);
	    update(w*(Cx*Iz*Qy), I+9);
	    update(w*(Cy*Ix*Qz), I+10);
	    update(w*(Cx*Iy*Qz), I+11);

	    const double PxX = (Px + Cx*Xij);
	    update(w*(Cy*Dz*PxX), I+12);
	    update(w*(Cz*Dy*PxX), I+13);
	    update(w*(Qy*PxX), I+14);
	    update(w*(Qz*PxX), I+15);

	    const double QzZ = (Dz*Zij + Qz);
	    const double PzZ = (Cz*Zij + Pz);
	    update(w*(Cx*Cy*QzZ), I+16);
	    update(w*(Cy*Dx*PzZ), I+17);
	    update(w*(Qy*PzZ), I+18);
	    update(w*(Qx*PzZ), I+19);
	    update(w*(Cx*Dy*PzZ), I+20);
	    update(w*(Py*QzZ), I+21);
	    update(w*(Px*QzZ), I+22);

	    const double f1 = (B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10));
	    update(w*(Cx*f1), I+23);
	    update(w*(Cz*f1), I+24);

	    const double f10 = (Dy*Iy + B00);
	    update(w*(Cx*Cz*f10), I+25);
	    update(w*(Px*f10), I+26);
	    update(w*(Pz*f10), I+27);

	    const double f11 = (Cy*Iy + B10);
	    update(w*(Cz*Dx*f11), I+28);
	    update(w*(Cx*Dz*f11), I+29);
	    update(w*(Qx*f11), I+30);
	    update(w*(Qz*f11), I+31);

	    const double f12 = (B10*(3*Cx + Xij) + Ix*Cx2);
	    update(w*(Dy*f12), I+32);
	    update(w*(Dz*f12), I+33);

	    const double f14 = (Dx*Px + 2*B00*Cx);
	    update(w*(Iz*f14), I+34);
	    update(w*(Iy*f14), I+35);

	    const double f15 = (Dx*Ix + B00);
	    update(w*(Cy*Cz*f15), I+36);
	    update(w*(Py*f15), I+37);
	    update(w*(Pz*f15), I+38);

	    const double f22 = (Iz*Cz2 + B10*(3*Cz + Zij));
	    update(w*(Dx*f22), I+39);
	    update(w*(Dy*f22), I+40);

	    const double f3 = 3*B00*B10;
	    update(w*(Dy*Iy*Cy2 + f3 + B00*Cy*(3*Cy + 2*Yij) + B10*Dy*(3*Cy + Yij)), I+41);
	    update(w*(Dx*Ix*Cx2 + B00*Cx*(3*Cx + 2*Xij) + B10*Dx*(3*Cx + Xij) + f3), I+42);
	    update(w*(B00*Cz*(3*Cz + 2*Zij) + Dz*Iz*Cz2 + B10*Dz*(3*Cz + Zij) + f3), I+43);

	    const double f4 = (Dy*Py + 2*B00*Cy);
	    update(w*(Ix*f4), I+44);
	    update(w*(Iz*f4), I+45);

	    const double f5 = (B10*(3*Cy + Yij) + Iy*Cy2);
	    update(w*(Dx*f5), I+46);
	    update(w*(Dz*f5), I+47);

	    const double f6 = (Dz*Pz + 2*B00*Cz);
	    const double f6Z = (f6 + Qz*Zij);
	    update(w*(Cx*f6Z), I+48);
	    update(w*(Cy*f6Z), I+49);
	    update(w*(Ix*f6), I+50);
	    update(w*(Iy*f6), I+51);

	    const double f8 = (Dx*(Cx*Ix + B10) + B00*(Xij + 2*Cx));
	    update(w*(Cy*f8), I+52);
	    update(w*(Cz*f8), I+53);
	}
    }

    /**
     * Permutes I in place from kernel order to final order: the value in
     * slot k on entry is stored in slot index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[54]) {
	double kernel_order[54];
	for (int k = 0; k < 54; ++k) {
	    kernel_order[k] = I[k];
	}
	for (int k = 0; k < 54; ++k) {
	    I[index(k)] = kernel_order[k];
	}
    }

    // Inverse mapping (final slot -> kernel slot), as emitted by the generator:
    //   { 42, 37, 38, 52, 53, 36, 35, 46, 5, 30, 7, 28, 34, 2, 39, 6, 19, 17,
    //     32, 44, 4, 14, 13, 8, 26, 41, 27, 23, 25, 24, 0, 45, 40, 9, 20, 18,
    //     33, 3, 50, 12, 15, 10, 1, 47, 51, 29, 11, 31, 22, 21, 43, 16, 48, 49 }

    /** Final slot of kernel term i; i must lie in [0, 54) (not checked). */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short final_slot[54] = {
	    30, 42, 13, 37, 20,  8, 15, 10, 23,
	    33, 41, 46, 39, 22, 21, 40, 51, 17,
	    35, 16, 34, 49, 48, 27, 29, 28, 24,
	    26, 11, 45,  9, 47, 18, 36, 12,  6,
	     5,  1,  2, 14, 32, 25,  0, 50, 19,
	    31,  7, 43, 52, 53, 38, 44,  3,  4
	};
	return final_slot[i];
    }

    /** Writes index(0) .. index(53) to idx[0..53]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 54; ++k) {
	    idx[k] = index(k);
	}
    }

};

/**
 * Generated kernel for the meta::braket<S, P, D, S> specialization.
 *
 * eval() emits 18 values in the order the expressions were generated;
 * reorder() and both index() overloads apply or expose the permutation
 * from that order to the final layout.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::P, rysq::D, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    /// Iteration count used by the fixed-size eval() overload.
    static const int N = 2;

    /** Default sink: adds a contribution to the slot it is handed. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /** Runs the kernel for N iterations into an 18-element buffer, accumulating. */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[18]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Runs the kernel for M iterations into I, accumulating with the default sink. */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel proper: for each r in [0, M) passes 18 products, each scaled
     * by C[0][0]*W[r], to update(value, I + k) for k = 0..17.
     * rkl is not read by this specialization.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int r = 0; r < M; ++r) {
	    const double t = t2[r];
	    // Prefactor common to every term of this iteration.
	    const double w = (C[0][0])*W[r];

	    const double B00 = 0.5*t;
	    const double B01 = recurrence::coefficient(1.0/B, A, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    // Per-axis building blocks: I = C + rij, R = B01 + D^2.
	    const double Ix = (Cx + Xij);
	    const double Iy = (Cy + Yij);
	    const double Iz = (Cz + Zij);
	    const double Rx = (B01 + recurrence::pow<2>(Dx));
	    const double Ry = (B01 + recurrence::pow<2>(Dy));
	    const double Rz = (recurrence::pow<2>(Dz) + B01);

	    update(w*(Dy*Dz*Ix), I+0);
	    update(w*(Dx*Dz*Iy), I+1);
	    update(w*(Dx*Dy*Iz), I+2);
	    update(w*(2*B00*Dx + Ix*Rx), I+3);
	    update(w*(Iy*Rx), I+4);
	    update(w*(Iz*Rx), I+5);
	    update(w*(2*B00*Dy + Iy*Ry), I+6);
	    update(w*(Iz*Ry), I+7);
	    update(w*(Ix*Ry), I+8);
	    update(w*(Iz*Rz + 2*B00*Dz), I+9);
	    update(w*(Ix*Rz), I+10);
	    update(w*(Iy*Rz), I+11);

	    const double f3 = (B00 + Dz*Iz);
	    update(w*(Dx*f3), I+12);
	    update(w*(Dy*f3), I+13);

	    const double f6 = (Dy*Iy + B00);
	    update(w*(Dx*f6), I+14);
	    update(w*(Dz*f6), I+15);

	    const double f7 = (Dx*Ix + B00);
	    update(w*(Dz*f7), I+16);
	    update(w*(Dy*f7), I+17);
	}
    }

    /**
     * Permutes I in place from kernel order to final order: the value in
     * slot k on entry is stored in slot index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[18]) {
	double kernel_order[18];
	for (int k = 0; k < 18; ++k) {
	    kernel_order[k] = I[k];
	}
	for (int k = 0; k < 18; ++k) {
	    I[index(k)] = kernel_order[k];
	}
    }

    // Inverse mapping (final slot -> kernel slot), as emitted by the generator:
    //   { 3, 4, 5, 8, 6, 7, 10, 11, 9, 17, 14, 2, 16, 1, 12, 0, 15, 13 }

    /** Final slot of kernel term i; i must lie in [0, 18) (not checked). */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short final_slot[18] = {
	    15, 13, 11,  0,  1,  2,
	     4,  5,  3,  8,  6,  7,
	    14, 17, 10, 16, 12,  9
	};
	return final_slot[i];
    }

    /** Writes index(0) .. index(17) to idx[0..17]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 18; ++k) {
	    idx[k] = index(k);
	}
    }

};

/**
 * Generated kernel for the meta::braket<S, S, S, P> specialization.
 *
 * eval() emits 3 values; for this specialization the kernel order and the
 * final order coincide, so reorder() leaves the values where they are and
 * index() is the identity on 0..2.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::S, rysq::P> > {
    typedef void enable;
    static const bool value = true;
    /// Iteration count used by the fixed-size eval() overload.
    static const int N = 1;

    /** Default sink: adds a contribution to the slot it is handed. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /** Runs the kernel for N iterations into a 3-element buffer, accumulating. */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[3]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Runs the kernel for M iterations into I, accumulating with the default sink. */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel proper: for each r in [0, M) passes the three values
     * C[0][0]*W[r]*(D + rkl) (one per axis) to update(value, I + k).
     * B, rAi and rij are not read by this specialization.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int r = 0; r < M; ++r) {
	    const double t = t2[r];
	    // Prefactor common to every term of this iteration.
	    const double w = (C[0][0])*W[r];

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    const double Kx = (Xkl + Dx);
	    const double Ky = (Ykl + Dy);
	    const double Kz = (Dz + Zkl);

	    update(w*(Kx), I+0);
	    update(w*(Ky), I+1);
	    update(w*(Kz), I+2);
	}
    }

    /**
     * Permutes I in place from kernel order to final order: the value in
     * slot k on entry is stored in slot index(k) (here the same slot).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[3]) {
	double kernel_order[3];
	for (int k = 0; k < 3; ++k) {
	    kernel_order[k] = I[k];
	}
	for (int k = 0; k < 3; ++k) {
	    I[index(k)] = kernel_order[k];
	}
    }

    // Inverse mapping (final slot -> kernel slot), as emitted by the generator:
    //   { 0, 1, 2 }

    /** Final slot of kernel term i; i must lie in [0, 3) (not checked). */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short final_slot[3] = { 0, 1, 2 };
	return final_slot[i];
    }

    /** Writes index(0) .. index(2) to idx[0..2]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 3; ++k) {
	    idx[k] = index(k);
	}
    }

};

/**
 * Generated kernel for the meta::braket<P, S, S, P> specialization.
 *
 * eval() emits 9 values in the order the expressions were generated;
 * reorder() and both index() overloads apply or expose the permutation
 * from that order to the final layout.
 */
template<>
struct impl<meta::braket<rysq::P, rysq::S, rysq::S, rysq::P> > {
    typedef void enable;
    static const bool value = true;
    /// Iteration count used by the fixed-size eval() overload.
    static const int N = 2;

    /** Default sink: adds a contribution to the slot it is handed. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /** Runs the kernel for N iterations into a 9-element buffer, accumulating. */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[9]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Runs the kernel for M iterations into I, accumulating with the default sink. */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel proper: for each r in [0, M) passes 9 products, each scaled
     * by C[0][0]*W[r], to update(value, I + k) for k = 0..8.
     * rij is not read by this specialization.
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int r = 0; r < M; ++r) {
	    const double t = t2[r];
	    // Prefactor common to every term of this iteration.
	    const double w = (C[0][0])*W[r];

	    const double B00 = 0.5*t;

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    // K = D + rkl, per axis.
	    const double Kx = (Xkl + Dx);
	    const double Ky = (Ykl + Dy);
	    const double Kz = (Dz + Zkl);

	    update(w*(B00 + Cx*Kx), I+0);
	    update(w*(Cy*Kx), I+1);
	    update(w*(Cz*Kx), I+2);
	    update(w*(B00 + Cy*Ky), I+3);
	    update(w*(Cz*Ky), I+4);
	    update(w*(Cx*Ky), I+5);
	    update(w*(B00 + Cz*Kz), I+6);
	    update(w*(Cx*Kz), I+7);
	    update(w*(Cy*Kz), I+8);
	}
    }

    /**
     * Permutes I in place from kernel order to final order: the value in
     * slot k on entry is stored in slot index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[9]) {
	double kernel_order[9];
	for (int k = 0; k < 9; ++k) {
	    kernel_order[k] = I[k];
	}
	for (int k = 0; k < 9; ++k) {
	    I[index(k)] = kernel_order[k];
	}
    }

    // Inverse mapping (final slot -> kernel slot), as emitted by the generator:
    //   { 0, 1, 2, 5, 3, 4, 7, 8, 6 }

    /** Final slot of kernel term i; i must lie in [0, 9) (not checked). */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short final_slot[9] = {
	    0, 1, 2,
	    4, 5, 3,
	    8, 6, 7
	};
	return final_slot[i];
    }

    /** Writes index(0) .. index(8) to idx[0..8]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 9; ++k) {
	    idx[k] = index(k);
	}
    }

};

/**
 * Generated quadrature kernel for the braket
 * <rysq::P, rysq::P, rysq::P, rysq::S>.
 *
 * eval() accumulates 27 weighted terms into I[0..26] in the order in which
 * they were generated; reorder() and index() describe the permutation that
 * maps that generated order onto the final positions.
 * NOTE(review): the final layout is presumably the ordering expected by the
 * callers of this kernel -- confirm against the generator.
 */
template<>
struct impl<meta::braket<rysq::P, rysq::P, rysq::P, rysq::S> > {
    // Marker typedef and flag for this specialization.
    // NOTE(review): presumably tested by generic code to detect that a
    // specialized kernel exists -- confirm against kernel/forward.hpp.
    typedef void enable;
    static const bool value = true; 
    // Same value as the literal M = 2 used by the fixed-size eval() overload.
    static const int N = 2;

    /// Default accumulation policy: adds the term q to the slot pointed to by Q.
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /// Fixed-size entry point: takes the 27-element output array by reference
    /// and forwards to the general overload with M = 2 and the default
    /// accumulating update functor.
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<2> &t2, const vector<2> &W,
    // 		     const double (&C)[1][1],
	      double (&I)[27]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /// Pointer entry point: the caller selects the trip count M explicitly;
    /// forwards to the general overload with the default update functor.
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * General implementation: for every a in [0, M) builds the recurrence
     * coefficients from t2[a] and hands 27 terms, each scaled by
     * C[0][0]*W[a], to the update functor.
     *
     * @tparam M      number of loop iterations; t2 and W are indexed 0..M-1
     * @param A, B    scalars entering the recurrence coefficients
     * @param rAi, rBk, rAB  objects indexed with [0], [1], [2]
     * @param rij     indexed with [0], [1], [2] (read as Xij, Yij, Zij)
     * @param rkl     not referenced by this specialization
     * @param t2, W   per-iteration values, indexed by a
     * @param C       1x1 coefficient; C[0][0] multiplies every term
     * @param I       output; slots I[0] .. I[26] are passed to update
     * @param update  callable invoked as update(term, I + k)
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];



	// Compiler-specific loop hints for the Intel compiler and for CUDA.
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // B00 = t2/2;  B10 = 0.5*(1/A)*(1 - B*t2)
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    // C[q] = rAi[q] - B*rAB[q]*t2
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // D[q] = rBk[q] + A*rAB[q]*t2 (the helper is called with -A)
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // pow(x,n) maps onto the compile-time recurrence::pow<n>(x); none of
	    // the expressions in this specialization use it.
#define pow(x,y) recurrence::pow<y>((x))

	    // Shared one-dimensional factors reused by the terms below.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    // The 27 terms in generated order; see index() for final positions.
	    update((C[0][0])*W[a]*(Cz*Dy*Ix), I+0);
	    update((C[0][0])*W[a]*(Cy*Dz*Ix), I+1);
	    update((C[0][0])*W[a]*(Cx*Dz*Iy), I+2);
	    update((C[0][0])*W[a]*(Cz*Dx*Iy), I+3);
	    update((C[0][0])*W[a]*(Cx*Dy*Iz), I+4);
	    update((C[0][0])*W[a]*(Cy*Dx*Iz), I+5);
	    update((C[0][0])*W[a]*(Iz*Qx), I+6);
	    update((C[0][0])*W[a]*(Iy*Qx), I+7);
	    update((C[0][0])*W[a]*(Ix*Qy), I+8);
	    update((C[0][0])*W[a]*(Iz*Qy), I+9);
	    update((C[0][0])*W[a]*(Ix*Qz), I+10);
	    update((C[0][0])*W[a]*(Iy*Qz), I+11);
	    update((C[0][0])*W[a]*(Cy*(Dx*Xij + Qx)), I+12);
	    update((C[0][0])*W[a]*(Cz*(Dx*Xij + Qx)), I+13);
	    update((C[0][0])*W[a]*((Dx*(Cx*Ix + B10) + B00*(Xij + 2*Cx))), I+14);
	    update((C[0][0])*W[a]*((B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10))), I+15);
	    update((C[0][0])*W[a]*(Cx*(Dy*Yij + Qy)), I+16);
	    update((C[0][0])*W[a]*(Cz*(Dy*Yij + Qy)), I+17);
	    update((C[0][0])*W[a]*((Dz*(B10 + Cz*Iz) + B00*(2*Cz + Zij))), I+18);
	    // f* temporaries: subexpressions shared by the two terms that follow.
	    double f1 = (B00 + Dz*Iz);
	    update((C[0][0])*W[a]*(Cx*f1), I+19);
	    update((C[0][0])*W[a]*(Cy*f1), I+20);
	    double f11 = (B10 + Cz*Iz);
	    update((C[0][0])*W[a]*(Dx*f11), I+21);
	    update((C[0][0])*W[a]*(Dy*f11), I+22);
	    double f3 = (Cy*Iy + B10);
	    update((C[0][0])*W[a]*(Dx*f3), I+23);
	    update((C[0][0])*W[a]*(Dz*f3), I+24);
	    double f7 = (Cx*Ix + B10);
	    update((C[0][0])*W[a]*(Dy*f7), I+25);
	    update((C[0][0])*W[a]*(Dz*f7), I+26);
#undef pow

	}

    }

    /// Permutes I in place: the value at generated position k is moved to
    /// position index(k). T is a scratch copy so that the scatter never reads
    /// an entry that has already been overwritten.
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[27]) {
	double T[27];
	for (int i = 0; i < 27; ++i) {
	    T[i] = I[i];
	}
	I[11] = T[0];
	I[19] = T[1];
	I[21] = T[2];
	I[5] = T[3];
	I[15] = T[4];
	I[7] = T[5];
	I[6] = T[6];
	I[3] = T[7];
	I[10] = T[8];
	I[16] = T[9];
	I[20] = T[10];
	I[23] = T[11];
	I[1] = T[12];
	I[2] = T[13];
	I[0] = T[14];
	I[13] = T[15];
	I[12] = T[16];
	I[14] = T[17];
	I[26] = T[18];
	I[24] = T[19];
	I[25] = T[20];
	I[8] = T[21];
	I[17] = T[22];
	I[4] = T[23];
	I[22] = T[24];
	I[9] = T[25];
	I[18] = T[26];
    }

// Disabled alternative lookup. NOTE(review): index_ appears to be the inverse
// of the table in index(int), i.e. final position -> generated position.
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[27] = { 14, 12, 13, 7, 23, 3, 6, 5, 21, 25, 8, 0, 16, 15, 17, 4, 9, 22, 26, 1, 10, 2, 24, 11, 19, 20, 18 };
// 	if (index < 27) return index_[index];
// 	return size_t(-1);
//     }

    /// Returns the final position of the term generated at position i.
    /// There is no range check: i must lie in [0, 27).
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    11, 19, 21, 5, 15, 7, 6, 3, 10, 16, 20, 23, 1, 2, 0, 13, 12, 14, 26, 24, 25, 8, 17, 4, 22, 9, 18
	};
	return index[i];
    }

    /// Writes the same 27-entry permutation to idx[0..26]; the caller must
    /// provide room for 27 elements.
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 11;
	*idx++ = 19;
	*idx++ = 21;
	*idx++ = 5;
	*idx++ = 15;
	*idx++ = 7;
	*idx++ = 6;
	*idx++ = 3;
	*idx++ = 10;
	*idx++ = 16;
	*idx++ = 20;
	*idx++ = 23;
	*idx++ = 1;
	*idx++ = 2;
	*idx++ = 0;
	*idx++ = 13;
	*idx++ = 12;
	*idx++ = 14;
	*idx++ = 26;
	*idx++ = 24;
	*idx++ = 25;
	*idx++ = 8;
	*idx++ = 17;
	*idx++ = 4;
	*idx++ = 22;
	*idx++ = 9;
	*idx++ = 18;
    }


};

/**
 * Generated quadrature kernel for the braket
 * <rysq::D, rysq::S, rysq::D, rysq::S>.
 *
 * eval() accumulates 36 weighted terms into I[0..35] in the order in which
 * they were generated; reorder() and index() describe the permutation that
 * maps that generated order onto the final positions.
 * NOTE(review): the final layout is presumably the ordering expected by the
 * callers of this kernel -- confirm against the generator.
 */
template<>
struct impl<meta::braket<rysq::D, rysq::S, rysq::D, rysq::S> > {
    // Marker typedef and flag for this specialization.
    // NOTE(review): presumably tested by generic code to detect that a
    // specialized kernel exists -- confirm against kernel/forward.hpp.
    typedef void enable;
    static const bool value = true; 
    // Same value as the literal M = 3 used by the fixed-size eval() overload.
    static const int N = 3;

    /// Default accumulation policy: adds the term q to the slot pointed to by Q.
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /// Fixed-size entry point: takes the 36-element output array by reference
    /// and forwards to the general overload with M = 3 and the default
    /// accumulating update functor.
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<3> &t2, const vector<3> &W,
    // 		     const double (&C)[1][1],
	      double (&I)[36]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /// Pointer entry point: the caller selects the trip count M explicitly;
    /// forwards to the general overload with the default update functor.
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * General implementation: for every a in [0, M) builds the recurrence
     * coefficients from t2[a] and hands 36 terms, each scaled by
     * C[0][0]*W[a], to the update functor.
     *
     * @tparam M      number of loop iterations; t2 and W are indexed 0..M-1
     * @param A, B    scalars entering the recurrence coefficients
     * @param rAi, rBk, rAB  objects indexed with [0], [1], [2]
     * @param rij, rkl  not referenced by this specialization
     * @param t2, W   per-iteration values, indexed by a
     * @param C       1x1 coefficient; C[0][0] multiplies every term
     * @param I       output; slots I[0] .. I[35] are passed to update
     * @param update  callable invoked as update(term, I + k)
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {




	// Compiler-specific loop hints for the Intel compiler and for CUDA.
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // B00 = t2/2;  B10 = 0.5*(1/A)*(1 - B*t2);  B01 = 0.5*(1/B)*(1 - A*t2)
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // C[q] = rAi[q] - B*rAB[q]*t2
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // D[q] = rBk[q] + A*rAB[q]*t2 (the helper is called with -A)
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // pow(x,n) maps onto the compile-time recurrence::pow<n>(x) for the
	    // generated expressions below; the macro is removed after the last term.
#define pow(x,y) recurrence::pow<y>((x))

	    // Shared one-dimensional factors reused by the terms below.
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);
 	    double Rx = (B01 + pow(Dx,2));
 	    double Ry = (B01 + pow(Dy,2));
 	    double Rz = (pow(Dz,2) + B01);

	    // The 36 terms in generated order; see index() for final positions.
	    update((C[0][0])*W[a]*(Dy*Dz*Px), I+0);
	    update((C[0][0])*W[a]*(Dx*Dz*Py), I+1);
	    update((C[0][0])*W[a]*(Dx*Dy*Pz), I+2);
	    update((C[0][0])*W[a]*(Cz*Dy*Qx), I+3);
	    update((C[0][0])*W[a]*(Cy*Dz*Qx), I+4);
	    update((C[0][0])*W[a]*(Cz*Dx*Qy), I+5);
	    update((C[0][0])*W[a]*(Cx*Dz*Qy), I+6);
	    update((C[0][0])*W[a]*(Qx*Qy), I+7);
	    update((C[0][0])*W[a]*(Cy*Dx*Qz), I+8);
	    update((C[0][0])*W[a]*(Cx*Dy*Qz), I+9);
	    update((C[0][0])*W[a]*(Qx*Qz), I+10);
	    update((C[0][0])*W[a]*(Qy*Qz), I+11);
	    update((C[0][0])*W[a]*(Cy*Cz*Rx), I+12);
	    update((C[0][0])*W[a]*(Py*Rx), I+13);
	    update((C[0][0])*W[a]*(Pz*Rx), I+14);
	    update((C[0][0])*W[a]*(Cx*Cz*Ry), I+15);
	    update((C[0][0])*W[a]*(Pz*Ry), I+16);
	    update((C[0][0])*W[a]*(Px*Ry), I+17);
	    update((C[0][0])*W[a]*(Cx*Cy*Rz), I+18);
	    update((C[0][0])*W[a]*(Px*Rz), I+19);
	    update((C[0][0])*W[a]*(Py*Rz), I+20);
	    // f* temporaries: subexpressions shared by the terms that follow them.
	    double f0 = (Dz*Pz + 2*B00*Cz);
	    update((C[0][0])*W[a]*(Dx*f0), I+21);
	    update((C[0][0])*W[a]*(Dy*f0), I+22);
	    double f1 = (Dy*Py + 2*B00*Cy);
	    update((C[0][0])*W[a]*(Dx*f1), I+23);
	    update((C[0][0])*W[a]*(Dz*f1), I+24);
	    double f15 = (2*B00*Dy + Cy*Ry);
	    update((C[0][0])*W[a]*(Cz*f15), I+25);
	    update((C[0][0])*W[a]*(Cx*f15), I+26);
	    double f3 = (2*B00*Dz + Cz*Rz);
	    update((C[0][0])*W[a]*(Cx*f3), I+27);
	    update((C[0][0])*W[a]*(Cy*f3), I+28);
	    double f13 = 2*pow(B00,2);
	    double f7 = B01*B10;
	    update((C[0][0])*W[a]*((4*B00*Cy*Dy + f13 + B01*pow(Cy,2) + f7 + Py*pow(Dy,2))), I+29);
	    update((C[0][0])*W[a]*((f13 + B01*pow(Cz,2) + Pz*pow(Dz,2) + f7 + 4*B00*Cz*Dz)), I+30);
	    update((C[0][0])*W[a]*((B01*pow(Cx,2) + f13 + Px*pow(Dx,2) + 4*B00*Cx*Dx + f7)), I+31);
	    double f8 = (2*B00*Dx + Cx*Rx);
	    update((C[0][0])*W[a]*(Cy*f8), I+32);
	    update((C[0][0])*W[a]*(Cz*f8), I+33);
	    double f9 = (Dx*Px + 2*B00*Cx);
	    update((C[0][0])*W[a]*(Dz*f9), I+34);
	    update((C[0][0])*W[a]*(Dy*f9), I+35);
#undef pow

	}

    }

    /// Permutes I in place: the value at generated position k is moved to
    /// position index(k). T is a scratch copy so that the scatter never reads
    /// an entry that has already been overwritten.
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[36]) {
	double T[36];
	for (int i = 0; i < 36; ++i) {
	    T[i] = I[i];
	}
	I[30] = T[0];
	I[25] = T[1];
	I[20] = T[2];
	I[22] = T[3];
	I[27] = T[4];
	I[23] = T[5];
	I[33] = T[6];
	I[21] = T[7];
	I[29] = T[8];
	I[34] = T[9];
	I[28] = T[10];
	I[35] = T[11];
	I[5] = T[12];
	I[1] = T[13];
	I[2] = T[14];
	I[10] = T[15];
	I[8] = T[16];
	I[6] = T[17];
	I[15] = T[18];
	I[12] = T[19];
	I[13] = T[20];
	I[26] = T[21];
	I[32] = T[22];
	I[19] = T[23];
	I[31] = T[24];
	I[11] = T[25];
	I[9] = T[26];
	I[16] = T[27];
	I[17] = T[28];
	I[7] = T[29];
	I[14] = T[30];
	I[0] = T[31];
	I[3] = T[32];
	I[4] = T[33];
	I[24] = T[34];
	I[18] = T[35];
    }

// Disabled alternative lookup. NOTE(review): index_ appears to be the inverse
// of the table in index(int), i.e. final position -> generated position.
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[36] = { 31, 13, 14, 32, 33, 12, 17, 29, 16, 26, 15, 25, 19, 20, 30, 18, 27, 28, 35, 23, 2, 7, 3, 5, 34, 1, 21, 4, 10, 8, 0, 24, 22, 6, 9, 11 };
// 	if (index < 36) return index_[index];
// 	return size_t(-1);
//     }

    /// Returns the final position of the term generated at position i.
    /// There is no range check: i must lie in [0, 36).
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    30, 25, 20, 22, 27, 23, 33, 21, 29, 34, 28, 35, 5, 1, 2, 10, 8, 6, 15, 12, 13, 26, 32, 19, 31, 11, 9, 16, 17, 7, 14, 0, 3, 4, 24, 18
	};
	return index[i];
    }

    /// Writes the same 36-entry permutation to idx[0..35]; the caller must
    /// provide room for 36 elements.
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 30;
	*idx++ = 25;
	*idx++ = 20;
	*idx++ = 22;
	*idx++ = 27;
	*idx++ = 23;
	*idx++ = 33;
	*idx++ = 21;
	*idx++ = 29;
	*idx++ = 34;
	*idx++ = 28;
	*idx++ = 35;
	*idx++ = 5;
	*idx++ = 1;
	*idx++ = 2;
	*idx++ = 10;
	*idx++ = 8;
	*idx++ = 6;
	*idx++ = 15;
	*idx++ = 12;
	*idx++ = 13;
	*idx++ = 26;
	*idx++ = 32;
	*idx++ = 19;
	*idx++ = 31;
	*idx++ = 11;
	*idx++ = 9;
	*idx++ = 16;
	*idx++ = 17;
	*idx++ = 7;
	*idx++ = 14;
	*idx++ = 0;
	*idx++ = 3;
	*idx++ = 4;
	*idx++ = 24;
	*idx++ = 18;
    }


};

/**
 * Generated quadrature kernel for the braket
 * <rysq::S, rysq::D, rysq::S, rysq::D>.
 *
 * eval() accumulates 36 weighted terms into I[0..35] in the order in which
 * they were generated; reorder() and index() describe the permutation that
 * maps that generated order onto the final positions.
 * NOTE(review): the final layout is presumably the ordering expected by the
 * callers of this kernel -- confirm against the generator.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::D, rysq::S, rysq::D> > {
    // Marker typedef and flag for this specialization.
    // NOTE(review): presumably tested by generic code to detect that a
    // specialized kernel exists -- confirm against kernel/forward.hpp.
    typedef void enable;
    static const bool value = true; 
    // Same value as the literal M = 3 used by the fixed-size eval() overload.
    static const int N = 3;

    /// Default accumulation policy: adds the term q to the slot pointed to by Q.
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /// Fixed-size entry point: takes the 36-element output array by reference
    /// and forwards to the general overload with M = 3 and the default
    /// accumulating update functor.
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<3> &t2, const vector<3> &W,
    // 		     const double (&C)[1][1],
	      double (&I)[36]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /// Pointer entry point: the caller selects the trip count M explicitly;
    /// forwards to the general overload with the default update functor.
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * General implementation: for every a in [0, M) builds the recurrence
     * coefficients from t2[a] and hands 36 terms, each scaled by
     * C[0][0]*W[a], to the update functor.
     *
     * @tparam M      number of loop iterations; t2 and W are indexed 0..M-1
     * @param A, B    scalars entering the recurrence coefficients
     * @param rAi, rBk, rAB  objects indexed with [0], [1], [2]
     * @param rij     indexed with [0], [1], [2] (read as Xij, Yij, Zij)
     * @param rkl     indexed with [0], [1], [2] (read as Xkl, Ykl, Zkl)
     * @param t2, W   per-iteration values, indexed by a
     * @param C       1x1 coefficient; C[0][0] multiplies every term
     * @param I       output; slots I[0] .. I[35] are passed to update
     * @param update  callable invoked as update(term, I + k)
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


	// Compiler-specific loop hints for the Intel compiler and for CUDA.
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // B00 = t2/2;  B10 = 0.5*(1/A)*(1 - B*t2);  B01 = 0.5*(1/B)*(1 - A*t2)
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // C[q] = rAi[q] - B*rAB[q]*t2
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // D[q] = rBk[q] + A*rAB[q]*t2 (the helper is called with -A)
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // pow(x,n) maps onto the compile-time recurrence::pow<n>(x) for the
	    // generated expressions below; the macro is removed after the last term.
#define pow(x,y) recurrence::pow<y>((x))

	    // C and D coefficients shifted by the rij and rkl components.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);

	    // The 36 terms in generated order; see index() for final positions.
	    // f* temporaries: subexpressions shared by several of the terms.
	    double f0 = (Iz*(pow(Kz,2) + B01) + 2*B00*Kz);
	    update((C[0][0])*W[a]*(Ix*f0), I+0);
	    update((C[0][0])*W[a]*(Iy*f0), I+1);
	    double f10 = (2*B00*Iz + Kz*(B10 + pow(Iz,2)));
	    update((C[0][0])*W[a]*(Kx*f10), I+2);
	    update((C[0][0])*W[a]*(Ky*f10), I+3);
	    double f13 = (pow(Kx,2) + B01);
	    update((C[0][0])*W[a]*(Iy*Iz*f13), I+4);
	    double f14 = (B00 + Ix*Kx);
	    update((C[0][0])*W[a]*(Iz*Ky*f14), I+5);
	    update((C[0][0])*W[a]*(Iy*Kz*f14), I+6);
	    double f15 = (B10 + pow(Iy,2));
	    update((C[0][0])*W[a]*(Kx*Kz*f15), I+7);
	    update((C[0][0])*W[a]*(f13*f15), I+8);
	    double f16 = (B00 + Iy*Ky);
	    update((C[0][0])*W[a]*(Iz*Kx*f16), I+9);
	    update((C[0][0])*W[a]*(Ix*Kz*f16), I+10);
	    update((C[0][0])*W[a]*(f14*f16), I+11);
	    double f17 = (B10 + pow(Iz,2));
	    update((C[0][0])*W[a]*(Kx*Ky*f17), I+12);
	    update((C[0][0])*W[a]*(f13*f17), I+13);
	    double f11 = B01*B10;
	    double f18 = 2*pow(B00,2);
	    update((C[0][0])*W[a]*((f11 + f18 + B01*pow(Ix,2) + pow(Kx,2)*(B10 + pow(Ix,2)) + 4*B00*Ix*Kx)), I+14);
	    update((C[0][0])*W[a]*((f11 + f18 + B01*pow(Iy,2) + pow(Ky,2)*(B10 + pow(Iy,2)) + 4*B00*Iy*Ky)), I+15);
	    update((C[0][0])*W[a]*((f11 + f18 + 4*B00*Iz*Kz + B01*pow(Iz,2) + pow(Kz,2)*(B10 + pow(Iz,2)))), I+16);
	    double f19 = (pow(Ky,2) + B01);
	    update((C[0][0])*W[a]*(Ix*Iz*f19), I+17);
	    update((C[0][0])*W[a]*(f17*f19), I+18);
	    double f20 = (B10 + pow(Ix,2));
	    update((C[0][0])*W[a]*(Ky*Kz*f20), I+19);
	    update((C[0][0])*W[a]*(f19*f20), I+20);
	    double f21 = (Ix*(pow(Kx,2) + B01) + 2*B00*Kx);
	    update((C[0][0])*W[a]*(Iy*f21), I+21);
	    update((C[0][0])*W[a]*(Iz*f21), I+22);
	    double f22 = (2*B00*Iy + Ky*(B10 + pow(Iy,2)));
	    update((C[0][0])*W[a]*(Kx*f22), I+23);
	    update((C[0][0])*W[a]*(Kz*f22), I+24);
	    double f3 = (2*B00*Ky + Iy*(pow(Ky,2) + B01));
	    update((C[0][0])*W[a]*(Iz*f3), I+25);
	    update((C[0][0])*W[a]*(Ix*f3), I+26);
	    double f4 = (Iz*Kz + B00);
	    update((C[0][0])*W[a]*(Iy*Kx*f4), I+27);
	    update((C[0][0])*W[a]*(Ix*Ky*f4), I+28);
	    update((C[0][0])*W[a]*(f14*f4), I+29);
	    update((C[0][0])*W[a]*(f16*f4), I+30);
	    double f8 = (pow(Kz,2) + B01);
	    update((C[0][0])*W[a]*(Ix*Iy*f8), I+31);
	    update((C[0][0])*W[a]*(f20*f8), I+32);
	    update((C[0][0])*W[a]*(f15*f8), I+33);
	    double f9 = (2*B00*Ix + Kx*(B10 + pow(Ix,2)));
	    update((C[0][0])*W[a]*(Kz*f9), I+34);
	    update((C[0][0])*W[a]*(Ky*f9), I+35);
#undef pow

	}

    }

    /// Permutes I in place: the value at generated position k is moved to
    /// position index(k). T is a scratch copy so that the scatter never reads
    /// an entry that has already been overwritten.
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[36]) {
	double T[36];
	for (int i = 0; i < 36; ++i) {
	    T[i] = I[i];
	}
	I[16] = T[0];
	I[17] = T[1];
	I[26] = T[2];
	I[32] = T[3];
	I[5] = T[4];
	I[22] = T[5];
	I[27] = T[6];
	I[25] = T[7];
	I[1] = T[8];
	I[23] = T[9];
	I[33] = T[10];
	I[21] = T[11];
	I[20] = T[12];
	I[2] = T[13];
	I[0] = T[14];
	I[7] = T[15];
	I[14] = T[16];
	I[10] = T[17];
	I[8] = T[18];
	I[30] = T[19];
	I[6] = T[20];
	I[3] = T[21];
	I[4] = T[22];
	I[19] = T[23];
	I[31] = T[24];
	I[11] = T[25];
	I[9] = T[26];
	I[29] = T[27];
	I[34] = T[28];
	I[28] = T[29];
	I[35] = T[30];
	I[15] = T[31];
	I[12] = T[32];
	I[13] = T[33];
	I[24] = T[34];
	I[18] = T[35];
    }

// Disabled alternative lookup. NOTE(review): index_ appears to be the inverse
// of the table in index(int), i.e. final position -> generated position.
//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[36] = { 14, 8, 13, 21, 22, 4, 20, 15, 18, 26, 17, 25, 32, 33, 16, 31, 0, 1, 35, 23, 12, 11, 5, 9, 34, 7, 2, 6, 29, 27, 19, 24, 3, 10, 28, 30 };
// 	if (index < 36) return index_[index];
// 	return size_t(-1);
//     }

    /// Returns the final position of the term generated at position i.
    /// There is no range check: i must lie in [0, 36).
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    16, 17, 26, 32, 5, 22, 27, 25, 1, 23, 33, 21, 20, 2, 0, 7, 14, 10, 8, 30, 6, 3, 4, 19, 31, 11, 9, 29, 34, 28, 35, 15, 12, 13, 24, 18
	};
	return index[i];
    }

    /// Writes the same 36-entry permutation to idx[0..35]; the caller must
    /// provide room for 36 elements.
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 16;
	*idx++ = 17;
	*idx++ = 26;
	*idx++ = 32;
	*idx++ = 5;
	*idx++ = 22;
	*idx++ = 27;
	*idx++ = 25;
	*idx++ = 1;
	*idx++ = 23;
	*idx++ = 33;
	*idx++ = 21;
	*idx++ = 20;
	*idx++ = 2;
	*idx++ = 0;
	*idx++ = 7;
	*idx++ = 14;
	*idx++ = 10;
	*idx++ = 8;
	*idx++ = 30;
	*idx++ = 6;
	*idx++ = 3;
	*idx++ = 4;
	*idx++ = 19;
	*idx++ = 31;
	*idx++ = 11;
	*idx++ = 9;
	*idx++ = 29;
	*idx++ = 34;
	*idx++ = 28;
	*idx++ = 35;
	*idx++ = 15;
	*idx++ = 12;
	*idx++ = 13;
	*idx++ = 24;
	*idx++ = 18;
    }


};

template<>
struct impl<meta::braket<rysq::D, rysq::SP, rysq::D, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<3> &t2, const vector<3> &W,
    // 		     const double (&C)[1][2],
	      double (&I)[144]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];



#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

#define pow(x,y) recurrence::pow<y>((x))

 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);
 	    double Rx = (B01 + pow(Dx,2));
 	    double Ry = (B01 + pow(Dy,2));
 	    double Rz = (pow(Dz,2) + B01);

	    update((C[0][0])*W[a]*(Dy*Dz*Px), I+0);
	    update((C[0][0])*W[a]*(Dx*Dz*Py), I+1);
	    update((C[0][0])*W[a]*(Dx*Dy*Pz), I+2);
	    update((C[0][0])*W[a]*(Cy*Dz*Qx), I+3);
	    update((C[0][0])*W[a]*(Cz*Dy*Qx), I+4);
	    update((C[0][0])*W[a]*(Cz*Dx*Qy), I+5);
	    update((C[0][0])*W[a]*(Cx*Dz*Qy), I+6);
	    update((C[0][1])*W[a]*(Iz*Qx*Qy), I+7);
	    update((C[0][0])*W[a]*(Qx*Qy), I+8);
	    update((C[0][0])*W[a]*(Cy*Dx*Qz), I+9);
	    update((C[0][0])*W[a]*(Cx*Dy*Qz), I+10);
	    update((C[0][1])*W[a]*(Ix*Qy*Qz), I+11);
	    update((C[0][0])*W[a]*(Qy*Qz), I+12);
	    update((C[0][1])*W[a]*(Iy*Qx*Qz), I+13);
	    update((C[0][0])*W[a]*(Qx*Qz), I+14);
	    update((C[0][0])*W[a]*(Cy*Cz*Rx), I+15);
	    update((C[0][1])*W[a]*(Iz*Py*Rx), I+16);
	    update((C[0][0])*W[a]*(Py*Rx), I+17);
	    update((C[0][1])*W[a]*(Iy*Pz*Rx), I+18);
	    update((C[0][0])*W[a]*(Pz*Rx), I+19);
	    update((C[0][0])*W[a]*(Cx*Cz*Ry), I+20);
	    update((C[0][1])*W[a]*(Ix*Pz*Ry), I+21);
	    update((C[0][0])*W[a]*(Pz*Ry), I+22);
	    update((C[0][1])*W[a]*(Iz*Px*Ry), I+23);
	    update((C[0][0])*W[a]*(Px*Ry), I+24);
	    update((C[0][0])*W[a]*(Cx*Cy*Rz), I+25);
	    update((C[0][1])*W[a]*(Ix*Py*Rz), I+26);
	    update((C[0][0])*W[a]*(Py*Rz), I+27);
	    update((C[0][1])*W[a]*(Iy*Px*Rz), I+28);
	    update((C[0][0])*W[a]*(Px*Rz), I+29);
	    update((C[0][1])*W[a]*(Dz*Py*(Dx*Xij + Qx)), I+30);
	    update((C[0][1])*W[a]*(Cy*Qz*(Dx*Xij + Qx)), I+31);
	    update((C[0][1])*W[a]*(Cy*Rz*(Px + Cx*Xij)), I+32);
	    update((C[0][1])*W[a]*(Dz*Qy*(Px + Cx*Xij)), I+33);
	    update((C[0][1])*W[a]*(Dy*Pz*(Dx*Xij + Qx)), I+34);
	    update((C[0][1])*W[a]*(Cz*Qy*(Dx*Xij + Qx)), I+35);
	    update((C[0][1])*W[a]*(Cz*Ry*(Px + Cx*Xij)), I+36);
	    update((C[0][1])*W[a]*(Dy*Qz*(Px + Cx*Xij)), I+37);
	    update((C[0][1])*W[a]*(Cz*Qx*(Dy*Yij + Qy)), I+38);
	    update((C[0][1])*W[a]*(Dz*Px*(Dy*Yij + Qy)), I+39);
	    update((C[0][1])*W[a]*(Cx*Qz*(Dy*Yij + Qy)), I+40);
	    update((C[0][1])*W[a]*(Dx*Pz*(Dy*Yij + Qy)), I+41);
	    update((C[0][1])*W[a]*(Dx*Qy*(Cz*Zij + Pz)), I+42);
	    update((C[0][1])*W[a]*(Dy*Qx*(Cz*Zij + Pz)), I+43);
	    update((C[0][1])*W[a]*(Cx*Ry*(Cz*Zij + Pz)), I+44);
	    update((C[0][1])*W[a]*(Cy*Rx*(Cz*Zij + Pz)), I+45);
	    double f1 = (Dz*Pz + 2*B00*Cz);
	    update((C[0][1])*W[a]*(Cx*Dy*(f1 + Qz*Zij)), I+46);
	    update((C[0][1])*W[a]*(Cy*Dx*(f1 + Qz*Zij)), I+47);
	    update((C[0][1])*W[a]*(Qx*(f1 + Qz*Zij)), I+48);
	    update((C[0][1])*W[a]*(Qy*(f1 + Qz*Zij)), I+49);
	    update((C[0][1])*W[a]*(f1*(Dy*Yij + Qy)), I+50);
	    update((C[0][1])*W[a]*(f1*(Dx*Xij + Qx)), I+51);
	    update((C[0][1])*W[a]*(Dy*Ix*f1), I+52);
	    update((C[0][0])*W[a]*(Dy*f1), I+53);
	    update((C[0][1])*W[a]*(Dx*Iy*f1), I+54);
	    update((C[0][0])*W[a]*(Dx*f1), I+55);
	    double f14 = (2*B00*Dx + Ix*Rx);
	    update((C[0][1])*W[a]*(Cy*Cz*f14), I+56);
	    update((C[0][1])*W[a]*(Pz*f14), I+57);
	    update((C[0][1])*W[a]*(Py*f14), I+58);
	    double f16 = (2*B00*Cy*Yij + 3*B00*Py + Dy*(B10*(3*Cy + Yij) + Iy*pow(Cy,2)));
	    update((C[0][1])*W[a]*(Dz*f16), I+59);
	    update((C[0][1])*W[a]*(Dx*f16), I+60);
	    double f2 = (4*B00*Cy*Dy + Py*Ry + 2*pow(B00,2));
	    update((C[0][1])*W[a]*(Cx*(Yij*(2*B00*Dy + Cy*Ry) + f2)), I+61);
	    update((C[0][1])*W[a]*(Cz*(Yij*(2*B00*Dy + Cy*Ry) + f2)), I+62);
	    update((C[0][1])*W[a]*(Iz*f2), I+63);
	    update((C[0][1])*W[a]*(Ix*f2), I+64);
	    update((C[0][0])*W[a]*(f2), I+65);
	    double f21 = (Cy*Iy + B10);
	    update((C[0][1])*W[a]*(Dx*Qz*f21), I+66);
	    update((C[0][1])*W[a]*(Cx*Rz*f21), I+67);
	    update((C[0][1])*W[a]*(Dz*Qx*f21), I+68);
	    update((C[0][1])*W[a]*(Cz*Rx*f21), I+69);
	    double f22 = (B10*(3*Cx + Xij) + Ix*pow(Cx,2));
	    update((C[0][1])*W[a]*(Dy*Dz*f22), I+70);
	    update((C[0][1])*W[a]*(Ry*f22), I+71);
	    update((C[0][1])*W[a]*(Rz*f22), I+72);
	    double f23 = (2*B00*Dx + Cx*Rx);
	    update((C[0][1])*W[a]*(f23*(Cz*Zij + Pz)), I+73);
	    update((C[0][1])*W[a]*(Cz*Iy*f23), I+74);
	    update((C[0][1])*W[a]*(Cy*Iz*f23), I+75);
	    update((C[0][0])*W[a]*(Cy*f23), I+76);
	    update((C[0][1])*W[a]*(f21*f23), I+77);
	    update((C[0][0])*W[a]*(Cz*f23), I+78);
	    double f24 = (B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10));
	    update((C[0][1])*W[a]*(Cz*Dx*f24), I+79);
	    update((C[0][1])*W[a]*(Cx*Dz*f24), I+80);
	    update((C[0][1])*W[a]*(Qx*f24), I+81);
	    update((C[0][1])*W[a]*(Qz*f24), I+82);
	    double f28 = (2*B00*Dy + Iy*Ry);
	    update((C[0][1])*W[a]*(Cx*Cz*f28), I+83);
	    update((C[0][1])*W[a]*(Px*f28), I+84);
	    update((C[0][1])*W[a]*(Pz*f28), I+85);
	    double f29 = (Dx*(B10*(3*Cx + Xij) + Ix*pow(Cx,2)) + 2*B00*Cx*Xij + 3*B00*Px);
	    update((C[0][1])*W[a]*(Dz*f29), I+86);
	    update((C[0][1])*W[a]*(Dy*f29), I+87);
	    double f30 = (Iz*Rz + 2*B00*Dz);
	    update((C[0][1])*W[a]*(Cx*Cy*f30), I+88);
	    update((C[0][1])*W[a]*(Px*f30), I+89);
	    update((C[0][1])*W[a]*(Py*f30), I+90);
	    double f31 = (B10*(3*Cy + Yij) + Iy*pow(Cy,2));
	    update((C[0][1])*W[a]*(Dx*Dz*f31), I+91);
	    update((C[0][1])*W[a]*(Rz*f31), I+92);
	    update((C[0][1])*W[a]*(Rx*f31), I+93);
	    double f32 = (Dy*Py + 2*B00*Cy);
	    update((C[0][1])*W[a]*(f32*(Dx*Xij + Qx)), I+94);
	    update((C[0][1])*W[a]*(Dz*Ix*f32), I+95);
	    update((C[0][0])*W[a]*(Dz*f32), I+96);
	    update((C[0][1])*W[a]*(Dx*Iz*f32), I+97);
	    update((C[0][0])*W[a]*(Dx*f32), I+98);
	    double f33 = (2*B00*Dz + Cz*Rz);
	    update((C[0][1])*W[a]*(Cy*Ix*f33), I+99);
	    update((C[0][1])*W[a]*(f33*(Px + Cx*Xij)), I+100);
	    update((C[0][1])*W[a]*(Cx*Iy*f33), I+101);
	    update((C[0][0])*W[a]*(Cx*f33), I+102);
	    update((C[0][0])*W[a]*(Cy*f33), I+103);
	    update((C[0][1])*W[a]*(f21*f33), I+104);
	    double f36 = 2*pow(B00,2);
	    double f3 = B01*B10;
	    update((C[0][1])*W[a]*(Cz*(f36 + 2*B00*Dx*(Xij + 2*Cx) + B01*Cx*Ix + pow(Dx,2)*(Cx*Ix + B10) + f3)), I+105);
	    update((C[0][1])*W[a]*((B01*Iy*pow(Cy,2) + 4*B00*Cy*Dy*Yij + 6*B00*Dy*Py + Yij*(f36 + f3 + Py*pow(Dy,2)) + Cy*(3*f36 + 3*f3 + pow(Dy,2)*(3*B10 + pow(Cy,2))))), I+106);
	    update((C[0][1])*W[a]*(Cy*(f36 + pow(Dz,2)*(B10 + Cz*Iz) + 2*B00*Dz*(2*Cz + Zij) + f3 + B01*Cz*Iz)), I+107);
	    update((C[0][1])*W[a]*(Cx*(f36 + pow(Dz,2)*(B10 + Cz*Iz) + 2*B00*Dz*(2*Cz + Zij) + f3 + B01*Cz*Iz)), I+108);
	    update((C[0][1])*W[a]*((6*B00*Dz*Pz + B01*Iz*pow(Cz,2) + Cz*(3*f36 + 3*f3 + pow(Dz,2)*(3*B10 + pow(Cz,2))) + Zij*(f36 + Pz*pow(Dz,2) + f3) + 4*B00*Cz*Dz*Zij)), I+109);
	    update((C[0][1])*W[a]*(Iy*(f36 + B01*pow(Cz,2) + Pz*pow(Dz,2) + f3 + 4*B00*Cz*Dz)), I+110);
	    update((C[0][1])*W[a]*(Ix*(f36 + B01*pow(Cz,2) + Pz*pow(Dz,2) + f3 + 4*B00*Cz*Dz)), I+111);
	    update((C[0][0])*W[a]*((f36 + B01*pow(Cz,2) + Pz*pow(Dz,2) + f3 + 4*B00*Cz*Dz)), I+112);
	    update((C[0][1])*W[a]*(Iz*(B01*pow(Cx,2) + f36 + Px*pow(Dx,2) + 4*B00*Cx*Dx + f3)), I+113);
	    update((C[0][1])*W[a]*(Iy*(B01*pow(Cx,2) + f36 + Px*pow(Dx,2) + 4*B00*Cx*Dx + f3)), I+114);
	    update((C[0][0])*W[a]*((B01*pow(Cx,2) + f36 + Px*pow(Dx,2) + 4*B00*Cx*Dx + f3)), I+115);
	    update((C[0][1])*W[a]*((B01*Ix*pow(Cx,2) + 6*B00*Dx*Px + Cx*(3*f36 + 3*f3 + pow(Dx,2)*(3*B10 + pow(Cx,2))) + 4*B00*Cx*Dx*Xij + Xij*(f36 + Px*pow(Dx,2) + f3))), I+116);
	    update((C[0][1])*W[a]*(Cy*(f36 + 2*B00*Dx*(Xij + 2*Cx) + B01*Cx*Ix + pow(Dx,2)*(Cx*Ix + B10) + f3)), I+117);
	    double f4 = (Dx*Px + 2*B00*Cx);
	    update((C[0][1])*W[a]*(Cy*Dz*(Qx*Xij + f4)), I+118);
	    update((C[0][1])*W[a]*(Qy*(Qx*Xij + f4)), I+119);
	    update((C[0][1])*W[a]*(Qz*(Qx*Xij + f4)), I+120);
	    update((C[0][1])*W[a]*(Cz*Dy*(Qx*Xij + f4)), I+121);
	    update((C[0][1])*W[a]*(f4*(Dy*Yij + Qy)), I+122);
	    update((C[0][1])*W[a]*(Dz*Iy*f4), I+123);
	    update((C[0][0])*W[a]*(Dz*f4), I+124);
	    update((C[0][1])*W[a]*(Dy*Iz*f4), I+125);
	    update((C[0][0])*W[a]*(Dy*f4), I+126);
	    double f40 = (Iz*pow(Cz,2) + B10*(3*Cz + Zij));
	    update((C[0][1])*W[a]*(Dx*Dy*f40), I+127);
	    update((C[0][1])*W[a]*(Rx*f40), I+128);
	    update((C[0][1])*W[a]*(Ry*f40), I+129);
	    double f42 = (2*B00*Dy + Cy*Ry);
	    update((C[0][1])*W[a]*(f42*(Px + Cx*Xij)), I+130);
	    update((C[0][1])*W[a]*(Cx*Iz*f42), I+131);
	    update((C[0][1])*W[a]*(f42*(Cz*Zij + Pz)), I+132);
	    update((C[0][1])*W[a]*(Cz*Ix*f42), I+133);
	    update((C[0][0])*W[a]*(Cz*f42), I+134);
	    update((C[0][0])*W[a]*(Cx*f42), I+135);
	    double f7 = (Dz*(Iz*pow(Cz,2) + B10*(3*Cz + Zij)) + 3*B00*Pz + 2*B00*Cz*Zij);
	    update((C[0][1])*W[a]*(Dx*f7), I+136);
	    update((C[0][1])*W[a]*(Dy*f7), I+137);
	    double f8 = (B00 + Dz*Iz);
	    update((C[0][1])*W[a]*(Dx*Py*f8), I+138);
	    update((C[0][1])*W[a]*(Cy*Qx*f8), I+139);
	    update((C[0][1])*W[a]*(Dy*Px*f8), I+140);
	    update((C[0][1])*W[a]*(Cx*Qy*f8), I+141);
	    update((C[0][1])*W[a]*(f32*f8), I+142);
	    update((C[0][1])*W[a]*(f4*f8), I+143);
#undef pow

	}

    }

    BOOST_GPU_ENABLED static
    void reorder(double (&I)[144]) {
	double T[144];
	for (int i = 0; i < 144; ++i) {
	    T[i] = I[i];
	}
	I[120] = T[0];
	I[97] = T[1];
	I[74] = T[2];
	I[99] = T[3];
	I[76] = T[4];
	I[77] = T[5];
	I[123] = T[6];
	I[93] = T[7];
	I[75] = T[8];
	I[101] = T[9];
	I[124] = T[10];
	I[131] = T[11];
	I[125] = T[12];
	I[112] = T[13];
	I[100] = T[14];
	I[5] = T[15];
	I[19] = T[16];
	I[1] = T[17];
	I[14] = T[18];
	I[2] = T[19];
	I[28] = T[20];
	I[32] = T[21];
	I[26] = T[22];
	I[42] = T[23];
	I[24] = T[24];
	I[51] = T[25];
	I[55] = T[26];
	I[49] = T[27];
	I[60] = T[28];
	I[48] = T[29];
	I[103] = T[30];
	I[107] = T[31];
	I[57] = T[32];
	I[129] = T[33];
	I[80] = T[34];
	I[83] = T[35];
	I[34] = T[36];
	I[130] = T[37];
	I[88] = T[38];
	I[132] = T[39];
	I[136] = T[40];
	I[86] = T[41];
	I[95] = T[42];
	I[94] = T[43];
	I[46] = T[44];
	I[23] = T[45];
	I[142] = T[46];
	I[119] = T[47];
	I[118] = T[48];
	I[143] = T[49];
	I[134] = T[50];
	I[104] = T[51];
	I[128] = T[52];
	I[122] = T[53];
	I[110] = T[54];
	I[98] = T[55];
	I[11] = T[56];
	I[8] = T[57];
	I[7] = T[58];
	I[133] = T[59];
	I[85] = T[60];
	I[39] = T[61];
	I[41] = T[62];
	I[43] = T[63];
	I[31] = T[64];
	I[25] = T[65];
	I[113] = T[66];
	I[63] = T[67];
	I[111] = T[68];
	I[17] = T[69];
	I[126] = T[70];
	I[30] = T[71];
	I[54] = T[72];
	I[22] = T[73];
	I[16] = T[74];
	I[21] = T[75];
	I[3] = T[76];
	I[15] = T[77];
	I[4] = T[78];
	I[89] = T[79];
	I[135] = T[80];
	I[87] = T[81];
	I[137] = T[82];
	I[40] = T[83];
	I[36] = T[84];
	I[38] = T[85];
	I[102] = T[86];
	I[78] = T[87];
	I[69] = T[88];
	I[66] = T[89];
	I[67] = T[90];
	I[109] = T[91];
	I[61] = T[92];
	I[13] = T[93];
	I[79] = T[94];
	I[127] = T[95];
	I[121] = T[96];
	I[91] = T[97];
	I[73] = T[98];
	I[59] = T[99];
	I[58] = T[100];
	I[64] = T[101];
	I[52] = T[102];
	I[53] = T[103];
	I[65] = T[104];
	I[10] = T[105];
	I[37] = T[106];
	I[71] = T[107];
	I[70] = T[108];
	I[68] = T[109];
	I[62] = T[110];
	I[56] = T[111];
	I[50] = T[112];
	I[18] = T[113];
	I[12] = T[114];
	I[0] = T[115];
	I[6] = T[116];
	I[9] = T[117];
	I[105] = T[118];
	I[81] = T[119];
	I[106] = T[120];
	I[82] = T[121];
	I[84] = T[122];
	I[108] = T[123];
	I[96] = T[124];
	I[90] = T[125];
	I[72] = T[126];
	I[92] = T[127];
	I[20] = T[128];
	I[44] = T[129];
	I[33] = T[130];
	I[45] = T[131];
	I[47] = T[132];
	I[35] = T[133];
	I[29] = T[134];
	I[27] = T[135];
	I[116] = T[136];
	I[140] = T[137];
	I[115] = T[138];
	I[117] = T[139];
	I[138] = T[140];
	I[141] = T[141];
	I[139] = T[142];
	I[114] = T[143];
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[144] = { 115, 17, 19, 76, 78, 15, 116, 58, 57, 117, 105, 56, 114, 93, 18, 77, 74, 69, 113, 16, 128, 75, 73, 45, 24, 65, 22, 135, 20, 134, 71, 64, 21, 130, 36, 133, 84, 106, 85, 61, 83, 62, 23, 63, 129, 131, 44, 132, 29, 27, 112, 25, 102, 103, 72, 26, 111, 32, 100, 99, 28, 92, 110, 67, 101, 104, 89, 90, 109, 88, 108, 107, 126, 98, 2, 8, 4, 5, 87, 94, 34, 119, 121, 35, 122, 60, 41, 81, 38, 79, 125, 97, 127, 7, 43, 42, 124, 1, 55, 3, 14, 9, 86, 30, 51, 118, 120, 31, 123, 91, 54, 68, 13, 66, 143, 138, 136, 139, 48, 47, 0, 96, 53, 6, 10, 12, 70, 95, 52, 33, 37, 11, 39, 59, 50, 80, 40, 82, 140, 142, 137, 141, 46, 49 };
// 	if (index < 144) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Looks up a single entry of this kernel's 144-element index table.
     *
     * The table holds the same values, in the same order, as the targets
     * written by the sibling reorder() (I[table[k]] = T[k]) and by the
     * pointer overload of index().
     *
     * @param i table position; must lie in [0, 144) -- no range check is made.
     * @return the table entry stored at position i.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	    120,  97,  74,  99,  76,  77, 123,  93,  75, 101, 124, 131,
	    125, 112, 100,   5,  19,   1,  14,   2,  28,  32,  26,  42,
	     24,  51,  55,  49,  60,  48, 103, 107,  57, 129,  80,  83,
	     34, 130,  88, 132, 136,  86,  95,  94,  46,  23, 142, 119,
	    118, 143, 134, 104, 128, 122, 110,  98,  11,   8,   7, 133,
	     85,  39,  41,  43,  31,  25, 113,  63, 111,  17, 126,  30,
	     54,  22,  16,  21,   3,  15,   4,  89, 135,  87, 137,  40,
	     36,  38, 102,  78,  69,  66,  67, 109,  61,  13,  79, 127,
	    121,  91,  73,  59,  58,  64,  52,  53,  65,  10,  37,  71,
	     70,  68,  62,  56,  50,  18,  12,   0,   6,   9, 105,  81,
	    106,  82,  84, 108,  96,  90,  72,  92,  20,  44,  33,  45,
	     47,  35,  29,  27, 116, 140, 115, 117, 138, 141, 139, 114
	};
	return permutation[i];
    }

    /**
     * Writes the whole 144-entry index table to consecutive elements
     * starting at idx, in table order (idx[0] = 120, ..., idx[143] = 114).
     *
     * @tparam U element type of the destination; must be assignable from int.
     * @param idx destination with room for at least 144 elements; the size is
     *            not checked here.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int permutation[144] = {
	    120,  97,  74,  99,  76,  77, 123,  93,  75, 101, 124, 131,
	    125, 112, 100,   5,  19,   1,  14,   2,  28,  32,  26,  42,
	     24,  51,  55,  49,  60,  48, 103, 107,  57, 129,  80,  83,
	     34, 130,  88, 132, 136,  86,  95,  94,  46,  23, 142, 119,
	    118, 143, 134, 104, 128, 122, 110,  98,  11,   8,   7, 133,
	     85,  39,  41,  43,  31,  25, 113,  63, 111,  17, 126,  30,
	     54,  22,  16,  21,   3,  15,   4,  89, 135,  87, 137,  40,
	     36,  38, 102,  78,  69,  66,  67, 109,  61,  13,  79, 127,
	    121,  91,  73,  59,  58,  64,  52,  53,  65,  10,  37,  71,
	     70,  68,  62,  56,  50,  18,  12,   0,   6,   9, 105,  81,
	    106,  82,  84, 108,  96,  90,  72,  92,  20,  44,  33,  45,
	     47,  35,  29,  27, 116, 140, 115, 117, 138, 141, 139, 114
	};
	for (int k = 0; k < 144; ++k) {
	    idx[k] = permutation[k];
	}
    }


};

/**
 * Quadrature kernel specialization for meta::braket<P, D, S, S>.
 *
 * Every call accumulates 18 values into the output buffer I through an
 * `update` functor.  N is the point count the fixed-size eval() overload
 * forwards as M.
 *
 * NOTE(review): t2 and W are presumably the quadrature roots and weights and
 * C the contraction coefficient -- confirm against the caller.
 */
template<>
struct impl<meta::braket<rysq::P, rysq::D, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulation policy: adds the contribution q to *Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: evaluates N points into an 18-element array
     * using the default `update` policy.
     */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[18]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer entry point: evaluates M points using the default `update`
     * policy.  I must address at least 18 doubles (not checked).
     */
    template<int M,
	     class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel.  For each point a in [0, M) it derives the recurrence
     * coefficients from t2[a] and hands 18 weighted terms to `update`,
     * the k-th one together with the address I+k.
     *
     * rBk and rkl are accepted for signature uniformity but are not read by
     * this specialization.
     *
     * @tparam M number of points to process (t2 and W are indexed 0..M-1).
     * @param update callable invoked as update(value, target).
     */
    template<int M,
	     class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {
	    const double t = t2[a];
	    // Common prefactor of every term produced at this point.
	    const double w = (C[0][0])*W[a];

	    const double B10 = recurrence::coefficient(1.0/A, B, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Ix = (Cx + Xij);
	    const double Iy = (Cy + Yij);
	    const double Iz = (Cz + Zij);

	    const double Ix2 = recurrence::pow<2>(Ix);
	    const double Iy2 = recurrence::pow<2>(Iy);
	    const double Iz2 = recurrence::pow<2>(Iz);

	    update(w*(Cz*Ix*Iy), I+0);
	    update(w*(Cy*Ix*Iz), I+1);
	    update(w*(Cx*Iy*Iz), I+2);
	    update(w*((B10*(3*Cx + 2*Xij) + Cx*Ix2)), I+3);
	    update(w*((Cy*Iy2 + B10*(3*Cy + 2*Yij))), I+4);
	    update(w*((B10*(3*Cz + 2*Zij) + Cz*Iz2)), I+5);

	    const double f1 = (Cx*Ix + B10);
	    update(w*(Iz*f1), I+6);
	    update(w*(Iy*f1), I+7);

	    const double f3 = (Cy*Iy + B10);
	    update(w*(Ix*f3), I+8);
	    update(w*(Iz*f3), I+9);

	    const double f5 = (B10 + Iy2);
	    update(w*(Cz*f5), I+10);
	    update(w*(Cx*f5), I+11);

	    const double f6 = (B10 + Iz2);
	    update(w*(Cx*f6), I+12);
	    update(w*(Cy*f6), I+13);

	    const double f7 = (B10 + Ix2);
	    update(w*(Cy*f7), I+14);
	    update(w*(Cz*f7), I+15);

	    const double f8 = (B10 + Cz*Iz);
	    update(w*(Ix*f8), I+16);
	    update(w*(Iy*f8), I+17);
	}
    }

    /**
     * Permutes I in place: the value found at position k is moved to
     * position index(k).  A temporary copy keeps sources intact while
     * targets are overwritten.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[18]) {
	double T[18];
	for (int k = 0; k < 18; ++k) {
	    T[k] = I[k];
	}
	for (int k = 0; k < 18; ++k) {
	    I[index(k)] = T[k];
	}
    }

    /**
     * Returns the position to which reorder() moves element i.
     *
     * @param i position in [0, 18); no range check is made.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	    11, 13, 15,  0,  4,  8,
	    12,  9, 10, 16,  5,  3,
	     6,  7,  1,  2, 14, 17
	};
	return permutation[i];
    }

    /**
     * Writes all 18 values of index(int), in order, to idx[0..17].
     *
     * @tparam U destination element type; must be assignable from int.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 18; ++k) {
	    idx[k] = index(k);
	}
    }

};

/**
 * Quadrature kernel specialization for meta::braket<P, P, S, S>.
 *
 * Every call accumulates 9 values into the output buffer I through an
 * `update` functor.  N is the point count the fixed-size eval() overload
 * forwards as M.
 *
 * NOTE(review): t2 and W are presumably the quadrature roots and weights and
 * C the contraction coefficient -- confirm against the caller.
 */
template<>
struct impl<meta::braket<rysq::P, rysq::P, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulation policy: adds the contribution q to *Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: evaluates N points into a 9-element array
     * using the default `update` policy.
     */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[9]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer entry point: evaluates M points using the default `update`
     * policy.  I must address at least 9 doubles (not checked).
     */
    template<int M,
	     class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel.  For each point a in [0, M) it derives the recurrence
     * coefficients from t2[a] and hands 9 weighted terms to `update`,
     * the k-th one together with the address I+k.
     *
     * rBk and rkl are accepted for signature uniformity but are not read by
     * this specialization.
     *
     * @tparam M number of points to process (t2 and W are indexed 0..M-1).
     * @param update callable invoked as update(value, target).
     */
    template<int M,
	     class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {
	    const double t = t2[a];
	    // Common prefactor of every term produced at this point.
	    const double w = (C[0][0])*W[a];

	    const double B10 = recurrence::coefficient(1.0/A, B, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Ix = (Cx + Xij);
	    const double Iy = (Cy + Yij);
	    const double Iz = (Cz + Zij);

	    update(w*((Cx*Ix + B10)), I+0);
	    update(w*(Cy*Ix), I+1);
	    update(w*(Cz*Ix), I+2);
	    update(w*((Cy*Iy + B10)), I+3);
	    update(w*(Cz*Iy), I+4);
	    update(w*(Cx*Iy), I+5);
	    update(w*((B10 + Cz*Iz)), I+6);
	    update(w*(Cx*Iz), I+7);
	    update(w*(Cy*Iz), I+8);
	}
    }

    /**
     * Permutes I in place: the value found at position k is moved to
     * position index(k).  A temporary copy keeps sources intact while
     * targets are overwritten.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[9]) {
	double T[9];
	for (int k = 0; k < 9; ++k) {
	    T[k] = I[k];
	}
	for (int k = 0; k < 9; ++k) {
	    I[index(k)] = T[k];
	}
    }

    /**
     * Returns the position to which reorder() moves element i.
     *
     * @param i position in [0, 9); no range check is made.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	    0, 1, 2,
	    4, 5, 3,
	    8, 6, 7
	};
	return permutation[i];
    }

    /**
     * Writes all 9 values of index(int), in order, to idx[0..8].
     *
     * @tparam U destination element type; must be assignable from int.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 9; ++k) {
	    idx[k] = index(k);
	}
    }

};

/**
 * Quadrature kernel specialization for meta::braket<F, S, SP, SP>.
 *
 * Every call accumulates 160 values into the output buffer I through an
 * `update` functor.  N is the point count the fixed-size eval() overload
 * forwards as M.  Four coefficients C[0][0]..C[3][0] are read; each emitted
 * term is scaled by exactly one of them.
 *
 * NOTE(review): t2 and W are presumably the quadrature roots and weights and
 * C the contraction coefficients of the SP shells -- confirm against the
 * caller.
 */
template<>
struct impl<meta::braket<rysq::F, rysq::S, rysq::SP, rysq::SP> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 3;

    /** Default accumulation policy: adds the contribution q to *Q. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: evaluates N points into a 160-element array
     * using the default `update` policy.
     */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[4][1],
	      double (&I)[160]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Pointer entry point: evaluates M points using the default `update`
     * policy.  I must address at least 160 doubles (not checked).
     */
    template<int M,
	     class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[4][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel.  For each point a in [0, M) it derives the recurrence
     * coefficients from t2[a] and hands 160 weighted terms to `update`,
     * the k-th one together with the address I+k.
     *
     * rij is accepted for signature uniformity but is not read by this
     * specialization.
     *
     * @tparam M number of points to process (t2 and W are indexed 0..M-1).
     * @param update callable invoked as update(value, target).
     */
    template<int M,
	     class RAI, class RBK, class RAB, class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[4][1],
	      double *I, const U &update) {

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {
	    const double t = t2[a];

	    // Per-point prefactors, one for each coefficient in C.
	    const double w0 = (C[0][0])*W[a];
	    const double w1 = (C[1][0])*W[a];
	    const double w2 = (C[2][0])*W[a];
	    const double w3 = (C[3][0])*W[a];

	    const double B00 = 0.5*t;
	    const double B10 = recurrence::coefficient(1.0/A, B, t);
	    const double B01 = recurrence::coefficient(1.0/B, A, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    // Local shorthand for the compile-time integer power helper.
#define pow(x,y) recurrence::pow<y>((x))

	    const double Kx = (Xkl + Dx);
	    const double Ky = (Ykl + Dy);
	    const double Kz = (Dz + Zkl);
	    const double Px = (B10 + pow(Cx,2));
	    const double Py = (B10 + pow(Cy,2));
	    const double Pz = (B10 + pow(Cz,2));
	    const double Qx = (Cx*Dx + B00);
	    const double Qy = (Cy*Dy + B00);
	    const double Qz = (Cz*Dz + B00);

	    update(w0*(Cx*Cy*Cz), I+0);
	    update(w2*(Cz*Ky*Px), I+1);
	    update(w1*(Cz*Dy*Px), I+2);
	    update(w0*(Cz*Px), I+3);
	    update(w1*(Cy*Dz*Px), I+4);
	    update(w0*(Cy*Px), I+5);
	    update(w2*(Cy*Kz*Px), I+6);
	    update(w1*(Cx*Dz*Py), I+7);
	    update(w0*(Cx*Py), I+8);
	    update(w2*(Cx*Kz*Py), I+9);
	    update(w2*(Cz*Kx*Py), I+10);
	    update(w0*(Cz*Py), I+11);
	    update(w1*(Cz*Dx*Py), I+12);
	    update(w2*(Cy*Kx*Pz), I+13);
	    update(w1*(Cy*Dx*Pz), I+14);
	    update(w0*(Cy*Pz), I+15);
	    update(w1*(Cx*Dy*Pz), I+16);
	    update(w0*(Cx*Pz), I+17);
	    update(w2*(Cx*Ky*Pz), I+18);
	    update(w1*(Cy*Cz*Qx), I+19);
	    update(w3*(Kz*Py*Qx), I+20);
	    update(w1*(Py*Qx), I+21);
	    update(w3*(Ky*Pz*Qx), I+22);
	    update(w1*(Pz*Qx), I+23);
	    update(w3*(Kx*Pz*Qy), I+24);
	    update(w1*(Cx*Cz*Qy), I+25);
	    update(w1*(Pz*Qy), I+26);
	    update(w3*(Kz*Px*Qy), I+27);
	    update(w1*(Px*Qy), I+28);
	    update(w1*(Cx*Cy*Qz), I+29);
	    update(w3*(Kx*Py*Qz), I+30);
	    update(w1*(Py*Qz), I+31);
	    update(w1*(Px*Qz), I+32);
	    update(w3*(Ky*Px*Qz), I+33);
	    update(w3*(Cx*Qz*(Cy*Ykl + Qy)), I+34);
	    update(w3*(Cz*Qx*(Cy*Ykl + Qy)), I+35);
	    update(w2*(Cx*Cz*(Cy*Ykl + Qy)), I+36);
	    update(w3*(Dz*Px*(Cy*Ykl + Qy)), I+37);
	    update(w2*(Px*(Cy*Ykl + Qy)), I+38);
	    update(w3*(Dx*Pz*(Cy*Ykl + Qy)), I+39);
	    update(w2*(Pz*(Cy*Ykl + Qy)), I+40);
	    update(w3*(Dx*Py*(Cz*Zkl + Qz)), I+41);
	    update(w3*(Cx*Qy*(Cz*Zkl + Qz)), I+42);
	    update(w3*(Cy*Qx*(Cz*Zkl + Qz)), I+43);
	    update(w2*(Cx*Cy*(Cz*Zkl + Qz)), I+44);
	    update(w3*(Dy*Px*(Cz*Zkl + Qz)), I+45);
	    update(w2*(Px*(Cz*Zkl + Qz)), I+46);
	    update(w2*(Py*(Cz*Zkl + Qz)), I+47);

	    const double f0 = (3*B10 + pow(Cx,2));
	    update(w2*(Cx*Ky*f0), I+48);
	    update(w3*(Cx*Dz*Ky*f0), I+49);
	    update(w1*(Cx*Dz*f0), I+50);
	    update(w2*(Cx*Kz*f0), I+51);
	    update(w3*(Cx*Dy*Kz*f0), I+52);
	    update(w1*(Cx*Dy*f0), I+53);
	    update(w0*(Cx*f0), I+54);

	    const double f12 = (Dy*Py + 2*B00*Cy);
	    update(w3*(f12*(Cz*Zkl + Qz)), I+55);
	    update(w3*(Qz*(f12 + Py*Ykl)), I+56);
	    update(w3*(Qx*(f12 + Py*Ykl)), I+57);
	    update(w3*(Cx*Dz*(f12 + Py*Ykl)), I+58);
	    update(w2*(Cx*(f12 + Py*Ykl)), I+59);
	    update(w3*(Cz*Dx*(f12 + Py*Ykl)), I+60);
	    update(w2*(Cz*(f12 + Py*Ykl)), I+61);
	    update(w3*(Cz*Kx*f12), I+62);
	    update(w1*(Cz*f12), I+63);
	    update(w3*(Cx*Kz*f12), I+64);
	    update(w1*(Cx*f12), I+65);

	    const double f14 = (2*B00*Cx*(Xkl + 2*Dx) + 2*pow(B00,2) + Px*(B01 + Dx*Kx));
	    update(w3*(Cy*f14), I+66);
	    update(w3*(Cz*f14), I+67);

	    const double f16 = (B01 + Dx*Kx);
	    update(w3*(Cz*Py*f16), I+68);
	    update(w3*(Cy*Pz*f16), I+69);

	    const double f19 = (Kx*Px + 2*B00*Cx);
	    update(w3*(Qy*f19), I+70);
	    update(w3*(Qz*f19), I+71);
	    update(w3*(Cz*Dy*f19), I+72);
	    update(w2*(Cz*f19), I+73);
	    update(w2*(Cy*f19), I+74);
	    update(w3*(Cy*Dz*f19), I+75);

	    const double f2 = (B01*Cx + Cx*Dx*Kx + B00*(Xkl + 2*Dx));
	    update(w3*(Cy*Cz*f2), I+76);
	    update(w3*(Pz*f2), I+77);
	    update(w3*(Py*f2), I+78);

	    const double f21 = (2*pow(B00,2) + Pz*(Dz*Kz + B01) + 2*B00*Cz*(2*Dz + Zkl));
	    update(w3*(Cx*f21), I+79);
	    update(w3*(Cy*f21), I+80);

	    const double f25 = (Dx*Px + 2*B00*Cx);
	    update(w3*(f25*(Cz*Zkl + Qz)), I+81);
	    update(w3*(f25*(Cy*Ykl + Qy)), I+82);
	    update(w3*(Cz*Ky*f25), I+83);
	    update(w1*(Cz*f25), I+84);
	    update(w3*(Cy*Kz*f25), I+85);
	    update(w1*(Cy*f25), I+86);

	    const double f27 = (Dz*Kz + B01);
	    update(w3*(Cy*Px*f27), I+87);
	    update(w3*(Cx*f0*f27), I+88);
	    update(w3*(Cx*Py*f27), I+89);

	    const double f28 = (B01 + Dy*Ky);
	    update(w3*(Cx*Pz*f28), I+90);
	    update(w3*(Cx*f0*f28), I+91);
	    update(w3*(Cz*Px*f28), I+92);

	    const double f29 = (B00 + Cx*Kx);
	    update(w3*(Cz*Qy*f29), I+93);
	    update(w2*(Cy*Cz*f29), I+94);
	    update(w3*(Cy*Qz*f29), I+95);
	    update(w3*(f12*f29), I+96);
	    update(w2*(Py*f29), I+97);
	    update(w3*(Dz*Py*f29), I+98);
	    update(w2*(Pz*f29), I+99);
	    update(w3*(Dy*Pz*f29), I+100);

	    const double f3 = (Dz*Pz + 2*B00*Cz);
	    update(w3*(Qy*(Pz*Zkl + f3)), I+101);
	    update(w3*(Qx*(Pz*Zkl + f3)), I+102);
	    update(w3*(Cx*Dy*(Pz*Zkl + f3)), I+103);
	    update(w2*(Cx*(Pz*Zkl + f3)), I+104);
	    update(w3*(Cy*Dx*(Pz*Zkl + f3)), I+105);
	    update(w2*(Cy*(Pz*Zkl + f3)), I+106);
	    update(w3*(f3*(Cy*Ykl + Qy)), I+107);
	    update(w3*(f29*f3), I+108);
	    update(w3*(Cx*Ky*f3), I+109);
	    update(w1*(Cx*f3), I+110);
	    update(w3*(Cy*Kx*f3), I+111);
	    update(w1*(Cy*f3), I+112);

	    const double f36 = Dx*pow(Cx,3);
	    const double f26 = 3*B10*Cx*Dx;
	    const double f22 = 3*B00*pow(Cx,2);
	    const double f11 = 3*B00*B10;
	    update(w3*((6*Cx*pow(B00,2) + B01*(3*B10*Cx + pow(Cx,3)) + Xkl*(f11 + f22 + f26 + f36) + Dx*(Cx*Dx*f0 + 2*f22 + 2*f11))), I+113);
	    update(w3*(Dz*(f11 + f22 + f26 + f36 + 3*B10*Cx*Xkl + Xkl*pow(Cx,3))), I+114);
	    update(w3*(Dy*(f11 + f22 + f26 + f36 + 3*B10*Cx*Xkl + Xkl*pow(Cx,3))), I+115);
	    update(w2*((f11 + f22 + f26 + f36 + 3*B10*Cx*Xkl + Xkl*pow(Cx,3))), I+116);
	    update(w3*(Ky*(f11 + f22 + f26 + f36)), I+117);
	    update(w3*(Kz*(f11 + f22 + f26 + f36)), I+118);
	    update(w1*((f11 + f22 + f26 + f36)), I+119);

	    const double f37 = (3*B10 + pow(Cz,2));
	    update(w3*(Cz*f28*f37), I+120);
	    update(w3*(Cz*f16*f37), I+121);
	    update(w2*(Cz*Kx*f37), I+122);
	    update(w3*(Cz*Dy*Kx*f37), I+123);
	    update(w1*(Cz*Dy*f37), I+124);
	    update(w3*(Cz*Dx*Ky*f37), I+125);
	    update(w1*(Cz*Dx*f37), I+126);
	    update(w0*(Cz*f37), I+127);
	    update(w2*(Cz*Ky*f37), I+128);

	    const double f38 = (3*B10 + pow(Cy,2));
	    update(w3*(Cy*f27*f38), I+129);
	    update(w3*(Cy*Dx*Kz*f38), I+130);
	    update(w2*(Cy*Kz*f38), I+131);
	    update(w3*(Cy*f16*f38), I+132);
	    update(w2*(Cy*Kx*f38), I+133);
	    update(w3*(Cy*Dz*Kx*f38), I+134);
	    update(w1*(Cy*Dz*f38), I+135);
	    update(w0*(Cy*f38), I+136);
	    update(w1*(Cy*Dx*f38), I+137);

	    const double f15 = 3*B10*Cy*Dy;
	    const double f1 = 3*B00*pow(Cy,2);
	    const double f42 = Dy*pow(Cy,3);
	    update(w3*((6*Cy*pow(B00,2) + B01*(3*B10*Cy + pow(Cy,3)) + Dy*(2*f11 + Cy*Dy*f38 + 2*f1) + Ykl*(f11 + f15 + f42 + f1))), I+138);
	    update(w3*(Dx*(f11 + f15 + f42 + Ykl*pow(Cy,3) + f1 + 3*B10*Cy*Ykl)), I+139);
	    update(w3*(Dz*(f11 + f15 + f42 + Ykl*pow(Cy,3) + f1 + 3*B10*Cy*Ykl)), I+140);
	    update(w2*((f11 + f15 + f42 + Ykl*pow(Cy,3) + f1 + 3*B10*Cy*Ykl)), I+141);
	    update(w3*(Kz*(f11 + f15 + f42 + f1)), I+142);
	    update(w3*(Kx*(f11 + f15 + f42 + f1)), I+143);
	    update(w1*((f11 + f15 + f42 + f1)), I+144);

	    const double f45 = (B01*Cz + B00*(2*Dz + Zkl) + Cz*Dz*Kz);
	    update(w3*(Cx*Cy*f45), I+145);
	    update(w3*(Px*f45), I+146);
	    update(w3*(Py*f45), I+147);

	    const double f5 = (Py*(B01 + Dy*Ky) + 2*pow(B00,2) + 2*B00*Cy*(Ykl + 2*Dy));
	    update(w3*(Cx*f5), I+148);
	    update(w3*(Cz*f5), I+149);

	    const double f6 = (Cy*Dy*Ky + B01*Cy + B00*(Ykl + 2*Dy));
	    update(w3*(Cx*Cz*f6), I+150);
	    update(w3*(Px*f6), I+151);
	    update(w3*(Pz*f6), I+152);

	    const double f4 = 3*B10*Cz*Dz;
	    const double f20 = 3*B00*pow(Cz,2);
	    const double f7 = Dz*pow(Cz,3);
	    update(w3*((Dz*(2*f20 + Cz*Dz*f37 + 2*f11) + Zkl*(f11 + f20 + f4 + f7) + B01*(3*B10*Cz + pow(Cz,3)) + 6*Cz*pow(B00,2))), I+153);
	    update(w3*(Dy*(f11 + f20 + 3*B10*Cz*Zkl + f4 + f7 + Zkl*pow(Cz,3))), I+154);
	    update(w3*(Dx*(f11 + f20 + 3*B10*Cz*Zkl + f4 + f7 + Zkl*pow(Cz,3))), I+155);
	    update(w2*((f11 + f20 + 3*B10*Cz*Zkl + f4 + f7 + Zkl*pow(Cz,3))), I+156);
	    update(w3*(Ky*(f11 + f20 + f4 + f7)), I+157);
	    update(w3*(Kx*(f11 + f20 + f4 + f7)), I+158);
	    update(w1*((f11 + f20 + f4 + f7)), I+159);
#undef pow

	}

    }

    /**
     * Permutes I in place: the value found at position k is moved to
     * position index(k).  A temporary copy keeps sources intact while
     * targets are overwritten.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[160]) {
	double T[160];
	for (int k = 0; k < 160; ++k) {
	    T[k] = I[k];
	}
	for (int k = 0; k < 160; ++k) {
	    I[index(k)] = T[k];
	}
    }

    /**
     * Returns the position to which reorder() moves element i.
     *
     * @param i position in [0, 160); no range check is made.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	      9,  84,  24,   4,  33,   3, 123,  35,   5, 125,  46,   6,  16,  48,  18,   8,
	     27,   7,  87,  19, 135,  15,  97,  17,  68,  29,  28, 143,  23,  39,  76,  36,
	     34, 114, 119,  99,  89, 113,  83,  98,  88, 136, 149, 139, 129, 144, 124, 126,
	     80, 110,  30, 120, 140,  20,   0, 146, 116,  95, 115,  85,  96,  86,  66,  26,
	    145,  25,  53,  54,  56,  58,  63,  74,  64,  44,  43,  73,  59,  57,  55, 157,
	    158, 134,  93,  94,  14, 133,  13, 153, 150, 155, 107, 100, 104,  69,  49,  79,
	     65,  45,  75,  47,  67, 148, 137, 147, 127, 138, 128, 118,  77, 117,  37,  78,
	     38,  50,  70,  60,  40,  90, 130,  10, 102,  52,  42,  62,  22,  92,  12,   2,
	     82, 151, 131, 121,  51,  41,  71,  31,   1,  11, 101,  91, 111,  81, 141,  61,
	     21, 159, 154, 156, 105, 106, 109, 103, 108, 152, 142, 132, 122, 112,  72,  32
	};
	return permutation[i];
    }

    /**
     * Writes all 160 values of index(int), in order, to idx[0..159].
     *
     * @tparam U destination element type; must be assignable from int.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 160; ++k) {
	    idx[k] = index(k);
	}
    }

};

/**
 * Specialization of impl for meta::braket<rysq::S, rysq::S, rysq::F, rysq::S>.
 *
 * eval() adds 10 weighted terms into I for every entry of t2/W that it
 * visits; index()/reorder() hold the permutation between the order in which
 * eval() writes those terms and the reordered layout.
 * NOTE(review): N is presumably the quadrature root count; in this block it
 * is only visible as the M picked by the fixed-size eval() overload.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::F, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulation policy: adds the term to the addressed slot. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &term, double *slot) const {
            *slot += term;
        }
    };

    /** Fixed-size entry point: runs eval<2> with the default update policy. */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk, const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double (&I)[10]) {
        eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Pointer entry point: runs eval<M> with the default update policy. */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk, const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel: for a = 0..M-1 builds B01 and Dx/Dy/Dz from t2[a] and
     * hands 10 terms, each scaled by C[0][0]*W[a], to update(term, I+k).
     * rAi, rij and rkl are not read by this specialization.
     */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_,
             class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk, const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I, const U &update) {

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int r = 0; r < M; ++r) {

            const double B01 = recurrence::coefficient(1.0/B, A, t2[r]);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[r]);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[r]);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[r]);

            const double Rx = (B01 + recurrence::pow<2>(Dx));
            const double Ry = (B01 + recurrence::pow<2>(Dy));
            const double Rz = (recurrence::pow<2>(Dz) + B01);

            // prefactor shared by every term of this iteration
            const double w00 = (C[0][0])*W[r];

            update(w00*((recurrence::pow<3>(Dx) + 3*B01*Dx)), I+0);
            update(w00*((recurrence::pow<3>(Dy) + 3*B01*Dy)), I+1);
            update(w00*(Dx*Dy*Dz), I+2);
            update(w00*((recurrence::pow<3>(Dz) + 3*B01*Dz)), I+3);
            update(w00*(Dy*Rx), I+4);
            update(w00*(Dz*Rx), I+5);
            update(w00*(Dx*Ry), I+6);
            update(w00*(Dz*Ry), I+7);
            update(w00*(Dx*Rz), I+8);
            update(w00*(Dy*Rz), I+9);
        }
    }

    /** Permutes I in place: the value at position i moves to index(i). */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[10]) {
        double T[10];
        for (int i = 0; i < 10; ++i) T[i] = I[i];
        for (int i = 0; i < 10; ++i) I[index(i)] = T[i];
    }

    /** Destination slot of the i-th term written by eval(); i is not range-checked. */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short order[] = {
            0, 1, 9, 2, 3, 4, 5, 6, 7, 8
        };
        return order[i];
    }

    /** Writes the 10 destination slots, in eval() order, to idx[0..9]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        for (int i = 0; i < 10; ++i) idx[i] = index(i);
    }

};

/**
 * Specialization of impl for meta::braket<rysq::SP, rysq::S, rysq::S, rysq::S>.
 *
 * eval() adds 4 weighted terms into I for every entry of t2/W that it
 * visits; index()/reorder() hold the (here identity) permutation between
 * eval() order and the reordered layout.
 * NOTE(review): N is presumably the quadrature root count; in this block it
 * is only visible as the M picked by the fixed-size eval() overload.
 */
template<>
struct impl<meta::braket<rysq::SP, rysq::S, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 1;

    /** Default accumulation policy: adds the term to the addressed slot. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &term, double *slot) const {
            *slot += term;
        }
    };

    /** Fixed-size entry point: runs eval<1> with the default update policy. */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk, const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][2],
              double (&I)[4]) {
        eval<1>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Pointer entry point: runs eval<M> with the default update policy. */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk, const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][2],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel: for a = 0..M-1 builds Cx/Cy/Cz from t2[a] and hands 4
     * terms to update(term, I+k); slot 0 is scaled by C[0][0]*W[a], slots
     * 1..3 by C[0][1]*W[a].
     * A, rBk, rij and rkl are not read by this specialization.
     */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_,
             class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk, const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][2],
              double *I, const U &update) {

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int r = 0; r < M; ++r) {

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[r]);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[r]);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[r]);

            // per-iteration prefactors, one per column of C
            const double w00 = (C[0][0])*W[r];
            const double w01 = (C[0][1])*W[r];

            update(w00*(1), I+0);
            update(w01*(Cx), I+1);
            update(w01*(Cy), I+2);
            update(w01*(Cz), I+3);
        }
    }

    /** Permutes I in place: the value at position i moves to index(i). */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[4]) {
        double T[4];
        for (int i = 0; i < 4; ++i) T[i] = I[i];
        for (int i = 0; i < 4; ++i) I[index(i)] = T[i];
    }

    /** Destination slot of the i-th term written by eval(); i is not range-checked. */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short order[] = {
            0, 1, 2, 3
        };
        return order[i];
    }

    /** Writes the 4 destination slots, in eval() order, to idx[0..3]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        for (int i = 0; i < 4; ++i) idx[i] = index(i);
    }

};

template<>
struct impl<meta::braket<rysq::S, rysq::SP, rysq::S, rysq::D> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 2;

    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<2> &t2, const vector<2> &W,
    // 		     const double (&C)[1][2],
	      double (&I)[24]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

#define pow(x,y) recurrence::pow<y>((x))

 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);

	    update((C[0][1])*W[a]*((Ix*(pow(Kx,2) + B01) + 2*B00*Kx)), I+0);
	    update((C[0][1])*W[a]*((2*B00*Ky + Iy*(pow(Ky,2) + B01))), I+1);
	    update((C[0][1])*W[a]*(Iz*Kx*Ky), I+2);
	    update((C[0][0])*W[a]*(Kx*Ky), I+3);
	    update((C[0][1])*W[a]*((Iz*(pow(Kz,2) + B01) + 2*B00*Kz)), I+4);
	    update((C[0][1])*W[a]*(Iy*Kx*Kz), I+5);
	    update((C[0][0])*W[a]*(Kx*Kz), I+6);
	    update((C[0][1])*W[a]*(Ix*Ky*Kz), I+7);
	    update((C[0][0])*W[a]*(Ky*Kz), I+8);
	    double f10 = (pow(Kx,2) + B01);
	    update((C[0][1])*W[a]*(Iz*f10), I+9);
	    update((C[0][1])*W[a]*(Iy*f10), I+10);
	    update((C[0][0])*W[a]*(f10), I+11);
	    double f11 = (pow(Ky,2) + B01);
	    update((C[0][1])*W[a]*(Ix*f11), I+12);
	    update((C[0][1])*W[a]*(Iz*f11), I+13);
	    update((C[0][0])*W[a]*(f11), I+14);
	    double f3 = (B00 + Ix*Kx);
	    update((C[0][1])*W[a]*(Kz*f3), I+15);
	    update((C[0][1])*W[a]*(Ky*f3), I+16);
	    double f4 = (Iz*Kz + B00);
	    update((C[0][1])*W[a]*(Kx*f4), I+17);
	    update((C[0][1])*W[a]*(Ky*f4), I+18);
	    double f6 = (pow(Kz,2) + B01);
	    update((C[0][1])*W[a]*(Iy*f6), I+19);
	    update((C[0][1])*W[a]*(Ix*f6), I+20);
	    update((C[0][0])*W[a]*(f6), I+21);
	    double f8 = (B00 + Iy*Ky);
	    update((C[0][1])*W[a]*(Kx*f8), I+22);
	    update((C[0][1])*W[a]*(Kz*f8), I+23);
#undef pow

	}

    }

    BOOST_GPU_ENABLED static
    void reorder(double (&I)[24]) {
	double T[24];
	for (int i = 0; i < 24; ++i) {
	    T[i] = I[i];
	}
	I[1] = T[0];
	I[6] = T[1];
	I[15] = T[2];
	I[12] = T[3];
	I[11] = T[4];
	I[18] = T[5];
	I[16] = T[6];
	I[21] = T[7];
	I[20] = T[8];
	I[3] = T[9];
	I[2] = T[10];
	I[0] = T[11];
	I[5] = T[12];
	I[7] = T[13];
	I[4] = T[14];
	I[17] = T[15];
	I[13] = T[16];
	I[19] = T[17];
	I[23] = T[18];
	I[10] = T[19];
	I[9] = T[20];
	I[8] = T[21];
	I[14] = T[22];
	I[22] = T[23];
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[24] = { 11, 0, 10, 9, 14, 12, 1, 13, 21, 20, 19, 4, 3, 16, 22, 2, 6, 15, 5, 17, 8, 7, 23, 18 };
// 	if (index < 24) return index_[index];
// 	return size_t(-1);
//     }

    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short index[] = {
	    1, 6, 15, 12, 11, 18, 16, 21, 20, 3, 2, 0, 5, 7, 4, 17, 13, 19, 23, 10, 9, 8, 14, 22
	};
	return index[i];
    }

    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 1;
	*idx++ = 6;
	*idx++ = 15;
	*idx++ = 12;
	*idx++ = 11;
	*idx++ = 18;
	*idx++ = 16;
	*idx++ = 21;
	*idx++ = 20;
	*idx++ = 3;
	*idx++ = 2;
	*idx++ = 0;
	*idx++ = 5;
	*idx++ = 7;
	*idx++ = 4;
	*idx++ = 17;
	*idx++ = 13;
	*idx++ = 19;
	*idx++ = 23;
	*idx++ = 10;
	*idx++ = 9;
	*idx++ = 8;
	*idx++ = 14;
	*idx++ = 22;
    }


};

/**
 * Specialization of impl for meta::braket<rysq::S, rysq::F, rysq::P, rysq::S>.
 *
 * eval() adds 30 weighted terms into I for every entry of t2/W that it
 * visits; index()/reorder() hold the permutation between the order in which
 * eval() writes those terms and the reordered layout.
 * NOTE(review): N is presumably the quadrature root count; in this block it
 * is only visible as the M picked by the fixed-size eval() overload.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::F, rysq::P, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 3;

    /** Default accumulation policy: adds the term to the addressed slot. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &term, double *slot) const {
            *slot += term;
        }
    };

    /** Fixed-size entry point: runs eval<3> with the default update policy. */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk, const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double (&I)[30]) {
        eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Pointer entry point: runs eval<M> with the default update policy. */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk, const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel: for a = 0..M-1 builds B00, B10, the C/D coefficients and
     * the shifted Ix/Iy/Iz (C + rij), then hands 30 terms, each scaled by
     * C[0][0]*W[a], to update(term, I+k).
     * rkl is not read by this specialization.
     */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_,
             class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk, const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I, const U &update) {

        const double &Xij = rij[0];
        const double &Yij = rij[1];
        const double &Zij = rij[2];

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int r = 0; r < M; ++r) {

            const double B00 = 0.5*t2[r];
            const double B10 = recurrence::coefficient(1.0/A, B, t2[r]);

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[r]);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[r]);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[r]);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[r]);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[r]);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[r]);

            const double Ix = (Cx + Xij);
            const double Iy = (Cy + Yij);
            const double Iz = (Cz + Zij);

            // prefactor shared by every term of this iteration
            const double w00 = (C[0][0])*W[r];

            const double f1 = (2*B00*Iz + Dz*(B10 + recurrence::pow<2>(Iz)));
            update(w00*(Ix*f1), I+0);
            update(w00*(Iy*f1), I+1);

            const double f10 = (B10 + recurrence::pow<2>(Iy));
            update(w00*(Dz*Ix*f10), I+2);
            update(w00*(Dx*Iz*f10), I+3);

            const double f11 = (B10 + recurrence::pow<2>(Iz));
            update(w00*(Dx*Iy*f11), I+4);
            update(w00*(Dy*Ix*f11), I+5);

            const double f12 = (Dy*Iy + B00);
            update(w00*(Ix*Iz*f12), I+6);
            update(w00*(f11*f12), I+7);

            const double f13 = (3*B10 + recurrence::pow<2>(Iz));
            update(w00*(Dy*Iz*f13), I+8);
            update(w00*(Dx*Iz*f13), I+9);

            const double f14 = (B10 + recurrence::pow<2>(Ix));
            update(w00*(Dz*Iy*f14), I+10);
            update(w00*(Dy*Iz*f14), I+11);
            update(w00*(f12*f14), I+12);

            const double f15 = (3*B10 + recurrence::pow<2>(Ix));
            update(w00*(Dy*Ix*f15), I+13);
            update(w00*(Dz*Ix*f15), I+14);

            const double f2 = (2*B00*Ix + Dx*(B10 + recurrence::pow<2>(Ix)));
            update(w00*(Iy*f2), I+15);
            update(w00*(Iz*f2), I+16);

            const double f3 = 3*B00*B10;
            update(w00*((3*B10*Dx*Ix + f3 + 3*B00*recurrence::pow<2>(Ix) + Dx*recurrence::pow<3>(Ix))), I+17);
            update(w00*((Dy*recurrence::pow<3>(Iy) + f3 + 3*B00*recurrence::pow<2>(Iy) + 3*B10*Dy*Iy)), I+18);
            update(w00*((Dz*recurrence::pow<3>(Iz) + 3*B00*recurrence::pow<2>(Iz) + 3*B10*Dz*Iz + f3)), I+19);

            const double f4 = (B00 + Dz*Iz);
            update(w00*(Ix*Iy*f4), I+20);
            update(w00*(f14*f4), I+21);
            update(w00*(f10*f4), I+22);

            const double f6 = (3*B10 + recurrence::pow<2>(Iy));
            update(w00*(Dz*Iy*f6), I+23);
            update(w00*(Dx*Iy*f6), I+24);

            const double f7 = (2*B00*Iy + Dy*(B10 + recurrence::pow<2>(Iy)));
            update(w00*(Iz*f7), I+25);
            update(w00*(Ix*f7), I+26);

            const double f9 = (Dx*Ix + B00);
            update(w00*(Iy*Iz*f9), I+27);
            update(w00*(f10*f9), I+28);
            update(w00*(f11*f9), I+29);
        }
    }

    /** Permutes I in place: the value at position i moves to index(i). */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[30]) {
        double T[30];
        for (int i = 0; i < 30; ++i) T[i] = I[i];
        for (int i = 0; i < 30; ++i) I[index(i)] = T[i];
    }

    /** Destination slot of the i-th term written by eval(); i is not range-checked. */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short order[] = {
            27, 28, 25, 6, 8, 17, 19, 18, 12, 2,
            23, 14, 13, 10, 20, 3, 4, 0, 11, 22,
            29, 24, 26, 21, 1, 16, 15, 9, 5, 7
        };
        return order[i];
    }

    /** Writes the 30 destination slots, in eval() order, to idx[0..29]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        for (int i = 0; i < 30; ++i) idx[i] = index(i);
    }

};

/**
 * Specialization of impl for meta::braket<rysq::SP, rysq::S, rysq::P, rysq::S>.
 *
 * eval() adds 12 weighted terms into I for every entry of t2/W that it
 * visits; index()/reorder() hold the permutation between the order in which
 * eval() writes those terms and the reordered layout.
 * NOTE(review): N is presumably the quadrature root count; in this block it
 * is only visible as the M picked by the fixed-size eval() overload.
 */
template<>
struct impl<meta::braket<rysq::SP, rysq::S, rysq::P, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulation policy: adds the term to the addressed slot. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &term, double *slot) const {
            *slot += term;
        }
    };

    /** Fixed-size entry point: runs eval<2> with the default update policy. */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk, const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][2],
              double (&I)[12]) {
        eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Pointer entry point: runs eval<M> with the default update policy. */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk, const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][2],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel: for a = 0..M-1 builds B00, the C/D coefficients and
     * Q = C*D + B00 per axis, then hands 12 terms to update(term, I+k).
     * Each term is scaled by C[0][0]*W[a] or C[0][1]*W[a].
     * rij and rkl are not read by this specialization.
     */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_,
             class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk, const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][2],
              double *I, const U &update) {

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int r = 0; r < M; ++r) {

            const double B00 = 0.5*t2[r];

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[r]);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[r]);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[r]);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[r]);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[r]);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[r]);

            const double Qx = (Cx*Dx + B00);
            const double Qy = (Cy*Dy + B00);
            const double Qz = (Cz*Dz + B00);

            // per-iteration prefactors, one per column of C
            const double w00 = (C[0][0])*W[r];
            const double w01 = (C[0][1])*W[r];

            update(w01*(Cz*Dx), I+0);
            update(w01*(Cy*Dx), I+1);
            update(w00*(Dx), I+2);
            update(w01*(Cz*Dy), I+3);
            update(w01*(Cx*Dy), I+4);
            update(w00*(Dy), I+5);
            update(w01*(Cx*Dz), I+6);
            update(w01*(Cy*Dz), I+7);
            update(w00*(Dz), I+8);
            update(w01*(Qx), I+9);
            update(w01*(Qy), I+10);
            update(w01*(Qz), I+11);
        }
    }

    /** Permutes I in place: the value at position i moves to index(i). */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[12]) {
        double T[12];
        for (int i = 0; i < 12; ++i) T[i] = I[i];
        for (int i = 0; i < 12; ++i) I[index(i)] = T[i];
    }

    /** Destination slot of the i-th term written by eval(); i is not range-checked. */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short order[] = {
            3, 2, 0, 7, 5, 4, 9, 10, 8, 1, 6, 11
        };
        return order[i];
    }

    /** Writes the 12 destination slots, in eval() order, to idx[0..11]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        for (int i = 0; i < 12; ++i) idx[i] = index(i);
    }

};

/**
 * Specialization of impl for meta::braket<rysq::S, rysq::S, rysq::SP, rysq::D>.
 *
 * eval() adds 24 weighted terms into I for every entry of t2/W that it
 * visits; index()/reorder() hold the permutation between the order in which
 * eval() writes those terms and the reordered layout.
 * NOTE(review): N is presumably the quadrature root count; in this block it
 * is only visible as the M picked by the fixed-size eval() overload.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::SP, rysq::D> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulation policy: adds the term to the addressed slot. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &term, double *slot) const {
            *slot += term;
        }
    };

    /** Fixed-size entry point: runs eval<2> with the default update policy. */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk, const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][1],
              double (&I)[24]) {
        eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Pointer entry point: runs eval<M> with the default update policy. */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk, const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][1],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel: for a = 0..M-1 builds B01, the D coefficients and the
     * shifted Kx/Ky/Kz (D + rkl), then hands 24 terms to update(term, I+k).
     * Each term is scaled by C[0][0]*W[a] or C[1][0]*W[a].
     * rAi and rij are not read by this specialization.
     */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_,
             class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk, const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[2][1],
              double *I, const U &update) {

        const double &Xkl = rkl[0];
        const double &Ykl = rkl[1];
        const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int r = 0; r < M; ++r) {

            const double B01 = recurrence::coefficient(1.0/B, A, t2[r]);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[r]);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[r]);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[r]);

            const double Kx = (Xkl + Dx);
            const double Ky = (Ykl + Dy);
            const double Kz = (Dz + Zkl);

            // per-iteration prefactors, one per row of C
            const double w00 = (C[0][0])*W[r];
            const double w10 = (C[1][0])*W[r];

            update(w10*(Dz*Kx*Ky), I+0);
            update(w00*(Kx*Ky), I+1);
            update(w10*(Dy*Kx*Kz), I+2);
            update(w00*(Kx*Kz), I+3);
            update(w10*(Dx*Ky*Kz), I+4);
            update(w00*(Ky*Kz), I+5);
            update(w10*((B01*(2*Xkl + 3*Dx) + Dx*recurrence::pow<2>(Kx))), I+6);
            update(w10*((Dy*recurrence::pow<2>(Ky) + B01*(3*Dy + 2*Ykl))), I+7);
            update(w10*((B01*(2*Zkl + 3*Dz) + Dz*recurrence::pow<2>(Kz))), I+8);

            const double f2 = (recurrence::pow<2>(Kz) + B01);
            update(w10*(Dy*f2), I+9);
            update(w10*(Dx*f2), I+10);
            update(w00*(f2), I+11);

            const double f3 = (B01 + Dx*Kx);
            update(w10*(Kz*f3), I+12);
            update(w10*(Ky*f3), I+13);

            const double f5 = (recurrence::pow<2>(Kx) + B01);
            update(w10*(Dz*f5), I+14);
            update(w10*(Dy*f5), I+15);
            update(w00*(f5), I+16);

            const double f6 = (Dz*Kz + B01);
            update(w10*(Kx*f6), I+17);
            update(w10*(Ky*f6), I+18);

            const double f7 = (recurrence::pow<2>(Ky) + B01);
            update(w10*(Dx*f7), I+19);
            update(w10*(Dz*f7), I+20);
            update(w00*(f7), I+21);

            const double f8 = (B01 + Dy*Ky);
            update(w10*(Kx*f8), I+22);
            update(w10*(Kz*f8), I+23);
        }
    }

    /** Permutes I in place: the value at position i moves to index(i). */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[24]) {
        double T[24];
        for (int i = 0; i < 24; ++i) T[i] = I[i];
        for (int i = 0; i < 24; ++i) I[index(i)] = T[i];
    }

    /** Destination slot of the i-th term written by eval(); i is not range-checked. */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short order[] = {
            15, 12, 18, 16, 21, 20, 1, 6, 11, 10, 9, 8,
            17, 13, 3, 2, 0, 19, 23, 5, 7, 4, 14, 22
        };
        return order[i];
    }

    /** Writes the 24 destination slots, in eval() order, to idx[0..23]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        for (int i = 0; i < 24; ++i) idx[i] = index(i);
    }

};

/**
 * Specialization of impl for meta::braket<rysq::P, rysq::S, rysq::P, rysq::S>.
 *
 * eval() adds 9 weighted terms into I for every entry of t2/W that it
 * visits; index()/reorder() hold the permutation between the order in which
 * eval() writes those terms and the reordered layout.
 * NOTE(review): N is presumably the quadrature root count; in this block it
 * is only visible as the M picked by the fixed-size eval() overload.
 */
template<>
struct impl<meta::braket<rysq::P, rysq::S, rysq::P, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulation policy: adds the term to the addressed slot. */
    struct update {
        BOOST_GPU_ENABLED
        void operator()(const double &term, double *slot) const {
            *slot += term;
        }
    };

    /** Fixed-size entry point: runs eval<2> with the default update policy. */
    template<class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk, const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double (&I)[9]) {
        eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /** Pointer entry point: runs eval<M> with the default update policy. */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk, const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I) {
        eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel: for a = 0..M-1 builds B00, the C/D coefficients and
     * Q = C*D + B00 per axis, then hands 9 terms, each scaled by
     * C[0][0]*W[a], to update(term, I+k).
     * rij and rkl are not read by this specialization.
     */
    template<int M,
             class RAI, class RBK, class RAB, class RIJ, class RKL,
             class T2_, class W_,
             class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
              const RAI &rAi, const RBK &rBk, const RAB &rAB,
              const RIJ &rij, const RKL &rkl,
              const T2_ &t2, const W_ &W,
              const double (&C)[1][1],
              double *I, const U &update) {

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
        for (int r = 0; r < M; ++r) {

            const double B00 = 0.5*t2[r];

            const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[r]);
            const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[r]);
            const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[r]);

            const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[r]);
            const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[r]);
            const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[r]);

            const double Qx = (Cx*Dx + B00);
            const double Qy = (Cy*Dy + B00);
            const double Qz = (Cz*Dz + B00);

            // prefactor shared by every term of this iteration
            const double w00 = (C[0][0])*W[r];

            update(w00*(Cy*Dx), I+0);
            update(w00*(Cz*Dx), I+1);
            update(w00*(Cz*Dy), I+2);
            update(w00*(Cx*Dy), I+3);
            update(w00*(Cx*Dz), I+4);
            update(w00*(Cy*Dz), I+5);
            update(w00*(Qx), I+6);
            update(w00*(Qy), I+7);
            update(w00*(Qz), I+8);
        }
    }

    /** Permutes I in place: the value at position i moves to index(i). */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[9]) {
        double T[9];
        for (int i = 0; i < 9; ++i) T[i] = I[i];
        for (int i = 0; i < 9; ++i) I[index(i)] = T[i];
    }

    /** Destination slot of the i-th term written by eval(); i is not range-checked. */
    BOOST_GPU_ENABLED
    static int index(int i) {
        const unsigned short order[] = {
            1, 2, 5, 3, 6, 7, 0, 4, 8
        };
        return order[i];
    }

    /** Writes the 9 destination slots, in eval() order, to idx[0..8]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
        for (int i = 0; i < 9; ++i) idx[i] = index(i);
    }

};

/**
 * Quadrature kernel specialization for the braket <S S | D S>.
 *
 * eval() accumulates 6 values per processed point into I, in the order the
 * generator emitted them; reorder() and index() describe the permutation
 * from that evaluation order to the slots used after reordering.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::D, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;	// same point count the fixed-size eval() forwards

    /** Default accumulator: adds q to the value Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: processes 2 points with the default
     * accumulator by forwarding to the general eval<M>() overload.
     * A disabled prototype in the generated source spelled the argument
     * types as vector<3> (rAi, rBk, rAB, rij, rkl) and vector<2> (t2, W).
     * @param I  the 6 accumulators, in evaluation order
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[6]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the general overload, using the default accumulator.
     * @tparam M  number of points to process
     * @param I   pointer to at least 6 accumulators
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * For every a in [0, M): build the recurrence terms from t2[a] and hand
     * C[0][0]*W[a]*term to update() for the destinations I+0 ... I+5.
     * rAi, rij and rkl are not read by this specialization.
     * NOTE(review): t2/W are presumably the quadrature roots and weights --
     * confirm against the caller.
     * @tparam M      number of points to process
     * @param update  callable invoked as update(value, destination)
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // integer powers go through recurrence::pow<N>() directly
	    double Rx = (B01 + recurrence::pow<2>(Dx));
	    double Ry = (B01 + recurrence::pow<2>(Dy));
	    double Rz = (recurrence::pow<2>(Dz) + B01);

	    update((C[0][0])*W[a]*(Dx*Dy), I+0);
	    update((C[0][0])*W[a]*(Dx*Dz), I+1);
	    update((C[0][0])*W[a]*(Dy*Dz), I+2);
	    update((C[0][0])*W[a]*(Rx), I+3);
	    update((C[0][0])*W[a]*(Ry), I+4);
	    update((C[0][0])*W[a]*(Rz), I+5);

	}

    }

    /**
     * Permute I in place: the value at position i on entry is stored at
     * position index(i).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[6]) {
	double T[6];
	for (int i = 0; i < 6; ++i) {
	    T[i] = I[i];
	}
	I[3] = T[0];
	I[4] = T[1];
	I[5] = T[2];
	I[0] = T[3];
	I[1] = T[4];
	I[2] = T[5];
    }

    // Inverse table, kept from a disabled reorder(size_t) helper:
    // { 3, 4, 5, 0, 1, 2 }

    /**
     * Slot that reorder() assigns to the i-th value in evaluation order.
     * No bounds check is performed: i must lie in [0, 6).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[] = {
	    3, 4, 5, 0, 1, 2
	};
	return slot[i];
    }

    /** Write the 6 destination slots to idx[0..5]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 3;
	*idx++ = 4;
	*idx++ = 5;
	*idx++ = 0;
	*idx++ = 1;
	*idx++ = 2;
    }


};

/**
 * Quadrature kernel specialization for the braket <SP SP | P S>.
 *
 * eval() accumulates 48 values per processed point into I, in the order the
 * generator emitted them; reorder() and index() describe the permutation
 * from that evaluation order to the slots used after reordering.
 */
template<>
struct impl<meta::braket<rysq::SP, rysq::SP, rysq::P, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;	// same point count the fixed-size eval() forwards

    /** Default accumulator: adds q to the value Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: processes 2 points with the default
     * accumulator by forwarding to the general eval<M>() overload.
     * A disabled prototype in the generated source spelled the argument
     * types as vector<3> (rAi, rBk, rAB, rij, rkl) and vector<2> (t2, W).
     * @param I  the 48 accumulators, in evaluation order
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][4],
	      double (&I)[48]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the general overload, using the default accumulator.
     * @tparam M  number of points to process
     * @param I   pointer to at least 48 accumulators
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][4],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * For every a in [0, M): build the recurrence terms from t2[a] and hand
     * C[0][c]*W[a]*term (c in 0..3, chosen per term) to update() for the
     * destinations I+0 ... I+47.
     * rkl is not read by this specialization.
     * NOTE(review): t2/W are presumably the quadrature roots and weights --
     * confirm against the caller.
     * @tparam M      number of points to process
     * @param update  callable invoked as update(value, destination)
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][4],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // integer powers go through recurrence::pow<N>() directly
	    double Ix = (Cx + Xij);
	    double Iy = (Cy + Yij);
	    double Iz = (Cz + Zij);
	    double Px = (B10 + recurrence::pow<2>(Cx));
	    double Py = (B10 + recurrence::pow<2>(Cy));
	    double Pz = (B10 + recurrence::pow<2>(Cz));
	    double Qy = (Cy*Dy + B00);

	    update((C[0][1])*W[a]*(Cz*Dx), I+0);
	    update((C[0][1])*W[a]*(Cy*Dx), I+1);
	    update((C[0][0])*W[a]*(Dx), I+2);
	    update((C[0][1])*W[a]*(Cz*Dy), I+3);
	    update((C[0][1])*W[a]*(Cx*Dy), I+4);
	    update((C[0][0])*W[a]*(Dy), I+5);
	    update((C[0][1])*W[a]*(Cx*Dz), I+6);
	    update((C[0][1])*W[a]*(Cy*Dz), I+7);
	    update((C[0][0])*W[a]*(Dz), I+8);
	    update((C[0][2])*W[a]*(Dy*Ix), I+9);
	    update((C[0][3])*W[a]*(Cz*Dy*Ix), I+10);
	    update((C[0][3])*W[a]*(Cy*Dz*Ix), I+11);
	    update((C[0][2])*W[a]*(Dz*Ix), I+12);
	    update((C[0][3])*W[a]*(Cx*Dz*Iy), I+13);
	    update((C[0][2])*W[a]*(Dz*Iy), I+14);
	    update((C[0][2])*W[a]*(Dx*Iy), I+15);
	    update((C[0][3])*W[a]*(Cz*Dx*Iy), I+16);
	    update((C[0][2])*W[a]*(Dy*Iz), I+17);
	    update((C[0][3])*W[a]*(Cx*Dy*Iz), I+18);
	    update((C[0][2])*W[a]*(Dx*Iz), I+19);
	    update((C[0][3])*W[a]*(Cy*Dx*Iz), I+20);
	    update((C[0][3])*W[a]*(Ix*Qy), I+21);
	    update((C[0][3])*W[a]*(Iz*Qy), I+22);
	    update((C[0][1])*W[a]*(Qy), I+23);
	    update((C[0][3])*W[a]*(Cz*(Dy*Yij + Qy)), I+24);
	    update((C[0][3])*W[a]*(Cx*(Dy*Yij + Qy)), I+25);
	    update((C[0][2])*W[a]*((Dy*Yij + Qy)), I+26);
	    double f1 = Cz*Dz;
	    update((C[0][3])*W[a]*((Dz*Pz + B00*(2*Cz + Zij) + f1*Zij)), I+27);
	    update((C[0][3])*W[a]*(Cy*(B00 + Dz*Zij + f1)), I+28);
	    update((C[0][3])*W[a]*(Cx*(B00 + Dz*Zij + f1)), I+29);
	    update((C[0][2])*W[a]*((B00 + Dz*Zij + f1)), I+30);
	    update((C[0][3])*W[a]*(Iy*(B00 + f1)), I+31);
	    update((C[0][3])*W[a]*(Ix*(B00 + f1)), I+32);
	    update((C[0][1])*W[a]*((B00 + f1)), I+33);
	    double f14 = (B10 + Cz*Iz);
	    update((C[0][3])*W[a]*(Dx*f14), I+34);
	    update((C[0][3])*W[a]*(Dy*f14), I+35);
	    double f15 = (Cy*Iy + B10);
	    update((C[0][3])*W[a]*(Dz*f15), I+36);
	    update((C[0][3])*W[a]*(Dx*f15), I+37);
	    double f4 = Cx*Dx;
	    update((C[0][3])*W[a]*((f4*Xij + Dx*Px + B00*(Xij + 2*Cx))), I+38);
	    update((C[0][3])*W[a]*(Cy*(B00 + f4 + Dx*Xij)), I+39);
	    update((C[0][3])*W[a]*(Cz*(B00 + f4 + Dx*Xij)), I+40);
	    update((C[0][2])*W[a]*((B00 + f4 + Dx*Xij)), I+41);
	    update((C[0][3])*W[a]*(Iz*(B00 + f4)), I+42);
	    update((C[0][1])*W[a]*((B00 + f4)), I+43);
	    update((C[0][3])*W[a]*(Iy*(B00 + f4)), I+44);
	    double f8 = Cy*Dy;
	    update((C[0][3])*W[a]*((B00*(Yij + 2*Cy) + f8*Yij + Dy*Py)), I+45);
	    double f9 = (Cx*Ix + B10);
	    update((C[0][3])*W[a]*(Dy*f9), I+46);
	    update((C[0][3])*W[a]*(Dz*f9), I+47);

	}

    }

    /**
     * Permute I in place: the value at position i on entry is stored at
     * position index(i).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[48]) {
	double T[48];
	for (int i = 0; i < 48; ++i) {
	    T[i] = I[i];
	}
	I[3] = T[0];
	I[2] = T[1];
	I[0] = T[2];
	I[19] = T[3];
	I[17] = T[4];
	I[16] = T[5];
	I[33] = T[6];
	I[34] = T[7];
	I[32] = T[8];
	I[20] = T[9];
	I[23] = T[10];
	I[38] = T[11];
	I[36] = T[12];
	I[41] = T[13];
	I[40] = T[14];
	I[8] = T[15];
	I[11] = T[16];
	I[28] = T[17];
	I[29] = T[18];
	I[12] = T[19];
	I[14] = T[20];
	I[22] = T[21];
	I[30] = T[22];
	I[18] = T[23];
	I[27] = T[24];
	I[25] = T[25];
	I[24] = T[26];
	I[47] = T[27];
	I[46] = T[28];
	I[45] = T[29];
	I[44] = T[30];
	I[43] = T[31];
	I[39] = T[32];
	I[35] = T[33];
	I[15] = T[34];
	I[31] = T[35];
	I[42] = T[36];
	I[10] = T[37];
	I[5] = T[38];
	I[6] = T[39];
	I[7] = T[40];
	I[4] = T[41];
	I[13] = T[42];
	I[1] = T[43];
	I[9] = T[44];
	I[26] = T[45];
	I[21] = T[46];
	I[37] = T[47];
    }

    // Inverse table, kept from a disabled reorder(size_t) helper:
    // { 2, 43, 1, 0, 41, 38, 39, 40, 15, 44, 37, 16, 19, 42, 20, 34, 5, 4, 23, 3, 9, 46, 21, 10, 26, 25, 45, 24, 17, 18, 22, 35, 8, 6, 7, 33, 12, 47, 11, 32, 14, 13, 36, 31, 30, 29, 28, 27 }

    /**
     * Slot that reorder() assigns to the i-th value in evaluation order.
     * No bounds check is performed: i must lie in [0, 48).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[] = {
	    3, 2, 0, 19, 17, 16, 33, 34, 32, 20, 23, 38, 36, 41, 40, 8, 11, 28, 29, 12, 14, 22, 30, 18, 27, 25, 24, 47, 46, 45, 44, 43, 39, 35, 15, 31, 42, 10, 5, 6, 7, 4, 13, 1, 9, 26, 21, 37
	};
	return slot[i];
    }

    /** Write the 48 destination slots to idx[0..47]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 3;
	*idx++ = 2;
	*idx++ = 0;
	*idx++ = 19;
	*idx++ = 17;
	*idx++ = 16;
	*idx++ = 33;
	*idx++ = 34;
	*idx++ = 32;
	*idx++ = 20;
	*idx++ = 23;
	*idx++ = 38;
	*idx++ = 36;
	*idx++ = 41;
	*idx++ = 40;
	*idx++ = 8;
	*idx++ = 11;
	*idx++ = 28;
	*idx++ = 29;
	*idx++ = 12;
	*idx++ = 14;
	*idx++ = 22;
	*idx++ = 30;
	*idx++ = 18;
	*idx++ = 27;
	*idx++ = 25;
	*idx++ = 24;
	*idx++ = 47;
	*idx++ = 46;
	*idx++ = 45;
	*idx++ = 44;
	*idx++ = 43;
	*idx++ = 39;
	*idx++ = 35;
	*idx++ = 15;
	*idx++ = 31;
	*idx++ = 42;
	*idx++ = 10;
	*idx++ = 5;
	*idx++ = 6;
	*idx++ = 7;
	*idx++ = 4;
	*idx++ = 13;
	*idx++ = 1;
	*idx++ = 9;
	*idx++ = 26;
	*idx++ = 21;
	*idx++ = 37;
    }


};

/**
 * Quadrature kernel specialization for the braket <S S | SP P>.
 *
 * eval() accumulates 12 values per processed point into I, in the order the
 * generator emitted them; reorder() and index() describe the permutation
 * from that evaluation order to the slots used after reordering.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::SP, rysq::P> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;	// same point count the fixed-size eval() forwards

    /** Default accumulator: adds q to the value Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: processes 2 points with the default
     * accumulator by forwarding to the general eval<M>() overload.
     * A disabled prototype in the generated source spelled the argument
     * types as vector<3> (rAi, rBk, rAB, rij, rkl) and vector<2> (t2, W).
     * @param I  the 12 accumulators, in evaluation order
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double (&I)[12]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the general overload, using the default accumulator.
     * @tparam M  number of points to process
     * @param I   pointer to at least 12 accumulators
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * For every a in [0, M): build the recurrence terms from t2[a] and hand
     * C[c][0]*W[a]*term (c in 0..1, chosen per term) to update() for the
     * destinations I+0 ... I+11.
     * rAi and rij are not read by this specialization.
     * NOTE(review): t2/W are presumably the quadrature roots and weights --
     * confirm against the caller.
     * @tparam M      number of points to process
     * @param update  callable invoked as update(value, destination)
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    double Kx = (Xkl + Dx);
	    double Ky = (Ykl + Dy);
	    double Kz = (Dz + Zkl);

	    update((C[1][0])*W[a]*((B01 + Dx*Kx)), I+0);
	    update((C[1][0])*W[a]*(Dz*Kx), I+1);
	    update((C[1][0])*W[a]*(Dy*Kx), I+2);
	    update((C[0][0])*W[a]*(Kx), I+3);
	    update((C[1][0])*W[a]*((B01 + Dy*Ky)), I+4);
	    update((C[1][0])*W[a]*(Dz*Ky), I+5);
	    update((C[1][0])*W[a]*(Dx*Ky), I+6);
	    update((C[0][0])*W[a]*(Ky), I+7);
	    update((C[1][0])*W[a]*((Dz*Kz + B01)), I+8);
	    update((C[1][0])*W[a]*(Dx*Kz), I+9);
	    update((C[1][0])*W[a]*(Dy*Kz), I+10);
	    update((C[0][0])*W[a]*(Kz), I+11);

	}

    }

    /**
     * Permute I in place: the value at position i on entry is stored at
     * position index(i).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[12]) {
	double T[12];
	for (int i = 0; i < 12; ++i) {
	    T[i] = I[i];
	}
	I[1] = T[0];
	I[3] = T[1];
	I[2] = T[2];
	I[0] = T[3];
	I[6] = T[4];
	I[7] = T[5];
	I[5] = T[6];
	I[4] = T[7];
	I[11] = T[8];
	I[9] = T[9];
	I[10] = T[10];
	I[8] = T[11];
    }

    // Inverse table, kept from a disabled reorder(size_t) helper:
    // { 3, 0, 2, 1, 7, 6, 4, 5, 11, 9, 10, 8 }

    /**
     * Slot that reorder() assigns to the i-th value in evaluation order.
     * No bounds check is performed: i must lie in [0, 12).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[] = {
	    1, 3, 2, 0, 6, 7, 5, 4, 11, 9, 10, 8
	};
	return slot[i];
    }

    /** Write the 12 destination slots to idx[0..11]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 1;
	*idx++ = 3;
	*idx++ = 2;
	*idx++ = 0;
	*idx++ = 6;
	*idx++ = 7;
	*idx++ = 5;
	*idx++ = 4;
	*idx++ = 11;
	*idx++ = 9;
	*idx++ = 10;
	*idx++ = 8;
    }


};

/**
 * Quadrature kernel specialization for the braket <P SP | P S>.
 *
 * eval() accumulates 36 values per processed point into I, in the order the
 * generator emitted them; reorder() and index() describe the permutation
 * from that evaluation order to the slots used after reordering.
 */
template<>
struct impl<meta::braket<rysq::P, rysq::SP, rysq::P, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;	// same point count the fixed-size eval() forwards

    /** Default accumulator: adds q to the value Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: processes 2 points with the default
     * accumulator by forwarding to the general eval<M>() overload.
     * A disabled prototype in the generated source spelled the argument
     * types as vector<3> (rAi, rBk, rAB, rij, rkl) and vector<2> (t2, W).
     * @param I  the 36 accumulators, in evaluation order
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double (&I)[36]) {
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the general overload, using the default accumulator.
     * @tparam M  number of points to process
     * @param I   pointer to at least 36 accumulators
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * For every a in [0, M): build the recurrence terms from t2[a] and hand
     * C[0][c]*W[a]*term (c in 0..1, chosen per term) to update() for the
     * destinations I+0 ... I+35.
     * rkl is not read by this specialization.
     * NOTE(review): t2/W are presumably the quadrature roots and weights --
     * confirm against the caller.
     * @tparam M      number of points to process
     * @param update  callable invoked as update(value, destination)
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    double Ix = (Cx + Xij);
	    double Iy = (Cy + Yij);
	    double Iz = (Cz + Zij);
	    double Qx = (Cx*Dx + B00);
	    double Qy = (Cy*Dy + B00);
	    double Qz = (Cz*Dz + B00);

	    update((C[0][0])*W[a]*(Cy*Dx), I+0);
	    update((C[0][0])*W[a]*(Cz*Dx), I+1);
	    update((C[0][0])*W[a]*(Cz*Dy), I+2);
	    update((C[0][0])*W[a]*(Cx*Dy), I+3);
	    update((C[0][0])*W[a]*(Cx*Dz), I+4);
	    update((C[0][0])*W[a]*(Cy*Dz), I+5);
	    update((C[0][1])*W[a]*(Cy*Dz*Ix), I+6);
	    update((C[0][1])*W[a]*(Cz*Dy*Ix), I+7);
	    update((C[0][1])*W[a]*(Cx*Dz*Iy), I+8);
	    update((C[0][1])*W[a]*(Cz*Dx*Iy), I+9);
	    update((C[0][1])*W[a]*(Cy*Dx*Iz), I+10);
	    update((C[0][1])*W[a]*(Cx*Dy*Iz), I+11);
	    update((C[0][1])*W[a]*(Iz*Qx), I+12);
	    update((C[0][1])*W[a]*(Iy*Qx), I+13);
	    update((C[0][0])*W[a]*(Qx), I+14);
	    update((C[0][1])*W[a]*(Ix*Qy), I+15);
	    update((C[0][1])*W[a]*(Iz*Qy), I+16);
	    update((C[0][0])*W[a]*(Qy), I+17);
	    update((C[0][1])*W[a]*(Iy*Qz), I+18);
	    update((C[0][1])*W[a]*(Ix*Qz), I+19);
	    update((C[0][0])*W[a]*(Qz), I+20);
	    update((C[0][1])*W[a]*((Dx*(Cx*Ix + B10) + B00*(Xij + 2*Cx))), I+21);
	    update((C[0][1])*W[a]*(Cz*(Dx*Xij + Qx)), I+22);
	    update((C[0][1])*W[a]*(Cy*(Dx*Xij + Qx)), I+23);
	    update((C[0][1])*W[a]*((B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10))), I+24);
	    update((C[0][1])*W[a]*(Cx*(Dy*Yij + Qy)), I+25);
	    update((C[0][1])*W[a]*(Cz*(Dy*Yij + Qy)), I+26);
	    update((C[0][1])*W[a]*((Dz*(B10 + Cz*Iz) + B00*(2*Cz + Zij))), I+27);
	    double f1 = (B00 + Dz*Iz);
	    update((C[0][1])*W[a]*(Cx*f1), I+28);
	    update((C[0][1])*W[a]*(Cy*f1), I+29);
	    double f12 = (B10 + Cz*Iz);
	    update((C[0][1])*W[a]*(Dx*f12), I+30);
	    update((C[0][1])*W[a]*(Dy*f12), I+31);
	    double f3 = (Cy*Iy + B10);
	    update((C[0][1])*W[a]*(Dz*f3), I+32);
	    update((C[0][1])*W[a]*(Dx*f3), I+33);
	    double f7 = (Cx*Ix + B10);
	    update((C[0][1])*W[a]*(Dy*f7), I+34);
	    update((C[0][1])*W[a]*(Dz*f7), I+35);

	}

    }

    /**
     * Permute I in place: the value at position i on entry is stored at
     * position index(i).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[36]) {
	double T[36];
	for (int i = 0; i < 36; ++i) {
	    T[i] = I[i];
	}
	I[1] = T[0];
	I[2] = T[1];
	I[14] = T[2];
	I[12] = T[3];
	I[24] = T[4];
	I[25] = T[5];
	I[28] = T[6];
	I[17] = T[7];
	I[30] = T[8];
	I[8] = T[9];
	I[10] = T[10];
	I[21] = T[11];
	I[9] = T[12];
	I[6] = T[13];
	I[0] = T[14];
	I[16] = T[15];
	I[22] = T[16];
	I[13] = T[17];
	I[32] = T[18];
	I[29] = T[19];
	I[26] = T[20];
	I[3] = T[21];
	I[5] = T[22];
	I[4] = T[23];
	I[19] = T[24];
	I[18] = T[25];
	I[20] = T[26];
	I[35] = T[27];
	I[33] = T[28];
	I[34] = T[29];
	I[11] = T[30];
	I[23] = T[31];
	I[31] = T[32];
	I[7] = T[33];
	I[15] = T[34];
	I[27] = T[35];
    }

    // Inverse table, kept from a disabled reorder(size_t) helper:
    // { 14, 0, 1, 21, 23, 22, 13, 33, 9, 12, 10, 30, 3, 17, 2, 34, 15, 7, 25, 24, 26, 11, 16, 31, 4, 5, 20, 35, 6, 19, 8, 32, 18, 28, 29, 27 }

    /**
     * Slot that reorder() assigns to the i-th value in evaluation order.
     * No bounds check is performed: i must lie in [0, 36).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[] = {
	    1, 2, 14, 12, 24, 25, 28, 17, 30, 8, 10, 21, 9, 6, 0, 16, 22, 13, 32, 29, 26, 3, 5, 4, 19, 18, 20, 35, 33, 34, 11, 23, 31, 7, 15, 27
	};
	return slot[i];
    }

    /** Write the 36 destination slots to idx[0..35]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 1;
	*idx++ = 2;
	*idx++ = 14;
	*idx++ = 12;
	*idx++ = 24;
	*idx++ = 25;
	*idx++ = 28;
	*idx++ = 17;
	*idx++ = 30;
	*idx++ = 8;
	*idx++ = 10;
	*idx++ = 21;
	*idx++ = 9;
	*idx++ = 6;
	*idx++ = 0;
	*idx++ = 16;
	*idx++ = 22;
	*idx++ = 13;
	*idx++ = 32;
	*idx++ = 29;
	*idx++ = 26;
	*idx++ = 3;
	*idx++ = 5;
	*idx++ = 4;
	*idx++ = 19;
	*idx++ = 18;
	*idx++ = 20;
	*idx++ = 35;
	*idx++ = 33;
	*idx++ = 34;
	*idx++ = 11;
	*idx++ = 23;
	*idx++ = 31;
	*idx++ = 7;
	*idx++ = 15;
	*idx++ = 27;
    }


};

/**
 * Quadrature kernel specialization for the braket <P S | F S>.
 *
 * eval() accumulates 30 values per processed point into I, in the order the
 * generator emitted them; reorder() and index() describe the permutation
 * from that evaluation order to the slots used after reordering.
 */
template<>
struct impl<meta::braket<rysq::P, rysq::S, rysq::F, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 3;	// same point count the fixed-size eval() forwards

    /** Default accumulator: adds q to the value Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: processes 3 points with the default
     * accumulator by forwarding to the general eval<M>() overload.
     * A disabled prototype in the generated source spelled the argument
     * types as vector<3> (rAi, rBk, rAB, rij, rkl, t2, W).
     * @param I  the 30 accumulators, in evaluation order
     */
    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[30]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Same as the general overload, using the default accumulator.
     * @tparam M  number of points to process
     * @param I   pointer to at least 30 accumulators
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * For every a in [0, M): build the recurrence terms from t2[a] and hand
     * C[0][0]*W[a]*term to update() for the destinations I+0 ... I+29.
     * rij and rkl are not read by this specialization.
     * NOTE(review): t2/W are presumably the quadrature roots and weights --
     * confirm against the caller.
     * @tparam M      number of points to process
     * @param update  callable invoked as update(value, destination)
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // integer powers go through recurrence::pow<N>() directly
	    double Qx = (Cx*Dx + B00);
	    double Qy = (Cy*Dy + B00);
	    double Qz = (Cz*Dz + B00);
	    double Rx = (B01 + recurrence::pow<2>(Dx));
	    double Ry = (B01 + recurrence::pow<2>(Dy));
	    double Rz = (recurrence::pow<2>(Dz) + B01);

	    update((C[0][0])*W[a]*(Dy*Dz*Qx), I+0);
	    update((C[0][0])*W[a]*(Dx*Dz*Qy), I+1);
	    update((C[0][0])*W[a]*(Dx*Dy*Qz), I+2);
	    update((C[0][0])*W[a]*(Cy*Dz*Rx), I+3);
	    update((C[0][0])*W[a]*(Cz*Dy*Rx), I+4);
	    update((C[0][0])*W[a]*(Qy*Rx), I+5);
	    update((C[0][0])*W[a]*(Qz*Rx), I+6);
	    update((C[0][0])*W[a]*(Cx*Dz*Ry), I+7);
	    update((C[0][0])*W[a]*(Cz*Dx*Ry), I+8);
	    update((C[0][0])*W[a]*(Qz*Ry), I+9);
	    update((C[0][0])*W[a]*(Qx*Ry), I+10);
	    update((C[0][0])*W[a]*(Cy*Dx*Rz), I+11);
	    update((C[0][0])*W[a]*(Cx*Dy*Rz), I+12);
	    update((C[0][0])*W[a]*(Qx*Rz), I+13);
	    update((C[0][0])*W[a]*(Qy*Rz), I+14);
	    double f1 = (2*B00*Dz + Cz*Rz);
	    update((C[0][0])*W[a]*(Dx*f1), I+15);
	    update((C[0][0])*W[a]*(Dy*f1), I+16);
	    double f10 = 3*B00*B01;
	    update((C[0][0])*W[a]*((f10 + Cx*recurrence::pow<3>(Dx) + 3*B00*recurrence::pow<2>(Dx) + 3*B01*Cx*Dx)), I+17);
	    update((C[0][0])*W[a]*((3*B01*Cy*Dy + f10 + Cy*recurrence::pow<3>(Dy) + 3*B00*recurrence::pow<2>(Dy))), I+18);
	    update((C[0][0])*W[a]*((f10 + 3*B00*recurrence::pow<2>(Dz) + 3*B01*Cz*Dz + Cz*recurrence::pow<3>(Dz))), I+19);
	    double f12 = (2*B00*Dy + Cy*Ry);
	    update((C[0][0])*W[a]*(Dx*f12), I+20);
	    update((C[0][0])*W[a]*(Dz*f12), I+21);
	    double f2 = (recurrence::pow<2>(Dz) + 3*B01);
	    update((C[0][0])*W[a]*(Cy*Dz*f2), I+22);
	    update((C[0][0])*W[a]*(Cx*Dz*f2), I+23);
	    double f4 = (3*B01 + recurrence::pow<2>(Dy));
	    update((C[0][0])*W[a]*(Cx*Dy*f4), I+24);
	    update((C[0][0])*W[a]*(Cz*Dy*f4), I+25);
	    double f6 = (3*B01 + recurrence::pow<2>(Dx));
	    update((C[0][0])*W[a]*(Cz*Dx*f6), I+26);
	    update((C[0][0])*W[a]*(Cy*Dx*f6), I+27);
	    double f7 = (2*B00*Dx + Cx*Rx);
	    update((C[0][0])*W[a]*(Dz*f7), I+28);
	    update((C[0][0])*W[a]*(Dy*f7), I+29);

	}

    }

    /**
     * Permute I in place: the value at position i on entry is stored at
     * position index(i).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[30]) {
	double T[30];
	for (int i = 0; i < 30; ++i) {
	    T[i] = I[i];
	}
	I[27] = T[0];
	I[28] = T[1];
	I[29] = T[2];
	I[13] = T[3];
	I[11] = T[4];
	I[10] = T[5];
	I[14] = T[6];
	I[18] = T[7];
	I[17] = T[8];
	I[20] = T[9];
	I[15] = T[10];
	I[22] = T[11];
	I[24] = T[12];
	I[21] = T[13];
	I[25] = T[14];
	I[23] = T[15];
	I[26] = T[16];
	I[0] = T[17];
	I[4] = T[18];
	I[8] = T[19];
	I[16] = T[20];
	I[19] = T[21];
	I[7] = T[22];
	I[6] = T[23];
	I[3] = T[24];
	I[5] = T[25];
	I[2] = T[26];
	I[1] = T[27];
	I[12] = T[28];
	I[9] = T[29];
    }

    // Inverse table, kept from a disabled reorder(size_t) helper:
    // { 17, 27, 26, 24, 18, 25, 23, 22, 19, 29, 5, 4, 28, 3, 6, 10, 20, 8, 7, 21, 9, 13, 11, 15, 12, 14, 16, 0, 1, 2 }

    /**
     * Slot that reorder() assigns to the i-th value in evaluation order.
     * No bounds check is performed: i must lie in [0, 30).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short slot[] = {
	    27, 28, 29, 13, 11, 10, 14, 18, 17, 20, 15, 22, 24, 21, 25, 23, 26, 0, 4, 8, 16, 19, 7, 6, 3, 5, 2, 1, 12, 9
	};
	return slot[i];
    }

    /** Write the 30 destination slots to idx[0..29]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	*idx++ = 27;
	*idx++ = 28;
	*idx++ = 29;
	*idx++ = 13;
	*idx++ = 11;
	*idx++ = 10;
	*idx++ = 14;
	*idx++ = 18;
	*idx++ = 17;
	*idx++ = 20;
	*idx++ = 15;
	*idx++ = 22;
	*idx++ = 24;
	*idx++ = 21;
	*idx++ = 25;
	*idx++ = 23;
	*idx++ = 26;
	*idx++ = 0;
	*idx++ = 4;
	*idx++ = 8;
	*idx++ = 16;
	*idx++ = 19;
	*idx++ = 7;
	*idx++ = 6;
	*idx++ = 3;
	*idx++ = 5;
	*idx++ = 2;
	*idx++ = 1;
	*idx++ = 12;
	*idx++ = 9;
    }


};

template<>
struct impl<meta::braket<rysq::D, rysq::P, rysq::SP, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /** Default accumulator: adds the contribution q to the value Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    double &target = *Q;
	    target = target + q;
	}
    };

    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<3> &t2, const vector<3> &W,
    // 		     const double (&C)[2][1],
	      double (&I)[72]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel of this specialization.  For every a in [0, M) it evaluates
     * 72 product terms and passes each one to `update` together with its
     * destination slot I+0 ... I+71.
     *
     * @tparam M  number of loop iterations; t2[a] and W[a] are read for each
     *            a in [0, M) (presumably the quadrature roots and weights --
     *            TODO confirm against the caller).
     * @tparam U  functor type, invoked as update(value, double*).
     * @param A,B          scalars entering the coefficients B10, C* and D*.
     * @param rAi,rBk,rAB  indexed with [0..2]; inputs of the C* / D* coefficients.
     * @param rij          indexed with [0..2]; supplies Xij, Yij, Zij.
     * @param rkl          not referenced by this specialization.
     * @param C            every term is scaled by either C[0][0] or C[1][0].
     * @param I            base pointer of the 72 destination slots.
     * @param update       called once per term and per iteration.
     *
     * NOTE(review): the slot order used here is presumably the one that
     * reorder()/index() later permute -- confirm with the callers.
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {

	// Components of rij, referenced by name in the formulas below.
	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];



#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Scalar coefficients for this iteration:
	    //   B00 = t2[a]/2,  B10 = 0.5*(1/A)*(1 - B*t2[a]).
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    // C_q = rAi[q] - B*rAB[q]*t2[a]   for q = x, y, z.
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // D_q uses the same formula with -A in place of B, i.e.
	    // mathematically rBk[q] + A*rAB[q]*t2[a].
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Local shorthand: pow(x,n) expands to recurrence::pow<n>(x), whose
	    // exponent is a template argument.  Undefined again after the terms.
#define pow(x,y) recurrence::pow<y>((x))

	    // Shared building blocks of the terms below.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    // One update() call per destination slot.  The fNN temporaries hold
	    // subexpressions reused by the calls that immediately follow them.
	    update((C[0][0])*W[a]*(Cy*Cz*Ix), I+0);
	    update((C[0][0])*W[a]*(Cx*Cz*Iy), I+1);
	    update((C[0][0])*W[a]*(Cx*Cy*Iz), I+2);
	    update((C[1][0])*W[a]*(Dy*Iz*Px), I+3);
	    update((C[0][0])*W[a]*(Iz*Px), I+4);
	    update((C[1][0])*W[a]*(Dz*Iy*Px), I+5);
	    update((C[0][0])*W[a]*(Iy*Px), I+6);
	    update((C[1][0])*W[a]*(Dx*Iz*Py), I+7);
	    update((C[0][0])*W[a]*(Iz*Py), I+8);
	    update((C[1][0])*W[a]*(Dz*Ix*Py), I+9);
	    update((C[0][0])*W[a]*(Ix*Py), I+10);
	    update((C[1][0])*W[a]*(Dx*Iy*Pz), I+11);
	    update((C[0][0])*W[a]*(Iy*Pz), I+12);
	    update((C[1][0])*W[a]*(Dy*Ix*Pz), I+13);
	    update((C[0][0])*W[a]*(Ix*Pz), I+14);
	    update((C[1][0])*W[a]*(Cy*Iz*Qx), I+15);
	    update((C[1][0])*W[a]*(Cz*Iy*Qx), I+16);
	    update((C[1][0])*W[a]*(Cx*Iz*Qy), I+17);
	    update((C[1][0])*W[a]*(Cz*Ix*Qy), I+18);
	    update((C[1][0])*W[a]*(Cy*Ix*Qz), I+19);
	    update((C[1][0])*W[a]*(Cx*Iy*Qz), I+20);
	    update((C[1][0])*W[a]*(Qz*(Px + Cx*Xij)), I+21);
	    update((C[1][0])*W[a]*(Qy*(Px + Cx*Xij)), I+22);
	    update((C[1][0])*W[a]*(Cy*Dz*(Px + Cx*Xij)), I+23);
	    update((C[0][0])*W[a]*(Cy*(Px + Cx*Xij)), I+24);
	    update((C[1][0])*W[a]*(Cz*Dy*(Px + Cx*Xij)), I+25);
	    update((C[0][0])*W[a]*(Cz*(Px + Cx*Xij)), I+26);
	    update((C[1][0])*W[a]*(Cx*Cy*(Dz*Zij + Qz)), I+27);
	    update((C[1][0])*W[a]*(Py*(Dz*Zij + Qz)), I+28);
	    update((C[1][0])*W[a]*(Px*(Dz*Zij + Qz)), I+29);
	    update((C[1][0])*W[a]*(Qy*(Cz*Zij + Pz)), I+30);
	    update((C[1][0])*W[a]*(Qx*(Cz*Zij + Pz)), I+31);
	    update((C[1][0])*W[a]*(Cx*Dy*(Cz*Zij + Pz)), I+32);
	    update((C[0][0])*W[a]*(Cx*(Cz*Zij + Pz)), I+33);
	    update((C[1][0])*W[a]*(Cy*Dx*(Cz*Zij + Pz)), I+34);
	    update((C[0][0])*W[a]*(Cy*(Cz*Zij + Pz)), I+35);
	    double f1 = (B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10));
	    update((C[1][0])*W[a]*(Cx*f1), I+36);
	    update((C[1][0])*W[a]*(Cz*f1), I+37);
	    double f10 = (Cy*Iy + B10);
	    update((C[1][0])*W[a]*(Qx*f10), I+38);
	    update((C[1][0])*W[a]*(Cz*Dx*f10), I+39);
	    update((C[0][0])*W[a]*(Cz*f10), I+40);
	    update((C[1][0])*W[a]*(Cx*Dz*f10), I+41);
	    update((C[0][0])*W[a]*(Cx*f10), I+42);
	    update((C[1][0])*W[a]*(Qz*f10), I+43);
	    double f11 = (B10*(3*Cx + Xij) + Ix*pow(Cx,2));
	    update((C[1][0])*W[a]*(Dz*f11), I+44);
	    update((C[1][0])*W[a]*(Dy*f11), I+45);
	    update((C[0][0])*W[a]*(f11), I+46);
	    double f13 = (Dx*Px + 2*B00*Cx);
	    update((C[1][0])*W[a]*(Iy*f13), I+47);
	    update((C[1][0])*W[a]*(Iz*f13), I+48);
	    double f14 = (Dx*Ix + B00);
	    update((C[1][0])*W[a]*(Cy*Cz*f14), I+49);
	    update((C[1][0])*W[a]*(Pz*f14), I+50);
	    update((C[1][0])*W[a]*(Py*f14), I+51);
	    double f21 = (Iz*pow(Cz,2) + B10*(3*Cz + Zij));
	    update((C[1][0])*W[a]*(Dx*f21), I+52);
	    update((C[1][0])*W[a]*(Dy*f21), I+53);
	    update((C[0][0])*W[a]*(f21), I+54);
	    double f24 = (B10*(3*Cy + Yij) + Iy*pow(Cy,2));
	    update((C[1][0])*W[a]*(Dz*f24), I+55);
	    update((C[1][0])*W[a]*(Dx*f24), I+56);
	    update((C[0][0])*W[a]*(f24), I+57);
	    double f3 = 3*B00*B10;
	    update((C[1][0])*W[a]*((Dx*Ix*pow(Cx,2) + B00*Cx*(3*Cx + 2*Xij) + B10*Dx*(3*Cx + Xij) + f3)), I+58);
	    update((C[1][0])*W[a]*((Dy*Iy*pow(Cy,2) + f3 + B00*Cy*(3*Cy + 2*Yij) + B10*Dy*(3*Cy + Yij))), I+59);
	    update((C[1][0])*W[a]*((B00*Cz*(3*Cz + 2*Zij) + Dz*Iz*pow(Cz,2) + B10*Dz*(3*Cz + Zij) + f3)), I+60);
	    double f4 = (Dy*Py + 2*B00*Cy);
	    update((C[1][0])*W[a]*(Ix*f4), I+61);
	    update((C[1][0])*W[a]*(Iz*f4), I+62);
	    double f5 = (Dz*Pz + 2*B00*Cz);
	    update((C[1][0])*W[a]*(Cx*(f5 + Qz*Zij)), I+63);
	    update((C[1][0])*W[a]*(Cy*(f5 + Qz*Zij)), I+64);
	    update((C[1][0])*W[a]*(Ix*f5), I+65);
	    update((C[1][0])*W[a]*(Iy*f5), I+66);
	    double f7 = (Dx*(Cx*Ix + B10) + B00*(Xij + 2*Cx));
	    update((C[1][0])*W[a]*(Cy*f7), I+67);
	    update((C[1][0])*W[a]*(Cz*f7), I+68);
	    double f9 = (Dy*Iy + B00);
	    update((C[1][0])*W[a]*(Cx*Cz*f9), I+69);
	    update((C[1][0])*W[a]*(Px*f9), I+70);
	    update((C[1][0])*W[a]*(Pz*f9), I+71);
#undef pow

	}

    }

    /**
     * Permutes the 72 values of I in place: the value found at position k
     * on entry is stored at position destination[k] on return.  The table
     * holds the same permutation that index(int) reports.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[72]) {
	const unsigned short destination[72] = {
	    5, 10, 15, 48, 12, 60, 6, 31, 13, 55, 1, 26,
	    8, 38, 2, 33, 28, 51, 41, 59, 64, 58, 39, 57,
	    3, 40, 4, 69, 67, 66, 53, 34, 52, 16, 35, 17,
	    45, 47, 27, 29, 11, 63, 9, 65, 54, 36, 0, 24,
	    30, 23, 20, 19, 32, 50, 14, 61, 25, 7, 18, 43,
	    68, 37, 49, 70, 71, 56, 62, 21, 22, 46, 42, 44
	};

	// Snapshot first: source and destination positions overlap.
	double saved[72];
	for (int k = 0; k < 72; ++k) {
	    saved[k] = I[k];
	}

#ifdef __CUDACC__
#pragma unroll
#endif
	for (int k = 0; k < 72; ++k) {
	    I[destination[k]] = saved[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[72] = { 46, 10, 14, 24, 26, 0, 6, 57, 12, 42, 1, 40, 4, 8, 54, 2, 33, 35, 58, 51, 50, 67, 68, 49, 47, 56, 11, 38, 16, 39, 48, 7, 52, 15, 31, 34, 45, 61, 13, 22, 25, 18, 70, 59, 71, 36, 69, 37, 3, 62, 53, 17, 32, 30, 44, 9, 65, 23, 21, 19, 5, 55, 66, 41, 20, 43, 29, 28, 60, 27, 63, 64 };
// 	if (index < 72) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns entry i of the 72-element permutation applied by reorder(),
     * i.e. the position to which reorder() moves the i-th value.
     * No range check is made; i must lie in [0, 72).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short destination[72] = {
	    5, 10, 15, 48, 12, 60, 6, 31, 13, 55, 1, 26,
	    8, 38, 2, 33, 28, 51, 41, 59, 64, 58, 39, 57,
	    3, 40, 4, 69, 67, 66, 53, 34, 52, 16, 35, 17,
	    45, 47, 27, 29, 11, 63, 9, 65, 54, 36, 0, 24,
	    30, 23, 20, 19, 32, 50, 14, 61, 25, 7, 18, 43,
	    68, 37, 49, 70, 71, 56, 62, 21, 22, 46, 42, 44
	};
	return destination[i];
    }

    /**
     * Writes the complete 72-entry permutation (the one index(int) returns
     * element by element) to idx[0] ... idx[71], in order.
     * The caller must provide room for 72 elements of type U.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	// Kept as int so each store converts from int, as the literals did.
	const int destination[72] = {
	    5, 10, 15, 48, 12, 60, 6, 31, 13, 55, 1, 26,
	    8, 38, 2, 33, 28, 51, 41, 59, 64, 58, 39, 57,
	    3, 40, 4, 69, 67, 66, 53, 34, 52, 16, 35, 17,
	    45, 47, 27, 29, 11, 63, 9, 65, 54, 36, 0, 24,
	    30, 23, 20, 19, 32, 50, 14, 61, 25, 7, 18, 43,
	    68, 37, 49, 70, 71, 56, 62, 21, 22, 46, 42, 44
	};
	for (int k = 0; k < 72; ++k) {
	    idx[k] = destination[k];
	}
    }


};

/**
 * Kernel specialization for the braket <S, SP | S, S>.
 *
 * eval() produces 4 values per call (slots I[0] .. I[3]); the single-array
 * overload runs N = 1 loop iteration.  For this braket the reordering
 * described by reorder()/index() is the identity.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::SP, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 1;

    /** Default accumulation functor: adds q to the slot Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: runs the kernel for one iteration (M = 1) and
     * accumulates into the 4-element array I with the default functor.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL, class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline void eval(double A, double B,
			    const RAI &rAi, const RBK &rBk, const RAB &rAB,
			    const RIJ &rij, const RKL &rkl,
			    const T2_ &t2, const W_ &W,
			    const double (&C)[1][2],
			    double (&I)[4]) {
	const update accumulate = update();
	eval<1>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Same as above but with a caller-chosen iteration count M and a raw
     * output pointer.
     */
    template<int M, class RAI, class RBK, class RAB,
	     class RIJ, class RKL, class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline void eval(double A, double B,
			    const RAI &rAi, const RBK &rBk, const RAB &rAB,
			    const RIJ &rij, const RKL &rkl,
			    const T2_ &t2, const W_ &W,
			    const double (&C)[1][2],
			    double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Kernel proper.  For each a in [0, M) it passes four terms to `update`:
     *   slot 0:  C[0][0]*W[a]
     *   slot q:  C[0][1]*W[a]*(rAi[q-1] - B*rAB[q-1]*t2[a] + rij[q-1]),  q = 1..3
     * A, rBk and rkl are accepted for interface uniformity but are not
     * referenced by this specialization.
     */
    template<int M, class RAI, class RBK, class RAB,
	     class RIJ, class RKL, class T2_, class W_, class U>
    BOOST_GPU_ENABLED
    static inline void eval(double A, double B,
			    const RAI &rAi, const RBK &rBk, const RAB &rAB,
			    const RIJ &rij, const RKL &rkl,
			    const T2_ &t2, const W_ &W,
			    const double (&C)[1][2],
			    double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {
	    // Recurrence coefficient plus the matching rij component.
	    const double Ix = recurrence::coefficient<0>(rAi, B, rAB, t2[a]) + Xij;
	    const double Iy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]) + Yij;
	    const double Iz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]) + Zij;

	    update(C[0][0]*W[a]*(1), I);
	    update(C[0][1]*W[a]*Ix, I + 1);
	    update(C[0][1]*W[a]*Iy, I + 2);
	    update(C[0][1]*W[a]*Iz, I + 3);
	}
    }

    /**
     * Applies the output permutation in place.  Every value is moved to
     * position index(k), which for this braket is k itself.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[4]) {
	double saved[4];
	for (int k = 0; k < 4; ++k) {
	    saved[k] = I[k];
	}
	for (int k = 0; k < 4; ++k) {
	    I[index(k)] = saved[k];
	}
    }

    /**
     * Returns the reordered position of the i-th value (identity here).
     * No range check is made; i must lie in [0, 4).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short destination[4] = { 0, 1, 2, 3 };
	return destination[i];
    }

    /** Writes the 4-entry permutation to idx[0] ... idx[3]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 4; ++k) {
	    idx[k] = index(k);
	}
    }

};

/**
 * Kernel specialization for the braket <D, D | SP, S>.
 *
 * eval() produces 144 values per call (slots I[0] .. I[143]); the
 * single-array overload runs N = 3 loop iterations.  reorder() and the
 * index() overloads expose the permutation associated with those slots.
 */
template<>
struct impl<meta::braket<rysq::D, rysq::D, rysq::SP, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 3;

    /** Default accumulation functor: adds q to the slot Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    /**
     * Fixed-size entry point: runs the kernel for three iterations (M = 3)
     * and accumulates into the 144-element array I with the default functor.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL, class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline void eval(double A, double B,
			    const RAI &rAi, const RBK &rBk, const RAB &rAB,
			    const RIJ &rij, const RKL &rkl,
			    const T2_ &t2, const W_ &W,
			    const double (&C)[2][1],
			    double (&I)[144]) {
	const update accumulate = update();
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Same as above but with a caller-chosen iteration count M and a raw
     * output pointer.
     */
    template<int M, class RAI, class RBK, class RAB,
	     class RIJ, class RKL, class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline void eval(double A, double B,
			    const RAI &rAi, const RBK &rBk, const RAB &rAB,
			    const RIJ &rij, const RKL &rkl,
			    const T2_ &t2, const W_ &W,
			    const double (&C)[2][1],
			    double *I, const update &accumulate = update()) {
	eval<M, RAI, RBK, RAB, RIJ, RKL, T2_, W_, update>
	    (A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Kernel proper.  For each a in [0, M) it evaluates 144 product terms and
     * passes each one, scaled by C[0][0] or C[1][0] and by W[a], to `update`
     * together with its slot I+0 ... I+143.  rkl is accepted for interface
     * uniformity but is not referenced by this specialization.
     *
     * The arithmetic below is kept exactly as generated so that rounding is
     * unchanged.
     */
    template<int M, class RAI, class RBK, class RAB,
	     class RIJ, class RKL, class T2_, class W_, class U>
    BOOST_GPU_ENABLED
    static inline void eval(double A, double B,
			    const RAI &rAi, const RBK &rBk, const RAB &rAB,
			    const RIJ &rij, const RKL &rkl,
			    const T2_ &t2, const W_ &W,
			    const double (&C)[2][1],
			    double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // B00 = t2[a]/2,  B10 = 0.5*(1/A)*(1 - B*t2[a]).
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    // C_q = rAi[q] - B*rAB[q]*t2[a].
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // D_q: same formula with -A in place of B.
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // pow(x,n) is local shorthand for recurrence::pow<n>(x).
#define pow(x,y) recurrence::pow<y>((x))

	    double Ix = (Cx + Xij);
	    double Iy = (Cy + Yij);
	    double Iz = (Cz + Zij);
	    double Px = (B10 + pow(Cx,2));
	    double Py = (B10 + pow(Cy,2));
	    double Pz = (B10 + pow(Cz,2));
	    double Qx = (Cx*Dx + B00);
	    double Qy = (Cy*Dy + B00);
	    double Qz = (Cz*Dz + B00);

	    // One update() per slot; fNN temporaries hold subexpressions
	    // shared by the calls that follow them.
	    update((C[0][0])*W[a]*(Iy*Iz*Px), I+0);
	    update((C[0][0])*W[a]*(Ix*Iz*Py), I+1);
	    update((C[0][0])*W[a]*(Ix*Iy*Pz), I+2);
	    update((C[1][0])*W[a]*(Iz*Py*(Dx*Xij + Qx)), I+3);
	    update((C[1][0])*W[a]*(Iy*Qz*(Px + Cx*Xij)), I+4);
	    update((C[1][0])*W[a]*(Cy*Qz*(Xij*(Xij + 2*Cx) + Px)), I+5);
	    update((C[0][0])*W[a]*(Cy*Iz*(Px + Cx*Xij)), I+6);
	    update((C[1][0])*W[a]*(Iz*Qy*(Px + Cx*Xij)), I+7);
	    update((C[1][0])*W[a]*(Cz*Qy*(Xij*(Xij + 2*Cx) + Px)), I+8);
	    update((C[0][0])*W[a]*(Cz*Iy*(Px + Cx*Xij)), I+9);
	    update((C[0][0])*W[a]*(Cy*Cz*(Xij*(Xij + 2*Cx) + Px)), I+10);
	    update((C[1][0])*W[a]*(Dy*Pz*(Xij*(Xij + 2*Cx) + Px)), I+11);
	    update((C[0][0])*W[a]*(Pz*(Xij*(Xij + 2*Cx) + Px)), I+12);
	    update((C[1][0])*W[a]*(Dz*Py*(Xij*(Xij + 2*Cx) + Px)), I+13);
	    update((C[0][0])*W[a]*(Py*(Xij*(Xij + 2*Cx) + Px)), I+14);
	    update((C[1][0])*W[a]*(Iy*Pz*(Dx*Xij + Qx)), I+15);
	    update((C[1][0])*W[a]*(Ix*Pz*(Dy*Yij + Qy)), I+16);
	    update((C[1][0])*W[a]*(Iz*Px*(Dy*Yij + Qy)), I+17);
	    update((C[1][0])*W[a]*(Cz*(Px + Cx*Xij)*(Dy*Yij + Qy)), I+18);
	    update((C[1][0])*W[a]*(Cy*(Cz*Zij + Pz)*(Dx*Xij + Qx)), I+19);
	    update((C[1][0])*W[a]*(Cx*(Cz*Zij + Pz)*(Dy*Yij + Qy)), I+20);
	    update((C[1][0])*W[a]*(Dy*(Px + Cx*Xij)*(Cz*Zij + Pz)), I+21);
	    update((C[0][0])*W[a]*((Px + Cx*Xij)*(Cz*Zij + Pz)), I+22);
	    update((C[1][0])*W[a]*(Ix*Qy*(Cz*Zij + Pz)), I+23);
	    update((C[1][0])*W[a]*(Cx*Qy*(Zij*(2*Cz + Zij) + Pz)), I+24);
	    update((C[0][0])*W[a]*(Cx*Iy*(Cz*Zij + Pz)), I+25);
	    update((C[1][0])*W[a]*(Iy*Qx*(Cz*Zij + Pz)), I+26);
	    update((C[1][0])*W[a]*(Cy*Qx*(Zij*(2*Cz + Zij) + Pz)), I+27);
	    update((C[0][0])*W[a]*(Cy*Ix*(Cz*Zij + Pz)), I+28);
	    update((C[0][0])*W[a]*(Cx*Cy*(Zij*(2*Cz + Zij) + Pz)), I+29);
	    update((C[1][0])*W[a]*(Dx*Py*(Zij*(2*Cz + Zij) + Pz)), I+30);
	    update((C[0][0])*W[a]*(Py*(Zij*(2*Cz + Zij) + Pz)), I+31);
	    update((C[1][0])*W[a]*(Dy*Px*(Zij*(2*Cz + Zij) + Pz)), I+32);
	    update((C[0][0])*W[a]*(Px*(Zij*(2*Cz + Zij) + Pz)), I+33);
	    double f1 = (Dz*Pz + 2*B00*Cz);
	    update((C[1][0])*W[a]*((Px + Cx*Xij)*(f1 + Qz*Zij)), I+34);
	    update((C[1][0])*W[a]*(Cy*Ix*(f1 + Qz*Zij)), I+35);
	    update((C[1][0])*W[a]*(Cx*Cy*(Zij*(2*B00 + Dz*(2*Cz + Zij)) + f1)), I+36);
	    update((C[1][0])*W[a]*(Cx*Iy*(f1 + Qz*Zij)), I+37);
	    update((C[1][0])*W[a]*(Py*(Zij*(2*B00 + Dz*(2*Cz + Zij)) + f1)), I+38);
	    update((C[1][0])*W[a]*(Px*(Zij*(2*B00 + Dz*(2*Cz + Zij)) + f1)), I+39);
	    update((C[1][0])*W[a]*(f1*(Xij*(Xij + 2*Cx) + Px)), I+40);
	    update((C[1][0])*W[a]*(Ix*Iy*f1), I+41);
	    double f11 = (Dz*(Iz*pow(Cz,2) + B10*(3*Cz + Zij)) + 3*B00*Pz + 2*B00*Cz*Zij);
	    update((C[1][0])*W[a]*(Ix*f11), I+42);
	    update((C[1][0])*W[a]*(Iy*f11), I+43);
	    double f12 = (B00 + Dz*Iz);
	    update((C[1][0])*W[a]*(Cy*f12*(Px + Cx*Xij)), I+44);
	    update((C[1][0])*W[a]*(Ix*Py*f12), I+45);
	    update((C[1][0])*W[a]*(Iy*Px*f12), I+46);
	    double f15 = (B00*(3*B10 + Ix*(3*Cx + Xij)) + Dx*(B10*(3*Cx + 2*Xij) + Cx*pow(Ix,2)));
	    update((C[1][0])*W[a]*(Cy*f15), I+47);
	    update((C[1][0])*W[a]*(Cz*f15), I+48);
	    double f17 = (Dx*(B10*(3*Cx + Xij) + Ix*pow(Cx,2)) + 2*B00*Cx*Xij + 3*B00*Px);
	    update((C[1][0])*W[a]*(Iz*f17), I+49);
	    update((C[1][0])*W[a]*(Iy*f17), I+50);
	    double f18 = (B10 + pow(Iy,2));
	    update((C[1][0])*W[a]*(Cx*Qz*f18), I+51);
	    update((C[0][0])*W[a]*(Cx*Cz*f18), I+52);
	    update((C[1][0])*W[a]*(Cz*Qx*f18), I+53);
	    update((C[1][0])*W[a]*(Dx*Pz*f18), I+54);
	    update((C[0][0])*W[a]*(Pz*f18), I+55);
	    update((C[1][0])*W[a]*(Dz*Px*f18), I+56);
	    update((C[0][0])*W[a]*(Px*f18), I+57);
	    update((C[1][0])*W[a]*(f1*f18), I+58);
	    double f2 = (B10*(3*Cx + 2*Xij) + Cx*pow(Ix,2));
	    update((C[1][0])*W[a]*(Qy*f2), I+59);
	    update((C[1][0])*W[a]*(Qz*f2), I+60);
	    update((C[1][0])*W[a]*(Cy*Dz*f2), I+61);
	    update((C[0][0])*W[a]*(Cy*f2), I+62);
	    update((C[1][0])*W[a]*(Cz*Dy*f2), I+63);
	    update((C[0][0])*W[a]*(Cz*f2), I+64);
	    double f22 = (2*B00*Cy*Yij + 3*B00*Py + Dy*(B10*(3*Cy + Yij) + Iy*pow(Cy,2)));
	    update((C[1][0])*W[a]*(Ix*f22), I+65);
	    update((C[1][0])*W[a]*(Iz*f22), I+66);
	    double f25 = (B00*(Iz*(3*Cz + Zij) + 3*B10) + Dz*(B10*(3*Cz + 2*Zij) + Cz*pow(Iz,2)));
	    update((C[1][0])*W[a]*(Cx*f25), I+67);
	    update((C[1][0])*W[a]*(Cy*f25), I+68);
	    double f27 = (B10*(3*Cz + 2*Zij) + Cz*pow(Iz,2));
	    update((C[1][0])*W[a]*(Qy*f27), I+69);
	    update((C[1][0])*W[a]*(Qx*f27), I+70);
	    update((C[1][0])*W[a]*(Cx*Dy*f27), I+71);
	    update((C[0][0])*W[a]*(Cx*f27), I+72);
	    update((C[1][0])*W[a]*(Cy*Dx*f27), I+73);
	    update((C[0][0])*W[a]*(Cy*f27), I+74);
	    double f28 = (Cy*Iy + B10);
	    update((C[1][0])*W[a]*(f28*(f1 + Qz*Zij)), I+75);
	    update((C[0][0])*W[a]*(f28*(Px + Cx*Xij)), I+76);
	    update((C[1][0])*W[a]*(Dz*f28*(Px + Cx*Xij)), I+77);
	    update((C[1][0])*W[a]*(Cz*f28*(Dx*Xij + Qx)), I+78);
	    update((C[1][0])*W[a]*(Ix*Qz*f28), I+79);
	    update((C[0][0])*W[a]*(f28*(Cz*Zij + Pz)), I+80);
	    update((C[1][0])*W[a]*(Dx*f28*(Cz*Zij + Pz)), I+81);
	    update((C[0][0])*W[a]*(Cz*Ix*f28), I+82);
	    update((C[1][0])*W[a]*(Iz*Qx*f28), I+83);
	    update((C[0][0])*W[a]*(Cx*Iz*f28), I+84);
	    update((C[1][0])*W[a]*(Cx*f12*f28), I+85);
	    double f29 = (B10*(3*Cx + Xij) + Ix*pow(Cx,2));
	    update((C[1][0])*W[a]*(f29*(Dy*Yij + Qy)), I+86);
	    update((C[1][0])*W[a]*(f12*f29), I+87);
	    update((C[1][0])*W[a]*(Dy*Iz*f29), I+88);
	    update((C[0][0])*W[a]*(Iz*f29), I+89);
	    update((C[1][0])*W[a]*(Dz*Iy*f29), I+90);
	    update((C[0][0])*W[a]*(Iy*f29), I+91);
	    double f3 = (3*pow(B10,2) + pow(Cz,2)*pow(Iz,2) + B10*(6*Cz*Zij + pow(Zij,2) + 6*pow(Cz,2)));
	    update((C[1][0])*W[a]*(Dx*f3), I+92);
	    update((C[1][0])*W[a]*(Dy*f3), I+93);
	    update((C[0][0])*W[a]*(f3), I+94);
	    double f30 = (B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10));
	    update((C[1][0])*W[a]*(f30*(Px + Cx*Xij)), I+95);
	    update((C[1][0])*W[a]*(Cx*Iz*f30), I+96);
	    update((C[1][0])*W[a]*(f30*(Cz*Zij + Pz)), I+97);
	    update((C[1][0])*W[a]*(Cz*Ix*f30), I+98);
	    double f35 = (B10*(3*Cy + Yij) + Iy*pow(Cy,2));
	    update((C[1][0])*W[a]*(f35*(Dx*Xij + Qx)), I+99);
	    update((C[1][0])*W[a]*(Dx*Iz*f35), I+100);
	    update((C[1][0])*W[a]*(Dz*Ix*f35), I+101);
	    update((C[0][0])*W[a]*(Ix*f35), I+102);
	    update((C[1][0])*W[a]*(f12*f35), I+103);
	    update((C[0][0])*W[a]*(Iz*f35), I+104);
	    double f36 = (Dy*Py + 2*B00*Cy);
	    update((C[1][0])*W[a]*(f36*(Xij*(Xij + 2*Cx) + Px)), I+105);
	    update((C[1][0])*W[a]*(Ix*Iz*f36), I+106);
	    update((C[1][0])*W[a]*(f36*(Zij*(2*Cz + Zij) + Pz)), I+107);
	    double f37 = (Dy*(Cy*pow(Iy,2) + B10*(3*Cy + 2*Yij)) + B00*(3*B10 + Iy*(3*Cy + Yij)));
	    update((C[1][0])*W[a]*(Cx*f37), I+108);
	    update((C[1][0])*W[a]*(Cz*f37), I+109);
	    double f38 = 3*pow(B10,2);
	    update((C[1][0])*W[a]*((2*B00*(Xij + 2*Cx)*(Cx*Ix + 3*B10) + Dx*(f38 + B10*(6*Cx*Xij + pow(Xij,2) + 6*pow(Cx,2)) + pow(Cx,2)*pow(Ix,2)))), I+110);
	    update((C[1][0])*W[a]*((2*B00*(Yij + 2*Cy)*(3*B10 + Cy*Iy) + Dy*(B10*(6*Cy*Yij + pow(Yij,2) + 6*pow(Cy,2)) + f38 + pow(Cy,2)*pow(Iy,2)))), I+111);
	    update((C[1][0])*W[a]*((2*B00*(2*Cz + Zij)*(3*B10 + Cz*Iz) + Dz*(f38 + pow(Cz,2)*pow(Iz,2) + B10*(6*Cz*Zij + pow(Zij,2) + 6*pow(Cz,2))))), I+112);
	    double f4 = (3*pow(B10,2) + B10*(6*Cx*Xij + pow(Xij,2) + 6*pow(Cx,2)) + pow(Cx,2)*pow(Ix,2));
	    update((C[1][0])*W[a]*(Dz*f4), I+113);
	    update((C[1][0])*W[a]*(Dy*f4), I+114);
	    update((C[0][0])*W[a]*(f4), I+115);
	    double f41 = (Iz*pow(Cz,2) + B10*(3*Cz + Zij));
	    update((C[1][0])*W[a]*(f41*(Dy*Yij + Qy)), I+116);
	    update((C[1][0])*W[a]*(f41*(Dx*Xij + Qx)), I+117);
	    update((C[1][0])*W[a]*(Dx*Iy*f41), I+118);
	    update((C[0][0])*W[a]*(Iy*f41), I+119);
	    update((C[1][0])*W[a]*(Dy*Ix*f41), I+120);
	    update((C[0][0])*W[a]*(Ix*f41), I+121);
	    double f5 = (3*pow(B10,2) + B10*(6*Cy*Yij + pow(Yij,2) + 6*pow(Cy,2)) + pow(Cy,2)*pow(Iy,2));
	    update((C[1][0])*W[a]*(Dz*f5), I+122);
	    update((C[1][0])*W[a]*(Dx*f5), I+123);
	    update((C[0][0])*W[a]*(f5), I+124);
	    double f6 = (2*B00*Iy + Dy*(B10 + pow(Iy,2)));
	    update((C[1][0])*W[a]*(Cx*Cz*f6), I+125);
	    update((C[1][0])*W[a]*(Px*f6), I+126);
	    update((C[1][0])*W[a]*(Pz*f6), I+127);
	    double f7 = (Dx*Px + 2*B00*Cx);
	    update((C[1][0])*W[a]*(Cy*Iz*(Qx*Xij + f7)), I+128);
	    update((C[1][0])*W[a]*(Cz*Iy*(Qx*Xij + f7)), I+129);
	    update((C[1][0])*W[a]*(Cy*Cz*(Xij*(2*B00 + Dx*(Xij + 2*Cx)) + f7)), I+130);
	    update((C[1][0])*W[a]*(f28*(Qx*Xij + f7)), I+131);
	    update((C[1][0])*W[a]*(Py*(Xij*(2*B00 + Dx*(Xij + 2*Cx)) + f7)), I+132);
	    update((C[1][0])*W[a]*(Pz*(Xij*(2*B00 + Dx*(Xij + 2*Cx)) + f7)), I+133);
	    update((C[1][0])*W[a]*((Cz*Zij + Pz)*(Qx*Xij + f7)), I+134);
	    update((C[1][0])*W[a]*(Iy*Iz*f7), I+135);
	    update((C[1][0])*W[a]*(f7*(Zij*(2*Cz + Zij) + Pz)), I+136);
	    update((C[1][0])*W[a]*(f18*f7), I+137);
	    double f8 = (Cy*pow(Iy,2) + B10*(3*Cy + 2*Yij));
	    update((C[1][0])*W[a]*(Cx*Dz*f8), I+138);
	    update((C[1][0])*W[a]*(Qx*f8), I+139);
	    update((C[1][0])*W[a]*(Cz*Dx*f8), I+140);
	    update((C[0][0])*W[a]*(Cz*f8), I+141);
	    update((C[1][0])*W[a]*(Qz*f8), I+142);
	    update((C[0][0])*W[a]*(Cx*f8), I+143);
#undef pow

	}

    }

    /**
     * Permutes the 144 values of I in place: the value found at position k
     * on entry is stored at position index(k) on return.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[144]) {
	// Snapshot first: source and destination positions overlap.
	double saved[144];
	for (int k = 0; k < 144; ++k) {
	    saved[k] = I[k];
	}

#ifdef __CUDACC__
#pragma unroll
#endif
	for (int k = 0; k < 144; ++k) {
	    I[index(k)] = saved[k];
	}
    }

    /**
     * Returns entry i of the 144-element permutation applied by reorder().
     * No range check is made; i must lie in [0, 144).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short destination[144] = {
	    30, 25, 20, 61, 130, 113, 27, 99, 77, 22, 5, 74,
	    2, 109, 1, 56, 92, 102, 94, 65, 106, 100, 28, 101,
	    87, 34, 70, 51, 29, 15, 49, 13, 84, 12, 136, 137,
	    123, 142, 121, 120, 110, 128, 134, 140, 135, 133, 138, 39,
	    40, 60, 54, 118, 10, 46, 44, 8, 114, 6, 116, 75,
	    112, 111, 3, 76, 4, 91, 103, 124, 125, 89, 52, 88,
	    16, 53, 17, 143, 21, 129, 59, 131, 35, 71, 23, 69,
	    33, 141, 90, 132, 96, 24, 126, 18, 50, 86, 14, 93,
	    105, 107, 95, 55, 67, 127, 19, 139, 31, 73, 97, 85,
	    81, 83, 36, 79, 122, 108, 72, 0, 104, 62, 68, 32,
	    98, 26, 115, 43, 7, 82, 78, 80, 63, 58, 41, 57,
	    37, 38, 64, 66, 48, 42, 117, 45, 47, 11, 119, 9
	};
	return destination[i];
    }

    /**
     * Writes the complete 144-entry permutation to idx[0] ... idx[143], in
     * order.  The caller must provide room for 144 elements of type U.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 144; ++k) {
	    idx[k] = index(k);
	}
    }

};

template<>
struct impl<meta::braket<rysq::D, rysq::S, rysq::P, rysq::P> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /** Default accumulation functor: adds q to the slot Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q = *Q + q;
	}
    };

    /**
     * Fixed-size entry point: forwards to the functor-taking eval<3>() with
     * a default-constructed `update`, accumulating into the 54-element
     * array I.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL, class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline void eval(double A, double B,
			    const RAI &rAi, const RBK &rBk, const RAB &rAB,
			    const RIJ &rij, const RKL &rkl,
			    const T2_ &t2, const W_ &W,
			    const double (&C)[1][1],
			    double (&I)[54]) {
	const update accumulate = update();
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Convenience overload taking only the loop count M: forwards every
     * argument to the functor-taking eval<M>() below, supplying a
     * default-constructed `update`.
     */
    template<int M, class RAI, class RBK, class RAB,
	     class RIJ, class RKL, class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline void eval(double A, double B,
			    const RAI &rAi, const RBK &rBk, const RAB &rAB,
			    const RIJ &rij, const RKL &rkl,
			    const T2_ &t2, const W_ &W,
			    const double (&C)[1][1],
			    double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel: for each point a in [0, M) computes 54 weighted terms and
     * hands each one, together with its destination I+k (k = 0..53), to the
     * @p update functor.
     *
     * Every term has the form C[0][0] * W[a] * (polynomial in the per-point
     * coefficients below).  The terms are emitted in generated order; see
     * reorder() / index() for the permutation applied afterwards.
     *
     * @tparam M      number of points processed (loop trip count)
     * @tparam U      functor type, called as update(double, double*)
     * @param A,B     scalars entering the recurrence coefficients
     * @param rAi,rBk,rAB  indexable with [0..2]; inputs to recurrence::coefficient
     * @param rij     not referenced by this specialization
     * @param rkl     indexable with [0..2]; supplies Xkl, Ykl, Zkl
     * @param t2,W    indexable with [a]; presumably the quadrature roots and
     *                weights -- TODO confirm against the caller
     * @param C       single contraction coefficient C[0][0]
     * @param I       base pointer of the 54-element output
     * @param update  receives (term, I+k) for every term
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {


	// Cartesian components of rkl, used to build Kx/Ky/Kz and several terms.
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


	// Compiler-specific hints for the per-point loop (Intel: ivdep/aligned,
	// CUDA: unroll).
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Scalar coefficients: B10 = 0.5/A*(1 - B*t2), B01 = 0.5/B*(1 - A*t2).
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // C_q = rAi[q] - B*rAB[q]*t2
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // D_q = rBk[q] + A*rAB[q]*t2 (note the -A passed as the B argument)
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Local macro: pow(x,n) expands to the compile-time integer power
	    // template recurrence::pow<n>(x); it is #undef'd at the end of the body.
#define pow(x,y) recurrence::pow<y>((x))

	    // Shared sub-expressions reused by many of the terms below.
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    // Generated terms; fN are further common factors introduced just
	    // before their first use.
	    update((C[0][0])*W[a]*(Dz*Ky*Px), I+0);
	    update((C[0][0])*W[a]*(Dy*Kz*Px), I+1);
	    update((C[0][0])*W[a]*(Dz*Kx*Py), I+2);
	    update((C[0][0])*W[a]*(Dx*Kz*Py), I+3);
	    update((C[0][0])*W[a]*(Dx*Ky*Pz), I+4);
	    update((C[0][0])*W[a]*(Dy*Kx*Pz), I+5);
	    update((C[0][0])*W[a]*(Cz*Ky*Qx), I+6);
	    update((C[0][0])*W[a]*(Cy*Kz*Qx), I+7);
	    update((C[0][0])*W[a]*(Cz*Kx*Qy), I+8);
	    update((C[0][0])*W[a]*(Cx*Kz*Qy), I+9);
	    update((C[0][0])*W[a]*(Cy*Kx*Qz), I+10);
	    update((C[0][0])*W[a]*(Cx*Ky*Qz), I+11);
	    update((C[0][0])*W[a]*(Cy*Dx*(Cz*Zkl + Qz)), I+12);
	    update((C[0][0])*W[a]*(Cx*Dy*(Cz*Zkl + Qz)), I+13);
	    update((C[0][0])*W[a]*(Qx*(Cz*Zkl + Qz)), I+14);
	    update((C[0][0])*W[a]*(Qy*(Cz*Zkl + Qz)), I+15);
	    double f0 = (Dy*Py + 2*B00*Cy);
	    update((C[0][0])*W[a]*(Dz*(f0 + Py*Ykl)), I+16);
	    update((C[0][0])*W[a]*(Dx*(f0 + Py*Ykl)), I+17);
	    update((C[0][0])*W[a]*(Kx*f0), I+18);
	    update((C[0][0])*W[a]*(Kz*f0), I+19);
	    double f1 = (Dz*Pz + 2*B00*Cz);
	    update((C[0][0])*W[a]*(Dy*(Pz*Zkl + f1)), I+20);
	    update((C[0][0])*W[a]*(Dx*(Pz*Zkl + f1)), I+21);
	    update((C[0][0])*W[a]*(Kx*f1), I+22);
	    update((C[0][0])*W[a]*(Ky*f1), I+23);
	    double f13 = (Dx*Px + 2*B00*Cx);
	    update((C[0][0])*W[a]*(Ky*f13), I+24);
	    update((C[0][0])*W[a]*(Kz*f13), I+25);
	    double f15 = (Dz*Kz + B01);
	    update((C[0][0])*W[a]*(Cx*Cy*f15), I+26);
	    update((C[0][0])*W[a]*(Px*f15), I+27);
	    update((C[0][0])*W[a]*(Py*f15), I+28);
	    double f16 = (B00 + Cx*Kx);
	    update((C[0][0])*W[a]*(Cy*Dz*f16), I+29);
	    update((C[0][0])*W[a]*(Cz*Dy*f16), I+30);
	    update((C[0][0])*W[a]*(Qy*f16), I+31);
	    update((C[0][0])*W[a]*(Qz*f16), I+32);
	    double f17 = (B00 + Cy*Ky);
	    update((C[0][0])*W[a]*(Cz*Dx*f17), I+33);
	    update((C[0][0])*W[a]*(Cx*Dz*f17), I+34);
	    update((C[0][0])*W[a]*(Qx*f17), I+35);
	    update((C[0][0])*W[a]*(Qz*f17), I+36);
	    double f2 = (Cy*Dy*Ky + B01*Cy + B00*(Ykl + 2*Dy));
	    update((C[0][0])*W[a]*(Cz*f2), I+37);
	    update((C[0][0])*W[a]*(Cx*f2), I+38);
	    double f10 = B01*B10;
	    double f20 = 2*pow(B00,2);
	    update((C[0][0])*W[a]*((B01*pow(Cx,2) + f10 + f20 + 2*B00*Cx*(Xkl + 2*Dx) + Dx*Kx*Px)), I+39);
	    update((C[0][0])*W[a]*((f10 + f20 + Dy*Ky*Py + B01*pow(Cy,2) + 2*B00*Cy*(Ykl + 2*Dy))), I+40);
	    update((C[0][0])*W[a]*((f10 + f20 + B01*pow(Cz,2) + Dz*Kz*Pz + 2*B00*Cz*(2*Dz + Zkl))), I+41);
	    double f22 = (B01 + Dy*Ky);
	    update((C[0][0])*W[a]*(Cx*Cz*f22), I+42);
	    update((C[0][0])*W[a]*(Pz*f22), I+43);
	    update((C[0][0])*W[a]*(Px*f22), I+44);
	    double f4 = (B01*Cx + Cx*Dx*Kx + B00*(Xkl + 2*Dx));
	    update((C[0][0])*W[a]*(Cy*f4), I+45);
	    update((C[0][0])*W[a]*(Cz*f4), I+46);
	    double f6 = (B01*Cz + B00*(2*Dz + Zkl) + Cz*Dz*Kz);
	    update((C[0][0])*W[a]*(Cx*f6), I+47);
	    update((C[0][0])*W[a]*(Cy*f6), I+48);
	    double f7 = (B01 + Dx*Kx);
	    update((C[0][0])*W[a]*(Cy*Cz*f7), I+49);
	    update((C[0][0])*W[a]*(Py*f7), I+50);
	    update((C[0][0])*W[a]*(Pz*f7), I+51);
	    double f9 = (Kx*Px + 2*B00*Cx);
	    update((C[0][0])*W[a]*(Dy*f9), I+52);
	    update((C[0][0])*W[a]*(Dz*f9), I+53);
#undef pow

	}

    }

    /**
     * Permutes the 54 values of I in place from the order produced by
     * eval() into the final order.
     *
     * Element k of the incoming array ends up at position target[k]; the
     * table is the same permutation reported by index().
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[54]) {
	const unsigned short target[54] = {
	    30, 42, 13, 37, 20, 8, 22, 39, 11, 45, 17, 34,
	    41, 46, 40, 47, 31, 19, 7, 43, 44, 38, 14, 32,
	    18, 36, 51, 48, 49, 15, 10, 9, 16, 23, 33, 21,
	    35, 29, 27, 0, 25, 50, 28, 26, 24, 3, 4, 52,
	    53, 5, 1, 2, 6, 12
	};

	// Snapshot first: the permutation cannot be applied while reading
	// from the array that is being overwritten.
	double saved[54];
	for (int k = 0; k < 54; ++k) {
	    saved[k] = I[k];
	}
	for (int k = 0; k < 54; ++k) {
	    I[target[k]] = saved[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[54] = { 39, 50, 51, 45, 46, 49, 52, 18, 5, 31, 30, 8, 53, 2, 22, 29, 32, 10, 24, 17, 4, 35, 6, 33, 44, 40, 43, 38, 42, 37, 0, 16, 23, 34, 11, 36, 25, 3, 21, 7, 14, 12, 1, 19, 20, 9, 13, 15, 27, 28, 41, 26, 47, 48 };
// 	if (index < 54) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the final position of the i-th value produced by eval().
     * @param i  generated-order index; must lie in [0, 54) -- not checked
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	    30, 42, 13, 37, 20, 8, 22, 39, 11, 45, 17, 34,
	    41, 46, 40, 47, 31, 19, 7, 43, 44, 38, 14, 32,
	    18, 36, 51, 48, 49, 15, 10, 9, 16, 23, 33, 21,
	    35, 29, 27, 0, 25, 50, 28, 26, 24, 3, 4, 52,
	    53, 5, 1, 2, 6, 12
	};
	const int target = permutation[i];
	return target;
    }

    /**
     * Writes the whole 54-entry permutation (generated order -> final
     * position) to idx[0..53].
     * @param idx  destination with room for at least 54 elements
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int permutation[54] = {
	    30, 42, 13, 37, 20, 8, 22, 39, 11, 45, 17, 34,
	    41, 46, 40, 47, 31, 19, 7, 43, 44, 38, 14, 32,
	    18, 36, 51, 48, 49, 15, 10, 9, 16, 23, 33, 21,
	    35, 29, 27, 0, 25, 50, 28, 26, 24, 3, 4, 52,
	    53, 5, 1, 2, 6, 12
	};
	for (int k = 0; k < 54; ++k) {
	    idx[k] = permutation[k];
	}
    }


};

template<>
struct impl<meta::braket<rysq::SP, rysq::SP, rysq::P, rysq::P> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /** Default accumulation functor: adds the term q to the slot Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /**
     * Fixed-size entry point: processes 3 points and accumulates the
     * 144 terms into I with the default @c update functor.
     *
     * All arguments are forwarded unchanged to the functor-taking overload.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][4],
	      double (&I)[144]) {
	const update accumulate = update();
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Convenience overload without an explicit update functor.
     *
     * Forwards every argument unchanged to the functor-taking overload,
     * supplying a default-constructed @c update (which adds each term to
     * the slot it is given).
     *
     * @tparam M  number of entries of @p t2 / @p W that are processed
     * @param  I  output buffer the terms are accumulated into
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][4],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel of the (SP SP | P P) specialization: for each point a in
     * [0, M) computes 144 weighted terms and hands each one, together with
     * its destination I+k (k = 0..143), to the @p update functor.
     *
     * Every term has the form C[0][j] * W[a] * (polynomial in the per-point
     * coefficients below), with j in 0..3 selecting one of the four
     * contraction coefficients.  The terms are emitted in generated order;
     * see reorder() / index() for the permutation applied afterwards.
     *
     * @tparam M      number of points processed (loop trip count)
     * @tparam U      functor type, called as update(double, double*)
     * @param A,B     scalars entering the recurrence coefficients
     * @param rAi,rBk,rAB  indexable with [0..2]; inputs to recurrence::coefficient
     * @param rij     indexable with [0..2]; supplies Xij, Yij, Zij
     * @param rkl     indexable with [0..2]; supplies Xkl, Ykl, Zkl
     * @param t2,W    indexable with [a]; presumably the quadrature roots and
     *                weights -- TODO confirm against the caller
     * @param C       contraction coefficients C[0][0..3]
     * @param I       base pointer of the 144-element output
     * @param update  receives (term, I+k) for every term
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][4],
	      double *I, const U &update) {

	// Cartesian components of rij, used to build Ix/Iy/Iz and several terms.
	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	// Cartesian components of rkl, used to build Kx/Ky/Kz and several terms.
	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];


	// Compiler-specific hints for the per-point loop (Intel: ivdep/aligned,
	// CUDA: unroll).
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // Scalar coefficients: B10 = 0.5/A*(1 - B*t2), B01 = 0.5/B*(1 - A*t2).
	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    // C_q = rAi[q] - B*rAB[q]*t2
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // D_q = rBk[q] + A*rAB[q]*t2 (note the -A passed as the B argument)
	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Local macro: pow(x,n) expands to the compile-time integer power
	    // template recurrence::pow<n>(x); it is #undef'd at the end of the body.
#define pow(x,y) recurrence::pow<y>((x))

	    // Shared sub-expressions reused by many of the terms below.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Kx = (Xkl + Dx);
 	    double Ky = (Ykl + Dy);
 	    double Kz = (Dz + Zkl);
 	    double Qx = (Cx*Dx + B00);
 	    double Qy = (Cy*Dy + B00);
 	    double Qz = (Cz*Dz + B00);

	    // Generated terms; fN are further common factors introduced just
	    // before their first use.
	    update((C[0][2])*W[a]*(Dz*Iy*Kx), I+0);
	    update((C[0][1])*W[a]*(Cy*Dz*Kx), I+1);
	    update((C[0][0])*W[a]*(Dz*Kx), I+2);
	    update((C[0][2])*W[a]*(Dy*Iz*Kx), I+3);
	    update((C[0][1])*W[a]*(Cz*Dy*Kx), I+4);
	    update((C[0][0])*W[a]*(Dy*Kx), I+5);
	    update((C[0][2])*W[a]*(Dx*Iz*Ky), I+6);
	    update((C[0][1])*W[a]*(Cz*Dx*Ky), I+7);
	    update((C[0][0])*W[a]*(Dx*Ky), I+8);
	    update((C[0][2])*W[a]*(Dz*Ix*Ky), I+9);
	    update((C[0][0])*W[a]*(Dz*Ky), I+10);
	    update((C[0][1])*W[a]*(Cx*Dz*Ky), I+11);
	    update((C[0][2])*W[a]*(Dy*Ix*Kz), I+12);
	    update((C[0][1])*W[a]*(Cx*Dy*Kz), I+13);
	    update((C[0][2])*W[a]*(Dx*Iy*Kz), I+14);
	    update((C[0][1])*W[a]*(Cy*Dx*Kz), I+15);
	    update((C[0][0])*W[a]*(Dx*Kz), I+16);
	    update((C[0][0])*W[a]*(Dy*Kz), I+17);
	    update((C[0][3])*W[a]*(Iz*Ky*Qx), I+18);
	    update((C[0][1])*W[a]*(Ky*Qx), I+19);
	    update((C[0][3])*W[a]*(Iy*Kz*Qx), I+20);
	    update((C[0][1])*W[a]*(Kz*Qx), I+21);
	    update((C[0][3])*W[a]*(Ix*Kz*Qy), I+22);
	    update((C[0][3])*W[a]*(Iz*Kx*Qy), I+23);
	    update((C[0][1])*W[a]*(Kx*Qy), I+24);
	    update((C[0][1])*W[a]*(Kz*Qy), I+25);
	    update((C[0][3])*W[a]*(Ix*Ky*Qz), I+26);
	    update((C[0][1])*W[a]*(Ky*Qz), I+27);
	    update((C[0][3])*W[a]*(Iy*Kx*Qz), I+28);
	    update((C[0][1])*W[a]*(Kx*Qz), I+29);
	    update((C[0][3])*W[a]*(Cy*Kz*(Dx*Xij + Qx)), I+30);
	    update((C[0][2])*W[a]*(Kz*(Dx*Xij + Qx)), I+31);
	    update((C[0][3])*W[a]*(Cz*Ky*(Dx*Xij + Qx)), I+32);
	    update((C[0][2])*W[a]*(Ky*(Dx*Xij + Qx)), I+33);
	    update((C[0][3])*W[a]*(Dy*Ix*(Cz*Zkl + Qz)), I+34);
	    update((C[0][3])*W[a]*((Cz*Zkl + Qz)*(Dx*Xij + Qx)), I+35);
	    update((C[0][3])*W[a]*(Dx*Iy*(Cz*Zkl + Qz)), I+36);
	    update((C[0][1])*W[a]*(Dx*(Cz*Zkl + Qz)), I+37);
	    update((C[0][1])*W[a]*(Dy*(Cz*Zkl + Qz)), I+38);
	    double f14 = (B00 + Dz*Iz);
	    update((C[0][3])*W[a]*(Qx*(f14 + Iz*Zkl)), I+39);
	    update((C[0][3])*W[a]*(Cx*Dy*(f14 + Iz*Zkl)), I+40);
	    update((C[0][2])*W[a]*(Dy*(f14 + Iz*Zkl)), I+41);
	    update((C[0][3])*W[a]*(Qy*(f14 + Iz*Zkl)), I+42);
	    update((C[0][2])*W[a]*(Dx*(f14 + Iz*Zkl)), I+43);
	    update((C[0][3])*W[a]*(Cy*Dx*(f14 + Iz*Zkl)), I+44);
	    update((C[0][3])*W[a]*(Cy*Kx*f14), I+45);
	    update((C[0][2])*W[a]*(Kx*f14), I+46);
	    update((C[0][3])*W[a]*(Cx*Ky*f14), I+47);
	    update((C[0][2])*W[a]*(Ky*f14), I+48);
	    double f18 = (B00 + Iy*Ky);
	    update((C[0][3])*W[a]*(Qx*f18), I+49);
	    update((C[0][3])*W[a]*(Cx*Dz*f18), I+50);
	    update((C[0][2])*W[a]*(Dz*f18), I+51);
	    update((C[0][3])*W[a]*(Qz*f18), I+52);
	    update((C[0][2])*W[a]*(Dx*f18), I+53);
	    update((C[0][3])*W[a]*(Cz*Dx*f18), I+54);
	    double f19 = (B00 + Cy*Ky);
	    update((C[0][3])*W[a]*(f19*(Dx*Xij + Qx)), I+55);
	    update((C[0][3])*W[a]*(Dx*Iz*f19), I+56);
	    update((C[0][1])*W[a]*(Dx*f19), I+57);
	    update((C[0][3])*W[a]*(Dz*Ix*f19), I+58);
	    update((C[0][1])*W[a]*(Dz*f19), I+59);
	    update((C[0][3])*W[a]*(f14*f19), I+60);
	    double f20 = (Dy*Iy + B00);
	    update((C[0][3])*W[a]*(f20*(Cz*Zkl + Qz)), I+61);
	    update((C[0][3])*W[a]*(Cx*Kz*f20), I+62);
	    update((C[0][2])*W[a]*(Kx*f20), I+63);
	    update((C[0][3])*W[a]*(Cz*Kx*f20), I+64);
	    update((C[0][2])*W[a]*(Kz*f20), I+65);
	    double f22 = (Dz*(B10 + Cz*Iz) + B00*(2*Cz + Zij));
	    update((C[0][3])*W[a]*(Dx*(f22 + Zkl*(B10 + Cz*Iz))), I+66);
	    update((C[0][3])*W[a]*(Dy*(f22 + Zkl*(B10 + Cz*Iz))), I+67);
	    update((C[0][3])*W[a]*(Kx*f22), I+68);
	    update((C[0][3])*W[a]*(Ky*f22), I+69);
	    double f24 = (Cx*Ix + B10);
	    update((C[0][3])*W[a]*(Dy*Kz*f24), I+70);
	    update((C[0][3])*W[a]*(Dz*Ky*f24), I+71);
	    double f27 = (Kx*(Cx*Ix + B10) + B00*(Xij + 2*Cx));
	    update((C[0][3])*W[a]*(Dz*f27), I+72);
	    update((C[0][3])*W[a]*(Dy*f27), I+73);
	    double f3 = (Dz*Kz + B01);
	    update((C[0][3])*W[a]*(Cy*Ix*f3), I+74);
	    update((C[0][3])*W[a]*(Cx*Iy*f3), I+75);
	    update((C[0][1])*W[a]*(Cx*f3), I+76);
	    update((C[0][1])*W[a]*(Cy*f3), I+77);
	    update((C[0][2])*W[a]*(Ix*f3), I+78);
	    update((C[0][3])*W[a]*(f24*f3), I+79);
	    update((C[0][2])*W[a]*(Iy*f3), I+80);
	    update((C[0][0])*W[a]*(f3), I+81);
	    double f30 = (Dx*(Cx*Ix + B10) + B00*(Xij + 2*Cx));
	    update((C[0][3])*W[a]*(Ky*f30), I+82);
	    update((C[0][3])*W[a]*(Kz*f30), I+83);
	    double f31 = (B01 + Dx*Kx);
	    update((C[0][2])*W[a]*(Iy*f31), I+84);
	    update((C[0][3])*W[a]*(Cz*Iy*f31), I+85);
	    update((C[0][1])*W[a]*(Cz*f31), I+86);
	    update((C[0][2])*W[a]*(Iz*f31), I+87);
	    update((C[0][3])*W[a]*(Cy*Iz*f31), I+88);
	    update((C[0][1])*W[a]*(Cy*f31), I+89);
	    update((C[0][0])*W[a]*(f31), I+90);
	    double f32 = (Cy*Iy + B10);
	    update((C[0][3])*W[a]*(Dz*Kx*f32), I+91);
	    update((C[0][3])*W[a]*(Dx*Kz*f32), I+92);
	    update((C[0][3])*W[a]*(f3*f32), I+93);
	    update((C[0][3])*W[a]*(f31*f32), I+94);
	    double f33 = (B00*(Yij + 2*Cy) + Ky*(Cy*Iy + B10));
	    update((C[0][3])*W[a]*(Dx*f33), I+95);
	    update((C[0][3])*W[a]*(Dz*f33), I+96);
	    double f34 = (B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10));
	    update((C[0][3])*W[a]*(Kx*f34), I+97);
	    update((C[0][3])*W[a]*(Kz*f34), I+98);
	    double f36 = 2*B00*Dz;
	    double f13 = Cz*pow(Dz,2);
	    double f1 = B00*Zkl;
	    double f21 = Cz*Dz*Zkl;
	    update((C[0][3])*W[a]*(Cy*(f13 + f21 + f36 + Dz*Kz*Zij + f1 + B01*Iz)), I+99);
	    update((C[0][3])*W[a]*(Cx*(f13 + f21 + f36 + Dz*Kz*Zij + f1 + B01*Iz)), I+100);
	    update((C[0][2])*W[a]*((f13 + f21 + f36 + Dz*Kz*Zij + f1 + B01*Iz)), I+101);
	    double f37 = (B01 + Dy*Ky);
	    update((C[0][3])*W[a]*(f24*f37), I+102);
	    update((C[0][2])*W[a]*(Ix*f37), I+103);
	    update((C[0][3])*W[a]*(Cz*Ix*f37), I+104);
	    update((C[0][1])*W[a]*(Cz*f37), I+105);
	    update((C[0][2])*W[a]*(Iz*f37), I+106);
	    update((C[0][3])*W[a]*(Cx*Iz*f37), I+107);
	    update((C[0][1])*W[a]*(Cx*f37), I+108);
	    update((C[0][0])*W[a]*(f37), I+109);
	    double f4 = (B00 + Cx*Kx);
	    update((C[0][3])*W[a]*(Cy*Dz*(Kx*Xij + f4)), I+110);
	    update((C[0][2])*W[a]*(Dz*(Kx*Xij + f4)), I+111);
	    update((C[0][3])*W[a]*(Qy*(Kx*Xij + f4)), I+112);
	    update((C[0][3])*W[a]*(Cz*Dy*(Kx*Xij + f4)), I+113);
	    update((C[0][2])*W[a]*(Dy*(Kx*Xij + f4)), I+114);
	    update((C[0][3])*W[a]*(Qz*(Kx*Xij + f4)), I+115);
	    update((C[0][3])*W[a]*(f14*f4), I+116);
	    update((C[0][3])*W[a]*(f20*f4), I+117);
	    update((C[0][1])*W[a]*(Dz*f4), I+118);
	    update((C[0][3])*W[a]*(Dz*Iy*f4), I+119);
	    update((C[0][1])*W[a]*(Dy*f4), I+120);
	    update((C[0][3])*W[a]*(Dy*Iz*f4), I+121);
	    double f5 = (B00*(Ykl + 2*Dy) + Iy*(B01 + Dy*Ky));
	    update((C[0][3])*W[a]*(Cx*f5), I+122);
	    update((C[0][3])*W[a]*(Cz*f5), I+123);
	    update((C[0][2])*W[a]*(f5), I+124);
	    double f2 = B01*B10;
	    double f51 = B01*Cz;
	    double f46 = 2*pow(B00,2);
	    update((C[0][3])*W[a]*((Dz*Kz*pow(Cz,2) + f46 + Zij*(f13 + f21 + f36 + f1) + B10*Dz*Zkl + B10*pow(Dz,2) + 2*Cz*(f36 + f1) + f2 + f51*(Cz + Zij))), I+125);
	    update((C[0][3])*W[a]*(Iy*(f13 + f51 + f21 + f36 + f1)), I+126);
	    update((C[0][3])*W[a]*(Ix*(f13 + f51 + f21 + f36 + f1)), I+127);
	    update((C[0][1])*W[a]*((f13 + f51 + f21 + f36 + f1)), I+128);
	    double f52 = (B10 + Cz*Iz);
	    update((C[0][3])*W[a]*(Dy*Kx*f52), I+129);
	    update((C[0][3])*W[a]*(Dx*Ky*f52), I+130);
	    update((C[0][3])*W[a]*(f31*f52), I+131);
	    update((C[0][3])*W[a]*(f37*f52), I+132);
	    double f45 = Cy*pow(Dy,2);
	    double f23 = B01*Cy;
	    double f6 = B00*Ykl;
	    double f7 = Cy*Dy*Ykl;
	    double f28 = 2*B00*Dy;
	    update((C[0][3])*W[a]*((f46 + f23*Yij + 2*Cy*(f28 + f6) + B01*pow(Cy,2) + Dy*Ky*pow(Cy,2) + f2 + B10*pow(Dy,2) + B10*Dy*Ykl + Yij*(f45 + f28 + f7 + f6))), I+133);
	    update((C[0][3])*W[a]*(Iz*(f45 + f28 + f23 + f7 + f6)), I+134);
	    update((C[0][3])*W[a]*(Ix*(f45 + f28 + f23 + f7 + f6)), I+135);
	    update((C[0][1])*W[a]*((f45 + f28 + f23 + f7 + f6)), I+136);
	    double f39 = B01*Cx;
	    double f8 = Cx*Dx*Xkl;
	    double f42 = Cx*pow(Dx,2);
	    double f10 = B00*Xkl;
	    double f9 = 2*B00*Dx;
	    update((C[0][3])*W[a]*((f46 + 2*Cx*(f10 + f9) + B10*Dx*Xkl + f39*(Cx + Xij) + B10*pow(Dx,2) + f2 + Xij*(f10 + f42 + f9 + f8) + Dx*Kx*pow(Cx,2))), I+137);
	    update((C[0][3])*W[a]*(Cy*(f10 + f42 + f9 + f8 + B01*Ix + Dx*Kx*Xij)), I+138);
	    update((C[0][3])*W[a]*(Cz*(f10 + f42 + f9 + f8 + B01*Ix + Dx*Kx*Xij)), I+139);
	    update((C[0][2])*W[a]*((f10 + f42 + f9 + f8 + B01*Ix + Dx*Kx*Xij)), I+140);
	    update((C[0][3])*W[a]*(Iy*(f10 + f42 + f39 + f9 + f8)), I+141);
	    update((C[0][3])*W[a]*(Iz*(f10 + f42 + f39 + f9 + f8)), I+142);
	    update((C[0][1])*W[a]*((f10 + f42 + f39 + f9 + f8)), I+143);
#undef pow

	}

    }

    /**
     * Permutes the 144 values of I in place from the order produced by
     * eval() into the final order.
     *
     * Element k of the incoming array ends up at position target[k]; the
     * table is the same permutation reported by index().
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[144]) {
	const unsigned short target[144] = {
	    40, 34, 32, 28, 19, 16, 60, 51, 48, 84, 80, 81,
	    116, 113, 104, 98, 96, 112, 61, 49, 105, 97, 118, 30,
	    18, 114, 87, 83, 43, 35, 102, 100, 55, 52, 119, 103,
	    107, 99, 115, 109, 125, 124, 126, 108, 110, 46, 44, 93,
	    92, 57, 89, 88, 91, 56, 59, 54, 62, 50, 86, 82,
	    94, 123, 121, 24, 27, 120, 111, 127, 47, 95, 117, 85,
	    37, 21, 134, 137, 129, 130, 132, 133, 136, 128, 53, 101,
	    8, 11, 3, 12, 14, 2, 0, 42, 106, 138, 10, 58,
	    90, 26, 122, 142, 141, 140, 69, 68, 71, 67, 76, 77,
	    65, 64, 38, 36, 22, 23, 20, 39, 45, 25, 33, 41,
	    17, 29, 73, 75, 72, 143, 139, 135, 131, 31, 63, 15,
	    79, 74, 78, 70, 66, 5, 6, 7, 4, 9, 13, 1
	};

	// Snapshot first: the permutation cannot be applied while reading
	// from the array that is being overwritten.
	double saved[144];
	for (int k = 0; k < 144; ++k) {
	    saved[k] = I[k];
	}
	for (int k = 0; k < 144; ++k) {
	    I[target[k]] = saved[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[144] = { 90, 143, 89, 86, 140, 137, 138, 139, 84, 141, 94, 85, 87, 142, 88, 131, 5, 120, 24, 4, 114, 73, 112, 113, 63, 117, 97, 64, 3, 121, 23, 129, 2, 118, 1, 29, 111, 72, 110, 115, 0, 119, 91, 28, 46, 116, 45, 68, 8, 19, 57, 7, 33, 82, 55, 32, 53, 49, 95, 54, 6, 18, 56, 130, 109, 108, 136, 105, 103, 102, 135, 104, 124, 122, 133, 123, 106, 107, 134, 132, 10, 11, 59, 27, 9, 71, 58, 26, 51, 50, 96, 52, 48, 47, 60, 69, 16, 21, 15, 37, 31, 83, 30, 35, 14, 20, 92, 36, 43, 39, 44, 66, 17, 13, 25, 38, 12, 70, 22, 34, 65, 62, 98, 61, 41, 40, 42, 67, 81, 76, 77, 128, 78, 79, 74, 127, 80, 75, 93, 126, 101, 100, 99, 125 };
// 	if (index < 144) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns the final position of the i-th value produced by eval().
     * @param i  generated-order index; must lie in [0, 144) -- not checked
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	    40, 34, 32, 28, 19, 16, 60, 51, 48, 84, 80, 81,
	    116, 113, 104, 98, 96, 112, 61, 49, 105, 97, 118, 30,
	    18, 114, 87, 83, 43, 35, 102, 100, 55, 52, 119, 103,
	    107, 99, 115, 109, 125, 124, 126, 108, 110, 46, 44, 93,
	    92, 57, 89, 88, 91, 56, 59, 54, 62, 50, 86, 82,
	    94, 123, 121, 24, 27, 120, 111, 127, 47, 95, 117, 85,
	    37, 21, 134, 137, 129, 130, 132, 133, 136, 128, 53, 101,
	    8, 11, 3, 12, 14, 2, 0, 42, 106, 138, 10, 58,
	    90, 26, 122, 142, 141, 140, 69, 68, 71, 67, 76, 77,
	    65, 64, 38, 36, 22, 23, 20, 39, 45, 25, 33, 41,
	    17, 29, 73, 75, 72, 143, 139, 135, 131, 31, 63, 15,
	    79, 74, 78, 70, 66, 5, 6, 7, 4, 9, 13, 1
	};
	const int target = permutation[i];
	return target;
    }

    /**
     * Writes the whole 144-entry permutation (generated order -> final
     * position) to idx[0..143].
     * @param idx  destination with room for at least 144 elements
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int permutation[144] = {
	    40, 34, 32, 28, 19, 16, 60, 51, 48, 84, 80, 81,
	    116, 113, 104, 98, 96, 112, 61, 49, 105, 97, 118, 30,
	    18, 114, 87, 83, 43, 35, 102, 100, 55, 52, 119, 103,
	    107, 99, 115, 109, 125, 124, 126, 108, 110, 46, 44, 93,
	    92, 57, 89, 88, 91, 56, 59, 54, 62, 50, 86, 82,
	    94, 123, 121, 24, 27, 120, 111, 127, 47, 95, 117, 85,
	    37, 21, 134, 137, 129, 130, 132, 133, 136, 128, 53, 101,
	    8, 11, 3, 12, 14, 2, 0, 42, 106, 138, 10, 58,
	    90, 26, 122, 142, 141, 140, 69, 68, 71, 67, 76, 77,
	    65, 64, 38, 36, 22, 23, 20, 39, 45, 25, 33, 41,
	    17, 29, 73, 75, 72, 143, 139, 135, 131, 31, 63, 15,
	    79, 74, 78, 70, 66, 5, 6, 7, 4, 9, 13, 1
	};
	for (int k = 0; k < 144; ++k) {
	    idx[k] = permutation[k];
	}
    }


};

/**
 * Quadrature kernel specialization for the (S S | S SP) braket.
 *
 * Produces 4 values per evaluation: one term weighted by C[0][0] and three
 * terms weighted by C[1][0].  For this braket the generated order already
 * equals the final order, so reorder() and index() are identity mappings.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::S, rysq::S, rysq::SP> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 1;

    /** Default accumulation functor: adds the term q to the slot Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /**
     * Fixed-size entry point: processes 1 point and accumulates the 4 terms
     * into I with the default update functor.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double (&I)[4]) {
	const update accumulate = update();
	eval<1>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Same as the functor-taking overload below, using the default update
     * functor.
     * @tparam M  number of entries of t2 / W that are processed
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel: for each point in [0, M) passes four weighted terms and
     * their destinations I+0 .. I+3 to the update functor.
     *
     * Only A, rBk, rAB, rkl, t2, W and C enter the result; B, rAi and rij
     * are accepted for signature uniformity but not referenced.
     *
     * @tparam M      number of points processed (loop trip count)
     * @tparam U      functor type, called as update(double, double*)
     * @param update  receives (term, I+k) for every term
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_,
	     class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[2][1],
	      double *I, const U &update) {

	// Cartesian components of rkl.
	const double &kl_x = rkl[0];
	const double &kl_y = rkl[1];
	const double &kl_z = rkl[2];

	// Compiler-specific hints for the per-point loop.
#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int point = 0; point < M; ++point) {
	    // D_q = rBk[q] + A*rAB[q]*t2
	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[point]);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[point]);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[point]);

	    const double Kx = kl_x + Dx;
	    const double Ky = kl_y + Dy;
	    const double Kz = Dz + kl_z;

	    // Constant term first, then the three Cartesian terms.
	    update((C[0][0])*W[point]*(1), I+0);
	    update((C[1][0])*W[point]*(Kx), I+1);
	    update((C[1][0])*W[point]*(Ky), I+2);
	    update((C[1][0])*W[point]*(Kz), I+3);
	}
    }

    /**
     * Permutes I from generated order into final order.  The permutation is
     * the identity for this braket, so every value is written back to the
     * slot it came from.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[4]) {
	double saved[4];
	for (int k = 0; k < 4; ++k) {
	    saved[k] = I[k];
	}
	for (int k = 0; k < 4; ++k) {
	    I[k] = saved[k];
	}
    }

    /**
     * Returns the final position of the i-th value produced by eval().
     * @param i  generated-order index; must lie in [0, 4) -- not checked
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = { 0, 1, 2, 3 };
	const int target = permutation[i];
	return target;
    }

    /**
     * Writes the whole 4-entry permutation to idx[0..3].
     * @param idx  destination with room for at least 4 elements
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 4; ++k) {
	    idx[k] = k;
	}
    }

};

template<>
struct impl<meta::braket<rysq::F, rysq::D, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /** Default accumulation functor: adds the term q to the slot Q points at. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] += q;
	}
    };

    /**
     * Fixed-size entry point: processes 3 points and accumulates the
     * 60 terms into I with the default @c update functor.
     *
     * All arguments are forwarded unchanged to the functor-taking overload.
     */
    template<class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[60]) {
	const update accumulate = update();
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Convenience overload without an explicit update functor.
     *
     * Forwards every argument unchanged to the functor-taking overload,
     * supplying a default-constructed @c update (which adds each term to
     * the slot it is given).
     *
     * @tparam M  number of entries of @p t2 / @p W that are processed
     * @param  I  output buffer the terms are accumulated into
     */
    template<int M,
	     class RAI, class RBK, class RAB,
	     class RIJ, class RKL,
	     class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk, const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	const update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel of the (F D | S S) specialization: for each point a in
     * [0, M) computes 60 weighted terms and hands each one, together with
     * its destination I+k (k = 0..59), to the @p update functor.
     *
     * Every term has the form C[0][0] * W[a] * (polynomial in B10, Cx/Cy/Cz
     * and the rij components).  The terms are emitted in generated order;
     * see reorder() / index() for the permutation applied afterwards.
     *
     * @tparam M      number of points processed (loop trip count)
     * @tparam U      functor type, called as update(double, double*)
     * @param A,B     scalars entering the recurrence coefficients
     * @param rAi,rAB indexable with [0..2]; inputs to recurrence::coefficient
     * @param rBk,rkl not referenced by this specialization
     * @param rij     indexable with [0..2]; supplies Xij, Yij, Zij
     * @param t2,W    indexable with [a]; presumably the quadrature roots and
     *                weights -- TODO confirm against the caller
     * @param C       single contraction coefficient C[0][0]
     * @param I       base pointer of the 60-element output
     * @param update  receives (term, I+k) for every term
     */
    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	// Cartesian components of rij, used to build Ix/Iy/Iz and several terms.
	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];



	// Compiler-specific hints for the per-point loop (Intel: ivdep/aligned,
	// CUDA: unroll).
#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // B10 = 0.5/A*(1 - B*t2)
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    // C_q = rAi[q] - B*rAB[q]*t2
	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);


	    // Local macro: pow(x,n) expands to the compile-time integer power
	    // template recurrence::pow<n>(x); it is #undef'd at the end of the body.
#define pow(x,y) recurrence::pow<y>((x))

	    // Shared sub-expressions reused by many of the terms below.
 	    double Ix = (Cx + Xij);
 	    double Iy = (Cy + Yij);
 	    double Iz = (Cz + Zij);
 	    double Px = (B10 + pow(Cx,2));
 	    double Py = (B10 + pow(Cy,2));
 	    double Pz = (B10 + pow(Cz,2));

	    // Generated terms; fN are further common factors introduced just
	    // before their first use.
	    update((C[0][0])*W[a]*(Cz*Py*(Xij*(Xij + 2*Cx) + Px)), I+0);
	    update((C[0][0])*W[a]*(Iz*Py*(Px + Cx*Xij)), I+1);
	    update((C[0][0])*W[a]*(Cy*Pz*(Xij*(Xij + 2*Cx) + Px)), I+2);
	    update((C[0][0])*W[a]*(Iy*Pz*(Px + Cx*Xij)), I+3);
	    update((C[0][0])*W[a]*((B10*Cx*(12*Cx*Xij + 10*pow(Cx,2) + 3*pow(Xij,2)) + 3*pow(B10,2)*(5*Cx + 2*Xij) + pow(Cx,3)*pow(Ix,2))), I+4);
	    update((C[0][0])*W[a]*((3*pow(B10,2)*(5*Cy + 2*Yij) + B10*Cy*(12*Cy*Yij + 3*pow(Yij,2) + 10*pow(Cy,2)) + pow(Cy,3)*pow(Iy,2))), I+5);
	    update((C[0][0])*W[a]*((3*pow(B10,2)*(5*Cz + 2*Zij) + B10*Cz*(3*pow(Zij,2) + 10*pow(Cz,2) + 12*Cz*Zij) + pow(Cz,3)*pow(Iz,2))), I+6);
	    double f0 = (3*B10 + pow(Cx,2));
	    update((C[0][0])*W[a]*(Cx*Iy*Iz*f0), I+7);
	    double f10 = (Cy*Iy + B10);
	    update((C[0][0])*W[a]*(Cz*f10*(Px + Cx*Xij)), I+8);
	    update((C[0][0])*W[a]*(Ix*Pz*f10), I+9);
	    update((C[0][0])*W[a]*(Iz*Px*f10), I+10);
	    double f11 = (Cy*pow(Iy,2) + B10*(3*Cy + 2*Yij));
	    update((C[0][0])*W[a]*(Cx*Cz*f11), I+11);
	    update((C[0][0])*W[a]*(Pz*f11), I+12);
	    update((C[0][0])*W[a]*(Px*f11), I+13);
	    double f12 = (B10*(3*Cx + Xij) + Ix*pow(Cx,2));
	    update((C[0][0])*W[a]*(Cy*Iz*f12), I+14);
	    update((C[0][0])*W[a]*(Cz*Iy*f12), I+15);
	    update((C[0][0])*W[a]*(f10*f12), I+16);
	    double f13 = (B10 + Cz*Iz);
	    update((C[0][0])*W[a]*(Cy*f13*(Px + Cx*Xij)), I+17);
	    update((C[0][0])*W[a]*(Ix*Py*f13), I+18);
	    update((C[0][0])*W[a]*(Iy*Px*f13), I+19);
	    update((C[0][0])*W[a]*(Cx*f10*f13), I+20);
	    update((C[0][0])*W[a]*(f12*f13), I+21);
	    double f14 = (3*pow(B10,2) + 3*B10*Cx*(Xij + 2*Cx) + Ix*pow(Cx,3));
	    update((C[0][0])*W[a]*(Iz*f14), I+22);
	    update((C[0][0])*W[a]*(Iy*f14), I+23);
	    double f15 = (B10 + pow(Iy,2));
	    update((C[0][0])*W[a]*(Cx*Pz*f15), I+24);
	    update((C[0][0])*W[a]*(Cx*f0*f15), I+25);
	    update((C[0][0])*W[a]*(Cz*Px*f15), I+26);
	    double f16 = (B10 + pow(Iz,2));
	    update((C[0][0])*W[a]*(Cy*Px*f16), I+27);
	    update((C[0][0])*W[a]*(Cx*Py*f16), I+28);
	    update((C[0][0])*W[a]*(Cx*f0*f16), I+29);
	    double f19 = (3*pow(B10,2) + 3*B10*Cz*(2*Cz + Zij) + Iz*pow(Cz,3));
	    update((C[0][0])*W[a]*(Ix*f19), I+30);
	    update((C[0][0])*W[a]*(Iy*f19), I+31);
	    double f2 = (B10*(3*Cx + 2*Xij) + Cx*pow(Ix,2));
	    update((C[0][0])*W[a]*(Cy*Cz*f2), I+32);
	    update((C[0][0])*W[a]*(Py*f2), I+33);
	    update((C[0][0])*W[a]*(Pz*f2), I+34);
	    double f21 = (3*B10 + pow(Cz,2));
	    update((C[0][0])*W[a]*(Cz*f21*(Xij*(Xij + 2*Cx) + Px)), I+35);
	    update((C[0][0])*W[a]*(Cz*Ix*Iy*f21), I+36);
	    update((C[0][0])*W[a]*(Cz*f15*f21), I+37);
	    double f22 = (3*B10 + pow(Cy,2));
	    update((C[0][0])*W[a]*(Cy*f22*(Xij*(Xij + 2*Cx) + Px)), I+38);
	    update((C[0][0])*W[a]*(Cy*Ix*Iz*f22), I+39);
	    update((C[0][0])*W[a]*(Cy*f16*f22), I+40);
	    double f23 = (Iz*pow(Cz,2) + B10*(3*Cz + Zij));
	    update((C[0][0])*W[a]*(Cy*Ix*f23), I+41);
	    update((C[0][0])*W[a]*(f23*(Px + Cx*Xij)), I+42);
	    update((C[0][0])*W[a]*(Cx*Iy*f23), I+43);
	    update((C[0][0])*W[a]*(f10*f23), I+44);
	    double f26 = (3*pow(B10,2) + Iy*pow(Cy,3) + 3*B10*Cy*(Yij + 2*Cy));
	    update((C[0][0])*W[a]*(Ix*f26), I+45);
	    update((C[0][0])*W[a]*(Iz*f26), I+46);
	    double f3 = (3*pow(B10,2) + pow(Cz,2)*pow(Iz,2) + B10*(6*Cz*Zij + pow(Zij,2) + 6*pow(Cz,2)));
	    update((C[0][0])*W[a]*(Cx*f3), I+47);
	    update((C[0][0])*W[a]*(Cy*f3), I+48);
	    double f4 = (B10*(3*Cy + Yij) + Iy*pow(Cy,2));
	    update((C[0][0])*W[a]*(Cz*Ix*f4), I+49);
	    update((C[0][0])*W[a]*(f4*(Px + Cx*Xij)), I+50);
	    update((C[0][0])*W[a]*(Cx*Iz*f4), I+51);
	    update((C[0][0])*W[a]*(f13*f4), I+52);
	    double f5 = (3*pow(B10,2) + B10*(6*Cx*Xij + pow(Xij,2) + 6*pow(Cx,2)) + pow(Cx,2)*pow(Ix,2));
	    update((C[0][0])*W[a]*(Cy*f5), I+53);
	    update((C[0][0])*W[a]*(Cz*f5), I+54);
	    double f7 = (B10*(3*Cz + 2*Zij) + Cz*pow(Iz,2));
	    update((C[0][0])*W[a]*(Cx*Cy*f7), I+55);
	    update((C[0][0])*W[a]*(Px*f7), I+56);
	    update((C[0][0])*W[a]*(Py*f7), I+57);
	    double f9 = (3*pow(B10,2) + B10*(6*Cy*Yij + pow(Yij,2) + 6*pow(Cy,2)) + pow(Cy,2)*pow(Iy,2));
	    update((C[0][0])*W[a]*(Cz*f9), I+58);
	    update((C[0][0])*W[a]*(Cx*f9), I+59);
#undef pow

	}

    }

    /**
     * Permutes the 60 entries of @p I in place: the value originally held
     * at position k is moved to position order[k].
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[60]) {
	// Destination slot for every source position k.
	const int order[60] = {
	    6, 45, 8, 37, 0, 11, 22, 50, 39, 38,
	    53, 19, 18, 13, 43, 34, 33, 49, 46, 54,
	    59, 44, 40, 30, 17, 10, 14, 23, 25, 20,
	    42, 52, 9, 5, 7, 2, 32, 12, 1, 41,
	    21, 48, 47, 57, 58, 31, 51, 27, 28, 36,
	    35, 55, 56, 3, 4, 29, 24, 26, 16, 15
	};

	// Snapshot the input first so the scatter never reads a slot that
	// has already been overwritten.
	double saved[60];
	for (int k = 0; k < 60; ++k) {
	    saved[k] = I[k];
	}
	for (int k = 0; k < 60; ++k) {
	    I[order[k]] = saved[k];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[60] = { 4, 38, 35, 53, 54, 33, 0, 34, 2, 32, 25, 5, 37, 13, 26, 59, 58, 24, 12, 11, 29, 40, 6, 27, 56, 28, 57, 47, 48, 55, 23, 45, 36, 16, 15, 50, 49, 3, 9, 8, 22, 39, 30, 14, 21, 1, 18, 42, 41, 17, 7, 46, 31, 10, 19, 51, 52, 43, 44, 20 };
// 	if (index < 60) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Returns entry @p i of the 60-element permutation table.
     * No range check is performed on @p i.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short permutation[] = {
	    6, 45, 8, 37, 0, 11, 22, 50, 39, 38,
	    53, 19, 18, 13, 43, 34, 33, 49, 46, 54,
	    59, 44, 40, 30, 17, 10, 14, 23, 25, 20,
	    42, 52, 9, 5, 7, 2, 32, 12, 1, 41,
	    21, 48, 47, 57, 58, 31, 51, 27, 28, 36,
	    35, 55, 56, 3, 4, 29, 24, 26, 16, 15
	};
	return permutation[i];
    }

    /**
     * Writes the 60 permutation entries, in table order, to idx[0..59].
     * The caller must supply room for 60 elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int permutation[60] = {
	    6, 45, 8, 37, 0, 11, 22, 50, 39, 38,
	    53, 19, 18, 13, 43, 34, 33, 49, 46, 54,
	    59, 44, 40, 30, 17, 10, 14, 23, 25, 20,
	    42, 52, 9, 5, 7, 2, 32, 12, 1, 41,
	    21, 48, 47, 57, 58, 31, 51, 27, 28, 36,
	    35, 55, 56, 3, 4, 29, 24, 26, 16, 15
	};
	for (int k = 0; k < 60; ++k) {
	    idx[k] = permutation[k];
	}
    }


};

/**
 * Quadrature kernel specialization for meta::braket<S, D, S, S>.
 *
 * eval() accumulates six values into I, adding one contribution per
 * index a of t2/W.  reorder() and index() apply/expose the permutation
 * { 0, 1, 3, 2, 4, 5 } over those six slots.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::D, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 2;

    /** Default accumulator: adds a contribution onto the target element. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &term, double *target) const {
	    *target += term;
	}
    };

    /**
     * Fixed-size entry point: runs the kernel over N points and
     * accumulates into the six-element array I with the default
     * update functor.
     */
    template<
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[6]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Runs the kernel over M points and accumulates into I[0..5] with
     * the default update functor.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel proper.  For each a in [0, M) it derives the recurrence
     * coefficients from t2[a], forms six products scaled by
     * C[0][0]*W[a], and hands each one to @p update together with the
     * address of its slot in I.  rBk and rkl are accepted but not read
     * by this specialization.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    const double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    // Per-axis factors shifted by the corresponding rij component.
	    const double Ix = Cx + Xij;
	    const double Iy = Cy + Yij;
	    const double Iz = Cz + Zij;

	    update(C[0][0]*W[a]*(B10 + recurrence::pow<2>(Ix)), I+0);
	    update(C[0][0]*W[a]*(B10 + recurrence::pow<2>(Iy)), I+1);
	    update(C[0][0]*W[a]*(Ix*Iy), I+2);
	    update(C[0][0]*W[a]*(B10 + recurrence::pow<2>(Iz)), I+3);
	    update(C[0][0]*W[a]*(Ix*Iz), I+4);
	    update(C[0][0]*W[a]*(Iy*Iz), I+5);

	}

    }

    /**
     * Permutes I in place: the value originally at position k is moved
     * to position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[6]) {
	// Copy first so the scatter never reads an overwritten slot.
	double saved[6];
	for (int k = 0; k < 6; ++k) {
	    saved[k] = I[k];
	}
	for (int k = 0; k < 6; ++k) {
	    I[index(k)] = saved[k];
	}
    }

    /**
     * Returns entry @p i of the six-element permutation table.
     * No range check is performed on @p i.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short table[] = {
	    0, 1, 3, 2, 4, 5
	};
	return table[i];
    }

    /** Writes the six permutation entries, in order, to idx[0..5]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 6; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Quadrature kernel specialization for meta::braket<D, D, P, S>.
 *
 * eval() accumulates 108 values into I, adding one contribution per
 * index a of t2/W.  reorder() and index() apply/expose a fixed
 * permutation over those 108 slots; the table lives in index(int).
 */
template<>
struct impl<meta::braket<rysq::D, rysq::D, rysq::P, rysq::S> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 3;

    /** Default accumulator: adds a contribution onto the target element. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &term, double *target) const {
	    *target += term;
	}
    };

    /**
     * Fixed-size entry point: runs the kernel over N points and
     * accumulates into the 108-element array I with the default
     * update functor.
     */
    template<
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[108]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Runs the kernel over M points and accumulates into I[0..107] with
     * the default update functor.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel proper.  For each a in [0, M) it derives the recurrence
     * coefficients from t2[a], forms 108 polynomial terms scaled by
     * C[0][0]*W[a], and hands each one to @p update together with the
     * address of its slot in I.  rkl is accepted but not read by this
     * specialization.
     *
     * The generated expressions below are kept exactly as emitted so
     * that floating-point evaluation order is unchanged.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];
	    double B10 = recurrence::coefficient(1.0/A, B, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Integer powers go through the compile-time recurrence::pow<N>
	    // helper; the macro is local to this loop body.
#define pow(x,y) recurrence::pow<y>((x))

	    // Per-axis building blocks reused throughout the terms below.
	    double Ix = (Cx + Xij);
	    double Iy = (Cy + Yij);
	    double Iz = (Cz + Zij);
	    double Px = (B10 + pow(Cx,2));
	    double Py = (B10 + pow(Cy,2));
	    double Pz = (B10 + pow(Cz,2));
	    double Qx = (Cx*Dx + B00);
	    double Qy = (Cy*Dy + B00);
	    double Qz = (Cz*Dz + B00);

	    // The f* temporaries are shared sub-expressions, each feeding
	    // several of the outputs that follow its definition.
	    update((C[0][0])*W[a]*(Cy*Qz*(Xij*(Xij + 2*Cx) + Px)), I+0);
	    update((C[0][0])*W[a]*(Iy*Qz*(Px + Cx*Xij)), I+1);
	    update((C[0][0])*W[a]*(Dz*Py*(Xij*(Xij + 2*Cx) + Px)), I+2);
	    update((C[0][0])*W[a]*(Iz*Qy*(Px + Cx*Xij)), I+3);
	    update((C[0][0])*W[a]*(Cz*Qy*(Xij*(Xij + 2*Cx) + Px)), I+4);
	    update((C[0][0])*W[a]*((2*B00*(Xij + 2*Cx)*(Cx*Ix + 3*B10) + Dx*(3*pow(B10,2) + B10*(6*Cx*Xij + pow(Xij,2) + 6*pow(Cx,2)) + pow(Cx,2)*pow(Ix,2)))), I+5);
	    update((C[0][0])*W[a]*(Iz*Py*(Dx*Xij + Qx)), I+6);
	    update((C[0][0])*W[a]*(Iy*Pz*(Dx*Xij + Qx)), I+7);
	    update((C[0][0])*W[a]*(Dy*Pz*(Xij*(Xij + 2*Cx) + Px)), I+8);
	    update((C[0][0])*W[a]*((2*B00*(Yij + 2*Cy)*(3*B10 + Cy*Iy) + Dy*(3*pow(B10,2) + B10*(6*Cy*Yij + pow(Yij,2) + 6*pow(Cy,2)) + pow(Cy,2)*pow(Iy,2)))), I+9);
	    update((C[0][0])*W[a]*(Iz*Px*(Dy*Yij + Qy)), I+10);
	    update((C[0][0])*W[a]*(Cz*(Px + Cx*Xij)*(Dy*Yij + Qy)), I+11);
	    update((C[0][0])*W[a]*(Ix*Pz*(Dy*Yij + Qy)), I+12);
	    update((C[0][0])*W[a]*(Cy*(Cz*Zij + Pz)*(Dx*Xij + Qx)), I+13);
	    update((C[0][0])*W[a]*(Dy*(Px + Cx*Xij)*(Cz*Zij + Pz)), I+14);
	    update((C[0][0])*W[a]*(Cy*(Px + Cx*Xij)*(Dz*Zij + Qz)), I+15);
	    update((C[0][0])*W[a]*(Ix*Py*(Dz*Zij + Qz)), I+16);
	    update((C[0][0])*W[a]*(Iy*Px*(Dz*Zij + Qz)), I+17);
	    update((C[0][0])*W[a]*((Dz*(3*pow(B10,2) + pow(Cz,2)*pow(Iz,2) + B10*(6*Cz*Zij + pow(Zij,2) + 6*pow(Cz,2))) + 2*B00*(2*Cz + Zij)*(3*B10 + Cz*Iz))), I+18);
	    update((C[0][0])*W[a]*(Cx*(Cz*Zij + Pz)*(Dy*Yij + Qy)), I+19);
	    update((C[0][0])*W[a]*(Ix*Qy*(Cz*Zij + Pz)), I+20);
	    update((C[0][0])*W[a]*(Iy*Qx*(Cz*Zij + Pz)), I+21);
	    double f1 = (Dz*Pz + 2*B00*Cz);
	    update((C[0][0])*W[a]*(Cx*Iy*(f1 + Qz*Zij)), I+22);
	    update((C[0][0])*W[a]*((Px + Cx*Xij)*(f1 + Qz*Zij)), I+23);
	    update((C[0][0])*W[a]*(Py*(Zij*(2*B00 + Dz*(2*Cz + Zij)) + f1)), I+24);
	    update((C[0][0])*W[a]*(Px*(Zij*(2*B00 + Dz*(2*Cz + Zij)) + f1)), I+25);
	    update((C[0][0])*W[a]*(Cy*Ix*(f1 + Qz*Zij)), I+26);
	    update((C[0][0])*W[a]*(Cx*Cy*(Zij*(2*B00 + Dz*(2*Cz + Zij)) + f1)), I+27);
	    update((C[0][0])*W[a]*(f1*(Xij*(Xij + 2*Cx) + Px)), I+28);
	    update((C[0][0])*W[a]*(Ix*Iy*f1), I+29);
	    double f10 = (3*pow(B10,2) + pow(Cz,2)*pow(Iz,2) + B10*(6*Cz*Zij + pow(Zij,2) + 6*pow(Cz,2)));
	    update((C[0][0])*W[a]*(Dx*f10), I+30);
	    update((C[0][0])*W[a]*(Dy*f10), I+31);
	    double f11 = (B00*(Iz*(3*Cz + Zij) + 3*B10) + Dz*(B10*(3*Cz + 2*Zij) + Cz*pow(Iz,2)));
	    update((C[0][0])*W[a]*(Cx*f11), I+32);
	    update((C[0][0])*W[a]*(Cy*f11), I+33);
	    double f12 = (B10*(3*Cy + Yij) + Iy*pow(Cy,2));
	    update((C[0][0])*W[a]*(f12*(Dz*Zij + Qz)), I+34);
	    update((C[0][0])*W[a]*(Dz*Ix*f12), I+35);
	    update((C[0][0])*W[a]*(f12*(Dx*Xij + Qx)), I+36);
	    update((C[0][0])*W[a]*(Dx*Iz*f12), I+37);
	    double f13 = (3*pow(B10,2) + B10*(6*Cx*Xij + pow(Xij,2) + 6*pow(Cx,2)) + pow(Cx,2)*pow(Ix,2));
	    update((C[0][0])*W[a]*(Dy*f13), I+38);
	    update((C[0][0])*W[a]*(Dz*f13), I+39);
	    double f14 = (Dy*(Cy*pow(Iy,2) + B10*(3*Cy + 2*Yij)) + B00*(3*B10 + Iy*(3*Cy + Yij)));
	    update((C[0][0])*W[a]*(Cz*f14), I+40);
	    update((C[0][0])*W[a]*(Cx*f14), I+41);
	    double f2 = (Dz*(Iz*pow(Cz,2) + B10*(3*Cz + Zij)) + 3*B00*Pz + 2*B00*Cz*Zij);
	    update((C[0][0])*W[a]*(Ix*f2), I+42);
	    update((C[0][0])*W[a]*(Iy*f2), I+43);
	    double f20 = (B10*(3*Cz + 2*Zij) + Cz*pow(Iz,2));
	    update((C[0][0])*W[a]*(Cx*Dy*f20), I+44);
	    update((C[0][0])*W[a]*(Cy*Dx*f20), I+45);
	    update((C[0][0])*W[a]*(Qy*f20), I+46);
	    update((C[0][0])*W[a]*(Qx*f20), I+47);
	    double f21 = (3*pow(B10,2) + B10*(6*Cy*Yij + pow(Yij,2) + 6*pow(Cy,2)) + pow(Cy,2)*pow(Iy,2));
	    update((C[0][0])*W[a]*(Dx*f21), I+48);
	    update((C[0][0])*W[a]*(Dz*f21), I+49);
	    double f22 = (Cy*Iy + B10);
	    update((C[0][0])*W[a]*(Dz*f22*(Px + Cx*Xij)), I+50);
	    update((C[0][0])*W[a]*(Cx*f22*(Dz*Zij + Qz)), I+51);
	    update((C[0][0])*W[a]*(f22*(f1 + Qz*Zij)), I+52);
	    update((C[0][0])*W[a]*(Dx*f22*(Cz*Zij + Pz)), I+53);
	    update((C[0][0])*W[a]*(Cz*f22*(Dx*Xij + Qx)), I+54);
	    update((C[0][0])*W[a]*(Ix*Qz*f22), I+55);
	    update((C[0][0])*W[a]*(Iz*Qx*f22), I+56);
	    double f23 = (Cy*pow(Iy,2) + B10*(3*Cy + 2*Yij));
	    update((C[0][0])*W[a]*(Cx*Dz*f23), I+57);
	    update((C[0][0])*W[a]*(Cz*Dx*f23), I+58);
	    update((C[0][0])*W[a]*(Qx*f23), I+59);
	    update((C[0][0])*W[a]*(Qz*f23), I+60);
	    double f24 = (B10*(3*Cx + Xij) + Ix*pow(Cx,2));
	    update((C[0][0])*W[a]*(f24*(Dy*Yij + Qy)), I+61);
	    update((C[0][0])*W[a]*(Dy*Iz*f24), I+62);
	    update((C[0][0])*W[a]*(f24*(Dz*Zij + Qz)), I+63);
	    update((C[0][0])*W[a]*(Dz*Iy*f24), I+64);
	    double f25 = (B00*(3*B10 + Ix*(3*Cx + Xij)) + Dx*(B10*(3*Cx + 2*Xij) + Cx*pow(Ix,2)));
	    update((C[0][0])*W[a]*(Cy*f25), I+65);
	    update((C[0][0])*W[a]*(Cz*f25), I+66);
	    double f28 = (B10 + pow(Iy,2));
	    update((C[0][0])*W[a]*(Dz*Px*f28), I+67);
	    update((C[0][0])*W[a]*(Cx*Qz*f28), I+68);
	    update((C[0][0])*W[a]*(Cz*Qx*f28), I+69);
	    update((C[0][0])*W[a]*(Dx*Pz*f28), I+70);
	    update((C[0][0])*W[a]*(f1*f28), I+71);
	    double f29 = (B10 + pow(Iz,2));
	    update((C[0][0])*W[a]*(Cx*Qy*f29), I+72);
	    update((C[0][0])*W[a]*(Dy*Px*f29), I+73);
	    update((C[0][0])*W[a]*(Cy*Qx*f29), I+74);
	    update((C[0][0])*W[a]*(Dx*Py*f29), I+75);
	    double f37 = (2*B00*Cy*Yij + 3*B00*Py + Dy*(B10*(3*Cy + Yij) + Iy*pow(Cy,2)));
	    update((C[0][0])*W[a]*(Iz*f37), I+76);
	    update((C[0][0])*W[a]*(Ix*f37), I+77);
	    double f38 = (Iz*pow(Cz,2) + B10*(3*Cz + Zij));
	    update((C[0][0])*W[a]*(f38*(Dy*Yij + Qy)), I+78);
	    update((C[0][0])*W[a]*(Dy*Ix*f38), I+79);
	    update((C[0][0])*W[a]*(Dx*Iy*f38), I+80);
	    update((C[0][0])*W[a]*(f38*(Dx*Xij + Qx)), I+81);
	    double f39 = (Dx*(B10*(3*Cx + Xij) + Ix*pow(Cx,2)) + 2*B00*Cx*Xij + 3*B00*Px);
	    update((C[0][0])*W[a]*(Iy*f39), I+82);
	    update((C[0][0])*W[a]*(Iz*f39), I+83);
	    double f4 = (B10*(3*Cx + 2*Xij) + Cx*pow(Ix,2));
	    update((C[0][0])*W[a]*(Cz*Dy*f4), I+84);
	    update((C[0][0])*W[a]*(Cy*Dz*f4), I+85);
	    update((C[0][0])*W[a]*(Qz*f4), I+86);
	    update((C[0][0])*W[a]*(Qy*f4), I+87);
	    double f5 = (B00*(Yij + 2*Cy) + Dy*(Cy*Iy + B10));
	    update((C[0][0])*W[a]*(f5*(Cz*Zij + Pz)), I+88);
	    update((C[0][0])*W[a]*(Cz*Ix*f5), I+89);
	    update((C[0][0])*W[a]*(f5*(Px + Cx*Xij)), I+90);
	    update((C[0][0])*W[a]*(Cx*Iz*f5), I+91);
	    double f6 = (Dx*Px + 2*B00*Cx);
	    update((C[0][0])*W[a]*(Cy*Iz*(Qx*Xij + f6)), I+92);
	    update((C[0][0])*W[a]*((Cz*Zij + Pz)*(Qx*Xij + f6)), I+93);
	    update((C[0][0])*W[a]*(Cz*Iy*(Qx*Xij + f6)), I+94);
	    update((C[0][0])*W[a]*(Cy*Cz*(Xij*(2*B00 + Dx*(Xij + 2*Cx)) + f6)), I+95);
	    update((C[0][0])*W[a]*(f22*(Qx*Xij + f6)), I+96);
	    update((C[0][0])*W[a]*(Pz*(Xij*(2*B00 + Dx*(Xij + 2*Cx)) + f6)), I+97);
	    update((C[0][0])*W[a]*(Py*(Xij*(2*B00 + Dx*(Xij + 2*Cx)) + f6)), I+98);
	    update((C[0][0])*W[a]*(Iy*Iz*f6), I+99);
	    update((C[0][0])*W[a]*(f29*f6), I+100);
	    update((C[0][0])*W[a]*(f28*f6), I+101);
	    double f9 = (Dy*Py + 2*B00*Cy);
	    update((C[0][0])*W[a]*(Cx*Cz*(Yij*(2*B00 + Dy*(Yij + 2*Cy)) + f9)), I+102);
	    update((C[0][0])*W[a]*(Pz*(Yij*(2*B00 + Dy*(Yij + 2*Cy)) + f9)), I+103);
	    update((C[0][0])*W[a]*(Px*(Yij*(2*B00 + Dy*(Yij + 2*Cy)) + f9)), I+104);
	    update((C[0][0])*W[a]*(Ix*Iz*f9), I+105);
	    update((C[0][0])*W[a]*(f9*(Xij*(Xij + 2*Cx) + Px)), I+106);
	    update((C[0][0])*W[a]*(f29*f9), I+107);
#undef pow

	}

    }

    /**
     * Permutes I in place: the value originally at position k is moved
     * to position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[108]) {
	// Copy first so the scatter never reads an overwritten slot.
	double saved[108];
	for (int k = 0; k < 108; ++k) {
	    saved[k] = I[k];
	}
	for (int k = 0; k < 108; ++k) {
	    I[index(k)] = saved[k];
	}
    }

    /**
     * Returns entry @p i of the 108-element permutation table.
     * No range check is performed on @p i.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short table[] = {
	    77, 94, 73, 63, 41, 0, 25, 20, 38, 43, 66, 58, 56, 29, 64, 99, 97, 102, 86, 70, 65, 34, 106, 100, 85, 84, 101, 87, 74, 92, 14, 50, 88, 89, 103, 91, 19, 31, 36, 72, 47, 45, 98, 104, 52, 17, 53, 16, 7, 79, 93, 105, 107, 35, 23, 95, 33, 81, 11, 9, 83, 54, 60, 96, 90, 3, 4, 78, 82, 10, 8, 80, 51, 48, 15, 13, 67, 55, 68, 62, 32, 26, 18, 24, 40, 75, 76, 39, 71, 59, 57, 69, 27, 28, 22, 5, 21, 2, 1, 30, 12, 6, 46, 44, 42, 61, 37, 49
	};
	return table[i];
    }

    /** Writes the 108 permutation entries, in order, to idx[0..107]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 108; ++k) {
	    idx[k] = index(k);
	}
    }


};

/**
 * Quadrature kernel specialization for meta::braket<S, SP, S, F>.
 *
 * eval() accumulates 40 values into I, adding one contribution per
 * index a of t2/W.  Outputs are scaled by either C[0][0] or C[0][1].
 * reorder() and index() apply/expose a fixed permutation over those
 * 40 slots; the table lives in index(int).
 */
template<>
struct impl<meta::braket<rysq::S, rysq::SP, rysq::S, rysq::F> > {
    typedef void enable;
    static const bool value = true;
    static const int N = 3;

    /** Default accumulator: adds a contribution onto the target element. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &term, double *target) const {
	    *target += term;
	}
    };

    /**
     * Fixed-size entry point: runs the kernel over N points and
     * accumulates into the 40-element array I with the default
     * update functor.
     */
    template<
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double (&I)[40]) {
	eval<N>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Runs the kernel over M points and accumulates into I[0..39] with
     * the default update functor.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Kernel proper.  For each a in [0, M) it derives the recurrence
     * coefficients from t2[a], forms 40 polynomial terms scaled by
     * W[a] and one of the two coefficients in C, and hands each one to
     * @p update together with the address of its slot in I.
     *
     * The generated expressions below are kept exactly as emitted so
     * that floating-point evaluation order is unchanged.
     */
    template<int M,
	class RAI, class RBK,
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_,
	class U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][2],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

	const double &Xkl = rkl[0];
	const double &Ykl = rkl[1];
	const double &Zkl = rkl[2];

#if defined (__INTEL_COMPILER)
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    double B00 = 0.5*t2[a];
	    double B01 = recurrence::coefficient(1.0/B, A, t2[a]);

	    double Cx = recurrence::coefficient<0>(rAi, B, rAB, t2[a]);
	    double Cy = recurrence::coefficient<1>(rAi, B, rAB, t2[a]);
	    double Cz = recurrence::coefficient<2>(rAi, B, rAB, t2[a]);

	    double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t2[a]);
	    double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t2[a]);
	    double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t2[a]);

	    // Integer powers go through the compile-time recurrence::pow<N>
	    // helper; the macro is local to this loop body.
#define pow(x,y) recurrence::pow<y>((x))

	    // Per-axis factors shifted by the rij / rkl components.
	    double Ix = (Cx + Xij);
	    double Iy = (Cy + Yij);
	    double Iz = (Cz + Zij);
	    double Kx = (Xkl + Dx);
	    double Ky = (Ykl + Dy);
	    double Kz = (Dz + Zkl);

	    // The f* temporaries are shared sub-expressions, each feeding
	    // several of the outputs that follow its definition.
	    update((C[0][0])*W[a]*(Kx*Ky*Kz), I+0);
	    double f1 = (Iz*(pow(Kz,2) + B01) + 2*B00*Kz);
	    update((C[0][1])*W[a]*(Kx*f1), I+1);
	    update((C[0][1])*W[a]*(Ky*f1), I+2);
	    double f10 = (pow(Kz,2) + B01);
	    update((C[0][1])*W[a]*(Iy*Kx*f10), I+3);
	    update((C[0][0])*W[a]*(Kx*f10), I+4);
	    update((C[0][1])*W[a]*(Ix*Ky*f10), I+5);
	    update((C[0][0])*W[a]*(Ky*f10), I+6);
	    double f11 = (2*B00*Ky + Iy*(pow(Ky,2) + B01));
	    update((C[0][1])*W[a]*(Kx*f11), I+7);
	    update((C[0][1])*W[a]*(Kz*f11), I+8);
	    double f14 = (pow(Kx,2) + B01);
	    update((C[0][0])*W[a]*(Kz*f14), I+9);
	    update((C[0][1])*W[a]*(Iy*Kz*f14), I+10);
	    update((C[0][0])*W[a]*(Ky*f14), I+11);
	    update((C[0][1])*W[a]*(Iz*Ky*f14), I+12);
	    double f15 = (pow(Kz,2) + 3*B01);
	    update((C[0][1])*W[a]*(Iy*Kz*f15), I+13);
	    update((C[0][1])*W[a]*(Ix*Kz*f15), I+14);
	    update((C[0][0])*W[a]*(Kz*f15), I+15);
	    double f16 = (B00 + Iy*Ky);
	    update((C[0][1])*W[a]*(Kx*Kz*f16), I+16);
	    update((C[0][1])*W[a]*(f10*f16), I+17);
	    update((C[0][1])*W[a]*(f14*f16), I+18);
	    double f17 = (3*B01 + pow(Kx,2));
	    update((C[0][1])*W[a]*(Iz*Kx*f17), I+19);
	    update((C[0][1])*W[a]*(Iy*Kx*f17), I+20);
	    update((C[0][0])*W[a]*(Kx*f17), I+21);
	    double f18 = (Ix*(pow(Kx,2) + B01) + 2*B00*Kx);
	    update((C[0][1])*W[a]*(Kz*f18), I+22);
	    update((C[0][1])*W[a]*(Ky*f18), I+23);
	    double f4 = (B00 + Ix*Kx);
	    update((C[0][1])*W[a]*(Ky*Kz*f4), I+24);
	    update((C[0][1])*W[a]*(f10*f4), I+25);
	    double f5 = (pow(Ky,2) + B01);
	    update((C[0][1])*W[a]*(Ix*Kz*f5), I+26);
	    update((C[0][0])*W[a]*(Kz*f5), I+27);
	    update((C[0][1])*W[a]*(f4*f5), I+28);
	    update((C[0][0])*W[a]*(Kx*f5), I+29);
	    update((C[0][1])*W[a]*(Iz*Kx*f5), I+30);
	    double f6 = (pow(Ky,2) + 3*B01);
	    update((C[0][1])*W[a]*(Iz*Ky*f6), I+31);
	    update((C[0][1])*W[a]*(Ix*Ky*f6), I+32);
	    update((C[0][0])*W[a]*(Ky*f6), I+33);
	    double f7 = 3*B00*B01;
	    update((C[0][1])*W[a]*((f7 + Kx*(Ix*(3*B01 + pow(Kx,2)) + 3*B00*Kx))), I+34);
	    update((C[0][1])*W[a]*((f7 + Ky*(3*B00*Ky + Iy*(pow(Ky,2) + 3*B01)))), I+35);
	    update((C[0][1])*W[a]*((f7 + Kz*(3*B00*Kz + Iz*(pow(Kz,2) + 3*B01)))), I+36);
	    double f8 = (Iz*Kz + B00);
	    update((C[0][1])*W[a]*(Kx*Ky*f8), I+37);
	    update((C[0][1])*W[a]*(f14*f8), I+38);
	    update((C[0][1])*W[a]*(f5*f8), I+39);
#undef pow

	}

    }

    /**
     * Permutes I in place: the value originally at position k is moved
     * to position index(k).
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[40]) {
	// Copy first so the scatter never reads an overwritten slot.
	double saved[40];
	for (int k = 0; k < 40; ++k) {
	    saved[k] = I[k];
	}
	for (int k = 0; k < 40; ++k) {
	    I[index(k)] = saved[k];
	}
    }

    /**
     * Returns entry @p i of the 40-element permutation table.
     * No range check is performed on @p i.
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short table[] = {
	    36, 31, 35, 30, 28, 33, 32, 22, 26, 16, 18, 12, 15, 10, 9, 8, 38, 34, 14, 3, 2, 0, 17, 13, 37, 29, 25, 24, 21, 20, 23, 7, 5, 4, 1, 6, 11, 39, 19, 27
	};
	return table[i];
    }

    /** Writes the 40 permutation entries, in order, to idx[0..39]. */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 40; ++k) {
	    idx[k] = index(k);
	}
    }


};

template<>
struct impl<meta::braket<rysq::S, rysq::P, rysq::F, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    *Q += q;
	}
    };

    template<
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
    // void eval(double A, double B,
    // 		     const vector<3> &rAi, const vector<3> &rBk,
    // 		     const vector<3> &rAB,
    // 		     const vector<3> &rij, const vector<3> &rkl,
    // 		     const vector<3> &t2, const vector<3> &W,
    // 		     const double (&C)[1][1],
	      double (&I)[30]) {
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    template<int M,
	class RAI, class RBK, 
	class RAB,
	class RIJ, class RKL,
	class T2_, class W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, update());
    }

    /**
     * Core kernel: loops over M roots and pushes 30 weighted terms
     * through @a update into I[0] .. I[29].
     *
     * For every root index a the kernel
     *  - builds B00, B01 and the C/D recurrence coefficients from t2[a],
     *  - forms the shifted values I{x,y,z} = C{x,y,z} + rij[{0,1,2}],
     *  - scales each term by C[0][0]*W[a] and calls update(term, I + k).
     *
     * @tparam M      number of roots processed (a = 0 .. M-1)
     * @param A, B    scalars fed to the recurrence coefficients
     *                (presumably combined exponents -- confirm with caller)
     * @param rAi, rBk, rAB, rij  read through operator[] at indices 0..2
     * @param rkl     not referenced by this kernel
     * @param t2, W   per-root inputs, read at index a
     *                (presumably squared roots and weights -- confirm)
     * @param C       only C[0][0] is used, as a common prefactor
     * @param I       destination; must provide at least 30 doubles
     * @param update  callable invoked as update(value, double*)
     */
    template<int M,
	     typename RAI, typename RBK,
	     typename RAB,
	     typename RIJ, typename RKL,
	     typename T2_, typename W_,
	     typename U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // root-dependent recurrence coefficients
	    const double t = t2[a];
	    const double B00 = 0.5*t;
	    const double B01 = recurrence::coefficient(1.0/B, A, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    // squares of the D coefficients, shared by several terms
	    const double Dx2 = recurrence::pow<2>(Dx);
	    const double Dy2 = recurrence::pow<2>(Dy);
	    const double Dz2 = recurrence::pow<2>(Dz);

	    const double Ix = (Cx + Xij);
	    const double Iy = (Cy + Yij);
	    const double Iz = (Cz + Zij);
	    const double Rx = (B01 + Dx2);
	    const double Ry = (B01 + Dy2);
	    const double Rz = (Dz2 + B01);

	    // shared intermediates (names follow the generator's numbering)
	    const double f0 = (Iz*Rz + 2*B00*Dz);
	    const double f10 = (2*B00*Dx + Ix*Rx);
	    const double f11 = (3*B01 + Dy2);
	    const double f12 = (Dy*Iy + B00);
	    const double f13 = (Dx*Ix + B00);
	    const double f15 = (2*B00*Dy + Iy*Ry);
	    const double f2 = (B00 + Dz*Iz);
	    const double f3 = (3*B01 + Dx2);
	    const double f5 = (Dz2 + 3*B01);
	    const double f7 = 3*B00*B01;

	    // prefactor common to every term of this root
	    const double w = C[0][0]*W[a];

	    update(w*(Dz*Iy*Rx), I+0);
	    update(w*(Dy*Iz*Rx), I+1);
	    update(w*(Dz*Ix*Ry), I+2);
	    update(w*(Dx*Iz*Ry), I+3);
	    update(w*(Dx*Iy*Rz), I+4);
	    update(w*(Dy*Ix*Rz), I+5);
	    update(w*(Dx*f0), I+6);
	    update(w*(Dy*f0), I+7);
	    update(w*(Dz*f10), I+8);
	    update(w*(Dy*f10), I+9);
	    update(w*(Dy*Ix*f11), I+10);
	    update(w*(Dy*Iz*f11), I+11);
	    update(w*(Dx*Dz*f12), I+12);
	    update(w*(Rx*f12), I+13);
	    update(w*(Rz*f12), I+14);
	    update(w*(Dy*Dz*f13), I+15);
	    update(w*(Ry*f13), I+16);
	    update(w*(Rz*f13), I+17);
	    update(w*(Dx*f15), I+18);
	    update(w*(Dz*f15), I+19);
	    update(w*(Dx*Dy*f2), I+20);
	    update(w*(Rx*f2), I+21);
	    update(w*(Ry*f2), I+22);
	    update(w*(Dx*Iz*f3), I+23);
	    update(w*(Dx*Iy*f3), I+24);
	    update(w*(Dz*Iy*f5), I+25);
	    update(w*(Dz*Ix*f5), I+26);
	    update(w*((Dx*(3*B00*Dx + Ix*f3) + f7)), I+27);
	    update(w*((Dy*(Iy*f11 + 3*B00*Dy) + f7)), I+28);
	    update(w*((f7 + Dz*(Iz*f5 + 3*B00*Dz))), I+29);

	}

    }

    /**
     * Permutes the 30 entries of @a I in place.
     *
     * The value found at position i on entry is stored at position
     * target[i] on return. A scratch copy is taken first so that no
     * source value is overwritten before it has been moved.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[30]) {
	// destination slot for each source slot
	const unsigned short target[30] = {
	    13, 11, 18, 17, 22, 24, 23, 26, 12, 9,
	    3, 5, 28, 10, 25, 27, 15, 21, 16, 19,
	    29, 14, 20, 2, 1, 7, 6, 0, 4, 8
	};
	double T[30];
	for (int i = 0; i < 30; ++i) {
	    T[i] = I[i];
	}
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int i = 0; i < 30; ++i) {
	    I[target[i]] = T[i];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[30] = { 27, 24, 23, 10, 28, 11, 26, 25, 29, 9, 13, 1, 8, 0, 21, 16, 18, 3, 2, 19, 22, 17, 4, 6, 5, 14, 7, 15, 12, 20 };
// 	if (index < 30) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Looks up entry @a i of the 30-element permutation table.
     *
     * @param i  table position; no range check is performed, so the
     *           caller must pass a value in [0, 30)
     * @return   the table entry at position i
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short table[30] = {
	    13, 11, 18, 17, 22, 24, 23, 26, 12, 9,
	    3, 5, 28, 10, 25, 27, 15, 21, 16, 19,
	    29, 14, 20, 2, 1, 7, 6, 0, 4, 8
	};
	return static_cast<int>(table[i]);
    }

    /**
     * Writes the complete 30-element permutation table to @a idx.
     *
     * @tparam U   element type of the destination; each entry is assigned
     *             from an int
     * @param idx  destination; must have room for 30 elements
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	const int table[30] = {
	    13, 11, 18, 17, 22, 24, 23, 26, 12, 9,
	    3, 5, 28, 10, 25, 27, 15, 21, 16, 19,
	    29, 14, 20, 2, 1, 7, 6, 0, 4, 8
	};
	for (int k = 0; k < 30; ++k) {
	    idx[k] = table[k];
	}
    }


};

/**
 * Quadrature kernel specialization for
 * meta::braket<rysq::S, rysq::D, rysq::P, rysq::S>.
 *
 * Each evaluation accumulates 18 terms; the array-reference eval()
 * overload runs the kernel over 2 roots. reorder() and both index()
 * overloads expose one and the same 18-element permutation.
 */
template<>
struct impl<meta::braket<rysq::S, rysq::D, rysq::P, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 2;

    /** Default accumulation policy: adds the contribution to its target. */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] = Q[0] + q;
	}
    };

    /**
     * Fixed-size entry point: runs the 2-root kernel into the 18-element
     * array @a I using the default @c update functor.
     */
    template<
	typename RAI, typename RBK,
	typename RAB,
	typename RIJ, typename RKL,
	typename T2_, typename W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[18]) {
	update accumulate = update();
	eval<2>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Pointer entry point: runs the M-root kernel into @a I using the
     * default @c update functor.
     */
    template<int M,
	     typename RAI, typename RBK,
	     typename RAB,
	     typename RIJ, typename RKL,
	     typename T2_, typename W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel: loops over M roots and pushes 18 weighted terms
     * through @a update into I[0] .. I[17].
     *
     * @tparam M      number of roots processed (a = 0 .. M-1)
     * @param A, B    scalars fed to the recurrence coefficients
     *                (presumably combined exponents -- confirm with caller)
     * @param rAi, rBk, rAB, rij  read through operator[] at indices 0..2
     * @param rkl     not referenced by this kernel
     * @param t2, W   per-root inputs, read at index a
     *                (presumably squared roots and weights -- confirm)
     * @param C       only C[0][0] is used, as a common prefactor
     * @param I       destination; must provide at least 18 doubles
     * @param update  callable invoked as update(value, double*)
     */
    template<int M,
	     typename RAI, typename RBK,
	     typename RAB,
	     typename RIJ, typename RKL,
	     typename T2_, typename W_,
	     typename U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // root-dependent recurrence coefficients
	    const double t = t2[a];
	    const double B00 = 0.5*t;
	    const double B10 = recurrence::coefficient(1.0/A, B, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Dx = recurrence::coefficient<0>(rBk, -A, rAB, t);
	    const double Dy = recurrence::coefficient<1>(rBk, -A, rAB, t);
	    const double Dz = recurrence::coefficient<2>(rBk, -A, rAB, t);

	    const double Ix = (Cx + Xij);
	    const double Iy = (Cy + Yij);
	    const double Iz = (Cz + Zij);

	    // shared intermediates (names follow the generator's numbering)
	    const double f1 = (B00 + Dz*Iz);
	    const double f4 = (B10 + recurrence::pow<2>(Iy));
	    const double f5 = (B10 + recurrence::pow<2>(Iz));
	    const double f6 = (Dy*Iy + B00);
	    const double f7 = (Dx*Ix + B00);
	    const double f8 = (B10 + recurrence::pow<2>(Ix));

	    // prefactor common to every term of this root
	    const double w = C[0][0]*W[a];

	    update(w*((2*B00*Ix + Dx*f8)), I+0);
	    update(w*((2*B00*Iy + Dy*f4)), I+1);
	    update(w*(Dz*Ix*Iy), I+2);
	    update(w*((2*B00*Iz + Dz*f5)), I+3);
	    update(w*(Dy*Ix*Iz), I+4);
	    update(w*(Dx*Iy*Iz), I+5);
	    update(w*(Ix*f1), I+6);
	    update(w*(Iy*f1), I+7);
	    update(w*(Dx*f4), I+8);
	    update(w*(Dz*f4), I+9);
	    update(w*(Dx*f5), I+10);
	    update(w*(Dy*f5), I+11);
	    update(w*(Iz*f6), I+12);
	    update(w*(Ix*f6), I+13);
	    update(w*(Iy*f7), I+14);
	    update(w*(Iz*f7), I+15);
	    update(w*(Dy*f8), I+16);
	    update(w*(Dz*f8), I+17);

	}

    }

    /**
     * Permutes the 18 entries of @a I in place: the value found at
     * position i on entry is stored at position index(i) on return.
     */
    BOOST_GPU_ENABLED static
    void reorder(double (&I)[18]) {
	double T[18];
	for (int i = 0; i < 18; ++i) {
	    T[i] = I[i];
	}
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int i = 0; i < 18; ++i) {
	    I[index(i)] = T[i];
	}
    }

//     BOOST_GPU_ENABLED static
//     size_t reorder(size_t index) {
// 
// 
// 	const unsigned short index_[18] = { 0, 8, 10, 14, 15, 5, 16, 1, 11, 13, 4, 12, 17, 9, 3, 2, 6, 7 };
// 	if (index < 18) return index_[index];
// 	return size_t(-1);
//     }

    /**
     * Looks up entry @a i of the 18-element permutation table.
     * No range check is performed; @a i must lie in [0, 18).
     */
    BOOST_GPU_ENABLED
    static int index(int i) {
	const unsigned short table[18] = {
	    0, 7, 15, 14, 10, 5, 16, 17, 1,
	    13, 2, 8, 11, 9, 3, 4, 6, 12
	};
	return static_cast<int>(table[i]);
    }

    /**
     * Writes the complete 18-element permutation table to @a idx,
     * which must have room for 18 elements.
     */
    template<typename U>
    BOOST_GPU_ENABLED
    static void index(U *idx) {
	for (int k = 0; k < 18; ++k) {
	    idx[k] = index(k);
	}
    }


};

template<>
struct impl<meta::braket<rysq::D, rysq::D, rysq::S, rysq::S> > {
    typedef void enable;
    static const bool value = true; 
    static const int N = 3;

    /**
     * Default accumulation policy used by the eval() overloads that take
     * no explicit functor: adds the contribution @a q to the value that
     * @a Q points at.
     */
    struct update {
	BOOST_GPU_ENABLED
	void operator()(const double &q, double *Q) const {
	    Q[0] = Q[0] + q;
	}
    };

    /**
     * Fixed-size entry point: runs the 3-root kernel into the 36-element
     * array @a I.
     *
     * All arguments are forwarded unchanged to the functor-taking
     * eval<3>() overload together with a default-constructed @c update
     * functor.
     */
    template<
	typename RAI, typename RBK,
	typename RAB,
	typename RIJ, typename RKL,
	typename T2_, typename W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double (&I)[36]) {
	update accumulate = update();
	eval<3>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Convenience overload of the M-root kernel.
     *
     * Forwards every argument unchanged to the functor-taking eval<M>()
     * overload, supplying a default-constructed @c update functor as the
     * accumulation policy.
     *
     * @tparam M  number of roots the kernel loops over
     * @param I   destination buffer handed through to the kernel
     */
    template<int M,
	     typename RAI, typename RBK,
	     typename RAB,
	     typename RIJ, typename RKL,
	     typename T2_, typename W_>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I) {
	update accumulate = update();
	eval<M>(A, B, rAi, rBk, rAB, rij, rkl, t2, W, C, I, accumulate);
    }

    /**
     * Core kernel: loops over M roots and pushes 36 weighted terms
     * through @a update into I[0] .. I[35].
     *
     * For every root index a the kernel
     *  - builds B10 and the C recurrence coefficients from t2[a],
     *  - forms the shifted values I{x,y,z} = C{x,y,z} + rij[{0,1,2}],
     *  - scales each term by C[0][0]*W[a] and calls update(term, I + k).
     *
     * @tparam M      number of roots processed (a = 0 .. M-1)
     * @param A, B    scalars fed to the recurrence coefficients
     *                (presumably combined exponents -- confirm with caller)
     * @param rAi, rAB, rij  read through operator[] at indices 0..2
     * @param rBk, rkl  not referenced by this kernel
     * @param t2, W   per-root inputs, read at index a
     *                (presumably squared roots and weights -- confirm)
     * @param C       only C[0][0] is used, as a common prefactor
     * @param I       destination; must provide at least 36 doubles
     * @param update  callable invoked as update(value, double*)
     */
    template<int M,
	     typename RAI, typename RBK,
	     typename RAB,
	     typename RIJ, typename RKL,
	     typename T2_, typename W_,
	     typename U>
    BOOST_GPU_ENABLED
    static inline
    void eval(double A, double B,
	      const RAI &rAi, const RBK &rBk,
	      const RAB &rAB,
	      const RIJ &rij, const RKL &rkl,
	      const T2_ &t2, const W_ &W,
	      const double (&C)[1][1],
	      double *I, const U &update) {

	const double &Xij = rij[0];
	const double &Yij = rij[1];
	const double &Zij = rij[2];

#if defined (__INTEL_COMPILER) 
#pragma ivdep
#pragma vector aligned
#endif
#ifdef __CUDACC__
#pragma unroll
#endif
	for (int a = 0; a < M; ++a) {

	    // root-dependent recurrence coefficients
	    const double t = t2[a];
	    const double B10 = recurrence::coefficient(1.0/A, B, t);

	    const double Cx = recurrence::coefficient<0>(rAi, B, rAB, t);
	    const double Cy = recurrence::coefficient<1>(rAi, B, rAB, t);
	    const double Cz = recurrence::coefficient<2>(rAi, B, rAB, t);

	    const double Ix = (Cx + Xij);
	    const double Iy = (Cy + Yij);
	    const double Iz = (Cz + Zij);

	    // squares reused by several terms
	    const double Cx2 = recurrence::pow<2>(Cx);
	    const double Cy2 = recurrence::pow<2>(Cy);
	    const double Cz2 = recurrence::pow<2>(Cz);
	    const double Ix2 = recurrence::pow<2>(Ix);
	    const double Iy2 = recurrence::pow<2>(Iy);
	    const double Iz2 = recurrence::pow<2>(Iz);
	    const double Xij2 = recurrence::pow<2>(Xij);
	    const double Yij2 = recurrence::pow<2>(Yij);
	    const double Zij2 = recurrence::pow<2>(Zij);

	    const double Px = (B10 + Cx2);
	    const double Py = (B10 + Cy2);
	    const double Pz = (B10 + Cz2);

	    // sub-expressions that appear in more than one term
	    const double Qx = (Px + Cx*Xij);
	    const double Qy = (Py + Cy*Yij);
	    const double Qz = (Cz*Zij + Pz);
	    const double Sx = (Xij*(Xij + 2*Cx) + Px);

	    // shared intermediates (names follow the generator's numbering)
	    const double f1 = (B10*(3*Cx + 2*Xij) + Cx*Ix2);
	    const double f10 = (B10 + Iz2);
	    const double f11 = 3*recurrence::pow<2>(B10);
	    const double f15 = (Iz*Cz2 + B10*(3*Cz + Zij));
	    const double f3 = (B10*(3*Cy + Yij) + Iy*Cy2);
	    const double f6 = (B10*(3*Cz + 2*Zij) + Cz*Iz2);
	    const double f7 = (Cy*Iy2 + B10*(3*Cy + 2*Yij));
	    const double f8 = (B10*(3*Cx + Xij) + Ix*Cx2);
	    const double f9 = (B10 + Iy2);

	    // prefactor common to every term of this root
	    const double w = C[0][0]*W[a];

	    update(w*(Iy*Iz*Px), I+0);
	    update(w*(Ix*Iz*Py), I+1);
	    update(w*(Ix*Iy*Pz), I+2);
	    update(w*(Cy*Iz*Qx), I+3);
	    update(w*(Cz*Iy*Qx), I+4);
	    update(w*(Cy*Cz*Sx), I+5);
	    update(w*(Py*Sx), I+6);
	    update(w*(Pz*Sx), I+7);
	    update(w*(Cz*Ix*Qy), I+8);
	    update(w*(Cx*Iz*Qy), I+9);
	    update(w*(Qx*Qy), I+10);
	    update(w*(Qy*Qz), I+11);
	    update(w*(Qx*Qz), I+12);
	    update(w*(Cx*Iy*Qz), I+13);
	    update(w*(Cy*Ix*Qz), I+14);
	    update(w*(Cy*f1), I+15);
	    update(w*(Cz*f1), I+16);
	    update(w*(Cx*Cy*f10), I+17);
	    update(w*(Px*f10), I+18);
	    update(w*(Py*f10), I+19);
	    update(w*((f11 + B10*(6*Cx*Xij + Xij2 + 6*Cx2) + Cx2*Ix2)), I+20);
	    update(w*((B10*(6*Cy*Yij + Yij2 + 6*Cy2) + f11 + Cy2*Iy2)), I+21);
	    update(w*((f11 + Cz2*Iz2 + B10*(6*Cz*Zij + Zij2 + 6*Cz2))), I+22);
	    update(w*(Ix*f15), I+23);
	    update(w*(Iy*f15), I+24);
	    update(w*(Ix*f3), I+25);
	    update(w*(Iz*f3), I+26);
	    update(w*(Cx*f6), I+27);
	    update(w*(Cy*f6), I+28);
	    update(w*(Cz*f7), I+29);
	    update(w*(Cx*f7), I+30);
	    update(w*(Iz*f8), I+31);
	    update(w*(Iy*f8), I+32);
	    update(w*(Cx*Cz*f9), I+33);
	    update(w*(Pz*f9), I+34);
	    update(w*(Px*f9), I+35);

	}

    }

    BOOST_GPU_ENABLED static
    void reorder(double (&I)[36]) {
	double T[36];
	for (int i = 0; i < 36; ++i) {
	    T[i] = I[i];
	}
	I[30] = T[0];
	I[25] = T[1];
	I[20] = T[2];
	I[27] = T[3];
	I[22] = T[4];
	I[5] = T[5];
	I[1] = T[6];
	I[2] = T[7];
	I[23] = T[8];
	I[33] = T[9];
	I[21] = T[10];
	I[35] = T[11];
	I[28] = T[12];
	I[34] = T[13];
	I[29] = T[14];
	I[3] = T[15];
	I[4] = T[16];
	I[15] = T[17];
	I