//
// pit_grape6.C
//
// Version 2001/3/31 Jun Makino
//
// serves GRAPE-6 version of calculate_acc_and_jerk_for_list
//
// *should not* do anything more complicated in here.

// Max number of clusters connected to a single host
// very unlikely to have more than 2...
#define MAXG6CLUSTERS 5

#define SLEEPTIME (10)
#include "pit_system.h"
#include <unistd.h>
extern "C"{
#include "grape6.h"
}



// Rank of this process; it prefixes the diagnostic messages written to cerr
// in this file.  Assigned from myprocid in
// particle_system::calculate_acc_and_jerk_for_list().
int myid; // for debug ... MPI rank

// Defined elsewhere.  Its return value becomes grape6_id the first time
// pit_grape6_initialize() runs.
int MP_get_grape_id();


// Defined elsewhere.  In this file it is referenced only from a disabled
// (#if 0) debugging block in
// particle_system::calculate_acc_and_jerk_for_list().
void MP_add_acc_and_jerk_for_list_from_other_host(particle* pb,
						       int nbody,
						       int nbh,
						       node_time* nt, 
						       int  n_next,
						       real eps2,
						       int nprocs,
						       int myid);

// Add the pairwise contribution of every particle in pb[0..nbody) to the
// first n_next particles listed in nt, using
// particle::accumulate_acc_and_jerk().
//
//   pb, nbody  : source particles (the "j" side of each pair)
//   nbh        : a pair uses EPS2BH instead of eps2 when either member has
//                get_index() < nbh
//   nt, n_next : target particles, taken from nt[i].pptr
//   eps2       : value passed as softening for all remaining pairs
//
// Pairs whose two members report the same get_index() are skipped.  Nothing
// is predicted or cleared here: the targets are only accumulated into, and nt
// is not touched at all when n_next is 0.
void accumulate_acc_and_jerk_for_list_on_host(particle* pb,
                                              int nbody,
                                              int nbh,
                                              node_time* nt,
                                              int  n_next,
                                              real eps2)
{
    for (int i = 0; i < n_next; i++) {
        particle *bi = nt[i].pptr;
        particle *bj = pb;
        for (int j = 0; j < nbody; j++, bj++) {
            // no self-interaction
            if (bi->get_index() == bj->get_index()) continue;

            real epstmp2 = eps2;
            if ((bi->get_index() < nbh) || (bj->get_index() < nbh))
                epstmp2 = EPS2BH;
            bi->accumulate_acc_and_jerk(bj, epstmp2);
        }
    }
}

// Host-side pass over the bhp[] array: predict each of its nbh entries to the
// current system time, then accumulate their contribution onto the first
// n_next particles of nt.  EPS2BH is passed as the softening argument.
void particle_system::calculate_acc_and_jerk_from_bh_for_list(int n_next)
{
    int k = 0;
    while (k < nbh) {
        bhp[k].predict_loworder(time);
        ++k;
    }

    // bhp supplies the j-particles; its length is also given as the nbh
    // threshold of the helper.
    accumulate_acc_and_jerk_for_list_on_host(bhp, nbh, nbh,
                                             nt, n_next, EPS2BH);
}


// ---- file-local GRAPE-6 state ---------------------------------------------

// 1 once pit_grape6_initialize() has called g6_open(); set back to 0 when the
// clusters are closed after a hardware error.
static int grape6_open = 0;
// Id of the first cluster used by this process; -1 until obtained from
// MP_get_grape_id().  Cluster number ig6 is addressed as grape6_id+ig6.
static int grape6_id   = -1;
// Value returned by g6_npipes().
static int npipes;
// npipes * grape6_nclusters, computed on the first initialisation.
static int ntotalpipes;
// Number of clusters to drive, copied from ps->get_grape_nclusters().
static int grape6_nclusters;
// Staging buffers for the particles handed to g6calc_firsthalf0() /
// g6calc_lasthalf0().  Each is allocated with npipes*MAXG6CLUSTERS entries.
static int * gindex;
static vector * xi;
static vector * veli;
static vector * acci;
static vector * jerki;
static real * poti;
static real * eps2i;
static real * h2i;


// Loop header over all clusters in use; the loop variable is named ig6.
// Must be followed by a single statement or a braced block.
#define FOREACHG6CLUSTERS for(int ig6 = 0;ig6<grape6_nclusters;ig6++)

// Write all nbody particles of pb[] to every cluster with g6_set_j_particle().
//
// For each particle the call receives its grape index, index, time, timestep
// and grape mass, followed by a zero vector, ONE_SIXTH * old jerk,
// 0.5 * old acc, and pointers to its velocity and position.
void send_all(particle * pb, int nbody)
{
    vector zero3 = vector(0.0,0.0,0.0);

    for (int k = 0; k < nbody; ++k) {
        particle *p = pb + k;
        vector jerk_over_6 = ONE_SIXTH*p->get_old_jerk();
        vector acc_over_2 = 0.5L*p->get_old_acc();

        FOREACHG6CLUSTERS {
            g6_set_j_particle(grape6_id+ig6,
                              p->get_grape_index(),
                              p->get_index(),
                              p->get_time(),
                              p->get_timestep(),
                              p->get_grape_mass(),
                              (real*)&zero3,
                              (real*)&jerk_over_6,
                              (real*)&acc_over_2,
                              (real*)p->pget_vel(),
                              (real*)p->pget_pos());
        }
    }
}
    

// (Re)open the GRAPE-6 cluster(s) used by this process and load all particles.
//
//   pb, nbody : particle array to upload
//   nbh       : particles with get_index() < nbh are given a GRAPE mass of 0
//
// On the first call (grape6_id == -1) the staging buffers are allocated and
// the device id is obtained from MP_get_grape_id().  Every call then opens
// the clusters, assigns a GRAPE index, mass and a non-zero timestep to each
// particle that has none yet (get_grape_index() == -1), and runs the mode
// switch / reset / upload sequence below.  grape6_nclusters must already have
// been set by the caller.
//
// If the first particle reports a potential of exactly 0, this is taken to be
// the very first initialisation and every particle gets rough placeholder
// values for pot, acc and jerk.
//
// The order of the g6_* calls, the g6_dummywait() delay loops and the
// MP_sync() calls is kept exactly as it was; do not reorder it without
// checking against the hardware.
void pit_grape6_initialize(particle * pb, int nbody, int nbh)
{
    cerr << myid << " grape6_initialize called\n";

    if (grape6_id == -1){
        npipes = g6_npipes();
        ntotalpipes = npipes * grape6_nclusters;
        // buffers are sized for the maximum number of clusters
        gindex = new int[npipes*MAXG6CLUSTERS];
        xi = new vector[npipes*MAXG6CLUSTERS];
        veli = new vector[npipes*MAXG6CLUSTERS];
        acci = new vector[npipes*MAXG6CLUSTERS];
        jerki = new vector[npipes*MAXG6CLUSTERS];
        poti = new real[npipes*MAXG6CLUSTERS];
        eps2i = new real[npipes*MAXG6CLUSTERS];
        h2i = new real[npipes*MAXG6CLUSTERS];
        grape6_id = MP_get_grape_id();
        cerr << myid << " grape6 id = "<< grape6_id <<endl;
    }


    FOREACHG6CLUSTERS g6_open(grape6_id+ig6);
    grape6_open = 1;

    // Set some non-zero value for dt to keep the library happy...
    for(int i=0;i<nbody;i++)if (pb[i].get_grape_index() == -1) {
        pb[i].set_timestep(0.0078125);
        pb[i].set_grape_index(i);
        pb[i].set_grape_mass(pb[i].get_mass());
        if (pb[i].get_index()<nbh) pb[i].set_grape_mass(0);
    }


    MP_sync();
    FOREACHG6CLUSTERS g6_change_cbjpmode(grape6_id+ig6, 1);
    for(int i = 0; i< 1000; i++)g6_dummywait();
    MP_sync();
    FOREACHG6CLUSTERS g6_reset(grape6_id+ig6);
    FOREACHG6CLUSTERS g6_set_ijp_mode(grape6_id+ig6, 1);
    for(int i = 0; i< 1000; i++)g6_dummywait();
    MP_sync();
    send_all(pb, nbody);
    FOREACHG6CLUSTERS g6_setup_njdata(grape6_id+ig6, nbody);
    MP_sync();
    for(int i = 0; i< 1000; i++)g6_dummywait();
    FOREACHG6CLUSTERS g6_set_ijp_mode(grape6_id+ig6, 0);
    MP_sync();
    FOREACHG6CLUSTERS g6_change_cbjpmode(grape6_id+ig6, 0);
    for(int i = 0; i< 1000; i++)g6_dummywait();
    MP_sync();
    FOREACHG6CLUSTERS g6_reset(grape6_id+ig6);


    // h2i is handed to the g6calc_* calls at offset ifirst for every cluster,
    // so clear the whole allocation and not only the first cluster's slice
    // (new[] leaves the remaining entries uninitialised).
    for(int i = 0; i<npipes*MAXG6CLUSTERS;i++)h2i[i] = 0;
    if (pb->get_pot() == 0.0){
        // zero potential ... assume that this is the first time
        // and  fill some arbitrary values for force etc
        real mtotal = 0;
        real rinvtotal = 0;
        particle * bi = pb;
        for(int i=0;i<nbody;i++){
            mtotal+= bi->get_mass();
            rinvtotal += 1.0/sqrt(bi->get_pos()*bi->get_pos()+0.1);
            bi++;
        }
        // typical scales derived from the total mass and mean inverse radius
        real r = ((real)nbody)/rinvtotal;
        real ptyp = mtotal/r*30;
        real f = mtotal/r/r*30;
        real v = sqrt(mtotal/r);
        real j = f*v/r;
        vector atyp = vector(f,f,f);
        vector jtyp = vector(j,j,j);
        PRC(ptyp); PRC(atyp); PRL(jtyp);
        bi = pb;
        for(int i=0;i<nbody;i++){
            bi->set_pot(ptyp);
            bi->set_acc(atyp);
            bi->set_jerk(jtyp);
            bi++;
        }
    }

}

// Cast prefix that turns a vector* buffer into a pointer to real[3] rows, the
// form in which xi/veli/acci/jerki are passed to the g6calc_* calls.
// NOTE(review): relies on `vector` being laid out as three contiguous reals --
// confirm against the vector class.
#define cvector (real(*)[3])

// Selects where calculate_acc_and_jerk_for_list_on_grape6() inserts its
// conditional MP_sync() calls: the value is compared against 0 and against 1
// there; the initial value -1 matches neither test.
static int grape6_serialize_mode = -1;

// Store the serialization mode used by subsequent force calculations.
void set_grape6_serialize_mode(int mode)
{
    grape6_serialize_mode = mode;
}

// Value handed to g6_set_xunit().  It is incremented each time the device is
// (re)initialised in calculate_acc_and_jerk_for_list_on_grape6() and wraps
// from above 54 back to 51.
static int g6xunit = 52; // test to change G6 scaling after
                         // each hard error...
// Last value passed to g6_change_clock(): 1 after initialisation or a
// restore, 0 after the clock was switched at the end of a large force
// calculation.
static int g6_clock_state = 1;
// Compute acc, jerk and potential for the first n_next particles of nt on the
// GRAPE-6 hardware, repeating the whole evaluation after a hardware error.
//
//   ps         : owning system; supplies get_grape_nclusters() and
//                update_grape_data() for (re)initialisation
//   pb, nbody  : particle array and count, passed to the initialisation and
//                to the g6calc_* calls
//   nbh        : particles with get_index() < nbh use EPS2BH as softening
//   nt, n_next : particles to evaluate; nt[0].next_time is the time they are
//                predicted to
//   eps2       : softening value for all other particles
//
// The list is processed in batches of ntotalpipes particles, each batch being
// split into slices of npipes particles, one slice per cluster.  The batch
// loop runs up to n_next_max rather than n_next; iterations past the end of
// this process's own list have np <= 0 and only execute the MP_sync() calls.
//
// If MP_get_grape_error() reports an error, all clusters are reset and
// closed, grape6_open is cleared, and the do/while loop starts over, which
// re-initialises the hardware before recomputing.
//
// The sequence of g6_* and MP_sync() calls is order sensitive and is kept
// exactly as it was.
void calculate_acc_and_jerk_for_list_on_grape6(particle_system* ps,
                                               particle* pb,
                                               int nbody,
                                               int nbh,
                                               node_time* nt,
                                               int  n_next,
                                               real eps2)
{

    int error;
    grape6_nclusters = ps->get_grape_nclusters();
    // one slot per cluster; 5 is the value of MAXG6CLUSTERS
    int local_error[5];
    int errorcount = 0;
    // NOTE(review): presumably the maximum of n_next over all processes --
    // confirm against MP_intmax()
    int n_next_max = MP_intmax(n_next);
    do{
        error = 0;
        for(int i = 0 ; i< grape6_nclusters; i++) local_error[i] = 0;
        if (grape6_open == 0) {
            // first use, or re-open after a hardware error: change the
            // scaling unit, then initialise and reload the particle data
            g6xunit ++;
            if (g6xunit > 54) g6xunit = 51;
            g6_set_xunit(g6xunit);
            pit_grape6_initialize(pb, nbody, nbh);
            g6_clock_state = 1;
            ps->update_grape_data(nbody);
        }else{
            // restore clock...
            if (g6_clock_state == 0){
                FOREACHG6CLUSTERS g6_change_clock(grape6_id+ig6,1);
                g6_clock_state = 1;
            }
        }
        if ((grape6_id == 1) && (grape6_serialize_mode == 1)) MP_sync();
        real sys_t = nt[0].next_time;
        for (int i = 0; i < n_next; i++) {
            particle *bi = nt[i].pptr;
            bi->predict_loworder(sys_t);
        }
        FOREACHG6CLUSTERS g6_set_ti(grape6_id+ig6,sys_t);
        for (int i = 0; i < n_next_max; i+= ntotalpipes) {
            for(int k = 0; k<ntotalpipes;k++)eps2i[k] = eps2;
            // number of particles of this process in the current batch;
            // may be <= 0 once i has passed n_next, in which case nt is
            // not accessed at all
            int np = ntotalpipes;
            if (i+np > n_next) np = n_next - i;
            for (int ii = 0; ii < np; ii++){
                particle *bi = nt[i+ii].pptr;
                gindex[ii] = bi->get_index();
                if (gindex[ii] < nbh) eps2i[ii]=EPS2BH;
                xi[ii] = bi->get_pred_pos();
                veli[ii] = bi->get_pred_vel();
                acci[ii] = bi->get_acc();
                jerki[ii] = bi->get_jerk()*0.1;
                poti[ii] = bi->get_pot();
            }
            int mode = 2;
            if ((grape6_id == 1) && (grape6_serialize_mode == 0)) MP_sync();
            if ( (np > 0) && (error == 0)){
                // first half: hand each cluster its slice of the batch
                int ic, ifirst;
                for(ic=0,ifirst=0;ic<grape6_nclusters;ic++,ifirst+=npipes){
                    int npreal = np-ifirst;
                    if (npreal > npipes) npreal = npipes;
                    //              cerr << "Calling firsthalf " << ic << " " << npreal << endl;
                    if (npreal > 0){
                        g6calc_firsthalf0(grape6_id+ic,nbody, npreal, gindex+ifirst,
                                          (cvector xi)+ifirst, (cvector veli)+ifirst,
                                          (cvector acci)+ifirst, (cvector jerki)+ifirst,
                                          poti+ifirst, eps2i+ifirst, h2i+ifirst,mode);
                    }
                }
            }
            if ((grape6_id == 0) && (grape6_serialize_mode == 0)) MP_sync();
            if (grape6_serialize_mode == 0)MP_sync();
            if ((grape6_id == 1)&&( grape6_serialize_mode == 0)) MP_sync();
            if ( (np > 0) && (error == 0)){
                // second half: collect the results and the error status
                int ic, ifirst;
                for(ic=0,ifirst=0;ic<grape6_nclusters;ic++,ifirst+=npipes){
                    int npreal = np-ifirst;
                    if (npreal > npipes) npreal = npipes;
                    if (npreal > 0){
                        local_error[ic] = g6calc_lasthalf0(grape6_id+ic, nbody, npreal, gindex+ifirst,
                                                           (cvector xi)+ifirst, (cvector veli)+ifirst,
                                                           eps2i+ifirst, h2i+ifirst,
                                                           (cvector acci)+ifirst, (cvector jerki)+ifirst,
                                                           poti+ifirst,mode);
                        if(local_error[ic])  {
                            cerr << myid << " GRAPE-6 local hardware error on "
                                 << grape6_id+ic << " at i = " << i<<endl;
                            g6_print_chip_status(grape6_id+ic);
                        }
                        error |= local_error[ic];
                    }
                }
                if (error){
                    error = 1;
                }else{
                    // NOTE(review): both g6_changed_index_() and
                    // g6_changed_index() are called below; confirm that the
                    // two spellings refer to the same library query.
                    if (g6_changed_index_() >= 0){
                        particle *bi = nt[i+g6_changed_index_() ].pptr;
                        cerr << "Scaling change for index = " << g6_changed_index_()
                             <<" Dt = "<< bi->get_timestep()<<endl;
                        cerr << "Jold = " << bi->get_jerk() <<endl;
                        cerr << "Jnew = " << jerki[g6_changed_index()]<<endl;
                        cerr << "Fold = " << bi->get_acc() <<endl;
                        cerr << "Fnew = " << acci[g6_changed_index()]<<endl;
                    }
                    // results are stored only for an error-free batch
                    for (int ii = 0; ii < np; ii++){
                        particle *bi = nt[i+ii].pptr;
                        bi->set_acc(acci[ii]);
                        bi->set_jerk(jerki[ii]);
                        bi->set_pot(poti[ii]);
                    }
                }
            }
            if ((grape6_id == 0) && (grape6_serialize_mode == 0))MP_sync();

        }
        if ((grape6_id == 0) && (grape6_serialize_mode == 1)) MP_sync();
        if (MP_get_grape_error(error)){
            int mode = 0;
            FOREACHG6CLUSTERS g6_set_ijp_mode(grape6_id+ig6, mode);
            if (error == 0) cerr << "GRAPE hardware error" <<endl;
            errorcount ++;
            if (errorcount > 0){
                FOREACHG6CLUSTERS {
                    //              if (local_error[ig6]||local_error[0]){
                    g6_reset(grape6_id+ig6);
                    g6_reset_fofpga(grape6_id+ig6);
                    g6_close(grape6_id+ig6);
                    //}
                }
                // forces re-initialisation on the next pass of the loop
                grape6_open = 0;
                errorcount = 0;
            }
            error = 1;
        }
    }while (error);
    if (n_next > 3000){
        if (grape6_nclusters > 1){
            g6_change_clock(grape6_id,0);
            g6_clock_state = 0;
        }
    }
}

#if 0
// Old version...
// Kept for reference only: everything up to the matching #endif is compiled
// out.  It writes each particle to every cluster separately with
// g6_set_j_particle() and then flushes a per-cluster buffer.
// NOTE: this would not build as written -- the j6/a2 initialisers below
// ("bi->ONE_SIXTH*get_jerk()", "bi->0.5L*get_acc()") are malformed.
void particle_system::update_grape_data(int  n_next)
{
    //    cerr << "Update grape data "; PRC(n_next);PRL(time);
    vector j218 = vector(0.0,0.0,0.0);

    FOREACHG6CLUSTERS g6_change_cbjpmode(grape6_id+ig6, 1);
    MP_sync();
    FOREACHG6CLUSTERS g6_reset(grape6_id+ig6);
    //    usleep(SLEEPTIME);
    FOREACHG6CLUSTERS g6_set_ijp_mode(grape6_id+ig6, 1);
    for (int i = 0; i< 200; i++) g6_dummywait();
    FOREACHG6CLUSTERS g6_reset(grape6_id+ig6);
    FOREACHG6CLUSTERS g6_initialize_jp_buffer(grape6_id+ig6,n);
    for (int i = 0; i < n_next; i++) {
	particle *bi = nt[i].pptr;
	vector j6 = bi->ONE_SIXTH*get_jerk();
	vector a2 = bi->0.5L*get_acc();
	FOREACHG6CLUSTERS g6_set_j_particle(grape6_id+ig6,bi->get_grape_index(),
			   bi->get_index(),
			   bi->get_time(),
			   bi->get_timestep(),
			   bi->get_grape_mass(),
			   (real*)&j218,
			   (real*)&j6,
			   (real*)&a2,
			   (real*)bi->pget_vel(),
			   (real*)bi->pget_pos());
	if (bi->get_index() < 0) cerr << "mass set for index " << bi->get_index() << " "
					<<bi->get_grape_index() << "  "
				        << bi->get_grape_mass() <<endl;
    }

    // MP_sync below is to serialize the use of multicast network. Seemed to be necessary
    // on SMP alpha with multiple PCI bus, but not needed for a cluster of 1CPU P4 boxen...
    // VERY strange.
    //    if (grape6_id == 1) MP_sync();
    FOREACHG6CLUSTERS g6_flush_jp_buffer(grape6_id+ig6);
    //    if (grape6_id == 0) MP_sync();

    FOREACHG6CLUSTERS g6_set_ijp_mode(grape6_id+ig6,0);
    MP_sync();
    FOREACHG6CLUSTERS g6_change_cbjpmode(grape6_id+ig6,0);
    MP_sync();
    FOREACHG6CLUSTERS g6_reset(grape6_id+ig6);

}

#endif
void particle_system::update_grape_data(int  n_next)
{
    //    cerr << "Update grape data "; PRC(n_next);PRL(time);
    vector j218 = vector(0.0,0.0,0.0);

    //#define JP_NO_BUFFER

#ifdef JP_NO_BUFFER
    if (g6_clock_state ==0){
	g6_change_clock(grape6_id,1);
	g6_clock_state = 1;
    }
    FOREACHG6CLUSTERS g6_change_cbjpmode(grape6_id+ig6, 1);
    MP_sync();
    FOREACHG6CLUSTERS g6_reset(grape6_id+ig6);
    //    usleep(SLEEPTIME);
    FOREACHG6CLUSTERS g6_set_ijp_mode(grape6_id+ig6, 1);
    for (int i = 0; i< 200; i++) g6_dummywait();
    FOREACHG6CLUSTERS g6_reset(grape6_id+ig6);
#endif
    
#ifndef JP_NO_BUFFER
    g6_initialize_jp_buffer(grape6_id,n);
#endif    

    for (int i = 0; i < n_next; i++) {
	particle *bi = nt[i].pptr;
	vector j6 = ONE_SIXTH*bi->get_jerk();
	vector a2 = 0.5L*bi->get_acc();
	g6_set_j_particle_multisend(grape6_id,grape6_nclusters,
				    bi->get_grape_index(),
			   bi->get_index(),
			   bi->get_time(),
			   bi->get_timestep(),
			   bi->get_grape_mass(),
			   (real*)&j218,
			   (real*)&j6,
			   (real*)&a2,
			   (real*)bi->pget_vel(),
			   (real*)bi->pget_pos());
	if (bi->get_index() < 0) cerr << "mass set for index " << bi->get_index() << " "
					<<bi->get_grape_index() << "  "
				        << bi->get_grape_mass() <<endl;
    }

    // MP_sync below is to serialize the use of multicast network. Seemed to be necessary
    // on SMP alpha with multiple PCI bus, but not needed for a cluster of 1CPU P4 boxen...
    // VERY strange.
    //    if (grape6_id == 1) MP_sync();

#ifndef JP_NO_BUFFER
    if (g6_clock_state ==0){
	g6_change_clock(grape6_id,1);
	g6_clock_state = 1;
    }
    FOREACHG6CLUSTERS g6_change_cbjpmode(grape6_id+ig6, 1);
    MP_sync();
    FOREACHG6CLUSTERS g6_reset(grape6_id+ig6);
    //    usleep(SLEEPTIME);
    FOREACHG6CLUSTERS g6_set_ijp_mode(grape6_id+ig6, 1);
    for (int i = 0; i< 200; i++) g6_dummywait();
    FOREACHG6CLUSTERS g6_reset(grape6_id+ig6);
    for (int i = 0; i< 200; i++) g6_dummywait();
    g6_flush_jp_buffer_and_multisend(grape6_id, grape6_nclusters);
#endif


    //    if (grape6_id == 0) MP_sync();

    FOREACHG6CLUSTERS g6_set_ijp_mode(grape6_id+ig6,0);
    MP_sync();
    FOREACHG6CLUSTERS g6_change_cbjpmode(grape6_id+ig6,0);
    MP_sync();
    FOREACHG6CLUSTERS g6_reset(grape6_id+ig6);

}



// Host-only evaluation for the first n_next particles of nt.
//
// Pass 1 predicts every listed particle to nt[0].next_time and clears its
// interaction accumulators.  Pass 2 accumulates, for each listed particle,
// the contribution of every particle in pb[0..nbody) with a different
// get_index().  A pair uses EPS2BH instead of eps2 when either member has
// get_index() < nbh.
void calculate_acc_and_jerk_for_list_on_host(particle* pb,
                                             int nbody,
                                             int nbh,
                                             node_time* nt,
                                             int  n_next,
                                             real eps2)
{
    real t_now = nt[0].next_time;

    // pass 1: predict and reset all targets before any accumulation
    for (int k = 0; k < n_next; ++k) {
        particle *target = nt[k].pptr;
        target->predict_loworder(t_now);
        target->clear_interaction();
    }

    // pass 2: direct summation over all sources
    for (int k = 0; k < n_next; ++k) {
        particle *target = nt[k].pptr;
        for (int m = 0; m < nbody; ++m) {
            particle *source = pb + m;
            real soft2 = eps2;
            if ((target->get_index() < nbh) || (source->get_index() < nbh)) {
                soft2 = EPS2BH;
            }
            if (target->get_index() != source->get_index()) {
                target->accumulate_acc_and_jerk(source, soft2);
            }
        }
    }
}


// Force evaluation for the first n_next particles of nt:
//   1. collect and distribute the BH data,
//   2. record this process's id in myid and pass the process count to
//      g6_setmccount(),
//   3. compute the forces on the GRAPE-6,
//   4. add the BH contribution on the host.
//
// restart_grape is not read or written by this function.
// The #if 0 sections are disabled debugging aids.
void particle_system::calculate_acc_and_jerk_for_list(int n_next,
                                                      bool &restart_grape)
{
#if 0
    cerr << "Enter calculate_acc_and_jerk_for_list" << endl;
    predict_loworder_all(time);
    calculate_acc_and_jerk_for_list_on_host(pb, n, nbh, nt, n_next,
                                            eps2);
    MP_add_acc_and_jerk_for_list_from_other_host(pb, n, nbh, nt, n_next,
                                                 eps2,nprocessors,myprocid);
    cerr << "Force on host" << endl;
    for(int i = 0;i<n_next;i++){
        particle * bi = nt[i].pptr;
        {PRC(i); PRC(bi->get_acc());PRC(bi->get_jerk());PRL(bi->get_pot());}
    }
#endif

    collect_BH_data();
    MP_distribute_BH_data();

    myid = myprocid;
    g6_setmccount(nprocessors);
    calculate_acc_and_jerk_for_list_on_grape6(this, pb, n, nbh,
                                              nt, n_next, eps2);

#if 0
    cerr << "Force on GRAPE before BH correction" << endl;
    for(int i = 0;i<n_next;i++){
        particle * bi = nt[i].pptr;
        {PRC(i); PRC(bi->get_acc());PRC(bi->get_jerk());PRL(bi->get_pot());}
    }
#endif
    // need to take care of force from BH
    calculate_acc_and_jerk_from_bh_for_list(n_next);
#if 0
    sleep(myid*1);
    cerr << "Force on GRAPE" << endl;
    for(int i = 0;i<n_next;i++){
        particle * bi = nt[i].pptr;
        {PRC(i); PRC(bi->get_acc());PRC(bi->get_jerk());PRL(bi->get_pot());}
    }
    MP_sync();
#endif
}

// Call g6_close() on each of the grape6_nclusters clusters, starting at
// grape6_id.
// NOTE(review): grape6_open is not cleared here, unlike the error path in
// calculate_acc_and_jerk_for_list_on_grape6(), so a force calculation after
// this call would not re-open the hardware -- confirm this is only called at
// shutdown.
void close_grape()
{
    FOREACHG6CLUSTERS g6_close(grape6_id+ig6);
}
