#include <oxstd.h>
#include <oxfloat.h>
#include <packages/oxmpi/loop.oxh>
#ifdef OX_MPI
	#include <packages/oxmpi/oxmpi.oxh>
#endif

// Message tags used for all point-to-point communication in this file.
//   TAG_BLOCK  - manager -> worker: a work assignment (or the stop message),
//                and also the initial seed sent at the start of RunEx.
//   TAG_RESULT - worker -> manager: the results of one block, and
//                manager -> worker: the final aggregate result.
enum
{	TAG_BLOCK = 10, TAG_RESULT
};

// Switches output from worker processes on or off.
//   fSet - non-zero to allow workers to print, zero to silence them.
// The flag is stored first, so that a first-time Init() already sees it.
// When there are workers, the flag is also passed on (negated) to
// OxMPI_SetMaster; the first argument -1 presumably leaves the master
// status unchanged (TODO confirm against the OxMPI documentation).
Loop::SetWorkerOutput(const fSet)
{
	sm_fWorkerOutput = fSet;

#ifdef OX_MPI
	Init();
	if (sm_cWorkers)
	{
		decl fSilenceWorkers = !sm_fWorkerOutput;
		OxMPI_SetMaster(-1, fSilenceWorkers);
	}
#endif
}
// Sets the verbosity level.
//   iVerbose - values > 0 enable the summary messages printed in this file,
//              values > 1 additionally enable the per-block manager messages.
Loop::SetVerbose(const iVerbose)
{	sm_iVerbose = iVerbose;
}
// Sets the number of replications handed to a worker in one block.
//   cBlockSize - requested block size; a value <= 0 makes the manager
//                choose a default block size.
Loop::SetBlockSize(const cBlockSize)
{	sm_cBlockSize = cBlockSize;
}
// Alternative name for UseIterSeed: forwards the argument and returns
// whatever UseIterSeed returns (the previous setting).
Loop::UseRanSeed(const bUse)
{
	decl bPrevious = UseIterSeed(bUse);
	return bPrevious;
}
// Gets and optionally sets the flag that controls per-iteration seeding.
//   bUse - new setting when >= 0; a negative value only queries.
// Returns the setting that was in effect before the call.
Loop::UseIterSeed(const bUse)
{
	decl bPrevious = sm_bUseIterSeed;

	if (bUse >= 0)
	{
		sm_bUseIterSeed = bUse;
	}
	return bPrevious;
}


// Determines the column count of a result.
//   mAdd - a result matrix, or an array of result matrices.
// Returns 0 when mAdd is empty. For an array, returns the column count
// of the first element that has a non-zero column count (0 if none has).
// Otherwise returns the column count of mAdd itself.
Loop::getResultSize(const mAdd)
{
	if (sizerc(mAdd) == 0)		 // nothing there
		return 0;

	if (!isarray(mAdd))			 // single matrix: its own column count
		return sizec(mAdd);

	// array: scan until an element with columns is found
	decl cCols = 0, iElem = 0, cElems = sizeof(mAdd);
	while (cCols == 0 && iElem < cElems)
	{
		cCols = sizec(mAdd[iElem]);
		++iElem;
	}
	return cCols;
}
// Stores the result of one replication as column iRep of the storage.
//   amResult  - address of the storage (matrix, or array of matrices)
//   mAdd      - result of the replication (matrix or array of matrices)
//   iRep      - column index at which to store
//   cRep      - number of columns to allocate when storage is created
//   fnProcess - optional function; if it is a function, it is called as
//               fnProcess(iRep, mAdd) and its return value is stored instead
// The storage is created on first use, filled with .NaN, with one row per
// element of the vectorized result, to avoid growing it on every call.
// Returns 0 if there was nothing to store, 1 otherwise.
Loop::storeResult(const amResult, mAdd, const iRep, const cRep, const fnProcess)
{
	if (isfunction(fnProcess))	 // optional post-processing comes first
		mAdd = fnProcess(iRep, mAdd);

	if (sizerc(mAdd) == 0)
		return 0;

	decl fCreate = (sizerc(amResult[0]) == 0);	// no storage yet

	if (isarray(mAdd))
	{
		decl iElem, cElems = sizeof(mAdd);

		if (fCreate)
			amResult[0] = new array[cElems];
		for (iElem = 0; iElem < cElems; ++iElem)
		{
			if (fCreate)
				amResult[0][iElem] = constant(.NaN, sizerc(mAdd[iElem]), cRep);
			amResult[0][iElem][][iRep] = vecr(mAdd[iElem]);
		}
	}
	else
	{
		if (fCreate)
			amResult[0] = constant(.NaN, sizerc(mAdd), cRep);
		amResult[0][][iRep] = vecr(mAdd);
	}
	return 1;
}
// Stores a block of results in the storage, starting at column iRep.
//   amResult - address of the storage (matrix, or array of matrices)
//   mAdd     - block of results (matrix or array of matrices)
//   iRep     - index of the first column to write
//   cRepTot  - number of columns to allocate when storage is created
// The storage is created on first use, filled with .NaN, with the same
// number of rows as the block.
// Returns 0 if there was nothing to store, 1 otherwise.
Loop::storeResultsBlock(const amResult, mAdd, const iRep, const cRepTot)
{
	if (sizerc(mAdd) == 0)
		return 0;

	decl fCreate = (sizerc(amResult[0]) == 0);	// no storage yet

	if (isarray(mAdd))
	{
		decl iElem, cElems = sizeof(mAdd);

		if (fCreate)
			amResult[0] = new array[cElems];
		for (iElem = 0; iElem < cElems; ++iElem)
		{
			if (fCreate)
				amResult[0][iElem] = constant(.NaN, sizer(mAdd[iElem]), cRepTot);
			amResult[0][iElem][][iRep : ] = matrix(mAdd[iElem]);
		}
	}
	else
	{
		if (fCreate)
			amResult[0] = constant(.NaN, sizer(mAdd), cRepTot);
		amResult[0][][iRep : ] = matrix(mAdd);
	}
	return 1;
}
// Trims unused columns from the storage.
//   amResult - address of the storage (matrix, or array of matrices)
//   cRep     - number of columns actually filled
//   cRepTot  - number of columns that were allocated
// When fewer columns were filled than allocated, every stored matrix with
// more than cRep columns is cut back to its first cRep columns.
Loop::checkResult(const amResult, const cRep, const cRepTot)
{
	if (!(cRep < cRepTot))		 // everything was used: nothing to trim
		return;

	if (!isarray(amResult[0]))
	{
		if (sizec(amResult[0]) > cRep)
			amResult[0] = amResult[0][][: cRep - 1];
		return;
	}

	decl iElem, cElems = sizeof(amResult[0]);
	for (iElem = 0; iElem < cElems; ++iElem)
	{
		if (sizec(amResult[0][iElem]) > cRep)
			amResult[0][iElem] = amResult[0][iElem][][: cRep - 1];
	}
}

// Worker side of the parallel loop.
//   iManager  - rank of the manager process
//   fnRun     - replication function, called as fnRun(iteration index)
//   fnProcess - optional processing function, passed on to storeResult
//   acReject  - address that receives the second element of the manager's
//               stop message (the manager sends its reject count there)
// Repeatedly receives a block assignment from the manager, runs it, and
// sends the block's results back, until a block with a non-positive
// replication count arrives. Then receives the aggregate result from the
// manager and returns it.
// Without OX_MPI the body is empty and nothing is returned.
Loop::doLoopAsWorker(const iManager, const fnRun, const fnProcess, const acReject)
{
#ifdef OX_MPI
	// NOTE(review): seed and iret are declared but not used in this function
	decl repseed, seed, i0, i, k, c, cb, iret, mresult, mrep, crep;

	// k counts all replications run here, cb counts the blocks received
	for (k = cb = 0; ; ++cb)
	{
		// wait for the next assignment from the manager:
		// element 0 is the number of replications, element 1 the first
		// iteration index of the block
		repseed = OxMPI_Recv(iManager, TAG_BLOCK);
		crep = repseed[0];
		i0 = repseed[1];
		if (crep <= 0)			// stop message
			break;
		
		// run this block of experiments; c counts the successful ones
		for (i = c = 0, mresult = <>; i < crep; ++i, ++k)
		{
			ranloopseed(i0 + i, 0);
			mrep = fnRun(i0 + i);
			if (sizerc(mrep) > 0)		// otherwise failure
			{
				storeResult(&mresult, mrep, c, crep, fnProcess);
				++c;
			}
		}
		// check the result (removing failed iters) and return to the manager
		checkResult(&mresult, c, crep);
		OxMPI_Send(mresult, iManager, TAG_RESULT);
	}
	// receive aggregate to get all processes in the same state
	mresult = OxMPI_Recv(iManager, TAG_RESULT);
	// i0 now holds the second element of the stop message
	acReject[0] = i0;

	if (sm_iVerbose > 0)
	    println(k, " replications done on ", OxMPI_Get_processor_name(),
			"(", OxMPI_Comm_rank(), "), in ", cb, " blocks");
	
	return mresult;
#endif // OX_MPI
}	
// Manager side of the parallel loop.
//   cWorkers  - number of worker processes (ranks 0 .. cWorkers - 1)
//   cRep      - number of successful replications required
//   fnProcess - not used here: processing is applied on the workers
//   acReject  - address that receives the number of iterations handed out
//               in excess of cRep
// Hands out blocks of iteration indices to the workers, collects the
// returned blocks of results until cRep replications have been stored,
// then sends every worker a stop message followed by the aggregate result.
// Returns the aggregate result.
// Without OX_MPI the body is empty and nothing is returned.
Loop::doLoopAsManager(const cWorkers, const cRep, const fnProcess, const acReject)
{
#ifdef OX_MPI
	decl itno, i, j, jnext, c, cstarted, status, citer, creject, cb;
	decl mresult = <>, mrep, vcstarted;

	// set the blocksize, using default if none specified
	if (sm_cBlockSize > 0)
	{	cb = sm_cBlockSize;
		if (cb * cWorkers > cRep)	// too large for even one round: reduce
			cb = max(int((cRep / cWorkers) / 2), 1);
	}
	else
		cb = max(min(int((cRep / cWorkers) / 10), 1000), 1);
	if (sm_iVerbose > 0)
		println("M=", cRep, " block size=", cb);

	// start-up all workers; vcstarted[j] is the size of the block that
	// worker j is currently working on
	vcstarted = zeros(1, cWorkers);
	for (j = cstarted = 0; j < cWorkers; ++j)
	{
		OxMPI_Send(cb ~ cstarted, j, TAG_BLOCK);
		vcstarted[j] = cb;
		cstarted += cb;
	}
	citer = cstarted;			 // this is the actual iteration index

	// replication loop, repeat while any left to do
	for (itno = 0; itno < cRep; )
	{
		// wait (hopefully efficiently) until a result comes in
		status = OxMPI_Probe(OxMPI_ANY_SOURCE, TAG_RESULT);
		jnext = status[0];

		// process this replication; check on others to prevent starvation
		for (i = 0; itno < cRep && i < cWorkers; ++i)
		{
			j = i + jnext;		 // first time (i=0): j==jnext
			// wrap around so that each worker is visited exactly once;
			// i and jnext are both below cWorkers, so one subtraction
			// is enough
			if (j >= cWorkers)
				j -= cWorkers;
			if (i)				 // next times: non-blocking probe of j
			{	status = OxMPI_Iprobe(j, TAG_RESULT);
				if (!sizerc(status))
					continue;	 // try next one if nothing pending
			}
			// now get results from worker j
			mrep = OxMPI_Recv(j, TAG_RESULT);
			// get actual no replications done by j
			c = getResultSize(mrep);
			cstarted += c - vcstarted[j]; // adjust number started
			
			if (cstarted < cRep) // yes: more work to do
			{	// shrink block size if getting close to finishing
				if (cstarted > cRep - cWorkers * cb)
					cb = max(int(cb / 2), 1);
				OxMPI_Send(cb ~ citer, j, TAG_BLOCK);
				vcstarted[j] = cb;
				cstarted += cb;
				citer += cb;
			}
			if (sm_iVerbose > 1)
			{
				println("Received ", c, " replications from worker ", j);
				if (cstarted - cb < cRep)
					println("Sent ", cb, " replications to ", j, " left=", cRep - itno, " total started=", citer);
			}
			// finally: store the results before probing next
			storeResultsBlock(&mresult, mrep, itno, cRep);
			itno += c;
		}
	}

	// close down workers: a block of size 0 is the stop message, and its
	// second element carries the reject count to the worker
	creject = citer - cRep;
	for (j = 0; j < cWorkers; ++j)
	{
		OxMPI_Send(0 ~ creject, j, TAG_BLOCK);
		OxMPI_Send(mresult, j, TAG_RESULT);
	}

	acReject[0] = creject;
	return mresult;
#endif // OX_MPI
}
// Serial version of the loop, used when there are no worker processes.
//   cRep       - number of successful replications required
//   fnRun      - replication function, called as fnRun(iteration index)
//   fnProcess  - optional processing function, passed on to storeResult
//   acReject   - address that receives the number of rejected iterations
//   bBlockSeed - not used in this function; kept for interface compatibility
// Runs iterations 0, 1, 2, ... until cRep of them have produced a result,
// storing each accepted result as the next column of the storage.
// Returns the stored results.
Loop::doLoop(const cRep, const fnRun, const fnProcess, const acReject,
	const bBlockSeed)
{
	decl i, itno, c, mresult = <>, mrep;

	for (itno = i = 0; itno < cRep; ++i)
	{
		ranloopseed(i, 0);
		mrep = fnRun(i);
		c = getResultSize(mrep);
		storeResult(&mresult, mrep, itno, cRep, fnProcess);
		// storeResult fills exactly one column, so an accepted replication
		// advances the column index by one, whatever the shape of the
		// result (the column count c may exceed one for row vectors or
		// matrices); this matches the counting in doLoopAsWorker
		if (c > 0)
			++itno;
	}

	acReject[0] = i - itno;		 // iterations run minus iterations accepted
	return mresult;
}

// Runs the loop without an initialization or processing function.
//   fnRun - replication function
//   cRep  - number of replications required
// Returns the result of RunEx(0, fnRun, cRep, 0).
Loop::Run(const fnRun, const cRep)
{
	decl mOut = RunEx(0, fnRun, cRep, 0);
	return mOut;
}
// Runs the loop: serially, or as manager or worker under MPI.
//   fnInit    - optional initialization function; if it is a function, it is
//               called (without arguments) on every process but the manager
//   fnRun     - replication function, called as fnRun(iteration index)
//   cRep      - number of replications required
//   fnProcess - optional processing function, passed on to the loop
//   ...       - optional: the first extra argument, if present, is treated
//               as an address that receives the reject count
// Returns the result of whichever loop function this process ran.
Loop::RunEx(const fnInit, const fnRun, const cRep, const fnProcess, ...)
{
	// NOTE(review): sran is declared but not used in this function
	decl creject = 0, mresult, sran, seed0;

	Init();						// initialization must be done once

#ifdef OX_MPI
	if (IsManager())			// get all processes in the same RNG state
	{
		// send the value returned by ranseed(0) to every worker
		seed0 = ranseed(0);
		for (decl j = 0; j < sm_cWorkers; ++j)
		{
			OxMPI_Send(seed0, j, TAG_BLOCK);
		}
	}
	else if (IsWorker())
	{
		// adopt the seed received from the manager
		seed0 = OxMPI_Recv(sm_iManager, TAG_BLOCK);
		ranseed(seed0);
	}
	OxMPI_Barrier();
#endif

	// the manager runs no replications, so it skips the user initialization
	if (isfunction(fnInit) && !IsManager())
	{
		fnInit();
	}	

	// stage -1 is passed to ranloopseed before the loop, stage 1 after it
	if (sm_bUseIterSeed)
		ranloopseed(0, -1);
		
	// dispatch on the role of this process; the serial loop is used when
	// the process is neither worker nor manager
	if (IsWorker())
		mresult = doLoopAsWorker(sm_iManager, fnRun, fnProcess, &creject);
	else if (IsManager())
		mresult = doLoopAsManager(sm_cWorkers, cRep, fnProcess, &creject);
	else
		mresult = doLoop(cRep, fnRun, fnProcess, &creject, sm_bUseIterSeed);

	if (sm_bUseIterSeed)
		ranloopseed(cRep, 1);

	// report the reject count through the first optional argument
	decl args = va_arglist();
	if (sizeof(args))
		args[0][0] = creject;
		
#ifdef OX_MPI
	OxMPI_Barrier();
#endif
	return mresult;
}

// One-time initialization; later calls return immediately.
// Under OX_MPI: initializes MPI, sets the number of workers to the number
// of processes minus one, and makes the process with the highest rank the
// manager. With verbose output enabled, each process prints its role.
Loop::Init()
{
	if (sm_fInitialized)
		return;

#ifdef OX_MPI
	OxMPI_Init();

	sm_cWorkers = OxMPI_Comm_size() - 1;
	sm_iManager = sm_cWorkers;	 // manager has the highest rank
	if (sm_cWorkers)
	{
		decl fIsManager = (OxMPI_Comm_rank() == sm_cWorkers);
		OxMPI_SetMaster(fIsManager, !sm_fWorkerOutput);
	}

	if (sm_iVerbose > 0)
	{
		decl iRank = OxMPI_Comm_rank();
		decl sHost = OxMPI_Get_processor_name();

		if (!sm_cWorkers)
			println("Only one process");
		else if (OxMPI_IsMaster())
			println("Manager", " on ", sHost, "(", iRank, ")");
		else
			println("Worker", " on ", sHost, "(", iRank, ")");
	}
#endif

	sm_fInitialized = TRUE;
}
// Returns the value of OxMPI_IsMaster() when running under MPI with at
// least one worker; returns 0 otherwise (no workers, or no MPI).
// Initializes on first use.
Loop::IsManager()
{
#ifdef OX_MPI
	if (!sm_fInitialized)
		Init();
	if (sm_cWorkers)
		return OxMPI_IsMaster();
#endif
	return 0;
}
// Returns the negation of OxMPI_IsMaster() when running under MPI with at
// least one worker; returns 0 otherwise (no workers, or no MPI).
// Initializes on first use.
Loop::IsWorker()
{
#ifdef OX_MPI
	if (!sm_fInitialized)
		Init();
	if (sm_cWorkers)
		return !OxMPI_IsMaster();
#endif
	return 0;
}
// Returns a time stamp: the value of OxMPI_Wtime() under MPI (initializing
// on first use), otherwise the value of timer() divided by 100 to
// translate it to seconds.
Loop::Timer()
{
#ifdef OX_MPI
	if (!sm_fInitialized)
		Init();
	decl dNow = OxMPI_Wtime();
	return dNow;
#else
	decl cTicks = timer();
	return cTicks / 100;		 // translate to seconds
#endif
}