#ifndef lint
/* SCCS identification string recording this file's path and revision date.
   It is left out of the build when `lint` is defined. */
static char SCCSid[] = "@(#) ./comm/global/gsetop.c 07/23/93";
#endif

#include "comm/comm.h"
#include "comm/procset.h"
#include "comm/global/global.h"
#include <stdio.h>

/*
     Gsetop - apply a global operation on a subset of processors

     Algorithm: 
     All the hard stuff is in the ProcSet structure.  Basically,
     a virtual tree is constructed in the processor set structure; this
     tree is used in all operations.  In addition, the access to the
     tree is designed so that if messages are to be received from
     two children, they may be received in any order.

.     gsetop   - generic routine
.     gisumset - integer sum reduction
.     gdsumset - double sum reduction
.     gihighset - integer max reduction
.     gilowset  - integer min reduction
.     gsyncset  - barrier

     Notes:
     By including the distribution of the results back to the senders as
     part of the definition of these routines, we can optimize the 
     communication in ways that can not be done with "half" operations.
  */

/*+
    gsetopT - Tree-oriented combine operation.

    Every processor folds the contributions of its children into val, hands
    the partial result to its parent, waits for the final value to come back
    down, and then forwards that value to its own children.

    Parameters:
.   val      - on entry the local contribution; on exit the value that was
               passed back down the tree (on a processor with no parent,
               the value accumulated locally)
.   n        - number of elements, handed to op
.   work     - scratch buffer that the children's contributions are
               received into
.   procset  - processor set that supplies the tree links; when null the
               links are taken from the _PI* variables instead
.   elmsize  - size of one element; n * elmsize is the message length
.   datatype - passed unchanged to the send/receive macros
.   op       - combiner, invoked as (*op)( val, work, n )

    Notes:
    To keep execution deterministic, a message going up the tree is tagged
    according to whether its sender is the left or the right child of its
    parent, and the left child's message is always combined first.
+*/
void gsetopT( val, n, work, procset, elmsize, datatype, op )
void    *val;
int     n;
void    *work;
ProcSet *procset;
int     elmsize;
int     datatype;
void    (*op)();
{
    int nbytes = n * elmsize;
    int left, right, up, is_left, recv_id;
    int tag_up, tag_dn, phase;

    /* Pick up the tree links and the message phase */
    if (procset) {
        GMSGPHASE(procset,phase);
        am_left_dummy:
        is_left = procset->am_left;
        up      = procset->parent;
        right   = procset->r_child;
        left    = procset->l_child;
    } else {
        /* A parent value below -1 means the default tree is not built yet */
        if (_PIPARENT < -1) {
            PISetupCollectiveTree( );
        }
        _PIPHASE = !_PIPHASE;
        phase    = _PIPHASE;
        is_left  = _PIAM_LEFT;
        up       = _PIPARENT;
        right    = _PIRCHILD;
        left     = _PILCHILD;
    }

    tag_up = GMSGTYPE(procset,MSG_UP|phase);
    tag_dn = GMSGTYPE(procset,MSG_DN|phase);

    LOGPUSHATOMIC;

    /* Gather: fold in the left child's data, then the right child's */
    if (left >= 0) {
        RECVSYNCNOMEM(tag_up | MSG_LEFT, work,nbytes,datatype);
        (*op)( val, work, n );
    }
    if (right >= 0) {
        RECVSYNCNOMEM(tag_up | MSG_RIGHT, work,nbytes,datatype);
        (*op)( val, work, n );
    }

    /* Hand the partial result to the parent.  The receive for the final
       value is posted before the send and completed after it. */
    if (up >= 0) {
        RECVASYNCNOMEMFORCE(tag_dn,val,nbytes,datatype,recv_id);
        tag_up |= is_left ? MSG_LEFT : MSG_RIGHT;
        SENDSYNCNOMEM(tag_up,val,nbytes,up,datatype);
        RECVWAITNOMEMFORCE(tag_dn,val,nbytes,datatype,recv_id);
    }

    /* Scatter: pass the final value on to both children */
    if (left >= 0) {
        SENDSYNCNOMEMFORCE(tag_dn,val,nbytes,left,datatype);
    }
    if (right >= 0) {
        SENDSYNCNOMEMFORCE(tag_dn,val,nbytes,right,datatype);
    }

    LOGPOPATOMIC;
}

/*
     GsetopL - apply a global operation on a subset of processors

     Algorithm: 
     All the hard stuff is in the ProcSet structure.  This assumes that
     the processors are laid out in a line (in node_nums order).  The 
     method is to recursively gather from further and further away, then
     distribute the results back down (using the reverse process).

     At distance len (1, 2, 4, ...) a processor whose index has that bit
     set sends its partial result to index myid-len and drops out of the
     gather; otherwise it combines the contribution of index myid+len,
     provided that index exists.  The distribution retraces these steps
     starting from index 0.  The number of processors does not have to be
     a power of two.

     Parameters:
.    val      - on entry the local contribution; on exit the value sent back
                from index 0 (on index 0 itself, the accumulated value)
.    n        - number of elements, handed to op
.    work     - scratch buffer that contributions are received into
.    procset  - processor set; when null, MYPROCID and NUMNODES are used and
                indices are used directly as destinations
.    elmsize  - size of one element; n * elmsize is the message length
.    datatype - passed unchanged to the send/receive macros
.    op       - combiner, invoked as (*op)( val, work, n )
 */
void gsetopL( val, n, work, procset, elmsize, datatype, op )
void    *val, *work, (*op)();
int     n, elmsize, datatype;
ProcSet *procset;
{
int size = n * elmsize;
int mask, len, dest, myid, np, rid, sender;
int msgup, msgdn;

/* NOTE(review): LOGpop() here is paired with LOGpush() at the end, i.e. in
   the opposite order to the usual push/pop; kept as found - confirm intent */
LOGpop();
msgup = GMSGTYPE(procset,MSG_UP);
msgdn = GMSGTYPE(procset,MSG_DN);
/* The procset version of this just uses the node_nums array to get the
   actual destinations */
if (!procset) {
    myid    = MYPROCID;
    np      = NUMNODES;
    }
else {
    myid    = procset->lidx;
    np      = procset->npset;
    }
len     = 1;
mask    = 1;
sender  = (myid & mask);
LOGPUSHATOMIC;
/* Everyone but index 0 posts the receive for the final value up front */
if (myid != 0) {
    RECVASYNCNOMEMFORCE(msgdn,val,size,datatype,rid);
    }
while (len < np) {
    dest   = myid - len;
    /* Translate only a valid index; myid - len is negative whenever there
       is no partner below us, and must not be used to index node_nums */
    if (procset && dest >= 0)
        dest = procset->node_nums[dest];
    mask <<= 1;
    if (sender && dest >= 0) {
        SENDSYNCNOMEM(msgup|mask,val,size,dest,datatype);
        break;
        }
    else {
        /* The partner at this distance only exists if its index is inside
           the set; waiting for a missing partner would never return */
        if (myid + len < np) {
            RECVSYNCNOMEM(msgup|mask,work,size,datatype);
            (*op)( val, work, n );
            }
        sender = (myid & mask);
        }
    len = len + len;
    }
/* Reverse the process */
if (myid != 0) {
    RECVWAITNOMEMFORCE(msgdn,val,size,datatype,rid);
    /* The trick here is that once we've received a message,
       we need to know what distance to send the next message
       (essentially our "phase" in the communication).
       This phase is given by the number of bits until the first
       non-zero bit is found */
    len   = 1;
    while (!(len & myid)) len <<= 1;
    len   >>= 1;
    }
else {
    /* Index 0 starts at the largest power of two below np.  This equals
       np / 2 when np is a power of two, and is the distance of the last
       partner combined in the gather for any other np. */
    len   = 1;
    while (len < np) len <<= 1;
    len   >>= 1;
    }
while (len > 0) {
    dest   = myid + len;
    if (dest < np) {
        if (procset)
            dest = procset->node_nums[dest];
        SENDSYNCNOMEMFORCE(msgdn,val,size,dest,datatype);
        }
    len = len / 2;
    }
LOGpush();
LOGPOPATOMIC;
}

/*
    We also need:
    gsetopM (Mesh or 2-d ring code)
    gsetopB (BiRing or bidirectional ring)
    gsetopHC (Hypercube - log p sends/receives per node)
 */

