c***********************************************************************
c                                                                     
c                  B P   A M E R I C A
c          PROPRIETARY - TO BE MAINTAINED IN CONFIDENCE 
c                        COPYRIGHTED 2008
c***********************************************************************
c
c     Program to stack across any selected axis.
c     Originally written by Jerry Ehlers.
c     Parallel version written by Richard Clarke.
c
      program dstack
      implicit none
c
c     INCLUDE THE MPI API (Application Program Interface)
c
#include <mpif.h>
c
c     INCLUDE THE DDS API (Application Program Interface)
c
#include <fdds.h>
c
      external Master_ReceiveResult
      external Master_ReceiveFinalResults
      external Slave_Work
      external Slave_SendResult
      external Slave_SendFinalResults
      external Master_BroadcastResults
      external online_help
c
      integer ier, nproc, np, node
      integer numjobs,iverbose,debug
      integer in_bin,out_bin,naxis,saxis
      integer nsin,nsout,ntin,ntout
      integer size(RANK_MAX)
      character*(AXISNAME_MAX) axis(RANK_MAX)
      character*80 title
      logical norm
c
      real    work(1)
      pointer(ptr_work,work)

      data title/'dstack: Stack across selected axis'/
c
c     Initialize MPI and OpenMP, open printfile:
c
      ier=fddx_initmpix(nproc,np,node,'dstack',online_help)
c
      call OpenInput(in_bin,naxis,axis,size,title)
c
      call ReadParams(node,naxis,axis,saxis,norm,iverbose)
c
c     only the master writes the output
c
      if ( node.eq.0 ) then
         call OpenOutput(in_bin,out_bin,title,naxis,saxis,axis)
      else
         out_bin = -1
      endif
c
c     there could be different errors for the master and the slaves
c     check that no-one encountered errors, else die cleanly.
c
      ier = fddx_checkforerrors(np,node)
c
      call Setup(node,in_bin,out_bin,
     $     naxis,saxis,size,norm,
     $     nsin,nsout,ntin,ntout,numjobs,
     $     ptr_work)
c
      ier = fddx_slavedriver(
     $     np, node, numjobs, iverbose, debug, work,
     $     Slave_Work,
     $     Slave_SendResult,
     $     Slave_SendFinalResults,
     $     Master_ReceiveResult,
     $     Master_ReceiveFinalResults,
     $     Master_BroadcastResults )
c
c     shut down gracefully
c
      ier = fddx_stop( np, node )
c
      end
c
c***********************************************************************
c
c     online help
c
c***********************************************************************
c
      subroutine online_help()
c
      implicit none
c
      write(0,*) 'Program to stack along any selected axis with the'
      write(0,*) 'option to normalize by live samples.'
      write(0,*) ' '
      write(0,*) 'usage:'
      write(0,*)
     *'   dstack [np= ] [in=dat] [in_data=bin] [in_format=fmt] \\'
      write(0,*)
     *'   [out=dat] [out_data=bin] [out_format=fmt] \\'
      write(0,*)
     *'   saxis=n [normalize=y|n] [help=]'
      write(0,*) '   '
      write(0,*) 'where:'
      write(0,*) '   np=        # of mpi nodes to use.'
      write(0,*) '   in=        input data to stack (dflt stdin:)'
      write(0,*) '   in_bin=    input data binary'
      write(0,*) '   in_format= input data format'
      write(0,*) '   out=       output data dictionary (dflt stdout:)'
      write(0,*) '   out_data=  output data binary'
      write(0,*) '   out_format=output data format'
      write(0,*) '   saxis=     axis # to stack (no dflt)'
      write(0,*) '   normalize= y|n: switch to normalize (dflt y)'
      write(0,*) '   help=      this help'
      write(0,*) ' '
c
      stop 0
c
      end
c
c***********************************************************************
c
c     OpenInput: open input file, get axis names and dimensions
c
c***********************************************************************
c
      subroutine OpenInput(in_bin,naxis,axis,size,title)
c
      implicit none
c
#include <fdds.h>
c
      integer in_bin,naxis,i,ier
      integer size(RANK_MAX)
      character*(AXISNAME_MAX) axis(RANK_MAX)
      character*80 title,text
c
      in_bin=fddx_in('in','stdin:',title)
      if (in_bin.lt.0) then
         ier=fdds_prterr('Unable to open input data\n\0')
      endif
c     
      naxis=fdds_scank('axis',' ')
      ier=fdds_scanf('axis','\0')
      do i=1,naxis
         ier=fdds_scanf(' ','%s\0',axis(i))
      enddo
c
      do i=1,naxis
         ier=fdds_sprintf(text,'size.axis(%d)\0',i)
         ier=fdds_scanf(text,'%d\0',size(i))
      enddo
c
      return
      end
c
c
c***********************************************************************
c
c     read the user parameters
c
c***********************************************************************
c
      subroutine ReadParams(node,naxis,axis,saxis,norm,iverbose)
c
      implicit none
c
#include <fdds.h>
c
      integer node,naxis,saxis,iverbose,ier
      character*(AXISNAME_MAX) axis(RANK_MAX)
      logical norm,verbose
c
      ier=fdds_dict('par:','scan')
c
      saxis=0
      ier=fdds_scanf('saxis','%d\0',saxis)
      if (saxis.lt.1.or.saxis.gt.naxis) then
         ier=fdds_prterr('saxis(%d) must be between 1 and %d\n\0',
     $        saxis,naxis)
         saxis=max(1,min(saxis,naxis))
      endif
c
      norm=(fdds_switch('normalize',1).ne.0)
      iverbose = 0
      verbose = (fdds_switch('verbose',0).eq.1)
      if(verbose.eq..true.) iverbose = 1
c
c-----------------------------------------------------------------------
c     print parameters
c-----------------------------------------------------------------------
      if ( node .eq. 0 ) then
c
         ier=fdds_prtmsg('\n*** PARAMETERS ***\n\n\0')
         if (norm) then
            ier=fdds_prtmsg('Normalizing stack\n\0')
         else
            ier=fdds_prtmsg('No Normalization\n\0')
         endif
         ier=fdds_prtmsg('\tsaxis = %d (%s)\n\0',saxis,axis(saxis))
         ier=fdds_prtmsg('\n\0')
         if (verbose) then
            ier=fdds_prtmsg('Writing more verbose printout.\n\0')
         else
            ier=fdds_prtmsg('Writing less verbose printout.\n\0')
         endif
c
      endif
c
      return
      end
c
c***********************************************************************
c
c     open the output file
c
c***********************************************************************
c
      subroutine OpenOutput(in_bin,out_bin,title,
     $     naxis,saxis,axis)
c
      implicit none
c
#include <fdds.h>
c
      integer in_bin,out_bin,naxis,saxis,ier,i
      character*(AXISNAME_MAX) axis(RANK_MAX)
      character*80 title,text
c
      out_bin=fddx_out('out','stdout:',title,in_bin)
      if (ier.lt.0) then
c
         ier=fdds_prterr('opening output file\n\0')
c
      else
c     
c        define the output axis
c
         if (naxis.gt.3) then
            ier=fdds_printf('axis','\0')
            do i=1,naxis
               if (i.ne.saxis) ier=fdds_printf(' ',' %s\0',axis(i))
            enddo
            ier=fdds_printf(' ','\n\0')
         else
            ier=fdds_sprintf(text,'size.axis(%d)\0',saxis)
            ier=fdds_printf(text,'1\n\0')
         endif
c         
c        force open the open now
c        not really necessary, but makes debugging easier.
c
         ier=fdds_lseek(out_bin,0,0,SEEK_SET)
         if (ier.lt.0) then
            ier=fdds_prterr('opening output file\n\0')
         endif
c
      endif
c
      return
      end
c
c***********************************************************************
c
c     Setup:Do the prep work before the parallelization.
c     We need to pack any "work" data into the array.
c
c***********************************************************************
c
      subroutine Setup(node,in_bin,out_bin,
     $     naxis,saxis,size,norm,
     $     nsin,nsout,ntin,ntout,numjobs,
     $     ptr_work)
c
      implicit none
c
#include <fdds.h>
c
      integer node,in_bin,out_bin
      integer naxis,saxis,size(RANK_MAX)
      integer nsin,nsout,ntin,ntout,numjobs,ii,ier
      real    work(1)
      real*8  size8
      logical norm
      pointer(ptr_work,work)
c
      nsin=size(1)
      if (saxis.eq.1) then
         nsout=size(2)
         ntin=size(2)
         ntout=1
      else
         nsout=size(1)
         ntout=1
         do ii=2,saxis-1
            ntout=ntout*size(ii)
         enddo
         ntin=ntout*size(saxis)
      endif
c
      numjobs = 1
      do ii = saxis+1,naxis
         numjobs = numjobs*size(ii)
      end do
c
      if ( node .eq. 0 ) then
         ier=fdds_prtmsg('There are %d jobs.\n\0',numjobs)
      endif
c
      size8 = sizeof_real*(9+RANK_MAX)
      size8 = size8 + (SIZEOF_REAL*nsin)
      size8 = size8 + (SIZEOF_REAL*nsout*ntout)
      if (norm.and.saxis.ne.1) then
         size8 = size8 + (SIZEOF_REAL*nsout*ntout)
      endif
c
      ptr_work = fdds_malloc8( size8 )
c
      if ( node .eq. 0 ) then
         ier=fdds_prtmsg('Allocating %lg Gbytes.\n\0',size8*1.0e-9)
      endif
c
      call PackData( work,
     $     in_bin, out_bin,
     $     naxis, saxis, norm,
     $     nsin ,nsout, ntin, ntout, size)
c
      return
      end
c
c***********************************************************************
c
c     PackData: put data into the work array.
c
c***********************************************************************
c
      subroutine PackData( ww,
     $     in_bin, out_bin,
     $     naxis, saxis, norm,
     $     nsin ,nsout, ntin, ntout, size)
c
      implicit none
c
#include <fdds.h>
c
      integer in_bin,out_bin,naxis,saxis,ii
      integer nsin,nsout,ntin,ntout
      integer size(RANK_MAX)
      real    ww(*)
      logical norm
c
      ww(1) = in_bin
      ww(2) = out_bin
      ww(3) = naxis
      ww(4) = saxis
      ww(5) = norm
      ww(6) = nsin
      ww(7) = nsout
      ww(8) = ntin
      ww(9) = ntout
      do ii = 1,naxis
           ww(9+ii) = size(ii)
      end do
c      
      return
      end
c
c***********************************************************************
c
c     UnPackData: get data out from the work array.
c
c***********************************************************************
c
      subroutine UnPackData( ww,
     $     in_bin, out_bin,
     $     naxis, saxis, norm,
     $     nsin ,nsout, ntin, ntout, size,
     $     iinbuf, ibuffer, inrmbuf )
c
      implicit none
c
#include <fdds.h>
c
      integer in_bin,out_bin,naxis,saxis,ii,ier
      integer nsin,nsout,ntin,ntout
      integer size(RANK_MAX)
      integer iinbuf, ibuffer, inrmbuf
      real    ww(*)
      logical norm
c
      in_bin  = ww(1)
      out_bin = ww(2)
      naxis   = ww(3)
      saxis   = ww(4)
      norm    = ww(5)
      nsin    = ww(6)
      nsout   = ww(7)
      ntin    = ww(8)
      ntout   = ww(9)
      do ii = 1,naxis
         size(ii) = ww(9+ii)
      end do
      iinbuf  = naxis + 9 + 1
      ibuffer = iinbuf + nsin
      inrmbuf = ibuffer + nsout*ntout
c
      return
      end
c 
c***********************************************************************
c
c     Slave_Work: wrapper to the routine that actually does the work.
c
c***********************************************************************
c
      subroutine Slave_Work( job, work )
c
      implicit none
c
#include <fdds.h>
c
      integer job,ier
      integer in_bin,out_bin,naxis,saxis
      integer nsin,nsout,ntin,ntout
      integer size(RANK_MAX)
      integer iinbuf, ibuffer, inrmbuf
      real    work(*)
      logical norm
c
      call UnPackData( work,
     $     in_bin, out_bin,
     $     naxis, saxis, norm,
     $     nsin ,nsout, ntin, ntout, size,
     $     iinbuf, ibuffer, inrmbuf )
c
      call stackit( job,
     $     in_bin, saxis, norm,
     $     nsin ,nsout, ntin, ntout,
     $     work(iinbuf),work(ibuffer),work(inrmbuf) )
c    
      return
      end
c
c***********************************************************************
c
c     stackit: routine to stack 1 input buffer of traces
c
c***********************************************************************
c
      subroutine stackit( job,
     $     in_bin,saxis,norm,
     $     nsin,nsout,ntin,ntout,
     $     buf,buffer,nrmbuf)
c
      implicit none
c
#include <fdds.h>
c
      integer   in_bin,job
      integer   saxis,nsin,nsout,ntin,ntout
      logical   norm
      real      buf(nsin)
      real      buffer(nsout,ntout)
      real      nrmbuf(nsout,ntout)
c
      integer   ier,i,j,itin,itout
      integer*8 iseek
      real      nrm,eps,x
c
c     clear the output chunk
c
      eps=0.00000001
      if (norm.and.saxis.ne.1) then
         do i=1,ntout
            do j=1,nsout
               buffer(j,i)=0.0
               nrmbuf(j,i)=eps
            enddo
         enddo
      else
         do i=1,ntout
            do j=1,nsout
               buffer(j,i)=0.0
            enddo
         enddo
      endif
c
c     seek to the start of traces for this job
c
      iseek = job*ntin
      ier = fdds_lseek8(in_bin, 0, iseek, SEEK_SET)
c
c     stack in the input
c
      do itin=1,ntin
c
c        read input trace
c
         ier=fddx_read(in_bin,buf,1)
         if (ier.ne.1) then
            if (itin.gt.1) then
               ier=fdds_prterr('reading input\n\0')
            endif
            return
         endif
c
c        stack it
c
         if (saxis.eq.1) then
            x=0.0
            nrm=0.0
            do i=1,nsin
               x=x+buf(i)
               if (buf(i).ne.0.0) nrm=nrm+1.0
            enddo
            if (norm.and.nrm.gt.0.0) x=x/nrm
            buffer(itin,1)=x
         else
            itout=1+mod(itin-1,ntout)
            if (norm) then
               do i=1,nsout
                  if (buf(i).ne.0.0) then
                     buffer(i,itout)=buffer(i,itout)+buf(i)
                     nrmbuf(i,itout)=nrmbuf(i,itout)+1.0
                  endif
               enddo
            else
               do i=1,nsout
                  buffer(i,itout)=buffer(i,itout)+buf(i)
               enddo
            endif
         endif
      enddo
c
c     normalize stack
c
      if (norm.and.saxis.ne.1) then
         do i=1,ntout
            do j=1,nsout
               buffer(j,i)=buffer(j,i)/nrmbuf(j,i)
            enddo
         enddo
      endif
c
      return
      end
c
c***********************************************************************
c
c     Slave_SendResult:
c     Routine to send back any data that the master needs after the slave has
c     finished a job. Sometimes this routine can be empty, if all the data is
c     sent back after all of the jobs have been completed.
c
c***********************************************************************
c
      subroutine Slave_SendResult( np, node, job, work )
c
      implicit none
c
#include <mpif>
#include <fdds.h>
c
      integer np, node,job,tag,ier
      integer in_bin,out_bin,naxis,saxis
      integer nsin,nsout,ntin,ntout
      integer size(RANK_MAX)
      integer iinbuf, ibuffer, inrmbuf
      real    work(*)
      logical norm
c
      if ( np .gt. 1 ) then
c
c     IMPORTANT: The tags for mpi_sends in Slave_SendResult must match 
c     those in Master_ReceiveResult
c
         tag = 1234
c         
         call UnPackData( work,
     $        in_bin, out_bin,
     $        naxis, saxis, norm,
     $        nsin ,nsout, ntin, ntout, size,
     $        iinbuf, ibuffer, inrmbuf )
c         
         call mpi_send(work(ibuffer),nsout*ntout,MPI_REAL,0,tag,
     $        MPI_COMM_WORLD,ier)
c         
      endif
c
      return
      end
c
c***********************************************************************
c
c     Slave_SendFinalResults:
c     Routine to send back any data that the master needs after the slave has
c     finished all the jobs. Note: this routine can sometimes do nothing.
c
c***********************************************************************
c
      subroutine Slave_SendFinalResults( work )
c
      implicit none
c
      real work(*)
c
c     IMPORTANT: The tags for mpi_sends in Slave_SendFinalResults must 
c     match those in Master_ReceiveFinalResults
c
      return
      end
c
c***********************************************************************
c
c     Master_ReceiveResult:
c     Routine to process any data that a slave has sent back after the slave 
c     has finished a job. Note: sometimes this routine can be empty, if all
c     the data is sent back after all of the jobs have been completed.
c
c***********************************************************************
c
      subroutine Master_ReceiveResult( np, slave, job, work )
c
      implicit none
c
#include <mpif>
#include <fdds.h>
c
      integer   np,slave,job,ier
      integer   in_bin,out_bin,naxis,saxis
      integer   nsin,nsout,ntin,ntout
      integer   size(RANK_MAX)
      integer   iinbuf, ibuffer, inrmbuf
      integer   status(MPI_STATUS_SIZE),tag
      integer*8 iseek
      real      work(*)
      logical   norm
c
c     IMPORTANT: The tags for mpi_sends in Slave_SendResult must match 
c     those in Master_ReceiveResult
c
      tag = 1234
c
      call UnPackData( work,
     $     in_bin, out_bin, 
     $     naxis, saxis, norm,
     $     nsin ,nsout, ntin, ntout, size,
     $     iinbuf, ibuffer, inrmbuf )
c
      if ( np .gt. 1 ) then
         call mpi_recv(work(ibuffer),nsout*ntout,MPI_REAL,slave,tag,
     $        MPI_COMM_WORLD,status,ier)
      endif
c
c     seek to the start of traces for this job
c     and then write out the traces
c
      iseek = job*ntout
      ier = fdds_lseek8(out_bin, 0, iseek, SEEK_SET)
      if ( ier .lt. 0 ) then
         ier = fdds_prterr('seeking in output!\n\0')
      endif
c
      ier = fddx_write(out_bin,work(ibuffer),ntout)
      if ( ier .ne. ntout ) then
         ier = fdds_prterr('writing output!\n\0')
      endif
c
      return
      end
c
c***********************************************************************
c
c     Master_ReceiveFinalResults:
c     Routine  where the master receives any final data after the slaves have
c     finished all the jobs. Note: this routine can sometimes do nothing.
c***********************************************************************
c
      subroutine Master_ReceiveFinalResults( work )
c
      implicit none
c
      real work(*)
c
c     IMPORTANT: The tags for mpi_sends in Slave_SendFinalResults must
c      match those in Master_ReceiveFinalResults
c
      return
      end
c
c***********************************************************************
c
c     Master_BroadcastResults:
c     Once the slaves have finished all the jobs and the master has received
c     all of the data, sometimes it is necessary to share information again
c     amongst the slaves, for further processing. Note: this routine can 
c     sometimes do nothing.
c
c***********************************************************************
c
      subroutine Master_BroadcastResults( work )
c
      implicit none
c
      real work(*)
c
      return
      end