#include <stdio.h>
#include <math.h>
#include <mpi.h>
#define ABSOLUTE(A)  (((A) < (0)) ? (-(A)) : ((A)))
#define DEBUG 0

/*--------------------------------------------------------------------------------*\
  This is a parallel program to compute the value of pi to a predetermined 
  threshold.  

  The designated aLeorithm to use is: 

          /      1     1     1     1     1     1     \
  pi = 4*|  1 - --- + --- - --- + --- - --- + --- ... |
          \      3     5     7     9     11    13    /

  The series is redefined as:
          n = infinity
          __
       4* \         1
          /      ------- * (-1)**(n+1)
          --     (2*n-1)
          n = 1

  For performance the second term is 1 if n is odd and -1 if n is even.
  The exponentiation is not perfomred.  (This save over 50% in execution time).

\*--------------------------------------------------------------------------------*/


int main(int argc, char *argv[])
{
  int nproc;            /* number of participating process */
  int me;               /* which process am I */
  int block_size;       /* size of block for chunks of parallel work */
  int block_number;     /* number of blocks for parallel work */
  int block_number_max; /* maximum number of blocks on all nodes (actual blocks computed)*/
  int nlo, nhi;         /* computed bounds for parallel chunk */
  double total_terms;   /* total number of terms in the series sum */
  double pi;            /* computed value of pi */
  double realpi;        /* value of pi determined to machine accuracy */
  double diff;          /* difference of realpi and computed pi */
  double threshold;     /* convergence threshold for summation compared to 
                           the real value of pi. */

  int n;               /* loop counter */
  double partial;      /* partial sum for each chunk of work */
  double partial_all;  /* partial sum for each chunk of work summed from all nodes*/
  double one,two,four; /* constants */ 
  double fact;         /* factor for including (-1)**n+1 term */
  double time_value;   /* time of execution */

/*------------------------------------------*\
  Initialize MPI and set parallel parameters 
\*------------------------------------------*/
  if ((MPI_Init(&(argc),&(argv))) != MPI_SUCCESS) {
    (void) fprintf(stderr,"MPI_Init failed\n");
    (void) MPI_Abort(MPI_COMM_WORLD,(int)911);  /* 911 a fatal error code */
  }
  if ((MPI_Comm_size(MPI_COMM_WORLD, &nproc)) != MPI_SUCCESS) {
    (void) fprintf(stderr,"MPI_Comm_size failed\n");
    (void) MPI_Abort(MPI_COMM_WORLD,(int)911);
  }
  if ((MPI_Comm_rank(MPI_COMM_WORLD, &me)) != MPI_SUCCESS) {
    (void) fprintf(stderr,"MPI_Comm_rank failed\n");
    (void) MPI_Abort(MPI_COMM_WORLD,(int)911);
  }
/*-----------------------*\
  set initial time value 
\*-----------------------*/
  time_value = 0.0 - MPI_Wtime();

/*-----------------------------------*\
  report parallel parameters to user 
\*-----------------------------------*/
  if (me == 0) (void)printf(" hello, %d nodes used to compute pi.\n",nproc);
  (void)fflush(stdout);

/*---------------------*\
   compute constants 
\*---------------------*/
  one = (double) 1.0;
  two = (double) 2.0;
  four = (double) 4.0;

/*---------------------------------------*\
   compute machine precision value of pi
   and set computed pi to zero.
\*---------------------------------------*/
  realpi = two * acos((double)0.0);
  pi  = (double) 0.0;

/*-------------------------------------------------------*\
  set arbitrary block size and number as well as the 
  threshold for convergence 
\*-------------------------------------------------------*/
  block_size = 100000;
  block_number = 0;
  threshold = (double) 1.0e-10;

  diff = ABSOLUTE(pi - realpi);
  if (DEBUG) {
    (void) printf(" difference is %e\n",diff);
    (void) printf(" threshold is  %e\n",threshold);
    (void) printf(" diff > threshold is: %d\n",(diff>threshold));
  }
  while ((diff > threshold)) {
    if ((block_number%nproc) == me) {
      if (DEBUG) {
        (void) printf("node %3d is computing block %10d\n",me,block_number);
        (void)fflush(stdout);
      }
      nlo = block_number*block_size + 1;
      nhi = nlo + block_size - 1;
      if (DEBUG) {
        (void)printf("node: %3d nlo = %10d, nhi = %10d\n",me,nlo,nhi);
        (void)fflush(stdout);
      }
      partial = (double)0.0;
      
      if (nlo%2) fact =  one;
      else       fact = -one;
      
      for(n=nlo;n<=nhi;n++) {
        partial += (one/(two*((double)n)-one))*fact;
        fact = -fact;  /* change the sign for the next term in the series */
      }
      partial *= four;
      partial_all = (double) 0.0;
      if (DEBUG) {
        (void) printf("B4:node=%3d, p=%25.14e, p_all=%25.14e\n",
                      me,partial,partial_all);
      }
      if ((MPI_Reduce(&partial,&partial_all,(int)1,MPI_DOUBLE,
                      MPI_SUM,(int)0,MPI_COMM_WORLD)) 
          != MPI_SUCCESS) {
        (void) fprintf(stderr,"MPI_Reduce (sum) failed\n");
        (void) MPI_Abort(MPI_COMM_WORLD,(int)911);
      }
      if (DEBUG) {
        (void) printf("AF:node=%3d, p=%25.14e, p_all=%25.14e\n",
                      me,partial,partial_all);
      }
      pi += partial_all;
      if ((MPI_Bcast(&pi,(int)1,MPI_DOUBLE,(int)0,MPI_COMM_WORLD)) 
          != MPI_SUCCESS) {
        (void) fprintf(stderr,"MPI_Bcast failed\n");
        (void) MPI_Abort(MPI_COMM_WORLD,(int)911);
      }
      diff = ABSOLUTE(realpi - pi);
      if (DEBUG) {
        (void) printf(" node: %3d block: %4d, rpi=%12.8e pi=%12.8e, diff=%12.8e\n",
                      me, block_number,realpi,pi,diff); 
      }
    }
    block_number++;
    if (!(block_number%100)) 
      if (me == 0) {
        (void) printf(".");
        (void) fflush(stdout);
      }
  }
  if (me == 0) (void) printf(".\n");
/*----------------------------------------------*\
   determine execution time and report results
\*----------------------------------------------*/
  time_value += MPI_Wtime();
  diff = ABSOLUTE(pi - realpi);
  if ((MPI_Reduce(&block_number,&block_number_max,
                  (int)1,MPI_INT,MPI_MAX,(int)0,MPI_COMM_WORLD)) 
      != MPI_SUCCESS) {
    (void) fprintf(stderr,"MPI_Reduce (max) failed\n");
    (void) MPI_Abort(MPI_COMM_WORLD,(int)911);
  }
  total_terms = (double)block_size*(double)block_number_max;
  if (me == 0) {
    (void) printf(" The block size is          :         %20d terms\n",block_size);
    (void) printf(" Total number of blocks used:         %20d \n",block_number_max,me);
    (void) printf(" Total number of terms in the series: %.0e \n",total_terms);
    (void) printf(" Computed value of pi    :     %20.14e\n",pi);
    (void) printf(" Machine  value of pi    :     %20.14e\n",realpi);
    (void) printf("           difference    :     %20.14e\n",diff);
    (void) printf(" Total execution time is : %9.3e seconds.\n",time_value);
  }
/*---------------------------*\
  shut down MPI environement
\*---------------------------*/

  if (MPI_Finalize() != MPI_SUCCESS) {
    (void) fprintf(stderr,"MPI_Finalize failed\n");
    (void) MPI_Abort(MPI_COMM_WORLD,(int)911);
  }
  
  return 0;
}

