Isaac
Cassini’s last full image of Saturn
Newton's Playground
I have not as yet been able to discover the reason for these properties of gravity from phenomena, and I do not feign hypotheses.Isaac Newton, on the origin of gravity

In A Miniature Solar System, we've seen how Newton's law of universal gravitation can be coded to simulate a 3-body gravitational interaction in 2-dimensions. Now we would like to expand our simulation such that it is generalized for an arbitrary number of massive bodies, and in 3-dimensions. This is more useful in attempting to understand large, complex systems such as our own Asteroid belt. Of course, as we generalize our simulation, we'll also find that it becomes computationally more expensive, as we'll be asking the computer to perform more operations necessary to simulate an $N$-body system. This being the case, we will have to resort to more robust computing methods to accomplish our task.

This page examines some tools employed in modern high performance computing (HPC): Message Passing Interface (MPI), Coarray Fortran (CAF), and OpenMP. Shown in the tabbed interface below is a program that computes the average gravitational force in the $x$-, $y$-, and $z$-directons in a distribution of masses. This program is modified to leverage these HPC methods; the original non-HPC version is also included for comparison.

MPI

Pros

Cons

OpenMP

Pros

Cons


References

Newton's Law of Universal Gravitation

Isaac

Newton's law of universal gravitation models the force experienced by a pair of masses via a long-range inverse square law:

$\displaystyle{ \mathbf{F}_{12} = -G \frac{m_1 m_2}{|\mathbf{r}_{12}|^2} \hat{\mathbf{r}}_{12} \text{,} }$

where $\mathbf{F}_{12}$ is the force applied on object $1$ by object $2$, $G$ is the gravitational constant, $m_1$ and $m_2$ are the masses of objects $1$ and $2$, respectively, $|\mathbf{r}_{12}| = |\mathbf{r}_2 − \mathbf{r}_1|$ is the distance between objects $1$ and $2$, and $\mathbf{\hat{r}}_{12} \equiv \mathbf{r}_2 - \mathbf{r}_1 / |\mathbf{r}_2 - \mathbf{r}_1|$ is the unit vector from object $1$ to object $2$. The negative sign indicates that the gravitational force is attractive.

This version of the program finds the average gravitational force in the $x$-, $y$-, and $z$-directons in a distribution of masses as specified by npts, with the understanding that a large number of masses can be computationally expensive.

MODULE Shared
  USE, INTRINSIC :: ISO_Fortran_env, dp=>REAL64!modern DOUBLE PRECISION
  IMPLICIT NONE
  INTEGER :: i, j, k, myrank, npts, nprocs, nlocal
  INTEGER, PARAMETER :: pseudohost=0, n=100000, MM=1, PX=2, PY=3, PZ=4, FX=5, FY=6, FZ=7
  REAL(dp) :: random, sum_x, sum_y, sum_z
  REAL(dp), DIMENSION(:), ALLOCATABLE :: dx, dy, dz, dist, square, frac, tx, ty, tz
  REAL(dp), DIMENSION(:,:), ALLOCATABLE :: p, q
  CHARACTER(*), PARAMETER :: my_format='(f21.17)'!dp: matches unformatted output
END MODULE Shared

MODULE My_Module
  CONTAINS
    SUBROUTINE My_Subroutine(is, js, ps, qs)
      USE Shared, ONLY : dp, MM, PX, PY, PZ, FX, FY, FZ, dx, dy, dz, dist, square, frac, tx, ty, tz
      IMPLICIT NONE
      INTEGER, INTENT(IN) :: is, js
      REAL(dp), INTENT(IN OUT), CONTIGUOUS :: ps(:,:), qs(:,:)!Rules 22. & 25.
      dx(is) = ps(PX, is) - qs(PX, js)
      dy(is) = ps(PY, is) - qs(PY, js)
      dz(is) = ps(PZ, is) - qs(PZ, js)
      square(is) = dx(is)**2 + dy(is)**2 + dz(is)**2
      dist(is) = SQRT(square(is))
      frac(is) = (ps(MM,is) * qs(MM,js)) / (dist(is) * square(is))
      tx(is) = frac(is) * dx(is)
      ty(is) = frac(is) * dy(is)
      tz(is) = frac(is) * dz(is)
      ps(FX, is) = ps(FX, is) - tx(is)
      qs(FX, js) = qs(FX, js) + tx(is)
      ps(FY, is) = ps(FY, is) - ty(is)
      qs(FY, js) = qs(FY, js) + ty(is)
      ps(FZ, is) = ps(FZ, is) - tz(is)
      qs(FZ, js) = qs(FZ, js) + tz(is)
    END SUBROUTINE My_Subroutine
END MODULE My_Module

PROGRAM Gravity_Fortran
  USE Shared
  USE My_Module
  IMPLICIT NONE
  CALL INIT_RANDOM_SEED()
  ALLOCATE(dx(MM:n), dy(MM:n), dz(MM:n), dist(MM:n), square(MM:n), frac(MM:n), tx(MM:n), ty(MM:n), tz(MM:n), p(MM:FZ, MM:n),       &
  q(MM:FZ, MM:n))
  myrank = 0
  WRITE(6, '(a18)', advance='no') "Number of points: "
  READ *, npts
  nprocs = 2!MOD
  IF (myrank == pseudohost) THEN
    IF (MOD(nprocs, 2) == 0) THEN
      IF (npts > nprocs * n) THEN
        WRITE(6, '(a28)') "Warning: Size out-of-bounds."
        npts = -1
      ELSE IF (MOD(npts, nprocs) /= 0) THEN
        WRITE(6, '(a65)') "Number of processes (nprocs) must divide number of points (npts)."
        npts = -1
      END IF
    ELSE
      WRITE(6, '(a42)') "Number of processes (nprocs) must be even."
      npts = -1
    END IF
  END IF
  IF (npts == -1) GO TO 999
  nlocal = npts / nprocs!determines size of loop
  IF (myrank == pseudohost) THEN
    particle: DO i = 1, nlocal
      CALL RANDOM_NUMBER(random)
      p(MM, i) = random
      CALL RANDOM_NUMBER(random)
      p(PX, i) = random
      CALL RANDOM_NUMBER(random)
      p(PY, i) = random
      CALL RANDOM_NUMBER(random)
      p(PZ, i) = random
      p(FX, i) = 0.0
      p(FY, i) = 0.0
      p(FZ, i) = 0.0
    END DO particle
    process: DO k = 1, nprocs
      IF (k /= pseudohost) THEN
        send: DO i = 1, nlocal
          CALL RANDOM_NUMBER(random)
          q(MM, i) = random
          CALL RANDOM_NUMBER(random)
          q(PX, i) = random
          CALL RANDOM_NUMBER(random)
          q(PY, i) = random
          CALL RANDOM_NUMBER(random)
          q(PZ, i) = random
          q(FX, i) = 0.0
          q(FY, i) = 0.0
          q(FZ, i) = 0.0
        END DO send
      END IF
    END DO process
  END IF
  copy: DO i = 1, nlocal
    q(MM, i) = p(MM, i)
    q(PX, i) = p(PX, i)
    q(PY, i) = p(PY, i)
    q(PZ, i) = p(PZ, i)
    q(FX, i) = 0.0
    q(FY, i) = 0.0
    q(FZ, i) = 0.0
  END DO copy
  mass1: DO i = 1, nlocal
    mass2: DO j = i + 1, nlocal
      CALL My_Subroutine(i, j, p, q)
    END DO mass2
  END DO mass1
  sum_x = 0.0
  sum_y = 0.0
  sum_z = 0.0
  total: DO i = 1, nlocal
    p(FX, i) = p(FX, i) + q(FX, i)
    sum_x = sum_x + p(FX, i)
    p(FY, i) = p(FY, i) + q(FY, i)
    sum_y = sum_y + p(FY, i)
    p(FZ, i) = p(FZ, i) + q(FZ, i)
    sum_z = sum_z + p(FZ, i)
  END DO total
  WRITE(6, '(a8)') "AVERAGES"
  WRITE(6, '(a5)', advance='no') "F_x ="
  WRITE(6, my_format) sum_x / nlocal - 1
  WRITE(6, '(a5)', advance='no') "F_y ="
  WRITE(6, my_format) sum_y / nlocal - 1
  WRITE(6, '(a5)', advance='no') "F_z ="
  WRITE(6, my_format) sum_z / nlocal - 1
  999 STOP
  DEALLOCATE(dx, dy, dz, dist, square, frac, tx, ty, tz, p, q)
END PROGRAM Gravity_Fortran

SUBROUTINE INIT_RANDOM_SEED()
  USE ISO_Fortran_env, ONLY: INT64
  IMPLICIT NONE
  INTEGER, ALLOCATABLE :: seed(:)
  INTEGER :: i, n, un, istat, dt(8), pid
  INTEGER(INT64) :: t
  CALL RANDOM_SEED(size = n)
  ALLOCATE(seed(n))
  OPEN(newunit=un, file='/dev/urandom', access='stream', status='old', action='read', form='unformatted', iostat=istat)
  IF (istat == 0) THEN
    READ(un) seed
    CLOSE(un)
  ELSE
    CALL SYSTEM_CLOCK(t)
    IF (t == 0) THEN
      CALL DATE_AND_TIME(values = dt)
      t = (dt(1) - 1970) * 365_INT64 * 24 * 60 * 60 * 1000 + dt(2) * 31_INT64 * 24 * 60 * 60 * 1000 + dt(3) * 24_INT64 * 60 * 60   &
      * 1000 + dt(5) * 60 * 60 * 1000 + dt(6) * 60 * 1000 + dt(7) * 1000 + dt(8)
    END IF
    pid = GETPID()
    t = IEOR(t, INT(pid, KIND(t)))
    DO i = 1, n
      seed(i) = lcg(t)
    END DO
  END IF
  CALL RANDOM_SEED(put = seed)
  DEALLOCATE(seed)
CONTAINS
  FUNCTION lcg(s)
    INTEGER :: lcg
    INTEGER(INT64) :: s
    IF (s == 0) THEN
      s = 104729
    ELSE
      s = MOD(s, 4294967296_INT64)
    END IF
    s = MOD(s * 279470273_INT64, 4294967291_INT64)
    lcg = INT(MOD(s, INT(HUGE(0), INT64)), KIND(0))
  END FUNCTION lcg
END SUBROUTINE INIT_RANDOM_SEED
            

The steps below detail the Message Passing Interface version of the program, where the number of processes, nprocs, must be even, and the total number of masses, npts, must be exactly divisible by the number of processes. I used 2 × Dell OptiPlex 755 (ultra small form factor) desktops with Intel Core 2 Duo processors for this test.

  1. CALL MPI_INIT(ierr) — Initialize MPI, find the rank of each process, and the total number of processes.
  2. IF (myrank == pseudohost) THEN — One process acts as the host and reads in the number of particles.
  3. CALL MPI_BCAST(npts, 1, MPI_INTEGER, pseudohost, MPI_COMM_WORLD, ierr) — The number of particles is broadcast to all processes.
  4. IF (npts == -1) GO TO 999 — Abort if the number of processes and/or particles is incorrect.
  5. nlocal = npts / nprocs — Work out the number of particles in each process.
  6. IF (myrank == pseudohost) THEN — The pseudohost initializes the particle data and sends each process its particles.
  7. start = MPI_WTIME() — Initialization is now complete. Start the clock and begin work. First each process makes a copy of its particles.
  8. mass1: DO i = 1, nlocal — Now the interactions between the particles in a single process is computed.
  9. dest = MOD(nprocs + myrank - 1, nprocs) — The processes are arranged in a ring. Data will be passed in an anti-clockwise direction around the ring.
  10. neighbors: DO k = 1, nprocs / (2 - 1) — Each process interacts with the particles from its nprocs / 2 - 1 anti-clockwise neighbors. At the end of this loop, p(i) in each process has accumulated the force from interactions with particles i + 1, ..., nlocal - 1 in its own process, plus all the particles from its nprocs / 2 - 1 anti-clockwise neighbors. The "home" of the q array is regarded as the process from which it originated. At the end of this loop, q(i) has accumulated the force from interactions with particles 0, ..., i - 1 in its home process, plus all the particles from the nprocs / 2 - 1 processes it has rotated to.
  11. IF (nprocs > 1) THEN — Now q is rotated once more so that it is diametrically opposite its home process. p(i) accumulates force contributions from the interaction with particles 0, ..., i - 1 from its opposing process. q(i) accumulates force from the interaction of its home particles with particles i + 1, ..., nlocal - 1 in its current location.
  12. IF (myrank < nprocs / 2) THEN — In half the processes we include the interaction of each particle with the corresponding particle in the opposing process.
  13. dest = MOD(nprocs + myrank - nprocs / 2, nprocs) — Now the q array is returned to its home process.
  14. total: DO i = 1, nlocal — The p and q arrays are summed to give the total force on each particle.
  15. end = MPI_WTIME() — Stop clock and write out timings.
  16. CALL MPI_BARRIER(MPI_COMM_WORLD, ierr) — Do a barrier to make sure the timings are written out first.
  17. IF (myrank = pseudohost) THEN — Each process returns its force contributions to the pseudohost.
  18. CALL MPI_FINALIZE(ierr) — Close MPI.

MPI has the disadvantage that it can be rather labor intensive to implement and debug.

MODULE Shared
  USE, INTRINSIC :: ISO_Fortran_env, dp=>REAL64!modern DOUBLE PRECISION
  USE MPI
  IMPLICIT NONE
  INTEGER :: i, j, k, myrank, npts, nprocs, nlocal, ierr, dest, src, newtype
  INTEGER, DIMENSION(:), ALLOCATABLE :: status
  INTEGER, PARAMETER :: pseudohost=0, n=100000, MM=1, PX=2, PY=3, PZ=4, FX=5, FY=6, FZ=7
  REAL(dp) :: random, sum_x, sum_y, sum_z, start, end
  REAL(dp), DIMENSION(:), ALLOCATABLE :: dx, dy, dz, dist, square, frac, tx, ty, tz
  REAL(dp), DIMENSION(:,:), ALLOCATABLE :: p, q
  CHARACTER(*), PARAMETER :: my_format='(f21.17)'!dp: matches unformatted output
END MODULE Shared

MODULE My_Module
  CONTAINS
    SUBROUTINE My_Subroutine(is, js, ps, qs)
      USE Shared, ONLY : dp, MM, PX, PY, PZ, FX, FY, FZ, dx, dy, dz, dist, square, frac, tx, ty, tz
      IMPLICIT NONE
      INTEGER, INTENT(IN) :: is, js
      REAL(dp), INTENT(IN OUT), CONTIGUOUS :: ps(:,:), qs(:,:)!Rules 22. & 25.
      dx(is) = ps(PX, is) - qs(PX, js)
      dy(is) = ps(PY, is) - qs(PY, js)
      dz(is) = ps(PZ, is) - qs(PZ, js)
      square(is) = dx(is)**2 + dy(is)**2 + dz(is)**2
      dist(is) = SQRT(square(is))
      frac(is) = (ps(MM,is) * qs(MM,js)) / (dist(is) * square(is))
      tx(is) = frac(is) * dx(is)
      ty(is) = frac(is) * dy(is)
      tz(is) = frac(is) * dz(is)
      ps(FX, is) = ps(FX, is) - tx(is)
      qs(FX, js) = qs(FX, js) + tx(is)
      ps(FY, is) = ps(FY, is) - ty(is)
      qs(FY, js) = qs(FY, js) + ty(is)
      ps(FZ, is) = ps(FZ, is) - tz(is)
      qs(FZ, js) = qs(FZ, js) + tz(is)
    END SUBROUTINE My_Subroutine
END MODULE My_Module

PROGRAM Gravity_MPI
  USE Shared
  USE My_Module
  IMPLICIT NONE
  CALL INIT_RANDOM_SEED()
  ALLOCATE(status(MPI_STATUS_SIZE), dx(MM:n), dy(MM:n), dz(MM:n), dist(MM:n), square(MM:n), frac(MM:n), tx(MM:n), ty(MM:n),        &
  tz(MM:n), p(MM:FZ, MM:n), q(MM:FZ, MM:n))
  myrank = 0
  WRITE(6, '(a18)', advance='no') "Number of points: "
  READ *, npts
  nprocs = 2!MOD
  CALL MPI_INIT(ierr)
  CALL MPI_COMM_RANK(MPI_COMM_WORLD, myrank, ierr)
  CALL MPI_COMM_SIZE(MPI_COMM_WORLD, nprocs, ierr)
  IF (myrank == pseudohost) THEN
    IF (MOD(nprocs, 2) == 0) THEN
      IF (npts > nprocs * n) THEN
        WRITE(6, '(a28)') "Warning: Size out-of-bounds."
        npts = -1
      ELSE IF (MOD(npts, nprocs) /= 0) THEN
        WRITE(6, '(a65)') "Number of processes (nprocs) must divide number of points (npts)."
        npts = -1
      END IF
    ELSE
      WRITE(6, '(a42)') "Number of processes (nprocs) must be even."
      npts = -1
    END IF
  END IF
  CALL MPI_BCAST(npts, 1, MPI_INTEGER, pseudohost, MPI_COMM_WORLD, ierr)
  IF (npts == -1) GO TO 999
  nlocal = npts / nprocs!determines size of loop
  IF (myrank == pseudohost) THEN
    particle: DO i = 1, nlocal
      CALL RANDOM_NUMBER(random)
      p(MM, i) = random
      CALL RANDOM_NUMBER(random)
      p(PX, i) = random
      CALL RANDOM_NUMBER(random)
      p(PY, i) = random
      CALL RANDOM_NUMBER(random)
      p(PZ, i) = random
      p(FX, i) = 0.0
      p(FY, i) = 0.0
      p(FZ, i) = 0.0
    END DO particle
    process: DO k = 1, nprocs
      IF (k /= pseudohost) THEN
        send: DO i = 1, nlocal
          CALL RANDOM_NUMBER(random)
          q(MM, i) = random
          CALL RANDOM_NUMBER(random)
          q(PX, i) = random
          CALL RANDOM_NUMBER(random)
          q(PY, i) = random
          CALL RANDOM_NUMBER(random)
          q(PZ, i) = random
          q(FX, i) = 0.0
          q(FY, i) = 0.0
          q(FZ, i) = 0.0
        END DO send
        CALL MPI_SEND(q, 7 * nlocal, MPI_REAL, k, 100, MPI_COMM_WORLD, ierr)
      END IF
    END DO process
  ELSE
    CALL MPI_RECV(p, 7 * nlocal, MPI_REAL, pseudohost, 100, MPI_COMM_WORLD, status, ierr)
  END IF
  start = MPI_WTIME()
  copy: DO i = 1, nlocal
    q(MM, i) = p(MM, i)
    q(PX, i) = p(PX, i)
    q(PY, i) = p(PY, i)
    q(PZ, i) = p(PZ, i)
    q(FX, i) = 0.0
    q(FY, i) = 0.0
    q(FZ, i) = 0.0
  END DO copy
  mass1: DO i = 1, nlocal
    mass2: DO j = i + 1, nlocal
      CALL My_Subroutine(i, j, p, q)
    END DO mass2
  END DO mass1
  dest = MOD(nprocs + myrank - 1, nprocs)
  src  = MOD(myrank + 1, nprocs)
  neighbors: DO k = 1, nprocs / (2 - 1)
    CALL MPI_SENDRECV_REPLACE(q, 7 * nlocal, MPI_REAL, dest, 200, src, 200, MPI_COMM_WORLD, status, ierr)
    m1: DO i = 1, nlocal
      m2: DO j = 1, nlocal
        CALL My_Subroutine(i, j, p, q)
      END DO m2
    END DO m1
  END DO neighbors
  IF (nprocs > 1) THEN
    CALL MPI_SENDRECV_REPLACE(q, 7 * nlocal, MPI_REAL, dest, 300, src, 300, MPI_COMM_WORLD, status, ierr)
    accumulate1: DO i = nlocal, 1, -1
      accumulate2: DO j = i, 1, -1
        CALL My_Subroutine(i, j, p, q)
      END DO accumulate2
    END DO accumulate1
    IF (myrank < nprocs / 2) THEN
      half: DO i = 1, nlocal
        CALL My_Subroutine(i, j, p, q)
      END DO half
    ENDIF
    dest = MOD(nprocs + myrank - nprocs / 2, nprocs)
    src  = MOD(myrank + nprocs / 2, nprocs)
    CALL MPI_SENDRECV_REPLACE(q, 7 * nlocal, MPI_REAL, dest, 400, src, 400, MPI_COMM_WORLD, status, ierr)
  END IF
  sum_x = 0.0
  sum_y = 0.0
  sum_z = 0.0
  total: DO i = 1, nlocal
    p(FX, i) = p(FX, i) + q(FX, i)
    sum_x = sum_x + p(FX, i)
    p(FY, i) = p(FY, i) + q(FY, i)
    sum_y = sum_y + p(FY, i)
    p(FZ, i) = p(FZ, i) + q(FZ, i)
    sum_z = sum_z + p(FZ, i)
  END DO total
  WRITE(6, '(a8)') "AVERAGES"
  WRITE(6, '(a5)', advance='no') "F_x ="
  WRITE(6, my_format) sum_x / nlocal - 1
  WRITE(6, '(a5)', advance='no') "F_y ="
  WRITE(6, my_format) sum_y / nlocal - 1
  WRITE(6, '(a5)', advance='no') "F_z ="
  WRITE(6, my_format) sum_z / nlocal - 1
  end = MPI_WTIME()
  PRINT *, "Node", myrank, ":", end - start, "seconds"
  CALL MPI_BARRIER(MPI_COMM_WORLD, ierr)
  IF (myrank == pseudohost) THEN
    CALL MPI_TYPE_VECTOR(nlocal, 3, 7, MPI_REAL, newtype, ierr)
    CALL MPI_TYPE_COMMIT(newtype, ierr)
    forces: DO k = 1, nprocs
      IF (k /= pseudohost) CALL MPI_RECV(q(FX, 1), 1, newtype, k, 100, MPI_COMM_WORLD, status, ierr)
    END DO forces
  ELSE
    CALL MPI_TYPE_VECTOR(nlocal, 3, 7, MPI_REAL, newtype, ierr)
    CALL MPI_TYPE_COMMIT(newtype, ierr)
    CALL MPI_SEND(p(FX, 1), 1, newtype, pseudohost, 100, MPI_COMM_WORLD, ierr)
  END IF
  999 CALL MPI_FINALIZE(ierr)
  CALL MPI_ABORT(ierr)
  DEALLOCATE(status, dx, dy, dz, dist, square, frac, tx, ty, tz, p, q)
END PROGRAM Gravity_MPI

SUBROUTINE INIT_RANDOM_SEED()
  USE ISO_Fortran_env, ONLY: INT64
  IMPLICIT NONE
  INTEGER, ALLOCATABLE :: seed(:)
  INTEGER :: i, n, un, istat, dt(8), pid
  INTEGER(INT64) :: t
  CALL RANDOM_SEED(size = n)
  ALLOCATE(seed(n))
  OPEN(newunit=un, file='/dev/urandom', access='stream', status='old', action='read', form='unformatted', iostat=istat)
  IF (istat == 0) THEN
    READ(un) seed
    CLOSE(un)
  ELSE
    CALL SYSTEM_CLOCK(t)
    IF (t == 0) THEN
      CALL DATE_AND_TIME(values = dt)
      t = (dt(1) - 1970) * 365_INT64 * 24 * 60 * 60 * 1000 + dt(2) * 31_INT64 * 24 * 60 * 60 * 1000 + dt(3) * 24_INT64 * 60 * 60   &
      * 1000 + dt(5) * 60 * 60 * 1000 + dt(6) * 60 * 1000 + dt(7) * 1000 + dt(8)
    END IF
    pid = GETPID()
    t = IEOR(t, INT(pid, KIND(t)))
    DO i = 1, n
      seed(i) = lcg(t)
    END DO
  END IF
  CALL RANDOM_SEED(put = seed)
  DEALLOCATE(seed)
CONTAINS
  FUNCTION lcg(s)
    INTEGER :: lcg
    INTEGER(INT64) :: s
    IF (s == 0) THEN
      s = 104729
    ELSE
      s = MOD(s, 4294967296_INT64)
    END IF
    s = MOD(s * 279470273_INT64, 4294967291_INT64)
    lcg = INT(MOD(s, INT(HUGE(0), INT64)), KIND(0))
  END FUNCTION lcg
END SUBROUTINE INIT_RANDOM_SEED
            

You can read more about Fortran coarrays here.

            

OpenMP is easy to implement compared to MPI. I added the line !$OMP PARALLEL DO above the line mass1: DO i = 1, nlocal, and the line !$OMP END PARALLEL DO below the line END DO mass1. You'll also need to specify how many cores to use by setting the environment variable at the command line with export OMP_NUM_THREADS=4 (on a Unix/Linux system). After compiling with the -fopenmp flag and running the program, you should see something like this in your system resources monitor:

Using all cores!

That staircase-like feature in the CPU usage graph tells you that the program is using all 4 cores (initially) to compute the average force. The sequential drop-off occurs when each core finishes its job. The interested reader can read more about the OpenMP application programming interface here.

MODULE Shared
  USE, INTRINSIC :: ISO_Fortran_env, dp=>REAL64!modern DOUBLE PRECISION
  IMPLICIT NONE
  INTEGER :: i, j, k, myrank, npts, nprocs, nlocal
  INTEGER, PARAMETER :: pseudohost=0, n=100000, MM=1, PX=2, PY=3, PZ=4, FX=5, FY=6, FZ=7
  REAL(dp) :: random, sum_x, sum_y, sum_z
  REAL(dp), DIMENSION(:), ALLOCATABLE :: dx, dy, dz, dist, square, frac, tx, ty, tz
  REAL(dp), DIMENSION(:,:), ALLOCATABLE :: p, q
  CHARACTER(*), PARAMETER :: my_format='(f21.17)'!dp: matches unformatted output
END MODULE Shared

MODULE My_Module
  CONTAINS
    SUBROUTINE My_Subroutine(is, js, ps, qs)
      USE Shared, ONLY : dp, MM, PX, PY, PZ, FX, FY, FZ, dx, dy, dz, dist, square, frac, tx, ty, tz
      IMPLICIT NONE
      INTEGER, INTENT(IN) :: is, js
      REAL(dp), INTENT(IN OUT), CONTIGUOUS :: ps(:,:), qs(:,:)!Rules 22. & 25.
      dx(is) = ps(PX, is) - qs(PX, js)
      dy(is) = ps(PY, is) - qs(PY, js)
      dz(is) = ps(PZ, is) - qs(PZ, js)
      square(is) = dx(is)**2 + dy(is)**2 + dz(is)**2
      dist(is) = SQRT(square(is))
      frac(is) = (ps(MM,is) * qs(MM,js)) / (dist(is) * square(is))
      tx(is) = frac(is) * dx(is)
      ty(is) = frac(is) * dy(is)
      tz(is) = frac(is) * dz(is)
      ps(FX, is) = ps(FX, is) - tx(is)
      qs(FX, js) = qs(FX, js) + tx(is)
      ps(FY, is) = ps(FY, is) - ty(is)
      qs(FY, js) = qs(FY, js) + ty(is)
      ps(FZ, is) = ps(FZ, is) - tz(is)
      qs(FZ, js) = qs(FZ, js) + tz(is)
    END SUBROUTINE My_Subroutine
END MODULE My_Module

PROGRAM Gravity_OpenMP
  USE Shared
  USE My_Module
  IMPLICIT NONE
  CALL INIT_RANDOM_SEED()
  ALLOCATE(dx(MM:n), dy(MM:n), dz(MM:n), dist(MM:n), square(MM:n), frac(MM:n), tx(MM:n), ty(MM:n), tz(MM:n), p(MM:FZ, MM:n),       &
  q(MM:FZ, MM:n))
  myrank = 0
  WRITE(6, '(a18)', advance='no') "Number of points: "
  READ *, npts
  nprocs = 2!MOD
  IF (myrank == pseudohost) THEN
    IF (MOD(nprocs, 2) == 0) THEN
      IF (npts > nprocs * n) THEN
        WRITE(6, '(a28)') "Warning: Size out-of-bounds."
        npts = -1
      ELSE IF (MOD(npts, nprocs) /= 0) THEN
        WRITE(6, '(a65)') "Number of processes (nprocs) must divide number of points (npts)."
        npts = -1
      END IF
    ELSE
      WRITE(6, '(a42)') "Number of processes (nprocs) must be even."
      npts = -1
    END IF
  END IF
  IF (npts == -1) GO TO 999
  nlocal = npts / nprocs!determines size of loop
  IF (myrank == pseudohost) THEN
    particle: DO i = 1, nlocal
      CALL RANDOM_NUMBER(random)
      p(MM, i) = random
      CALL RANDOM_NUMBER(random)
      p(PX, i) = random
      CALL RANDOM_NUMBER(random)
      p(PY, i) = random
      CALL RANDOM_NUMBER(random)
      p(PZ, i) = random
      p(FX, i) = 0.0
      p(FY, i) = 0.0
      p(FZ, i) = 0.0
    END DO particle
    process: DO k = 1, nprocs
      IF (k /= pseudohost) THEN
        send: DO i = 1, nlocal
          CALL RANDOM_NUMBER(random)
          q(MM, i) = random
          CALL RANDOM_NUMBER(random)
          q(PX, i) = random
          CALL RANDOM_NUMBER(random)
          q(PY, i) = random
          CALL RANDOM_NUMBER(random)
          q(PZ, i) = random
          q(FX, i) = 0.0
          q(FY, i) = 0.0
          q(FZ, i) = 0.0
        END DO send
      END IF
    END DO process
  END IF
  copy: DO i = 1, nlocal
    q(MM, i) = p(MM, i)
    q(PX, i) = p(PX, i)
    q(PY, i) = p(PY, i)
    q(PZ, i) = p(PZ, i)
    q(FX, i) = 0.0
    q(FY, i) = 0.0
    q(FZ, i) = 0.0
  END DO copy
  !$OMP PARALLEL DO
  mass1: DO i = 1, nlocal
    mass2: DO j = i + 1, nlocal
      CALL My_Subroutine(i, j, p, q)
    END DO mass2
  END DO mass1
  !$OMP END PARALLEL DO
  sum_x = 0.0
  sum_y = 0.0
  sum_z = 0.0
  total: DO i = 1, nlocal
    p(FX, i) = p(FX, i) + q(FX, i)
    sum_x = sum_x + p(FX, i)
    p(FY, i) = p(FY, i) + q(FY, i)
    sum_y = sum_y + p(FY, i)
    p(FZ, i) = p(FZ, i) + q(FZ, i)
    sum_z = sum_z + p(FZ, i)
  END DO total
  WRITE(6, '(a8)') "AVERAGES"
  WRITE(6, '(a5)', advance='no') "F_x ="
  WRITE(6, my_format) sum_x / nlocal - 1
  WRITE(6, '(a5)', advance='no') "F_y ="
  WRITE(6, my_format) sum_y / nlocal - 1
  WRITE(6, '(a5)', advance='no') "F_z ="
  WRITE(6, my_format) sum_z / nlocal - 1
  999 STOP
  DEALLOCATE(dx, dy, dz, dist, square, frac, tx, ty, tz, p, q)
END PROGRAM Gravity_OpenMP

SUBROUTINE INIT_RANDOM_SEED()
  USE ISO_Fortran_env, ONLY: INT64
  IMPLICIT NONE
  INTEGER, ALLOCATABLE :: seed(:)
  INTEGER :: i, n, un, istat, dt(8), pid
  INTEGER(INT64) :: t
  CALL RANDOM_SEED(size = n)
  ALLOCATE(seed(n))
  OPEN(newunit=un, file='/dev/urandom', access='stream', status='old', action='read', form='unformatted', iostat=istat)
  IF (istat == 0) THEN
    READ(un) seed
    CLOSE(un)
  ELSE
    CALL SYSTEM_CLOCK(t)
    IF (t == 0) THEN
      CALL DATE_AND_TIME(values = dt)
      t = (dt(1) - 1970) * 365_INT64 * 24 * 60 * 60 * 1000 + dt(2) * 31_INT64 * 24 * 60 * 60 * 1000 + dt(3) * 24_INT64 * 60 * 60   &
      * 1000 + dt(5) * 60 * 60 * 1000 + dt(6) * 60 * 1000 + dt(7) * 1000 + dt(8)
    END IF
    pid = GETPID()
    t = IEOR(t, INT(pid, KIND(t)))
    DO i = 1, n
      seed(i) = lcg(t)
    END DO
  END IF
  CALL RANDOM_SEED(put = seed)
  DEALLOCATE(seed)
CONTAINS
  FUNCTION lcg(s)
    INTEGER :: lcg
    INTEGER(INT64) :: s
    IF (s == 0) THEN
      s = 104729
    ELSE
      s = MOD(s, 4294967296_INT64)
    END IF
    s = MOD(s * 279470273_INT64, 4294967291_INT64)
    lcg = INT(MOD(s, INT(HUGE(0), INT64)), KIND(0))
  END FUNCTION lcg
END SUBROUTINE INIT_RANDOM_SEED
            

Download the source code for each program here:

If you find any bugs in these programs, please let me know.

back-to-top DANGER