Newton's Playground
*I have not as yet been able to discover the reason for these properties of gravity from phenomena, and I do not feign hypotheses.*
(Isaac Newton, on the origin of gravity)

In A Miniature Solar System, we saw how Newton's law of universal gravitation can be coded to simulate a 3-body gravitational interaction in two dimensions. Now we would like to generalize the simulation to an arbitrary number of massive bodies in three dimensions, which is more useful for understanding large, complex systems such as our own asteroid belt. Of course, as we generalize the simulation, we'll also find that it becomes computationally more expensive: an $N$-body system requires on the order of $N^2$ pairwise force evaluations. This being the case, we will have to resort to more robust computing methods to accomplish our task.

This page examines some tools employed in modern high-performance computing (HPC): the Message Passing Interface (MPI), Coarray Fortran (CAF), and OpenMP. Shown in the tabbed interface below is a program that computes the average gravitational force in the $x$-, $y$-, and $z$-directions in a distribution of masses. This program is modified to leverage these HPC methods; the original non-HPC version is also included for comparison.

### MPI

Pros
• Runs on either shared or distributed memory architectures
• Can be used on a wider range of problems than OpenMP
• Each process has its own local variables
• Distributed memory computers are less expensive than large shared memory computers
Cons
• Requires more programming changes to go from serial to parallel version
• Can be more difficult to debug
• Performance is limited by the communication network between the nodes

### OpenMP

Pros
• Easier to program and debug than MPI
• Can still run the program as a serial code
• Serial code statements usually don't need modification
• Code is easier to understand and perhaps more easily maintained
Cons
• Can only be run on shared memory computers
• Requires a compiler that supports OpenMP (e.g., gfortran)
• Mostly used for loop parallelization

### References

• "Pros and Cons of OpenMP/MPI." Dartmouth College, Intro to MPI Course Overview.
• "Examples in MPI/Fortran." Jackson State University School of Science & Technology, Computer Science Department, Computational Science for Simulation Applications.

## Newton's Law of Universal Gravitation

Newton's law of universal gravitation models the force experienced by a pair of masses via a long-range inverse-square law:

$\displaystyle{ \mathbf{F}_{12} = -G \frac{m_1 m_2}{|\mathbf{r}_{12}|^2} \hat{\mathbf{r}}_{12} \text{,} }$

where $\mathbf{F}_{12}$ is the force applied on object $1$ by object $2$, $G$ is the gravitational constant, $m_1$ and $m_2$ are the masses of objects $1$ and $2$, respectively, $|\mathbf{r}_{12}| = |\mathbf{r}_2 - \mathbf{r}_1|$ is the distance between objects $1$ and $2$, and $\hat{\mathbf{r}}_{12} \equiv (\mathbf{r}_1 - \mathbf{r}_2) / |\mathbf{r}_1 - \mathbf{r}_2|$ is the unit vector pointing from object $2$ to object $1$. The negative sign indicates that the gravitational force is attractive: the force on object $1$ points toward object $2$.
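As a quick check of the formula, here is a minimal Python sketch. The function name and the default $G = 1$ are illustrative choices of mine; the Fortran programs below omit $G$ entirely and work in arbitrary units.

```python
import math

def pair_force(m1, m2, r1, r2, G=1.0):
    """Force on object 1 due to object 2 under the inverse-square law.

    r1 and r2 are 3-tuples of coordinates. G defaults to 1 because the
    Fortran programs on this page omit it and work in arbitrary units.
    """
    dx, dy, dz = r2[0] - r1[0], r2[1] - r1[1], r2[2] - r1[2]
    square = dx * dx + dy * dy + dz * dz
    dist = math.sqrt(square)
    # Each component of F is G m1 m2 d / |r|^3: the force on object 1
    # points along (r2 - r1), i.e. toward object 2 (attraction).
    frac = G * m1 * m2 / (dist * square)
    return (frac * dx, frac * dy, frac * dz)
```

For two unit-separated masses on the $x$-axis, the force on the first mass points in the $+x$ direction, toward the second, as attraction requires.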

This version of the program finds the average gravitational force in the $x$-, $y$-, and $z$-directions in a distribution of masses whose size is specified by npts, with the understanding that a large number of masses can be computationally expensive.

MODULE Shared
USE, INTRINSIC :: ISO_Fortran_env, dp=>REAL64!modern DOUBLE PRECISION
IMPLICIT NONE
INTEGER :: i, j, k, myrank, npts, nprocs, nlocal
INTEGER, PARAMETER :: pseudohost=0, n=100000, MM=1, PX=2, PY=3, PZ=4, FX=5, FY=6, FZ=7
REAL(dp) :: random, sum_x, sum_y, sum_z
REAL(dp), DIMENSION(:), ALLOCATABLE :: dx, dy, dz, dist, square, frac, tx, ty, tz
REAL(dp), DIMENSION(:,:), ALLOCATABLE :: p, q
CHARACTER(*), PARAMETER :: my_format='(f21.17)'!dp: matches unformatted output
END MODULE Shared

MODULE My_Module
CONTAINS
SUBROUTINE My_Subroutine(is, js, ps, qs)
USE Shared, ONLY : dp, MM, PX, PY, PZ, FX, FY, FZ, dx, dy, dz, dist, square, frac, tx, ty, tz
IMPLICIT NONE
INTEGER, INTENT(IN) :: is, js
REAL(dp), INTENT(IN OUT), CONTIGUOUS :: ps(:,:), qs(:,:)!Rules 22. & 25.
dx(is) = ps(PX, is) - qs(PX, js)
dy(is) = ps(PY, is) - qs(PY, js)
dz(is) = ps(PZ, is) - qs(PZ, js)
square(is) = dx(is)**2 + dy(is)**2 + dz(is)**2
dist(is) = SQRT(square(is))
frac(is) = (ps(MM,is) * qs(MM,js)) / (dist(is) * square(is))
tx(is) = frac(is) * dx(is)
ty(is) = frac(is) * dy(is)
tz(is) = frac(is) * dz(is)
ps(FX, is) = ps(FX, is) - tx(is)
qs(FX, js) = qs(FX, js) + tx(is)
ps(FY, is) = ps(FY, is) - ty(is)
qs(FY, js) = qs(FY, js) + ty(is)
ps(FZ, is) = ps(FZ, is) - tz(is)
qs(FZ, js) = qs(FZ, js) + tz(is)
END SUBROUTINE My_Subroutine
END MODULE My_Module

PROGRAM Gravity_Fortran
USE Shared
USE My_Module
IMPLICIT NONE
CALL INIT_RANDOM_SEED()
ALLOCATE(dx(MM:n), dy(MM:n), dz(MM:n), dist(MM:n), square(MM:n), frac(MM:n), tx(MM:n), ty(MM:n), tz(MM:n), p(MM:FZ, MM:n),       &
q(MM:FZ, MM:n))
myrank = 0
WRITE(6, '(a18)', advance='no') "Number of points: "
READ(5, *) npts
nprocs = 2!emulate two processes so the serial version matches the MPI decomposition
IF (myrank == pseudohost) THEN
IF (MOD(nprocs, 2) == 0) THEN
IF (npts > nprocs * n) THEN
WRITE(6, '(a28)') "Warning: Size out-of-bounds."
npts = -1
ELSE IF (MOD(npts, nprocs) /= 0) THEN
WRITE(6, '(a65)') "Number of processes (nprocs) must divide number of points (npts)."
npts = -1
END IF
ELSE
WRITE(6, '(a42)') "Number of processes (nprocs) must be even."
npts = -1
END IF
END IF
IF (npts == -1) GO TO 999
nlocal = npts / nprocs!determines size of loop
IF (myrank == pseudohost) THEN
particle: DO i = 1, nlocal
CALL RANDOM_NUMBER(random)
p(MM, i) = random
CALL RANDOM_NUMBER(random)
p(PX, i) = random
CALL RANDOM_NUMBER(random)
p(PY, i) = random
CALL RANDOM_NUMBER(random)
p(PZ, i) = random
p(FX, i) = 0.0
p(FY, i) = 0.0
p(FZ, i) = 0.0
END DO particle
process: DO k = 1, nprocs
IF (k /= pseudohost) THEN
send: DO i = 1, nlocal
CALL RANDOM_NUMBER(random)
q(MM, i) = random
CALL RANDOM_NUMBER(random)
q(PX, i) = random
CALL RANDOM_NUMBER(random)
q(PY, i) = random
CALL RANDOM_NUMBER(random)
q(PZ, i) = random
q(FX, i) = 0.0
q(FY, i) = 0.0
q(FZ, i) = 0.0
END DO send
END IF
END DO process
END IF
copy: DO i = 1, nlocal
q(MM, i) = p(MM, i)
q(PX, i) = p(PX, i)
q(PY, i) = p(PY, i)
q(PZ, i) = p(PZ, i)
q(FX, i) = 0.0
q(FY, i) = 0.0
q(FZ, i) = 0.0
END DO copy
mass1: DO i = 1, nlocal
mass2: DO j = i + 1, nlocal
CALL My_Subroutine(i, j, p, q)
END DO mass2
END DO mass1
sum_x = 0.0
sum_y = 0.0
sum_z = 0.0
total: DO i = 1, nlocal
p(FX, i) = p(FX, i) + q(FX, i)
sum_x = sum_x + p(FX, i)
p(FY, i) = p(FY, i) + q(FY, i)
sum_y = sum_y + p(FY, i)
p(FZ, i) = p(FZ, i) + q(FZ, i)
sum_z = sum_z + p(FZ, i)
END DO total
WRITE(6, '(a8)') "AVERAGES"
WRITE(6, my_format) sum_x / (nlocal - 1)
WRITE(6, my_format) sum_y / (nlocal - 1)
WRITE(6, my_format) sum_z / (nlocal - 1)
999 DEALLOCATE(dx, dy, dz, dist, square, frac, tx, ty, tz, p, q)
END PROGRAM Gravity_Fortran

SUBROUTINE INIT_RANDOM_SEED()
USE ISO_Fortran_env, ONLY: INT64
IMPLICIT NONE
INTEGER, ALLOCATABLE :: seed(:)
INTEGER :: i, n, un, istat, dt(8), pid
INTEGER(INT64) :: t
CALL RANDOM_SEED(size = n)
ALLOCATE(seed(n))
OPEN(newunit=un, file='/dev/urandom', access='stream', status='old', action='read', form='unformatted', iostat=istat)
IF (istat == 0) THEN
CLOSE(un)
ELSE
CALL SYSTEM_CLOCK(t)
IF (t == 0) THEN
CALL DATE_AND_TIME(values = dt)
t = (dt(1) - 1970) * 365_INT64 * 24 * 60 * 60 * 1000 + dt(2) * 31_INT64 * 24 * 60 * 60 * 1000 + dt(3) * 24_INT64 * 60 * 60   &
* 1000 + dt(5) * 60 * 60 * 1000 + dt(6) * 60 * 1000 + dt(7) * 1000 + dt(8)
END IF
pid = GETPID()
t = IEOR(t, INT(pid, KIND(t)))
DO i = 1, n
seed(i) = lcg(t)
END DO
END IF
CALL RANDOM_SEED(put = seed)
DEALLOCATE(seed)
CONTAINS
FUNCTION lcg(s)
INTEGER :: lcg
INTEGER(INT64) :: s
IF (s == 0) THEN
s = 104729
ELSE
s = MOD(s, 4294967296_INT64)
END IF
s = MOD(s * 279470273_INT64, 4294967291_INT64)
lcg = INT(MOD(s, INT(HUGE(0), INT64)), KIND(0))
END FUNCTION lcg
END SUBROUTINE INIT_RANDOM_SEED
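The heart of the listing above is the mass1/mass2 double loop: each unordered pair is visited once, and Newton's third law supplies the equal-and-opposite reaction, halving the work. A toy Python rendition of the same accumulation (with $G = 1$, as in the Fortran; the names are mine) looks like this:

```python
import math

def average_forces(masses, positions):
    """Average force components over a set of point masses (G = 1).

    Mirrors the mass1/mass2 loops: each pair (i, j) with j > i is
    visited once, and Newton's third law supplies the equal-and-opposite
    contribution on j.
    """
    npts = len(masses)
    forces = [[0.0, 0.0, 0.0] for _ in range(npts)]
    for i in range(npts):
        for j in range(i + 1, npts):
            d = [positions[i][k] - positions[j][k] for k in range(3)]
            square = sum(c * c for c in d)
            frac = masses[i] * masses[j] / (square * math.sqrt(square))
            for k in range(3):
                forces[i][k] -= frac * d[k]  # pull i toward j
                forces[j][k] += frac * d[k]  # reaction: pull j toward i
    return [sum(f[k] for f in forces) / npts for k in range(3)]
```

One consequence of the third-law pairing is that the summed internal forces nearly cancel, so the printed averages sit close to zero; the interesting quantity in the HPC variants is really the time taken to get there.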


The steps below detail the Message Passing Interface version of the program, where the number of processes, nprocs, must be even, and the total number of masses, npts, must be exactly divisible by the number of processes. I used 2 × Dell OptiPlex 755 (ultra small form factor) desktops with Intel Core 2 Duo processors for this test.

1. CALL MPI_INIT(ierr) — Initialize MPI, find the rank of each process, and the total number of processes.
2. IF (myrank == pseudohost) THEN — One process acts as the host and reads in the number of particles.
3. CALL MPI_BCAST(npts, 1, MPI_INTEGER, pseudohost, MPI_COMM_WORLD, ierr) — The number of particles is broadcast to all processes.
4. IF (npts == -1) GO TO 999 — Abort if the number of processes and/or particles is incorrect.
5. nlocal = npts / nprocs — Work out the number of particles in each process.
6. IF (myrank == pseudohost) THEN — The pseudohost initializes the particle data and sends each process its particles.
7. start = MPI_WTIME() — Initialization is now complete. Start the clock and begin work. First each process makes a copy of its particles.
8. mass1: DO i = 1, nlocal — Now the interactions between the particles in a single process is computed.
9. dest = MOD(nprocs + myrank - 1, nprocs) — The processes are arranged in a ring. Data will be passed in an anti-clockwise direction around the ring.
10. neighbors: DO k = 1, nprocs / 2 - 1 — Each process interacts with the particles from its nprocs / 2 - 1 anti-clockwise neighbors. At the end of this loop, p(i) in each process has accumulated the force from interactions with particles i + 1, ..., nlocal in its own process, plus all the particles from its nprocs / 2 - 1 anti-clockwise neighbors. The "home" of the q array is regarded as the process from which it originated. At the end of this loop, q(i) has accumulated the force from interactions with particles 1, ..., i - 1 in its home process, plus all the particles from the nprocs / 2 - 1 processes it has rotated to.
11. IF (nprocs > 1) THEN — Now q is rotated once more so that it is diametrically opposite its home process. p(i) accumulates force contributions from the interaction with particles 1, ..., i - 1 from its opposing process. q(i) accumulates force from the interaction of its home particles with particles i + 1, ..., nlocal in its current location.
12. IF (myrank < nprocs / 2) THEN — In half the processes we include the interaction of each particle with the corresponding particle in the opposing process.
13. dest = MOD(nprocs + myrank - nprocs / 2, nprocs) — Now the q array is returned to its home process.
14. total: DO i = 1, nlocal — The p and q arrays are summed to give the total force on each particle.
15. end = MPI_WTIME() — Stop clock and write out timings.
16. CALL MPI_BARRIER(MPI_COMM_WORLD, ierr) — Do a barrier to make sure the timings are written out first.
17. IF (myrank == pseudohost) THEN — Each process returns its force contributions to the pseudohost.
18. CALL MPI_FINALIZE(ierr) — Close MPI.

MPI has the disadvantage that it can be rather labor intensive to implement and debug.
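It may help to see the ring bookkeeping of steps 9–11 in isolation before reading the listing. The Python sketch below is pure bookkeeping, not MPI code; the function name is mine.

```python
def ring_schedule(nprocs):
    """Which "home" q buffers each rank sees during the rotation phase.

    At each of the nprocs / 2 - 1 rotation steps, every rank passes its
    current q buffer to the anti-clockwise neighbour (rank - 1 mod
    nprocs) and receives from the clockwise one (rank + 1 mod nprocs),
    exactly as dest and src are set in the program.
    """
    assert nprocs % 2 == 0, "the scheme requires an even process count"
    steps = nprocs // 2 - 1
    visited = {rank: [] for rank in range(nprocs)}
    for step in range(1, steps + 1):
        for rank in range(nprocs):
            # After `step` rotations, this rank holds the buffer that
            # started life on rank (rank + step) mod nprocs.
            visited[rank].append((rank + step) % nprocs)
    return visited
```

After these rotations, step 11 performs one further exchange that pairs each rank with the rank diametrically opposite it (rank + nprocs / 2, mod nprocs), and step 13 sends each q buffer back to its home process, so every pair of processes meets exactly once.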

MODULE Shared
USE, INTRINSIC :: ISO_Fortran_env, dp=>REAL64!modern DOUBLE PRECISION
USE MPI
IMPLICIT NONE
INTEGER :: i, j, k, myrank, npts, nprocs, nlocal, ierr, dest, src, newtype
INTEGER, DIMENSION(:), ALLOCATABLE :: status
INTEGER, PARAMETER :: pseudohost=0, n=100000, MM=1, PX=2, PY=3, PZ=4, FX=5, FY=6, FZ=7
REAL(dp) :: random, sum_x, sum_y, sum_z, start, end
REAL(dp), DIMENSION(:), ALLOCATABLE :: dx, dy, dz, dist, square, frac, tx, ty, tz
REAL(dp), DIMENSION(:,:), ALLOCATABLE :: p, q
CHARACTER(*), PARAMETER :: my_format='(f21.17)'!dp: matches unformatted output
END MODULE Shared

MODULE My_Module
CONTAINS
SUBROUTINE My_Subroutine(is, js, ps, qs)
USE Shared, ONLY : dp, MM, PX, PY, PZ, FX, FY, FZ, dx, dy, dz, dist, square, frac, tx, ty, tz
IMPLICIT NONE
INTEGER, INTENT(IN) :: is, js
REAL(dp), INTENT(IN OUT), CONTIGUOUS :: ps(:,:), qs(:,:)!Rules 22. & 25.
dx(is) = ps(PX, is) - qs(PX, js)
dy(is) = ps(PY, is) - qs(PY, js)
dz(is) = ps(PZ, is) - qs(PZ, js)
square(is) = dx(is)**2 + dy(is)**2 + dz(is)**2
dist(is) = SQRT(square(is))
frac(is) = (ps(MM,is) * qs(MM,js)) / (dist(is) * square(is))
tx(is) = frac(is) * dx(is)
ty(is) = frac(is) * dy(is)
tz(is) = frac(is) * dz(is)
ps(FX, is) = ps(FX, is) - tx(is)
qs(FX, js) = qs(FX, js) + tx(is)
ps(FY, is) = ps(FY, is) - ty(is)
qs(FY, js) = qs(FY, js) + ty(is)
ps(FZ, is) = ps(FZ, is) - tz(is)
qs(FZ, js) = qs(FZ, js) + tz(is)
END SUBROUTINE My_Subroutine
END MODULE My_Module

PROGRAM Gravity_MPI
USE Shared
USE My_Module
IMPLICIT NONE
CALL INIT_RANDOM_SEED()
ALLOCATE(status(MPI_STATUS_SIZE), dx(MM:n), dy(MM:n), dz(MM:n), dist(MM:n), square(MM:n), frac(MM:n), tx(MM:n), ty(MM:n),        &
tz(MM:n), p(MM:FZ, MM:n), q(MM:FZ, MM:n))
CALL MPI_INIT(ierr)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, myrank, ierr)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, nprocs, ierr)
IF (myrank == pseudohost) THEN
WRITE(6, '(a18)', advance='no') "Number of points: "
READ(5, *) npts
END IF
IF (myrank == pseudohost) THEN
IF (MOD(nprocs, 2) == 0) THEN
IF (npts > nprocs * n) THEN
WRITE(6, '(a28)') "Warning: Size out-of-bounds."
npts = -1
ELSE IF (MOD(npts, nprocs) /= 0) THEN
WRITE(6, '(a65)') "Number of processes (nprocs) must divide number of points (npts)."
npts = -1
END IF
ELSE
WRITE(6, '(a42)') "Number of processes (nprocs) must be even."
npts = -1
END IF
END IF
CALL MPI_BCAST(npts, 1, MPI_INTEGER, pseudohost, MPI_COMM_WORLD, ierr)
IF (npts == -1) GO TO 999
nlocal = npts / nprocs!determines size of loop
IF (myrank == pseudohost) THEN
particle: DO i = 1, nlocal
CALL RANDOM_NUMBER(random)
p(MM, i) = random
CALL RANDOM_NUMBER(random)
p(PX, i) = random
CALL RANDOM_NUMBER(random)
p(PY, i) = random
CALL RANDOM_NUMBER(random)
p(PZ, i) = random
p(FX, i) = 0.0
p(FY, i) = 0.0
p(FZ, i) = 0.0
END DO particle
process: DO k = 0, nprocs - 1
IF (k /= pseudohost) THEN
send: DO i = 1, nlocal
CALL RANDOM_NUMBER(random)
q(MM, i) = random
CALL RANDOM_NUMBER(random)
q(PX, i) = random
CALL RANDOM_NUMBER(random)
q(PY, i) = random
CALL RANDOM_NUMBER(random)
q(PZ, i) = random
q(FX, i) = 0.0
q(FY, i) = 0.0
q(FZ, i) = 0.0
END DO send
CALL MPI_SEND(q, 7 * nlocal, MPI_DOUBLE_PRECISION, k, 100, MPI_COMM_WORLD, ierr)
END IF
END DO process
ELSE
CALL MPI_RECV(p, 7 * nlocal, MPI_DOUBLE_PRECISION, pseudohost, 100, MPI_COMM_WORLD, status, ierr)
END IF
start = MPI_WTIME()
copy: DO i = 1, nlocal
q(MM, i) = p(MM, i)
q(PX, i) = p(PX, i)
q(PY, i) = p(PY, i)
q(PZ, i) = p(PZ, i)
q(FX, i) = 0.0
q(FY, i) = 0.0
q(FZ, i) = 0.0
END DO copy
mass1: DO i = 1, nlocal
mass2: DO j = i + 1, nlocal
CALL My_Subroutine(i, j, p, q)
END DO mass2
END DO mass1
dest = MOD(nprocs + myrank - 1, nprocs)
src  = MOD(myrank + 1, nprocs)
neighbors: DO k = 1, nprocs / 2 - 1
CALL MPI_SENDRECV_REPLACE(q, 7 * nlocal, MPI_DOUBLE_PRECISION, dest, 200, src, 200, MPI_COMM_WORLD, status, ierr)
m1: DO i = 1, nlocal
m2: DO j = 1, nlocal
CALL My_Subroutine(i, j, p, q)
END DO m2
END DO m1
END DO neighbors
IF (nprocs > 1) THEN
CALL MPI_SENDRECV_REPLACE(q, 7 * nlocal, MPI_DOUBLE_PRECISION, dest, 300, src, 300, MPI_COMM_WORLD, status, ierr)
accumulate1: DO i = nlocal, 1, -1
accumulate2: DO j = i, 1, -1
CALL My_Subroutine(i, j, p, q)
END DO accumulate2
END DO accumulate1
IF (myrank < nprocs / 2) THEN
half: DO i = 1, nlocal
CALL My_Subroutine(i, i, p, q)
END DO half
ENDIF
dest = MOD(nprocs + myrank - nprocs / 2, nprocs)
src  = MOD(myrank + nprocs / 2, nprocs)
CALL MPI_SENDRECV_REPLACE(q, 7 * nlocal, MPI_DOUBLE_PRECISION, dest, 400, src, 400, MPI_COMM_WORLD, status, ierr)
END IF
sum_x = 0.0
sum_y = 0.0
sum_z = 0.0
total: DO i = 1, nlocal
p(FX, i) = p(FX, i) + q(FX, i)
sum_x = sum_x + p(FX, i)
p(FY, i) = p(FY, i) + q(FY, i)
sum_y = sum_y + p(FY, i)
p(FZ, i) = p(FZ, i) + q(FZ, i)
sum_z = sum_z + p(FZ, i)
END DO total
WRITE(6, '(a8)') "AVERAGES"
WRITE(6, my_format) sum_x / (nlocal - 1)
WRITE(6, my_format) sum_y / (nlocal - 1)
WRITE(6, my_format) sum_z / (nlocal - 1)
end = MPI_WTIME()
PRINT *, "Node", myrank, ":", end - start, "seconds"
CALL MPI_BARRIER(MPI_COMM_WORLD, ierr)
IF (myrank == pseudohost) THEN
CALL MPI_TYPE_VECTOR(nlocal, 3, 7, MPI_DOUBLE_PRECISION, newtype, ierr)
CALL MPI_TYPE_COMMIT(newtype, ierr)
forces: DO k = 0, nprocs - 1
IF (k /= pseudohost) CALL MPI_RECV(q(FX, 1), 1, newtype, k, 100, MPI_COMM_WORLD, status, ierr)
END DO forces
ELSE
CALL MPI_TYPE_VECTOR(nlocal, 3, 7, MPI_DOUBLE_PRECISION, newtype, ierr)
CALL MPI_TYPE_COMMIT(newtype, ierr)
CALL MPI_SEND(p(FX, 1), 1, newtype, pseudohost, 100, MPI_COMM_WORLD, ierr)
END IF
999 CALL MPI_FINALIZE(ierr)
DEALLOCATE(status, dx, dy, dz, dist, square, frac, tx, ty, tz, p, q)
END PROGRAM Gravity_MPI

SUBROUTINE INIT_RANDOM_SEED()
USE ISO_Fortran_env, ONLY: INT64
IMPLICIT NONE
INTEGER, ALLOCATABLE :: seed(:)
INTEGER :: i, n, un, istat, dt(8), pid
INTEGER(INT64) :: t
CALL RANDOM_SEED(size = n)
ALLOCATE(seed(n))
OPEN(newunit=un, file='/dev/urandom', access='stream', status='old', action='read', form='unformatted', iostat=istat)
IF (istat == 0) THEN
CLOSE(un)
ELSE
CALL SYSTEM_CLOCK(t)
IF (t == 0) THEN
CALL DATE_AND_TIME(values = dt)
t = (dt(1) - 1970) * 365_INT64 * 24 * 60 * 60 * 1000 + dt(2) * 31_INT64 * 24 * 60 * 60 * 1000 + dt(3) * 24_INT64 * 60 * 60   &
* 1000 + dt(5) * 60 * 60 * 1000 + dt(6) * 60 * 1000 + dt(7) * 1000 + dt(8)
END IF
pid = GETPID()
t = IEOR(t, INT(pid, KIND(t)))
DO i = 1, n
seed(i) = lcg(t)
END DO
END IF
CALL RANDOM_SEED(put = seed)
DEALLOCATE(seed)
CONTAINS
FUNCTION lcg(s)
INTEGER :: lcg
INTEGER(INT64) :: s
IF (s == 0) THEN
s = 104729
ELSE
s = MOD(s, 4294967296_INT64)
END IF
s = MOD(s * 279470273_INT64, 4294967291_INT64)
lcg = INT(MOD(s, INT(HUGE(0), INT64)), KIND(0))
END FUNCTION lcg
END SUBROUTINE INIT_RANDOM_SEED




OpenMP is easy to implement compared to MPI. I added the line !$OMP PARALLEL DO above the line mass1: DO i = 1, nlocal, and the line !$OMP END PARALLEL DO below the line END DO mass1. You'll also need to specify how many cores to use by setting the environment variable at the command line with export OMP_NUM_THREADS=4 (on a Unix/Linux system). After compiling with the -fopenmp flag and running the program, you should see something like this in your system resources monitor:

That staircase-like feature in the CPU usage graph tells you that the program initially uses all 4 cores to compute the average force; the sequential drop-off occurs as each core finishes its share of the work. The interested reader can find more about the OpenMP application programming interface in the OpenMP specification.
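One plausible reading of that staircase: a static schedule splits the outer mass1 loop evenly by iteration count, but iteration i performs nlocal - i - 1 pair interactions, so the threads holding the early iterations carry far more work and the others finish first. The Python sketch below illustrates the imbalance; the chunking is mine and only approximates what a given compiler's static schedule actually does.

```python
def static_chunks(nlocal, nthreads):
    """Pair-interaction counts per thread for an even split of the outer loop.

    Iteration i of the mass1 loop runs nlocal - i - 1 inner iterations,
    so equal-sized chunks of i carry very unequal work. (The real OpenMP
    static schedule is implementation-defined; this split is illustrative.)
    """
    bounds = [round(t * nlocal / nthreads) for t in range(nthreads + 1)]
    return [sum(nlocal - i - 1 for i in range(bounds[t], bounds[t + 1]))
            for t in range(nthreads)]
```

With 100 particles and 4 threads, the first chunk does roughly seven times the pair work of the last, which is consistent with cores dropping out one after another rather than all at once.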

MODULE Shared
USE, INTRINSIC :: ISO_Fortran_env, dp=>REAL64!modern DOUBLE PRECISION
IMPLICIT NONE
INTEGER :: i, j, k, myrank, npts, nprocs, nlocal
INTEGER, PARAMETER :: pseudohost=0, n=100000, MM=1, PX=2, PY=3, PZ=4, FX=5, FY=6, FZ=7
REAL(dp) :: random, sum_x, sum_y, sum_z
REAL(dp), DIMENSION(:), ALLOCATABLE :: dx, dy, dz, dist, square, frac, tx, ty, tz
REAL(dp), DIMENSION(:,:), ALLOCATABLE :: p, q
CHARACTER(*), PARAMETER :: my_format='(f21.17)'!dp: matches unformatted output
END MODULE Shared

MODULE My_Module
CONTAINS
SUBROUTINE My_Subroutine(is, js, ps, qs)
USE Shared, ONLY : dp, MM, PX, PY, PZ, FX, FY, FZ, dx, dy, dz, dist, square, frac, tx, ty, tz
IMPLICIT NONE
INTEGER, INTENT(IN) :: is, js
REAL(dp), INTENT(IN OUT), CONTIGUOUS :: ps(:,:), qs(:,:)!Rules 22. & 25.
dx(is) = ps(PX, is) - qs(PX, js)
dy(is) = ps(PY, is) - qs(PY, js)
dz(is) = ps(PZ, is) - qs(PZ, js)
square(is) = dx(is)**2 + dy(is)**2 + dz(is)**2
dist(is) = SQRT(square(is))
frac(is) = (ps(MM,is) * qs(MM,js)) / (dist(is) * square(is))
tx(is) = frac(is) * dx(is)
ty(is) = frac(is) * dy(is)
tz(is) = frac(is) * dz(is)
ps(FX, is) = ps(FX, is) - tx(is)
qs(FX, js) = qs(FX, js) + tx(is)
ps(FY, is) = ps(FY, is) - ty(is)
qs(FY, js) = qs(FY, js) + ty(is)
ps(FZ, is) = ps(FZ, is) - tz(is)
qs(FZ, js) = qs(FZ, js) + tz(is)
END SUBROUTINE My_Subroutine
END MODULE My_Module

PROGRAM Gravity_OpenMP
USE Shared
USE My_Module
IMPLICIT NONE
CALL INIT_RANDOM_SEED()
ALLOCATE(dx(MM:n), dy(MM:n), dz(MM:n), dist(MM:n), square(MM:n), frac(MM:n), tx(MM:n), ty(MM:n), tz(MM:n), p(MM:FZ, MM:n),       &
q(MM:FZ, MM:n))
myrank = 0
WRITE(6, '(a18)', advance='no') "Number of points: "
READ(5, *) npts
nprocs = 2!emulate two processes so the output matches the MPI decomposition
IF (myrank == pseudohost) THEN
IF (MOD(nprocs, 2) == 0) THEN
IF (npts > nprocs * n) THEN
WRITE(6, '(a28)') "Warning: Size out-of-bounds."
npts = -1
ELSE IF (MOD(npts, nprocs) /= 0) THEN
WRITE(6, '(a65)') "Number of processes (nprocs) must divide number of points (npts)."
npts = -1
END IF
ELSE
WRITE(6, '(a42)') "Number of processes (nprocs) must be even."
npts = -1
END IF
END IF
IF (npts == -1) GO TO 999
nlocal = npts / nprocs!determines size of loop
IF (myrank == pseudohost) THEN
particle: DO i = 1, nlocal
CALL RANDOM_NUMBER(random)
p(MM, i) = random
CALL RANDOM_NUMBER(random)
p(PX, i) = random
CALL RANDOM_NUMBER(random)
p(PY, i) = random
CALL RANDOM_NUMBER(random)
p(PZ, i) = random
p(FX, i) = 0.0
p(FY, i) = 0.0
p(FZ, i) = 0.0
END DO particle
process: DO k = 1, nprocs
IF (k /= pseudohost) THEN
send: DO i = 1, nlocal
CALL RANDOM_NUMBER(random)
q(MM, i) = random
CALL RANDOM_NUMBER(random)
q(PX, i) = random
CALL RANDOM_NUMBER(random)
q(PY, i) = random
CALL RANDOM_NUMBER(random)
q(PZ, i) = random
q(FX, i) = 0.0
q(FY, i) = 0.0
q(FZ, i) = 0.0
END DO send
END IF
END DO process
END IF
copy: DO i = 1, nlocal
q(MM, i) = p(MM, i)
q(PX, i) = p(PX, i)
q(PY, i) = p(PY, i)
q(PZ, i) = p(PZ, i)
q(FX, i) = 0.0
q(FY, i) = 0.0
q(FZ, i) = 0.0
END DO copy
!$OMP PARALLEL DO
mass1: DO i = 1, nlocal
mass2: DO j = i + 1, nlocal
CALL My_Subroutine(i, j, p, q)
END DO mass2
END DO mass1
!$OMP END PARALLEL DO
sum_x = 0.0
sum_y = 0.0
sum_z = 0.0
total: DO i = 1, nlocal
p(FX, i) = p(FX, i) + q(FX, i)
sum_x = sum_x + p(FX, i)
p(FY, i) = p(FY, i) + q(FY, i)
sum_y = sum_y + p(FY, i)
p(FZ, i) = p(FZ, i) + q(FZ, i)
sum_z = sum_z + p(FZ, i)
END DO total
WRITE(6, '(a8)') "AVERAGES"
WRITE(6, my_format) sum_x / (nlocal - 1)
WRITE(6, my_format) sum_y / (nlocal - 1)
WRITE(6, my_format) sum_z / (nlocal - 1)
999 DEALLOCATE(dx, dy, dz, dist, square, frac, tx, ty, tz, p, q)
END PROGRAM Gravity_OpenMP

SUBROUTINE INIT_RANDOM_SEED()
USE ISO_Fortran_env, ONLY: INT64
IMPLICIT NONE
INTEGER, ALLOCATABLE :: seed(:)
INTEGER :: i, n, un, istat, dt(8), pid
INTEGER(INT64) :: t
CALL RANDOM_SEED(size = n)
ALLOCATE(seed(n))
OPEN(newunit=un, file='/dev/urandom', access='stream', status='old', action='read', form='unformatted', iostat=istat)
IF (istat == 0) THEN
CLOSE(un)
ELSE
CALL SYSTEM_CLOCK(t)
IF (t == 0) THEN
CALL DATE_AND_TIME(values = dt)
t = (dt(1) - 1970) * 365_INT64 * 24 * 60 * 60 * 1000 + dt(2) * 31_INT64 * 24 * 60 * 60 * 1000 + dt(3) * 24_INT64 * 60 * 60   &
* 1000 + dt(5) * 60 * 60 * 1000 + dt(6) * 60 * 1000 + dt(7) * 1000 + dt(8)
END IF
pid = GETPID()
t = IEOR(t, INT(pid, KIND(t)))
DO i = 1, n
seed(i) = lcg(t)
END DO
END IF
CALL RANDOM_SEED(put = seed)
DEALLOCATE(seed)
CONTAINS
FUNCTION lcg(s)
INTEGER :: lcg
INTEGER(INT64) :: s
IF (s == 0) THEN
s = 104729
ELSE
s = MOD(s, 4294967296_INT64)
END IF
s = MOD(s * 279470273_INT64, 4294967291_INT64)
lcg = INT(MOD(s, INT(HUGE(0), INT64)), KIND(0))
END FUNCTION lcg
END SUBROUTINE INIT_RANDOM_SEED
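For completeness, the lcg() fallback inside INIT_RANDOM_SEED, used to fill the seed array when /dev/urandom is unavailable, translates almost line for line into Python. This is a sketch for illustration; the Fortran version also folds the process ID into t via IEOR before calling lcg, a step omitted here for brevity.

```python
def lcg_stream(t, count):
    """Python rendition of the lcg() fallback in INIT_RANDOM_SEED.

    A multiplicative congruential generator modulo the prime 4294967291,
    seeded from the clock in the Fortran version. (The xor with the
    process ID performed there via IEOR is omitted here.)
    """
    s = t
    out = []
    for _ in range(count):
        if s == 0:
            s = 104729          # fixed fallback seed (a prime)
        else:
            s %= 4294967296     # keep s within 32 bits, as in the Fortran
        s = (s * 279470273) % 4294967291
        out.append(s % (2**31 - 1))  # clamp to default-INTEGER range, HUGE(0)
    return out
```

The returned values stand in for the seed(i) entries handed to RANDOM_SEED; the generator is deterministic, so the same t always yields the same seed array.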