CoMD
A Mini-app for Co-Design of Classical Molecular Dynamics.
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
CoMD.c
Go to the documentation of this file.
1 /// \file
2 /// Main program
3 ///
4 /// \mainpage CoMD: A Classical Molecular Dynamics Mini-app
5 ///
6 /// CoMD is a reference implementation of typical classical molecular
7 /// dynamics algorithms and workloads. It is created and maintained by
8 /// The Exascale Co-Design Center for Materials in Extreme Environments
9 /// (ExMatEx). http://codesign.lanl.gov/projects/exmatex. The
10 /// code is intended to serve as a vehicle for co-design by allowing
11 /// others to extend and/or reimplement it as needed to test performance of
12 /// new architectures, programming models, etc.
13 ///
14 /// The current version of CoMD is available from:
15 /// http://exmatex.github.io/CoMD
16 ///
17 /// To contact the developers of CoMD send email to: exmatex-comd@llnl.gov.
18 ///
19 /// Table of Contents
20 /// =================
21 ///
22 /// Click on the links below to browse the CoMD documentation.
23 ///
24 /// \subpage pg_md_basics
25 ///
26 /// \subpage pg_building_comd
27 ///
28 /// \subpage pg_running_comd
29 ///
30 /// \subpage pg_measuring_performance
31 ///
32 /// \subpage pg_problem_selection_and_scaling
33 ///
34 /// \subpage pg_verifying_correctness
35 ///
36 /// \subpage pg_comd_architecture
37 ///
38 /// \subpage pg_optimization_targets
39 ///
40 /// \subpage pg_whats_new
41 
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include <strings.h>
46 #include <unistd.h>
47 #include <assert.h>
48 
49 #include "CoMDTypes.h"
50 #include "decomposition.h"
51 #include "linkCells.h"
52 #include "eam.h"
53 #include "ljForce.h"
54 #include "initAtoms.h"
55 #include "memUtils.h"
56 #include "yamlOutput.h"
57 #include "parallel.h"
58 #include "performanceTimers.h"
59 #include "mycommand.h"
60 #include "timestep.h"
61 #include "constants.h"
62 
63 #define REDIRECT_OUTPUT 0
64 #define MIN(A,B) ((A) < (B) ? (A) : (B))
65 
66 static SimFlat* initSimulation(Command cmd);
67 static void destroySimulation(SimFlat** ps);
68 
69 static void initSubsystems(void);
70 static void finalizeSubsystems(void);
71 
73  int doeam, const char* potDir, const char* potName, const char* potType);
75 static Validate* initValidate(SimFlat* s);
76 static void validateResult(const Validate* val, SimFlat *sim);
77 
78 static void sumAtoms(SimFlat* s);
79 static void printThings(SimFlat* s, int iStep, double elapsedTime);
80 static void printSimulationDataYaml(FILE* file, SimFlat* s);
81 static void sanityChecks(Command cmd, double cutoff, double latticeConst, char latticeType[8]);
82 
83 
84 int main(int argc, char** argv)
85 {
86  // Prolog
87  initParallel(&argc, &argv);
90  timestampBarrier("Starting Initialization\n");
91 
94 
95  Command cmd = parseCommandLine(argc, argv);
96  printCmdYaml(yamlFile, &cmd);
97  printCmdYaml(screenOut, &cmd);
98 
99  SimFlat* sim = initSimulation(cmd);
102 
103  Validate* validate = initValidate(sim); // atom counts, energy
104  timestampBarrier("Initialization Finished\n");
105 
106  timestampBarrier("Starting simulation\n");
107 
108  // This is the CoMD main loop
109  const int nSteps = sim->nSteps;
110  const int printRate = sim->printRate;
111  int iStep = 0;
113  for (; iStep<nSteps;)
114  {
116  sumAtoms(sim);
118 
120 
122  timestep(sim, printRate, sim->dt);
124 
125  iStep += printRate;
126  }
128 
129  sumAtoms(sim);
131  timestampBarrier("Ending simulation\n");
132 
133  // Epilog
134  validateResult(validate, sim);
136 
139 
140  destroySimulation(&sim);
141  comdFree(validate);
143 
144  timestampBarrier("CoMD Ending\n");
145  destroyParallel();
146 
147  return 0;
148 }
149 
150 /// Initialized the main CoMD data stucture, SimFlat, based on command
151 /// line input from the user. Also performs certain sanity checks on
152 /// the input to screen out certain non-sensical inputs.
153 ///
154 /// Simple data members such as the time step dt are initialized
155 /// directly, substructures such as the potential, the link cells, the
156 /// atoms, etc., are initialized by calling additional initialization
157 /// functions (initPotential(), initLinkCells(), initAtoms(), etc.).
158 /// Initialization order is set by the natural dependencies of the
159 /// substructure such as the atoms need the link cells so the link cells
160 /// must be initialized before the atoms.
162 {
163  SimFlat* sim = comdMalloc(sizeof(SimFlat));
164  sim->nSteps = cmd.nSteps;
165  sim->printRate = cmd.printRate;
166  sim->dt = cmd.dt;
167  sim->domain = NULL;
168  sim->boxes = NULL;
169  sim->atoms = NULL;
170  sim->ePotential = 0.0;
171  sim->eKinetic = 0.0;
172  sim->atomExchange = NULL;
173 
174  sim->pot = initPotential(cmd.doeam, cmd.potDir, cmd.potName, cmd.potType);
175  real_t latticeConstant = cmd.lat;
176  if (cmd.lat < 0.0)
177  latticeConstant = sim->pot->lat;
178 
179  // ensure input parameters make sense.
180  sanityChecks(cmd, sim->pot->cutoff, latticeConstant, sim->pot->latticeType);
181 
182  sim->species = initSpecies(sim->pot);
183 
184  real3 globalExtent;
185  globalExtent[0] = cmd.nx * latticeConstant;
186  globalExtent[1] = cmd.ny * latticeConstant;
187  globalExtent[2] = cmd.nz * latticeConstant;
188 
189  sim->domain = initDecomposition(
190  cmd.xproc, cmd.yproc, cmd.zproc, globalExtent);
191 
192  sim->boxes = initLinkCells(sim->domain, sim->pot->cutoff);
193  sim->atoms = initAtoms(sim->boxes);
194 
195  // create lattice with desired temperature and displacement.
196  createFccLattice(cmd.nx, cmd.ny, cmd.nz, latticeConstant, sim);
197  setTemperature(sim, cmd.temperature);
199 
200  sim->atomExchange = initAtomHaloExchange(sim->domain, sim->boxes);
201 
202  // Forces must be computed before we call the time stepper.
204  redistributeAtoms(sim);
206 
208  computeForce(sim);
210 
211  kineticEnergy(sim);
212 
213  return sim;
214 }
215 
216 /// frees all data associated with *ps and frees *ps
218 {
219  if ( ! ps ) return;
220 
221  SimFlat* s = *ps;
222  if ( ! s ) return;
223 
224  BasePotential* pot = s->pot;
225  if ( pot) pot->destroy(&pot);
226  destroyLinkCells(&(s->boxes));
227  destroyAtoms(s->atoms);
229  comdFree(s);
230  *ps = NULL;
231 
232  return;
233 }
234 
235 void initSubsystems(void)
236 {
237 #if REDIRECT_OUTPUT
238  freopen("testOut.txt","w",screenOut);
239 #endif
240 
241  yamlBegin();
242 }
243 
245 {
246 #if REDIRECT_OUTPUT
247  fclose(screenOut);
248 #endif
249  yamlEnd();
250 }
251 
252 /// decide whether to get LJ or EAM potentials
254  int doeam, const char* potDir, const char* potName, const char* potType)
255 {
256  BasePotential* pot = NULL;
257 
258  if (doeam)
259  pot = initEamPot(potDir, potName, potType);
260  else
261  pot = initLjPot();
262  assert(pot);
263  return pot;
264 }
265 
267 {
268  SpeciesData* species = comdMalloc(sizeof(SpeciesData));
269 
270  strcpy(species->name, pot->name);
271  species->atomicNo = pot->atomicNo;
272  species->mass = pot->mass;
273 
274  return species;
275 }
276 
278 {
279  sumAtoms(sim);
280  Validate* val = comdMalloc(sizeof(Validate));
281  val->eTot0 = (sim->ePotential + sim->eKinetic) / sim->atoms->nGlobal;
282  val->nAtoms0 = sim->atoms->nGlobal;
283 
284  if (printRank())
285  {
286  fprintf(screenOut, "\n");
288  fprintf(screenOut, "Initial energy : %14.12f, atom count : %d \n",
289  val->eTot0, val->nAtoms0);
290  fprintf(screenOut, "\n");
291  }
292  return val;
293 }
294 
295 void validateResult(const Validate* val, SimFlat* sim)
296 {
297  if (printRank())
298  {
299  real_t eFinal = (sim->ePotential + sim->eKinetic) / sim->atoms->nGlobal;
300 
301  int nAtomsDelta = (sim->atoms->nGlobal - val->nAtoms0);
302 
303  fprintf(screenOut, "\n");
304  fprintf(screenOut, "\n");
305  fprintf(screenOut, "Simulation Validation:\n");
306 
307  fprintf(screenOut, " Initial energy : %14.12f\n", val->eTot0);
308  fprintf(screenOut, " Final energy : %14.12f\n", eFinal);
309  fprintf(screenOut, " eFinal/eInitial : %f\n", eFinal/val->eTot0);
310  if ( nAtomsDelta == 0)
311  {
312  fprintf(screenOut, " Final atom count : %d, no atoms lost\n",
313  sim->atoms->nGlobal);
314  }
315  else
316  {
317  fprintf(screenOut, "#############################\n");
318  fprintf(screenOut, "# WARNING: %6d atoms lost #\n", nAtomsDelta);
319  fprintf(screenOut, "#############################\n");
320  }
321  }
322 }
323 
325 {
326  // sum atoms across all processers
327  s->atoms->nLocal = 0;
328  for (int i = 0; i < s->boxes->nLocalBoxes; i++)
329  {
330  s->atoms->nLocal += s->boxes->nAtoms[i];
331  }
332 
334  addIntParallel(&s->atoms->nLocal, &s->atoms->nGlobal, 1);
336 }
337 
338 /// Prints current time, energy, performance etc to monitor the state of
339 /// the running simulation. Performance per atom is scaled by the
340 /// number of local atoms per process this should give consistent timing
341 /// assuming reasonable load balance
342 void printThings(SimFlat* s, int iStep, double elapsedTime)
343 {
344  // keep track previous value of iStep so we can calculate number of steps.
345  static int iStepPrev = -1;
346  static int firstCall = 1;
347 
348  int nEval = iStep - iStepPrev; // gives nEval = 1 for zeroth step.
349  iStepPrev = iStep;
350 
351  if (! printRank() )
352  return;
353 
354  if (firstCall)
355  {
356  firstCall = 0;
357  fprintf(screenOut,
358  "# Performance\n"
359  "# Loop Time(fs) Total Energy Potential Energy Kinetic Energy Temperature (us/atom) # Atoms\n");
360  fflush(screenOut);
361  }
362 
363  real_t time = iStep*s->dt;
364  real_t eTotal = (s->ePotential+s->eKinetic) / s->atoms->nGlobal;
365  real_t eK = s->eKinetic / s->atoms->nGlobal;
366  real_t eU = s->ePotential / s->atoms->nGlobal;
367  real_t Temp = (s->eKinetic / s->atoms->nGlobal) / (kB_eV * 1.5);
368 
369  double timePerAtom = 1.0e6*elapsedTime/(double)(nEval*s->atoms->nLocal);
370 
371  fprintf(screenOut, " %6d %10.2f %18.12f %18.12f %18.12f %12.4f %10.4f %12d\n",
372  iStep, time, eTotal, eU, eK, Temp, timePerAtom, s->atoms->nGlobal);
373 }
374 
375 /// Print information about the simulation in a format that is (mostly)
376 /// YAML compliant.
377 void printSimulationDataYaml(FILE* file, SimFlat* s)
378 {
379  // All ranks get maxOccupancy
380  int maxOcc = maxOccupancy(s->boxes);
381 
382  // Only rank 0 prints
383  if (! printRank())
384  return;
385 
386  fprintf(file,"Simulation data: \n");
387  fprintf(file," Total atoms : %d\n",
388  s->atoms->nGlobal);
389  fprintf(file," Min global bounds : [ %14.10f, %14.10f, %14.10f ]\n",
390  s->domain->globalMin[0], s->domain->globalMin[1], s->domain->globalMin[2]);
391  fprintf(file," Max global bounds : [ %14.10f, %14.10f, %14.10f ]\n",
392  s->domain->globalMax[0], s->domain->globalMax[1], s->domain->globalMax[2]);
393  printSeparator(file);
394  fprintf(file,"Decomposition data: \n");
395  fprintf(file," Processors : %6d,%6d,%6d\n",
396  s->domain->procGrid[0], s->domain->procGrid[1], s->domain->procGrid[2]);
397  fprintf(file," Local boxes : %6d,%6d,%6d = %8d\n",
398  s->boxes->gridSize[0], s->boxes->gridSize[1], s->boxes->gridSize[2],
399  s->boxes->gridSize[0]*s->boxes->gridSize[1]*s->boxes->gridSize[2]);
400  fprintf(file," Box size : [ %14.10f, %14.10f, %14.10f ]\n",
401  s->boxes->boxSize[0], s->boxes->boxSize[1], s->boxes->boxSize[2]);
402  fprintf(file," Box factor : [ %14.10f, %14.10f, %14.10f ] \n",
403  s->boxes->boxSize[0]/s->pot->cutoff,
404  s->boxes->boxSize[1]/s->pot->cutoff,
405  s->boxes->boxSize[2]/s->pot->cutoff);
406  fprintf(file, " Max Link Cell Occupancy: %d of %d\n",
407  maxOcc, MAXATOMS);
408  printSeparator(file);
409  fprintf(file,"Potential data: \n");
410  s->pot->print(file, s->pot);
411 
412  fflush(file);
413 }
414 
415 /// Check that the user input meets certain criteria.
416 void sanityChecks(Command cmd, double cutoff, double latticeConst, char latticeType[8])
417 {
418  int failCode = 0;
419 
420  // Check that domain grid matches number of ranks. (fail code 1)
421  int nProcs = cmd.xproc * cmd.yproc * cmd.zproc;
422  if (nProcs != getNRanks())
423  {
424  failCode |= 1;
425  if (printRank() )
426  fprintf(screenOut,
427  "\nNumber of MPI ranks must match xproc * yproc * zproc\n");
428  }
429 
430  // Check whether simuation is too small (fail code 2)
431  double minx = 2*cutoff*cmd.xproc;
432  double miny = 2*cutoff*cmd.yproc;
433  double minz = 2*cutoff*cmd.zproc;
434  double sizex = cmd.nx*latticeConst;
435  double sizey = cmd.ny*latticeConst;
436  double sizez = cmd.nz*latticeConst;
437 
438  if ( sizex < minx || sizey < miny || sizez < minz)
439  {
440  failCode |= 2;
441  if (printRank())
442  fprintf(screenOut,"\nSimulation too small.\n"
443  " Increase the number of unit cells to make the simulation\n"
444  " at least (%3.2f, %3.2f. %3.2f) Ansgstroms in size\n",
445  minx, miny, minz);
446  }
447 
448  // Check for supported lattice structure (fail code 4)
449  if (strcasecmp(latticeType, "FCC") != 0)
450  {
451  failCode |= 4;
452  if ( printRank() )
453  fprintf(screenOut,
454  "\nOnly FCC Lattice type supported, not %s. Fatal Error.\n",
455  latticeType);
456  }
457  int checkCode = failCode;
458  bcastParallel(&checkCode, sizeof(int), 0);
459  // This assertion can only fail if different tasks failed different
460  // sanity checks. That should not be possible.
461  assert(checkCode == failCode);
462 
463  if (failCode != 0)
464  exit(failCode);
465 }
466 
467 // --------------------------------------------------------------
468 
469 
470 /// \page pg_building_comd Building CoMD
471 ///
472 /// CoMD is written with portability in mind and should compile using
473 /// practically any compiler that implements the C99 standard. You will
474 /// need to create a Makefile by copying the sample provided with the
475 /// distribution (Makefile.vanilla).
476 ///
477 /// $ cp Makefile.vanilla Makefile
478 ///
479 /// and use the make command to build the code
480 ///
481 /// $ make
482 ///
483 /// The sample Makefile will compile the code on many platforms. See
484 /// comments in Makefile.vanilla for information about specifying the
485 /// name of the C compiler, and/or additional compiler switches that
486 /// might be necessary for your platform.
487 ///
488 /// The main options available in the Makefile are toggling single/double
489 /// precision and enabling/disabling MPI. In the event MPI is not
490 /// available, setting the DO_MPI flag to OFF will create a purely
491 /// serial build (you will likely also need to change the setting of
492 /// CC).
493 ///
494 /// The makefile should handle all the dependency checking needed, via
495 /// makedepend.
496 ///
497 /// 'make clean' removes the object and dependency files.
498 ///
499 /// 'make distclean' additionally removes the executable file and the
500 /// documentation files.
501 ///
502 /// Other build options
503 /// -------------------
504 ///
505 /// Various other options are made available by \#define arguments within
506 /// some of the source files.
507 ///
508 /// #REDIRECT_OUTPUT in CoMD.c
509 ///
510 /// Setting this to 1 will redirect all screen output to a file,
511 /// currently set to 'testOut.txt'.
512 ///
513 /// #POT_SHIFT in ljForce.c
514 ///
515 /// This is set to 1.0 by default, and shifts the values of the cohesive
516 /// energy given by the Lennard-Jones potential so it is zero at the
517 /// cutoff radius. This setting improves energy conservation
518 /// step-to-step as it reduces the noise generated by atoms crossing the
519 /// cutoff threshold. However, it does not affect the long-term energy
520 /// conservation of the code.
521 ///
522 /// #MAXATOMS in linkCells.h
523 ///
524 /// The default value is 64, which allows ample padding of the linkCell
525 /// structure to allow for density fluctuations. Reducing it may improve
526 /// the efficiency of the code via improved thread utilization and
527 /// reduced memory footprint.
528 
529 // --------------------------------------------------------------
530 
531 
532 // --------------------------------------------------------------
533 
534 
535 /// \page pg_measuring_performance Measuring Performance
536 ///
537 /// CoMD implements a simple and extensible system of internal timers to
538 /// measure the performance profile of the code. As explained in
539 /// performanceTimers.c, it is easy to create additional timers and
540 /// associate them with code regions of specific interest. In addition,
541 /// the getTime() and getTick() functions can be easily reimplemented to
542 /// take advantage of platform specific timing resources.
543 ///
544 /// A timing report is printed at the end of each simulation.
545 ///
546 /// ~~~~
547 /// Timings for Rank 0
548 /// Timer # Calls Avg/Call (s) Total (s) % Loop
549 /// ___________________________________________________________________
550 /// total 1 50.6701 50.6701 100.04
551 /// loop 1 50.6505 50.6505 100.00
552 /// timestep 1 50.6505 50.6505 100.00
553 /// position 10000 0.0000 0.0441 0.09
554 /// velocity 20000 0.0000 0.0388 0.08
555 /// redistribute 10001 0.0003 3.4842 6.88
556 /// atomHalo 10001 0.0002 2.4577 4.85
557 /// force 10001 0.0047 47.0856 92.96
558 /// eamHalo 10001 0.0001 1.0592 2.09
559 /// commHalo 60006 0.0000 1.7550 3.46
560 /// commReduce 12 0.0000 0.0003 0.00
561 ///
562 /// Timing Statistics Across 8 Ranks:
563 /// Timer Rank: Min(s) Rank: Max(s) Avg(s) Stdev(s)
564 /// _____________________________________________________________________________
565 /// total 3: 50.6697 0: 50.6701 50.6699 0.0001
566 /// loop 0: 50.6505 4: 50.6505 50.6505 0.0000
567 /// timestep 0: 50.6505 4: 50.6505 50.6505 0.0000
568 /// position 2: 0.0437 0: 0.0441 0.0439 0.0001
569 /// velocity 2: 0.0380 4: 0.0392 0.0385 0.0004
570 /// redistribute 0: 3.4842 1: 3.7085 3.6015 0.0622
571 /// atomHalo 0: 2.4577 7: 2.6441 2.5780 0.0549
572 /// force 1: 46.8624 0: 47.0856 46.9689 0.0619
573 /// eamHalo 3: 0.2269 6: 1.2936 1.0951 0.3344
574 /// commHalo 3: 1.0803 6: 2.1856 1.9363 0.3462
575 /// commReduce 6: 0.0002 2: 0.0003 0.0003 0.0000
576 ///
577 /// ---------------------------------------------------
578 /// Average atom update rate: 9.39 us/atom/task
579 /// ---------------------------------------------------
580 ///
581 /// ~~~~
582 /// This report consists of two blocks. The upper block lists the absolute
583 /// wall clock time spent in each timer on rank 0 of the job. The lower
584 /// block reports minimum, maximum, average, and standard deviation of
585 /// times across all tasks.
586 /// The ranks where the minimum and maximum values occured are also reported
587 /// to aid in identifying hotspots or load imbalances.
588 ///
589 /// The last line of the report gives the atom update rate in
590 /// microseconds/atom/task. Since this quantity is normalized by both
591 /// the number of atoms and the number of tasks it provides a simple
592 /// figure of merit to compare performance between runs with different
593 /// numbers of atoms and different numbers of tasks. Any increase in
594 /// this number relative to a large number of atoms on a single task
595 /// represents a loss of parallel efficiency.
596 ///
597 /// Choosing the problem size correctly has important implications for the
598 /// reported performance. Small problem sizes may run entirely in the cache
599 /// of some architectures, leading to very good performance results.
600 /// For general characterization of performance, it is probably best to
601 /// choose problem sizes which force the code to access main memory, even
602 /// though there may be strong scaling scenarios where the code is indeed
603 /// running mainly in cache.
604 
605 // --------------------------------------------------------------
606 
607 
608 /// \page pg_problem_selection_and_scaling Problem Selection and Scaling
609 ///
610 /// CoMD is a reference molecular dynamics simulation code as used in
611 /// materials science.
612 ///
613 /// Problem Specification {#sec_problem_spec}
614 /// ======================
615 ///
616 /// The reference problem is solid Copper starting from a face-centered
617 /// cubic (FCC) lattice. The initial thermodynamic conditions
618 /// (Temperature and Volume (via the lattice spacing, lat))can be specified
619 /// from the command line input. The default is 600 K and standard
620 /// volume (lat = 3.615 Angstroms).
621 /// Different temperatures (e.g. T =3000K) and volumes can be
622 /// specified to melt the system and enhance the interchange of atoms
623 /// between domains.
624 ///
625 /// The dynamics is micro-canonical (NVE = constant Number of atoms,
626 /// constant total system Volume, and constant total system Energy). As
627 /// a result, the temperature is not fixed. Rather, the temperature will
628 /// adjust from the initial temperature (as specified on the command line)
629 /// to a final temperature as the total system kinetic energy comes into
630 /// equilibrium with the total system potential energy.
631 ///
632 /// The total size of the problem (number of atoms) is specified by the
633 /// number (nx, ny, nz) of FCC unit cells in the x, y, z directions: nAtoms
634 /// = 4 * nx * ny * nz. The default size is nx = ny = nz = 20 or 32,000 atoms.
635 ///
636 /// The simulation models bulk copper by replicating itself in every
637 /// direction using periodic boundary conditions.
638 ///
639 /// Two interatomic force models are available: the Lennard-Jones (LJ)
640 /// two-body potential (ljForce.c) and the many-body Embedded-Atom Model (EAM)
641 /// potential (eam.c). The LJ potential is included for comparison and
642 /// is a valid approximation for constant volume and uniform
643 /// density. The EAM potential is a more accurate model of cohesion in
644 /// simple metals like Copper and includes the energetics necessary to
645 /// model non-uniform density and free surfaces.
646 ///
647 /// Scaling Studies in CoMD {#sec_scaling_studies}
648 /// =======================
649 ///
650 /// CoMD implements a simple geometric domain decomposition to divide
651 /// the total problem space into domains, which are owned by MPI
652 /// ranks. Each domain is a single-program multiple data (SPMD)
653 /// partition of the larger problem.
654 ///
655 /// Caution: When doing scaling studies, it is important to distinguish
656 /// between the problem setup phase and the problem execution phase. Both
657 /// are important to the workflow of doing molecular dynamics, but it
658 /// is the execution phase we want to quantify in the scaling studies
659 /// described below, for that dominates the execution time for long runs
660 /// (millions of time steps). The problem setup can be an appreciable fraction
661 /// of the execution time for short runs (the default is 100 time steps)
662 /// and erroneous conclusions drawn.
663 ///
664 /// This code is configured with timers. The times are reported per particle
665 /// and the timers for the force calculation, timestep, etc start after the
666 /// initialization phase is done.
667 ///
668 /// Weak Scaling {#ssec_weak_scaling}
669 /// -----------
670 ///
671 /// A weak scaling test fixes the amount of work per processor and
672 /// compares the execution time over number of processors. Weak scaling
673 /// keeps the ratio of inter-processor communication (surface) to
674 /// intra-processor work (volume) fixed. The amount of inter-processor
675 /// work scales with the number of processors in the domain and O(1000)
676 /// atoms per domain are needed for reasonable performance.
677 ///
678 /// Examples,
679 ///
680 /// - Increase in processor count by 8: <br>
681 /// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=zproc=4, nx=ny=nz=40)
682 ///
683 /// - Increase in processor count by 2: <br>
684 /// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=2, zproc=4, nx=ny=20, nz=40)
685 ///
686 /// In general, it is wise to keep the ratio of processor count to
687 /// system size in each direction fixed (i.e. cubic domains): xproc_0 / nx_0 = xproc_1 /
688 /// nx_1, since this minimizes surface area to volume.
689 /// Feel free to experiment, you might learn something about
690 /// algorithms to optimize communication relative to work.
691 ///
692 /// Strong Scaling {#ssec_strong_scaling}
693 /// ---------------
694 ///
695 /// A strong scaling test fixes the total problem size and compares the
696 /// execution time for different numbers of processors. Strong scaling
697 /// increases the ratio of inter-processor communication (surface) to
698 /// intra-processor work (volume).
699 ///
700 /// Examples,
701 ///
702 /// - Increase in processor count by 8: <br>
703 /// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=zproc=4, nx=ny=nz=20)
704 ///
705 /// - Increase in processor count by 2: <br>
706 /// (xproc=yproc=zproc=2, nx=ny=nz=20) -> (xproc=yproc=2, zproc=4, nx=ny=nz=20)
707 ///
708 /// The domain decomposition requires O(1000) atoms per domain and
709 /// begins to scale poorly for small numbers of atoms per domain.
710 /// Again, feel free to experiment, you might learn something here as
711 /// well. For example, when molecular dynamics codes were written for
712 /// vector supercomputers, large lists of force pairs were created for
713 /// the vector processor. These force lists provide a natural force
714 /// decomposition for early parallel computers (Fast Parallel Algorithms
715 /// for Short-Range Molecular Dynamics, S. J. Plimpton, J Comp Phys,
716 /// 117, 1-19 (1995).) Using replicated data, force decomposition can
717 /// scale to fewer than one atom per processor and is a natural
718 /// mechanism to exploit intra-processor parallelism.
719 ///
720 /// For further details see for example:
721 /// https://support.scinet.utoronto.ca/wiki/index.php/Introduction_To_Performance
722 
723 
724 // --------------------------------------------------------------
725 
726 
727 /// \page pg_verifying_correctness Verifying Correctness
728 ///
729 /// Verifying the correctness of an MD simulation is challenging.
730 /// Because MD is Lyapunov unstable, any small errors, even harmless
731 /// round-off errors, will lead to a long-term divergence in the atom
732 /// trajectories. Hence, comparing atom positions at the end of a run
733 /// is not always a useful verification technique. (Such divergences
734 /// are not a problem for science applications of MD since they do not
735 /// alter the statistical physics.) Small, single-particle errors can
736 /// also be difficult to detect in system-wide quantities such as the
737 /// kinetic or potential energy that are averaged over a large number of
738 /// particles.
739 ///
740 /// In spite of these challenges, there are several methods which are
741 /// likely to catch significant errors.
742 ///
743 /// Cohesive Energy {#sec_ver_cohesive_energy}
744 /// ===============
745 ///
746 /// With a perfect lattice as the initial structure (this is the
747 /// default), the potential energy per atom is the cohesive energy.
748 /// This value should be computed correctly to many decimal places. Any
749 /// variation beyond the last 1 or 2 decimal places is cause for
750 /// investigation. The correct values for the cohesive energy are
751 ///
752 /// | Potential | Cohesive Energy |
753 /// | :------------- | :-------------- |
754 /// | Lennard-Jones | -1.243619295058 |
755 /// | EAM (Adams) | -3.538079224691 |
756 /// | EAM (Mishin) | -3.539999969176 |
757 ///
758 /// The \link sec_command_line_options command
759 /// line options \endlink documentation explains the switches used to
760 /// select the potential used in the simulation.
761 ///
762 /// Note that the cohesive energy calculation is not sensitive to errors
763 /// in forces. It is also performed on a highly symmetric structure so
764 /// there are many errors this will not catch. Still, it is a good
765 /// first check.
766 ///
767 /// Energy Conservation {#sec_ver_energy_conservation}
768 /// ===================
769 ///
770 /// A correctly implemented force kernel, with an appropriate time step
771 /// (the default value of 1 fs is conservative for temperatures under
772 /// 10,000K) will conserve total energy over long times to 5 or more
773 /// digits. Any long term systematic drift in the total energy is a
774 /// cause for concern.
775 ///
776 /// To facilitate checking energy conservation CoMD prints the final and
777 /// initial values of the total energy. When comparing these values, pay
778 /// careful attention to these details:
779 ///
780 /// - It is common to observe an initial transient change in the total
781 /// energy. Differences in the total energy of 2-3% can be expected in
782 /// the first 10-100 time steps.
783 /// - The best way to check energy conservation is to run at least
784 /// several thousand steps and look at the slope of the total energy
785 /// ignoring at least the first one or two thousand steps. More steps
786 /// are even better.
787 /// - Set the temperature to at least several hundred K. This ensures
788 /// that atoms will sample a large range of configurations and expose
789 /// possible errors.
790 /// - Fluctuations in the energy can make it difficult to tell if
791 /// conservation is observed. Increasing the number of atoms will reduce
792 /// the fluctuations.
793 ///
794 ///
795 /// Particle Conservation {#sec_ver_particle_conservation}
796 /// =====================
797 ///
798 /// The simulation should always end with the same number of particles
799 /// it started with. Any change is a bug. CoMD checks the initial and
800 /// final number of particles and prints a warning at the end of the
801 /// simulation if they are not equal.
802 ///
803 /// Reproducibility {#sec_ver_reproducibility}
804 /// ===============
805 ///
806 /// The same simulation run repeatedly on the same hardware should
807 /// produce the same result. Because parallel computing can add
808 /// elements of non-determinism we do not expect perfect long term
809 /// reproducibility, however over a few hundred to a few thousand time
810 /// steps the energies should not exhibit run-to-run differences outside
811 /// the last 1 or 2 decimal places. Larger differences are a sign of
812 /// trouble and should be investigated. This kind of test is
813 /// practically the only way to detect race conditions in shared memory
814 /// parallelism.
815 ///
816 /// Portability {#sec_ver_portability}
817 /// ===========
818 ///
819 /// In our experience, simulations that start from the same initial
820 /// condition tend to produce very similar trajectories over short terms
821 /// (100 to 1000 time step), even on different hardware platforms.
822 /// Short term differences beyond the last 1 or 2 decimal places should
823 /// likely be investigated.
824 ///
825 /// General Principles {#sec_ver_general}
826 /// =======================
827 ///
828 /// - Simulations run at 0K are too trivial for verification, set
829 /// the initial temperature to at least several hundred K.
830 /// - Longer runs are better to check conservation. Compare
831 /// energies after initial transients are damped out.
832 /// - Larger runs are better to check conservation. Fluctuations in the
833 /// energy are averaged out.
834 /// - Short term (order 100 time steps) discrepancies from run-to-run
835 /// or platform-to platform beyond the last one or two decimal places
836 /// are reason for concern. Differences in 4th or 5th decimal place
837 /// is almost certainly a bug.
838 /// - Contact the CoMD developers (exmatex-comd@llnl.gov) if you have
839 /// questions about validation.
840 ///
841 
842 // --------------------------------------------------------------
843 
844 
845 /// \page pg_comd_architecture CoMD Architecture
846 ///
847 /// Program Flow {#sec_program_flow}
848 /// ============
849 ///
850 /// We have attempted to make the program flow in CoMD 1.1 as simple and
851 /// transparent as possible. The main program consists of three blocks:
852 /// prolog, main loop, and epilog.
853 ///
854 /// Prolog {#ssec_flow_prolog}
855 /// -------
856 ///
857 /// The job of the prolog is to initialize the simulation and prepare
858 /// for the main loop. Notable tasks in the prolog include calling
859 /// - initParallel() to start MPI
860 /// - parseCommandLine() to read the command line options
861 /// - initSimulation() to initialize the main data structure, SimFlatSt.
862 /// This includes tasks such as
863 /// - initEamPot() to read tabular data for the potential function
864 /// - initDecomposition() to set up the domain decomposition
865 /// - createFccLattice() to generate an initial structure for the atoms
866 /// - initValidate() to store initial data for a simple validation check
867 ///
868 /// In CoMD 1.1 all atomic structures are internally generated so
869 /// there is no need to read large files with atom coordinate data.
870 ///
871 /// Main Loop {#ssec_flow_main_loop}
872 /// ---------
873 ///
874 /// The main loop calls
875 /// - timestep(), the integrator to update particle positions,
876 /// - printThings() to periodically prints simulation information
877 ///
878 /// The timestep() function is the heart of the code as it choreographs
879 /// updating the particle positions, along with computing forces
880 /// (computeForce()) and communicating atoms between ranks
881 /// (redistributeAtoms()).
882 ///
883 /// Epilog {#ssec_flow_epilog}
884 /// -------
885 ///
886 /// The epilog code handles end of run bookkeeping such as
887 /// - validateResult() to check validation
888 /// - printPerformanceResults() to print a performance summary
889 /// - destroySimulation() to free memory
890 ///
891 /// Key Data Structures {#sec_key_data_structures}
892 /// ==================
893 ///
894 /// Practically all data in CoMD belongs to the SimFlatSt structure.
895 /// This includes:
896 /// - BasePotentialSt A polymorphic structure for the potential model
897 /// - HaloExchangeSt A polymorphic strcuture for communication halo data
898 /// - DomainSt The parallel domain decomposition
899 /// - LinkCellSt The link cells
900 /// - AtomsSt The atom coordinates and velocities
901 /// - SpeciesDataSt Properties of the atomic species being simulated.
902 ///
903 /// Consult the individual pages for each of these structures to learn
904 /// more. The descriptions in haloExchange.c and initLinkCells() are
905 /// especially useful to understand how the atoms are commuicated
906 /// between tasks and stored in link cells for fast pair finding.
907 
908 // --------------------------------------------------------------
909 
910 
911 /// \page pg_optimization_targets Optimization Targets
912 ///
913 /// Computation {#sec_computation}
914 /// ============
915 ///
916 /// The computational effort of classical MD is usually highly focused
917 /// in the force kernel. The two force kernels supplied by CoMD are
918 /// eamForce() and ljForce(). Both kernels are fundamentally loops over
919 /// pairs of atoms with significant opportunity to exploit high levels
920 /// of concurrency. One potential challenge when reordering or
921 /// parallelizing the pair loop structure is preventing race conditions
922 /// that result if two concurrent pair evaluations try to simultaneously
923 /// increment the forces and energies on the same atom.
924 ///
925 /// The supplied EAM kernel uses interpolation from tabular data to
926 /// evaluate functions. Hence the interpolate() function is another
927 /// potential optimization target. Note that the two potential files
928 /// distributed with CoMD have very different sizes. The Adams
929 /// potential (Cu_u6.eam) has 500 points per function in the table while
930 /// the Mishin potential (Cu01.eam.alloy) has 10,000 points per
931 /// function. This difference could potentially impact important
932 /// details such as cache miss rates.
933 ///
934 /// Communication {#sec_communication}
935 /// =============
936 ///
937 /// As the number of atoms per MPI rank decreases, the communication
938 /// routines will start to require a significant fraction of the
939 /// run time. The main communication routine in CoMD is haloExchange().
940 /// The halo exchange is simple nearest neighbor, point-to-point
941 /// communication so it should scale well to practically any number of
942 /// nodes.
943 ///
944 /// The halo exchange in CoMD 1.1 is a very simple 6-direction
945 /// structured halo exchange (see haloExchange.c). Other exchange
946 /// algorithms can be implemented without much difficulty.
947 ///
948 /// The halo exchange function is called in two very different contexts.
949 /// The main usage is to exchange halo particle information (see
950 /// initAtomHaloExchange()). This process is coordinated by the
951 /// redistributeAtoms() function.
952 ///
953 /// In addition to the atom exchange, when using the EAM potential, a
954 /// halo exchange is performed in the force routine (see
955 /// initForceHaloExchange()).
956 
957 
958 // --------------------------------------------------------------
959 
960 
961 /// \page pg_whats_new New Features and Changes in CoMD 1.1
962 ///
963 /// The main goals of the 1.1 release were to add support for MPI and to
964 /// improve the structure and clarity of the code. Achieving these
965 /// goals required considerable changes compared to the 1.0 release.
966 /// However, the core structure of the most computationally intensive
967 /// kernels (the force routines) is mostly unchanged. We believe that
968 /// lessons learned from optimizing 1.0 force kernels to specific
969 /// hardware or programming models can be quickly transferred to kernels
970 /// in the 1.1 release.
971 ///
972 /// Significant changes in CoMD 1.1 include:
973 ///
974 /// - MPI support. Both MPI and single node serial executables can be
975 /// built from the same source files.
976 ///
977 /// - Improved modularity and code clarity. Major data structures are
978 /// now organized with their own structs and initialization routines.
979 ///
980 /// - The build system has been simplified to use only standard
981 /// Makefiles instead of CMake.
982 ///
983 /// - The halo exchange operation needed to communicate remote particle
984 /// data between MPI ranks also creates "image" particles in the
985 /// serial build.
986 ///
987 /// - Unified force kernels for both serial and MPI builds
988 ///
989 /// - The addition of remote/image atoms allows periodic boundary
990 /// conditions to be handled outside the force loop.
991 ///
992 /// - An additional communication/data copy step to handle electron
993 /// density on remote/image atoms has been added to the EAM force
994 /// loop.
995 ///
996 /// - The coordinate system has been simplified to a single global
997 /// coordinate system for all particles.
998 ///
999 /// - Evaluation of energies and forces using a Chebyshev polynomial
1000 /// fits has been removed. Polynomial approximation of energies and
1001 /// forces will return in a future CoMD version.
1002 ///
1003 /// - Atomic structures are now generated internally, eliminating the
1004 /// requirement to read, write, and distribute large atom
1005 /// configuration files. Arbitrarily large initial structures can
1006 /// be generated with specified initial temperature and random
1007 /// displacements from lattice positions. Code to read/write atomic
1008 /// positions has been removed.
1009 ///
1010 /// - EAM potentials are now read from standard funcfl and setfl format
1011 /// files. Voter style files are no longer supported.
1012 ///
1013 /// - Collection of performance metrics is significantly improved.
1014 /// Developers can easily add new timers to regions of interest. The
1015 /// system is also designed to allow easy integration with platform
1016 /// specific API's to high resolution timers, cycle counters,
1017 /// hardware counters, etc.
1018 ///
1019 ///
1020 /// - Hooks to in-situ analysis and visualization have been removed.
1021 /// In-situ analysis capabilities will return in a future CoMD release.
1022 ///
1023 /// Please contact the CoMD developers (exematex-comd@llnl.gov) if
1024 /// any of the deleted features negative impacts your work. We
1025 /// may be able to help produce a custom version that includes the code
1026 /// you need.
1027 
1028 
1029 // --------------------------------------------------------------
1030 
1031 
1032 /// \page pg_md_basics MD Basics
1033 ///
1034 /// The molecular dynamics (MD) computer simulation method is a well
1035 /// established and important tool for the study of the dynamical
1036 /// properties of liquids, solids, and other systems of interest in
1037 /// Materials Science and Engineering, Chemistry and Biology. A material
1038 /// is represented in terms of atoms and molecules. The method of MD
1039 /// simulation involves the evaluation of the force acting on each atom
1040 /// due to all other atoms in the system and the numerical integration
1041 /// of the Newtonian equations of motion. Though MD was initially
1042 /// developed to compute the equilibrium thermodynamic behavior of
1043 /// materials (equation of state), most recent applications have used MD
1044 /// to study non-equilibrium processes.
1045 ///
1046 /// Wikipeda offers a basic introduction to molecular dynamics with
1047 /// many references:
1048 ///
1049 /// http://en.wikipedia.org/wiki/Molecular_dynamics
1050 ///
1051 /// For a thorough treatment of MD methods, see:
1052 /// - "Computer simulation of liquids" by M.P. Allen and D.J. Tildesley
1053 /// (Oxford, 1989)
1054 /// ISBN-10: 0198556454 | ISBN-13: 978-0198556459.
1055 ///
1056 /// For an understanding of MD simulations and application to statistical mechanics:
1057 /// - "Understanding Molecular Simulation, Second Edition: From Algorithms
1058 /// to Applications," by D. Frenkel and B. Smit (Academic Press, 2001)
1059 /// ISBN-10: 0122673514 | ISBN-13: 978-0122673511
1060 /// - "Statistical and Thermal Physics: With Computer Applications," by
1061 /// H. Gould and J. Tobochnik (Princeton, 2010)
1062 /// ISBN-10: 0691137447 | ISBN-13: 978-0691137445
1063 ///
1064 /// CoMD implements both the Lennard-Jones Potential (ljForce.c) and the
1065 /// Embedded Atom Method Potential (eam.c).
1066 ///