Restarting a Checkpointed Job
#!/bin/sh
# Resubmit.slurm
#
# Slurm batch script that restarts a DMTCP-checkpointed program from the most
# recently modified ckpt_*.dmtcp file found in the checkpoint directory.
#
# NOTE: sbatch only honors #SBATCH directives that appear before the first
# executable command, so keep every directive above the "module load" line.

## Name of the job in the squeue output
#SBATCH --job-name ResubmitDMTCP
## Deal with output and errors. Separate into 2 files (not the default).
## NOTE: %u=userID, %x=jobName, %N=nodeID, %j=jobID, %A=arrayMain, %a=arraySub
#SBATCH -o /scratch/%u/%x-%N-%j.out # Output file
#SBATCH -e /scratch/%u/%x-%N-%j.err # Error file
#SBATCH --mail-type=BEGIN,END,FAIL # NONE,BEGIN,END,FAIL,REQUEUE,ALL,...
#SBATCH --mail-user=userID@gmu.edu # Put your GMU email address here
## Specifying an upper limit on needed resources will improve your scheduling
## priority, but if you exceed these values, your job will be terminated.
## Check your "Job Ended" emails for actual resource usage info.
#SBATCH --mem=1G # Total memory needed for your job (suffixes: K,M,G,T)
#SBATCH --time=0-00:10 # Total time needed for your job: Days-Hours:Minutes
## These options are more useful when running parallel and array jobs
#SBATCH --nodes 1 # Number of nodes (computers) to reserve
#SBATCH --tasks 1 # Number of CPUs to reserve for each run
## !!! If restarting DmtcpCppDemo, uncomment (remove one leading '#') below.
## This directive lives here, ahead of any command, because sbatch ignores
## #SBATCH lines that come after the first executable statement.
##SBATCH --cpus-per-task 4 # Number of CPUs to reserve for threads

## Load dmtcp module
module load dmtcp/2.5.2

## Setup the checkpoint directory
export CKDIR="/scratch/$USER/checkpoints"

## Restart the program again after interruption.
## Make sure the appropriate .dmtcp file exists in the $CKDIR directory.
# Get the name of the checkpoint file that was created most recently.
# This may not always be what we want, but hopefully it is for the purposes
# of this demo.
# "ls -t" lists newest first, and because the glob is given with its directory
# prefix each entry is printed as a full path. Errors are silenced so that a
# glob with no matches simply yields an empty result, handled just below.
# (Assumes checkpoint file names contain no newlines.)
LATEST_CHECKPOINT=$(ls -t "$CKDIR"/ckpt_*.dmtcp 2>/dev/null | head -n 1)
export LATEST_CHECKPOINT

# Stop with a clear message instead of handing dmtcp_restart an empty argument.
if [ -z "$LATEST_CHECKPOINT" ]; then
  printf 'ERROR: no ckpt_*.dmtcp checkpoint file found in %s\n' "$CKDIR" >&2
  exit 1
fi

# If you know the actual .dmtcp file you want, it would be safer to use
# that in place of $LATEST_CHECKPOINT below. Be sure to give the full path.
dmtcp_restart "$LATEST_CHECKPOINT"