#! /bin/csh
#
# Job script to make batch runs under IBM's loadleveler. 
# Before submitting:
#   -- Set SUBMITDIR, OUTPUT, REMOTE, and POSTCLEAN.
#   -- Set job class and wallclock limit in LoadLeveler step 1.
#   -- INCLUDE any mods to model in build step below.
#   -- INCLUDE input file in exec step below.
#
# 4/02: Added a check of $return_code from tgcm.setup and tgcm.exec,
#       instead of exiting the job script at the point of failure.
#       Exiting early can prevent output files from being rcp'd, or
#       can cause empty error output to be rcp'd by the cleanup step.
#
#-----------------------------------------------------------------------
#
#
# Job name used for the working directory and the returned output file.
# The LoadLeveler command file written further down comes from a quoted
# here-document, so its job_name keyword is NOT filled in from this
# variable; it is set separately there.
#
set job = tgcm15
echo " "
echo "Begin job $job at `date`"
echo " "
#
# SUBMITDIR: Directory on IBM where job steps are executed.
# OUTPUT   : Name of output file to be returned to REMOTE.
# REMOTE   : Machine:path for rcp of output file.
# POSTCLEAN: If set to 1, SUBMITDIR is removed after job is completed.
#            (this avoids exceeding disk quota after several jobs)
#            (if model execution fails, POSTCLEAN is reset to 0)
#
# These are exported with setenv (not set) because the step scripts read
# them from the environment; each step requests environment = COPY_ALL.
#
setenv SUBMITDIR /ptmp/$user/$job.$$  # unique build and exec directory
setenv OUTPUT $job.$$.out             # unique output, using PID $$
setenv REMOTE ouray.hao:ntwk          # remote directory for output rcp
setenv POSTCLEAN 0                    # if 0, SUBMITDIR is saved
#
# Create the working directory and move into it. Everything generated
# below (loadlev.job and the step scripts) is written there.
#
set err = 0
mkdir -p $SUBMITDIR || set err = 1
if ($err == 1) then
  echo "WARNING: error making directory $SUBMITDIR"
  # Exit with an explicit non-zero status: a bare exit right after the
  # echo above would report success (status 0) to the caller.
  exit 1
else
  echo "Made directory $SUBMITDIR"
endif
cd $SUBMITDIR
echo "Moved to submit dir $SUBMITDIR"
#-----------------------------------------------------------------------
#
# Make loadleveler command file:
#
# The file is written as loadlev.job in the current directory (SUBMITDIR,
# after the cd above). The here-document delimiter is quoted, so csh does
# no variable or command substitution on the text: references such as
# $(job_name), $(step_name) and $(hostname) are written out literally
# (presumably for LoadLeveler to resolve -- confirm against the
# LoadLeveler documentation).
#
# Three steps are queued:
#   build_step   -- serial, class share, 15 minute wallclock limit.
#   exec_step    -- parallel, 12 tasks on 3 nodes, 6 hour wallclock
#                   limit; depends on (build_step == 0).
#   cleanup_step -- serial, class interactive, 30 second wallclock
#                   limit; its dependency accepts exec_step == 0 as well
#                   as exec_step != 0.
# Each step names $(step_name).csh as its executable, matching the
# build_step.csh, exec_step.csh and cleanup_step.csh files created below.
# Lines starting with a doubled comment character before the @ appear to
# be disabled alternatives kept for reference.
#
cat << 'EOF_LL' >! loadlev.job
#
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# LoadLeveler step 0: build the executable (serial)
#
# @ job_name         = tgcm15
## @ account_no       = 28103032
# @ step_name        = build_step
# @ class            = share
# @ wall_clock_limit = 00:15:00
# @ job_type         = serial
# @ environment      = COPY_ALL
# @ output           = $(job_name)ibm_build.out
# @ error            = $(job_name)ibm_build.out
# @ executable       = $(step_name).csh
# @ notification     = error
# ja report will be mailed to user after the day's accounting at midnight
## @ ja_report = yes
# @ queue
#
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# LoadLeveler step 1: execute (parallel)
#
# @ step_name        = exec_step
## @ class            = share
## @ class            = com_pr
# @ class            = com_reg
# @ wall_clock_limit = 06:00:00
## @ wall_clock_limit = 00:30:00
# @ job_type         = parallel
# @ environment      = COPY_ALL
# @ network.MPI      = css0,shared,us
# @ total_tasks      = 12
# @ node             = 3
# @ node_usage       = not_shared
# @ output           = $(job_name)ibm_exec.out
# @ error            = $(job_name)ibm_exec.out
# @ executable       = $(step_name).csh
# @ notification     = error
# @ dependency       = (build_step == 0)
# @ queue
#
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# Loadleveler step 2: clean up and send output (interactive)
#
# @ step_name        = cleanup_step
# @ class            = interactive
# @ wall_clock_limit = 00:00:30
# @ job_type         = serial
# @ environment      = COPY_ALL
# @ output           = $(job_name)ibm_cleanup.out
# @ error            = $(job_name)ibm_cleanup.out
# @ executable       = $(step_name).csh
# @ notification     = complete
# @ requirements     = (Machine == "$(hostname)")
# @ dependency       = (exec_step == 0) || (exec_step != 0) 
# @ queue
#
'EOF_LL'
#-----------------------------------------------------------------------
#
# Make csh script for build step 0:
#
# The text between the quoted EOF_BUILD_STEP markers is written verbatim
# to $SUBMITDIR/build_step.csh (no csh substitution happens here), so its
# variables are evaluated only when the step itself runs.
#
# The INCLUDE lines are not csh commands. Presumably a job-preparation
# tool replaces them with the named files before this script is run --
# TODO confirm, that tool is not visible in this file.
#
# Flow of the generated script:
#   1. tgcm.setup, then tgcm.exec, are included in turn. After each one,
#      if return_code is set and non-zero, control jumps to the return
#      label and skips the rest of the build.
#   2. RETURN is set to the label name before tgcm.exec is included
#      (presumably so the included script can jump there -- confirm).
#   3. At the return label the step output ($LOADL_STEP_OUT) is copied to
#      $home/$OUTPUT, and if return_code is set the script exits with it.
#      The non-zero exit is what the exec step dependency
#      (build_step == 0) in loadlev.job tests.
#
echo "Making $SUBMITDIR/build_step.csh"
cat << 'EOF_BUILD_STEP' >!  $SUBMITDIR/build_step.csh
#
echo " "
echo "Begin build step at `date`"
set tgcm_version = tgcm15  # model version name
# 
# Setup script establishes working directories and gets fixed source 
#   and object code from mss.
#
#INCLUDE $TGCMROOT/bld/tgcm.setup
INCLUDE tgcm.setup
if ($?return_code) then
  if ($return_code != 0) then
    echo ">>> build_step tgcm.setup return_code = $return_code"
    goto return
  endif
endif
#
# Include any source code to be overlayed on fixed source and recompiled:
#
INCLUDE -h modsrc.gonz/*.F
#INCLUDE -h Makefile
#
# Exec script builds, but will not execute (because EXEC is not set): 
# (execution will take place in loadleveler exec step 1)
#
set RETURN = return
#INCLUDE $TGCMROOT/bld/tgcm.exec
INCLUDE tgcm.exec
if ($?return_code) then
  if ($return_code != 0) then
    echo ">>> build_step tgcm.exec return_code = $return_code"
    goto return
  endif
endif
#
# Save build output to IBM home:
#
return:
cd $SUBMITDIR
echo "Copying build output $LOADL_STEP_OUT to $home/$OUTPUT"
cp $LOADL_STEP_OUT $home/$OUTPUT
if ($?return_code) then
  if ($return_code != 0) then
    exit $return_code
  else
    exit 0
  endif
endif
'EOF_BUILD_STEP'
#-----------------------------------------------------------------------
#
# Make csh script for exec step 1:
#
# The text between the quoted EOF_EXEC_STEP markers is written verbatim
# to $SUBMITDIR/exec_step.csh (no csh substitution happens here). The
# INCLUDE lines are not csh commands; presumably a job-preparation tool
# expands them before this script is run (TODO confirm, not visible here).
#
# The generated script writes the model input file, runs the model under
# timex, splits the per-task output with mklogs, appends the task 0 log
# (or the whole step output if that log is missing) to $home/$OUTPUT, and
# finally exits with the status of the model run.
#
echo "Making $SUBMITDIR/exec_step.csh"
cat << 'EOF_EXEC_STEP' >!  $SUBMITDIR/exec_step.csh
#
echo " "
echo "Begin exec step at `date`"
echo " "
cd $SUBMITDIR
#
# Get input file:
#
set input = tgcm.inp
cat << 'EOFINP' >! $input
INCLUDE gonz.storm.inp
'EOFINP'
#
# Set poe env vars that are not set by loadleveler commands: 
# These MP_ env vars are also set in tgcm.exec script, where
# they work for interactive runs, but under loadleveler, those 
# are executed from a different shell, so they have to be repeated 
# here before execution.
#
setenv MP_PGMMODEL SPMD
setenv MP_LABELIO YES
setenv MP_SHARED_MEMORY YES
#setenv MP_STDOUTMODE ORDERED
#setenv MP_INFOLEVEL 3
env | grep MP_
#
# Execute the model:
# (err records whether the run failed; it is used for the messages below
#  and as the exit status of this step)
#
set exec = tgcm.aix        # must match exec target in Makefile
echo " "
echo "Executing $exec < $input at `date`"
echo " "
set err = 0
timex $exec < $input || set err = 1
echo "Execution completed at `date`"
if ($err == 0) then
  echo "Execution of $exec completed normally."
else
  echo "Execution of $exec failed."
  echo "Will save contents of directory $SUBMITDIR"
  echo "setenv POSTCLEAN 0" >! postclean.csh # this will be checked by cleanup
endif
#
# Run mklogs on the output file to separate tasks output:
#
INCLUDE -h $TGCMROOT/bld/mklogs
set execout = $LOADL_STEP_OUT
perl mklogs $execout
#
# Append task0 exec output to OUTPUT:
#
if (-e $execout:r_task0.out) then
  echo "Appending $execout:r_task0.out to $home/$OUTPUT"
  cat $execout:r_task0.out >> $home/$OUTPUT
else
  echo ">>> WARNING task0 output not found (possible error from mklogs)."
  echo "    Appending $execout to $home/$OUTPUT"
  cat $execout >> $home/$OUTPUT
endif
#
# Exit with the status of the model run (0 = ok, 1 = failed), as the
# build step does with its return code. Without an explicit status the
# step would end with the status of the last cat or echo, so a failed
# run would look successful to LoadLeveler (notification = error).
# The cleanup step still runs either way: its dependency accepts both
# exec_step == 0 and exec_step != 0.
#
exit $err
'EOF_EXEC_STEP'
#-----------------------------------------------------------------------
#
# Make csh script for cleanup step 2:
#
# The text between the quoted EOF_CLEANUP_STEP markers is written
# verbatim to $SUBMITDIR/cleanup_step.csh (no csh substitution happens
# here); its variables are evaluated when the step runs.
#
echo "Making $SUBMITDIR/cleanup_step.csh"
cat << 'EOF_CLEANUP_STEP' >!  $SUBMITDIR/cleanup_step.csh
#
# An earlier step may have left postclean.csh in the submit directory to
# override POSTCLEAN; pick it up if it is there:
#
cd $SUBMITDIR
if (-e postclean.csh) then
  source postclean.csh
  echo "Cleanup: set POSTCLEAN $POSTCLEAN"
endif
#
# From the home directory, either keep the submit directory or remove it
# and everything in it, depending on POSTCLEAN:
#
cd
if ($POSTCLEAN != 1) then
  echo "Initial directory $SUBMITDIR was saved."
else
  rm -r $SUBMITDIR
  if ($status == 0) then
    echo "Removed directory $SUBMITDIR"
  else
    echo ">>> WARNING: error removing directory $SUBMITDIR"
  endif
endif
#
# Add the output of this step to the accumulated output file in home,
# then rcp that file to REMOTE. The home copy is deleted only when the
# rcp succeeded.
#
# NOTE(review): the output file of this step is named with a relative
# path in loadlev.job, so it presumably lives in SUBMITDIR. If so, with
# POSTCLEAN set to 1 it has already been removed by the rm -r above when
# the cat below runs. Confirm where LOADL_STEP_OUT points.
#
echo "Appending $LOADL_STEP_OUT to $home/$OUTPUT"
cat $LOADL_STEP_OUT >> $home/$OUTPUT
echo "Remote copying output file $OUTPUT to $REMOTE"
rcp $home/$OUTPUT $REMOTE
if ($status != 0) then
  echo ">>> WARNING: error from rcp of output $home/$OUTPUT to $REMOTE"
else
  echo "Successful rcp of output $home/$OUTPUT to $REMOTE"
  echo "Removing $home/$OUTPUT"
  rm $home/$OUTPUT # remove output file from home because rcp was ok
endif
exit
'EOF_CLEANUP_STEP'
#-----------------------------------------------------------------------
#
# Submit loadleveler job:
#
# loadlev.job is the command file written above into the current
# directory (SUBMITDIR, after the cd near the top of this script).
# No status is passed to exit, and the result of llsubmit is not
# checked or reported by this script.
#
llsubmit loadlev.job
exit