#! /bin/csh
#
# Job script to make batch runs under IBM's loadleveler.
# Before submitting:
#   -- Set SUBMITDIR, OUTPUT, REMOTE, and POSTCLEAN.
#   -- Set job class and wallclock limit in LoadLeveler step 1.
#   -- INCLUDE any mods to model in build step below.
#   -- INCLUDE input file in exec step below.
#
# 4/02: Added check of $return_code from tgcm.setup and tgcm.exec
#       instead of exiting the job script, which can prevent output
#       files from being rcp'd, or empty error output being rcp'd
#       by the cleanup step.
#
# Overview of what this script does when run:
#   1. Creates a per-run directory SUBMITDIR and moves into it.
#   2. Writes a LoadLeveler command file (loadlev.job) describing three
#      job steps: build_step, exec_step and cleanup_step.
#   3. Writes one csh script per step into SUBMITDIR.
#   4. Submits loadlev.job with llsubmit.
# All here-documents below use a quoted delimiter, so their text is
# written verbatim; variables such as $SUBMITDIR inside the step scripts
# are resolved when the steps run (presumably from the environment passed
# on by "environment = COPY_ALL" -- confirm against the LoadLeveler setup).
#
#-----------------------------------------------------------------------
#
set job = tgcm15
echo " "
echo "Begin job $job at `date`"
echo " "
#
# SUBMITDIR: Directory on IBM where job steps are executed.
# OUTPUT   : Name of output file to be returned to REMOTE.
# REMOTE   : Machine:path for rcp of output file.
# POSTCLEAN: If set to 1, SUBMITDIR is removed after job is completed.
#            (this avoids exceeding disk quota after several jobs)
#            (if model execution fails, POSTCLEAN is reset to 0)
#
setenv SUBMITDIR /ptmp/$user/$job.$$ # unique build and exec directory
setenv OUTPUT $job.$$.out # unique output, using PID $$
setenv REMOTE ouray.hao:ntwk # remote directory for output rcp
setenv POSTCLEAN 0 # if 0, SUBMITDIR is saved
#
set err = 0
mkdir -p $SUBMITDIR || set err = 1
if ($err == 1) then
  echo "WARNING: error making directory $SUBMITDIR"
# Exit with an explicit non-zero status: a bare "exit" would return the
# status of the echo above (0) and report this failure as success.
  exit 1
else
  echo "Made directory $SUBMITDIR"
endif
cd $SUBMITDIR
echo "Moved to submit dir $SUBMITDIR"
#-----------------------------------------------------------------------
#
# Make loadleveler command file:
#
cat << 'EOF_LL' >! loadlev.job
#
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# LoadLeveler step 0: build the executable (serial)
#
# @ job_name = tgcm15
## @ account_no = 28103032
# @ step_name = build_step
# @ class = share
# @ wall_clock_limit = 00:15:00
# @ job_type = serial
# @ environment = COPY_ALL
# @ output = $(job_name)ibm_build.out
# @ error = $(job_name)ibm_build.out
# @ executable = $(step_name).csh
# @ notification = error
# ja report will be mailed to user after the day's accounting at midnight
## @ ja_report = yes
# @ queue
#
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# LoadLeveler step 1: execute (parallel)
#
# @ step_name = exec_step
## @ class = share
## @ class = com_pr
# @ class = com_reg
# @ wall_clock_limit = 06:00:00
## @ wall_clock_limit = 00:30:00
# @ job_type = parallel
# @ environment = COPY_ALL
# @ network.MPI = css0,shared,us
# @ total_tasks = 12
# @ node = 3
# @ node_usage = not_shared
# @ output = $(job_name)ibm_exec.out
# @ error = $(job_name)ibm_exec.out
# @ executable = $(step_name).csh
# @ notification = error
# @ dependency = (build_step == 0)
# @ queue
#
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Loadleveler step 2: clean up and send output (interactive)
#
# @ step_name = cleanup_step
# @ class = interactive
# @ wall_clock_limit = 00:00:30
# @ job_type = serial
# @ environment = COPY_ALL
# @ output = $(job_name)ibm_cleanup.out
# @ error = $(job_name)ibm_cleanup.out
# @ executable = $(step_name).csh
# @ notification = complete
# @ requirements = (Machine == "$(hostname)")
# @ dependency = (exec_step == 0) || (exec_step != 0)
# @ queue
#
'EOF_LL'
#-----------------------------------------------------------------------
#
# Make csh script for build step 0:
# (the script runs the setup and exec includes, and jumps to its
#  "return" label as soon as either reports a non-zero $return_code,
#  so the build output is still copied to $home/$OUTPUT)
#
echo "Making $SUBMITDIR/build_step.csh"
cat << 'EOF_BUILD_STEP' >! $SUBMITDIR/build_step.csh
#
echo " "
echo "Begin build step at `date`"
set tgcm_version = tgcm15 # model version name
#
# Setup script establishes working directories and gets fixed source
# and object code from mss.
#
#INCLUDE $TGCMROOT/bld/tgcm.setup
INCLUDE tgcm.setup
if ($?return_code) then
  if ($return_code != 0) then
    echo ">>> build_step tgcm.setup return_code = $return_code"
    goto return
  endif
endif
#
# Include any source code to be overlayed on fixed source and recompiled:
#
INCLUDE -h modsrc.gonz/*.F
#INCLUDE -h Makefile
#
# Exec script builds, but will not execute (because EXEC is not set):
# (execution will take place in loadleveler exec step 1)
#
set RETURN = return
#INCLUDE $TGCMROOT/bld/tgcm.exec
INCLUDE tgcm.exec
if ($?return_code) then
  if ($return_code != 0) then
    echo ">>> build_step tgcm.exec return_code = $return_code"
    goto return
  endif
endif
#
# Save build output to IBM home:
#
return:
cd $SUBMITDIR
echo "Copying build output $LOADL_STEP_OUT to $home/$OUTPUT"
cp $LOADL_STEP_OUT $home/$OUTPUT
if ($?return_code) then
  if ($return_code != 0) then
    exit $return_code
  else
    exit 0
  endif
endif
'EOF_BUILD_STEP'
#-----------------------------------------------------------------------
#
# Make csh script for exec step 1:
# (on model failure the script writes postclean.csh, which the cleanup
#  step sources to force POSTCLEAN to 0 and keep SUBMITDIR)
#
echo "Making $SUBMITDIR/exec_step.csh"
cat << 'EOF_EXEC_STEP' >! $SUBMITDIR/exec_step.csh
#
echo " "
echo "Begin exec step at `date`"
echo " "
cd $SUBMITDIR
#
# Get input file:
#
set input = tgcm.inp
cat << 'EOFINP' >! $input
INCLUDE gonz.storm.inp
'EOFINP'
#
# Set poe env vars that are not set by loadleveler commands:
# These MP_ env vars are also set in tgcm.exec script, where
# they work for interactive runs, but under loadleveler, those
# are executed from a different shell, so they have to be repeated
# here before execution.
#
setenv MP_PGMMODEL SPMD
setenv MP_LABELIO YES
setenv MP_SHARED_MEMORY YES
#setenv MP_STDOUTMODE ORDERED
#setenv MP_INFOLEVEL 3
env | grep MP_
#
# Execute the model:
#
set exec = tgcm.aix # must match exec target in Makefile
echo " "
echo "Executing $exec < $input at `date`"
echo " "
set err = 0
timex $exec < $input || set err = 1
echo "Execution completed at `date`"
if ($err == 0) then
  echo "Execution of $exec completed normally."
else
  echo "Execution of $exec failed."
  echo "Will save contents of directory $SUBMITDIR"
  echo "setenv POSTCLEAN 0" >! postclean.csh # this will be checked by cleanup
endif
#
# Run mklogs on the output file to separate tasks output:
#
INCLUDE -h $TGCMROOT/bld/mklogs
set execout = $LOADL_STEP_OUT
perl mklogs $execout
#
# Append task0 exec output to OUTPUT:
#
if (-e $execout:r_task0.out) then
  echo "Appending $execout:r_task0.out to $home/$OUTPUT"
  cat $execout:r_task0.out >> $home/$OUTPUT
else
  echo ">>> WARNING task0 output not found (possible error from mklogs)."
  echo " Appending $execout to $home/$OUTPUT"
  cat $execout >> $home/$OUTPUT
endif
exit
'EOF_EXEC_STEP'
#-----------------------------------------------------------------------
#
# Make csh script for cleanup step 2:
# (removes SUBMITDIR when POSTCLEAN is 1, appends its own output to
#  $home/$OUTPUT, rcp's that file to REMOTE and deletes the local copy
#  only if the rcp succeeded)
#
echo "Making $SUBMITDIR/cleanup_step.csh"
cat << 'EOF_CLEANUP_STEP' >! $SUBMITDIR/cleanup_step.csh
#
# Check if POSTCLEAN has been changed by previous step:
#
cd $SUBMITDIR
if (-e postclean.csh) then
  source postclean.csh
  echo "Cleanup: set POSTCLEAN $POSTCLEAN"
endif
#
# Go to home and remove initial directory and contents if POSTCLEAN
# is set:
#
cd
if ($POSTCLEAN == 1) then
  set err = 0
  rm -r $SUBMITDIR || set err = 1
  if ($err == 0) then
    echo "Removed directory $SUBMITDIR"
  else
    echo ">>> WARNING: error removing directory $SUBMITDIR"
  endif
else
  echo "Initial directory $SUBMITDIR was saved."
endif
#
# Append cleanup output to build output in home, and remote copy final
# output file to REMOTE:
#
echo "Appending $LOADL_STEP_OUT to $home/$OUTPUT"
cat $LOADL_STEP_OUT >> $home/$OUTPUT
echo "Remote copying output file $OUTPUT to $REMOTE"
set err = 0
rcp $home/$OUTPUT $REMOTE || set err = 1
if ($err == 0) then
  echo "Successful rcp of output $home/$OUTPUT to $REMOTE"
  echo "Removing $home/$OUTPUT"
  rm $home/$OUTPUT # remove output file from home because rcp was ok
else
  echo ">>> WARNING: error from rcp of output $home/$OUTPUT to $REMOTE"
endif
exit
'EOF_CLEANUP_STEP'
#-----------------------------------------------------------------------
#
# Submit loadleveler job:
# (the bare "exit" below hands llsubmit's status back to the caller)
#
llsubmit loadlev.job
exit