!  SVN:$Id: ice_distribution.F90 732 2013-09-19 18:19:31Z eclare $
!|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

 module ice_distribution

! This module provides data types and routines for distributing
! blocks across processors.
!
! author: Phil Jones, LANL
! Oct. 2004: Adapted from POP by William H. Lipscomb, LANL
! Jan. 2008: Elizabeth Hunke updated to new POP infrastructure

   use ice_kinds_mod
   use ice_domain_size, only: max_blocks
   use ice_communicate, only: my_task, master_task, create_communicator
   use ice_blocks, only: nblocks_x, nblocks_y, nblocks_tot
   use ice_exit, only: abort_ice
   use ice_fileunits, only: nu_diag

   implicit none
   private
   save

   type, public :: distrb  ! distribution data type
      integer (int_kind) :: &
         nprocs            ,&! number of processors in this dist
         communicator      ,&! communicator to use in this dist
         numLocalBlocks      ! number of blocks distributed to this
                             !   local processor

      integer (int_kind), dimension(:), pointer :: &
         blockLocation  ,&! processor location for all blocks
         blockLocalID   ,&! local  block id for all blocks
         blockGlobalID    ! global block id for each local block

      integer (int_kind), dimension(:), pointer :: blockCnt
      integer (int_kind), dimension(:,:), pointer :: blockIndex

   end type

   public :: create_distribution, &
             ice_distributionGet, &
             ice_distributionGetBlockLoc, &
             ice_distributionGetBlockID, &
             create_local_block_ids

   character (char_len), public :: &
       processor_shape   ! 'square-pop' (approx) POP default config
                         ! 'square-ice' like square-pop but better for ice
                         ! 'slenderX1' (NPX x 1)
                         ! 'slenderX2' (NPX x 2)

!***********************************************************************

 contains

!***********************************************************************

 function create_distribution(dist_type, nprocs, work_per_block)

! This routine determines the distribution of blocks across processors
! by calling the appropriate subroutine based on the distribution type
! requested.  Six distributions are currently supported:
! a 2-d Cartesian distribution (cartesian), a load-balanced
! distribution using a rake algorithm based on an input amount of work
! per block (rake), a space-filling-curve algorithm (spacecurve), and
! three round-robin/sector variants (roundrobin, sectrobin, sectcart).

   character (*), intent(in) :: &
      dist_type             ! method for distributing blocks
                            !  (one of the six options above)

   integer (int_kind), intent(in) :: &
      nprocs                ! number of processors in this distribution

   integer (int_kind), dimension(:), intent(in) :: &
      work_per_block        ! amount of work per block

   type (distrb) :: &
      create_distribution   ! resulting structure describing
                            !  distribution of blocks

!----------------------------------------------------------------------
!
!  select the appropriate distribution type
!
!----------------------------------------------------------------------

   select case (trim(dist_type))

   case('cartesian')
      create_distribution = create_distrb_cart(nprocs, work_per_block)

   case('rake')
      create_distribution = create_distrb_rake(nprocs, work_per_block)

   case('roundrobin')
      create_distribution = create_distrb_roundrobin(nprocs, work_per_block)

   case('sectrobin')
      create_distribution = create_distrb_sectrobin(nprocs, work_per_block)

   case('sectcart')
      create_distribution = create_distrb_sectcart(nprocs, work_per_block)

   case('spacecurve')
      create_distribution = create_distrb_spacecurve(nprocs, &
                                                     work_per_block)

   case default
      call abort_ice('ice distribution: unknown distribution type')

   end select

!-----------------------------------------------------------------------

 end function create_distribution
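
!-----------------------------------------------------------------------
! Illustrative usage (a sketch only; the caller-side names dist and
! work below are hypothetical, not part of this module):
!
!    type (distrb) :: dist
!    integer (int_kind) :: work(nblocks_tot)
!    work(:) = 1          ! e.g. a uniform work estimate per block
!    dist = create_distribution('roundrobin', nprocs, work)
!
! In practice work_per_block is typically derived from an estimate of
! the active (ice/ocean) points in each block, so that land-only
! blocks carry zero work and are eliminated from the distribution.
!-----------------------------------------------------------------------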
!***********************************************************************

 subroutine create_local_block_ids(block_ids, distribution)

! This routine determines which blocks in an input distribution are
! located on the local processor and creates an array of block ids
! for all local blocks.

   type (distrb), intent(in) :: &
      distribution           ! input distribution for which local
                             !  blocks required

   integer (int_kind), dimension(:), pointer :: &
      block_ids              ! array of block ids for every block
                             ! that resides on the local processor

!-----------------------------------------------------------------------
!
!  local variables
!
!-----------------------------------------------------------------------

   integer (int_kind) :: &
      n, bcount              ! dummy counters

   logical (log_kind) :: dbug

!-----------------------------------------------------------------------
!
!  first determine number of local blocks to allocate array
!
!-----------------------------------------------------------------------

   bcount = 0
   do n=1,size(distribution%blockLocation)
      if (distribution%blockLocation(n) == my_task+1) bcount = bcount + 1
   end do

   if (bcount > 0) allocate(block_ids(bcount))

!-----------------------------------------------------------------------
!
!  now fill array with proper block ids
!
!-----------------------------------------------------------------------

!   dbug = .true.
   dbug = .false.
   if (bcount > 0) then
      do n=1,size(distribution%blockLocation)
         if (distribution%blockLocation(n) == my_task+1) then
            block_ids(distribution%blockLocalID(n)) = n

            if (dbug) then
               write(nu_diag,*) 'block id, proc, local_block: ', &
                                block_ids(distribution%blockLocalID(n)), &
                                distribution%blockLocation(n), &
                                distribution%blockLocalID(n)
            endif
         endif
      end do
   endif

 end subroutine create_local_block_ids
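
!-----------------------------------------------------------------------
! Illustrative usage (a sketch only; blocks_ice is a hypothetical
! caller-side name for a distribution created above):
!
!    integer (int_kind), dimension(:), pointer :: local_ids
!    nullify(local_ids)
!    call create_local_block_ids(local_ids, blocks_ice)
!
! Note that block_ids is only allocated when this task owns at least
! one block, so callers should guard any loop over it accordingly.
!-----------------------------------------------------------------------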
!***********************************************************************

 subroutine proc_decomposition(nprocs, nprocs_x, nprocs_y)

! This subroutine attempts to find an optimal (nearly square)
! 2d processor decomposition for a given number of processors.

   integer (int_kind), intent(in) :: &
      nprocs                       ! total number of processors

   integer (int_kind), intent(out) :: &
      nprocs_x, nprocs_y           ! number of procs in each dimension

!----------------------------------------------------------------------
!
!  local variables
!
!----------------------------------------------------------------------

   integer (int_kind) :: &
      iguess, jguess               ! guesses for nproc_x,y

   real (real_kind) :: &
      square                       ! square root of nprocs

!----------------------------------------------------------------------
!
!  start with an initial guess
!
!----------------------------------------------------------------------

   square = sqrt(real(nprocs,kind=real_kind))

   nprocs_x = 0
   nprocs_y = 0

   if (processor_shape == 'square-pop') then ! make as square as possible
      iguess = nint(square)
      jguess = nprocs/iguess
   elseif (processor_shape == 'square-ice') then ! better for bipolar ice
      jguess = nint(square)
      iguess = nprocs/jguess
   elseif (processor_shape == 'slenderX1') then ! 1 proc in y direction
      jguess = 1
      iguess = nprocs/jguess
   else                                         ! 2 processors in y direction
      jguess = min(2, nprocs)
      iguess = nprocs/jguess
   endif

!----------------------------------------------------------------------
!
!  try various decompositions to find the best
!
!----------------------------------------------------------------------

   proc_loop: do
      if (processor_shape == 'square-pop') then
         jguess = nprocs/iguess
      else
         iguess = nprocs/jguess
      endif

      if (iguess*jguess == nprocs) then ! valid decomp

         !*** if the blocks can be evenly distributed, it is a
         !*** good decomposition
         if (mod(nblocks_x,iguess) == 0 .and. &
             mod(nblocks_y,jguess) == 0) then
            nprocs_x = iguess
            nprocs_y = jguess
            exit proc_loop

         !*** if the blocks can be evenly distributed in a
         !*** transposed direction, it is a good decomposition
         else if (mod(nblocks_x,jguess) == 0 .and. &
                  mod(nblocks_y,iguess) == 0) then
            nprocs_x = jguess
            nprocs_y = iguess
            exit proc_loop

         !*** A valid decomposition, but keep searching for
         !*** a better one
         else
            if (nprocs_x == 0) then
               nprocs_x = iguess
               nprocs_y = jguess
            endif
            if (processor_shape == 'square-pop') then
               iguess = iguess - 1
               if (iguess == 0) then
                  exit proc_loop
               else
                  cycle proc_loop
               endif
            else
               jguess = jguess - 1
               if (jguess == 0) then
                  exit proc_loop
               else
                  cycle proc_loop
               endif
            endif
         endif

      else ! invalid decomp - keep trying

         if (processor_shape == 'square-pop') then
            iguess = iguess - 1
            if (iguess == 0) then
               exit proc_loop
            else
               cycle proc_loop
            endif
         else
            jguess = jguess - 1
            if (jguess == 0) then
               exit proc_loop
            else
               cycle proc_loop
            endif
         endif
      endif

   end do proc_loop

   if (nprocs_x == 0) then
      call abort_ice('ice: Unable to find 2d processor config')
   endif

   if (my_task == master_task) then
      write(nu_diag,'(a23,i4,a3,i4)') ' Processors (X x Y) = ', &
                                        nprocs_x,' x ',nprocs_y
   endif

!----------------------------------------------------------------------

 end subroutine proc_decomposition
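
!-----------------------------------------------------------------------
! Worked example (illustrative): for nprocs = 16,
!   processor_shape = 'square-pop' starts from iguess = nint(sqrt(16))
!     = 4 and accepts 4 x 4 if the block counts divide evenly;
!   'slenderX2' starts from jguess = 2 and yields 8 x 2;
!   'slenderX1' yields 16 x 1.
! If no iguess*jguess == nprocs pairing divides the block grid evenly,
! the first valid (but uneven) decomposition found is used instead.
!-----------------------------------------------------------------------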
!**********************************************************************

 subroutine ice_distributionDestroy(distribution)

! This routine destroys a defined distribution by deallocating
! all memory associated with the distribution.

   type (distrb), intent(inout) :: &
      distribution          ! distribution to destroy

!----------------------------------------------------------------------
!
!  local variables
!
!----------------------------------------------------------------------

   integer (int_kind) :: istat   ! status flag for deallocate

!----------------------------------------------------------------------
!
!  reset scalars
!
!----------------------------------------------------------------------

   distribution%nprocs         = 0
   distribution%communicator   = 0
   distribution%numLocalBlocks = 0

!----------------------------------------------------------------------
!
!  deallocate arrays
!
!----------------------------------------------------------------------

   deallocate(distribution%blockLocation, stat=istat)
   deallocate(distribution%blockLocalID , stat=istat)
   deallocate(distribution%blockGlobalID, stat=istat)

!-----------------------------------------------------------------------

 end subroutine ice_distributionDestroy

!***********************************************************************

 subroutine ice_distributionGet(distribution,&
                                nprocs, communicator, numLocalBlocks, &
                                blockLocation, blockLocalID, blockGlobalID)

! This routine extracts information from a distribution.

   type (distrb), intent(in) :: &
      distribution          ! input distribution for which information
                            !  is requested

   integer (int_kind), intent(out), optional :: &
      nprocs            ,&! number of processors in this dist
      communicator      ,&! communicator to use in this dist
      numLocalBlocks      ! number of blocks distributed to this
                          !   local processor

   integer (int_kind), dimension(:), pointer, optional :: &
      blockLocation     ,&! processor location for all blocks
      blockLocalID      ,&! local  block id for all blocks
      blockGlobalID       ! global block id for each local block

!-----------------------------------------------------------------------
!
!  depending on which optional arguments are present, extract the
!  requested info
!
!-----------------------------------------------------------------------

   if (present(nprocs))         nprocs         = distribution%nprocs
   if (present(communicator))   communicator   = distribution%communicator
   if (present(numLocalBlocks)) numLocalBlocks = distribution%numLocalBlocks

   if (present(blockLocation)) then
      if (associated(distribution%blockLocation)) then
         blockLocation => distribution%blockLocation
      else
         call abort_ice( &
            'ice_distributionGet: blockLocation not allocated')
         return
      endif
   endif

   if (present(blockLocalID)) then
      if (associated(distribution%blockLocalID)) then
         ! pointer association (=>) to match blockLocation above; the
         ! dummy is a pointer, so plain assignment would copy into an
         ! unassociated target
         blockLocalID => distribution%blockLocalID
      else
         call abort_ice( &
            'ice_distributionGet: blockLocalID not allocated')
         return
      endif
   endif

   if (present(blockGlobalID)) then
      if (associated(distribution%blockGlobalID)) then
         blockGlobalID => distribution%blockGlobalID
      else
         call abort_ice( &
            'ice_distributionGet: blockGlobalID not allocated')
         return
      endif
   endif

!-----------------------------------------------------------------------

 end subroutine ice_distributionGet
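
!-----------------------------------------------------------------------
! Illustrative usage (a sketch only; dist, np and loc are hypothetical
! caller-side names):
!
!    integer (int_kind) :: np
!    integer (int_kind), dimension(:), pointer :: loc
!    call ice_distributionGet(dist, nprocs=np, blockLocation=loc)
!
! The pointer arguments alias the distribution's internal arrays
! rather than copying them, so callers must not deallocate them.
!-----------------------------------------------------------------------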
!***********************************************************************

 subroutine ice_distributionGetBlockLoc(distribution, blockID, &
                                        processor, localID)

! Given a distribution of blocks and a global block ID, return
! the processor and local index for the block.  A zero for both
! is returned in the case that the block has been eliminated from
! the distribution (i.e. has no active points).

   type (distrb), intent(in) :: &
      distribution           ! input distribution for which information
                             !  is requested

   integer (int_kind), intent(in) :: &
      blockID                ! global block id for which location requested

   integer (int_kind), intent(out) :: &
      processor,            &! processor on which block resides
      localID                ! local index for this block on this proc

!-----------------------------------------------------------------------
!
!  check for valid blockID
!
!-----------------------------------------------------------------------

   if (blockID < 0 .or. blockID > nblocks_tot) then
      call abort_ice( &
         'ice_distributionGetBlockLoc: invalid block id')
      return
   endif

!-----------------------------------------------------------------------
!
!  extract the location from the distribution data structure
!
!-----------------------------------------------------------------------

   processor = distribution%blockLocation(blockID)
   localID   = distribution%blockLocalID (blockID)

!-----------------------------------------------------------------------

 end subroutine ice_distributionGetBlockLoc

!***********************************************************************

 subroutine ice_distributionGetBlockID(distribution, localID, &
                                       blockID)

! Given a distribution of blocks and a local block index, return
! the global block id for the block.

   type (distrb), intent(in) :: &
      distribution           ! input distribution for which information
                             !  is requested

   integer (int_kind), intent(in) :: &
      localID                ! local index for this block on this proc

   integer (int_kind), intent(out) :: &
      blockID                ! global block id for this local block

!-----------------------------------------------------------------------
!
!  check for valid localID
!
!-----------------------------------------------------------------------

   if (localID < 0 .or. localID > distribution%numLocalBlocks) then
      call abort_ice( &
         'ice_distributionGetBlockID: invalid local id')
      return
   endif

!-----------------------------------------------------------------------
!
!  extract the global ID from the distribution data structure
!
!-----------------------------------------------------------------------

   blockID = distribution%blockGlobalID (localID)

!-----------------------------------------------------------------------

 end subroutine ice_distributionGetBlockID

!***********************************************************************

 function create_distrb_cart(nprocs, workPerBlock) result(newDistrb)

! This function creates a distribution of blocks across processors
! using a 2-d Cartesian distribution.

   integer (int_kind), intent(in) :: &
      nprocs                ! number of processors in this distribution

   integer (int_kind), dimension(:), intent(in) :: &
      workPerBlock          ! amount of work per block

   type (distrb) :: &
      newDistrb             ! resulting structure describing Cartesian
                            !  distribution of blocks

!----------------------------------------------------------------------
!
!  local variables
!
!----------------------------------------------------------------------

   integer (int_kind) :: &
      i, j,                 &! dummy loop indices
      istat,                &! status flag for allocation
      iblock, jblock,       &! block loop indices
      is, ie, js, je,       &! start, end block indices for each proc
      processor,            &! processor position in cartesian decomp
      globalID,             &! global block ID
      localID,              &! block location on this processor
      nprocsX,              &! num of procs in x for global domain
      nprocsY,              &! num of procs in y for global domain
      numBlocksXPerProc,    &! num of blocks per processor in x
      numBlocksYPerProc      ! num of blocks per processor in y

!----------------------------------------------------------------------
!
!  create communicator for this distribution
!
!----------------------------------------------------------------------

   call create_communicator(newDistrb%communicator, nprocs)

!----------------------------------------------------------------------
!
!  try to find best processor arrangement
!
!----------------------------------------------------------------------

   newDistrb%nprocs = nprocs

   call proc_decomposition(nprocs, nprocsX, nprocsY)

!----------------------------------------------------------------------
!
!  allocate space for decomposition
!
!----------------------------------------------------------------------

   allocate (newDistrb%blockLocation(nblocks_tot), &
             newDistrb%blockLocalID (nblocks_tot), stat=istat)

!----------------------------------------------------------------------
!
!  distribute blocks linearly across processors in each direction
!
!----------------------------------------------------------------------
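
! Worked example (illustrative numbers): with nblocks_x = 8 and
! nprocsX = 3, numBlocksXPerProc = (8-1)/3 + 1 = 3, so the processor
! columns own i-blocks 1-3, 4-6 and 7-8 (the last range is clipped to
! nblocks_x).  Global block IDs follow row-major order,
! globalID = (jblock-1)*nblocks_x + iblock.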
   numBlocksXPerProc = (nblocks_x-1)/nprocsX + 1
   numBlocksYPerProc = (nblocks_y-1)/nprocsY + 1

   do j=1,nprocsY
   do i=1,nprocsX
      processor = (j-1)*nprocsX + i    ! number the processors
                                       ! left to right, bot to top

      is = (i-1)*numBlocksXPerProc + 1 ! starting block in i
      ie =  i   *numBlocksXPerProc     ! ending   block in i
      if (ie > nblocks_x) ie = nblocks_x
      js = (j-1)*numBlocksYPerProc + 1 ! starting block in j
      je =  j   *numBlocksYPerProc     ! ending   block in j
      if (je > nblocks_y) je = nblocks_y

      localID = 0  ! initialize counter for local index
      do jblock = js,je
      do iblock = is,ie
         globalID = (jblock - 1)*nblocks_x + iblock
         if (workPerBlock(globalID) /= 0) then
            localID = localID + 1
            newDistrb%blockLocation(globalID) = processor
            newDistrb%blockLocalID (globalID) = localID
         else  ! no work - eliminate block from distribution
            newDistrb%blockLocation(globalID) = 0
            newDistrb%blockLocalID (globalID) = 0
         endif
      end do
      end do

      ! if this is the local processor, set number of local blocks
      if (my_task == processor - 1) then
         newDistrb%numLocalBlocks = localID
      endif

   end do
   end do

!----------------------------------------------------------------------
!
!  now store the local info
!
!----------------------------------------------------------------------

   if (newDistrb%numLocalBlocks > 0) then
      allocate (newDistrb%blockGlobalID(newDistrb%numLocalBlocks), &
                stat=istat)

      do j=1,nprocsY
      do i=1,nprocsX
         processor = (j-1)*nprocsX + i

         if (processor == my_task + 1) then
            is = (i-1)*numBlocksXPerProc + 1 ! starting block in i
            ie =  i   *numBlocksXPerProc     ! ending   block in i
            if (ie > nblocks_x) ie = nblocks_x
            js = (j-1)*numBlocksYPerProc + 1 ! starting block in j
            je =  j   *numBlocksYPerProc     ! ending   block in j
            if (je > nblocks_y) je = nblocks_y

            localID = 0  ! initialize counter for local index
            do jblock = js,je
            do iblock = is,ie
               globalID = (jblock - 1)*nblocks_x + iblock
               if (workPerBlock(globalID) /= 0) then
                  localID = localID + 1
                  newDistrb%blockGlobalID (localID) = globalID
               endif
            end do
            end do

         endif

      end do
      end do

   endif

!----------------------------------------------------------------------

 end function create_distrb_cart

!**********************************************************************

 function create_distrb_rake(nprocs, workPerBlock) result(newDistrb)

! This function distributes blocks across processors in a
! load-balanced manner based on the amount of work per block.
! A rake algorithm is used in which the blocks are first distributed
! in a Cartesian distribution and then a rake is applied in each
! Cartesian direction.

   integer (int_kind), intent(in) :: &
      nprocs                ! number of processors in this distribution

   integer (int_kind), dimension(:), intent(in) :: &
      workPerBlock          ! amount of work per block

   type (distrb) :: &
      newDistrb             ! resulting structure describing
                            !  load-balanced distribution of blocks

!----------------------------------------------------------------------
!
!  local variables
!
!----------------------------------------------------------------------

   integer (int_kind) :: &
      i,j,n              ,&! dummy loop indices
      pid                ,&! dummy for processor id
      istat              ,&! status flag for allocates
      localBlock         ,&! local block position on processor
      numOcnBlocks       ,&! number of ocean blocks
      maxWork            ,&! max amount of work in any block
      nprocsX            ,&! num of procs in x for global domain
      nprocsY              ! num of procs in y for global domain

   integer (int_kind), dimension(:), allocatable :: &
      priority           ,&! priority for moving blocks
      workTmp            ,&! work per row or column for rake algorithm
      procTmp              ! temp processor id for rake algorithm

   type (distrb) :: dist  ! temporary distribution to hold result
!----------------------------------------------------------------------
!
!  first set up as Cartesian distribution
!
!----------------------------------------------------------------------

   dist = create_distrb_cart(nprocs, workPerBlock)

!----------------------------------------------------------------------
!
!  if the number of blocks is close to the number of processors,
!  only do a 1-d rake on the entire distribution
!
!----------------------------------------------------------------------

   numOcnBlocks = count(workPerBlock /= 0)
   maxWork = maxval(workPerBlock)

   if (numOcnBlocks <= 2*nprocs) then

      allocate(priority(nblocks_tot), stat=istat)

      !*** initialize priority array

      do j=1,nblocks_y
      do i=1,nblocks_x
         n=(j-1)*nblocks_x + i
         if (workPerBlock(n) > 0) then
            priority(n) = maxWork + n - workPerBlock(n)
         else
            priority(n) = 0
         endif
      end do
      end do

      allocate(workTmp(nblocks_tot), procTmp(nblocks_tot), stat=istat)

      workTmp(:) = 0
      do i=1,nprocs
         procTmp(i) = i
         do n=1,nblocks_tot
            if (dist%blockLocation(n) == i) then
               workTmp(i) = workTmp(i) + workPerBlock(n)
            endif
         end do
      end do

      call ice_distributionRake (workTmp, procTmp, workPerBlock, &
                                 priority, dist)

      deallocate(workTmp, procTmp, stat=istat)

!----------------------------------------------------------------------
!
!  otherwise re-distribute blocks using a rake in each direction
!
!----------------------------------------------------------------------

   else

      call proc_decomposition(dist%nprocs, nprocsX, nprocsY)

!----------------------------------------------------------------------
!
!  load-balance using a rake algorithm in the x-direction first
!
!----------------------------------------------------------------------

      allocate(priority(nblocks_tot), stat=istat)

      !*** set highest priority such that eastern-most blocks
      !*** and blocks with the least amount of work are
      !*** moved first

      do j=1,nblocks_y
      do i=1,nblocks_x
         n=(j-1)*nblocks_x + i
         if (workPerBlock(n) > 0) then
            priority(n) = (maxWork + 1)*(nblocks_x + i) - &
                          workPerBlock(n)
         else
            priority(n) = 0
         endif
      end do
      end do

      allocate(workTmp(nprocsX), procTmp(nprocsX), stat=istat)

      do j=1,nprocsY

         workTmp(:) = 0
         do i=1,nprocsX
            pid = (j-1)*nprocsX + i
            procTmp(i) = pid
            do n=1,nblocks_tot
               if (dist%blockLocation(n) == pid) then
                  workTmp(i) = workTmp(i) + workPerBlock(n)
               endif
            end do
         end do

         call ice_distributionRake (workTmp, procTmp, workPerBlock, &
                                    priority, dist)
      end do

      deallocate(workTmp, procTmp, stat=istat)

!----------------------------------------------------------------------
!
!  use a rake algorithm in the y-direction now
!
!----------------------------------------------------------------------

      !*** set highest priority for northern-most blocks

      do j=1,nblocks_y
      do i=1,nblocks_x
         n=(j-1)*nblocks_x + i
         if (workPerBlock(n) > 0) then
            priority(n) = (maxWork + 1)*(nblocks_y + j) - &
                          workPerBlock(n)
         else
            priority(n) = 0
         endif
      end do
      end do

      allocate(workTmp(nprocsY), procTmp(nprocsY), stat=istat)

      do i=1,nprocsX

         workTmp(:) = 0
         do j=1,nprocsY
            pid = (j-1)*nprocsX + i
            procTmp(j) = pid
            do n=1,nblocks_tot
               if (dist%blockLocation(n) == pid) then
                  workTmp(j) = workTmp(j) + workPerBlock(n)
               endif
            end do
         end do

         call ice_distributionRake (workTmp, procTmp, workPerBlock, &
                                    priority, dist)
      end do

      deallocate(workTmp, procTmp, priority, stat=istat)

   endif  ! 1d or 2d rake
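
!----------------------------------------------------------------------
! Worked example of the priority scheme above (illustrative numbers):
! with maxWork = 10, a block in column i = 3 with work 4 gets
! x-priority (10+1)*(nblocks_x+3) - 4, while a same-work block in
! column i = 2 scores lower, so the rake moves the eastern-most
! candidate first.  Zero-priority (land) blocks are never moved.
!----------------------------------------------------------------------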
!----------------------------------------------------------------------
!
!  create new distribution with info extracted from the temporary
!  distribution
!
!----------------------------------------------------------------------

   newDistrb%nprocs       = nprocs
   newDistrb%communicator = dist%communicator

   allocate(newDistrb%blockLocation(nblocks_tot), &
            newDistrb%blockLocalID(nblocks_tot), stat=istat)

   allocate(procTmp(nprocs), stat=istat)

   procTmp = 0
   do n=1,nblocks_tot
      pid = dist%blockLocation(n)  ! processor id
      newDistrb%blockLocation(n) = pid

      if (pid > 0) then
         procTmp(pid) = procTmp(pid) + 1
         newDistrb%blockLocalID (n) = procTmp(pid)
      else
         newDistrb%blockLocalID (n) = 0
      endif
   end do

   newDistrb%numLocalBlocks = procTmp(my_task+1)

   if (minval(procTmp) < 1) then
      call abort_ice( &
         'create_distrb_rake: processors left with no blocks')
      return
   endif

   deallocate(procTmp, stat=istat)

   if (istat > 0) then
      call abort_ice( &
         'create_distrb_rake: error deallocating procTmp')
      return
   endif

   allocate(newDistrb%blockGlobalID(newDistrb%numLocalBlocks), &
            stat=istat)

   if (istat > 0) then
      call abort_ice( &
         'create_distrb_rake: error allocating blockGlobalID')
      return
   endif

   localBlock = 0
   do n=1,nblocks_tot
      if (newDistrb%blockLocation(n) == my_task+1) then
         localBlock = localBlock + 1
         newDistrb%blockGlobalID(localBlock) = n
      endif
   end do

!----------------------------------------------------------------------

   call ice_distributionDestroy(dist)

!----------------------------------------------------------------------

 end function create_distrb_rake

!***********************************************************************

 function create_distrb_roundrobin(nprocs, workPerBlock) result(newDistrb)

! This function creates a distribution of blocks across processors
! using a simple roundrobin algorithm.  Meant for prescribed ice or
! standalone CAM mode.

   integer (int_kind), intent(in) :: &
      nprocs                ! number of processors in this distribution

   integer (int_kind), dimension(:), intent(in) :: &
      workPerBlock          ! amount of work per block

   type (distrb) :: &
      newDistrb             ! resulting structure describing round-robin
                            !  distribution of blocks

!----------------------------------------------------------------------
!
!  local variables
!
!----------------------------------------------------------------------

   integer (int_kind) :: &
      i, j,                 &! dummy loop indices
      istat,                &! status flag for allocation
      processor,            &! processor position in cartesian decomp
      globalID,             &! global block ID
      localID                ! block location on this processor

   integer (int_kind), dimension(:), allocatable :: &
      proc_tmp               ! temp processor id

!----------------------------------------------------------------------
!
!  create communicator for this distribution
!
!----------------------------------------------------------------------

   call create_communicator(newDistrb%communicator, nprocs)

!----------------------------------------------------------------------
!
!  try to find best processor arrangement
!
!----------------------------------------------------------------------

   newDistrb%nprocs = nprocs

!----------------------------------------------------------------------
!
!  allocate space for decomposition
!
!----------------------------------------------------------------------

   allocate (newDistrb%blockLocation(nblocks_tot), &
             newDistrb%blockLocalID (nblocks_tot), stat=istat)

   allocate (newDistrb%blockCnt(nprocs))

!----------------------------------------------------------------------
!
!  distribute blocks across processors, one block per proc until used
!
!----------------------------------------------------------------------
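
! Worked example (illustrative): with 4 processors and 10 ocean
! blocks encountered in global order, the assignment cycles
! 1,2,3,4,1,2,3,4,1,2, so procs 1-2 receive 3 blocks and procs 3-4
! receive 2.  Land blocks are skipped and do not advance the cycle.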
   allocate(proc_tmp(nprocs))
   processor = 0
   globalID = 0
   proc_tmp = 0

   allocate(newDistrb%blockIndex(nprocs,max_blocks))
   newDistrb%blockIndex(:,:) = 0

   do j=1,nblocks_y
   do i=1,nblocks_x

      globalID = globalID + 1

      if (workPerBlock(globalID) /= 0) then
         processor = mod(processor,nprocs) + 1
         proc_tmp(processor) = proc_tmp(processor) + 1
         localID = proc_tmp(processor)
         if (localID > max_blocks) then
            call abort_ice('create_distrb_roundrobin: max_blocks too small')
            return
         endif
         newDistrb%blockLocation(globalID) = processor
         newDistrb%blockLocalID (globalID) = localID
         newDistrb%blockIndex(processor,localID) = globalID
      else  ! no work - eliminate block from distribution
         newDistrb%blockLocation(globalID) = 0
         newDistrb%blockLocalID (globalID) = 0
      endif

   end do
   end do

   newDistrb%numLocalBlocks = proc_tmp(my_task+1)
   newDistrb%blockCnt(:) = proc_tmp(:)
   deallocate(proc_tmp)

!  write(nu_diag,*) 'my_task,newDistrb%numLocalBlocks',&
!     my_task,newDistrb%numLocalBlocks

!----------------------------------------------------------------------
!
!  now store the local info
!
!----------------------------------------------------------------------

   globalID = 0

   if (newDistrb%numLocalBlocks > 0) then
      allocate (newDistrb%blockGlobalID(newDistrb%numLocalBlocks), &
                stat=istat)

      processor = my_task + 1
      do localID = 1,newDistrb%numLocalBlocks
         newDistrb%blockGlobalID (localID) = newDistrb%blockIndex(processor,&
                                             localID)
      enddo
   endif

!----------------------------------------------------------------------

 end function create_distrb_roundrobin

!***********************************************************************

 function create_distrb_sectrobin(nprocs, workPerBlock) result(newDistrb)

! This function creates a distribution of blocks across processors
! using a simple sectrobin algorithm.  Meant for prescribed ice or
! standalone CAM mode.

   integer (int_kind), intent(in) :: &
      nprocs                ! number of processors in this distribution

   integer (int_kind), dimension(:), intent(in) :: &
      workPerBlock          ! amount of work per block

   type (distrb) :: &
      newDistrb             ! resulting structure describing sector
                            !  round-robin distribution of blocks

!----------------------------------------------------------------------
!
!  local variables
!
!----------------------------------------------------------------------

   integer (int_kind) :: &
      i, j,                 &! dummy loop indices
      istat,                &! status flag for allocation
      mblocks,              &! estimate of max blocks per pe
      processor,            &! processor position in cartesian decomp
      globalID,             &! global block ID
      localID                ! block location on this processor

   integer (int_kind), dimension(:), allocatable :: &
      proc_tmp               ! temp processor id

   logical (log_kind), dimension(:), allocatable :: &
      bfree                  ! map of assigned blocks

   integer (int_kind) :: cnt, blktogether, i2
   integer (int_kind) :: totblocks, nchunks
   logical (log_kind) :: keepgoing

!----------------------------------------------------------------------
!
!  create communicator for this distribution
!
!----------------------------------------------------------------------

   call create_communicator(newDistrb%communicator, nprocs)

!----------------------------------------------------------------------
!
!  try to find best processor arrangement
!
!----------------------------------------------------------------------

   newDistrb%nprocs = nprocs
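
!----------------------------------------------------------------------
! Strategy overview (summarizing the three passes below): ocean blocks
! are handed out in contiguous groups of roughly totblocks/(6*nprocs).
! A southern sweep assigns low-numbered processors from the south
! boundary, a northern sweep assigns high-numbered processors from the
! north boundary, and a final central sweep distributes the remainder,
! so each processor tends to receive blocks from both hemispheres.
!----------------------------------------------------------------------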
!----------------------------------------------------------------------
!
!  allocate space for decomposition
!
!----------------------------------------------------------------------

   allocate (newDistrb%blockLocation(nblocks_tot), &
             newDistrb%blockLocalID (nblocks_tot), stat=istat)

   allocate (newDistrb%blockCnt(nprocs))

!----------------------------------------------------------------------
!
!  distribute groups of blocks across processors, one per proc until used
!
!----------------------------------------------------------------------

   allocate(proc_tmp(nprocs))
   processor = 0
   globalID = 0
   proc_tmp = 0

   allocate(newDistrb%blockIndex(nprocs,max_blocks))
   newDistrb%blockIndex(:,:) = 0

   allocate(bfree(nblocks_x*nblocks_y))
   bfree=.true.

   totblocks = 0
   do j=1,nblocks_y
   do i=1,nblocks_x
      globalID = (j-1)*nblocks_x + i
      if (workPerBlock(globalID) /= 0) then
         totblocks=totblocks+1
      else  ! no work - eliminate block from distribution
         bfree(globalID) = .false.
         newDistrb%blockLocation(globalID) = 0
         newDistrb%blockLocalID (globalID) = 0
      endif
   enddo
   enddo

   mblocks = totblocks/nprocs
   if (mod(totblocks,nprocs) > 0) mblocks=mblocks+1

   blktogether = max(1,nint(float(totblocks)/float(6*nprocs)))

!   write(nu_diag,*) 'ice_distrb_sectrobin totblocks = ',totblocks,nblocks_y*nblocks_x

   !------------------------------
   ! southern group of blocks
   !   weave back and forth in i vs j
   !   go south to north, low - high pes
   !------------------------------

   processor=1
   cnt = 0
   keepgoing = .true.
   do j=1,nblocks_y
   do i=1,nblocks_x
      if (mod(j,2) == 0) then
         i2 = nblocks_x - i + 1
      else
         i2 = i
      endif
      globalID = (j-1)*nblocks_x + i2
      if (cnt >= blktogether) then
         processor = mod(processor,nprocs) + 1
         cnt = 0
         if (processor == 1) keepgoing = .false.
      endif
!      write(nu_diag,'(a,6i7,l2)') 'tcx ',i,j,globalID,cnt,blktogether,processor,keepgoing

      if (keepgoing) then
         if (bfree(globalID)) then
            if (workPerBlock(globalID) /= 0) then
               proc_tmp(processor) = proc_tmp(processor) + 1
               localID = proc_tmp(processor)
               if (localID > max_blocks) then
                  call abort_ice('create_distrb_sectrobin: max_blocks too small')
                  return
               endif
               newDistrb%blockLocation(globalID) = processor
               newDistrb%blockLocalID (globalID) = localID
               newDistrb%blockIndex(processor,localID) = globalID
               cnt = cnt + 1
               totblocks = totblocks-1
               bfree(globalID) = .false.

            else  ! no work - eliminate block from distribution
               bfree(globalID) = .false.
               newDistrb%blockLocation(globalID) = 0
               newDistrb%blockLocalID (globalID) = 0
            endif
         endif  ! bfree
      endif
   end do
   end do

!   write(nu_diag,*) 'ice_distrb_sectrobin totblocks left after southern = ',totblocks

   !------------------------------
   ! northern group of blocks
   !   weave back and forth in i vs j
   !   go north to south, high - low pes
   !------------------------------

   processor=nprocs
   cnt = 0
   keepgoing = .true.
   do j=nblocks_y,1,-1
   do i=1,nblocks_x
      if (mod(j,2) == 1) then
         i2 = nblocks_x - i + 1
      else
         i2 = i
      endif
      globalID = (j-1)*nblocks_x + i2
      if (cnt >= blktogether) then
         processor = mod(processor+nprocs-2,nprocs) + 1
         cnt = 0
         if (processor == nprocs) keepgoing = .false.
      endif

      if (keepgoing) then
         if (bfree(globalID)) then
            if (workPerBlock(globalID) /= 0) then
               proc_tmp(processor) = proc_tmp(processor) + 1
               localID = proc_tmp(processor)
               if (localID > max_blocks) then
                  call abort_ice('create_distrb_sectrobin: max_blocks too small')
                  return
               endif
               newDistrb%blockLocation(globalID) = processor
               newDistrb%blockLocalID (globalID) = localID
               newDistrb%blockIndex(processor,localID) = globalID
               cnt = cnt + 1
               totblocks = totblocks - 1
               bfree(globalID) = .false.

            else  ! no work - eliminate block from distribution
               bfree(globalID) = .false.
               newDistrb%blockLocation(globalID) = 0
               newDistrb%blockLocalID (globalID) = 0
            endif
         endif  ! bfree
      endif
   end do
   end do

!   write(nu_diag,*) 'ice_distrb_sectrobin totblocks left after northern = ',totblocks

   !------------------------------
   ! central group of blocks
   !   weave back and forth in i vs j
   !   go north to south, low - high / low - high pes
   !------------------------------

   nchunks = 2*nprocs
   blktogether = max(1,nint(float(totblocks)/float(nchunks)))
   processor=1
   cnt = 0
   do j=nblocks_y,1,-1
   do i=1,nblocks_x
      if (mod(j,2) == 1) then
         i2 = nblocks_x - i + 1
      else
         i2 = i
      endif
      globalID = (j-1)*nblocks_x + i2
      if (totblocks > 0) then
         do while (proc_tmp(processor) >= mblocks .or. cnt >= blktogether)
            nchunks = nchunks - 1
            if (nchunks == 0) then
               blktogether = 1
            else
               blktogether = max(1,nint(float(totblocks)/float(nchunks)))
            endif
            cnt = 0
            processor = mod(processor,nprocs) + 1
         enddo
      endif

!      write(nu_diag,*) 'ice_distrb_sectrobin central ',i,j,totblocks,cnt,nchunks,blktogether,processor

      if (bfree(globalID)) then
         if (workPerBlock(globalID) /= 0) then
            proc_tmp(processor) = proc_tmp(processor) + 1
            localID = proc_tmp(processor)
            if (localID > max_blocks) then
               call abort_ice('create_distrb_sectrobin: max_blocks too small')
               return
            endif
            newDistrb%blockLocation(globalID) = processor
            newDistrb%blockLocalID (globalID) = localID
            newDistrb%blockIndex(processor,localID) = globalID
            cnt = cnt + 1
            totblocks = totblocks-1
            bfree(globalID) = .false.

         else  ! no work - eliminate block from distribution
            bfree(globalID) = .false.
            newDistrb%blockLocation(globalID) = 0
            newDistrb%blockLocalID (globalID) = 0
         endif
      endif  ! bfree
   end do
   end do

   newDistrb%numLocalBlocks = proc_tmp(my_task+1)
   newDistrb%blockCnt(:) = proc_tmp(:)
   deallocate(proc_tmp)
   deallocate(bfree)

!----------------------------------------------------------------------
!
!  now store the local info
!
!----------------------------------------------------------------------

   globalID = 0

   if (newDistrb%numLocalBlocks > 0) then
      allocate (newDistrb%blockGlobalID(newDistrb%numLocalBlocks), &
                stat=istat)

      processor = my_task + 1
      do localID = 1,newDistrb%numLocalBlocks
         newDistrb%blockGlobalID (localID) = newDistrb%blockIndex(processor,&
                                             localID)
      enddo
   endif

!----------------------------------------------------------------------

 end function create_distrb_sectrobin

!***********************************************************************

 function create_distrb_sectcart(nprocs, workPerBlock) result(newDistrb)

! This function creates a distribution of blocks across processors
! using a simple sectcart algorithm.  Meant for prescribed ice or
! standalone CAM mode.

   integer (int_kind), intent(in) :: &
      nprocs                ! number of processors in this distribution

   integer (int_kind), dimension(:), intent(in) :: &
      workPerBlock          ! amount of work per block

   type (distrb) :: &
      newDistrb             ! resulting structure describing sector
                            !  Cartesian distribution of blocks

!----------------------------------------------------------------------
!
!  local variables
!
!----------------------------------------------------------------------

   integer (int_kind) :: &
      i, j, i2, j2,         &! dummy loop indices
      istat,                &! status flag for allocation
      processor,            &! processor position in cartesian decomp
      globalID,             &! global block ID
      localID,              &! block location on this processor
      blktogether,          &! number of blocks together
      cnt                    ! counter

   integer (int_kind), dimension(:), allocatable :: &
      proc_tmp               ! temp processor id

   integer (int_kind) :: n
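
!----------------------------------------------------------------------
! Overview (summarizing the two phases below, with illustrative
! numbers): the domain is split into left and right halves and blocks
! are handed out in chunks of roughly nblocks_x*nblocks_y/(4*nprocs).
! For an 8x8 block grid on 4 processors the chunk size is 4, so each
! processor receives contiguous groups from both halves of the domain.
!----------------------------------------------------------------------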
!----------------------------------------------------------------------
!
!  create communicator for this distribution
!
!----------------------------------------------------------------------

   call create_communicator(newDistrb%communicator, nprocs)

!----------------------------------------------------------------------
!
!  try to find best processor arrangement
!
!----------------------------------------------------------------------

   newDistrb%nprocs = nprocs

!----------------------------------------------------------------------
!
!  allocate space for decomposition
!
!----------------------------------------------------------------------

   allocate (newDistrb%blockLocation(nblocks_tot), &
             newDistrb%blockLocalID (nblocks_tot), stat=istat)

   allocate (newDistrb%blockCnt(nprocs))

!----------------------------------------------------------------------
!
!  distribute blocks linearly across processors in quadrants
!
!----------------------------------------------------------------------

   allocate(proc_tmp(nprocs))
   proc_tmp = 0

   allocate(newDistrb%blockIndex(nprocs,max_blocks))
   newDistrb%blockIndex(:,:) = 0

   blktogether = max(1,nint(float(nblocks_x*nblocks_y)/float(4*nprocs)))

   ! --- two phases, reset processor and cnt for each phase
   ! --- phase 1 is south to north, west to east on the left half of the domain
   ! --- phase 2 is north to south, west to east on the right half of the domain

   if (mod(nblocks_x,2) /= 0) then
      call abort_ice( &
         'create_distrb_sectcart: nblocks_x not divisible by 2')
      return
   endif

   do n=1,2
   processor = 1
   cnt = 0
   do j2=1,nblocks_y
   do i2=1,nblocks_x/2

      if (n == 1) then
         i = i2
         j = j2
      else
         i = nblocks_x/2 + i2
         j = nblocks_y - j2 + 1
      endif
      globalID = (j-1)*nblocks_x + i

      if (cnt >= blktogether) then
         processor = mod(processor,nprocs) + 1
         cnt = 0
      endif
      cnt = cnt + 1

      if (workPerBlock(globalID) /= 0) then
         proc_tmp(processor) = proc_tmp(processor) + 1
         localID = proc_tmp(processor)
         if (localID > max_blocks) then
            call abort_ice('create_distrb_sectcart: max_blocks too small')
            return
         endif
         newDistrb%blockLocation(globalID) = processor
         newDistrb%blockLocalID (globalID) = localID
         newDistrb%blockIndex(processor,localID) = globalID
      else  ! no work - eliminate block from distribution
         newDistrb%blockLocation(globalID) = 0
         newDistrb%blockLocalID (globalID) = 0
      endif

   end do
   end do
   end do

   newDistrb%numLocalBlocks = proc_tmp(my_task+1)
   newDistrb%blockCnt(:) = proc_tmp(:)
   deallocate(proc_tmp)

!  write(nu_diag,*) 'my_task,newDistrb%numLocalBlocks',&
!     my_task,newDistrb%numLocalBlocks

!----------------------------------------------------------------------
!
!  now store the local info
!
!----------------------------------------------------------------------

   globalID = 0

   if (newDistrb%numLocalBlocks > 0) then
      allocate (newDistrb%blockGlobalID(newDistrb%numLocalBlocks), &
                stat=istat)

      processor = my_task + 1
      do localID = 1,newDistrb%numLocalBlocks
         newDistrb%blockGlobalID (localID) = newDistrb%blockIndex(processor,&
                                             localID)
      enddo
   endif

!----------------------------------------------------------------------

 end function create_distrb_sectcart

!**********************************************************************
 function create_distrb_spacecurve(nprocs,work_per_block)

! This function distributes blocks across processors in a
! load-balanced manner using space-filling curves
! added by J. Dennis 3/10/06

   use ice_spacecurve

   integer (int_kind), intent(in) :: &
      nprocs                ! number of processors in this distribution

   integer (int_kind), dimension(:), intent(in) :: &
      work_per_block        ! amount of work per block

   type (distrb) :: &
      create_distrb_spacecurve  ! resulting structure describing
                                ! load-balanced distribution of blocks

!----------------------------------------------------------------------
!
!  local variables
!
!----------------------------------------------------------------------

   integer (int_kind) :: &
      i,j,n              ,&! dummy loop indices
      pid                ,&! dummy for processor id
      localID              ! local block position on processor

   integer (int_kind), dimension(:),allocatable :: &
      idxT_i,idxT_j        ! Temporary indices for SFC

   integer (int_kind), dimension(:,:),allocatable :: &
      Mesh               ,&! arrays to hold Space-filling curve
      Mesh2              ,&!
      Mesh3                !

   integer (int_kind) :: &
      nblocksL,nblocks,  &! Number of blocks local and total
      ii,extra,tmp1,     &! loop temporaries used for
      s1,ig               ! partitioning curve

   logical, parameter :: Debug = .FALSE.

   type (factor_t) :: xdim,ydim

   integer (int_kind) :: it,jj,i2,j2
   integer (int_kind) :: curveSize,sb_x,sb_y,itmp,numfac
   integer (int_kind) :: subNum, sfcNum
   logical :: foundx

   integer (int_kind), dimension(:), allocatable :: &
      proc_tmp             ! temp processor id for rake algorithm

   type (distrb) :: dist  ! temporary distribution to hold result

   !------------------------------------------------------
   ! Space filling curves only work if:
   !
   !    nblocks_x = nblocks_y
   !    nblocks_x = 2^m 3^n 5^p where m,n,p are integers
   !------------------------------------------------------

   if((.not. IsFactorable(nblocks_y)) .or. (.not. IsFactorable(nblocks_x))) then
      create_distrb_spacecurve = create_distrb_cart(nprocs, work_per_block)
      return
   endif

   !-----------------------------------------------
   ! Factor the numbers of blocks in each dimension
   !-----------------------------------------------

   xdim = Factor(nblocks_x)
   ydim = Factor(nblocks_y)
   numfac = xdim%numfact

   !---------------------------------------------
   ! Match the common factors to create SFC curve
   !---------------------------------------------

   curveSize=1
   do it=1,numfac
      call MatchFactor(xdim,ydim,itmp,foundX)
      curveSize = itmp*curveSize
   enddo

   !--------------------------------------
   ! determine the size of the sub-blocks
   ! within the space-filling curve
   !--------------------------------------

   sb_x = ProdFactor(xdim)
   sb_y = ProdFactor(ydim)

   call create_communicator(dist%communicator, nprocs)

   dist%nprocs = nprocs

!----------------------------------------------------------------------
!
!  allocate space for decomposition
!
!----------------------------------------------------------------------

   allocate (dist%blockLocation(nblocks_tot), &
             dist%blockLocalID (nblocks_tot))

   dist%blockLocation=0
   dist%blockLocalID =0

!----------------------------------------------------------------------
!  Create the array to hold the SFC and indices into it
!----------------------------------------------------------------------

   allocate(Mesh(curveSize,curveSize))
   allocate(Mesh2(nblocks_x,nblocks_y))
   allocate(Mesh3(nblocks_x,nblocks_y))
   allocate(idxT_i(nblocks_tot),idxT_j(nblocks_tot))

   Mesh  = 0
   Mesh2 = 0
   Mesh3 = 0

!----------------------------------------------------------------------
!  Generate the space-filling curve
!----------------------------------------------------------------------

   call GenSpaceCurve(Mesh)
   Mesh = Mesh + 1 ! make it 1-based indexing
   if(Debug) then
      if(my_task ==0) call PrintCurve(Mesh)
   endif
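
   !-----------------------------------------------
   ! Worked example of the reindexing that follows (illustrative
   ! numbers): for nblocks_x = nblocks_y = 12 = 2^2*3, all factors
   ! match, giving curveSize = 12 and sb_x = sb_y = 1, so Mesh2 is the
   ! curve itself.  If instead nblocks_x = 12 and nblocks_y = 6, the
   ! curve is 6x6 with 2x1 sub-blocks (sb_x = 2, sb_y = 1), and each
   ! curve cell is expanded into its sub-blocks below.
   !-----------------------------------------------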
   !-----------------------------------------------
   ! Reindex the SFC to address internal sub-blocks
   !-----------------------------------------------

   do j=1,curveSize
   do i=1,curveSize
      sfcNum = (Mesh(i,j) - 1)*(sb_x*sb_y) + 1
      do jj=1,sb_y
      do ii=1,sb_x
         subNum = (jj-1)*sb_x + (ii-1)
         i2 = (i-1)*sb_x + ii
         j2 = (j-1)*sb_y + jj
         Mesh2(i2,j2) = sfcNum + subNum
      enddo
      enddo
   enddo
   enddo

   !------------------------------------------------
   ! create a linear array of i,j coordinates of SFC
   !------------------------------------------------

   idxT_i=0;idxT_j=0
   do j=1,nblocks_y
   do i=1,nblocks_x
      n = (j-1)*nblocks_x + i
      ig = Mesh2(i,j)
      if(work_per_block(n) /= 0) then
         idxT_i(ig)=i;idxT_j(ig)=j
      endif
   enddo
   enddo

   !-----------------------------
   ! Compress out the land blocks
   !-----------------------------

   ii=0
   do i=1,nblocks_tot
      if(IdxT_i(i) .gt. 0) then
         ii=ii+1
         Mesh3(idxT_i(i),idxT_j(i)) = ii
      endif
   enddo
   nblocks=ii
   if(Debug) then
      if(my_task==0) call PrintCurve(Mesh3)
   endif

   !----------------------------------------------------
   ! Compute the partitioning of the space-filling curve
   !----------------------------------------------------

   nblocksL = nblocks/nprocs
   ! every cpu gets nblocksL blocks, but the first 'extra' get nblocksL+1
   extra = mod(nblocks,nprocs)
   s1 = extra*(nblocksL+1)
   ! split curve into two curves:
   ! 1 ... s1  s2 ... nblocks
   !
   !  s1 = extra*(nblocksL+1)  (could be 0)
   !  s2 = s1+1
   !
   ! First  region gets nblocksL+1 blocks per partition
   ! Second region gets nblocksL blocks per partition

   if(Debug) print *,'nprocs,extra,nblocks,nblocksL,s1: ', &
                      nprocs,extra,nblocks,nblocksL,s1

   !-----------------------------------------------------------
   ! Use the SFC to partition the blocks across processors
   !-----------------------------------------------------------

   do j=1,nblocks_y
   do i=1,nblocks_x
      n = (j-1)*nblocks_x + i
      ii = Mesh3(i,j)
      if(ii>0) then
         if(ii<=s1) then
            ! ------------------------------------
            ! If on the first region of curve
            ! all processes get nblocksL+1 blocks
            ! ------------------------------------
            ii=ii-1
            tmp1 = ii/(nblocksL+1)
            dist%blockLocation(n) = tmp1+1
         else
            ! ------------------------------------
            ! If on the second region of curve
            ! all processes get nblocksL blocks
            ! ------------------------------------
            ii=ii-s1-1
            tmp1 = ii/nblocksL
            dist%blockLocation(n) = extra + tmp1 + 1
         endif
      endif
   enddo
   enddo

!----------------------------------------------------------------------
!  Reset the dist data structure
!----------------------------------------------------------------------

   allocate(proc_tmp(nprocs))
   proc_tmp = 0

   do n=1,nblocks_tot
      pid = dist%blockLocation(n)
      !!!dist%blockLocation(n) = pid
      if(pid>0) then
         proc_tmp(pid) = proc_tmp(pid) + 1
         dist%blockLocalID(n) = proc_tmp(pid)
      else
         dist%blockLocalID(n) = 0
      endif
   enddo

   dist%numLocalBlocks = proc_tmp(my_task+1)

   if (dist%numLocalBlocks > 0) then
      allocate (dist%blockGlobalID(dist%numLocalBlocks))
      dist%blockGlobalID = 0
   endif
   localID = 0
   do n=1,nblocks_tot
      if (dist%blockLocation(n) == my_task+1) then
         localID = localID + 1
         dist%blockGlobalID(localID) = n
      endif
   enddo

   if(Debug) then
      if(my_task==0) print *,'dist%blockLocation:= ',dist%blockLocation
      print *,'IAM: ',my_task,' SpaceCurve: Number of blocks {total,local} :=', &
               nblocks_tot,nblocks,proc_tmp(my_task+1)
   endif

   !---------------------------------
   ! Deallocate temporary arrays
   !---------------------------------

   deallocate(proc_tmp)
   deallocate(Mesh,Mesh2,Mesh3)
   deallocate(idxT_i,idxT_j)

   create_distrb_spacecurve = dist  ! return the result

!----------------------------------------------------------------------

 end function create_distrb_spacecurve

!**********************************************************************
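
!----------------------------------------------------------------------
! Worked example of the rake pass implemented below (illustrative
! numbers): for procWork = (7,3,2) with meanWork = sum/nprocs + 1 = 5,
! the first processor is 2 units over the threshold, so blocks worth
! roughly 2 units of work are pushed to processor 2; the threshold is
! then raised by one and the pass repeats until no transfers occur.
!----------------------------------------------------------------------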
 subroutine ice_distributionRake (procWork, procID, blockWork, &
                                  priority, distribution)

! This subroutine performs a rake algorithm to distribute the work
! along a vector of processors.  In the rake algorithm, a work
! threshold is first set.  Then, moving from left to right, work
! above that threshold is raked to the next processor in line.
! The process continues until the end of the vector is reached
! and then the threshold is reduced by one for a second rake pass.
! In this implementation, a priority for moving blocks is defined
! such that the rake algorithm chooses the highest priority
! block to be moved to the next processor.  This can be used
! for example to always choose the eastern-most block or to
! ensure a block does not stray too far from its neighbors.

   integer (int_kind), intent(in), dimension(:) :: &
      blockWork          ,&! amount of work per block
      procID               ! global processor number

   integer (int_kind), intent(inout), dimension(:) :: &
      procWork           ,&! amount of work per processor
      priority             ! priority for moving a given block

   type (distrb), intent(inout) :: &
      distribution         ! distribution to change

!----------------------------------------------------------------------
!
!  local variables
!
!----------------------------------------------------------------------

   integer (int_kind) :: &
      i, n,                 &! dummy loop indices
      np1,                  &! n+1 corrected for cyclical wrap
      iproc, inext,         &! processor ids for current and next
      nprocs, numBlocks,    &! number of processors, blocks
      lastPriority,         &! priority for most recent block
      minPriority,          &! minimum priority
      lastLoc,              &! location for most recent block
      meanWork, maxWork,    &! mean,max work per processor
      diffWork, residual,   &! work differences and residual work
      numTransfers           ! counter for number of block transfers

!----------------------------------------------------------------------
!
!  initialization
!
!----------------------------------------------------------------------

   nprocs    = size(procWork)
   numBlocks = size(blockWork)

   !*** compute mean,max work per processor

   meanWork = sum(procWork)/nprocs + 1
   maxWork  = maxval(procWork)
   residual = mod(meanWork,nprocs)

   minPriority = 1000000
   do n=1,nprocs
      iproc = procID(n)
      do i=1,numBlocks
         if (distribution%blockLocation(i) == iproc) then
            minPriority = min(minPriority,priority(i))
         endif
      end do
   end do

!----------------------------------------------------------------------
!
!  do two sets of transfers
!
!----------------------------------------------------------------------

   transferLoop: do

!----------------------------------------------------------------------
!
!     do rake across the processors
!
!----------------------------------------------------------------------

      numTransfers = 0
      do n=1,nprocs
         if (n < nprocs) then
            np1 = n+1
         else
            np1 = 1
         endif
         iproc = procID(n)
         inext = procID(np1)

         if (procWork(n) > meanWork) then !*** pass work to next

            diffWork = procWork(n) - meanWork

            rake1: do while (diffWork > 1)

               !*** attempt to find a block with the required
               !*** amount of work and with the highest priority
               !*** for moving (e.g. boundary blocks first)

               lastPriority = 0
               lastLoc = 0

               do i=1,numBlocks
                  if (distribution%blockLocation(i) == iproc) then
                     if (priority(i) > lastPriority ) then
                        lastPriority = priority(i)
                        lastLoc = i
                     endif
                  endif
               end do
               if (lastLoc == 0) exit rake1  ! could not shift work
               numTransfers = numTransfers + 1
               distribution%blockLocation(lastLoc) = inext
               if (np1 == 1) priority(lastLoc) = minPriority
               diffWork = diffWork - blockWork(lastLoc)

               procWork(n  ) = procWork(n  )-blockWork(lastLoc)
               procWork(np1) = procWork(np1)+blockWork(lastLoc)

            end do rake1
         endif

      end do

!----------------------------------------------------------------------
!
!     increment meanWork by one and repeat
!
!----------------------------------------------------------------------

      meanWork = meanWork + 1
      if (numTransfers == 0 .or. meanWork > maxWork) exit transferLoop

   end do transferLoop

!----------------------------------------------------------------------

 end subroutine ice_distributionRake

!***********************************************************************

 end module ice_distribution

!|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||