// make_local_matrix.cpp
//@HEADER
// ************************************************************************
//
// HPCCG: Simple Conjugate Gradient Benchmark Code
// Copyright (2006) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// This library is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as
// published by the Free Software Foundation; either version 2.1 of the
// License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
// USA
// Questions? Contact Michael A. Heroux ([email protected])
//
// ************************************************************************
//@HEADER
#include <iostream>
using std::cout;
using std::cerr;
using std::endl;
#include <map>
#include <algorithm>
#include <cstdlib>
#include <cstdio>
#include <cassert>
#include "HPC_Sparse_Matrix.hpp"
#include "read_HPC_row.hpp"
#include "make_local_matrix.hpp"
#include "mytimer.hpp"
//#define DEBUG
#ifdef DEBUG
static int debug = 1;
#else
static int debug = 0;
#endif
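// Purpose of this file: convert the globally indexed matrix rows owned by this
// process into a local index space and build the communication pattern
// (neighbors, message lengths, elements_to_send, send_buffer) that the sparse
// matrix-vector multiply later uses to exchange external ("ghost") values.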
#if USING_MPI || USING_CHARM
void identify_externals(HPC_Sparse_Matrix* A, std::map< int, int > &externals) {
///////////////////////////////////////////
// Scan the indices and transform to local
///////////////////////////////////////////
double t0;
if (debug) t0 = mytimer();
A->external_index = new int[max_external];
A->num_external = 0;
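// Pass over every stored column index: indices inside [start_row, stop_row]
// are rewritten as local offsets, while off-process indices are recorded in
// external_index and temporarily encoded as -(global_index + 1) so that
// make_local_matrix can recognize and renumber them later.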
for (int i = 0; i < A->local_nrow; i++)
{
for (int j = 0; j < A->nnz_in_row[i]; j++)
{
int cur_ind = A->ptr_to_inds_in_row[i][j];
if (A->start_row <= cur_ind && cur_ind <= A->stop_row)
{
A->ptr_to_inds_in_row[i][j] -= A->start_row;
}
else // Must find out if we have already set up this point
{
if (externals.find(cur_ind) == externals.end())
{
externals[cur_ind] = A->num_external++;
if (A->num_external <= max_external)
{
A->external_index[A->num_external-1] = cur_ind;
// Mark index as external by adding 1 and negating it
A->ptr_to_inds_in_row[i][j] = - (A->ptr_to_inds_in_row[i][j] + 1);
}
else
{
cerr << "Must increase max_external in HPC_Sparse_Matrix.hpp"
<<endl;
abort();
}
}
else
{
// Mark index as external by adding 1 and negating it
A->ptr_to_inds_in_row[i][j] = - (A->ptr_to_inds_in_row[i][j] + 1);
}
}
}
}
if (debug) {
t0 = mytimer() - t0;
cout << " Time in transform to local phase = " << t0 << endl;
}
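// The local column space spans the owned rows plus one ghost column per
// distinct external index.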
A->local_ncol = A->local_nrow + A->num_external;
}
#endif
#if USING_MPI // Compile this routine only if running in parallel
void make_local_matrix(HPC_Sparse_Matrix * A)
{
int i, j, k;
double t0;
int debug_details = 0; // Set to 1 for voluminous output
// Get MPI process info
int size, rank; // Number of MPI processes, My process ID
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
// Extract Matrix pieces
int start_row = A->start_row;
int stop_row = A->stop_row;
int total_nrow = A->total_nrow;
long long total_nnz = A->total_nnz;
int local_nrow = A->local_nrow;
int local_nnz = A->local_nnz;
int * nnz_in_row = A->nnz_in_row;
double ** ptr_to_vals_in_row = A->ptr_to_vals_in_row;
int ** ptr_to_inds_in_row = A->ptr_to_inds_in_row;
// We need to convert the index values for the rows on this processor
// to a local index space. We need to:
// - Determine whether each index refers to a local value or an external value
// - If local, subtract start_row from index value to get local index
// - If external, find out if it is already accounted for.
// - If so, then do nothing,
// - otherwise
// - add it to the list of external indices,
// - find out which processor owns the value.
// - Set up communication for sparse MV operation.
std::map< int, int > externals;
identify_externals(A, externals);
////////////////////////////////////////////////////////////////////////////
// Go through list of externals to find out which processors must be accessed.
////////////////////////////////////////////////////////////////////////////
if (debug) t0 = mytimer();
int num_external = A->num_external;
A->external_local_index = new int[max_external];
int* external_index = A->external_index;
int* external_local_index = A->external_local_index;
int * tmp_buffer = new int[size]; // Temp buffer space needed below
// Build list of global index offset
int * global_index_offsets = new int[size];
// This call sends the start_row of processor i to the ith
// entry of global_index_offsets on all processors.
// Thus, each processor knows the range of indices owned by all
// other processors.
// Note: There might be a better algorithm for doing this, but this
// will work...
MPI_Allgather(&start_row, 1, MPI_INT, global_index_offsets, 1, MPI_INT, MPI_COMM_WORLD);
// Go through list of externals and find the processor that owns each
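// Ownership is found by scanning global_index_offsets from the highest rank
// down and taking the first (i.e., largest) offset that is <= the index.
// Worked example (illustrative numbers): with size = 4 and
// global_index_offsets = {0, 25, 50, 75}, a global index of 60 first satisfies
// 50 <= 60, so it is owned by rank 2.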
int * external_processor = new int[num_external];
int * new_external_processor = new int[num_external];
for (i=0; i< num_external; i++)
{
int cur_ind = external_index[i];
for (int j=size-1; j>=0; j--)
if (global_index_offsets[j] <= cur_ind)
{
external_processor[i] = j;
break;
}
}
if (debug) {
t0 = mytimer() - t0;
cout << " Time in finding processors phase = " << t0 << endl;
}
////////////////////////////////////////////////////////////////////////////
// Sift through the external elements. For each newly encountered external
// point, assign it the next index in the sequence. Then look for other
// external elements that are updated by the same node and assign them the next
// set of index numbers in the sequence (i.e., elements updated by the same node
// have consecutive indices).
////////////////////////////////////////////////////////////////////////////
if (debug) t0 = mytimer();
int count = local_nrow;
for (i = 0; i < num_external; i++) external_local_index[i] = -1;
for (i = 0; i < num_external; i++) {
if (external_local_index[i] == -1) {
external_local_index[i] = count++;
for (j = i + 1; j < num_external; j++) {
if (external_processor[j] == external_processor[i])
external_local_index[j] = count++;
}
}
}
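// After this pass, count == local_nrow + num_external, and externals owned by
// the same processor occupy a contiguous block of local indices.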
if (debug) {
t0 = mytimer() - t0;
cout << " Time in scanning external indices phase = " << t0 << endl;
}
if (debug) t0 = mytimer();
for (i=0; i< local_nrow; i++)
{
for (j=0; j<nnz_in_row[i]; j++)
{
if (ptr_to_inds_in_row[i][j]<0) // Change index values of externals
{
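// Undo the -(global + 1) encoding applied in identify_externals to recover
// the original global index, then map it to its new local index.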
int cur_ind = - ptr_to_inds_in_row[i][j] - 1;
ptr_to_inds_in_row[i][j] = external_local_index[externals[cur_ind]];
}
}
}
for (i = 0 ; i < num_external; i++) new_external_processor[i] = 0;
for (i = 0; i < num_external; i++)
new_external_processor[external_local_index[i] - local_nrow] =
external_processor[i];
if (debug) {
t0 = mytimer() - t0;
cout << " Time in assigning external indices phase = " << t0 << endl;
}
if (debug_details)
{
for (i = 0; i < num_external; i++)
{
cout << "Processor " << rank << " of " << size <<
": external processor["<< i<< "] = " << external_processor[i]<< endl;
cout << "Processor " << rank << " of " << size <<
": new external processor["<< i<< "] = "
<< new_external_processor[i]<< endl;
}
}
////////////////////////////////////////////////////////////////////////////
///
// Count the number of neighbors from which we receive information to update
// our external elements. Additionally, fill the array tmp_neighbors in the
// following way:
// tmp_neighbors[i] = 0 ==> No external elements are updated by
// processor i.
// tmp_neighbors[i] = x ==> (x-1)/size elements are updated from
// processor i.
///
////////////////////////////////////////////////////////////////////////////
t0 = mytimer();
int * tmp_neighbors = new int[size];
for (i = 0 ; i < size ; i++) tmp_neighbors[i] = 0;
int num_recv_neighbors = 0;
int length = 1;
for (i = 0; i < num_external; i++)
{
if (tmp_neighbors[new_external_processor[i]] == 0)
{
num_recv_neighbors++;
tmp_neighbors[new_external_processor[i]] = 1;
}
tmp_neighbors[new_external_processor[i]] += size;
}
/// sum over all processors all the tmp_neighbors arrays ///
MPI_Allreduce(tmp_neighbors, tmp_buffer, size, MPI_INT, MPI_SUM,
MPI_COMM_WORLD);
/// decode the combined 'tmp_neighbors' (stored in tmp_buffer)
// array from all the processors
int num_send_neighbors = tmp_buffer[rank] % size;
/// decode 'tmp_buffer[rank]' to deduce the total number of elements
// we must send
int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size;
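// Worked example (illustrative numbers): with size = 4, if three other ranks
// together need 7 of our values, each contributes 1 + 4*count to its
// tmp_neighbors[rank] entry, so after the Allreduce
// tmp_buffer[rank] = 3 + 4*7 = 31; then 31 % 4 = 3 send neighbors and
// (31 - 3) / 4 = 7 elements to send.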
//
// Check to see if we have enough workspace allocated. This could be
// dynamically modified, but let's keep it simple for now...
//
if (num_send_neighbors > max_num_messages)
{
cerr << "Must increase max_num_messages in HPC_Sparse_Matrix.hpp"
<<endl;
cerr << "Must be at least " << num_send_neighbors <<endl;
abort();
}
if (total_to_be_sent > max_external )
{
cerr << "Must increase max_external in HPC_Sparse_Matrix.hpp"
<<endl;
cerr << "Must be at least " << total_to_be_sent <<endl;
abort();
}
delete [] tmp_neighbors;
if (debug) {
t0 = mytimer() - t0;
cout << " Time in finding neighbors phase = " << t0 << endl;
}
if (debug) cout << "Processor " << rank << " of " << size <<
": Number of send neighbors = " << num_send_neighbors << endl;
if (debug) cout << "Processor " << rank << " of " << size <<
": Number of receive neighbors = " << num_recv_neighbors << endl;
if (debug) cout << "Processor " << rank << " of " << size <<
": Total number of elements to send = " << total_to_be_sent << endl;
if (debug) MPI_Barrier(MPI_COMM_WORLD);
/////////////////////////////////////////////////////////////////////////
///
// Make a list of the neighbors that will send information to update our
// external elements (in the order that we will receive this information).
///
/////////////////////////////////////////////////////////////////////////
int * recv_list = new int[max_external];
j = 0;
if (num_external > 0) // guard against a rank with no external elements
recv_list[j++] = new_external_processor[0];
for (i = 1; i < num_external; i++) {
if (new_external_processor[i - 1] != new_external_processor[i]) {
recv_list[j++] = new_external_processor[i];
}
}
//
// Send a short dummy message to each of our recv neighbors; its content is
// ignored. Only the source rank matters: it tells each neighbor that we will
// need data from it.
//
int * send_list = new int[num_send_neighbors];
for (i = 0 ; i < num_send_neighbors; i++ ) send_list[i] = 0;
//
// First post the receives; these are nonblocking (immediate) receives.
// We do not wait for the results here; that is done at the
// MPI_Wait calls below.
//
int MPI_MY_TAG = 99;
MPI_Request * request = new MPI_Request[max_num_messages];
for (i = 0; i < num_send_neighbors; i++)
{
MPI_Irecv(tmp_buffer+i, 1, MPI_INT, MPI_ANY_SOURCE, MPI_MY_TAG,
MPI_COMM_WORLD, request+i);
}
// send messages
for (i = 0; i < num_recv_neighbors; i++)
MPI_Send(tmp_buffer+i, 1, MPI_INT, recv_list[i], MPI_MY_TAG,
MPI_COMM_WORLD);
///
// Receive message from each send neighbor to construct 'send_list'.
///
MPI_Status status;
for (i = 0; i < num_send_neighbors; i++)
{
if ( MPI_Wait(request+i, &status) )
{
cerr << "MPI_Wait error\n"<<endl;
exit(-1);
}
send_list[i] = status.MPI_SOURCE;
}
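// send_list now holds, in arrival order, the ranks that need data from us:
// one entry per dummy message received above.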
/////////////////////////////////////////////////////////////////////////
///
// Compare the two lists. In most cases they should be the same.
// However, if they are not then add new entries to the recv list
// that are in the send list (but not already in the recv list).
///
/////////////////////////////////////////////////////////////////////////
for (j = 0; j < num_send_neighbors; j++)
{
int found = 0;
for (i = 0; i < num_recv_neighbors; i++)
{
if (recv_list[i] == send_list[j])
found = 1;
}
if (found == 0) {
if (debug) cout << "Processor " << rank << " of " << size <<
": recv_list[" << num_recv_neighbors <<"] = "
<< send_list[j] << endl;
recv_list[num_recv_neighbors] = send_list[j];
(num_recv_neighbors)++;
}
}
delete [] send_list;
num_send_neighbors = num_recv_neighbors;
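// From here on the send and receive neighbor sets are treated as identical,
// so the exchange pattern is symmetric.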
if (num_send_neighbors > max_num_messages)
{
cerr << "Must increase max_external in HPC_Sparse_Matrix.hpp"
<<endl;
abort();
}
/////////////////////////////////////////////////////////////////////////
/// Start filling HPC_Sparse_Matrix struct
/////////////////////////////////////////////////////////////////////////
A->total_to_be_sent = total_to_be_sent;
int * elements_to_send = new int[total_to_be_sent];
A->elements_to_send = elements_to_send;
for (i = 0 ; i < total_to_be_sent; i++ ) elements_to_send[i] = 0;
//
// Create 'new_external', which explicitly puts the external elements in the
// order given by 'external_local_index'
//
int * new_external = new int[num_external];
for (i = 0; i < num_external; i++) {
new_external[external_local_index[i] - local_nrow] = external_index[i];
}
/////////////////////////////////////////////////////////////////////////
//
// Send each processor the global index list of the external elements in the
// order that I will want to receive them when updating my external elements
//
/////////////////////////////////////////////////////////////////////////
int * lengths = new int[num_recv_neighbors];
MPI_MY_TAG++;
// First post receives
for (i = 0; i < num_recv_neighbors; i++)
{
int partner = recv_list[i];
MPI_Irecv(lengths+i, 1, MPI_INT, partner, MPI_MY_TAG, MPI_COMM_WORLD,
request+i);
}
int * neighbors = new int[max_num_neighbors];
int * recv_length = new int[max_num_neighbors];
int * send_length = new int[max_num_neighbors];
A->neighbors = neighbors;
A->recv_length = recv_length;
A->send_length = send_length;
j = 0;
for (i = 0; i < num_recv_neighbors; i++)
{
int start = j;
int newlength = 0;
// go through list of external elements until updating
// processor changes
while ((j < num_external) &&
(new_external_processor[j] == recv_list[i]))
{
newlength++;
j++;
if (j == num_external) break;
}
recv_length[i] = newlength;
neighbors[i] = recv_list[i];
length = j - start;
MPI_Send(&length, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
}
// Complete the receives of the number of externals
for (i = 0; i < num_recv_neighbors; i++)
{
if ( MPI_Wait(request+i, &status) )
{
cerr << "MPI_Wait error\n"<<endl;
exit(-1);
}
send_length[i] = lengths[i];
}
delete [] lengths;
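// At this point recv_length[i] is the number of external values we will
// receive from neighbors[i], and send_length[i] is the number of owned
// values we must send back to that same neighbor.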
///////////////////////////////////////////////////////////////////
// Build "elements_to_send" list. These are the x elements I own
// that need to be sent to other processors.
///////////////////////////////////////////////////////////////////
MPI_MY_TAG++;
j = 0;
for (i = 0; i < num_recv_neighbors; i++)
{
MPI_Irecv(elements_to_send+j, send_length[i], MPI_INT, neighbors[i],
MPI_MY_TAG, MPI_COMM_WORLD, request+i);
j += send_length[i];
}
j = 0;
for (i = 0; i < num_recv_neighbors; i++)
{
int start = j;
int newlength = 0;
// Go through list of external elements
// until updating processor changes. This is redundant, but
// saves us from recording this information.
while ((j < num_external) &&
(new_external_processor[j] == recv_list[i])) {
newlength++;
j++;
if (j == num_external) break;
}
MPI_Send(new_external+start, j-start, MPI_INT, recv_list[i],
MPI_MY_TAG, MPI_COMM_WORLD);
}
// receive from each neighbor the global index list of external elements
for (i = 0; i < num_recv_neighbors; i++)
{
if ( MPI_Wait(request+i, &status) )
{
cerr << "MPI_Wait error\n"<<endl;
exit(-1);
}
}
/// replace global indices by local indices ///
for (i = 0; i < total_to_be_sent; i++) elements_to_send[i] -= start_row;
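// elements_to_send now holds local row indices, ready for packing owned
// values into send_buffer during the matrix-vector multiply.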
////////////////
// Finish up !!
////////////////
A->num_send_neighbors = num_send_neighbors;
//Used in exchange_externals
double *send_buffer = new double[total_to_be_sent];
A->send_buffer = send_buffer;
delete [] tmp_buffer;
delete [] global_index_offsets;
delete [] recv_list;
delete [] external_processor;
delete [] new_external;
delete [] new_external_processor;
delete [] request;
return;
}
#endif // USING_MPI
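// For illustration only: a minimal sketch (in comments, with illustrative
// names) of how a halo exchange could consume the fields set up above,
// assuming a vector x of length local_ncol. The real HPCCG routine is
// exchange_externals(); the function below is hypothetical, not its
// actual implementation.
//
//   void sketch_exchange(HPC_Sparse_Matrix* A, double* x)
//   {
//     int MPI_MY_TAG = 99;
//     MPI_Request* request = new MPI_Request[A->num_send_neighbors];
//     // External values live at the tail of x, grouped by owning neighbor
//     // in the same order as A->neighbors.
//     double* x_external = x + A->local_nrow;
//     for (int i = 0; i < A->num_send_neighbors; i++) {
//       MPI_Irecv(x_external, A->recv_length[i], MPI_DOUBLE, A->neighbors[i],
//                 MPI_MY_TAG, MPI_COMM_WORLD, request + i);
//       x_external += A->recv_length[i];
//     }
//     // Pack the owned values each neighbor needs, in the order it expects.
//     for (int i = 0; i < A->total_to_be_sent; i++)
//       A->send_buffer[i] = x[A->elements_to_send[i]];
//     double* s = A->send_buffer;
//     for (int i = 0; i < A->num_send_neighbors; i++) {
//       MPI_Send(s, A->send_length[i], MPI_DOUBLE, A->neighbors[i],
//                MPI_MY_TAG, MPI_COMM_WORLD);
//       s += A->send_length[i];
//     }
//     for (int i = 0; i < A->num_send_neighbors; i++)
//       MPI_Wait(request + i, MPI_STATUS_IGNORE);
//     delete [] request;
//   }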