
Coarsen.C

#include "TreeNode.h"
#include "parUtils.h"
#include "dendro.h"

#ifdef __DEBUG__
#ifndef __DEBUG_OCT__
#define __DEBUG_OCT__
#endif
#endif

namespace ot {

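  //Coarsens by replacing every octant with its parent and then calling
  //completeOctree to remove duplicates and overlaps across processors,
  //restoring a complete, linear octree. Processors with empty input are
  //split off into a separate communicator and skip the construction.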
  int simpleCoarsen(const std::vector<TreeNode> &in, std::vector<TreeNode> &out, MPI_Comm comm) {
    PROF_SIMPLE_COARSE_BEGIN

    out.clear();

    std::vector<TreeNode> tmpIn = in;

    MPI_Comm new_comm;
    par::splitComm2way(tmpIn.empty(), &new_comm, comm);

    if(!tmpIn.empty()) {
      for(unsigned int i = 0; i < tmpIn.size(); i++) {
        out.push_back(tmpIn[i].getParent());
      }//end for i

      std::vector<TreeNode> tmpOut;
      unsigned int dim = tmpIn[0].getDim();
      unsigned int maxDepth = tmpIn[0].getMaxDepth();

      //Although out is not empty on any processor in new_comm, we can't
      //guarantee that there will be a sufficient number of elements in out
      //after removing duplicates across processors.
      completeOctree(out, tmpOut, dim, maxDepth, false, false, false, new_comm);

      out = tmpOut;
      tmpOut.clear();
    }

    PROF_SIMPLE_COARSE_END
  }//end function

  //New implementation. Written on April 21, 2008.
  //Assumption: in is sorted, linear, and complete (not necessarily balanced).
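  //The scan is driven by "first children" (FCs): octants whose child number
  //is 0. In a sorted, complete, linear octree, if the next FC after an FC at
  //index i appears at index (i + 2^dim) or later, then the 2^dim octants
  //starting at i are exactly one family of siblings (any refinement in
  //between would itself introduce an FC), so they can be replaced by their
  //parent. For example, with dim = 3, a run FC,c1,...,c7 collapses into a
  //single parent octant.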
  int coarsenOctree(const std::vector<TreeNode> &in, std::vector<TreeNode> &out) {
    PROF_COARSE_SEQ_BEGIN

    unsigned int inSz = static_cast<unsigned int>(in.size());
    out.clear();

    unsigned int dim;
    if(inSz) {
      dim = in[0].getDim();
    } else {
      //Empty input: the PROF_*_END macros also return from the function.
      PROF_COARSE_SEQ_END
    }

    if (inSz < (1 << dim)) {
      //Fewer octants than one full family: nothing can be merged.
      out = in;
      std::cout<<"WARNING: Too few octants to coarsen."<<std::endl;
      PROF_COARSE_SEQ_END
    }

    //The input is complete, so it must begin with a first child.
    assert(in[0].getChildNumber() == 0);

    int prevFCidx = 0;

    while (prevFCidx < inSz) {
      int currFCidx = (prevFCidx + 1);
      //The order of the operands is important here: the array-bounds check
      //must come first.
      while( (currFCidx < inSz) && (in[currFCidx].getChildNumber() != 0) ) {
        currFCidx++;
      }
      if(currFCidx >= (prevFCidx + (1 << dim))) {
        //Coarsen the first 2^dim octants starting at prevFCidx and insert
        //the rest as they are. Only those first 2^dim octants are coarsened.
        out.push_back(in[prevFCidx].getParent());
        if( (prevFCidx + (1 << dim)) < currFCidx ) {
          out.insert(out.end(),
              (in.begin() + (prevFCidx + (1 << dim))), (in.begin() + currFCidx));
        }
      } else {
        //Can't coarsen any octant between prevFCidx and currFCidx.
        if(prevFCidx < currFCidx) {
          out.insert(out.end(),
              (in.begin() + prevFCidx), (in.begin() + currFCidx));
        }
      }
      prevFCidx = currFCidx;
    }

    PROF_COARSE_SEQ_END
  }//end function

  //New implementation. Written on April 21, 2008.
  //Assumption: in is globally sorted, linear, and complete (not necessarily
  //balanced).
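  //Outline: (1) handle the single-processor and small-input special cases;
  //(2) optionally repartition so that every active processor holds at least
  //the minimum grain size; (3) run the sequential FC scan on the local
  //interior while exchanging boundary FC information with the neighboring
  //processors; (4) stitch the two inter-processor boundaries using the
  //received values.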
  int coarsenOctree(const std::vector<TreeNode> &in, std::vector<TreeNode> &out,
      unsigned int dim, unsigned int maxDepth, MPI_Comm comm,
      bool skipPartition, MPI_Comm* newCommPtr, bool* iAmActive) {
    PROF_COARSE_BEGIN

    int size;
    MPI_Comm_size(comm, &size);
    out.clear();

    if(newCommPtr) {
      *newCommPtr = comm;
    }

    if(iAmActive) {
      *iAmActive = true;
    }

    //Check if it is a single processor...
    if (size == 1) {

#ifndef __SILENT_MODE__
      std::cout<<"Coarsen Octree. inpSize: "<<in.size()<<" activeNpes: 1"<<std::endl;
#endif

      coarsenOctree(in, out);
      PROF_COARSE_END
    }

    std::vector<TreeNode> tmpIn = in;
    DendroIntL inSz = tmpIn.size();

    //Check if the input is too small...
    DendroIntL globSize;
    par::Mpi_Allreduce<DendroIntL>(&inSz, &globSize, 1, MPI_SUM, comm);

    int rank;
    MPI_Comm_rank(comm, &rank);

    //min grain size = 1000
    const DendroIntL THOUSAND = 1000;
    if (globSize < (THOUSAND*size)) {
      int splittingSize = (globSize/THOUSAND);
      if(splittingSize == 0) {
        splittingSize = 1;
      }
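      //e.g. globSize = 2500 on size = 4 processors: splittingSize = 2,
      //avgLoad = 1250, leftOvers = 0, so ranks 0 and 1 receive 1250 octants
      //each and ranks 2 and 3 become inactive.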

      unsigned int avgLoad = (globSize/splittingSize);
      int leftOvers = (globSize - (splittingSize*avgLoad));

      std::vector<ot::TreeNode> tmpIn2;
      if(rank >= splittingSize) {
        par::scatterValues<ot::TreeNode>(tmpIn, tmpIn2, 0, comm);
      } else if(rank < leftOvers) {
        par::scatterValues<ot::TreeNode>(tmpIn, tmpIn2, (avgLoad+1), comm);
      } else {
        par::scatterValues<ot::TreeNode>(tmpIn, tmpIn2, avgLoad, comm);
      }
      tmpIn.clear();

      MPI_Comm newComm;
      par::splitCommUsingSplittingRank(splittingSize, &newComm, comm);

#ifndef __SILENT_MODE__
      if(!rank) {
        std::cout<<"Input to Coarsen is small ("<<globSize
          <<"). npes = "<<size<<" Splitting Comm. "<<std::endl;
      }
#endif

      if(rank < splittingSize) {
        coarsenOctree(tmpIn2, out, dim, maxDepth, newComm, true, NULL, NULL);
      } else {
        if(iAmActive) {
          *iAmActive = false;
        }
      }
      tmpIn2.clear();

      if(newCommPtr) {
        *newCommPtr = newComm;
      }

      PROF_COARSE_END
    }//end the special case of too few elements.
00192 
00193 #ifndef __SILENT_MODE__
00194     if(!rank) {
00195       std::cout<<"Coarsen Octree. inpSize: "<<globSize <<" activeNpes: "<<size<<std::endl; 
00196     }
00197 #endif
00198 
00200     //Normal case all processors have enough number of elements for overlap.
00201     //This partition will guarantee that each proc has the min grain size 
00202     if(!skipPartition) {
00203       par::partitionW<ot::TreeNode>(tmpIn, NULL, comm);
00204     }

    inSz = tmpIn.size();

    //Index of the first "first child" (FC) on this processor; -1 if none.
    int firstFCidx = 0;

    while( (firstFCidx < inSz) && (tmpIn[firstFCidx].getChildNumber() != 0) ) {
      firstFCidx++;
    }

    if(firstFCidx == inSz) {
      firstFCidx = -1;
    }

    //Number of octants from the last FC (inclusive) through the end of the
    //local array; this count is sent to the next processor. -1 if no FC.
    int lastIdxToSend = -1;

    //Index of the last FC on this processor; -1 if none.
    int lastFCidx = -1;

    if(firstFCidx >= 0) {
      if(rank < (size-1)) {
        lastFCidx = (inSz - 1);
        while( (lastFCidx >= 0) && (tmpIn[lastFCidx].getChildNumber() != 0) ) {
          lastFCidx--;
        }

        if(lastFCidx >= 0) {
          lastIdxToSend = (inSz - lastFCidx);
        }
      } else {
        lastFCidx = inSz;
      }//end if PN
    }
00236 
00237     MPI_Request sendFirstIdxRequest;
00238     MPI_Request sendLastIdxRequest;
00239     MPI_Request recvFirstIdxRequest;
00240     MPI_Request recvLastIdxRequest;
00241 
00242     int lastFCidxOfPrevP;
00243     int firstFCidxOfNextP;
00244 
00245     if(rank) {
00246       //Recv lastFCidx from the prev processor
00247       par::Mpi_Irecv<int>(&lastFCidxOfPrevP, 1, (rank-1),
00248           1, comm, &recvLastIdxRequest);
00249 
00250       //Send firstFCidx to the prev processor
00251       par::Mpi_Issend<int>(&firstFCidx, 1, (rank-1),
00252           2, comm, &sendFirstIdxRequest);
00253     }
00254 
00255     if(rank < (size-1)) {
00256       //Recv firstFCidx from the next processor
00257       par::Mpi_Irecv<int>(&firstFCidxOfNextP, 1, (rank+1),
00258           2, comm, &recvFirstIdxRequest);
00259 
00260       //Send lastFCidx to the next processor
00261       par::Mpi_Issend<int>(&lastIdxToSend, 1, (rank+1),
00262           1, comm, &sendLastIdxRequest);
00263     }

    //Process local elements, overlapping computation with communication
    int prevFCidx = firstFCidx;

    //If firstFCidx == lastFCidx, it will be processed after receiving the
    //messages from the previous and next processors.
    //On the last processor, we want to process all elements after the first
    //FC, so lastFCidx is set to the end of the list (inSz) there.
    //On all other processors, we must wait for the firstFC of the next
    //processor before processing elements after (and including) our last FC.
    //Hence the last FC is not processed here on those processors.
    while (prevFCidx < lastFCidx) {
      int currFCidx = (prevFCidx + 1);
      while( (currFCidx < lastFCidx) &&
          (tmpIn[currFCidx].getChildNumber() != 0) ) {
        currFCidx++;
      }
      if(currFCidx >= (prevFCidx + (1 << dim))) {
        //Coarsen the first 2^dim octants starting at prevFCidx and insert
        //the rest as they are. Only those first 2^dim octants are coarsened.
        out.push_back(tmpIn[prevFCidx].getParent());
        if((prevFCidx + (1 << dim)) < currFCidx) {
          out.insert(out.end(),
              (tmpIn.begin() + (prevFCidx + (1 << dim))),
              (tmpIn.begin() + currFCidx));
        }
      } else {
        //Can't coarsen any octant between prevFCidx and currFCidx.
        if(prevFCidx < currFCidx) {
          out.insert(out.end(),
              (tmpIn.begin() + prevFCidx), (tmpIn.begin() + currFCidx));
        }
      }
      prevFCidx = currFCidx;
    }//end main while loop
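    //Stitch the boundary with the previous processor: decide whether our
    //leading octants (those before our first FC) complete a family that
    //begins on the previous processor (that processor adds the parent, so
    //the siblings are skipped here) or must be copied through unchanged.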

    if(rank) {
      MPI_Status statusWait;
      MPI_Wait(&recvLastIdxRequest, &statusWait);

      if ( (lastFCidxOfPrevP >= 0) && (firstFCidx >= 0) ) {
        if ( (lastFCidxOfPrevP + firstFCidx) >= (1 << dim) ) {
          int stIdx = ((1 << dim) - lastFCidxOfPrevP);
          if(stIdx < 0) {
            stIdx = 0;
          }
          //The elements [0, stIdx) are siblings of an octant on the previous
          //processor; that processor will add the parent.
          if(stIdx < firstFCidx) {
            out.insert(out.begin(),
                (tmpIn.begin() + stIdx), (tmpIn.begin() + firstFCidx) );
          }
        } else {
          //No octant in the interval is coarsened
          if(firstFCidx) {
            out.insert(out.begin(), tmpIn.begin(),
                (tmpIn.begin() + firstFCidx) );
          }
        }
      } else {
        //Special case
        if (lastFCidxOfPrevP < 0) {
          //There is no FC on the previous processor.
          //Since every processor has at least 2^dim elements, regardless of
          //whether or not our processor has any FCs, we can add all the
          //elements until our firstFCidx (not inclusive). If we don't have
          //any FCs then we can add all our elements, since they will be
          //combined with neither the previous processor nor the next one.
          if (firstFCidx >= 0) {
            if (firstFCidx) {
              out.insert(out.begin(), tmpIn.begin(),
                  (tmpIn.begin() + firstFCidx));
            }
          } else {
            //Since there are no FCs on this processor, the main loop did
            //nothing.
            assert(out.empty());
            out = tmpIn;
          }
        } else {
          //The previous processor has a FC and we have none. Since every
          //processor owns at least 2^dim elements, there are more than 2^dim
          //elements between the FC on the previous processor and the next FC
          //(there may be none; that is fine and is treated as if the next FC
          //index were past the end of the octree itself). The last FC on the
          //previous processor and its (2^dim - 1) successors will be
          //coarsened, and the remaining elements until the next FC are added
          //as they are. Since the next FC (if it exists) is not on our
          //processor, we simply add all our elements except those that are
          //siblings of the last FC on the previous processor.
          assert(out.empty());
          if (lastFCidxOfPrevP < (1 << dim) ) {
            int stIdx = ((1 << dim) - lastFCidxOfPrevP);
            out.insert(out.begin(), (tmpIn.begin() + stIdx), tmpIn.end());
          } else {
            out = tmpIn;
          }
        }
      }//end if special case
    }//end if not P0
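    //Stitch the boundary with the next processor: the family beginning at
    //our last FC may extend onto the next processor, so whether it is
    //coarsened depends on the first FC index received from there.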
00365 
00366     if (rank < (size-1)) {
00367       MPI_Status statusWait;
00368       MPI_Wait(&recvFirstIdxRequest, &statusWait);
00369 
00370       if ( (lastIdxToSend >= 0) && (firstFCidxOfNextP >= 0) ) {
00371         if ( (lastIdxToSend + firstFCidxOfNextP) >= (1 << dim) ) {
00372           out.push_back(tmpIn[lastFCidx].getParent());
00373           if ( (lastFCidx + (1 << dim)) < inSz ) {
00374             out.insert(out.end(),
00375                 (tmpIn.begin() + (lastFCidx + (1 << dim))),
00376                 (tmpIn.begin() + inSz));
00377           }
00378         } else {
00379           out.insert(out.end(), (tmpIn.begin() + lastFCidx),
00380               (tmpIn.begin() + inSz));
00381         }
00382       } else {
00383         //Special case
00384         if (lastIdxToSend < 0) {
00385           //This processor does not have any FCs. This case will be handled
00386           //while checking our firstFC with the lastFC of the prev P.
00387           //Only P0 will not handle this case, since it will not have any prev P.
00388           //However, P0 is guaranteed to have atleast one FC (the 0,0,0 node)
00389           //Nothing to be done here.
00390         } else {
00391           //This processor has a FC, but the next processor does not have any
00392           //FCs. Since the next processor has atleast 8 elements. There are
00393           //more than 8 elements between the lastFC on this processor and the
00394           //next FC in the linear octree.
00395           out.push_back(tmpIn[lastFCidx].getParent());          
00396           if ( (lastFCidx + (1 << dim)) < inSz ) {
00397             out.insert(out.end(),
00398                 (tmpIn.begin() + (lastFCidx + (1 << dim))),
00399                 (tmpIn.begin() + inSz));
00400           }
00401         }
00402       }//end if special case
00403     }//end if not PN
00404 
00405     if (rank) {
00406       MPI_Status statusWait;
00407       MPI_Wait(&sendFirstIdxRequest, &statusWait);
00408     }
00409 
00410     if (rank < (size-1)) {
00411       MPI_Status statusWait;
00412       MPI_Wait(&sendLastIdxRequest, &statusWait);
00413     }
00414 
00415     PROF_COARSE_END
00416   }//end function
00417 
00418 }//end namespace
00419 
00420 
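A minimal usage sketch (not part of Coarsen.C). The declaring header is assumed here to be "octUtils.h", buildFineOctree is a hypothetical helper, and dim = 3, maxDepth = 30 are illustrative values.

    #include <vector>
    #include "mpi.h"
    #include "octUtils.h" //assumed header declaring ot::coarsenOctree

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);

      //'fine' must be a globally sorted, linear, complete octree.
      std::vector<ot::TreeNode> fine = buildFineOctree(); //hypothetical helper
      std::vector<ot::TreeNode> coarse;

      MPI_Comm activeComm;
      bool iAmActive;
      ot::coarsenOctree(fine, coarse, 3 /*dim*/, 30 /*maxDepth*/,
          MPI_COMM_WORLD, false /*skipPartition*/, &activeComm, &iAmActive);

      MPI_Finalize();
      return 0;
    }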
