Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Namespace Members | Class Members | File Members | Related Pages

Balance.C

Go to the documentation of this file.
00001 
00009 #include "parUtils.h"
00010 #include "seqUtils.h"
00011 #include "TreeNode.h"
00012 #include "TreeNodePointer.h"
00013 #include "testUtils.h"
00014 #include "dendro.h"
00015 
00016 #ifdef __DEBUG__
00017 #ifndef __DEBUG_OCT__
00018 #define __DEBUG_OCT__
00019 #endif
00020 #endif
00021 
00022 #ifdef __DEBUG_OCT__
00023 #ifndef __MEASURE_BAL_COMM__
00024 #define __MEASURE_BAL_COMM__
00025 #endif
00026 #endif
00027 
00028 namespace ot {
00029 
00030   //Assumption: in is globally sorted, linear and complete.
00031   int balanceOctree (std::vector<TreeNode > &in, std::vector<TreeNode > &out,
00032       unsigned int dim, unsigned int maxDepth, bool incCorner,
00033       MPI_Comm comm, MPI_Comm* newCommPtr, bool* iAmActive) {
00034 #ifdef __PROF_WITH_BARRIER__
00035     MPI_Barrier(comm);
00036 #endif
00037     PROF_BAL_BEGIN
00038 
00039       int rank, size;
00040     MPI_Comm_size(comm,&size);
00041     out.clear();
00042 
00043     if(newCommPtr) {
00044       *newCommPtr = comm;
00045     }
00046 
00047     if(iAmActive) {
00048       *iAmActive = true;
00049     }
00050 
00052     //Check if it is a single processor...
00053     if (size == 1) {
00054 #ifndef __SILENT_MODE__
00055       std::cout<<"Balance Octree. inpSize: "<<in.size()<<" activeNpes: 1"<<std::endl; 
00056 #endif
00057 
00058       out = in;
00059       in.clear();
00060       comboRipple(out, incCorner);
00061       PROF_BAL_END
00062     }
00063 
00064     MPI_Comm_rank(comm,&rank);
00065 
00066     //Check if the input is too small...
00067     DendroIntL locSize = in.size();
00068     DendroIntL globSize;
00069     PROF_BAL_COMM_BEGIN
00070 
00071       par::Mpi_Allreduce<DendroIntL>(&locSize, &globSize, 1, MPI_SUM, comm);
00072 
00073     PROF_BAL_COMM_END
00074 
00075       //min grain size = 1000
00076       const DendroIntL THOUSAND = 1000;
00077     if (globSize < (THOUSAND*size)) {
00078       int splittingSize = (globSize/THOUSAND); 
00079       if(splittingSize == 0) {
00080         splittingSize = 1; 
00081       }
00082 
00083       unsigned int avgLoad = (globSize/splittingSize);
00084       int leftOvers = (globSize - (splittingSize*avgLoad));
00085 
00086       PROF_BAL_SCATTER_BEGIN
00087 
00088         std::vector<TreeNode> tmpIn;
00089       if(rank >= splittingSize) {
00090         par::scatterValues<ot::TreeNode>(in, tmpIn, 0, comm);
00091       }else if(rank < leftOvers) {
00092         par::scatterValues<ot::TreeNode>(in, tmpIn, (avgLoad+1), comm);
00093       }else {
00094         par::scatterValues<ot::TreeNode>(in, tmpIn, avgLoad, comm);
00095       }
00096 
00097       PROF_BAL_SCATTER_END
00098 
00099         in.clear();
00100 
00101       PROF_BAL_SPLIT_COMM_BEGIN
00102 
00103         MPI_Comm newComm;
00104       par::splitCommUsingSplittingRank(splittingSize, &newComm, comm);
00105 
00106       PROF_BAL_SPLIT_COMM_END
00107 
00108 #ifndef __SILENT_MODE__
00109         if(!rank) {
00110           std::cout<<"Input to Balance is small ("<<globSize
00111             <<"). npes = "<<size<<" Splitting Comm. "<<std::endl;
00112         }
00113 #endif
00114 
00115       if(rank < splittingSize) {
00116         balanceOctree (tmpIn, out, dim, maxDepth, incCorner, newComm, NULL, NULL);
00117       } else {
00118         if(iAmActive) {
00119           *iAmActive = false;
00120         }
00121       }
00122       tmpIn.clear();
00123 
00124       if(newCommPtr) {
00125         *newCommPtr = newComm;
00126       }
00127 
00128       PROF_BAL_END
00129     }//end if reduce procs
00130 
00131 #ifndef __SILENT_MODE__
00132     if(!rank) {
00133       std::cout<<"Balance Octree. inpSize: "<<globSize <<" activeNpes: "<<size<<std::endl; 
00134     }
00135 #endif
00136 
00138     //Partition in and create blocks (blocks must be globally sorted).
00139     //Prepare for blockPart.
00140 
00141     std::vector<ot::TreeNode> blocks;    
00142     std::vector<ot::TreeNode> minsAllBlocks;
00143 
00144 #ifdef __PROF_WITH_BARRIER__
00145     MPI_Barrier(comm);
00146 #endif
00147     PROF_BAL_BPART1_BEGIN
00148 
00149       blockPartStage1(in, blocks, dim, maxDepth, comm);            
00150 
00151     PROF_BAL_BPART1_END
00152 
00153 #ifdef __PROF_WITH_BARRIER__
00154       MPI_Barrier(comm);
00155 #endif
00156 
00157     PROF_BAL_BPART2_BEGIN
00158 
00159       blockPartStage2(in, blocks, minsAllBlocks, dim, maxDepth, comm);
00160 
00161     PROF_BAL_BPART2_END
00162 
00163       //blocks will be sorted.
00164 
00165       assert(!blocks.empty());
00166     TreeNode myFirstBlock = blocks[0];
00167     TreeNode myLastBlock = blocks[blocks.size()-1];
00168 
00169 #ifdef __DEBUG_OCT__
00170     assert(areComparable(myFirstBlock, in[0]));
00171     assert(myFirstBlock.isAncestor(in[0]) || myFirstBlock == in[0]);
00172 #endif
00173 
00174 #ifdef __USE_AGV_FOR_BAL__
00175     // 1. locally assign wgts to all blocks. The wgts are set to the rank of
00176     // the processor so that the owner of the block can be easily identified.
00177     //We don't need to reset the weights of blocks later since weights are not
00178     //used anywhere else. All comparisons are based only on the Morton
00179     //ordering.
00180     for(unsigned int i = 0; i < blocks.size(); i++) {
00181       blocks[i].setWeight(rank);
00182     }
00183 
00184     // 2. Communicate the blocks to all processors.
00185     int totalSize=0;
00186     int *blockSizes = new int [size];
00187     int *blockDisps = new int [size];
00188 
00189     int localBlockSize = blocks.size();
00190     PROF_BAL_COMM_BEGIN
00191 
00192       par::Mpi_Allgather<int>(&localBlockSize, blockSizes, 1, comm);
00193 
00194     PROF_BAL_COMM_END
00195 
00196       totalSize += blockSizes[0];
00197     blockDisps[0] = 0;
00198     for (unsigned int i = 1; i < size; i++) {
00199       totalSize += blockSizes[i];
00200       blockDisps[i] = blockDisps[i-1] + blockSizes[i-1];
00201     }//end for i
00202 
00203     // allocate for the blocks ...
00204     std::vector<TreeNode> allBlocks(totalSize);
00205 
00206     ot::TreeNode* blocksPtr = NULL;
00207     ot::TreeNode* allBlocksPtr = NULL;
00208     if(!blocks.empty()) {
00209       blocksPtr = &(*(blocks.begin()));
00210     }
00211     if(!allBlocks.empty()) {
00212       allBlocksPtr = &(*(allBlocks.begin()));
00213     }
00214     PROF_BAL_COMM_BEGIN
00215 
00216       par::Mpi_Allgatherv<ot::TreeNode>( blocksPtr, blockSizes[rank],
00217           allBlocksPtr, blockSizes, blockDisps, comm );
00218 
00219     PROF_BAL_COMM_END
00220 
00221 #ifndef __SILENT_MODE__
00222       if(!rank) {
00223         std::cout<<"# AllBlocks: "<<allBlocks.size()<<std::endl;
00224         for(int i = 0; i < size; i++) {
00225           std::cout<<"blockSizes["<<i<<"] = "<<blockSizes[i]<<std::endl;
00226         }
00227         for(int i = 0; i < allBlocks.size(); i++) {
00228           std::cout<<"allBlocks["<<i<<"].Wt() = "<<(allBlocks[i].getWeight())<<std::endl;
00229         }
00230       }
00231 #endif
00232 
00233 #endif
00234 
00236     //Local Balancing
00237 
00238     //Block Balance
00239     std::vector<ot::TreeNode> allBoundaryLeaves;
00240     std::vector<unsigned int> maxBlockBndVec;
00241 
00242     balanceBlocks(in, blocks, out, allBoundaryLeaves, incCorner, &maxBlockBndVec);
00243     in.clear();
00244 
00245 #ifdef __USE_AGV_FOR_BAL__
00246     std::vector<TreeNode> myNhBlocks;
00247     selectNeighboringBlocks(allBlocks, blocks, maxBlockBndVec, rank, myNhBlocks);
00248 
00249     allBlocks.clear();
00250 
00251 #ifdef __MEASURE_BAL_COMM__
00252     MPI_Barrier(comm);
00253     unsigned int localMyNhBlocksSize = myNhBlocks.size();
00254     unsigned int* globalMyNhBlocksSize = new unsigned int[size];
00255     par::Mpi_Gather<unsigned int>(&localMyNhBlocksSize, globalMyNhBlocksSize, 1, 0, comm);
00256     if(!rank) {
00257       for(int i = 0; i < size; i++) {
00258         std::cout<<"globalMyNhBlocksSize["<<i<<"] = "<<globalMyNhBlocksSize[i]<<std::endl;
00259       }//end for i
00260     }
00261     delete [] globalMyNhBlocksSize;
00262     MPI_Barrier(comm);
00263 #endif
00264 #endif
00265 
00266     blocks.clear();
00267     maxBlockBndVec.clear();
00268 
00269 #ifdef __USE_AGV_FOR_BAL__
00270     if(!myNhBlocks.empty()) {
00271       //Merge (result must remain sorted and linear) our local
00272       //boundary leaves and myNhBlocks. We need to perform the
00273       //intra-processor local balance on this
00274       //combined list. Note, this step is slightly different from 
00275       //what was described in the 2:1 balancing paper. In the algorithm, descibed
00276       //in the paper the blocks from other processors are not taken into account
00277       //for this step. By taking these blocks into account as well, we expect
00278       //to identify some very large octants early on and hence reduce the
00279       //insulation layer sizes before the actual 2-stage inter-processor step
00280       std::vector<ot::TreeNode> boundaryAndOtherBlocks;
00281 
00282       //Three Mutually Exclusive Cases:
00283       if(myNhBlocks[myNhBlocks.size()-1] < myFirstBlock) {
00284         //1) All myNhBlocks are < myFirstBlock
00285 
00286         boundaryAndOtherBlocks.insert(boundaryAndOtherBlocks.end(),
00287             myNhBlocks.begin(), myNhBlocks.end());
00288 
00289         boundaryAndOtherBlocks.insert(boundaryAndOtherBlocks.end(),
00290             allBoundaryLeaves.begin(), allBoundaryLeaves.end());
00291 
00292       } else if(myNhBlocks[0] > myLastBlock) {
00293         //2) All myNhBlocks are > myLastBlock
00294 
00295         boundaryAndOtherBlocks.insert(boundaryAndOtherBlocks.end(),
00296             allBoundaryLeaves.begin(), allBoundaryLeaves.end());
00297 
00298         boundaryAndOtherBlocks.insert(boundaryAndOtherBlocks.end(),
00299             myNhBlocks.begin(), myNhBlocks.end());
00300 
00301       } else {
00302         //3 some myNhBlocks are < myFirstBlock and some are > myLastBlock
00303 
00304         unsigned int myStIdxInNhBlocks = 0;
00305         unsigned int myEndIdxInNhBlocks = 0;
00306 
00307         for(unsigned int i = 0; i < myNhBlocks.size(); i++) {
00308           if(myNhBlocks[i] > myFirstBlock) {
00309             myStIdxInNhBlocks = i;
00310             break;
00311           }
00312         }
00313 
00314         //We don't need to use DLD here for comparison since there can not be
00315         //any overlap between our blocks/octants and myNhBlocks 
00316         for(unsigned int i = myNhBlocks.size(); i > 0; i--) {
00317           if(myNhBlocks[i-1] < myLastBlock) {
00318             myEndIdxInNhBlocks = i;
00319             break;
00320           }
00321         }
00322 
00323         boundaryAndOtherBlocks.insert(boundaryAndOtherBlocks.begin(),
00324             myNhBlocks.begin(), (myNhBlocks.begin() + myStIdxInNhBlocks));
00325 
00326         boundaryAndOtherBlocks.insert(boundaryAndOtherBlocks.end(),
00327             allBoundaryLeaves.begin(), allBoundaryLeaves.end());
00328 
00329         boundaryAndOtherBlocks.insert(boundaryAndOtherBlocks.end(),
00330             (myNhBlocks.begin() + myEndIdxInNhBlocks), myNhBlocks.end());
00331 
00332       }
00333 
00334       allBoundaryLeaves.clear();
00335 
00336       //Intra-processor Balance
00337       ripple(boundaryAndOtherBlocks, incCorner);
00338 
00339       //Extract the local boundary elements
00340       unsigned int myStartIdx = 0;
00341       unsigned int myEndIdx = 0;
00342 
00343       bool foundFirst = false;
00344       bool foundLast = false;
00345       for(unsigned int i = 0; i < boundaryAndOtherBlocks.size(); i++) {
00346         if(boundaryAndOtherBlocks[i] >= myFirstBlock) {
00347           myStartIdx = i;
00348           foundFirst = true;
00349           break;
00350         }
00351       }
00352 
00353       //We need DLD here since we need to account for descendants of
00354       //myLastBlock as well
00355       for(unsigned int i = boundaryAndOtherBlocks.size(); i > 0; i--) {
00356         if(boundaryAndOtherBlocks[i-1] <= myLastBlock.getDLD()) {
00357           myEndIdx = i;
00358           foundLast = true;
00359           break;
00360         }
00361       }
00362 
00363 #ifdef __DEBUG_OCT__
00364       //There must be at least one element belonging to our processor 
00365       assert(foundFirst && foundLast);
00366 #endif
00367 
00368       allBoundaryLeaves.insert(allBoundaryLeaves.begin(),
00369           boundaryAndOtherBlocks.begin() + myStartIdx, 
00370           boundaryAndOtherBlocks.begin() + myEndIdx);
00371 
00372       boundaryAndOtherBlocks.clear();
00373 
00374     } else {
00375       //Intra-processor Balance
00376       ripple(allBoundaryLeaves, incCorner);
00377     }//end if myNhBlocks is empty
00378     myNhBlocks.clear();
00379 #else
00380     //Intra-processor Balance
00381     ripple(allBoundaryLeaves, incCorner);
00382 #endif
00383 
00384     mergeComboBalAndPickBoundary(out, allBoundaryLeaves, myFirstBlock, myLastBlock);
00385 
00386 #ifdef __MEASURE_BAL_COMM__
00387     MPI_Barrier(comm);
00388     unsigned int localAllBndSize = allBoundaryLeaves.size();
00389     unsigned int* globalAllBndSize = new unsigned int[size];
00390     par::Mpi_Gather<unsigned int>(&localAllBndSize, globalAllBndSize, 1, 0, comm);
00391     if(!rank) {
00392       for(int i = 0; i < size; i++) {
00393         std::cout<<" globalAllBndSize["<<i<<"]= "<<globalAllBndSize[i]<<std::endl;
00394       }//end for i
00395     }
00396     delete [] globalAllBndSize;
00397     MPI_Barrier(comm);
00398 #endif
00399 
00400     //There is no need to sort, the lists are already sorted.
00401 
00403     //Preparation for inter-processor balance....
00404     //First Stage of Communication...
00405     std::vector<TreeNode> * sendNodes = new std::vector<TreeNode> [size];
00406     std::vector<unsigned int> * sentToPid = NULL;
00407 
00408     std::vector<ot::TreeNode> sendK;
00409     int *sendCnt = new int[size];
00410     int *recvCnt = new int[size];
00411     int *sendOffsets = new int[size];
00412 
00413     // 3. Loop through all local boundary nodes and determine which processors
00414     // need to be aware of those nodes. Create lists that shall be sent.
00415 
00416     for (int i = 0; i < size; i++) {
00417       sendCnt[i] = 0;
00418       sendNodes[i].clear();
00419     }
00420 
00421     if(!allBoundaryLeaves.empty()) {
00422       sentToPid = new std::vector<unsigned int> [allBoundaryLeaves.size()];
00423     } else {
00424       sentToPid = NULL;
00425     }
00426 
00427     //create sendNodes
00428     prepareBalComm1MessagesType2(allBoundaryLeaves, minsAllBlocks, rank, dim, maxDepth,
00429         sendNodes, sentToPid, sendCnt);
00430 
00431     // 4. Actual send/recv. to exchange nodes.
00432     //
00433     // 4a.
00434     // Now do an All2All to get numKeysRecv
00435     PROF_BAL_COMM_BEGIN
00436 
00437       par::Mpi_Alltoall<int>(sendCnt, recvCnt, 1, comm);
00438 
00439     PROF_BAL_COMM_END
00440 
00441 #ifdef __MEASURE_BAL_COMM__
00442       unsigned int numProcsSend1 = 0;
00443     unsigned int numProcsRecv1 = 0;
00444 #endif
00445 
00446     // 4b. Concatenate all nodes into one single Carray ...
00447     unsigned int totalSend = 0;
00448     unsigned int totalRecv = 0;
00449     for (unsigned int i = 0; i < size; i++) {
00450       totalSend += sendCnt[i];
00451       totalRecv += recvCnt[i];
00452 #ifdef __MEASURE_BAL_COMM__
00453       if(sendCnt[i]) {
00454         numProcsSend1++;
00455       }
00456       if(recvCnt[i]) {
00457         numProcsRecv1++;
00458       }
00459 #endif
00460     }//end for i
00461 
00462     // create the send and recv buffers ...
00463 
00464     int *recvOffsets1 = new int[size];
00465 
00466     // Now create sendK
00467     sendOffsets[0] = 0;
00468     recvOffsets1[0] = 0;
00469 
00470     // compute offsets ...
00471     for (int i = 1; i < size; i++) {
00472       sendOffsets[i] = sendOffsets[i-1] + sendCnt[i-1];
00473       recvOffsets1[i] = recvOffsets1[i-1] + recvCnt[i-1];
00474     }//end for i
00475 
00476     sendK.resize(totalSend);
00477     for (int i = 0; i < size; i++) {
00478 #ifdef __DEBUG_OCT__
00479       assert(seq::test::isUniqueAndSorted(sendNodes[i]));
00480 #endif
00481       for (unsigned int j = 0; j < sendCnt[i]; j++) {
00482         sendK[sendOffsets[i] + j] = sendNodes[i][j];
00483       }//end for j
00484     }//end for i
00485 
00486     for (unsigned int i = 0; i < size; i++) {
00487       sendNodes[i].clear();
00488     }
00489 
00490     // 4c. Perform SendRecv to send and receive all keys ...
00491     std::vector<ot::TreeNode> recvK1(totalRecv);
00492 
00493     ot::TreeNode* sendKptr = NULL;
00494     ot::TreeNode* recvK1ptr = NULL;
00495     if(!sendK.empty()) {
00496       sendKptr = &(*(sendK.begin()));
00497     }
00498     if(!recvK1.empty()) {
00499       recvK1ptr = &(*(recvK1.begin()));
00500     }
00501 
00502     PROF_BAL_COMM_BEGIN
00503 
00504       par::Mpi_Alltoallv_sparse<ot::TreeNode>(sendKptr, sendCnt, sendOffsets,        
00505           recvK1ptr, recvCnt, recvOffsets1, comm);
00506 
00507     PROF_BAL_COMM_END
00508 
00509       sendK.clear();
00510 
00511 #ifdef __MEASURE_BAL_COMM__
00512     MPI_Barrier(comm);
00513     unsigned int* allTotalSend1 = new unsigned int[size];
00514     unsigned int* allTotalRecv1 = new unsigned int[size];
00515     unsigned int* allNumProcsSend1 = new unsigned int[size];
00516     unsigned int* allNumProcsRecv1 = new unsigned int[size];
00517     par::Mpi_Gather<unsigned int>(&totalSend, allTotalSend1, 1, 0, comm);
00518     par::Mpi_Gather<unsigned int>(&totalRecv, allTotalRecv1, 1, 0, comm);
00519     par::Mpi_Gather<unsigned int>(&numProcsSend1, allNumProcsSend1, 1, 0, comm);
00520     par::Mpi_Gather<unsigned int>(&numProcsRecv1, allNumProcsRecv1, 1, 0, comm);
00521     if(!rank) {
00522       for(int i = 0; i < size; i++) {
00523         std::cout<<"allTotalSend1["<<i<<"] in bal: "<<allTotalSend1[i]<<std::endl; 
00524         std::cout<<"allTotalRecv1["<<i<<"] in bal: "<<allTotalRecv1[i]<<std::endl; 
00525         std::cout<<"allNumProcsSend1["<<i<<"] in bal: "<<allNumProcsSend1[i]<<std::endl; 
00526         std::cout<<"allNumProcsRecv1["<<i<<"] in bal: "<<allNumProcsRecv1[i]<<std::endl; 
00527       }
00528     }
00529     delete [] allTotalSend1;
00530     delete [] allTotalRecv1;
00531     delete [] allNumProcsSend1;
00532     delete [] allNumProcsRecv1;
00533     MPI_Barrier(comm);
00534 #endif
00535 
00537     //Second Stage of Communication...
00538     // 5. Loop through all local boundary nodes and determine which processors
00539     // need to be aware of those nodes. Create lists that shall be sent.
00540 
00541     std::vector<ot::TreeNode> wList;
00542     std::vector<std::vector<unsigned int> > wListRanks; 
00543 
00544     prepareWlistInBal(recvK1, recvCnt, size, myFirstBlock, myLastBlock, wList, wListRanks);
00545 
00546     for (int i = 0; i < size; i++) {
00547       sendCnt[i] = 0;
00548       sendNodes[i].clear();
00549     }
00550 
00551     prepareBalComm2Messages(allBoundaryLeaves, wList, wListRanks, 
00552         sendNodes, sentToPid, sendCnt);
00553 
00554     for (int i = 0; i < allBoundaryLeaves.size(); i++) {
00555       sentToPid[i].clear();
00556     }
00557 
00558     if(sentToPid) {
00559       delete [] sentToPid;
00560       sentToPid = NULL;
00561     }
00562 
00563     // 6. Actual send/recv. to exchange nodes.
00564     //
00565     // 6a.
00566     // Now do an All2All to get numKeysRecv
00567     PROF_BAL_COMM_BEGIN
00568 
00569       par::Mpi_Alltoall<int>(sendCnt, recvCnt, 1, comm);
00570 
00571     PROF_BAL_COMM_END
00572 
00573 #ifdef __MEASURE_BAL_COMM__
00574       unsigned int numProcsSend2 = 0;
00575     unsigned int numProcsRecv2 = 0;
00576 #endif
00577     // 6b. Concatenate all nodes into one single Carray ...
00578     totalSend = 0;
00579     totalRecv = 0;
00580     for (unsigned int i = 0; i < size; i++) {
00581       totalSend+= sendCnt[i];
00582       totalRecv+= recvCnt[i];
00583 #ifdef __MEASURE_BAL_COMM__
00584       if(sendCnt[i]) {
00585         numProcsSend2++;
00586       }
00587       if(recvCnt[i]) {
00588         numProcsRecv2++;
00589       }
00590 #endif
00591     }//end for i
00592 
00593     // create the send and recv buffers ...
00594 
00595     int *recvOffsets2 = new int[size];
00596     // Now create sendK
00597     sendOffsets[0] = 0;
00598     recvOffsets2[0] = 0;
00599 
00600     // compute offsets ...
00601     for (int i=1; i<size; i++) {
00602       sendOffsets[i] = sendOffsets[i-1] + sendCnt[i-1];
00603       recvOffsets2[i] = recvOffsets2[i-1] + recvCnt[i-1];
00604     }//end for i
00605 
00606     sendK.resize(totalSend);
00607     for (int i=0; i<size; i++) {
00608 #ifdef __DEBUG_OCT__
00609       assert(seq::test::isUniqueAndSorted(sendNodes[i]));
00610 #endif
00611       for (unsigned int j = 0; j < sendCnt[i]; j++) {
00612         sendK[sendOffsets[i] + j] = sendNodes[i][j];
00613       }//end for j
00614     }//end for i
00615 
00616     for (unsigned int i = 0; i < size; i++) {
00617       sendNodes[i].clear();
00618     }
00619 
00620     delete [] sendNodes;
00621     sendNodes = NULL;
00622 
00623     std::vector<ot::TreeNode> recvK2(totalRecv);
00624 
00625     sendKptr = NULL;
00626     ot::TreeNode* recvK2ptr = NULL;
00627     if(!sendK.empty()) {
00628       sendKptr = &(*(sendK.begin()));
00629     }
00630     if(!recvK2.empty()) {
00631       recvK2ptr = &(*(recvK2.begin()));
00632     }
00633 
00634     PROF_BAL_COMM_BEGIN
00635 
00636       par::Mpi_Alltoallv_sparse<ot::TreeNode>(sendKptr, sendCnt, sendOffsets,        
00637           recvK2ptr, recvCnt, recvOffsets2, comm);
00638 
00639     PROF_BAL_COMM_END
00640 
00641       sendK.clear();
00642 
00643     delete [] sendCnt;
00644     sendCnt = NULL;
00645 
00646     delete [] recvCnt;
00647     recvCnt = NULL;
00648 
00649     delete [] sendOffsets;
00650     sendOffsets = NULL;
00651 
00652 
00653 #ifdef __MEASURE_BAL_COMM__
00654     MPI_Barrier(comm);
00655     unsigned int* allTotalSend2 = new unsigned int[size];
00656     unsigned int* allTotalRecv2 = new unsigned int[size];
00657     unsigned int* allNumProcsSend2 = new unsigned int[size];
00658     unsigned int* allNumProcsRecv2 = new unsigned int[size];
00659     par::Mpi_Gather<unsigned int>(&totalSend, allTotalSend2, 1, 0, comm);
00660     par::Mpi_Gather<unsigned int>(&totalRecv, allTotalRecv2, 1, 0, comm);
00661     par::Mpi_Gather<unsigned int>(&numProcsSend2, allNumProcsSend2, 1, 0, comm);
00662     par::Mpi_Gather<unsigned int>(&numProcsRecv2, allNumProcsRecv2, 1, 0, comm);
00663     if(!rank) {
00664       for(int i = 0; i < size; i++) {
00665         std::cout<<"allTotalSend2["<<i<<"] in bal: "<<allTotalSend2[i]<<std::endl; 
00666         std::cout<<"allTotalRecv2["<<i<<"] in bal: "<<allTotalRecv2[i]<<std::endl; 
00667         std::cout<<"allNumProcsSend2["<<i<<"] in bal: "<<allNumProcsSend2[i]<<std::endl; 
00668         std::cout<<"allNumProcsRecv2["<<i<<"] in bal: "<<allNumProcsRecv2[i]<<std::endl; 
00669       }
00670     }
00671     delete [] allTotalSend2;
00672     delete [] allTotalRecv2;
00673     delete [] allNumProcsSend2;
00674     delete [] allNumProcsRecv2;
00675     MPI_Barrier(comm);
00676 #endif
00677 
00679 
00680     //7.a.Merge (In-place) recieved keys from both stages of communication....
00681     std::vector<ot::TreeNode> recvK;
00682 
00683     unsigned int myOff1 = recvOffsets1[rank];
00684     unsigned int myOff2 = recvOffsets2[rank];
00685     unsigned int myOff = myOff1 + myOff2;
00686 
00687     mergeRecvKeysInBal(recvK1, recvOffsets1, recvK2, recvOffsets2, size, recvK);
00688 
00689     delete [] recvOffsets1;
00690     recvOffsets1 = NULL;
00691 
00692     recvK1.clear();
00693 
00694     delete [] recvOffsets2;
00695     recvOffsets2 = NULL;
00696 
00697     recvK2.clear();
00698 
00699     //7.b.Merge (In-place) recieved octants with local octants....
00700     unsigned int n = static_cast<unsigned int>(allBoundaryLeaves.size());
00701     std::vector <TreeNode > nodes( recvK.size() + n );
00702 
00703     //Note this is an inplace insertion. This is done to preserve the sorted
00704     //order. Each recvK[i] is independently sorted. Also, since the intial
00705     //blocks were globally sorted and since the elements on each processor are
00706     //only decendants of these blocks, hence recvK[i][j] < recvK[i+][k] for all i,j
00707     //and k.
00708     for (int i = 0; i < myOff; i++) {
00709       nodes[i] = recvK[i];
00710     }
00711     for (int i = myOff; i < myOff + n; i++) {
00712       nodes[i] = allBoundaryLeaves[i - myOff];
00713     }
00714     for (int i = myOff + n; i < nodes.size(); i++) {
00715       nodes[i] = recvK[i - n];
00716     }
00717     recvK.clear();
00718     allBoundaryLeaves.clear();
00719 
00721 
00722     // 8. Now perform seq. balance on the new list of local nodes.
00723     ripple(nodes, incCorner);
00724 
00726     // 9. Discard all nodes that do not belong to the original domain. This can
00727     // easily be obtained from the mins of blocks. 
00728     unsigned int myStIdxInNodes = 0;
00729     unsigned int myEndIdxInNodes = 0;
00730     bool foundStart = false;
00731     bool foundEnd = false;
00732     for(unsigned int i = 0; i < nodes.size(); i++) {
00733       if(nodes[i] >= myFirstBlock) {
00734         myStIdxInNodes = i;
00735         foundStart = true;
00736         break;
00737       }
00738     }
00739 
00740     //We need DLD here since we need to account for descendants of
00741     //myLastBlock as well
00742     for(unsigned int i = nodes.size(); i > 0; i--) {
00743       if(nodes[i-1] <= myLastBlock.getDLD()) {
00744         myEndIdxInNodes = i;
00745         foundEnd = true;
00746         break;
00747       }
00748     }
00749 
00750 #ifdef __DEBUG_OCT__
00751     assert(foundStart && foundEnd);
00752 #endif
00753 
00754     allBoundaryLeaves.insert(allBoundaryLeaves.begin(),
00755         (nodes.begin() + myStIdxInNodes), (nodes.begin() + myEndIdxInNodes));
00756 
00757     nodes.clear();
00758 
00759     //10.Merge (In-place) results from all three stages....
00760     finalMergeInBal(out, allBoundaryLeaves);
00761 
00762     PROF_BAL_END
00763   }//end function
00764 
00765   //Assumption: in is sorted, linear and the morton space between the first and
00766   //last elements is complete.
00767   int comboRipple(std::vector<TreeNode> & in, bool incCorner, const unsigned int maxNum) {
00768     PROF_COMBO_RIPPLE_BEGIN
00769 
00770       if (in.size() < 2) {
00771         PROF_COMBO_RIPPLE_END
00772       }
00773 
00774     std::vector<ot::TreeNode> blocks;
00775     std::vector<ot::TreeNode> out;
00776     std::vector<ot::TreeNode> allBoundaryLeaves;
00777 
00778     TreeNode nca = getNCA(in[0],in[in.size()-1]);
00779     nca.addChildren(blocks);
00780 
00781     assert(maxNum > 0);
00782 
00783     if (in.size() > maxNum) {
00784       std::vector<TreeNode> *splitInp = NULL;
00785 
00786       if(!blocks.empty()) {
00787         splitInp = new std::vector<TreeNode> [blocks.size()];
00788       }
00789 
00790       unsigned int nextPt = 0;
00791       unsigned int nextNode = 0;
00792       //All elements of inp are inside some element in blocks.
00793       while (nextPt < in.size()) {
00794         //The first pt must be inside some block.
00795 #ifdef __DEBUG_OCT__
00796         assert(nextNode < blocks.size());
00797         assert(areComparable(blocks[nextNode], in[nextPt]));
00798 #endif
00799         if ((blocks[nextNode].isAncestor(in[nextPt])) ||
00800             (blocks[nextNode] == in[nextPt])) {
00801           splitInp[nextNode].push_back(in[nextPt]);
00802           nextPt++;
00803         } else {
00804           nextNode++;
00805           if (nextNode == blocks.size()) {
00806             assert(false);
00807           }
00808         }
00809       }//end while
00810       for (unsigned int i = 0; i < blocks.size(); i++) {
00811         comboRipple(splitInp[i], incCorner, maxNum);
00812       }//end for i
00813 
00814       unsigned int allBoundarySz = 0;
00815       unsigned int blockOutSize = 0;
00816 
00817       std::vector<TreeNode> *blockBoundaries = NULL;
00818 
00819       if(!blocks.empty()) {
00820         blockBoundaries = new std::vector<TreeNode>[blocks.size()];
00821       }
00822 
00823       unsigned int numEmptyBlocks=0;
00824       for (unsigned int bi = 0; bi < blocks.size(); bi++) {
00825         if (splitInp[bi].empty()) {
00826           continue; 
00827         }
00828         blockOutSize += splitInp[bi].size();
00829         if (splitInp[bi].size() > 1) {
00830           blocks[bi].pickInternalBoundaryCells(splitInp[bi], blockBoundaries[bi]);
00831         }
00832         if (blockBoundaries[bi].empty()) {
00833           //the case where splitInp[bi] = blocks[bi].
00834           blockBoundaries[bi].push_back(blocks[bi]);  
00835           numEmptyBlocks++;
00836         }
00837         allBoundarySz += blockBoundaries[bi].size();
00838       }//end for bi
00839 
00840       //Concatenate the lists into out and allBoundaryLeaves
00841       out.resize(blockOutSize);
00842       allBoundaryLeaves.resize(allBoundarySz);
00843       unsigned int nodeCtr = 0;
00844       unsigned int boundaryCtr = 0;
00845       for (unsigned int bi = 0; bi < blocks.size(); bi++) {
00846         for (unsigned int bj = 0; bj < splitInp[bi].size(); bj++) {
00847           out[nodeCtr] = splitInp[bi][bj];
00848           nodeCtr++;
00849         }
00850         for (unsigned int bj = 0; bj < blockBoundaries[bi].size(); bj++) {
00851           allBoundaryLeaves[boundaryCtr] = blockBoundaries[bi][bj];
00852           boundaryCtr++;
00853         }
00854       }//end for bi
00855       delete [] splitInp;
00856       splitInp = NULL;
00857 
00858       delete [] blockBoundaries;
00859       blockBoundaries = NULL;
00860     } else {
00861       balanceBlocks (in, blocks, out, allBoundaryLeaves, incCorner); 
00862     }
00863 
00864     in.clear();
00865 
00866     //Inter-Block Balance
00867     ripple(allBoundaryLeaves, incCorner);
00868 
00869     //Merge (in-place) results from the two stages....
00870     in.resize(out.size() + allBoundaryLeaves.size());
00871     unsigned int tmpLsz=0;
00872     unsigned int bndCnt=0;
00873     for (unsigned int i =0;i<out.size();i++) {
00874       if ( bndCnt < allBoundaryLeaves.size() ) {
00875         if ( out[i] == allBoundaryLeaves[bndCnt] ) {
00876           in[tmpLsz++] = out[i];
00877           bndCnt++;
00878         } else if (out[i] < allBoundaryLeaves[bndCnt] ) {
00879 #ifdef __DEBUG_OCT__
00880           assert(areComparable(out[i], allBoundaryLeaves[bndCnt]));
00881 #endif
00882           if (out[i].isAncestor(allBoundaryLeaves[bndCnt]) ) {
00883             while ( (bndCnt < allBoundaryLeaves.size()) && 
00884                 out[i].isAncestor(allBoundaryLeaves[bndCnt]) ) {
00885               in[tmpLsz++] = allBoundaryLeaves[bndCnt++];
00886 #ifdef __DEBUG_OCT__
00887               if(bndCnt < allBoundaryLeaves.size()) {
00888                 assert(areComparable(out[i], allBoundaryLeaves[bndCnt]));
00889               }
00890 #endif
00891             }
00892           } else {
00893             in[tmpLsz++] = out[i];
00894           }
00895         } else {
00896           // nodes[i] > allBdy .. so insert 
00897           in[tmpLsz++] = allBoundaryLeaves[bndCnt++];
00898         }
00899       } else {
00900         in[tmpLsz++] = out[i];
00901       }
00902     }//end for i
00903 
00904     in.resize(tmpLsz);
00905     out.clear();
00906 
00907     //There is no need to sort, the lists are already sorted.
00908     PROF_COMBO_RIPPLE_END
00909   }//end function
00910 
00911   //Assumption: All elements of inp are inside some unique element in blocks.
00912   int balanceBlocks (const std::vector<TreeNode> &inp,
00913       const std::vector<TreeNode> &blocks, std::vector<TreeNode> &nodes,
00914       std::vector<TreeNode> &allBoundaryLeaves, bool incCorner, 
00915       std::vector<unsigned int> *maxBlockBndVec ) {
00916     PROF_CON_BAL_BEGIN
00917 
00918       if (inp.empty()) {
00919         nodes.clear();
00920         allBoundaryLeaves.clear();
00921         if (maxBlockBndVec != NULL) {
00922           maxBlockBndVec->clear();
00923         }
00924         PROF_CON_BAL_END
00925       }
00926 
00927     unsigned int nextPt;
00928     unsigned int nextNode;
00929 
00930     assert(!blocks.empty());
00931     std::vector<TreeNode> *blockOut = new std::vector<TreeNode> [blocks.size()];
00932     std::vector<TreeNode> *splitInp = new std::vector<TreeNode> [blocks.size()];
00933 
00934     nextPt =0;
00935     nextNode=0;
00936     //All elements of inp are inside some element in blocks.
00937     while (nextPt<inp.size()) {
00938       //The first pt must be inside some block.
00939 #ifdef __DEBUG_OCT__
00940       assert(areComparable(blocks[nextNode], inp[nextPt]));
00941 #endif
00942       if ((blocks[nextNode].isAncestor(inp[nextPt])) ||
00943           (blocks[nextNode] == inp[nextPt])) {
00944         splitInp[nextNode].push_back(inp[nextPt]);
00945         nextPt++;
00946       } else {
00947         nextNode++;
00948         if (nextNode == blocks.size()) {
00949           assert(false);
00950         }
00951       }
00952     }//end while
00953 
00954     //Create Local Trees
00955     for (unsigned int bi=0;bi<blocks.size();bi++) {
00956       //This also sorts and makes the vector unique inside.
00957       blocks[bi].balanceSubtree(splitInp[bi],blockOut[bi],incCorner,true);
00958 
00959       splitInp[bi].clear();
00960       //This tackles the case where blocks[bi] has no decendants.
00961       if (blockOut[bi].empty()) {
00962         blockOut[bi].push_back(blocks[bi]);
00963       }
00964     }//end for bi
00965 
00966     delete [] splitInp; 
00967     splitInp = NULL;
00968 
00969     unsigned int allBoundarySz = 0;
00970     unsigned int blockOutSize = 0;
00971 
00972     std::vector<TreeNode> *blockBoundaries = 
00973       new std::vector<TreeNode>[blocks.size()];
00974 
00975     unsigned int numEmptyBlocks=0;
00976     for (unsigned int bi = 0; bi < blocks.size(); bi++) {
00977       blockOutSize += blockOut[bi].size();
00978       if (blockOut[bi].size() > 1) {
00979         blocks[bi].pickInternalBoundaryCells(blockOut[bi],blockBoundaries[bi]);
00980       }
00981       if (blockBoundaries[bi].empty()) {
00982         //the case where blockOut[bi] = blocks[bi].
00983         blockBoundaries[bi].push_back(blocks[bi]);  
00984         numEmptyBlocks++;
00985       }
00986       allBoundarySz += blockBoundaries[bi].size();
00987     }//end for bi
00988 
00989     //Concatenate the lists into nodes and allBoundaryLeaves
00990     nodes.resize(blockOutSize);
00991     allBoundaryLeaves.resize(allBoundarySz);
00992     unsigned int nodeCtr = 0;
00993     unsigned int boundaryCtr = 0;
00994 
00995     if (maxBlockBndVec != NULL) {
00996       maxBlockBndVec->resize(blocks.size());
00997       unsigned int maxDepth;
00998       if (!blocks.empty()) {
00999         maxDepth = blocks[0].getMaxDepth();  
01000       }
01001       for (unsigned int bi = 0; bi < blocks.size(); bi++) {
01002         (*maxBlockBndVec)[bi] = 0;
01003         for (unsigned int bj = 0; bj < blockOut[bi].size(); bj++) {
01004           nodes[nodeCtr] = blockOut[bi][bj];
01005           nodeCtr++;
01006         }//end for bj
01007         for (unsigned int bj = 0; bj < blockBoundaries[bi].size(); bj++) {
01008           unsigned int len = (1u<<(maxDepth - (blockBoundaries[bi][bj].getLevel())));
01009           if (len > ((*maxBlockBndVec)[bi])) {
01010             (*maxBlockBndVec)[bi] = len;
01011           }
01012           allBoundaryLeaves[boundaryCtr] = blockBoundaries[bi][bj];
01013           boundaryCtr++;
01014         }//end for bj
01015       }//end for bi
01016     } else {
01017       for (unsigned int bi = 0; bi < blocks.size(); bi++) {
01018         for (unsigned int bj = 0; bj < blockOut[bi].size(); bj++) {
01019           nodes[nodeCtr] = blockOut[bi][bj];
01020           nodeCtr++;
01021         }//end for bj
01022         for (unsigned int bj = 0; bj < blockBoundaries[bi].size(); bj++) {
01023           allBoundaryLeaves[boundaryCtr] = blockBoundaries[bi][bj];
01024           boundaryCtr++;
01025         }//end for bj
01026       }//end for bi
01027     }
01028 
01029     delete [] blockOut;
01030     blockOut = NULL;
01031 
01032     delete [] blockBoundaries;
01033     blockBoundaries = NULL;
01034 
01035     PROF_CON_BAL_END
01036   }//end function
01037 
01038   /*
01039   //New implementation of the ripple algorithm. Implemented on Dec 22, 2007.
01040   int ripple(std::vector<TreeNode> & nodes, bool incCorners) {
01041   PROF_RIPPLE_BAL_BEGIN 
01042 
01043   if (!nodes.size()) {
01044   PROF_RIPPLE_BAL_END 
01045   }
01046 
01047   unsigned int dim = nodes[0].getDim();
01048   unsigned int maxDepth = nodes[0].getMaxDepth();
01049   TreeNode root(dim,maxDepth);
01050 
01051   unsigned int maxLev = 1;
01052   for (unsigned int i=0;i<nodes.size();i++) {
01053   if (nodes[i].getLevel() > maxLev) {
01054   maxLev = nodes[i].getLevel();
01055   }
01056   }//end for 
01057 
01058   for (unsigned int lev = maxLev; lev > 2; lev--) {
01059   unsigned int mLev = maxDepth;
01060   for (unsigned int i=0;i<nodes.size();i++) {
01061   if (nodes[i].getLevel() < mLev) {
01062   mLev = nodes[i].getLevel();
01063   }
01064   }//end for 
01065 
01066   if (mLev >= (lev-1)) {
01067 //Difference between min and max levels is less than 2 .
01068 break;
01069 }
01070 
01071 std::vector<TreeNode> wList;      
01072 unsigned int wLen = 0;
01073 wList.resize(nodes.size());
01074 //Pick leaves at a particular level
01075 for (int i =0;i<nodes.size();i++) {
01076 if (nodes[i].getLevel() == lev) {
01077 wList[wLen++] = nodes[i];
01078 }
01079 }//end for i
01080 wList.resize(wLen);
01081 
01082 std::vector<std::vector<TreeNode> > tList(wLen);
01083 for (unsigned int i=0; i < wLen; i++) {
01084 tList[i] = wList[i].getSearchKeys(incCorners);        
01085 }//end for i
01086 wList.clear();
01087 
01088 std::vector<TreeNode> allKeys;
01089 for (unsigned int i=0; i < wLen; i++) {
01090 for(unsigned int j=0; j < tList[i].size(); j++) {
01091 if(tList[i][j] > root) {
01092 allKeys.push_back(tList[i][j]);
01093 }
01094 }
01095 }    
01096 tList.clear();
01097 
01098 par::makeVectorUnique<ot::TreeNode>(allKeys, false);
01099 unsigned int keyLen = allKeys.size();
01100 
01101 std::vector<std::vector<TreeNode> > seedList(nodes.size());
01102 unsigned int lastIdx = 0;
01103 for (int i = 0; i < keyLen; i++) {
01104 unsigned int idx;
01105 //assumes nodes is sorted and unique.          
01106 bool flag = par::maxLowerBound<TreeNode>(nodes, allKeys[i], idx, &lastIdx, NULL);
01107 if (flag) {
01108 lastIdx = idx;
01109 if (nodes[idx].isAncestor(allKeys[i])) {
01110   if (lev > (nodes[idx].getLevel() + 1 )) {
01111     seedList[idx].push_back(allKeys[i].getAncestor(lev-1));                
01112   }//end if correct result
01113 }
01114 } //end if flag
01115 }//end for i   
01116 allKeys.clear();
01117 
01118 //Include the new octants in the octree
01119 //while preserving linearity and sorted order
01120 
01121 //Seedlist may have duplicates. Seedlist 
01122 // will be sorted.
01123 
01124 std::vector<TreeNode> tmpList;
01125 std::vector<std::vector<TreeNode> >allInternalLeaves(nodes.size());
01126 unsigned int tmpSz = 0;
01127 for (unsigned int i=0;i<nodes.size();i++) {                
01128   par::makeVectorUnique<TreeNode>(seedList[i],true);
01129   if (!seedList[i].empty()) {
01130     nodes[i].completeSubtree(seedList[i], allInternalLeaves[i]);
01131   }
01132   tmpSz += allInternalLeaves[i].size();
01133   if (allInternalLeaves[i].empty()) {
01134     tmpSz++;
01135   }
01136   seedList[i].clear();
01137 }//end for i
01138 
01139 seedList.clear();
01140 tmpList.resize(tmpSz);
01141 tmpSz=0;
01142 for (unsigned int i=0;i<nodes.size();i++) {
01143   if (!allInternalLeaves[i].empty()) {
01144     for (int k=0;k<allInternalLeaves[i].size();k++) {
01145       tmpList[tmpSz++] = allInternalLeaves[i][k];
01146     }//end for k
01147     allInternalLeaves[i].clear();
01148   } else {
01149     tmpList[tmpSz++] = nodes[i];
01150   }
01151 }//end for i
01152 allInternalLeaves.clear();
01153 nodes = tmpList;
01154 tmpList.clear();
01155 }//end for lev
01156 
01157 PROF_RIPPLE_BAL_END 
01158 }//end function
01159 */
01160 
01161 //Original implementation of the ripple algorithm
01162 int ripple(std::vector<TreeNode> & nodes, bool incCorners) {
01163   PROF_RIPPLE_BAL_BEGIN 
01164 
01165     if (!nodes.size()) {
01166       PROF_RIPPLE_BAL_END 
01167     }
01168 
01169   unsigned int dim = nodes[0].getDim();
01170   unsigned int maxDepth = nodes[0].getMaxDepth();
01171   TreeNode root(dim,maxDepth);
01172 
01173   unsigned int maxLev = 1;
01174   for (unsigned int i=0;i<nodes.size();i++) {
01175     if (nodes[i].getLevel() > maxLev) {
01176       maxLev = nodes[i].getLevel();
01177     }
01178   }//end for 
01179 
01180   for (unsigned int lev = maxLev; lev > 2; lev--) {
01181     unsigned int mLev = maxDepth;
01182     for (unsigned int i=0;i<nodes.size();i++) {
01183       if (nodes[i].getLevel() < mLev) {
01184         mLev = nodes[i].getLevel();
01185       }
01186     }//end for 
01187 
01188     if (mLev >= (lev-1)) {
01189       //Difference between min and max levels is less than 2 .
01190       break;
01191     }
01192 
01193     std::vector<TreeNode> wList;
01194     std::vector<std::vector<TreeNode> > seedList(nodes.size());
01195     unsigned int wLen = 0;
01196     wList.resize(nodes.size());
01197 
01198     //Pick leaves at a particular level
01199     for (int i =0;i<nodes.size();i++) {
01200       if (nodes[i].getLevel() == lev) {
01201         wList[wLen++] = nodes[i];
01202       }
01203     }//end for i
01204     wList.resize(wLen);
01205 
01206     for (unsigned int i=0; i < wList.size(); i++) {
01207       std::vector<TreeNode> tList = wList[i].getSearchKeys(incCorners);
01208       for (int j=0; j<tList.size(); j++) {
01209         unsigned int idx;
01210         //assumes nodes is sorted and unique.          
01211         bool flag = seq::maxLowerBound<TreeNode>(nodes, tList[j], idx,NULL,NULL);
01212         if (flag) {
01213 #ifdef __DEBUG_OCT__
01214           assert(areComparable(nodes[idx], tList[j]));
01215 #endif
01216           if (nodes[idx].isAncestor(tList[j])) {
01217             if (wList[i].getLevel() > (nodes[idx].getLevel() + 1 )) {
01218               nodes[idx].addBalancingDescendants(wList[i],
01219                   seedList[idx], incCorners);
01220             }//end if failing balance
01221           }//end if valid result
01222         } //end if flag
01223       }//end for j
01224       tList.clear();
01225     }//end for i
01226 
01227     wList.clear();
01228     std::vector<TreeNode> tmpList;
01229     std::vector<std::vector<TreeNode> >allInternalLeaves(nodes.size());
01230     unsigned int tmpSz = 0;
01231     for (unsigned int i=0;i<nodes.size();i++) {
01232       seq::makeVectorUnique<TreeNode>(seedList[i],false);
01233       if (!seedList[i].empty()) {
01234         if (seedList[i][0] == root) {
01235           seedList[i].erase(seedList[i].begin());
01236         }
01237       }
01238       if (!seedList[i].empty()) {
01239         nodes[i].completeSubtree(seedList[i], allInternalLeaves[i]);
01240       }
01241       tmpSz += allInternalLeaves[i].size();
01242       if (allInternalLeaves[i].empty()) {
01243         tmpSz++;
01244       }
01245       seedList[i].clear();
01246     }//end for i
01247     seedList.clear();
01248     tmpList.resize(tmpSz);
01249     tmpSz=0;
01250     for (unsigned int i=0;i<nodes.size();i++) {
01251       if (!allInternalLeaves[i].empty()) {
01252         for (int k=0;k<allInternalLeaves[i].size();k++) {
01253           tmpList[tmpSz++] = allInternalLeaves[i][k];
01254         }//end for k
01255         allInternalLeaves[i].clear();
01256       } else {
01257         tmpList[tmpSz++] = nodes[i];
01258       }
01259     }//end for i
01260     allInternalLeaves.clear();
01261     nodes = tmpList;
01262     tmpList.clear();
01263   }//end for lev
01264 
01265   PROF_RIPPLE_BAL_END 
01266 }//end function
01267 
01268 
01269 int pointerBasedRipple(std::vector<ot::TreeNode> & nodes, bool incCorners) {
01270   PROF_PTR_RIPPLE_BAL_BEGIN
01271 
01272     if (!nodes.size()) {
01273       PROF_PTR_RIPPLE_BAL_END 
01274     }
01275 
01276   unsigned int dim = nodes[0].getDim();
01277   unsigned int maxDepth = nodes[0].getMaxDepth();
01278   TreeNode root(dim,maxDepth);
01279 
01280   unsigned int maxLev = 1;
01281   for (unsigned int i = 0; i < nodes.size(); i++) {
01282     if (nodes[i].getLevel() > maxLev) {
01283       maxLev = nodes[i].getLevel();
01284     }
01285   }//end for i
01286 
01287   TreeNodePointer ptrOctree;
01288 
01289   convertLinearToPointer(nodes, ptrOctree);
01290 
01291   for(unsigned int lev = maxLev; lev > 2; lev--) {
01292 
01293     //Traverse the tree, select octants at level = lev and process them
01294 
01295     std::vector<ot::TreeNode> wList;
01296 
01297     appendOctantsAtLevel(ptrOctree, wList, lev);
01298 
01299     for(unsigned int i = 0; i < wList.size(); i++) {
01300       std::vector<TreeNode> tList = wList[i].getSearchKeys(incCorners);
01301       for(int j = 0; j < tList.size(); j++) {
01302         TreeNodePointer* searchResult = NULL;
01303         findOctantOrFinestAncestor(ptrOctree, tList[j], searchResult);
01304 #ifdef __DEBUG_OCT__
01305         assert(searchResult);
01306         assert( ((searchResult->m_tnMe).isAncestor(tList[j]))
01307             || ((searchResult->m_tnMe) == tList[j]) );
01308         assert( (searchResult->m_tnpMyChildren) == NULL );
01309 #endif
01310         //Check balance constraint
01311         if( ((searchResult->m_tnMe).getLevel()) < (lev - 1) ) {
01312           addOctantToTreeNodePointer((*searchResult),
01313               (tList[j].getAncestor((lev - 1))));
01314         }
01315       }//end for j
01316       tList.clear();
01317     }//end for i
01318 
01319     wList.clear();
01320 
01321   }//end for lev
01322 
01323   std::vector<ot::TreeNode> linOct;
01324 
01325   convertPointerToLinear(linOct, ptrOctree);
01326 
01327   //Select only those octants from linOct, which are either present in nodes or
01328   //are decendants of octants in nodes.
01329   //assumes nodes is sorted, linear and unique.          
01330   std::vector<ot::TreeNode> finalOct;
01331   unsigned int lCtr = 0;
01332   for(unsigned int i = 0; i < nodes.size(); i++) {
01333     while( (lCtr < linOct.size()) && (linOct[lCtr] < nodes[i]) ) {
01334       lCtr++;
01335     }
01336     if(lCtr < linOct.size()) {
01337       if(linOct[lCtr] == nodes[i]) {
01338         finalOct.push_back(nodes[i]);
01339       } else {
01340 #ifdef __DEBUG_OCT__
01341         assert(nodes[i].isAncestor(linOct[lCtr]));
01342 #endif
01343         while( (lCtr < linOct.size()) &&
01344             (nodes[i].isAncestor(linOct[lCtr])) ) {
01345           finalOct.push_back(linOct[lCtr]);
01346           lCtr++;
01347         }
01348       }
01349     } else {
01350       //Can't exhaust linOct before exhausting nodes
01351       assert(false);
01352     }
01353   }
01354 
01355   nodes = finalOct;
01356 
01357   finalOct.clear();
01358   linOct.clear();
01359 
01360   deleteTreeNodePointer(ptrOctree);
01361 
01362   PROF_PTR_RIPPLE_BAL_END
01363 }//end function
01364 
01365 int parallelRippleType3(std::vector<TreeNode> & nodes,
01366     bool incCorners, bool checkBailOut, bool rePart,
01367     unsigned int dim, unsigned int maxDepth, MPI_Comm comm)
01368 {
01369   PROF_PAR_RIPPLE_TYPE3_BEGIN 
01370 
01371     TreeNode root(dim,maxDepth);
01372 
01373   unsigned int maxLev = 1;
01374   for(unsigned int i=0; i<nodes.size(); i++) {
01375     if (nodes[i].getLevel() > maxLev) {
01376       maxLev = nodes[i].getLevel();
01377     }
01378   }//end for 
01379 
01380   int rank,npes;
01381   MPI_Comm_rank(comm,&rank);
01382   MPI_Comm_size(comm,&npes);
01383 
01384 #ifdef __DEBUG_OCT__
01385   MPI_Barrier(comm);    
01386   if(!rank) {
01387     std::cout<<"Computed local maxLev."<<std::endl;
01388   }
01389   MPI_Barrier(comm);    
01390 #endif
01391 
01392   unsigned int globalMaxLev = maxLev;
01393   par::Mpi_Allreduce<unsigned int>(&maxLev, &globalMaxLev, 1, MPI_MAX, comm);
01394 
01395 #ifdef __DEBUG_OCT__
01396   MPI_Barrier(comm);    
01397   if(!rank) {
01398     std::cout<<"Global maxLev: "<<globalMaxLev<<std::endl;
01399   }
01400   MPI_Barrier(comm);    
01401 #endif
01402 
01403   for (unsigned int lev = globalMaxLev; lev > 2; lev--)
01404   {
01405 #ifdef __DEBUG_OCT__
01406     MPI_Barrier(comm);  
01407     if(!rank) {
01408       std::cout<<"Lev: "<<lev<<std::endl;
01409     }
01410     MPI_Barrier(comm);  
01411 #endif
01412     if(checkBailOut) {
01413       unsigned int minLev = maxDepth;
01414       for (unsigned int i=0; i<nodes.size(); i++) {
01415         if (nodes[i].getLevel() < minLev) {
01416           minLev = nodes[i].getLevel();
01417         }
01418       }//end for i
01419 
01420       unsigned int globalMinLev = minLev;
01421       par::Mpi_Allreduce<unsigned int>(&minLev, &globalMinLev, 1, MPI_MIN, comm);
01422 
01423       if (globalMinLev >= (lev-1)) {
01424         //Difference between min and 
01425         //max levels is less than 2 .
01426         break;
01427       }
01428     }//end if check to bail out
01429 
01430 #ifdef __DEBUG_OCT__
01431     MPI_Barrier(comm);  
01432     if(!rank) {
01433       std::cout<<"Passed Step 1."<<std::endl;
01434     }
01435     MPI_Barrier(comm);  
01436 #endif
01437 
01438     std::vector<TreeNode> wList;
01439     unsigned int wLen = 0;
01440     wList.resize(nodes.size());
01441     //Pick leaves at a particular level
01442     for (int i=0; i<nodes.size(); i++) {
01443       if (nodes[i].getLevel() == lev) {
01444         wList[wLen] = nodes[i];
01445         wLen++;
01446       }
01447     }//end for i
01448     wList.resize(wLen);
01449 
01450 #ifdef __DEBUG_OCT__
01451     MPI_Barrier(comm);  
01452     if(!rank) {
01453       std::cout<<"Passed Step 2."<<std::endl;
01454     }
01455     MPI_Barrier(comm);  
01456 #endif
01457 
01458     std::vector<std::vector<TreeNode> >tList(wLen);
01459     for (unsigned int i=0; i < wLen; i++) {
01460       tList[i] = wList[i].getSearchKeys(incCorners);        
01461     }//end for i
01462 
01463     wList.clear();
01464 
01465 #ifdef __DEBUG_OCT__
01466     MPI_Barrier(comm);  
01467     if(!rank) {
01468       std::cout<<"Passed Step 3."<<std::endl;
01469     }
01470     MPI_Barrier(comm);  
01471 #endif
01472 
01473     std::vector<TreeNode> allKeys;
01474     for (unsigned int i=0; i < wLen; i++) {
01475       for(unsigned int j=0; j < tList[i].size(); j++) {
01476         if(tList[i][j] > root) {
01477           allKeys.push_back(tList[i][j]);
01478         }
01479       }
01480     }    
01481     tList.clear();
01482 
01483 #ifdef __DEBUG_OCT__
01484     MPI_Barrier(comm);  
01485     if(!rank) {
01486       std::cout<<"Passed Step 4."<<std::endl;
01487     }
01488     std::cout<<rank<<": allKeys.size(): "
01489       <<allKeys.size()<<std::endl;
01490     MPI_Barrier(comm);  
01491 #endif
01492 
01493     //
01494     //Parallel searches and subsequent processing      
01495     //1. Compute the ranges controlled by each processor
01496     //2. Compare the keys with the ranges and 
01497     //send the keys to the appropriate processors.
01498     //3. Perform local searches using the keys
01499     //recieved. 
01500     //4. Compute the balancing descendants for the results
01501     //and create the 'seedList' vector. Note,
01502     //we process one level at a time. So all the keys
01503     //were generated by octants at the same level. The 
01504     //corresponding balancing descendants are simply the
01505     //ancestors of the keys at one level lower than this
01506     //level.
01507     //
01508 
01509     seq::makeVectorUnique<TreeNode>(allKeys,false);     
01510 
01511     unsigned int keyLen = allKeys.size();
01512 
01513 
01514     //First Get the mins from each processor.
01515 
01516     // allocate memory for the mins array
01517     std::vector<ot::TreeNode> mins (npes); 
01518 
01519     ot::TreeNode sendMin;
01520     if(!nodes.empty()) {
01521       sendMin = nodes[0]; //local min
01522     }else {
01523       sendMin = root;
01524     }
01525 
01526     par::Mpi_Allgather<ot::TreeNode>(&sendMin, &(*mins.begin()), 1, comm);    
01527 
01528 #ifdef __DEBUG_OCT__
01529     MPI_Barrier(comm);  
01530     if(!rank) {
01531       std::cout<<"Passed Step 5."<<std::endl;
01532     }
01533     MPI_Barrier(comm);  
01534 #endif
01535 
01536     //Now determine the processors which own these keys.
01537 
01538     unsigned int *partKeys = NULL;
01539 
01540     if(keyLen) {
01541       partKeys = new unsigned int[keyLen];    
01542     }
01543 
01544     for (unsigned int i=0; i<keyLen; i++) {
01545       unsigned int idx;
01546       //maxLB returns the last index in a 
01547       //sorted array such that a[ind] <= key 
01548       //and  a[index +1] > key
01549       bool found = seq::maxLowerBound<TreeNode >(mins,
01550           allKeys[i], idx, NULL, NULL);
01551       if (!found ) {
01552         //Can happen on incomplete domains
01553         partKeys[i] = rank;
01554       } else {
01555         partKeys[i] = idx;
01556       }
01557     }//end for i
01558 
01559     mins.clear();
01560 
01561 #ifdef __DEBUG_OCT__
01562     MPI_Barrier(comm);  
01563     if(!rank) {
01564       std::cout<<"Passed Step 6."<<std::endl;
01565     }
01566     MPI_Barrier(comm);  
01567 #endif
01568 
01569     int *numKeysSend = new int[npes];
01570     int *numKeysRecv = new int[npes];    
01571 
01572     for (int i=0; i<npes; i++) {
01573       numKeysSend[i] = 0;
01574     }
01575 
01576     // calculate the number of keys to send ...
01577     //and create the send buffer.
01578     std::vector<std::vector<unsigned int> >sendKtmp(npes);
01579 
01580     for (unsigned int i=0; i<keyLen; i++) {
01581       numKeysSend[partKeys[i]]++;
01582       sendKtmp[partKeys[i]].push_back(i);      
01583     }
01584 
01585     // Now do an All2All to get numKeysRecv
01586 
01587     par::Mpi_Alltoall<int>(numKeysSend, numKeysRecv, 1, comm);
01588 
01589     // Pre-processing for sending
01590 
01591     int *sendOffsets = new int[npes];
01592     sendOffsets[0] = 0;
01593     int *recvOffsets = new int[npes];
01594     recvOffsets[0] = 0;
01595 
01596     // compute offsets ...
01597 
01598     for (int i = 1; i < npes; i++) {
01599       sendOffsets[i] = sendOffsets[i-1] + 
01600         numKeysSend[i-1];
01601       recvOffsets[i] = recvOffsets[i-1] +
01602         numKeysRecv[i-1];
01603     }
01604 
01605     std::vector<ot::TreeNode> sendK(sendOffsets[npes-1] + numKeysSend[npes-1]);
01606 
01607     for(unsigned int i = 0; i < npes; i++) {
01608 #ifdef __DEBUG_OCT__
01609       assert(sendKtmp[i].size() == numKeysSend[i]);
01610 #endif
01611       for(unsigned int j = 0; j < numKeysSend[i]; j++) {
01612         sendK[sendOffsets[i] + j] = allKeys[sendKtmp[i][j]];
01613       }//end for j
01614     }//end for i
01615 
01616     if(partKeys) {
01617       delete [] partKeys;
01618       partKeys = NULL;
01619     }
01620 
01621     allKeys.clear();
01622     sendKtmp.clear();
01623 
01624     std::vector<ot::TreeNode> recvK(recvOffsets[npes-1] + numKeysRecv[npes-1]);
01625 
01626     ot::TreeNode* sendKptr = NULL;
01627     ot::TreeNode* recvKptr = NULL;
01628     if(!sendK.empty()) {
01629       sendKptr = &(*(sendK.begin()));
01630     }
01631     if(!recvK.empty()) {
01632       recvKptr = &(*(recvK.begin()));
01633     }
01634     par::Mpi_Alltoallv_sparse<ot::TreeNode>( sendKptr, numKeysSend, sendOffsets,        
01635         recvKptr, numKeysRecv, recvOffsets, comm);
01636 
01637 
01638     sendK.clear();
01639 
01640     delete [] sendOffsets;
01641     sendOffsets = NULL;
01642 
01643     delete [] recvOffsets;
01644     recvOffsets = NULL;
01645 
01646     delete [] numKeysSend;
01647     numKeysSend = NULL;
01648 
01649     delete [] numKeysRecv;
01650     numKeysRecv = NULL;
01651 
01652     seq::makeVectorUnique<TreeNode>(recvK,false);       
01653 
01654     keyLen = recvK.size();
01655 
01656 #ifdef __DEBUG_OCT__
01657     MPI_Barrier(comm);  
01658     if(!rank) {
01659       std::cout<<"Passed Step 7."<<std::endl;
01660     }
01661     MPI_Barrier(comm);  
01662 #endif
01663 
01664     //Local searches and creating seedList      
01665 
01666     std::vector<std::vector<TreeNode> > seedList(nodes.size());
01667     unsigned int lastIdx = 0;
01668     for(unsigned int i=0; i < keyLen; i++) {
01669       unsigned int idx;
01670       //assumes nodes is sorted and unique.          
01671       bool flag = seq::maxLowerBound<TreeNode>(nodes, recvK[i], idx, &lastIdx,NULL);
01672       if (flag) {
01673         lastIdx = idx;
01674 #ifdef __DEBUG_OCT__
01675         assert(areComparable(nodes[idx], recvK[i]));
01676 #endif
01677         if (nodes[idx].isAncestor(recvK[i])) {
01678           if (lev > (nodes[idx].getLevel() + 1)) {
01679             seedList[idx].push_back(recvK[i].getAncestor(lev-1));                
01680           }
01681         }//end if correct result
01682       }//end if flag        
01683     }//end for i
01684 
01685     recvK.clear();
01686 
01687 #ifdef __DEBUG_OCT__
01688     MPI_Barrier(comm);  
01689     if(!rank) {
01690       std::cout<<"Passed Step 8."<<std::endl;
01691     }
01692     MPI_Barrier(comm);  
01693 #endif
01694 
01695     //Include the new octants in the octree
01696     //while preserving linearity and sorted order
01697 
01698     //Seedlist may have duplicates. Seedlist will be sorted.
01699 
01700     std::vector<TreeNode> tmpList;
01701     std::vector<std::vector<TreeNode> >allInternalLeaves(nodes.size());
01702     unsigned int tmpSz = 0;
01703     for (unsigned int i=0;i<nodes.size();i++) { 
01704       seq::makeVectorUnique<TreeNode>(seedList[i],true);        
01705       if (!seedList[i].empty()) {
01706         nodes[i].completeSubtree(seedList[i], allInternalLeaves[i]);
01707       }
01708       tmpSz += allInternalLeaves[i].size();
01709       if (allInternalLeaves[i].empty()) {
01710         tmpSz++;
01711       }
01712       seedList[i].clear();
01713     }//end for i
01714 
01715 #ifdef __DEBUG_OCT__
01716     MPI_Barrier(comm);  
01717     if(!rank) {
01718       std::cout<<"Passed Step 9."<<std::endl;
01719     }
01720     MPI_Barrier(comm);  
01721 #endif
01722 
01723     seedList.clear();
01724     tmpList.resize(tmpSz);
01725     tmpSz=0;
01726     for (unsigned int i=0;i<nodes.size();i++) {
01727       if (!allInternalLeaves[i].empty()) {
01728         for (int k=0;k<allInternalLeaves[i].size();k++) {
01729           tmpList[tmpSz++] = allInternalLeaves[i][k];
01730         }//end for k
01731         allInternalLeaves[i].clear();
01732       } else {
01733         tmpList[tmpSz++] = nodes[i];
01734       }
01735     }//end for i
01736     allInternalLeaves.clear();
01737     nodes = tmpList;
01738     tmpList.clear();
01739 
01740 #ifdef __DEBUG_OCT__
01741     MPI_Barrier(comm);  
01742     if(!rank) {
01743       std::cout<<"Passed Step 10."<<std::endl;
01744     }
01745     MPI_Barrier(comm);  
01746 #endif
01747 
01748     if(rePart) {
01749       par::partitionW<ot::TreeNode>(nodes, NULL,comm);
01750     }
01751 
01752 #ifdef __DEBUG_OCT__
01753     MPI_Barrier(comm);  
01754     if(!rank) {
01755       std::cout<<"Passed Step 11."<<std::endl;
01756     }
01757     MPI_Barrier(comm);  
01758 #endif
01759 
01760   }//end for lev
01761 
01762   PROF_PAR_RIPPLE_TYPE3_END 
01763 }//end function
01764 
01765 
01766 int parallelRippleType2(std::vector<TreeNode> & nodes,
01767     bool incCorners, bool checkBailOut, bool rePart,
01768     unsigned int dim, unsigned int maxDepth, MPI_Comm comm)
01769 {
01770   PROF_PAR_RIPPLE_TYPE2_BEGIN 
01771 
01772     TreeNode root(dim,maxDepth);
01773 
01774   unsigned int maxLev = 1;
01775   for(unsigned int i=0; i<nodes.size(); i++) {
01776     if (nodes[i].getLevel() > maxLev) {
01777       maxLev = nodes[i].getLevel();
01778     }
01779   }//end for 
01780 
01781   int rank,npes;
01782   MPI_Comm_rank(comm,&rank);
01783   MPI_Comm_size(comm,&npes);
01784 
01785 #ifdef __DEBUG_OCT__
01786   MPI_Barrier(comm);    
01787   if(!rank) {
01788     std::cout<<"Computed local maxLev."<<std::endl;
01789   }
01790   MPI_Barrier(comm);    
01791 #endif
01792 
01793   unsigned int globalMaxLev = maxLev;
01794   par::Mpi_Allreduce<unsigned int>(&maxLev, &globalMaxLev, 1, MPI_MAX, comm);
01795 
01796 #ifdef __DEBUG_OCT__
01797   MPI_Barrier(comm);    
01798   if(!rank) {
01799     std::cout<<"Global maxLev: "<<globalMaxLev<<std::endl;
01800   }
01801   MPI_Barrier(comm);    
01802 #endif
01803 
01804   for (unsigned int lev = globalMaxLev; lev > 2; lev--)
01805   {
01806 #ifdef __DEBUG_OCT__
01807     MPI_Barrier(comm);  
01808     if(!rank) {
01809       std::cout<<"Lev: "<<lev<<std::endl;
01810     }
01811     MPI_Barrier(comm);  
01812 #endif
01813     if(checkBailOut) {
01814       unsigned int minLev = maxDepth;
01815       for (unsigned int i=0; i<nodes.size(); i++) {
01816         if (nodes[i].getLevel() < minLev) {
01817           minLev = nodes[i].getLevel();
01818         }
01819       }//end for i
01820 
01821       unsigned int globalMinLev = minLev;
01822       par::Mpi_Allreduce<unsigned int>(&minLev, &globalMinLev, 1, MPI_MIN, comm);
01823 
01824       if (globalMinLev >= (lev-1)) {
01825         //Difference between min and 
01826         //max levels is less than 2 .
01827         break;
01828       }
01829     }//end if check to bail out
01830 
01831 #ifdef __DEBUG_OCT__
01832     MPI_Barrier(comm);  
01833     if(!rank) {
01834       std::cout<<"Passed Step 1."<<std::endl;
01835     }
01836     MPI_Barrier(comm);  
01837 #endif
01838 
01839     std::vector<TreeNode> wList;
01840     unsigned int wLen = 0;
01841     wList.resize(nodes.size());
01842     //Pick leaves at a particular level
01843     for (int i=0; i<nodes.size(); i++) {
01844       if (nodes[i].getLevel() == lev) {
01845         wList[wLen] = nodes[i];
01846         wLen++;
01847       }
01848     }//end for i
01849     wList.resize(wLen);
01850 
01851 #ifdef __DEBUG_OCT__
01852     MPI_Barrier(comm);  
01853     if(!rank) {
01854       std::cout<<"Passed Step 2."<<std::endl;
01855     }
01856     MPI_Barrier(comm);  
01857 #endif
01858 
01859     std::vector<std::vector<TreeNode> >tList(wLen);
01860     for (unsigned int i=0; i < wLen; i++) {
01861       tList[i] = wList[i].getSearchKeys(incCorners);        
01862     }//end for i
01863 
01864     wList.clear();
01865 
01866 #ifdef __DEBUG_OCT__
01867     MPI_Barrier(comm);  
01868     if(!rank) {
01869       std::cout<<"Passed Step 3."<<std::endl;
01870     }
01871     MPI_Barrier(comm);  
01872 #endif
01873 
01874     std::vector<TreeNode> allKeys;
01875     for (unsigned int i=0; i < wLen; i++) {
01876       for(unsigned int j=0; j < tList[i].size(); j++) {
01877         if(tList[i][j] > root) {
01878           allKeys.push_back(tList[i][j]);
01879         }
01880       }
01881     }    
01882     tList.clear();
01883 
01884 #ifdef __DEBUG_OCT__
01885     MPI_Barrier(comm);  
01886     if(!rank) {
01887       std::cout<<"Passed Step 4."<<std::endl;
01888     }
01889     std::cout<<rank<<": allKeys.size(): "
01890       <<allKeys.size()<<std::endl;
01891     MPI_Barrier(comm);  
01892 #endif
01893 
01894     //
01895     //Parallel searches and subsequent processing      
01896     //1. Compute the ranges controlled by each processor
01897     //2. Compare the keys with the ranges and 
01898     //send the keys to the appropriate processors.
01899     //3. Perform local searches using the keys
01900     //recieved. 
01901     //4. Compute the balancing descendants for the results
01902     //and create the 'seedList' vector. Note,
01903     //we process one level at a time. So all the keys
01904     //were generated by octants at the same level. The 
01905     //corresponding balancing descendants are simply the
01906     //ancestors of the keys at one level lower than this
01907     //level.
01908     //
01909 
01910     unsigned int keyLen = allKeys.size();
01911 
01912 
01913     //First Get the mins from each processor.
01914 
01915     // allocate memory for the mins array
01916     std::vector<ot::TreeNode> mins (npes); 
01917 
01918     ot::TreeNode sendMin;
01919     if(!nodes.empty()) {
01920       sendMin = nodes[0]; //local min
01921     }else {
01922       sendMin = root;
01923     }
01924 
01925     par::Mpi_Allgather<ot::TreeNode>(&sendMin, &(*mins.begin()), 1, comm);    
01926 
01927 #ifdef __DEBUG_OCT__
01928     MPI_Barrier(comm);  
01929     if(!rank) {
01930       std::cout<<"Passed Step 5."<<std::endl;
01931     }
01932     MPI_Barrier(comm);  
01933 #endif
01934 
01935     //Now determine the processors which own these keys.
01936 
01937     unsigned int *partKeys = NULL;
01938 
01939     if(keyLen) {
01940       partKeys = new unsigned int[keyLen];    
01941     }
01942 
01943     for (unsigned int i=0; i<keyLen; i++) {
01944       unsigned int idx;
01945       //maxLB returns the last index in a 
01946       //sorted array such that a[ind] <= key 
01947       //and  a[index +1] > key
01948       bool found = seq::maxLowerBound<TreeNode >(mins,
01949           allKeys[i], idx, NULL, NULL);
01950       if (!found ) {
01951         //Can happen on incomplete domains
01952         partKeys[i] = rank;
01953       } else {
01954         partKeys[i] = idx;
01955       }
01956     }//end for i
01957 
01958     mins.clear();
01959 
01960 #ifdef __DEBUG_OCT__
01961     MPI_Barrier(comm);  
01962     if(!rank) {
01963       std::cout<<"Passed Step 6."<<std::endl;
01964     }
01965     MPI_Barrier(comm);  
01966 #endif
01967 
01968     int *numKeysSend = new int[npes];
01969     int *numKeysRecv = new int[npes];    
01970 
01971     for (int i=0; i<npes; i++) {
01972       numKeysSend[i] = 0;
01973     }
01974 
01975     // calculate the number of keys to send ...
01976     //and create the send buffer.
01977     std::vector<std::vector<unsigned int> >sendKtmp(npes);
01978 
01979     for (unsigned int i=0; i<keyLen; i++) {
01980       numKeysSend[partKeys[i]]++;
01981       sendKtmp[partKeys[i]].push_back(i);      
01982     }
01983 
01984     // Now do an All2All to get numKeysRecv
01985 
01986     par::Mpi_Alltoall<int>(numKeysSend, numKeysRecv, 1, comm);
01987 
01988     // Pre-processing for sending
01989 
01990     int *sendOffsets = new int[npes];
01991     sendOffsets[0] = 0;
01992     int *recvOffsets = new int[npes];
01993     recvOffsets[0] = 0;
01994 
01995     // compute offsets ...
01996 
01997     for (int i = 1; i < npes; i++) {
01998       sendOffsets[i] = sendOffsets[i-1] + 
01999         numKeysSend[i-1];
02000       recvOffsets[i] = recvOffsets[i-1] +
02001         numKeysRecv[i-1];
02002     }
02003 
02004     std::vector<ot::TreeNode> sendK(sendOffsets[npes-1] + numKeysSend[npes-1]);
02005 
02006     for(unsigned int i = 0; i < npes; i++) {
02007 #ifdef __DEBUG_OCT__
02008       assert(sendKtmp[i].size() == numKeysSend[i]);
02009 #endif
02010       for(unsigned int j = 0; j < numKeysSend[i]; j++) {
02011         sendK[sendOffsets[i] + j] = allKeys[sendKtmp[i][j]];
02012       }//end for j
02013     }//end for i
02014 
02015     if(partKeys) {
02016       delete [] partKeys;
02017       partKeys = NULL;
02018     }
02019 
02020     allKeys.clear();
02021     sendKtmp.clear();
02022 
02023     std::vector<ot::TreeNode> recvK(recvOffsets[npes-1] + numKeysRecv[npes-1]);
02024 
02025     ot::TreeNode* sendKptr = NULL;
02026     ot::TreeNode* recvKptr = NULL;
02027     if(!sendK.empty()) {
02028       sendKptr = &(*(sendK.begin()));
02029     }
02030     if(!recvK.empty()) {
02031       recvKptr = &(*(recvK.begin()));
02032     }
02033     par::Mpi_Alltoallv_sparse<ot::TreeNode>(sendKptr, numKeysSend, sendOffsets,        
02034         recvKptr, numKeysRecv,recvOffsets, comm);
02035 
02036     sendK.clear();
02037 
02038     delete [] sendOffsets;
02039     sendOffsets = NULL;
02040 
02041     delete [] recvOffsets;
02042     recvOffsets = NULL;
02043 
02044     delete [] numKeysSend;
02045     numKeysSend = NULL;
02046 
02047     delete [] numKeysRecv;
02048     numKeysRecv = NULL;
02049 
02050     keyLen = recvK.size();
02051 
02052 #ifdef __DEBUG_OCT__
02053     MPI_Barrier(comm);  
02054     if(!rank) {
02055       std::cout<<"Passed Step 7."<<std::endl;
02056     }
02057     MPI_Barrier(comm);  
02058 #endif
02059 
02060     //Local searches and creating seedList      
02061 
02062     std::vector<std::vector<TreeNode> > seedList(nodes.size());
02063     for(unsigned int i=0; i < keyLen; i++) {
02064       unsigned int idx;
02065       //assumes nodes is sorted and unique.          
02066       bool flag = seq::maxLowerBound<TreeNode>(nodes, recvK[i], idx,NULL,NULL);
02067       if (flag) {
02068 #ifdef __DEBUG_OCT__
02069         assert(areComparable(nodes[idx], recvK[i]));
02070 #endif
02071         if (nodes[idx].isAncestor(recvK[i])) {
02072           if (lev > (nodes[idx].getLevel() + 1)) {
02073             seedList[idx].push_back(recvK[i].getAncestor(lev-1));                
02074           }
02075         }//end if correct result
02076       }//end if flag        
02077     }//end for i
02078 
02079     recvK.clear();
02080 
02081 #ifdef __DEBUG_OCT__
02082     MPI_Barrier(comm);  
02083     if(!rank) {
02084       std::cout<<"Passed Step 8."<<std::endl;
02085     }
02086     MPI_Barrier(comm);  
02087 #endif
02088 
02089     //Include the new octants in the octree
02090     //while preserving linearity and sorted order
02091 
02092     //Seedlist may have duplicates. Seedlist 
02093     // may not be sorted.
02094 
02095     std::vector<TreeNode> tmpList;
02096     std::vector<std::vector<TreeNode> >allInternalLeaves(nodes.size());
02097     unsigned int tmpSz = 0;
02098     for (unsigned int i=0;i<nodes.size();i++) { 
02099       seq::makeVectorUnique<TreeNode>(seedList[i],false);       
02100       if (!seedList[i].empty()) {
02101         nodes[i].completeSubtree(seedList[i], allInternalLeaves[i]);
02102       }
02103       tmpSz += allInternalLeaves[i].size();
02104       if (allInternalLeaves[i].empty()) {
02105         tmpSz++;
02106       }
02107       seedList[i].clear();
02108     }//end for i
02109 
02110 #ifdef __DEBUG_OCT__
02111     MPI_Barrier(comm);  
02112     if(!rank) {
02113       std::cout<<"Passed Step 9."<<std::endl;
02114     }
02115     MPI_Barrier(comm);  
02116 #endif
02117 
02118     seedList.clear();
02119     tmpList.resize(tmpSz);
02120     tmpSz=0;
02121     for (unsigned int i=0;i<nodes.size();i++) {
02122       if (!allInternalLeaves[i].empty()) {
02123         for (int k=0;k<allInternalLeaves[i].size();k++) {
02124           tmpList[tmpSz++] = allInternalLeaves[i][k];
02125         }//end for k
02126         allInternalLeaves[i].clear();
02127       } else {
02128         tmpList[tmpSz++] = nodes[i];
02129       }
02130     }//end for i
02131     allInternalLeaves.clear();
02132     nodes = tmpList;
02133     tmpList.clear();
02134 
02135 #ifdef __DEBUG_OCT__
02136     MPI_Barrier(comm);  
02137     if(!rank) {
02138       std::cout<<"Passed Step 10."<<std::endl;
02139     }
02140     MPI_Barrier(comm);  
02141 #endif
02142 
02143     if(rePart) {
02144       par::partitionW<ot::TreeNode>(nodes, NULL,comm);
02145     }
02146 
02147 #ifdef __DEBUG_OCT__
02148     MPI_Barrier(comm);  
02149     if(!rank) {
02150       std::cout<<"Passed Step 11."<<std::endl;
02151     }
02152     MPI_Barrier(comm);  
02153 #endif
02154 
02155   }//end for lev
02156 
02157   PROF_PAR_RIPPLE_TYPE2_END 
02158 }//end function
02159 
02160 int parallelRippleType1(std::vector<TreeNode> & nodes,
02161     bool incCorners, bool checkBailOut, bool rePart,
02162     unsigned int dim, unsigned int maxDepth, MPI_Comm comm)
02163 {
02164   PROF_PAR_RIPPLE_TYPE1_BEGIN 
02165 
02166     TreeNode root(dim,maxDepth);
02167 
02168   unsigned int maxLev = 1;
02169   for(unsigned int i=0; i<nodes.size(); i++) {
02170     if (nodes[i].getLevel() > maxLev) {
02171       maxLev = nodes[i].getLevel();
02172     }
02173   }//end for 
02174 
02175   int rank,npes;
02176   MPI_Comm_rank(comm,&rank);
02177   MPI_Comm_size(comm,&npes);
02178 
02179 #ifdef __DEBUG_OCT__
02180   MPI_Barrier(comm);    
02181   if(!rank) {
02182     std::cout<<"Computed local maxLev."<<std::endl;
02183   }
02184   MPI_Barrier(comm);    
02185 #endif
02186 
02187   unsigned int globalMaxLev = maxLev;
02188   par::Mpi_Allreduce<unsigned int>(&maxLev,&globalMaxLev, 1, MPI_MAX, comm);
02189 
02190 #ifdef __DEBUG_OCT__
02191   MPI_Barrier(comm);    
02192   if(!rank) {
02193     std::cout<<"Global maxLev: "<<globalMaxLev<<std::endl;
02194   }
02195   MPI_Barrier(comm);    
02196 #endif
02197 
02198   for (unsigned int lev = globalMaxLev; lev > 2; lev--)
02199   {
02200 #ifdef __DEBUG_OCT__
02201     MPI_Barrier(comm);  
02202     if(!rank) {
02203       std::cout<<"Lev: "<<lev<<std::endl;
02204     }
02205     MPI_Barrier(comm);  
02206 #endif
02207     if(checkBailOut) {
02208       unsigned int minLev = maxDepth;
02209       for (unsigned int i=0; i<nodes.size(); i++) {
02210         if (nodes[i].getLevel() < minLev) {
02211           minLev = nodes[i].getLevel();
02212         }
02213       }//end for i
02214 
02215       unsigned int globalMinLev = minLev;
02216       par::Mpi_Allreduce<unsigned int>(&minLev, &globalMinLev, 1, MPI_MIN, comm);
02217 
02218       if (globalMinLev >= (lev-1)) {
02219         //Difference between min and 
02220         //max levels is less than 2 .
02221         break;
02222       }
02223     }//end if check to bail out
02224 
02225 #ifdef __DEBUG_OCT__
02226     MPI_Barrier(comm);  
02227     if(!rank) {
02228       std::cout<<"Passed Step 1."<<std::endl;
02229     }
02230     MPI_Barrier(comm);  
02231 #endif
02232 
02233     std::vector<TreeNode> wList;
02234     unsigned int wLen = 0;
02235     wList.resize(nodes.size());
02236     //Pick leaves at a particular level
02237     for (int i=0; i<nodes.size(); i++) {
02238       if (nodes[i].getLevel() == lev) {
02239         wList[wLen] = nodes[i];
02240         wLen++;
02241       }
02242     }//end for i
02243     wList.resize(wLen);
02244 
02245 #ifdef __DEBUG_OCT__
02246     MPI_Barrier(comm);  
02247     if(!rank) {
02248       std::cout<<"Passed Step 2."<<std::endl;
02249     }
02250     MPI_Barrier(comm);  
02251 #endif
02252 
02253     std::vector<std::vector<TreeNode> >tList(wLen);
02254     for (unsigned int i=0; i < wLen; i++) {
02255       tList[i] = wList[i].getSearchKeys(incCorners);        
02256     }//end for i
02257 
02258     wList.clear();
02259 
02260 #ifdef __DEBUG_OCT__
02261     MPI_Barrier(comm);  
02262     if(!rank) {
02263       std::cout<<"Passed Step 3."<<std::endl;
02264     }
02265     MPI_Barrier(comm);  
02266 #endif
02267 
02268     std::vector<TreeNode> allKeys;
02269     for (unsigned int i=0; i < wLen; i++) {
02270       for(unsigned int j=0; j < tList[i].size(); j++) {
02271         if(tList[i][j] > root) {
02272           allKeys.push_back(tList[i][j]);
02273         }
02274       }
02275     }    
02276     tList.clear();
02277 
02278 #ifdef __DEBUG_OCT__
02279     MPI_Barrier(comm);  
02280     if(!rank) {
02281       std::cout<<"Passed Step 4."<<std::endl;
02282     }
02283     std::cout<<rank<<": allKeys.size(): "
02284       <<allKeys.size()<<std::endl;
02285     MPI_Barrier(comm);  
02286 #endif
02287 
02288     //
02289     //Parallel searches and subsequent processing
02290     //1. Make the list of keys unique globally.
02291     //2. Compute the ranges controlled by each processor
02292     //3. Compare the keys with the ranges and 
02293     //send the keys to the appropriate processors.
02294     //4. Perform local searches using the keys
02295     //recieved. Use the fact that the keys are
02296     // sorted to do this efficiently.
02297     //5. Compute the balancing descendants for the results
02298     //and create the 'seedList' vector. Note,
02299     //we process one level at a time. So all the keys
02300     //were generated by octants at the same level. The 
02301     //corresponding balancing descendants are simply the
02302     //ancestors of the keys at one level lower than this
02303     //level.
02304     //
02305 
02306     par::removeDuplicates<ot::TreeNode>(allKeys,
02307         false,comm);
02308     unsigned int keyLen = allKeys.size();
02309 
02310 #ifdef __DEBUG_OCT__
02311     MPI_Barrier(comm);  
02312     if(!rank) {
02313       std::cout<<"Passed Step 5."<<std::endl;
02314     }
02315     MPI_Barrier(comm);  
02316 #endif
02317 
02318     //First Get the mins from each processor.
02319 
02320     // allocate memory for the mins array
02321     std::vector<ot::TreeNode> mins (npes); 
02322 
02323     ot::TreeNode sendMin;
02324     if(!nodes.empty()) {
02325       sendMin = nodes[0]; //local min
02326     }else {
02327       sendMin = root;
02328     }
02329 
02330     par::Mpi_Allgather<ot::TreeNode>(&sendMin, &(*mins.begin()), 1, comm);    
02331 
02332 #ifdef __DEBUG_OCT__
02333     MPI_Barrier(comm);  
02334     if(!rank) {
02335       std::cout<<"Passed Step 6."<<std::endl;
02336     }
02337     MPI_Barrier(comm);  
02338 #endif
02339 
02340     //Now determine the processors which own these keys.
02341 
02342     unsigned int *partKeys = NULL;
02343 
02344     if(keyLen) {
02345       partKeys = new unsigned int[keyLen];    
02346     }
02347 
02348     for (unsigned int i=0; i<keyLen; i++) {
02349       unsigned int idx;
02350       //maxLB returns the last index in a 
02351       //sorted array such that a[ind] <= key 
02352       //and  a[index +1] > key
02353       bool found = seq::maxLowerBound<TreeNode >(mins,
02354           allKeys[i], idx, NULL, NULL);
02355       if (!found ) {
02356         //Can happen on incomplete domains
02357         partKeys[i] = rank;
02358       } else {
02359         partKeys[i] = idx;
02360       }
02361     }//end for i
02362 
02363     mins.clear();
02364 
02365 #ifdef __DEBUG_OCT__
02366     MPI_Barrier(comm);  
02367     if(!rank) {
02368       std::cout<<"Passed Step 7."<<std::endl;
02369     }
02370     MPI_Barrier(comm);  
02371 #endif
02372 
02373     int *numKeysSend = new int[npes];
02374     int *numKeysRecv = new int[npes];    
02375 
02376     for (int i=0; i<npes; i++) {
02377       numKeysSend[i] = 0;
02378     }
02379 
02380     // calculate the number of keys to send ...
02381     for (unsigned int i=0; i<keyLen; i++) {
02382       numKeysSend[partKeys[i]]++;      
02383     }
02384 
02385     // Now do an All2All to get numKeysRecv
02386 
02387     par::Mpi_Alltoall<int>(numKeysSend, numKeysRecv, 1, comm);
02388 
02389     // Pre-processing for sending
02390 
02391     int *sendOffsets = new int[npes];
02392     sendOffsets[0] = 0;
02393     int *recvOffsets = new int[npes];
02394     recvOffsets[0] = 0;
02395 
02396 
02397     // compute offsets ...
02398 
02399     for (int i = 1; i < npes; i++) {
02400       sendOffsets[i] = sendOffsets[i-1] + 
02401         numKeysSend[i-1];
02402       recvOffsets[i] = recvOffsets[i-1] +
02403         numKeysRecv[i-1];
02404     }
02405 
02406     //Since, allKeys is sorted globally in this case
02407     // allKeys must be the same as sendK. So,
02408     //there is no need to create 
02409     //a separate send buffer.
02410 
02411     std::vector<ot::TreeNode> recvK(recvOffsets[npes-1] 
02412         + numKeysRecv[npes-1]);
02413 
02414     if(partKeys) {
02415       delete [] partKeys;
02416       partKeys = NULL;
02417     }
02418 
02419     ot::TreeNode* allKeysPtr = NULL;
02420     ot::TreeNode* recvKptr = NULL;
02421     if(!allKeys.empty()) {
02422       allKeysPtr = &(*(allKeys.begin()));
02423     }
02424     if(!recvK.empty()) {
02425       recvKptr = &(*(recvK.begin()));
02426     }
02427     par::Mpi_Alltoallv_sparse<ot::TreeNode>(allKeysPtr, numKeysSend, sendOffsets, 
02428         recvKptr, numKeysRecv, recvOffsets, comm);
02429 
02430     allKeys.clear();
02431 
02432     delete [] sendOffsets;
02433     sendOffsets = NULL;
02434 
02435     delete [] recvOffsets;
02436     recvOffsets = NULL;
02437 
02438     delete [] numKeysSend;
02439     numKeysSend = NULL;
02440 
02441     delete [] numKeysRecv;
02442     numKeysRecv = NULL;
02443 
02444     keyLen = recvK.size();
02445 
02446 #ifdef __DEBUG_OCT__
02447     MPI_Barrier(comm);  
02448     if(!rank) {
02449       std::cout<<"Passed Step 8."<<std::endl;
02450     }
02451     MPI_Barrier(comm);  
02452 #endif
02453 
02454     //Local searches and creating seedList
02455     //recvK will be sorted. 
02456     //since, recvK is sorted, we can pass the previous
02457     // result as a lower bound for subsequent 
02458     //searches instead of NULL for everything. This
02459     //should reduce the constant a bit.
02460 
02461     std::vector<std::vector<TreeNode> > seedList(nodes.size());
02462     unsigned int lastIdx = 0;   
02463     for(unsigned int i=0; i < keyLen; i++) {
02464       unsigned int idx;
02465       //assumes nodes is sorted and unique.          
02466       bool flag = seq::maxLowerBound<TreeNode>(nodes,
02467           recvK[i], idx,&lastIdx,NULL);
02468       if (flag) {
02469         lastIdx = idx;  
02470 #ifdef __DEBUG_OCT__
02471         assert(areComparable(nodes[idx], recvK[i]));
02472 #endif
02473         if (nodes[idx].isAncestor(recvK[i])) {
02474           if (lev > (nodes[idx].getLevel() + 1)) {
02475             seedList[idx].push_back(
02476                 recvK[i].getAncestor(lev-1));                
02477           }
02478         }//end if correct result
02479       }//end if flag        
02480     }//end for i
02481 
02482     recvK.clear();
02483 
02484 #ifdef __DEBUG_OCT__
02485     MPI_Barrier(comm);  
02486     if(!rank) {
02487       std::cout<<"Passed Step 9."<<std::endl;
02488     }
02489     MPI_Barrier(comm);  
02490 #endif
02491 
02492     //Include the new octants in the octree
02493     //while preserving linearity and sorted order
02494 
02495     //Seedlist may have duplicates. Seedlist 
02496     // will be sorted.
02497 
02498     std::vector<TreeNode> tmpList;
02499     std::vector<std::vector<TreeNode> >allInternalLeaves(nodes.size());
02500     unsigned int tmpSz = 0;
02501     for (unsigned int i=0;i<nodes.size();i++) { 
02502       seq::makeVectorUnique<TreeNode>(seedList[i],true);        
02503       if (!seedList[i].empty()) {
02504         nodes[i].completeSubtree(seedList[i], allInternalLeaves[i]);
02505       }
02506       tmpSz += allInternalLeaves[i].size();
02507       if (allInternalLeaves[i].empty()) {
02508         tmpSz++;
02509       }
02510       seedList[i].clear();
02511     }//end for i
02512 
02513 #ifdef __DEBUG_OCT__
02514     MPI_Barrier(comm);  
02515     if(!rank) {
02516       std::cout<<"Passed Step 10."<<std::endl;
02517     }
02518     MPI_Barrier(comm);  
02519 #endif
02520 
02521     seedList.clear();
02522     tmpList.resize(tmpSz);
02523     tmpSz=0;
02524     for (unsigned int i=0;i<nodes.size();i++) {
02525       if (!allInternalLeaves[i].empty()) {
02526         for (int k=0;k<allInternalLeaves[i].size();k++) {
02527           tmpList[tmpSz++] = allInternalLeaves[i][k];
02528         }//end for k
02529         allInternalLeaves[i].clear();
02530       } else {
02531         tmpList[tmpSz++] = nodes[i];
02532       }
02533     }//end for i
02534     allInternalLeaves.clear();
02535     nodes = tmpList;
02536     tmpList.clear();
02537 
02538 #ifdef __DEBUG_OCT__
02539     MPI_Barrier(comm);  
02540     if(!rank) {
02541       std::cout<<"Passed Step 11."<<std::endl;
02542     }
02543     MPI_Barrier(comm);  
02544 #endif
02545 
02546     if(rePart) {
02547       par::partitionW<ot::TreeNode>(nodes, NULL,comm);
02548     }
02549 
02550 #ifdef __DEBUG_OCT__
02551     MPI_Barrier(comm);  
02552     if(!rank) {
02553       std::cout<<"Passed Step 12."<<std::endl;
02554     }
02555     MPI_Barrier(comm);  
02556 #endif
02557 
02558   }//end for lev
02559 
02560   PROF_PAR_RIPPLE_TYPE1_END 
02561 }//end function
02562 
02563 //myNhBlocks is automatically sorted and unique at the end of the loop.
02564 int selectNeighboringBlocks(const std::vector<TreeNode>& allBlocks, 
02565     const std::vector<TreeNode>& blocks, const std::vector<unsigned int>& maxBlockBndVec,
02566     int myRank, std::vector<TreeNode>& myNhBlocks) {
02567   PROF_PICK_NH_BLOCKS_BEGIN
02568 
02569     for (int k = 0; k < allBlocks.size(); k++) {
02570       if (allBlocks[k].getWeight() == myRank) {
02571         //Only need to send to other processors.
02572         continue;
02573       }
02574 
02575       unsigned int othMinX = allBlocks[k].minX();
02576       unsigned int othMinY = allBlocks[k].minY();
02577       unsigned int othMinZ = allBlocks[k].minZ();
02578       unsigned int othMaxX = allBlocks[k].maxX();
02579       unsigned int othMaxY = allBlocks[k].maxY();
02580       unsigned int othMaxZ = allBlocks[k].maxZ();
02581 
02582       //Even if allBlocks[k] interescts the region of influence of any of the
02583       //local blocks, it is included in the neighbour list.
02584       for (int j = 0; j < blocks.size(); j++) {
02585         unsigned int myLen = maxBlockBndVec[j];
02586         unsigned int myMinX = blocks[j].minX();
02587         unsigned int myMinY = blocks[j].minY();
02588         unsigned int myMinZ = blocks[j].minZ();
02589         unsigned int myMaxX = blocks[j].maxX();
02590         unsigned int myMaxY = blocks[j].maxY();
02591         unsigned int myMaxZ = blocks[j].maxZ();
02592         unsigned int xlow = ( (myMinX >= myLen) ? (myMinX - myLen) : myMinX );
02593         unsigned int ylow = ( (myMinY >= myLen) ? (myMinY - myLen) : myMinY );
02594         unsigned int zlow = ( (myMinZ >= myLen) ? (myMinZ - myLen) : myMinZ );
02595         unsigned int xhigh = ( myMaxX + myLen );
02596         unsigned int yhigh = ( myMaxY + myLen );
02597         unsigned int zhigh = ( myMaxZ + myLen );
02598 
02599         if ( (othMinX < xhigh) && (othMinY < yhigh) && (othMinZ < zhigh)
02600             && (othMaxX > xlow) && (othMaxY > ylow) && (othMaxZ > zlow) ) {
02601           myNhBlocks.push_back(allBlocks[k]);
02602           break;
02603         }//end if to be sent
02604       }//end for j
02605     }//end for k   
02606 
02607   PROF_PICK_NH_BLOCKS_END
02608 }//end function
02609 
02610 int mergeComboBalAndPickBoundary(std::vector<ot::TreeNode>& out, 
02611     std::vector<ot::TreeNode>& allBoundaryLeaves, 
02612     const ot::TreeNode& firstBlock, const ot::TreeNode& lastBlock) {
02613   PROF_MERGE_COMBO_BAL_BEGIN
02614 
02615     //Merge (in-place) results from the two stages, i.e. blockBalance and
02616     //intra-processor rippleBalance..
02617     std::vector<TreeNode> tmpNodeList(out.size() + allBoundaryLeaves.size());
02618 
02619   unsigned int tmpLsz = 0;
02620   unsigned int bndCnt = 0;
02621   unsigned int bndSz = allBoundaryLeaves.size();
02622 
02623   for (unsigned int i = 0;i < out.size();i++) {
02624     if ( bndCnt < allBoundaryLeaves.size() ) {
02625       if ( out[i] == allBoundaryLeaves[bndCnt] ) {
02626         tmpNodeList[tmpLsz] = out[i];
02627         tmpLsz++;
02628         bndCnt++;
02629       } else if (out[i] < allBoundaryLeaves[bndCnt] ) {
02630 #ifdef __DEBUG_OCT__
02631         assert(areComparable(out[i], allBoundaryLeaves[bndCnt]));
02632 #endif
02633         if (out[i].isAncestor(allBoundaryLeaves[bndCnt]) ) {
02634           while ( (bndCnt < bndSz) && 
02635               out[i].isAncestor(allBoundaryLeaves[bndCnt]) ) {
02636             tmpNodeList[tmpLsz] = allBoundaryLeaves[bndCnt];
02637             tmpLsz++;
02638             bndCnt++;
02639 #ifdef __DEBUG_OCT__
02640             if(bndCnt < bndSz) {
02641               assert(areComparable(out[i], allBoundaryLeaves[bndCnt]));
02642             }
02643 #endif
02644           }
02645         } else {
02646           tmpNodeList[tmpLsz] = out[i];
02647           tmpLsz++;
02648         }
02649       } else {
02650         // nodes[i] > allBdy .. so insert
02651         tmpNodeList[tmpLsz] = allBoundaryLeaves[bndCnt];
02652         tmpLsz++;
02653         bndCnt++;
02654       }
02655     } else {
02656       tmpNodeList[tmpLsz] = out[i];
02657       tmpLsz++;
02658     }
02659   }//end for i
02660 
02661   tmpNodeList.resize(tmpLsz);
02662   out = tmpNodeList;
02663 
02664   tmpNodeList = allBoundaryLeaves;
02665   pickInterProcessorBoundaryNodes(tmpNodeList, allBoundaryLeaves,
02666       firstBlock, lastBlock);
02667   tmpNodeList.clear();
02668 
02669   PROF_MERGE_COMBO_BAL_END 
02670 }//end function
02671 
02672 int finalMergeInBal(std::vector<ot::TreeNode>& out, std::vector<ot::TreeNode>& allBoundaryLeaves) {
02673   PROF_FINAL_MERGE_IN_BAL_BEGIN
02674 
02675     std::vector<ot::TreeNode> tmpNodeList(out.size() + allBoundaryLeaves.size());
02676 
02677   unsigned int tmpLsz = 0;
02678   unsigned int bndSz = allBoundaryLeaves.size();
02679   unsigned int bndCnt = 0;
02680   for (unsigned int i = 0; i < out.size(); i++) {
02681     if ( bndCnt < bndSz ) {
02682       if ( out[i] == allBoundaryLeaves[bndCnt] ) {
02683         tmpNodeList[tmpLsz++] = out[i];
02684         bndCnt++;
02685       } else if (out[i] < allBoundaryLeaves[bndCnt] ) {
02686 #ifdef __DEBUG_OCT__
02687         assert(areComparable(out[i], allBoundaryLeaves[bndCnt]));
02688 #endif
02689         if (out[i].isAncestor(allBoundaryLeaves[bndCnt]) ) {
02690           while ( (bndCnt < bndSz ) && out[i].isAncestor(allBoundaryLeaves[bndCnt]) ) {
02691             tmpNodeList[tmpLsz++] = allBoundaryLeaves[bndCnt++];
02692 #ifdef __DEBUG_OCT__
02693             if(bndCnt < bndSz) {
02694               assert(areComparable(out[i], allBoundaryLeaves[bndCnt]));
02695             }
02696 #endif
02697           }
02698         } else {
02699           tmpNodeList[tmpLsz++] = out[i];
02700         }
02701       } else {
02702         // nodes[i] > allBdy .. so insert
02703         tmpNodeList[tmpLsz++] = allBoundaryLeaves[bndCnt++];
02704       }
02705     } else {
02706       tmpNodeList[tmpLsz++] = out[i];
02707     }
02708   }//end for i
02709 
02710   tmpNodeList.resize(tmpLsz);
02711   out = tmpNodeList;
02712   tmpNodeList.clear();
02713   allBoundaryLeaves.clear();
02714 
02715   PROF_FINAL_MERGE_IN_BAL_END
02716 }//end function
02717 
02718 int prepareBalComm1MessagesType1(const std::vector<ot::TreeNode>& allBoundaryLeaves, 
02719     const std::vector<ot::TreeNode>& myNhBlocks, int npes, unsigned int maxDepth, 
02720     std::vector<TreeNode>* sendNodes, std::vector<unsigned int>* sentToPid, int* sendCnt) {
02721   PROF_PREP_BAL_COMM1_MSSG_BEGIN 
02722 
02723     unsigned int allBndSz = allBoundaryLeaves.size();
02724   for (int j = 0; j < allBndSz; j++) {
02725     unsigned int myLen = (1u << (maxDepth - (allBoundaryLeaves[j].getLevel())));
02726     unsigned int myMinX = allBoundaryLeaves[j].minX();
02727     unsigned int myMinY = allBoundaryLeaves[j].minY();
02728     unsigned int myMinZ = allBoundaryLeaves[j].minZ();
02729     unsigned int xlow = ( (myMinX >= myLen) ? (myMinX - myLen) : myMinX );
02730     unsigned int ylow = ( (myMinY >= myLen) ? (myMinY - myLen) : myMinY );
02731     unsigned int zlow = ( (myMinZ >= myLen) ? (myMinZ - myLen) : myMinZ );
02732     unsigned int xhigh = ( myMinX + (2*myLen) );
02733     unsigned int yhigh = ( myMinY + (2*myLen) );
02734     unsigned int zhigh = ( myMinZ + (2*myLen) );
02735 
02736     unsigned int lastP = npes;
02737     //Each node could be sent to multiple processors. So must check with
02738     //different blocks. All the blocks from the same processor are
02739     //contiguous.
02740     for (int k = 0; k < myNhBlocks.size(); k++) {
02741       if (myNhBlocks[k].getWeight() == lastP) {
02742         //Already sent to this processor. It doesn't matter if there are
02743         //multiple blocks on the same processor in the region of influence of
02744         //the same node.
02745         continue;
02746       }//end if sent already.
02747 
02748       unsigned int othMinX = myNhBlocks[k].minX();
02749       unsigned int othMinY = myNhBlocks[k].minY();
02750       unsigned int othMinZ = myNhBlocks[k].minZ();
02751       unsigned int othMaxX = myNhBlocks[k].maxX();
02752       unsigned int othMaxY = myNhBlocks[k].maxY();
02753       unsigned int othMaxZ = myNhBlocks[k].maxZ();
02754 
02755       if ( (othMinX < xhigh) && (othMinY < yhigh) && (othMinZ < zhigh)
02756           && (othMaxX > xlow) && (othMaxY > ylow) && (othMaxZ > zlow) ) {
02757         sendNodes[myNhBlocks[k].getWeight()].push_back(allBoundaryLeaves[j]);
02758         sentToPid[j].push_back(myNhBlocks[k].getWeight());
02759         sendCnt[myNhBlocks[k].getWeight()]++;
02760         lastP = myNhBlocks[k].getWeight();
02761       }//end if to be sent
02762     }//end for k   
02763   }//end for j
02764 
02765   PROF_PREP_BAL_COMM1_MSSG_END 
02766 }//end function
02767 
02768 int prepareWlistInBal(const std::vector<ot::TreeNode>& recvK1, 
02769     const int* recvCnt, int npes, const ot::TreeNode& myFirstBlock,
02770     const ot::TreeNode& myLastBlock, std::vector<TreeNode>& wList,
02771     std::vector<std::vector<unsigned int> >& wListRanks) {
02772   PROF_PREP_BAL_WLIST_BEGIN
02773 
02774     std::vector<ot::TreeNode> recvKtemp = recvK1;
02775 
02776   int counter = 0;
02777   for(int i = 0; i < npes; i++) {
02778     for(int j = counter; j < (counter + recvCnt[i]); j++) {
02779       recvKtemp[j].setWeight(i);
02780     }//end for j
02781     counter += recvCnt[i];
02782   }//end for i
02783 
02784   std::vector<std::vector<ot::TreeNode> > wListTmp(recvKtemp.size());
02785   for (int i = 0; i < recvKtemp.size(); i++) {
02786     std::vector<TreeNode> myNh =  recvKtemp[i].getAllNeighbours();
02787     unsigned int myNhSz = myNh.size();
02788     for (int k = 0; k < myNhSz; k++) {
02789       //Those that overlap the local domain...
02790       //There are only 3 types of overlaps between octants A and B.
02791       //a) A is an ancestor of B
02792       //b) B is an ancestor of A
02793       //c) A and B are the same.
02794       //Any octant >= myFirstBlock and <= myLastBlock.DLD if and only if it
02795       //lies on my processor and it is either a decendant of one of my blocks or equal to
02796       //one of my blocks. 
02797       //Fact: If A is an ancestor of B, then A must also be the ancestor of
02798       //any octant > A and < B. So, if myNh[k] is not an ancestor of
02799       //myFirstBlock, then it can be an ancestor of some octant > myFirstBlock only
02800       //if myNh[k] >= myFirstBlock
02801 #ifdef __DEBUG_OCT__
02802       assert(areComparable(myNh[k], myFirstBlock));
02803 #endif
02804       if ( (myNh[k].isAncestor(myFirstBlock)) ||
02805           ( (myNh[k] >= myFirstBlock) && (myNh[k] <= myLastBlock.getDLD()) ) ) {
02806         myNh[k].setWeight(recvKtemp[i].getWeight());
02807         wListTmp[i].push_back(myNh[k]);
02808       }
02809     }//end for k
02810     myNh.clear();
02811   }//end for i
02812 
02813   recvKtemp.clear();
02814 
02815   for (int i = 0;i < wListTmp.size(); i++) {
02816     wList.insert(wList.end(), wListTmp[i].begin(), wListTmp[i].end());
02817     wListTmp[i].clear();
02818   }
02819   wListTmp.clear();
02820 
02821   std::sort(wList.begin(),wList.end());
02822 
02823   unsigned int wCtr=0;
02824   unsigned int wListSz = wList.size();
02825   if (wCtr < wListSz) {
02826     std::vector<unsigned int> tmpWr;
02827     tmpWr.push_back(wList[0].getWeight());
02828     wList[0].setWeight(1);
02829     wListRanks.push_back (tmpWr);
02830     tmpWr.clear();
02831     wCtr++;
02832   }
02833 
02834   unsigned int rowCtr = 0;
02835   while (wCtr < wListSz) {
02836     if (wList[wCtr] == wList[wCtr-1]) {
02837       wListRanks[rowCtr].push_back(wList[wCtr].getWeight());
02838       wList[wCtr].setWeight(1);
02839     } else {
02840       seq::makeVectorUnique<unsigned int>(wListRanks[rowCtr],false);
02841       std::vector<unsigned int> tmpWr;
02842       tmpWr.push_back(wList[wCtr].getWeight());
02843       wList[wCtr].setWeight(1);
02844       wListRanks.push_back(tmpWr);
02845       tmpWr.clear();
02846       rowCtr++;
02847     }
02848     wCtr++;
02849   }//end while
02850 
02851   seq::makeVectorUnique<TreeNode>(wList, true);
02852 
02853   PROF_PREP_BAL_WLIST_END
02854 }//end function
02855 
02856 /*
02857    Some Facts: allBoundaryLeaves is linear, sorted,unique, incomplete
02858    wList is not linear (overlaps are allowed), it is sorted, unique and incomplete.
02859    All elements in wList are in the domain controlled by this processor.
02860    wList and wListRanks are in sync always.
02861    Each element in allBoundaryLeaves can overlap with multiple elements in wList and vice-versa.
02862    Important Property: If A < B < C and if A and B do not overlap then A and C also do not overlap.
02863    This can be proved by contradiction, Suppose A and C overlap,
02864    then either A is an ancestor of C or C is an ancestor of A or both are equal.
02865    The third is automatically ruled out. The second is not possible since C > A.
02866    If A was an ancestor of C then A has to be an ancestor of B as well and since this 
02867    is not the case, A and C can't overlap.
02868    */
02869 int prepareBalComm2Messages(const std::vector<ot::TreeNode>& allBoundaryLeaves,
02870     const std::vector<ot::TreeNode>& wList,
02871     const std::vector<std::vector<unsigned int> >& wListRanks,
02872     std::vector<TreeNode>* sendNodes, std::vector<unsigned int>* sentToPid, int* sendCnt) {
02873   PROF_PREP_BAL_COMM2_MSSG_BEGIN 
02874 
02875     /*
02876        Why is sendNodes[i] sorted at the end of this loop?
02877        1. For each processor, the blocks (W) corresponding to that
02878        processor are visited in a sorted order.
02879 
02880        2. For any given processor say W1 is visited before W2 => W1 < W2.
02881 
02882        3. Either W1 is an ancestor of W2 or W1 and W2 do not overlap
02883 
02884        4. All bnd sent to W1 are sent in a sorted order.
02885 
02886        5. Let B1 be the list of elements that overlap with W1 and 
02887        let B2 be the ones that overlap with W2.
02888 
02889        6. There are only two possibilites: either B2 is a subset of B1 or
02890        B1 and B2 have no common elements and min(B2) > max(B1).
02891 
02892        7. IF W2 is a decendant of W1, then clearly any ancestor of W2 or
02893        decendant of W2 also overlaps W1, thus B2 is a subset of B1. Since, all 
02894        of these were sent already they will not be sent to the same processor 
02895        again and thus they will not disturb the order.
02896 
02897        8. If W2 > W1 and W2 and W1 do not overlap, then the only octants that overlap both
02898        W1 and W2 are the common ancestors.
02899 
02900        9. If there is a common ancestor of both in allBnd, it will be sent during the comparison
02901        with W1 and no other elements in allBnd will overlap either W1 or W2. In this case B1=B2.
02902 
02903        10. If there are no common ancestors then B1 and B2 have no common elements.
02904 
02905        11. Since W2 >W1 any octant that overlaps W2 but not W1 has to be > all decendants of W1.
02906        Hence, min(B2) > max(B1).
02907        */
02908 
02909     int lastStart = 0;
02910   unsigned int wListSz = wList.size();
02911   unsigned int allBndSz = allBoundaryLeaves.size();
02912   for (int ii = 0; ii < wListSz; ii++) {
02913     for (int j = lastStart; j < allBndSz; j++) {
02914       //Each node could be sent to multiple processors.
02915 #ifdef __DEBUG_OCT__
02916       assert(areComparable(wList[ii], allBoundaryLeaves[j]));
02917 #endif
02918       if ((wList[ii] == allBoundaryLeaves[j]) || (wList[ii].isAncestor(allBoundaryLeaves[j]))
02919           || (allBoundaryLeaves[j].isAncestor(wList[ii]))) {
02920         //Overlap...
02921         for (int jj = 0; jj < wListRanks[ii].size(); jj++) {
02922           bool sentAlready = false;
02923           for (int kk = 0; kk < sentToPid[j].size(); kk++) {
02924             //loopCtr++;
02925             if (sentToPid[j][kk] == wListRanks[ii][jj]) {
02926               sentAlready = true;
02927               break;
02928             }
02929           }//end for kk
02930           if (!sentAlready) {
02931             sendNodes[wListRanks[ii][jj]].push_back(allBoundaryLeaves[j]);
02932             sendCnt[wListRanks[ii][jj]]++;
02933             sentToPid[j].push_back(wListRanks[ii][jj]);
02934           }
02935         }//end for jj
02936       } else if (wList[ii] < allBoundaryLeaves[j]) {
02937         //No overlap w < allBnd
02938         //Since, W does not intersect this it will not intersect any element that follows.
02939         //This is the justification for early termination.
02940         break;
02941       } else {
02942         //No overlap allBnd < w
02943         //Since, allBnd does not intersect this w, it will not intersect any
02944         //element in w that follows. So subsequent w need not be compared
02945         //against this.
02946         lastStart++;
02947       }//end if-else overlaps
02948     }//end for j
02949   }//end for ii
02950 
02951   PROF_PREP_BAL_COMM2_MSSG_END 
02952 }//end function
02953 
02954 int mergeRecvKeysInBal(const std::vector<ot::TreeNode>& recvK1, const int* recvOffsets1,
02955     const std::vector<ot::TreeNode>& recvK2, const int* recvOffsets2, 
02956     int npes, std::vector<ot::TreeNode>& recvK) {
02957   PROF_MERGE_RECV_KEYS_BAL_BEGIN 
02958 
02959     recvK.resize(recvK1.size() + recvK2.size());
02960 
02961   //Note, you only recieve from other processors and not from yourself.
02962   //Merge recvK1 and recvK2 inplace....
02963 
02964   //Basic idea... All elements from processor i are less than those from i+1.
02965   //The elements in recvK1 and recvK2 are independently sorted.
02966   unsigned int recvKcnt = 0;
02967   for (int i = 0; i < npes; i++) {
02968     unsigned int nextFrom1 = recvOffsets1[i];
02969     unsigned int nextFrom2 = recvOffsets2[i];
02970     unsigned int end1 = ((i <(npes - 1)) ? recvOffsets1[i+1] : recvK1.size());
02971     unsigned int end2 = ((i <(npes - 1)) ? recvOffsets2[i+1] : recvK2.size());
02972     while ( (nextFrom1 < end1) || (nextFrom2 < end2) ) {
02973 
02974       if (nextFrom1 < end1 && nextFrom2 < end2) {
02975         while (recvK1[nextFrom1] <= recvK2[nextFrom2]) {
02976           recvK[recvKcnt++] = recvK1[nextFrom1];
02977           nextFrom1++;
02978           if (nextFrom1 >= end1) {
02979             break;
02980           }
02981         }
02982       }
02983 
02984       if (nextFrom1 < end1 && nextFrom2 < end2) {
02985         while (recvK1[nextFrom1] > recvK2[nextFrom2]) {
02986           recvK[recvKcnt++] = recvK2[nextFrom2];
02987           nextFrom2++;
02988           if (nextFrom2 >= end2) {
02989             break;
02990           }
02991         }
02992       }
02993 
02994       if (nextFrom2 >= end2) {
02995         while (nextFrom1 < end1) {
02996           recvK[recvKcnt++] = recvK1[nextFrom1];
02997           nextFrom1++;
02998         }
02999       }
03000 
03001       if (nextFrom1 >= end1) {
03002         while (nextFrom2 < end2) {
03003           recvK[recvKcnt++] = recvK2[nextFrom2];
03004           nextFrom2++;
03005         }
03006       }
03007 
03008     }
03009   }//end for i
03010 
03011   PROF_MERGE_RECV_KEYS_BAL_END 
03012 }//end function
03013 
03014 int prepareBalComm1MessagesType2(const std::vector<ot::TreeNode>& allBoundaryLeaves, 
03015     const std::vector<ot::TreeNode>& minsAllBlocks, int rank, unsigned int dim, 
03016     unsigned int maxDepth, std::vector<TreeNode>* sendNodes,
03017     std::vector<unsigned int>* sentToPid, int* sendCnt) {
03018   PROF_PREP_BAL_COMM1_MSSG_BEGIN 
03019 
03020     //Each octant must be sent to all processors which overlap its insulation
03021     //layer. So we generate all the neighbours (nh) of this octant at this level and
03022     //find all processors that lie in between maxLowerBound(DFD(nh(i))) and
03023     //maxLowerBound(DLD(nh(i))) for all i
03024 
03025     unsigned int allBndSz = allBoundaryLeaves.size();
03026   ot::TreeNode rootNode(dim, maxDepth);
03027   for (int j = 0; j < allBndSz; j++) {
03028     std::vector<ot::TreeNode> myNh = allBoundaryLeaves[j].getAllNeighbours();
03029 
03030     seq::makeVectorUnique<ot::TreeNode>(myNh, false);
03031     unsigned int stIdx = ( (myNh[0] == rootNode) ? 1 : 0 );
03032 
03033     std::vector<unsigned int> pIds;
03034     for(unsigned int i = stIdx; i < myNh.size(); i++) {
03035       unsigned int idx1;
03036       unsigned int idx2;
03037       seq::maxLowerBound<ot::TreeNode>(minsAllBlocks, myNh[i].getDFD(), idx1, NULL, NULL);
03038       seq::maxLowerBound<ot::TreeNode>(minsAllBlocks, myNh[i].getDLD(), idx2, NULL, NULL);
03039       for(int k = idx1; k <= idx2; k++) {
03040         pIds.push_back(k);
03041       }//end for k
03042     }//end for i
03043 
03044 #ifdef __DEBUG_OCT__
03045     //myNh is explicitly sorted. Moreover, no two elements of myNh overlap and since
03046     //myNh(i) < myNh(i+1) this implies that
03047     //myNh(i).getDFD() <= myNh(i).getDLD() < myNh(i+1).getDFD() <=
03048     //myNh(i+1).getDLD() 
03049     //If a < b then MLB(a) <= MLB(b). So uniqueness if not guaranteed.
03050     assert(seq::test::isSorted<unsigned int>(pIds));
03051 #endif
03052 
03053     seq::makeVectorUnique<unsigned int>(pIds, true);
03054 
03055     for(int i = 0; i < pIds.size(); i++) {
03056       if(pIds[i] != rank) {
03057         sendNodes[pIds[i]].push_back(allBoundaryLeaves[j]);
03058         sentToPid[j].push_back(pIds[i]);
03059         sendCnt[pIds[i]]++;
03060       }
03061     }//end for i
03062   }//end for j
03063 
03064   PROF_PREP_BAL_COMM1_MSSG_END 
03065 }//end function
03066 
03067 }//end namespace
03068 
03069 
03070 

Generated on Tue Mar 24 16:14:04 2009 for DENDRO by  doxygen 1.3.9.1