
BlockPart.C

00001 
00009 #include "TreeNode.h"
00010 #include "parUtils.h"
00011 
00012 #ifdef __DEBUG__
00013 #ifndef __DEBUG_OCT__
00014 #define __DEBUG_OCT__
00015 #endif
00016 #endif
00017 
00018 #ifdef __DEBUG_OCT__
00019 #ifndef __MEASURE_BPART_COMM__
00020 #define __MEASURE_BPART_COMM__
00021 #endif
00022 #endif
00023 
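// Overview: the two BlockPart stages below construct a coarse "block" octree and
// redistribute the fine octants (nodes) so that each processor ends up owning the
// octants that fall inside its blocks. Stage 1 builds the global coarse blocks;
// Stage 2 weights each block by the number of fine octants it contains, repartitions
// the blocks by weight, and then moves the fine octants to the owners of their blocks.
//
// A rough usage sketch (hypothetical driver code, not part of this file):
//
//   std::vector<ot::TreeNode> nodes = /* locally sorted fine octants */;
//   std::vector<ot::TreeNode> blocks, minsAllBlocks;
//   ot::blockPartStage1(nodes, blocks, dim, maxDepth, comm);
//   ot::blockPartStage2(nodes, blocks, minsAllBlocks, dim, maxDepth, comm);
//
// After Stage 2, 'blocks' holds this processor's coarse blocks, 'nodes' holds the
// fine octants lying inside them, and 'minsAllBlocks' holds the first block of
// every non-empty processor.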
00024 namespace ot {
00025 
00026   //Assumes that nodes are sorted.
00027   //Every processor must call this with a non-empty input.
00028   int blockPartStage1_p2o(std::vector<TreeNode> &nodes, std::vector<TreeNode> &blocks,
00029       unsigned int dim, unsigned int maxDepth,  MPI_Comm comm) {
00030 #ifdef __PROF_WITH_BARRIER__
00031     MPI_Barrier(comm);
00032 #endif
00033     PROF_BLKPART1_BEGIN
00034       int npes, rank;
00035     MPI_Comm_rank(comm, &rank);
00036     MPI_Comm_size(comm, &npes);
00037 
00038     par::partitionW<ot::TreeNode>(nodes, NULL, comm);
00039 
00040     assert(nodes.size() > (1 << dim) ); 
00041 
00042     // 1. First compute the localCoarse octree 
00043     std::vector<TreeNode> localCoarse;
00044 
00045     //include both the min and max elements as well  
00046     //The output will be sorted, unique and linear
00047     appendCompleteRegion(nodes[0], nodes[nodes.size()-1], localCoarse, true, true);
00048 
00049     // 2. Get local Blocks. These will be input to completeOctree that will
00050     // produce globalCoarse.
00051     std::vector<TreeNode> localBlocks;
00052 
00053     //Select all the coarsest blocks and insert them into localBlocks
00054     unsigned int minLevel = maxDepth;
00055     for(int i = 0; i < localCoarse.size(); i++) {
00056       if(localCoarse[i].getLevel() < minLevel) {
00057         minLevel = localCoarse[i].getLevel();
00058       }
00059     }
00060 
00061     for(int i = 0; i < localCoarse.size(); i++) {
00062       if(localCoarse[i].getLevel() == minLevel) {
00063         localBlocks.push_back(localCoarse[i]);
00064       }
00065     }
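    // Example: if the levels in localCoarse are {3, 2, 4, 2}, then minLevel = 2 and
    // only the two level-2 octants are copied into localBlocks; the finer octants
    // are discarded for the purpose of block construction.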
00066 
00067     localCoarse.clear();
00068 
00069     for (unsigned int i = 0; i < localBlocks.size(); i++) {
00070       localBlocks[i].setWeight(1);
00071     }
00072 
00073     // 3. Call completeOctree on these cells to generate the
00074     //    globalCoarse octree ...
00075     //localBlocks will be sorted and linear
00076     //There is a pathological case that prevents us from asserting that
00077     //localBlocks will be globally unique: the last element in the input on processor i
00078     //may be the same as the first element on processor i+1, and both copies
00079     //may then be selected into localBlocks.
00080 
00081     completeOctree(localBlocks, blocks, dim, maxDepth, false, true, false, comm);
00082 
00083     localBlocks.clear();
00084 
00085     PROF_BLKPART1_END
00086   } // end blockPartStage1_p2o
00087 
00088   int blockPartStage2_p2o(std::vector<TreeNode> &nodes, std::vector<TreeNode> &globalCoarse,
00089       std::vector<ot::TreeNode>& minsAllBlocks, unsigned int dim, unsigned int maxDepth,
00090       MPI_Comm comm) {
00091 #ifdef __PROF_WITH_BARRIER__
00092     MPI_Barrier(comm);
00093 #endif
00094     PROF_BLKPART2_BEGIN
00095 
00096       ot::TreeNode rootNode (dim,maxDepth);
00097 
00098     int npes, rank;
00099     MPI_Comm_rank(comm,&rank);
00100     MPI_Comm_size(comm,&npes);
00101 
00102     int *sendCnt = new int[npes];
00103     int *recvCnt = new int[npes];
00104     int *sendOffsets = new int[npes];
00105     int *recvOffsets = new int[npes];
00106 
00107     //1. Now compute the wts of these cells ...
00108     //    a. Get the min and max nodes at each processor.
00109     std::vector<TreeNode> _mins_maxs(2*npes);
00110 
00111     // communicate ...
00112     ot::TreeNode sendMinMax[2];
00113 
00114     if (!nodes.empty()) {
00115       sendMinMax[0] =  nodes[0];
00116       sendMinMax[1] =  nodes[nodes.size()-1];
00117     } else {
00118       sendMinMax[0] = rootNode;
00119       sendMinMax[1] = rootNode;
00120     }
00121 
00122     par::Mpi_Allgather<ot::TreeNode>(sendMinMax, &(*_mins_maxs.begin()), 2, comm);
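    // After the allgather, _mins_maxs[2*p] is the smallest and _mins_maxs[2*p+1] the
    // largest local octant on processor p (both equal rootNode if that processor has
    // no octants).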
00123 
00124     std::vector<std::vector<TreeNode> > sendNodes(npes);
00125     std::vector<std::vector<unsigned int> > keymap(npes);
00126 
00127     for (int i=0; i<npes; i++) {
00128       sendCnt[i] = 0;
00129     }
00130 
00131     //    b. Now compute which cells go to which processors ...
00132     //       The logic: if the coarse cell lies between the min and max at a
00133     //       processor, or if it is an ancestor of that min, then it is sent to
00134     //       that processor.
00135     //Naive Logic:
00136     for (unsigned int i=0; i<globalCoarse.size(); i++) {
00137       for (int p=0;p<npes;p++) {
00138 #ifdef __DEBUG_OCT__
00139         assert(areComparable(globalCoarse[i], _mins_maxs[2*p]));
00140 #endif
00141         if ( (globalCoarse[i].isAncestor(_mins_maxs[2*p])) ||
00142             ( (globalCoarse[i] >= _mins_maxs[2*p]) && (globalCoarse[i] <=_mins_maxs[(2*p)+1]) ) ) {
00143           sendNodes[p].push_back(globalCoarse[i]);
00144           // save keymap so that we can assign weights back to globalCoarse.
00145           keymap[p].push_back(i);    
00146           sendCnt[p]++;
00147         }//end if
00148       }//end for
00149     }//end for
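    // Note: this naive search costs O(npes * globalCoarse.size()) comparisons, and a
    // coarse cell that overlaps several processors' ranges is sent to each of them,
    // once per matching processor.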
00150 
00151     _mins_maxs.clear();
00152 
00153     //2. Send the coarse cells to the processors they overlap so the wts can be computed locally ...
00154 
00155     //    a. Communicate how many you'll be sending and how many will be
00156     //       received.
00157 
00158     // Now do an All2All to get the receive counts (recvCnt)
00159     par::Mpi_Alltoall<int>( sendCnt, recvCnt, 1, comm);
00160 
00161     //    b. Concatenate all the outgoing cells into one single C array ...
00162     unsigned int totalSend = 0;
00163     unsigned int totalRecv = 0;
00164 #ifdef __MEASURE_BPART_COMM__
00165     unsigned int numProcsSendI = 0;
00166     unsigned int numProcsRecvI = 0;
00167 #endif
00168     for (unsigned int i = 0; i < npes; i++) {
00169       totalSend+= sendCnt[i];
00170       totalRecv+= recvCnt[i];
00171 #ifdef __MEASURE_BPART_COMM__
00172       if(sendCnt[i]) {
00173         numProcsSendI++;
00174       }
00175       if(recvCnt[i]) {
00176         numProcsRecvI++;
00177       }
00178 #endif
00179     }
00180 
00181     // create the send and recv buffers ...
00182     std::vector<ot::TreeNode> sendK (totalSend);
00183     std::vector<ot::TreeNode> recvK (totalRecv);
00184 
00185     // Now create sendK
00186     sendOffsets[0] = 0;
00187     recvOffsets[0] = 0;
00188 
00189     // compute offsets ...
00190     for (int i=1; i<npes; i++) {
00191       sendOffsets[i] = sendOffsets[i-1] + sendCnt[i-1];
00192       recvOffsets[i] = recvOffsets[i-1] + recvCnt[i-1];
00193     }
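    // Example: with npes = 3 and sendCnt = {2, 0, 3}, the exclusive prefix sums give
    // sendOffsets = {0, 2, 2}, so processor 2's cells start at index 2 of sendK.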
00194 
00195     for (int i=0; i<npes; i++) {
00196       for (unsigned int j=0; j<sendCnt[i]; j++) {
00197         sendK[sendOffsets[i] + j] = sendNodes[i][j];
00198       }//end for j
00199     }//end for i
00200 
00201     //3. send and receive all keys ...
00202 
00203     ot::TreeNode* sendKptr = NULL;
00204     ot::TreeNode* recvKptr = NULL;
00205     if(!sendK.empty()) {
00206       sendKptr = &(*(sendK.begin()));
00207     }
00208     if(!recvK.empty()) {
00209       recvKptr = &(*(recvK.begin()));
00210     }
00211 
00212     par::Mpi_Alltoallv_sparse<ot::TreeNode>(sendKptr, sendCnt, sendOffsets,      
00213         recvKptr, recvCnt, recvOffsets, comm);
00214 
00215     sendK.clear();
00216 
00217 #ifdef __MEASURE_BPART_COMM__
00218     MPI_Barrier(comm);
00219     unsigned int* allTotalSendI = new unsigned int[npes];
00220     unsigned int* allTotalRecvI = new unsigned int[npes];
00221     unsigned int* allNumProcsSendI = new unsigned int[npes];
00222     unsigned int* allNumProcsRecvI = new unsigned int[npes]; 
00223     par::Mpi_Gather<unsigned int>(&totalSend, allTotalSendI, 1, 0, comm);
00224     par::Mpi_Gather<unsigned int>(&totalRecv, allTotalRecvI, 1, 0, comm);
00225     par::Mpi_Gather<unsigned int>(&numProcsSendI, allNumProcsSendI, 1, 0, comm);
00226     par::Mpi_Gather<unsigned int>(&numProcsRecvI, allNumProcsRecvI, 1, 0, comm);
00227     if(!rank) {
00228       for(int i = 0; i < npes; i++) {
00229         std::cout<<" allTotalSendI["<<i<<"] in Bpart: "<<allTotalSendI[i]<<std::endl;
00230         std::cout<<" allTotalRecvI["<<i<<"] in Bpart: "<<allTotalRecvI[i]<<std::endl;
00231         std::cout<<" allNumProcsSendI["<<i<<"] in Bpart: "<<allNumProcsSendI[i]<<std::endl;
00232         std::cout<<" allNumProcsRecvI["<<i<<"] in Bpart: "<<allNumProcsRecvI[i]<<std::endl;
00233       }
00234     }
00235     delete [] allTotalSendI;
00236     delete [] allTotalRecvI;
00237     delete [] allNumProcsSendI;
00238     delete [] allNumProcsRecvI;
00239     MPI_Barrier(comm);
00240 #endif
00241 
00242     //4. Now compute the wts of the locally received nodes ...
00243     //    a. loop through nodes and update the wts of the local chunks ...
00244     unsigned int *wts = NULL;
00245     if(totalRecv) {
00246       wts = new unsigned int [totalRecv];
00247     }
00248     for (unsigned int i=0; i<totalRecv; i++) {
00249       wts[i] = 0;
00250     }
00251 
00252     //descendants and chunks are both sorted at this point.
00253     unsigned int nextPt = 0;
00254     unsigned int nextNode = 0;
00255     //Every element in nodes is inside some element in recvK.
00256     while (nextPt < nodes.size()) {
00257       //The first pt. lies in some block.
00258 #ifdef __DEBUG_OCT__
00259       assert(areComparable(recvK[nextNode], nodes[nextPt]));
00260 #endif
00261       if ((recvK[nextNode].isAncestor(nodes[nextPt])) ||
00262           (recvK[nextNode] == nodes[nextPt])) {
00263         wts[nextNode]++;
00264         nextPt++;
00265       } else {
00266         nextNode++;
00267         if (nextNode == totalRecv) {
00268           //If this fails then either recvK and nodes are not sorted or
00269           //Some pt in nodes is not in any of recvK
00270           assert(false);
00271         }
00272       }//end if-else
00273     }//end while
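    // Since both recvK and nodes are sorted, this single merge-like pass counts, in
    // O(nodes.size() + totalRecv) steps, how many local octants fall inside (or
    // coincide with) each received coarse cell; wts[k] is that count for recvK[k].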
00274 
00275     recvK.clear();
00276 
00277     //5. Now communicate the wts back to the procs ...
00278     unsigned int *recvWts = NULL;
00279     if(totalSend) {
00280       recvWts = new unsigned int[totalSend];
00281     }
00282 
00283     par::Mpi_Alltoallv_sparse<unsigned int>( wts, recvCnt, recvOffsets, 
00284         recvWts, sendCnt, sendOffsets, comm);
00285 
00286     //6. Now map them back to the blocks ...
00287     for (int i=0;i<globalCoarse.size();i++) {
00288       globalCoarse[i].setWeight(0);
00289     }
00290 
00291     for (int i=0; i<npes; i++) {
00292       for (int j=0; j<sendCnt[i]; j++) {
00293         globalCoarse[keymap[i][j]].addWeight(recvWts[sendOffsets[i] + j]);
00294       }
00295     }
00296 
00297     for (unsigned int i=0; i<npes; i++) {
00298       keymap[i].clear();
00299       sendNodes[i].clear();
00300     }
00301     sendNodes.clear();
00302     keymap.clear();
00303 
00304     if(recvWts) {
00305       delete [] recvWts;
00306       recvWts = NULL;
00307     }
00308 
00309     if(wts) {
00310       delete [] wts;
00311       wts = NULL;
00312     }
00313 
00314     par::partitionW<ot::TreeNode>(globalCoarse,getNodeWeight,comm);
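    // The weighted partition redistributes globalCoarse so that the total weight
    // (i.e., the number of contained fine octants) is intended to end up roughly
    // equal on every processor; this is what balances the final octant distribution.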
00315 
00316     //Reset weights
00317     for (unsigned int i=0;i<globalCoarse.size(); i++) {
00318       globalCoarse[i].setWeight(1);
00319     }
00320 
00321     // Now communicate the nodes ...
00322 
00323     //7. Determine locally which keys to send to which proc ...
00324     //Compute Dist on globalCoarse....
00325 
00326     std::vector<TreeNode> vtkDist(npes);
00327 
00328     ot::TreeNode *sendMin = NULL;
00329     if (!globalCoarse.empty()) {
00330       sendMin = (ot::TreeNode *)&(*(globalCoarse.begin()));
00331     } else {
00332       sendMin = &(rootNode);
00333     }
00334 
00335     par::Mpi_Allgather<ot::TreeNode>(sendMin, &(* vtkDist.begin()), 1, comm);
00336 
00337     minsAllBlocks.clear();
00338     for(int j = 0; j < npes; j++) {
00339       if(vtkDist[j] != rootNode) {
00340         minsAllBlocks.push_back(vtkDist[j]);
00341       }
00342     }//end for j
00343 
00344     for (unsigned int j = 1; j < npes ; j++) {
00345       if (vtkDist[j] == rootNode) {
00346         vtkDist[j] = vtkDist[j-1];
00347       }
00348     }//end for j
00349 
00350     // correct dist ...
00351     if (npes>1) {
00352       if (vtkDist[npes-1] == vtkDist[npes-2]) {
00353         vtkDist[npes-1] = rootNode;
00354       }//end if
00355 
00356       for (unsigned int i=npes-2; i>0; i--) {
00357         if (vtkDist[i] == vtkDist[i-1]) {
00358           vtkDist[i] = vtkDist[i+1];
00359         }//end if
00360       }//end for
00361     }//end if npes > 1
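    // vtkDist now acts as a list of splitters: vtkDist[p] approximates the smallest
    // block owned by processor p. The two fix-up passes above re-mark the duplicate
    // entries created by empty processors (using rootNode as an "unbounded" sentinel)
    // so that the loop below assigns each fine octant to exactly one destination rank.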
00362 
00363     unsigned int *part = NULL;
00364     if(!nodes.empty()) {
00365       part = new unsigned int[nodes.size()];
00366     }
00367 
00368     if (npes > 1) {
00369       unsigned int pCnt=0;
00370       for (unsigned int i=0; i< nodes.size(); i++) {
00371         if ( (nodes[i] >= vtkDist[pCnt]) && ( (pCnt == (npes-1)) 
00372               || ( nodes[i] < vtkDist[pCnt+1] ) || (vtkDist[pCnt+1] == rootNode) ) ) {
00373           part[i] = pCnt;
00374         } else {
00375           while ( (pCnt < (npes -1)) && (nodes[i] >= vtkDist[pCnt+1])
00376               && (vtkDist[pCnt+1] != rootNode)  ) {
00377             pCnt++;
00378           }//end while
00379           part[i] = pCnt;
00380         }//end if-else
00381       }//end for i
00382     }//end if np>1
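    // part[i] is now the rank whose splitter range [vtkDist[pCnt], vtkDist[pCnt+1])
    // contains nodes[i] (rootNode entries act as an open upper bound); since both
    // nodes and vtkDist are sorted, pCnt only moves forward and the whole assignment
    // is a single linear sweep.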
00383 
00384     vtkDist.clear();
00385     //_________________________________________________________________________
00386     // Now the partitions should be contiguous since the two lists are globally
00387     // sorted ... and it's simply a shift between the two.
00388     //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00389 
00390     //compute the total number of nodes being sent to each proc ...
00391     for (int i=0; i<npes; i++) {
00392       sendCnt[i]=0;
00393       recvCnt[i]=0;
00394     }
00395 
00396     if (npes > 1) {
00397       for (unsigned int i=0; i<nodes.size(); i++) {
00398         sendCnt[part[i]]++;
00399       }//end for i
00400     } else {
00401       sendCnt[0] += (nodes.size());
00402     }//end if-else
00403 
00404     if(part) {
00405       delete [] part;
00406       part = NULL;
00407     }
00408 
00409     // communicate with the other procs how many you will be sending and get how
00410     // many you will receive from whom.
00411 
00412     par::Mpi_Alltoall<int>( sendCnt, recvCnt, 1, comm);
00413 
00414     totalRecv=0;
00415     for (unsigned int i=0; i<npes; i++) {
00416       totalRecv += recvCnt[i];
00417     }//end for i
00418 
00419     sendOffsets[0] = 0;
00420     recvOffsets[0] = 0;
00421 
00422     // compute offsets ...
00423     for (int i=1; i<npes; i++) {
00424       sendOffsets[i] = sendOffsets[i-1] + sendCnt[i-1];
00425       recvOffsets[i] = recvOffsets[i-1] + recvCnt[i-1];
00426     }//end for i
00427 
00428     // Allocate for new array ...
00429     std::vector<ot::TreeNode > newNodes(totalRecv);
00430 
00431     // perform All2Allv
00432     ot::TreeNode* nodesPtr = NULL;
00433     ot::TreeNode* newNodesPtr = NULL;
00434     if(!nodes.empty()) {
00435       nodesPtr = &(*(nodes.begin()));
00436     }
00437     if(!newNodes.empty()) {
00438       newNodesPtr = &(*(newNodes.begin()));
00439     }
00440 
00441 #ifdef __MEASURE_BPART_COMM__
00442     unsigned int totalSendSize = nodes.size();
00443     unsigned int totalRecvSize = newNodes.size();
00444     unsigned int numProcsSendF = 0;
00445     unsigned int numProcsRecvF = 0;
00446     for(int i = 0; i < npes; i++) {
00447       if(sendCnt[i]) {
00448         numProcsSendF++;
00449       }
00450       if(recvCnt[i]) {
00451         numProcsRecvF++;
00452       }
00453     }
00454 #endif
00455 
00456     par::Mpi_Alltoallv_sparse<ot::TreeNode>( nodesPtr, sendCnt, sendOffsets,      
00457         newNodesPtr, recvCnt, recvOffsets, comm);
00458 
00459     // replace the local octants with the received ones ...
00460     nodes = newNodes;
00461     newNodes.clear();
00462 
00463     // clean up ...
00464     delete [] sendCnt;
00465     sendCnt = NULL;
00466 
00467     delete [] recvCnt;
00468     recvCnt = NULL;
00469 
00470     delete [] sendOffsets;
00471     sendOffsets = NULL;
00472 
00473     delete [] recvOffsets;
00474     recvOffsets = NULL;
00475 
00476 #ifdef __MEASURE_BPART_COMM__
00477     MPI_Barrier(comm);
00478     unsigned int* allTotalSendF = new unsigned int[npes];
00479     unsigned int* allTotalRecvF = new unsigned int[npes];
00480     unsigned int* allNumProcsSendF = new unsigned int[npes];
00481     unsigned int* allNumProcsRecvF = new unsigned int[npes]; 
00482     par::Mpi_Gather<unsigned int>(&totalSendSize, allTotalSendF, 1, 0, comm);
00483     par::Mpi_Gather<unsigned int>(&totalRecvSize, allTotalRecvF, 1, 0, comm);
00484     par::Mpi_Gather<unsigned int>(&numProcsSendF, allNumProcsSendF, 1, 0, comm);
00485     par::Mpi_Gather<unsigned int>(&numProcsRecvF, allNumProcsRecvF, 1, 0, comm);
00486     if(!rank) {
00487       for(int i = 0; i < npes; i++) {
00488         std::cout<<" allTotalSendF["<<i<<"] in Bpart: "<<allTotalSendF[i]<<std::endl;
00489         std::cout<<" allTotalRecvF["<<i<<"] in Bpart: "<<allTotalRecvF[i]<<std::endl;
00490         std::cout<<" allNumProcsSendF["<<i<<"] in Bpart: "<<allNumProcsSendF[i]<<std::endl;
00491         std::cout<<" allNumProcsRecvF["<<i<<"] in Bpart: "<<allNumProcsRecvF[i]<<std::endl;
00492       }
00493     }
00494     delete [] allTotalSendF;
00495     delete [] allTotalRecvF;
00496     delete [] allNumProcsSendF;
00497     delete [] allNumProcsRecvF;
00498     MPI_Barrier(comm);
00499 #endif
00500 
00501     PROF_BLKPART2_END
00502   } // end blockPartStage2_p2o
00503 
00504 
00505   //Assumes that nodes are globally sorted, linear, complete, unique.
00506   //Every processor must call this with a non-empty input.
00507   int blockPartStage1(std::vector<TreeNode> &nodes, std::vector<TreeNode> &blocks,
00508       unsigned int dim, unsigned int maxDepth,  MPI_Comm comm) {
00509 #ifdef __PROF_WITH_BARRIER__
00510     MPI_Barrier(comm);
00511 #endif
00512     PROF_BLKPART1_BEGIN
00513       int npes, rank;
00514     const double thFac = 0.5;
00515     MPI_Comm_rank(comm,&rank);
00516     MPI_Comm_size(comm,&npes);
00517 
00518     par::partitionW<ot::TreeNode>(nodes, NULL,comm);
00519 
00520     assert(nodes.size() > (1 << dim) ); 
00521 
00522 #ifdef __BLOCK_PART_EQUALS_MORTON_PART__
00523 
00524     blocks.clear();
00525 
00526     //include both the min and max elements as well  
00527     //The output will be sorted, unique and linear
00528     appendCompleteRegion(nodes[0], nodes[nodes.size()-1], blocks, true, true);
00529 
00530 #else
00531 
00532     // 1. First compute the localCoarse octree 
00533     std::vector<TreeNode> localCoarse;
00534     //include both the min and max elements as well  
00535     //The output will be sorted, unique and linear
00536     appendCompleteRegion(nodes[0], nodes[nodes.size()-1], localCoarse, true, true);
00537 
00538     // 2. Get local Blocks. These will be input to completeOctree that will
00539     // produce globalCoarse.
00540 
00541     std::vector<TreeNode> localBlocks;
00542 
00543     //Logic: Use bPartComparator to assign priorities and pick the blocks
00544     //with the highest priority. The number picked is determined by a percentage
00545     //(thFac) of the total local wt.
00546 
00547     for (unsigned int i = 0; i < localCoarse.size(); i++) {
00548       localCoarse[i].setWeight(0);
00549     }
00550 
00551     //descendants and chunks are both sorted at this point.
00552     unsigned int nextPt = 0;
00553     unsigned int nextNode = 0;
00554     //Every element in nodes is inside some element in localCoarse.
00555     while (nextPt < nodes.size()) {
00556       //The first pt. lies in some block.
00557 #ifdef __DEBUG_OCT__
00558       assert(areComparable(localCoarse[nextNode], nodes[nextPt]));
00559 #endif
00560       if ((localCoarse[nextNode].isAncestor(nodes[nextPt])) ||
00561           (localCoarse[nextNode] == nodes[nextPt])) {
00562         localCoarse[nextNode].addWeight(1);
00563         nextPt++;
00564       } else {
00565         nextNode++;
00566         if (nextNode == localCoarse.size()) {
00567           //If this fails then either the lists are not sorted
00568           //or there is some node which is not inside any block 
00569           assert(false);
00570         }
00571       }//end if-else
00572     }//end while
00573 
00574     sort(localCoarse.begin(), localCoarse.end(), ot::bPartComparator);
00575 
00576     long localWt = 0;
00577     unsigned int cnt = 0;
00578     while ( ( cnt < localCoarse.size() ) &&
00579         (localWt <= ( (long)( thFac*( (double)(nodes.size()) ) ) ) ) ) {
00580       localBlocks.push_back(localCoarse[cnt]);
00581       localWt += (long)(localCoarse[cnt].getWeight());
00582       cnt++;
00583     }//end while
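    //With thFac = 0.5, blocks are taken in priority order until they cover roughly
    //half of the local fine octants; only these "heavy" coarse cells survive as
    //candidate blocks for completeOctree.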
00584 
00585     localCoarse.clear();
00586 
00587     for (unsigned int i = 0; i < localBlocks.size(); i++) {
00588       localBlocks[i].setWeight(1);
00589     }
00590 
00591     //Sorting is necessary here since bPartComparator differs from the default ordering (<).
00592     sort(localBlocks.begin(), localBlocks.end());
00593 
00594     // 3. Call completeOctree on these cells to generate the
00595     //    globalCoarse octree ...
00596     //localBlocks will be sorted, linear and unique at this point 
00597     //localBlocks will not be empty on any processor
00598 
00599     completeOctree(localBlocks, blocks, dim, maxDepth, true, true, true, comm);
00600 
00601     localBlocks.clear();
00602 
00603 #endif
00604 
00605     PROF_BLKPART1_END
00606   } // end blockPartStage1
00607 
00608   int blockPartStage2(std::vector<TreeNode> &nodes, std::vector<TreeNode> &globalCoarse,
00609       std::vector<ot::TreeNode>& minsAllBlocks, unsigned int dim, unsigned int maxDepth,
00610       MPI_Comm comm) {
00611 #ifdef __PROF_WITH_BARRIER__
00612     MPI_Barrier(comm);
00613 #endif
00614     PROF_BLKPART2_BEGIN
00615 
00616       ot::TreeNode rootNode (dim,maxDepth);
00617 
00618     int npes, rank;
00619     MPI_Comm_rank(comm,&rank);
00620     MPI_Comm_size(comm,&npes);
00621 
00622 #ifdef __BLOCK_PART_EQUALS_MORTON_PART__
00623     //Just set minsAllBlocks. nodes and globalCoarse are already aligned.
00624     //We will not do weighted partitioning here.
00625 
00626     std::vector<TreeNode> vtkDist(npes);
00627 
00628     ot::TreeNode *sendMin = NULL;
00629     if (!globalCoarse.empty()) {
00630       sendMin = (ot::TreeNode *)&(*(globalCoarse.begin()));
00631     } else {
00632       sendMin = &(rootNode);
00633     }
00634 
00635     par::Mpi_Allgather<ot::TreeNode>(sendMin, &(* vtkDist.begin()), 1, comm);
00636 
00637     minsAllBlocks.clear();
00638     for(int j = 0; j < npes; j++) {
00639       if(vtkDist[j] != rootNode) {
00640         minsAllBlocks.push_back(vtkDist[j]);
00641       }
00642     }//end for j
00643 
00644 #else
00645 
00646     int *sendCnt = new int[npes];
00647     int *recvCnt = new int[npes];
00648     int *sendOffsets = new int[npes];
00649     int *recvOffsets = new int[npes];
00650 
00651     //1. Now compute the wts of these cells ...
00652     //    a. Get the min and max nodes at each processor.
00653     std::vector<TreeNode> _mins_maxs(2*npes);
00654 
00655     // communicate ...
00656     ot::TreeNode sendMinMax[2];
00657 
00658     if (!nodes.empty()) {
00659       sendMinMax[0] =  nodes[0];
00660       sendMinMax[1] =  nodes[nodes.size()-1];
00661     } else {
00662       sendMinMax[0] = rootNode;
00663       sendMinMax[1] = rootNode;
00664     }
00665 
00666     par::Mpi_Allgather<ot::TreeNode>(sendMinMax, &(*_mins_maxs.begin()), 2, comm);
00667 
00668     std::vector<std::vector<TreeNode> > sendNodes(npes);
00669     std::vector<std::vector<unsigned int> > keymap(npes);
00670 
00671     for (int i=0; i<npes; i++) {
00672       sendCnt[i] = 0;
00673     }
00674 
00675     //    b. Now compute which cells go to which processors ...
00676     //       The logic: if the coarse cell lies between the min and max at a
00677     //       processor, or if it is an ancestor of that min, then it is sent to
00678     //       that processor.
00679     //Naive Logic:
00680     for (unsigned int i=0; i<globalCoarse.size(); i++) {
00681       for (int p=0;p<npes;p++) {
00682 #ifdef __DEBUG_OCT__
00683         assert(areComparable(globalCoarse[i], _mins_maxs[2*p]));
00684 #endif
00685         if ( (globalCoarse[i].isAncestor(_mins_maxs[2*p])) ||
00686             ( (globalCoarse[i] >= _mins_maxs[2*p]) && (globalCoarse[i] <=_mins_maxs[(2*p)+1]) ) ) {
00687           sendNodes[p].push_back(globalCoarse[i]);
00688           // save keymap so that we can assign weights back to globalCoarse.
00689           keymap[p].push_back(i);    
00690           sendCnt[p]++;
00691         }//end if
00692       }//end for
00693     }//end for
00694 
00695     _mins_maxs.clear();
00696 
00697     //2. Send the coarse cells to the processors they overlap so the wts can be computed locally ...
00698 
00699     //    a. Communicate how many you'll be sending and how many will be
00700     //       received.
00701 
00702     // Now do an All2All to get the receive counts (recvCnt)
00703     par::Mpi_Alltoall<int>( sendCnt, recvCnt, 1, comm);
00704 
00705     //    b. Concatenate all the outgoing cells into one single C array ...
00706     unsigned int totalSend = 0;
00707     unsigned int totalRecv = 0;
00708 #ifdef __MEASURE_BPART_COMM__
00709     unsigned int numProcsSendI = 0;
00710     unsigned int numProcsRecvI = 0;
00711 #endif
00712     for (unsigned int i = 0; i < npes; i++) {
00713       totalSend+= sendCnt[i];
00714       totalRecv+= recvCnt[i];
00715 #ifdef __MEASURE_BPART_COMM__
00716       if(sendCnt[i]) {
00717         numProcsSendI++;
00718       }
00719       if(recvCnt[i]) {
00720         numProcsRecvI++;
00721       }
00722 #endif
00723     }
00724 
00725     // create the send and recv buffers ...
00726     std::vector<ot::TreeNode> sendK (totalSend);
00727     std::vector<ot::TreeNode> recvK (totalRecv);
00728 
00729     // Now create sendK
00730     sendOffsets[0] = 0;
00731     recvOffsets[0] = 0;
00732 
00733     // compute offsets ...
00734     for (int i=1; i<npes; i++) {
00735       sendOffsets[i] = sendOffsets[i-1] + sendCnt[i-1];
00736       recvOffsets[i] = recvOffsets[i-1] + recvCnt[i-1];
00737     }
00738 
00739     for (int i=0; i<npes; i++) {
00740       for (unsigned int j=0; j<sendCnt[i]; j++) {
00741         sendK[sendOffsets[i] + j] = sendNodes[i][j];
00742       }//end for j
00743     }//end for i
00744 
00745     //3. send and receive all keys ...
00746 
00747     ot::TreeNode* sendKptr = NULL;
00748     ot::TreeNode* recvKptr = NULL;
00749     if(!sendK.empty()) {
00750       sendKptr = &(*(sendK.begin()));
00751     }
00752     if(!recvK.empty()) {
00753       recvKptr = &(*(recvK.begin()));
00754     }
00755 
00756     par::Mpi_Alltoallv_sparse<ot::TreeNode>(sendKptr, sendCnt, sendOffsets,      
00757         recvKptr, recvCnt, recvOffsets, comm);
00758 
00759     sendK.clear();
00760 
00761 #ifdef __MEASURE_BPART_COMM__
00762     MPI_Barrier(comm);
00763     unsigned int* allTotalSendI = new unsigned int[npes];
00764     unsigned int* allTotalRecvI = new unsigned int[npes];
00765     unsigned int* allNumProcsSendI = new unsigned int[npes];
00766     unsigned int* allNumProcsRecvI = new unsigned int[npes]; 
00767     par::Mpi_Gather<unsigned int>(&totalSend, allTotalSendI, 1, 0, comm);
00768     par::Mpi_Gather<unsigned int>(&totalRecv, allTotalRecvI, 1, 0, comm);
00769     par::Mpi_Gather<unsigned int>(&numProcsSendI, allNumProcsSendI, 1, 0, comm);
00770     par::Mpi_Gather<unsigned int>(&numProcsRecvI, allNumProcsRecvI, 1, 0, comm);
00771     if(!rank) {
00772       for(int i = 0; i < npes; i++) {
00773         std::cout<<" allTotalSendI["<<i<<"] in Bpart: "<<allTotalSendI[i]<<std::endl;
00774         std::cout<<" allTotalRecvI["<<i<<"] in Bpart: "<<allTotalRecvI[i]<<std::endl;
00775         std::cout<<" allNumProcsSendI["<<i<<"] in Bpart: "<<allNumProcsSendI[i]<<std::endl;
00776         std::cout<<" allNumProcsRecvI["<<i<<"] in Bpart: "<<allNumProcsRecvI[i]<<std::endl;
00777       }
00778     }
00779     delete [] allTotalSendI;
00780     delete [] allTotalRecvI;
00781     delete [] allNumProcsSendI;
00782     delete [] allNumProcsRecvI;
00783     MPI_Barrier(comm);
00784 #endif
00785 
00786     //4. Now compute the wts of the locally received nodes ...
00787     //    a. loop through nodes and update the wts of the local chunks ...
00788     unsigned int *wts = NULL;
00789     if(totalRecv) {
00790       wts = new unsigned int [totalRecv];
00791     }
00792     for (unsigned int i=0; i<totalRecv; i++) {
00793       wts[i] = 0;
00794     }
00795 
00796     //descendants and chunks are both sorted at this point.
00797     unsigned int nextPt = 0;
00798     unsigned int nextNode = 0;
00799     //Every element in nodes is inside some element in recvK.
00800     while (nextPt < nodes.size()) {
00801       //The first pt. lies in some block.
00802 #ifdef __DEBUG_OCT__
00803       assert(areComparable(recvK[nextNode], nodes[nextPt]));
00804 #endif
00805       if ((recvK[nextNode].isAncestor(nodes[nextPt])) ||
00806           (recvK[nextNode] == nodes[nextPt])) {
00807         wts[nextNode]++;
00808         nextPt++;
00809       } else {
00810         nextNode++;
00811         if (nextNode == totalRecv) {
00812           //If this fails then either recvK and nodes are not sorted or
00813           //Some pt in nodes is not in any of recvK
00814           assert(false);
00815         }
00816       }//end if-else
00817     }//end while
00818 
00819     recvK.clear();
00820 
00821     //5. Now communicate the wts back to the procs ...
00822     unsigned int *recvWts = NULL;
00823     if(totalSend) {
00824       recvWts = new unsigned int[totalSend];
00825     }
00826 
00827     par::Mpi_Alltoallv_sparse<unsigned int>( wts, recvCnt, recvOffsets, 
00828         recvWts, sendCnt, sendOffsets, comm);
00829 
00830     //6. Now map them back to the blocks ...
00831     for (int i=0;i<globalCoarse.size();i++) {
00832       globalCoarse[i].setWeight(0);
00833     }
00834 
00835     for (int i=0; i<npes; i++) {
00836       for (int j=0; j<sendCnt[i]; j++) {
00837         globalCoarse[keymap[i][j]].addWeight(recvWts[sendOffsets[i] + j]);
00838       }
00839     }
00840 
00841     for (unsigned int i=0; i<npes; i++) {
00842       keymap[i].clear();
00843       sendNodes[i].clear();
00844     }
00845     sendNodes.clear();
00846     keymap.clear();
00847 
00848     if(recvWts) {
00849       delete [] recvWts;
00850       recvWts = NULL;
00851     }
00852 
00853     if(wts) {
00854       delete [] wts;
00855       wts = NULL;
00856     }
00857 
00858     par::partitionW<ot::TreeNode>(globalCoarse,getNodeWeight,comm);
00859 
00860     //Reset weights
00861     for (unsigned int i=0;i<globalCoarse.size(); i++) {
00862       globalCoarse[i].setWeight(1);
00863     }
00864 
00865     // Now communicate the nodes ...
00866 
00867     //7. Determine locally which keys to send to which proc ...
00868     //Compute Dist on globalCoarse....
00869 
00870     std::vector<TreeNode> vtkDist(npes);
00871 
00872     ot::TreeNode *sendMin = NULL;
00873     if (!globalCoarse.empty()) {
00874       sendMin = (ot::TreeNode *)&(*(globalCoarse.begin()));
00875     } else {
00876       sendMin = &(rootNode);
00877     }
00878 
00879     par::Mpi_Allgather<ot::TreeNode>(sendMin, &(* vtkDist.begin()), 1, comm);
00880 
00881     minsAllBlocks.clear();
00882     for(int j = 0; j < npes; j++) {
00883       if(vtkDist[j] != rootNode) {
00884         minsAllBlocks.push_back(vtkDist[j]);
00885       }
00886     }//end for j
00887 
00888     for (unsigned int j = 1; j < npes ; j++) {
00889       if (vtkDist[j] == rootNode) {
00890         vtkDist[j] = vtkDist[j-1];
00891       }
00892     }//end for j
00893 
00894     // correct dist ...
00895     if (npes>1) {
00896       if (vtkDist[npes-1] == vtkDist[npes-2]) {
00897         vtkDist[npes-1] = rootNode;
00898       }//end if
00899 
00900       for (unsigned int i=npes-2; i>0; i--) {
00901         if (vtkDist[i] == vtkDist[i-1]) {
00902           vtkDist[i] = vtkDist[i+1];
00903         }//end if
00904       }//end for
00905     }//end if npes > 1
00906 
00907     unsigned int *part = NULL;
00908     if(!nodes.empty()) {
00909       part = new unsigned int[nodes.size()];
00910     }
00911 
00912     if (npes > 1) {
00913       unsigned int pCnt=0;
00914       for (unsigned int i=0; i< nodes.size(); i++) {
00915         if ( (nodes[i] >= vtkDist[pCnt]) && ( (pCnt == (npes-1)) 
00916               || ( nodes[i] < vtkDist[pCnt+1] ) || (vtkDist[pCnt+1] == rootNode) ) ) {
00917           part[i] = pCnt;
00918         } else {
00919           while ( (pCnt < (npes -1)) && (nodes[i] >= vtkDist[pCnt+1])
00920               && (vtkDist[pCnt+1] != rootNode)  ) {
00921             pCnt++;
00922           }//end while
00923           part[i] = pCnt;
00924         }//end if-else
00925       }//end for i
00926     }//end if np>1
00927 
00928     vtkDist.clear();
00929     //_________________________________________________________________________
00930     // Now the partitions should be contiguous since the two lists are globally
00931     // sorted ... and it's simply a shift between the two.
00932     //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00933 
00934     //compute the total number of nodes being sent to each proc ...
00935     for (int i=0; i<npes; i++) {
00936       sendCnt[i]=0;
00937       recvCnt[i]=0;
00938     }
00939 
00940     if (npes > 1) {
00941       for (unsigned int i=0; i<nodes.size(); i++) {
00942         sendCnt[part[i]]++;
00943       }//end for i
00944     } else {
00945       sendCnt[0] += (nodes.size());
00946     }//end if-else
00947 
00948     if(part) {
00949       delete [] part;
00950       part = NULL;
00951     }
00952 
00953     // communicate with the other procs how many you will be sending and get how
00954     // many you will receive from whom.
00955 
00956     par::Mpi_Alltoall<int>( sendCnt, recvCnt, 1, comm);
00957 
00958     totalRecv=0;
00959     for (unsigned int i=0; i<npes; i++) {
00960       totalRecv += recvCnt[i];
00961     }//end for i
00962 
00963     sendOffsets[0] = 0;
00964     recvOffsets[0] = 0;
00965 
00966     // compute offsets ...
00967     for (int i=1; i<npes; i++) {
00968       sendOffsets[i] = sendOffsets[i-1] + sendCnt[i-1];
00969       recvOffsets[i] = recvOffsets[i-1] + recvCnt[i-1];
00970     }//end for i
00971 
00972     // Allocate for new array ...
00973     std::vector<ot::TreeNode > newNodes(totalRecv);
00974 
00975     // perform All2Allv
00976     ot::TreeNode* nodesPtr = NULL;
00977     ot::TreeNode* newNodesPtr = NULL;
00978     if(!nodes.empty()) {
00979       nodesPtr = &(*(nodes.begin()));
00980     }
00981     if(!newNodes.empty()) {
00982       newNodesPtr = &(*(newNodes.begin()));
00983     }
00984 
00985 #ifdef __MEASURE_BPART_COMM__
00986     unsigned int totalSendSize = nodes.size();
00987     unsigned int totalRecvSize = newNodes.size();
00988     unsigned int numProcsSendF = 0;
00989     unsigned int numProcsRecvF = 0;
00990     for(int i = 0; i < npes; i++) {
00991       if(sendCnt[i]) {
00992         numProcsSendF++;
00993       }
00994       if(recvCnt[i]) {
00995         numProcsRecvF++;
00996       }
00997     }
00998 #endif
00999 
01000     par::Mpi_Alltoallv_sparse<ot::TreeNode>( nodesPtr, sendCnt, sendOffsets,      
01001         newNodesPtr, recvCnt, recvOffsets, comm);
01002 
01003     // replace the local octants with the received ones ...
01004     nodes = newNodes;
01005     newNodes.clear();
01006 
01007     // clean up ...
01008     delete [] sendCnt;
01009     sendCnt = NULL;
01010 
01011     delete [] recvCnt;
01012     recvCnt = NULL;
01013 
01014     delete [] sendOffsets;
01015     sendOffsets = NULL;
01016 
01017     delete [] recvOffsets;
01018     recvOffsets = NULL;
01019 
01020 #ifdef __MEASURE_BPART_COMM__
01021     MPI_Barrier(comm);
01022     unsigned int* allTotalSendF = new unsigned int[npes];
01023     unsigned int* allTotalRecvF = new unsigned int[npes];
01024     unsigned int* allNumProcsSendF = new unsigned int[npes];
01025     unsigned int* allNumProcsRecvF = new unsigned int[npes]; 
01026     par::Mpi_Gather<unsigned int>(&totalSendSize, allTotalSendF, 1, 0, comm);
01027     par::Mpi_Gather<unsigned int>(&totalRecvSize, allTotalRecvF, 1, 0, comm);
01028     par::Mpi_Gather<unsigned int>(&numProcsSendF, allNumProcsSendF, 1, 0, comm);
01029     par::Mpi_Gather<unsigned int>(&numProcsRecvF, allNumProcsRecvF, 1, 0, comm);
01030     if(!rank) {
01031       for(int i = 0; i < npes; i++) {
01032         std::cout<<" allTotalSendF["<<i<<"] in Bpart: "<<allTotalSendF[i]<<std::endl;
01033         std::cout<<" allTotalRecvF["<<i<<"] in Bpart: "<<allTotalRecvF[i]<<std::endl;
01034         std::cout<<" allNumProcsSendF["<<i<<"] in Bpart: "<<allNumProcsSendF[i]<<std::endl;
01035         std::cout<<" allNumProcsRecvF["<<i<<"] in Bpart: "<<allNumProcsRecvF[i]<<std::endl;
01036       }
01037     }
01038     delete [] allTotalSendF;
01039     delete [] allTotalRecvF;
01040     delete [] allNumProcsSendF;
01041     delete [] allNumProcsRecvF;
01042     MPI_Barrier(comm);
01043 #endif
01044 
01045 #endif
01046 
01047     PROF_BLKPART2_END
01048   } // end blockPartStage2
01049 
01050 }//end namespace
01051 
01052 
