#include "mpi.h"
#include <vector>
#include "dendro.h"
#include "parUtils.txx"
Go to the source code of this file.
Namespaces | |
namespace | par |
Defines | |
#define | KEEP_HIGH 100 |
#define | KEEP_LOW 101 |
#define | PROF_A2AV_WAIT_BEGIN |
#define | PROF_A2AV_WAIT_END |
#define | PROF_PAR_ALL2ALL_BEGIN |
#define | PROF_PAR_ALL2ALL_END return 1; |
#define | PROF_PAR_ALL2ALLV_DENSE_BEGIN |
#define | PROF_PAR_ALL2ALLV_DENSE_END return 1; |
#define | PROF_PAR_ALL2ALLV_SPARSE_BEGIN |
#define | PROF_PAR_ALL2ALLV_SPARSE_END return 1; |
#define | PROF_PAR_ALLGATHER_BEGIN |
#define | PROF_PAR_ALLGATHER_END return 1; |
#define | PROF_PAR_ALLGATHERV_BEGIN |
#define | PROF_PAR_ALLGATHERV_END return 1; |
#define | PROF_PAR_ALLREDUCE_BEGIN |
#define | PROF_PAR_ALLREDUCE_END return 1; |
#define | PROF_PAR_BCAST_BEGIN |
#define | PROF_PAR_BCAST_END return 1; |
#define | PROF_PAR_CONCAT_BEGIN |
#define | PROF_PAR_CONCAT_END return 1; |
#define | PROF_PAR_GATHER_BEGIN |
#define | PROF_PAR_GATHER_END return 1; |
#define | PROF_PAR_REDUCE_BEGIN |
#define | PROF_PAR_REDUCE_END return 1; |
#define | PROF_PAR_SCAN_BEGIN |
#define | PROF_PAR_SCAN_END return 1; |
#define | PROF_PAR_SCATTER_BEGIN |
#define | PROF_PAR_SCATTER_END return 1; |
#define | PROF_PAR_SENDRECV_BEGIN |
#define | PROF_PAR_SENDRECV_END return 1; |
#define | PROF_PARTW_BEGIN |
#define | PROF_PARTW_END return 1; |
#define | PROF_REMDUP_BEGIN |
#define | PROF_REMDUP_END return 1; |
#define | PROF_SEARCH_BEGIN |
#define | PROF_SEARCH_END return 1; |
#define | PROF_SORT_BEGIN |
#define | PROF_SORT_END return 1; |
#define | PROF_SPLIT_COMM_2WAY_BEGIN |
#define | PROF_SPLIT_COMM_2WAY_END return 1; |
#define | PROF_SPLIT_COMM_BEGIN |
#define | PROF_SPLIT_COMM_END return 1; |
Functions | |
template<typename T> | |
void | bitonicSort (std::vector< T > &in, MPI_Comm comm) |
An implementation of parallel bitonic sort that does not expect the number of processors to be a power of 2. In fact, the number of processors can even be odd. Moreover, we do not even expect the length of the vector (neither locally nor globally) to be a power of 2 or even. Moreover, each processor can call this with a different number of elements. However, we do expect that 'in' has at least 1 element on each processor. This recursively calls the function bitonicSort_binary, followed by a special parallel merge. | |
template<typename T> | |
void | bitonicSort_binary (std::vector< T > &in, MPI_Comm comm) |
An implementation of parallel bitonic sort that expects the number of processors to be a power of 2. However, unlike most implementations, we do not expect the length of the vector (neither locally nor globally) to be a power of 2 or even. Moreover, each processor can call this with a different number of elements. However, we do expect that 'in' has at least 1 element on each processor. | |
template<typename T> | |
int | concatenate (std::vector< T > &listA, std::vector< T > &listB, MPI_Comm comm) |
A parallel concatenation function. listB is appended (globally) to listA and the result is stored in listA. A useful application of this function is when listA and listB are individually sorted (globally) and the smallest element in listB is greater than the largest element in listA and we want to create a merged list that is sorted. | |
template<typename T> | |
unsigned int | defaultWeight (const T *a) |
template<typename T> | |
int | maxLowerBound (const std::vector< T > &keys, const std::vector< T > &searchList, std::vector< T > &results, MPI_Comm comm) |
A parallel search function. | |
template<typename T> | |
void | MergeLists (std::vector< T > &listA, std::vector< T > &listB, int KEEP_WHAT) |
Merges lists A, and B, retaining either the low or the High in list A. | |
template<typename T> | |
void | MergeSplit (std::vector< T > &local_list, int which_keys, int partner, MPI_Comm comm) |
The main operation in the parallel bitonic sort algorithm. This implements the compare-split operation. | |
template<typename T> | |
int | Mpi_Allgather (T *sendbuf, T *recvbuf, int count, MPI_Comm comm) |
template<typename T> | |
int | Mpi_Allgatherv (T *sendbuf, int sendcount, T *recvbuf, int *recvcounts, int *displs, MPI_Comm comm) |
template<typename T> | |
int | Mpi_Allreduce (T *sendbuf, T *recvbuf, int count, MPI_Op op, MPI_Comm comm) |
template<typename T> | |
int | Mpi_Alltoall (T *sendbuf, T *recvbuf, int count, MPI_Comm comm) |
template<typename T> | |
int | Mpi_Alltoallv_dense (T *sendbuf, int *sendcnts, int *sdispls, T *recvbuf, int *recvcnts, int *rdispls, MPI_Comm comm) |
template<typename T> | |
int | Mpi_Alltoallv_sparse (T *sendbuf, int *sendcnts, int *sdispls, T *recvbuf, int *recvcnts, int *rdispls, MPI_Comm comm) |
template<typename T> | |
int | Mpi_Bcast (T *buffer, int count, int root, MPI_Comm comm) |
template<typename T> | |
int | Mpi_Gather (T *sendBuffer, T *recvBuffer, int count, int root, MPI_Comm comm) |
template<typename T> | |
int | Mpi_Irecv (T *buf, int count, int source, int tag, MPI_Comm comm, MPI_Request *request) |
template<typename T> | |
int | Mpi_Issend (T *buf, int count, int dest, int tag, MPI_Comm comm, MPI_Request *request) |
template<typename T> | |
int | Mpi_Recv (T *buf, int count, int source, int tag, MPI_Comm comm, MPI_Status *status) |
template<typename T> | |
int | Mpi_Reduce (T *sendbuf, T *recvbuf, int count, MPI_Op op, int root, MPI_Comm comm) |
template<typename T> | |
int | Mpi_Scan (T *sendbuf, T *recvbuf, int count, MPI_Op op, MPI_Comm comm) |
template<typename T, typename S> | |
int | Mpi_Sendrecv (T *sendBuf, int sendCount, int dest, int sendTag, S *recvBuf, int recvCount, int source, int recvTag, MPI_Comm comm, MPI_Status *status) |
template<typename T> | |
void | Par_bitonic_merge_incr (std::vector< T > &local_list, int proc_set_size, MPI_Comm comm) |
template<typename T> | |
void | Par_bitonic_sort_decr (std::vector< T > &local_list, int proc_set_size, MPI_Comm comm) |
template<typename T> | |
void | Par_bitonic_sort_incr (std::vector< T > &local_list, int proc_set_size, MPI_Comm comm) |
template<typename T> | |
int | partitionW (std::vector< T > &vec, unsigned int(*getWeight)(const T *), MPI_Comm comm) |
A parallel weighted partitioning function. In our implementation, we do not pose any restriction on the input or the number of processors. This function can be used with an odd number of processors as well. Some processors can pass an empty vector as input. The relative ordering of the elements is preserved. | |
template<typename T> | |
int | removeDuplicates (std::vector< T > &nodes, bool isSorted, MPI_Comm comm) |
Removes duplicates in parallel. If the input is not sorted, sample sort will be called within the function to sort the vector and then duplicates will be removed. | |
template<typename T> | |
int | sampleSort (std::vector< T > &in, std::vector< T > &out, MPI_Comm comm) |
A parallel sample sort implementation. In our implementation, we do not pose any restriction on the input or the number of processors. This function can be used with an odd number of processors as well. Some processors can pass an empty vector as input. If the total number of elements in the vector (globally) is fewer than 10*p^2, where p is the number of processors, then we will use bitonic sort instead of sample sort to sort the vector. We use a parallel bitonic sort to sort the samples in the sample sort algorithm. Hence, the complexity of the algorithm is O(n/p log n/p) + O(p log p). Here, n is the global length of the vector and p is the number of processors. | |
template<typename T> | |
int | scatterValues (std::vector< T > &in, std::vector< T > &out, DendroIntL outSz, MPI_Comm comm) |
Re-distributes an STL vector, preserving the relative ordering of the elements. |
Hari Sundar, hsundar@gmail.com
Shravan Veerapaneni, shravan@seas.upenn.edu
Santi Swaroop Adavani, santis@gmail.com
Definition in file parUtils.h.
|
Definition at line 14 of file parUtils.h. |
|
Definition at line 15 of file parUtils.h. |
|
Definition at line 192 of file parUtils.h. Referenced by par::Mpi_Alltoallv_sparse(). |
|
Definition at line 213 of file parUtils.h. Referenced by par::Mpi_Alltoallv_sparse(). |
|
Definition at line 203 of file parUtils.h. Referenced by par::Mpi_Alltoall(). |
|
Definition at line 224 of file parUtils.h. Referenced by par::Mpi_Alltoall(). |
|
Definition at line 207 of file parUtils.h. Referenced by par::Mpi_Alltoallv_dense(). |
|
Definition at line 228 of file parUtils.h. Referenced by par::Mpi_Alltoallv_dense(). |
|
Definition at line 206 of file parUtils.h. Referenced by par::Mpi_Alltoallv_sparse(). |
|
Definition at line 227 of file parUtils.h. Referenced by par::Mpi_Alltoallv_sparse(). |
|
Definition at line 205 of file parUtils.h. Referenced by par::Mpi_Allgather(). |
|
Definition at line 226 of file parUtils.h. Referenced by par::Mpi_Allgather(). |
|
Definition at line 204 of file parUtils.h. Referenced by par::Mpi_Allgatherv(). |
|
Definition at line 225 of file parUtils.h. Referenced by par::Mpi_Allgatherv(). |
|
Definition at line 202 of file parUtils.h. Referenced by par::Mpi_Allreduce(). |
|
Definition at line 223 of file parUtils.h. Referenced by par::Mpi_Allreduce(). |
|
Definition at line 198 of file parUtils.h. Referenced by par::Mpi_Bcast(). |
|
Definition at line 219 of file parUtils.h. Referenced by par::Mpi_Bcast(). |
|
Definition at line 208 of file parUtils.h. Referenced by par::concatenate(). |
|
Definition at line 229 of file parUtils.h. Referenced by par::concatenate(). |
|
Definition at line 199 of file parUtils.h. Referenced by par::Mpi_Gather(). |
|
Definition at line 220 of file parUtils.h. Referenced by par::Mpi_Gather(). |
|
Definition at line 201 of file parUtils.h. Referenced by par::Mpi_Reduce(). |
|
Definition at line 222 of file parUtils.h. Referenced by par::Mpi_Reduce(). |
|
Definition at line 200 of file parUtils.h. Referenced by par::Mpi_Scan(). |
|
Definition at line 221 of file parUtils.h. Referenced by par::Mpi_Scan(). |
|
Definition at line 196 of file parUtils.h. Referenced by par::scatterValues(). |
|
Definition at line 217 of file parUtils.h. Referenced by par::scatterValues(). |
|
Definition at line 197 of file parUtils.h. Referenced by par::Mpi_Sendrecv(). |
|
Definition at line 218 of file parUtils.h. Referenced by par::Mpi_Sendrecv(). |
|
Definition at line 211 of file parUtils.h. Referenced by par::partitionW(). |
|
Definition at line 232 of file parUtils.h. Referenced by par::partitionW(). |
|
Definition at line 210 of file parUtils.h. Referenced by par::removeDuplicates(). |
|
Definition at line 231 of file parUtils.h. Referenced by par::removeDuplicates(). |
|
Definition at line 195 of file parUtils.h. Referenced by par::maxLowerBound(). |
|
Definition at line 216 of file parUtils.h. Referenced by par::maxLowerBound(). |
|
Definition at line 209 of file parUtils.h. Referenced by par::sampleSort(). |
|
Definition at line 230 of file parUtils.h. Referenced by par::sampleSort(). |
|
Definition at line 193 of file parUtils.h. Referenced by par::splitComm2way(). |
|
Definition at line 214 of file parUtils.h. Referenced by par::splitComm2way(). |
|
Definition at line 194 of file parUtils.h. Referenced by par::splitCommUsingSplittingRank(). |
|
Definition at line 215 of file parUtils.h. Referenced by par::splitCommUsingSplittingRank(). |
|
An implementation of parallel bitonic sort that does not expect the number of processors to be a power of 2. In fact, the number of processors can even be odd. Moreover, we do not even expect the length of the vector (neither locally nor globally) to be a power of 2 or even. Moreover, each processor can call this with a different number of elements. However, we do expect that 'in' has at least 1 element on each processor. This recursively calls the function bitonicSort_binary, followed by a special parallel merge.
Definition at line 1623 of file parUtils.txx. References par::Par_bitonic_merge_incr(), sort(), par::splitCommBinary(), and par::splitCommBinaryNoFlip(). |
|
An implementation of parallel bitonic sort that expects the number of processors to be a power of 2. However, unlike most implementations, we do not expect the length of the vector (neither locally nor globally) to be a power of 2 or even. Moreover, each processor can call this with a different number of elements. However, we do expect that 'in' has at least 1 element on each processor.
Definition at line 1593 of file parUtils.txx. |
|
A parallel concatenation function. listB is appended (globally) to listA and the result is stored in listA. A useful application of this function is when listA and listB are individually sorted (globally) and the smallest element in listB is greater than the largest element in listA and we want to create a merged list that is sorted.
Definition at line 547 of file parUtils.txx. References DendroIntL, PROF_PAR_CONCAT_BEGIN, and PROF_PAR_CONCAT_END. |
|
Definition at line 393 of file parUtils.txx. |
|
A parallel search function.
Definition at line 764 of file parUtils.txx. References PROF_SEARCH_BEGIN, and PROF_SEARCH_END. |
|
Merges lists A, and B, retaining either the low or the High in list A.
The reason for this is that, on the KEEP_LOW side, max(min(A), min(B)) > min(A) > max(A-) and similarly on the KEEP_HIGH side, min(max(A), max(B)) < max(A) < min(A+) which guarantees that the merged lists remain bitonic. Definition at line 1673 of file parUtils.txx. |
|
The main operation in the parallel bitonic sort algorithm. This implements the compare-split operation.
Definition at line 1476 of file parUtils.txx. |
|
Definition at line 196 of file parUtils.txx. References PROF_PAR_ALLGATHER_BEGIN, and PROF_PAR_ALLGATHER_END. |
|
Definition at line 147 of file parUtils.txx. References PROF_PAR_ALLGATHERV_BEGIN, and PROF_PAR_ALLGATHERV_END. |
|
Definition at line 85 of file parUtils.txx. References PROF_PAR_ALLREDUCE_BEGIN, and PROF_PAR_ALLREDUCE_END. |
|
Definition at line 97 of file parUtils.txx. References PROF_PAR_ALL2ALL_BEGIN, and PROF_PAR_ALL2ALL_END. Referenced by ot::flagNodesType2(). |
|
Definition at line 318 of file parUtils.txx. References PROF_PAR_ALL2ALLV_DENSE_BEGIN, and PROF_PAR_ALL2ALLV_DENSE_END. |
|
Definition at line 226 of file parUtils.txx. References PROF_A2AV_WAIT_BEGIN, PROF_A2AV_WAIT_END, PROF_PAR_ALL2ALLV_SPARSE_BEGIN, and PROF_PAR_ALL2ALLV_SPARSE_END. |
|
Definition at line 123 of file parUtils.txx. References PROF_PAR_BCAST_BEGIN, and PROF_PAR_BCAST_END. |
|
Definition at line 110 of file parUtils.txx. References PROF_PAR_GATHER_BEGIN, and PROF_PAR_GATHER_END. |
|
Definition at line 50 of file parUtils.txx. |
|
Definition at line 28 of file parUtils.txx. |
|
Definition at line 39 of file parUtils.txx. |
|
Definition at line 135 of file parUtils.txx. References PROF_PAR_REDUCE_BEGIN, and PROF_PAR_REDUCE_END. |
|
Definition at line 73 of file parUtils.txx. References PROF_PAR_SCAN_BEGIN, and PROF_PAR_SCAN_END. |
|
Definition at line 61 of file parUtils.txx. References PROF_PAR_SENDRECV_BEGIN, and PROF_PAR_SENDRECV_END. |
|
Definition at line 1571 of file parUtils.txx. References binOp::getPrevHighestPowerOfTwo(). Referenced by par::bitonicSort(). |
|
Definition at line 1539 of file parUtils.txx. |
|
Definition at line 1507 of file parUtils.txx. |
|
A parallel weighted partitioning function. In our implementation, we do not pose any restriction on the input or the number of processors. This function can be used with an odd number of processors as well. Some processors can pass an empty vector as input. The relative ordering of the elements is preserved.
Definition at line 930 of file parUtils.txx. References DendroIntL, PROF_PARTW_BEGIN, and PROF_PARTW_END. |
|
Removes duplicates in parallel. If the input is not sorted, sample sort will be called within the function to sort the vector and then duplicates will be removed.
Definition at line 1174 of file parUtils.txx. References PROF_REMDUP_BEGIN, PROF_REMDUP_END, and par::splitComm2way(). |
|
A parallel sample sort implementation. In our implementation, we do not pose any restriction on the input or the number of processors. This function can be used with an odd number of processors as well. Some processors can pass an empty vector as input. If the total number of elements in the vector (globally) is fewer than 10*p^2, where p is the number of processors, then we will use bitonic sort instead of sample sort to sort the vector. We use a parallel bitonic sort to sort the samples in the sample sort algorithm. Hence, the complexity of the algorithm is O(n/p log n/p) + O(p log p). Here, n is the global length of the vector and p is the number of processors.
Definition at line 1282 of file parUtils.txx. References DendroIntL, PROF_SORT_BEGIN, PROF_SORT_END, sort(), and par::splitCommUsingSplittingRank(). |
|
Re-distributes an STL vector, preserving the relative ordering of the elements.
Definition at line 398 of file parUtils.txx. References DendroIntL, PROF_PAR_SCATTER_BEGIN, and PROF_PAR_SCATTER_END. Referenced by ot::prolongMatVecType2(), and ot::restrictMatVecType2(). |