Parallel STL

C++17: параллельная версия стандартных
алгоритмов
Евгений Крутько
e.s.krutko@gmail.com
Национальный исследовательский центр "Курчатовский институт"
25.02.2017

Эволюция многопоточной работы в C++
Нам нужно выполнить работу асинхронно
1 int main(){
2 //Here we should use some features to call doWork in parallel
3 doWork(int a);
4 doSomeOtherWork(double t);
5 //Ensure doWork completed
6 return 0;
7 }

Давным-давно...
... у нас не было даже стандарта C++11.
1 //Using POSIX API
2 void* makeDoWork(void *arg) {
3 int *a=reinterpret_cast<int*>(arg);
4 doWork(*a);
5 }
6 int main(){
7 pthread_t thread;
8 int value = 42;
9 pthread_create(
10 &thread,
11 nullptr,
12 makeDoWork,
13 &value);
15 pthread_join(thread, nullptr);
16 return 0;
17 }

Давным-давно...
... у нас не было даже стандарта C++11.
1 //Using Win API
2 DWORD WINAPI makeDoWork(void *arg) {
3 int *a=reinterpret_cast<int*>(arg);
4 doWork(*a);
5 }
6 int main(){
7 HANDLE thread;
8 int value = 42;
9 thread = CreateThread(
10 NULL,
11 0,
12 makeDoWork,
13 &value,
14 0,
15 NULL
16 );
18 WaitForSingleObject(thread, INFINITE);
19 return 0;
20 }

Кросс-платформенный подход
И другие

Начиная со стандарта C++11
std::thread
std::async / std::future
1 int main(int argc, char**argv) {
2 std::thread thread;
3 int value;
4 //Do doWork() in new thread
5 thread = std::thread(
6 doWork,
7 std::ref(value));
8 //Do something else in this thread
9 doSomeOtherWork();
10 //Wait for doWork() to finish
11 thread.join();
12 return 0;
13 }

Начиная со стандарта C++11
std::thread
std::async / std::future
1 int main(int argc, char**argv) {
2 int value;
3 //Do doWork() in new thread
4 auto future = std::async(
5 std::launch::async,
6 doWork,
7 std::ref(value));
8 //Do something else in this thread
9 doSomeOtherWork();
10 //Wait for doWork() to finish
11 future.get();
12 return 0;
13 }

Нам и этого мало, хочется еще проще
Возможно, когда-нибудь :)
1 auto auto(auto auto) { auto; }

Нам и этого мало, хочется еще проще
Возможно, когда-нибудь :)
1 auto auto(auto auto) { auto; }
Уже /почти/ в стандарте
1 //Something from <algorithm>
2 std::some_standard_algorythm_with_stl_containers(
3 std::begin(container),
4 std::end(container)
5 );
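6 //The same but with specification of execution policy
7 std::some_standard_algorythm_with_stl_containers(
8 ExecutionPolicy policy,
9 std::begin(container),
10 std::end(container)
11 );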

Алгоритмы стандартной библиотеки, доступные в
параллельном режиме
Объявлены в заголовочном файле <algorithm>
Пока в стадии TS в заголовочном файле
<experimental/algorithm>
adjacent_difference adjacent_find all_of any_of
copy copy_if copy_n count
count_if equal exclusive_scan fill
fill_n find find_end find_first_of
find_if find_if_not for_each for_each_n
generate generate_n includes inclusive_scan
inner_product inplace_merge is_heap is_heap_until
is_partitioned is_sorted is_sorted_until lexicographical_compare
max_element merge min_element minmax_element
mismatch move none_of nth_element
partial_sort partial_sort_copy partition partition_copy
reduce remove remove_copy remove_copy_if
remove_if replace replace_copy replace_copy_if
replace_if reverse reverse_copy rotate
rotate_copy search search_n set_difference
set_intersection set_symmetric_difference set_union sort
stable_partition stable_sort swap_ranges transform
transform_exclusive_scan transform_inclusive_scan transform_reduce uninitialized_copy
uninitialized_copy_n uninitialized_fill uninitialized_fill_n unique
unique_copy

Политики выполнения
1 //Available from <execution_policy>
2 //While in TS stage from <experimental/execution_policy>
3
4 //Plain old sequenced execution
5 constexpr sequential_execution_policy seq{ };
6 //Parallel execution
7 constexpr parallel_execution_policy par{ };
8 //Parallel with SIMD instructions
9 constexpr parallel_vector_execution_policy par_vec{ };

Политики выполнения
1 //Available from <execution_policy>
2 //While in TS stage from <experimental/execution_policy>
3
4 //Plain old sequenced execution
5 constexpr sequential_execution_policy seq{ };
6 //Parallel execution
7 constexpr parallel_execution_policy par{ };
8 //Parallel with SIMD instructions
9 constexpr parallel_vector_execution_policy par_vec{ };
Стрелять себе по ногам теперь можно еще одним изящным
способом
1 int a[] = {0,1};
2 std::vector<int> v;
3 std::for_each(std::par,
4 std::begin(a),
5 std::end(a),
6 [&](int i) {
7 v.push_back(i*2+1); // Error: data race
8 });

Дайте две!
Как же попробовать? Из документа
https://isocpp.org/files/papers/P0024R2.html
Microsoft MS ParallelSTL page
HPX HPX github
Codeplay Sycl github
HSA HSA for math science page
Thibaut Lutz github
NVIDIA github
http://github.com/eskrut/ParallelSTL.git

Тестовая задача #1
Обычная реализация
1 auto vec = makeShuffledVector();
2 double baseDuration = 0;
3
4 auto vecToSort = vec;
5 {
6 Stopwatch sw("plain sort");
7 std::sort(std::begin(vecToSort), std::end(vecToSort));
8 baseDuration = sw.duration();
9 }
10 if (! std::is_sorted(std::begin(vecToSort), std::end(vecToSort)))
11 throw std::runtime_error("Failed with plain sort");

С последовательной политикой
1 //These includes should come from ${T_LUTZ_ROOT}/include
2 #include <numeric>
3 #include <experimental/algorithm>
4
5 /* --- */
6
7 vecToSort = vec;
8 {
9 Stopwatch sw("seq sort", baseDuration);
10 sort(std::experimental::parallel::seq,
11 std::begin(vecToSort), std::end(vecToSort));
12 }

С параллельной политикой
1 //These includes should come from ${T_LUTZ_ROOT}/include
2 #include <numeric>
3 #include <experimental/algorithm>
4
5 /* --- */
6
7 vecToSort = vec;
8 {
9 Stopwatch sw("par sort", baseDuration);
10 sort(std::experimental::parallel::par,
12 }

С последовательной политикой
1 #include "hpx/hpx_init.hpp"
2 #include "hpx/hpx.hpp"
3
4 #include "hpx/parallel/numeric.hpp"
5 #include "hpx/parallel/algorithm.hpp"
6
7 /* --- */
8
9 vecToSort = vec;
10 {
11 Stopwatch sw("seq sort", baseDuration);
12 hpx::parallel::sort(hpx::parallel::seq,
14 }
16 throw std::runtime_error("Failed with seq sort");

С параллельной политикой
3
6
7 /* --- */
8
9 vecToSort = vec;
10 {
11 Stopwatch sw("par sort", baseDuration);
12 hpx::parallel::sort(hpx::parallel::par,
14 }
16 throw std::runtime_error("Failed with par sort");

С параллельной и векторизованной политикой
3
6
7 /* --- */
8
9 vecToSort = vec;
10 {
11 Stopwatch sw("par_vec sort", baseDuration);
12 hpx::parallel::sort(hpx::parallel::par_vec,
14 }
16 throw std::runtime_error("Failed with par_vec sort");

Обычная реализация
1 const size_t numParts = std::thread::hardware_concurrency()*2 > 0 ?
2 std::thread::hardware_concurrency() * 2 : 8;
3 std::list<std::vector<size_t>> listOfVecs;
4 listOfVecs.resize(numParts);
5 std::generate(listOfVecs.begin(),
6 listOfVecs.end(),
7 [&](){ return makeShuffledVector(memoryToAlloc/numParts);}
8 );
9 double baseDuration = 0;
10 auto list = listOfVecs;
11 {
12 Stopwatch sw("plain for, plain sort");
13 for(auto &vecToSort : list)
15 baseDuration = sw.duration();
16 }
17 for(const auto &vecToSort : list)
19 throw std::runtime_error("Failed with plain for, plain sort");

Последовательный for, параллельный sort
4
5 /* --- */
6
9 std::begin(vecToSort),
10 std::end(vecToSort));

Параллельный for, последовательный sort
1 for_each(std::experimental::parallel::par,
2 std::begin(list), std::end(list),
3 [](std::vector<size_t> &vecToSort){
5 });
6
7 /* --- */
8
9 hpx::parallel::for_each(hpx::parallel::par,
13 });

Параллельный for, параллельный sort
[параллельный в квадрате, квадратно параллельный]
6 });
7
8 /* --- */
9
10 hpx::parallel::for_each(hpx::parallel::par,
15 });

Результаты
[2 физических ядра, 4 логических (Hyper-Threading)]
Тест #1
Реализация Политика Ускорение
t-lutz
seq 1
par 2.37
HPX
seq 0.99
par 2.61
par_vec 2.64
Тест #2
Реализация Политика for Политика sort Ускорение
t-lutz
seq par 2.26
par seq 2.78
par par 2.48
HPX
seq par 2.52
par seq 2.75
par par 2.87

Race!
Догнать и перегнать!
3
4 /* --- */
5

Race!
1 auto work=[](decltype(list.end()) begin,decltype(list.end()) end){
2 for(auto it = begin; it != end; ++it)
3 std::sort(it->begin(), it->end());
4 };
5 size_t hc = std::thread::hardware_concurrency();
6 if(hc == 0) hc = 8;
7 auto numThreads = std::min(list.size(), hc);
8 auto chunkPerThread = list.size() / numThreads;
9 auto threadBegin = list.begin();
10 auto threadEnd = threadBegin;
11 std::advance(threadEnd, chunkPerThread);
12 std::list<std::future<void>> futures;
13 for(size_t thId = 0; thId < numThreads - 1; ++thId){
14 futures.push_back(std::async(std::launch::async,
15 work, threadBegin, threadEnd));
16 threadBegin = threadEnd;
17 std::advance(threadEnd, chunkPerThread);
18 }
19 work(threadBegin, list.end());
20 for(auto &f : futures) f.get();

И победителем стал...
Тест #2
Реализация Политика Ускорение Количество строк
Parallel STL par/par ∼2.5 2
mine async/seq ∼2.5 20

Динамические политики
1 std::experimental::parallel::execution_policy outerPolicy =
2 std::experimental::parallel::seq;
3 std::experimental::parallel::execution_policy innerPolicy =
4 std::experimental::parallel::par;
5
6 /* Decide at runtime which policy should be */
7 outerPolicy = std::experimental::parallel::par;
8 innerPolicy = std::experimental::parallel::par_vec;
9
10 for_each(outerPolicy,
12 [&innerPolicy](std::vector<size_t> &vecToSort){
13 sort(innerPolicy,
14 std::begin(vecToSort),
15 std::end(vecToSort));
16 });

Если что-то пошло не так
1 struct Processor
2 {
3 void doWork(size_t id) {
4 throw std::runtime_error("ò_ó from " + std::to_string(id));
5 }
6 };
7
8 std::vector<Processor> processors;
9 processors.resize(25);
10
11 try {
12 std::cout << "Plain for" << std::endl;
13 std::for_each(processors.begin(), processors.end(),
14 [&](Processor &p){
15 p.doWork(&p - processors.data());
16 });
17 }
18 catch(std::exception &e) {
19 std::cout << e.what() << std::endl;
20 }

Ловим исключения
1 try {
2 std::cout << "Seq for" << std::endl;
3 hpx::parallel::for_each(hpx::parallel::seq,
4 processors.begin(), processors.end(),
5 [&](Processor &p){
6 p.doWork(&p - processors.data());
7 });
8 }
9 catch(hpx::parallel::exception_list &list) {
10 std::cout << "Exception list what: " << list.what() << std::endl;
11 std::cout << "Total " << list.size() << " exceptions:" << std::
endl;
12 for(auto &e : list) {
13 try{ boost::rethrow_exception(e); }
14 catch(std::exception &e){
15 std::cout << "\twhat: " << e.what() << std::endl;
16 }
17 }
18 }
19 catch(std::exception &e){
20 std::cout << e.what() << std::endl;
21 }

Ловим исключения
Plain for
ò_ó from 0
Seq for
Exception list what: HPX(unknown_error)
Total 1 exceptions:
what: HPX(unknown_error)
Par for
Exception list what: HPX(unknown_error)
Total 13 exceptions:
what: ò_ó from 0
what: ò_ó from 2
what: ò_ó from 4
what: ò_ó from 6
what: ò_ó from 8
what: ò_ó from 10

На десерт
Parallelism Extension for STL еще не реализовано в
компиляторах.
1 #include <algorithm>
2
6 );
Но если используется gcc, то...

На десерт
Parallelism Extension for STL еще не реализовано в
компиляторах.
1 #include <algorithm>
2
6 );
Но если используется gcc, то...
1 //OpenMP should be enabled
2
3 #include <parallel/algorithm>
4
5 std::__parallel::some_standard_algorythm_with_stl_containers(
8 );

Те же самые примеры
1 std::__parallel::sort(std::begin(vecToSort), std::end(vecToSort));
2
3 std::__parallel::for_each(std::begin(list), std::end(list),
5 std::__parallel::sort(std::begin(vecToSort), std::end(
vecToSort));
6 });
7
8 #pragma omp parallel
9 {
10 #pragma omp single
11 {
12 for(auto it = list.begin(); it != list.end(); ++it)
13 #pragma omp task
14 sort(it->begin(), it->end());
15 }
16 }

Результаты
Тест #2
Реализация Политика Ускорение Количество строк
Parallel STL par/par ∼2.5 2
mine (std::async) async/seq ∼2.5 20
Parallel gcc par/par ∼2.5 2
mine (openmp) task/seq ∼2.5 9

Parallel STL

More Related Content

Parallel STL