在运行时选择线程数量:std::thread::hardware_concurrency(),这个函数返回一个提示值,表示给定程序执行时能够真正并发运行的线程数量;如果该信息不可用,则函数返回0。例如,在多核系统上它可能是CPU核心的数量。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
/// Callable that folds one sub-range into a caller-owned accumulator.
///
/// Invoking it sums the elements of [first, last) with std::accumulate,
/// using the current value of `result` as the starting value, and writes
/// the total back into `result`. The result is delivered through the
/// reference parameter; the call itself returns nothing.
template <class Iterator, class T>
struct accumulate_block
{
    void operator()(Iterator first, Iterator last, T& result)
    {
        // `result` serves as both the initial value and the destination.
        result = std::accumulate(first, last, result);
    }
};

/// Sums the range [first, last) on top of `init`, splitting the work across
/// several threads.
///
/// The range is cut into `num_threads` contiguous blocks. Each of the first
/// `num_threads - 1` blocks is summed by its own std::thread running
/// accumulate_block; the calling thread sums the final block, which also
/// absorbs any remainder when the length is not evenly divisible. The
/// per-block partial sums are then folded into `init`.
///
/// Each partial sum starts from a value-initialized T (the elements of
/// `results`), so T must be default-constructible and T() must act as the
/// identity for operator+. Because the grouping of additions differs from a
/// sequential std::accumulate, results may differ for a T whose addition is
/// not associative (e.g. floating point).
///
/// @param first  Start of the input range.
/// @param last   One past the end of the input range.
/// @param init   Initial value the partial sums are added to.
/// @return `init` plus the sum of all elements; `init` itself for an empty range.
///
/// If spawning a thread or the calling thread's own accumulation throws, the
/// already-started workers are joined before the exception is rethrown.
/// An exception escaping inside a worker thread is NOT handled here.
template<typename Iterator, typename T>
T parallel_accumulate(Iterator first, Iterator last, T init)
{
    unsigned long const length = std::distance(first, last);

    if (!length) // Empty input range: nothing to compute.
        return init;

    unsigned long const min_per_thread = 25;
    // Ceiling division: upper bound on the thread count so that tiny inputs
    // (e.g. 5 elements) do not get split across multiple threads.
    unsigned long const max_threads = (length + min_per_thread - 1) / min_per_thread;
    // Hint for the number of truly concurrent threads; 0 if unavailable.
    unsigned long const hardware_threads = std::thread::hardware_concurrency();
    // Fall back to 2 when the hardware hint is unavailable.
    unsigned long const num_threads = std::min(hardware_threads != 0 ? hardware_threads : 2, max_threads);
    unsigned long const block_size = length / num_threads; // Elements per worker block.

    std::vector<T> results(num_threads);
    std::vector<std::thread> threads(num_threads - 1); // -1: the calling thread handles one block itself.

    try
    {
        Iterator block_start = first;
        for (unsigned long i = 0; i < num_threads - 1; ++i)
        {
            Iterator block_end = block_start;
            std::advance(block_end, block_size); // Move to the end of the current block.
            // std::ref is required: std::thread copies its arguments, and the
            // worker must write into results[i] itself, not into a copy.
            threads[i] = std::thread(accumulate_block<Iterator, T>(), block_start, block_end, std::ref(results[i]));
            block_start = block_end;
        }
        // The calling thread processes the final block, including any
        // leftover elements when length is not a multiple of num_threads.
        accumulate_block<Iterator, T>()(block_start, last, results[num_threads - 1]);
    }
    catch (...)
    {
        // Destroying a joinable std::thread calls std::terminate, so join
        // every worker that was actually started before propagating.
        for (std::thread& worker : threads)
        {
            if (worker.joinable())
                worker.join();
        }
        throw;
    }

    // Wait for all workers before reading their partial sums.
    for (std::thread& worker : threads)
        worker.join();

    return std::accumulate(results.begin(), results.end(), init); // Combine partial sums.
}