我正在try 解决并行计算中的一个家庭作业(job)示例.以下是给定的代码段:
for (int i=0; i < n-1; i++) {
x[i] = (y[i] + x[i+1]) / 7;
}
我们必须分析代码段的依赖关系,并try 对给定的代码段进行并行化,并使用适当的样本量对其进行基准测试.我的第一次try 只是复制x
并删除此处的反依赖:
double *x_old = malloc(sizeof(*x_old) * n); // the plain "copy" of x
memcpy(x_old, &x[1], (n - 1) * sizeof(*x)); // to prevent i + 1 in loop we start x[1] and copy n - 1 elements
#pragma omp parallel for
for (int i = 0; i < n-1; i++) {
x[i] = (y[i] + x_old[i]) / 7;
}
这段代码有效,我判断了数组的串行和并行校验和,它们都是相同的.不计算memcpy()
英寸的加速比在1.6 - 2和100_000_00 - 400_000_000 array elements之间.但如果我进一步增大数组大小(在470_000_000 elements附近),则加速比会显著降低,在500_000_000 array elements处,加速比为0.375.
我在3种不同的设备上对其进行了基准测试,结果相同(加速方面):
联想Ideapad 5
台式电脑
在我们的群集 node PC上
为什么加速比会下降,为什么下降得这么快?为什么会在特定大小的数组上发生这种情况?
我在这里提供了一个程序的最小示例:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include "omp.h"
void init(double *arr, int size);
double sum(double *arr, int size);
int main(int argc, char **argv){
if(argc != 3){
printf("Wrong use of program! (1) Size of array, (2) num threads\n");
return EXIT_FAILURE;
}
int n = atoi(argv[1]);
int threads = atoi(argv[2]);
omp_set_num_threads(threads);
double *x = malloc(sizeof(*x) * n);
double *y = malloc(sizeof(*y) * n);
init(x, n);
init(y, n);
double start = omp_get_wtime();
for (int i = 0; i < n-1; i++) {
x[i] = (y[i] + x[i+1]) / 7;
}
double end = omp_get_wtime();
double elapsed_s = end - start;
printf("single elapsed: %fs\n", elapsed_s);
double checksum_s = sum(x, n);
printf("single checksum: %f\n", checksum_s);
init(x, n); //reinit
init(y, n); //reinit
double *x_old = malloc(sizeof(*x_old) * n); // the plain "copy" of x
memcpy(x_old, &x[1], (n - 1) * sizeof(*x)); // to prevent i + 1 in loop we start x[1] and copy n - 1 elements
start = omp_get_wtime();
#pragma omp parallel for
for (int i = 0; i < n-1; i++) {
x[i] = (y[i] + x_old[i]) / 7;
}
end = omp_get_wtime();
double elapsed_p = end - start;
printf("omp threads %d Thread(s) elapsed: %fs\n", omp_get_max_threads(), elapsed_p);
double checksum_p = sum(x, n);
printf("omp threads %d Thread(s) single checksum: %f\n", omp_get_max_threads(), checksum_p);
printf("%s\n", fabs((checksum_p - checksum_s)) < __DBL_EPSILON__ ? "OK" : "FAILED");
printf("Speedup: %.3f\n", elapsed_s / elapsed_p);
}
void init(double *arr, int size){
for(int i = 0; i < size; ++i){
arr[i] = 1.0;
}
}
double sum(double *arr, int size){
double sum = 0.0;
for(int i = 0; i < size; ++i){
sum += arr[i];
}
return sum;
}
这里是一个示例基准: