我是OpenMP新手,我必须完成这个项目,在这个项目中,我需要使用雅可比迭代法求解2D矩阵,以使用OpenMP解决导热性问题.
基本上,它是一个在侧面有四个壁的板,有固定的温度,我需要计算出中间在中间.
代码已经交给了我们,我要做的是三件简单的事情:
- 对串行代码计时
- 并行串行代码并比较
- 如果可能,进一步优化并行代码
我已经运行了串行代码,并将代码并行化以进行比较.
我将try 对两者进行编译器优化,但我希望并行代码更快.
有趣的是,我添加的线程越多,速度就越慢.
我理解对于一个小的问题规模来说,这将是一个更大的开销,但我认为这是一个足够大的问题规模?
这是在任何重大优化之前,我是否做了一些明显错误的事情,使其成为这样?
代码如下:
序列号:
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <omp.h>
int main(int argc, char *argv[])
{
int m;
int n;
double tol;// = 0.0001;
int i, j, iter;
m = atoi(argv[1]);
n = atoi(argv[2]);
tol = atof(argv[3]);
/**
* @var t, tnew, diff, diffmax,
* t is the old temprature array,tnew is the new array
*/
double t[m+2][n+2], tnew[m+1][n+1], diff, difmax;
/**
* Timer variables
* @var start, end
*/
double start, end;
printf("%d %d %lf\n",m,n, tol);
start = omp_get_wtime();
// initialise temperature array
for (i=0; i <= m+1; i++) {
for (j=0; j <= n+1; j++) {
t[i][j] = 30.0;
}
}
// fix boundary conditions
for (i=1; i <= m; i++) {
t[i][0] = 33.0;
t[i][n+1] = 42.0;
}
for (j=1; j <= n; j++) {
t[0][j] = 20.0;
t[m+1][j] = 25.0;
}
// main loop
iter = 0;
difmax = 1000000.0;
while (difmax > tol) {
iter++;
// update temperature for next iteration
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
tnew[i][j] = (t[i-1][j] + t[i+1][j] + t[i][j-1] + t[i][j+1]) / 4.0;
}
}
// work out maximum difference between old and new temperatures
difmax = 0.0;
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
diff = fabs(tnew[i][j]-t[i][j]);
if (diff > difmax) {
difmax = diff;
}
// copy new to old temperatures
t[i][j] = tnew[i][j];
}
}
}
end = omp_get_wtime();
// print results,
//Loop tempratures commented out to save performance
printf("iter = %d difmax = %9.11lf\n", iter, difmax);
printf("Time in seconds: %lf \n", end - start);
// for (i=0; i <= m+1; i++) {
// printf("\n");
// for (j=0; j <= n+1; j++) {
// printf("%3.5lf ", t[i][j]);
// }
// }
// printf("\n");
}
以下是并行代码:
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <omp.h>
int main(int argc, char *argv[])
{
int m;
int n;
double tol;// = 0.0001;
/**
* @brief Integer variables
* @var i external loop (y column array) counter,
* @var j internal loop (x row array counter) counter,
* @var iter number of iterations,
* @var numthreads number of threads
*/
int i, j, iter, numThreads;
m = atoi(argv[1]);
n = atoi(argv[2]);
tol = atof(argv[3]);
numThreads = atoi(argv[4]);
/**
* @brief Double variables
* @var t, tnew -> The variable that holds the temprature, the t is the old value and the tnew is the new value,
* @var diff Measures the difference,
* @var diffmax
* t is the temprature array, I guess it holds the matrix?
*
*/
double t[m+2][n+2], tnew[m+1][n+1], diff, diffmax, privDiffmax;
/**
* Timer variables
* @var start, end
*/
double start, end;
/**
* @brief Print the problem size & the tolerance
* This print statement can be there as it is not part of the parallel region
* We also print the number of threads when printing the problem size & tolerance
*/
//printf("%d %d %lf %d\n",m,n, tol, numThreads);
omp_set_num_threads(numThreads);
/**
* @brief Initialise the timer
*
*/
start = omp_get_wtime();
/**
* @brief Creating the parallel region:
* Here both loop counters are private:
*/
#pragma omp parallel private(i, j)
{
/**
* @brief initialise temperature array
* This can be in a parallel region by itself
*/
#pragma omp for collapse(2) schedule(static)
for (i=0; i <= m+1; i++) {
for (j=0; j <= n+1; j++) {
t[i][j] = 30.0;
}
}
// fix boundary conditions
#pragma omp for schedule(static)
for (i=1; i <= m; i++) {
t[i][0] = 33.0;
t[i][n+1] = 42.0;
}
#pragma omp for schedule(static)
for (j=1; j <= n; j++) {
t[0][j] = 20.0;
t[m+1][j] = 25.0;
}
}
// main loop
iter = 0;
diffmax = 1000000.0;
while (diffmax > tol) {
iter = iter + 1;
/**
* @brief update temperature for next iteration
* Here we have created a parallel for directive, this is the second parallel region
*/
#pragma omp parallel for private(i, j) collapse(2) schedule(static)
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
tnew[i][j] = (t[i-1][j] + t[i+1][j] + t[i][j-1] + t[i][j+1]) / 4.0;
}
}
// work out maximum difference between old and new temperatures
diffmax = 0.0;
/**
* @brief Third parallel region that compares the difference
*/
#pragma omp parallel private(i, j, privDiffmax, diff)
{
privDiffmax = 0.0;
#pragma omp for collapse(2) schedule(static)
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
diff = fabs(tnew[i][j]-t[i][j]);
if (diff > privDiffmax) {
privDiffmax = diff;
}
// copy new to old temperatures
t[i][j] = tnew[i][j];
}
}
#pragma omp critical
if (privDiffmax > diffmax)
{
diffmax = privDiffmax;
}
}
}
//Add timer for the end
end = omp_get_wtime();
// print results,
//Loop tempratures commented out to save performance
printf("iter = %d diffmax = %9.11lf\n", iter, diffmax);
printf("Time in seconds: %lf \n", end - start);
// for (i=0; i <= m+1; i++) {
// printf("\n");
// for (j=0; j <= n+1; j++) {
// printf("%3.5lf ", t[i][j]);
// }
// }
// printf("\n");
}
以下是一些串行代码的基准测试:
我已经运行了代码并进行了测试,我已经注释掉了打印语句,因为除了测试之外,我不需要看到它.
我有一个8核的苹果Mac M1
我是OpenMP新手,忍不住觉得自己错过了什么.