32#include "fasp_functs.h"
64 INT count = 0, added, countrow;
69 INT mybegin, myend, myid, nthreads;
72 nthreads = fasp_get_num_threads();
77 printf(
"### ERROR: Matrix sizes do not match!\n");
82 if (A == NULL && B == NULL) {
90 if (A->
nnz == 0 && B->
nnz == 0) {
99 if (A->
nnz == 0 || A == NULL) {
101 memcpy(C->
IA, B->
IA, (B->
row + 1) *
sizeof(
INT));
106#pragma omp parallel private(myid, mybegin, myend, i)
108 myid = omp_get_thread_num();
110 for (i = mybegin; i < myend; ++i) C->
val[i] = B->
val[i] * beta;
114 for (i = 0; i < A->
nnz; ++i) C->
val[i] = B->
val[i] * beta;
122 if (B->
nnz == 0 || B == NULL) {
124 memcpy(C->
IA, A->
IA, (A->
row + 1) *
sizeof(
INT));
129 INT mybegin, myend, myid;
130#pragma omp parallel private(myid, mybegin, myend, i)
132 myid = omp_get_thread_num();
134 for (i = mybegin; i < myend; ++i) C->
val[i] = A->
val[i] * alpha;
138 for (i = 0; i < A->
nnz; ++i) C->
val[i] = A->
val[i] * alpha;
156 memset(C->
IA, 0,
sizeof(
INT) * (C->
row + 1));
157 memset(C->
JA, -1,
sizeof(
INT) * (A->
nnz + B->
nnz));
159 for (i = 0; i < A->
row; ++i) {
161 for (j = A->
IA[i]; j < A->IA[i + 1]; ++j) {
162 C->
val[count] = alpha * A->
val[j];
163 C->
JA[count] = A->
JA[j];
169 for (k = B->
IA[i]; k < B->IA[i + 1]; ++k) {
172 for (l = C->
IA[i]; l < C->IA[i] + countrow + 1; l++) {
173 if (B->
JA[k] == C->
JA[l]) {
174 C->
val[l] = C->
val[l] + beta * B->
val[k];
181 C->
val[count] = beta * B->
val[k];
182 C->
JA[count] = B->
JA[k];
189 C->
IA[i + 1] += C->
IA[i];
244 const INT * ia = A->
IA, *ja = A->
JA;
247 INT i, k, begin_row, end_row, nnz_row;
255 nthreads = fasp_get_num_threads();
260 INT myid, mybegin, myend;
263#pragma omp parallel for private(myid, mybegin, myend, i, temp, begin_row, end_row, \
266 for (myid = 0; myid < nthreads; myid++) {
268 for (i = mybegin; i < myend; ++i) {
272 nnz_row = end_row - begin_row;
276 temp += aj[k] * x[ja[k]];
278 temp += aj[k] * x[ja[k]];
280 temp += aj[k] * x[ja[k]];
284 temp += aj[k] * x[ja[k]];
286 temp += aj[k] * x[ja[k]];
288 temp += aj[k] * x[ja[k]];
290 temp += aj[k] * x[ja[k]];
294 temp += aj[k] * x[ja[k]];
296 temp += aj[k] * x[ja[k]];
298 temp += aj[k] * x[ja[k]];
300 temp += aj[k] * x[ja[k]];
302 temp += aj[k] * x[ja[k]];
306 temp += aj[k] * x[ja[k]];
308 temp += aj[k] * x[ja[k]];
310 temp += aj[k] * x[ja[k]];
312 temp += aj[k] * x[ja[k]];
314 temp += aj[k] * x[ja[k]];
316 temp += aj[k] * x[ja[k]];
320 temp += aj[k] * x[ja[k]];
322 temp += aj[k] * x[ja[k]];
324 temp += aj[k] * x[ja[k]];
326 temp += aj[k] * x[ja[k]];
328 temp += aj[k] * x[ja[k]];
330 temp += aj[k] * x[ja[k]];
332 temp += aj[k] * x[ja[k]];
335 for (k = begin_row; k < end_row; ++k) {
336 temp += aj[k] * x[ja[k]];
346 for (i = 0; i < m; ++i) {
350 nnz_row = end_row - begin_row;
354 temp += aj[k] * x[ja[k]];
356 temp += aj[k] * x[ja[k]];
358 temp += aj[k] * x[ja[k]];
362 temp += aj[k] * x[ja[k]];
364 temp += aj[k] * x[ja[k]];
366 temp += aj[k] * x[ja[k]];
368 temp += aj[k] * x[ja[k]];
372 temp += aj[k] * x[ja[k]];
374 temp += aj[k] * x[ja[k]];
376 temp += aj[k] * x[ja[k]];
378 temp += aj[k] * x[ja[k]];
380 temp += aj[k] * x[ja[k]];
384 temp += aj[k] * x[ja[k]];
386 temp += aj[k] * x[ja[k]];
388 temp += aj[k] * x[ja[k]];
390 temp += aj[k] * x[ja[k]];
392 temp += aj[k] * x[ja[k]];
394 temp += aj[k] * x[ja[k]];
398 temp += aj[k] * x[ja[k]];
400 temp += aj[k] * x[ja[k]];
402 temp += aj[k] * x[ja[k]];
404 temp += aj[k] * x[ja[k]];
406 temp += aj[k] * x[ja[k]];
408 temp += aj[k] * x[ja[k]];
410 temp += aj[k] * x[ja[k]];
413 for (k = begin_row; k < end_row; ++k) {
414 temp += aj[k] * x[ja[k]];
440 const INT * ia = A->
IA, *ja = A->
JA;
441 INT i, k, begin_row, end_row;
446 INT myid, mybegin, myend;
447 INT nthreads = fasp_get_num_threads();
452#pragma omp parallel for private(myid, i, mybegin, myend, temp, begin_row, end_row, k)
453 for (myid = 0; myid < nthreads; myid++) {
455 for (i = mybegin; i < myend; i++) {
459 for (k = begin_row; k < end_row; ++k) temp += x[ja[k]];
465 for (i = 0; i < m; ++i) {
469 for (k = begin_row; k < end_row; ++k) temp += x[ja[k]];
496 const INT * ia = A->
IA, *ja = A->
JA;
498 INT i, k, begin_row, end_row;
505 nthreads = fasp_get_num_threads();
510 INT myid, mybegin, myend;
512#pragma omp parallel for private(myid, mybegin, myend, i, temp, begin_row, end_row, k)
514 for (myid = 0; myid < nthreads; myid++) {
516 for (i = mybegin; i < myend; ++i) {
520 for (k = begin_row; k < end_row; ++k) temp += aj[k] * x[ja[k]];
525 for (i = 0; i < m; ++i) {
529 for (k = begin_row; k < end_row; ++k) temp += aj[k] * x[ja[k]];
535 else if (alpha == -1.0) {
537 INT myid, mybegin, myend;
540#pragma omp parallel for private(myid, mybegin, myend, temp, i, begin_row, end_row, k)
543 for (myid = 0; myid < nthreads; myid++) {
545 for (i = mybegin; i < myend; ++i) {
549 for (k = begin_row; k < end_row; ++k) temp += aj[k] * x[ja[k]];
554 for (i = 0; i < m; ++i) {
558 for (k = begin_row; k < end_row; ++k) temp += aj[k] * x[ja[k]];
566 INT myid, mybegin, myend;
568#pragma omp parallel for private(myid, mybegin, myend, i, temp, begin_row, end_row, k)
570 for (myid = 0; myid < nthreads; myid++) {
572 for (i = mybegin; i < myend; ++i) {
576 for (k = begin_row; k < end_row; ++k) temp += aj[k] * x[ja[k]];
577 y[i] += temp * alpha;
581 for (i = 0; i < m; ++i) {
585 for (k = begin_row; k < end_row; ++k) temp += aj[k] * x[ja[k]];
586 y[i] += temp * alpha;
612 const INT * ia = A->
IA, *ja = A->
JA;
614 INT i, k, begin_row, end_row;
622 nthreads = fasp_get_num_threads();
628 INT myid, mybegin, myend;
630#pragma omp parallel for private(myid, mybegin, myend, i, temp, begin_row, end_row, k)
632 for (myid = 0; myid < nthreads; myid++) {
634 for (i = mybegin; i < myend; ++i) {
638 for (k = begin_row; k < end_row; ++k) temp += aj[k] * x[ja[k]];
643 for (i = 0; i < m; ++i) {
647 for (k = begin_row; k < end_row; ++k) temp += aj[k] * x[ja[k]];
653 else if (alpha == -1.0) {
655 INT myid, mybegin, myend;
657#pragma omp parallel for private(myid, mybegin, myend, temp, i, begin_row, end_row, k)
659 for (myid = 0; myid < nthreads; myid++) {
661 for (i = mybegin; i < myend; ++i) {
665 for (k = begin_row; k < end_row; ++k) temp += aj[k] * x[ja[k]];
670 for (i = 0; i < m; ++i) {
674 for (k = begin_row; k < end_row; ++k) temp += aj[k] * x[ja[k]];
682 INT myid, mybegin, myend;
684#pragma omp parallel for private(myid, mybegin, myend, i, temp, begin_row, end_row, k)
686 for (myid = 0; myid < nthreads; myid++) {
688 for (i = mybegin; i < myend; ++i) {
692 for (k = begin_row; k < end_row; ++k) temp += aj[k] * x[ja[k]];
693 y[i] += temp * alpha;
697 for (i = 0; i < m; ++i) {
701 for (k = begin_row; k < end_row; ++k) temp += aj[k] * x[ja[k]];
702 y[i] += temp * alpha;
728 const INT *ia = A->
IA, *ja = A->
JA;
730 INT i, k, begin_row, end_row;
736 INT myid, mybegin, myend;
737 INT nthreads = fasp_get_num_threads();
738#pragma omp parallel for private(myid, i, mybegin, myend, begin_row, end_row, temp, k)
739 for (myid = 0; myid < nthreads; myid++) {
741 for (i = mybegin; i < myend; ++i) {
745 for (k = begin_row; k < end_row; ++k) temp += x[ja[k]];
751 for (i = 0; i < m; ++i) {
755 for (k = begin_row; k < end_row; ++k) temp += x[ja[k]];
761 }
else if (alpha == -1.0) {
764 INT myid, mybegin, myend;
765 INT nthreads = fasp_get_num_threads();
766#pragma omp parallel for private(myid, i, mybegin, myend, begin_row, end_row, temp, k)
767 for (myid = 0; myid < nthreads; myid++) {
769 for (i = mybegin; i < myend; ++i) {
773 for (k = begin_row; k < end_row; ++k) temp += x[ja[k]];
779 for (i = 0; i < m; ++i) {
783 for (k = begin_row; k < end_row; ++k) temp += x[ja[k]];
794 INT myid, mybegin, myend;
795 INT nthreads = fasp_get_num_threads();
796#pragma omp parallel for private(myid, i, mybegin, myend, begin_row, end_row, temp, k)
797 for (myid = 0; myid < nthreads; myid++) {
799 for (i = mybegin; i < myend; ++i) {
803 for (k = begin_row; k < end_row; ++k) temp += x[ja[k]];
804 y[i] += temp * alpha;
809 for (i = 0; i < m; ++i) {
813 for (k = begin_row; k < end_row; ++k) temp += x[ja[k]];
814 y[i] += temp * alpha;
836 register REAL value = 0.0;
838 const INT * ia = A->
IA, *ja = A->
JA;
840 INT i, k, begin_row, end_row;
853#pragma omp parallel for reduction(+ : value) private(i, temp, begin_row, end_row, k)
855 for (i = 0; i < m; ++i) {
859 for (k = begin_row; k < end_row; ++k) temp += aj[k] * x[ja[k]];
860 value += y[i] * temp;
863 for (i = 0; i < m; ++i) {
867 for (k = begin_row; k < end_row; ++k) temp += aj[k] * x[ja[k]];
868 value += y[i] * temp;
890 INT i, j, k, l, count;
900 for (i = 0; i < B->
col; ++i) JD[i] = -1;
903 for (i = 0; i < C->
row; ++i) {
906 for (k = A->
IA[i]; k < A->IA[i + 1]; ++k) {
907 for (j = B->
IA[A->
JA[k]]; j < B->IA[A->
JA[k] + 1]; ++j) {
908 for (l = 0; l < count; l++) {
909 if (JD[l] == B->
JA[j])
break;
913 JD[count] = B->
JA[j];
918 C->
IA[i + 1] = count;
919 for (j = 0; j < count; ++j) {
924 for (i = 0; i < C->
row; ++i) C->
IA[i + 1] += C->
IA[i];
931 for (i = 0; i < C->
row; ++i) {
934 for (k = A->
IA[i]; k < A->IA[i + 1]; ++k) {
935 for (j = B->
IA[A->
JA[k]]; j < B->IA[A->
JA[k] + 1]; ++j) {
936 for (l = 0; l < countJD; l++) {
937 if (JD[l] == B->
JA[j])
break;
941 C->
JA[count] = B->
JA[j];
942 JD[countJD] = B->
JA[j];
959 for (i = 0; i < C->
row; ++i) {
960 for (j = C->
IA[i]; j < C->IA[i + 1]; ++j) {
962 for (k = A->
IA[i]; k < A->IA[i + 1]; ++k) {
963 for (l = B->
IA[A->
JA[k]]; l < B->IA[A->
JA[k] + 1]; l++) {
964 if (B->
JA[l] == C->
JA[j]) {
997 const INT n_coarse = R->
row;
998 const INT* R_i = R->
IA;
999 const INT* R_j = R->
JA;
1002 const INT n_fine = A->
row;
1003 const INT* A_i = A->
IA;
1004 const INT* A_j = A->
JA;
1007 const INT* P_i = P->
IA;
1008 const INT* P_j = P->
JA;
1014 REAL* RAP_data = NULL;
1017 INT* P_marker = NULL;
1018 INT* A_marker = NULL;
1021 INT* Ps_marker = NULL;
1022 INT* As_marker = NULL;
1024 INT ic, i1, i2, i3, jj1, jj2, jj3;
1025 INT jj_counter, jj_row_begining;
1026 REAL r_entry, r_a_product, r_a_p_product;
1031 INT myid, mybegin, myend, Ctemp;
1032 nthreads = fasp_get_num_threads();
1035 INT coarse_mul_nthreads = n_coarse * nthreads;
1036 INT fine_mul_nthreads = n_fine * nthreads;
1037 INT coarse_add_nthreads = n_coarse + nthreads;
1038 INT minus_one_length = coarse_mul_nthreads + fine_mul_nthreads;
1039 INT total_calloc = minus_one_length + coarse_add_nthreads + nthreads;
1042 As_marker = Ps_marker + coarse_mul_nthreads;
1052 INT* RAP_temp = As_marker + fine_mul_nthreads;
1053 INT* part_end = RAP_temp + coarse_add_nthreads;
1056#pragma omp parallel for private(myid, mybegin, myend, Ctemp, P_marker, A_marker, \
1057 jj_counter, ic, jj_row_begining, jj1, i1, jj2, i2, \
1059 for (myid = 0; myid < nthreads; myid++) {
1061 P_marker = Ps_marker + myid * n_coarse;
1062 A_marker = As_marker + myid * n_fine;
1064 for (ic = mybegin; ic < myend; ic++) {
1065 P_marker[ic] = jj_counter;
1066 jj_row_begining = jj_counter;
1069 for (jj1 = R_i[ic]; jj1 < R_i[ic + 1]; jj1++) {
1071 for (jj2 = A_i[i1]; jj2 < A_i[i1 + 1]; jj2++) {
1073 if (A_marker[i2] != ic) {
1075 for (jj3 = P_i[i2]; jj3 < P_i[i2 + 1]; jj3++) {
1077 if (P_marker[i3] < jj_row_begining) {
1078 P_marker[i3] = jj_counter;
1086 RAP_temp[ic + myid] = jj_row_begining;
1088 RAP_temp[myend + myid] = jj_counter;
1090 part_end[myid] = myend + myid + 1;
1093 jj_counter = part_end[0];
1095 for (i1 = 1; i1 < nthreads; i1++) {
1096 Ctemp += RAP_temp[part_end[i1 - 1] - 1];
1097 for (jj1 = part_end[i1 - 1] + 1; jj1 < part_end[i1]; jj1++) {
1098 RAP_i[jj_counter] = RAP_temp[jj1] + Ctemp;
1102 RAP_size = RAP_i[n_coarse];
1108 for (ic = 0; ic < n_coarse; ic++) {
1109 Ps_marker[ic] = jj_counter;
1110 jj_row_begining = jj_counter;
1113 for (jj1 = R_i[ic]; jj1 < R_i[ic + 1]; jj1++) {
1116 for (jj2 = A_i[i1]; jj2 < A_i[i1 + 1]; jj2++) {
1118 if (As_marker[i2] != ic) {
1120 for (jj3 = P_i[i2]; jj3 < P_i[i2 + 1]; jj3++) {
1122 if (Ps_marker[i3] < jj_row_begining) {
1123 Ps_marker[i3] = jj_counter;
1131 RAP_i[ic] = jj_row_begining;
1134 RAP_i[n_coarse] = jj_counter;
1135 RAP_size = jj_counter;
1147#pragma omp parallel for private(myid, mybegin, myend, P_marker, A_marker, jj_counter, \
1148 ic, jj_row_begining, jj1, r_entry, i1, jj2, \
1149 r_a_product, i2, jj3, r_a_p_product, i3)
1150 for (myid = 0; myid < nthreads; myid++) {
1152 P_marker = Ps_marker + myid * n_coarse;
1153 A_marker = As_marker + myid * n_fine;
1154 jj_counter = RAP_i[mybegin];
1155 for (ic = mybegin; ic < myend; ic++) {
1156 P_marker[ic] = jj_counter;
1157 jj_row_begining = jj_counter;
1158 RAP_j[jj_counter] = ic;
1159 RAP_data[jj_counter] = 0.0;
1161 for (jj1 = R_i[ic]; jj1 < R_i[ic + 1]; jj1++) {
1162 r_entry = R_data[jj1];
1165 for (jj2 = A_i[i1]; jj2 < A_i[i1 + 1]; jj2++) {
1166 r_a_product = r_entry * A_data[jj2];
1169 if (A_marker[i2] != ic) {
1171 for (jj3 = P_i[i2]; jj3 < P_i[i2 + 1]; jj3++) {
1172 r_a_p_product = r_a_product * P_data[jj3];
1175 if (P_marker[i3] < jj_row_begining) {
1176 P_marker[i3] = jj_counter;
1177 RAP_data[jj_counter] = r_a_p_product;
1178 RAP_j[jj_counter] = i3;
1181 RAP_data[P_marker[i3]] += r_a_p_product;
1185 for (jj3 = P_i[i2]; jj3 < P_i[i2 + 1]; jj3++) {
1187 r_a_p_product = r_a_product * P_data[jj3];
1188 RAP_data[P_marker[i3]] += r_a_p_product;
1198 for (ic = 0; ic < n_coarse; ic++) {
1199 Ps_marker[ic] = jj_counter;
1200 jj_row_begining = jj_counter;
1201 RAP_j[jj_counter] = ic;
1202 RAP_data[jj_counter] = 0.0;
1205 for (jj1 = R_i[ic]; jj1 < R_i[ic + 1]; jj1++) {
1206 r_entry = R_data[jj1];
1209 for (jj2 = A_i[i1]; jj2 < A_i[i1 + 1]; jj2++) {
1210 r_a_product = r_entry * A_data[jj2];
1213 if (As_marker[i2] != ic) {
1215 for (jj3 = P_i[i2]; jj3 < P_i[i2 + 1]; jj3++) {
1216 r_a_p_product = r_a_product * P_data[jj3];
1219 if (Ps_marker[i3] < jj_row_begining) {
1220 Ps_marker[i3] = jj_counter;
1221 RAP_data[jj_counter] = r_a_p_product;
1222 RAP_j[jj_counter] = i3;
1225 RAP_data[Ps_marker[i3]] += r_a_p_product;
1229 for (jj3 = P_i[i2]; jj3 < P_i[i2 + 1]; jj3++) {
1231 r_a_p_product = r_a_product * P_data[jj3];
1232 RAP_data[Ps_marker[i3]] += r_a_p_product;
1242 RAP->
row = n_coarse;
1243 RAP->
col = n_coarse;
1244 RAP->
nnz = RAP_size;
1247 RAP->
val = RAP_data;
1272 const INT n_coarse = R->
row;
1273 const INT* R_i = R->
IA;
1274 const INT* R_j = R->
JA;
1276 const INT n_fine = A->
row;
1277 const INT* A_i = A->
IA;
1278 const INT* A_j = A->
JA;
1281 const INT* P_i = P->
IA;
1282 const INT* P_j = P->
JA;
1287 REAL* RAP_data = NULL;
1290 INT* P_marker = NULL;
1291 INT* A_marker = NULL;
1294 INT* Ps_marker = NULL;
1295 INT* As_marker = NULL;
1297 INT ic, i1, i2, i3, jj1, jj2, jj3;
1298 INT jj_counter, jj_row_begining;
1303 INT myid, mybegin, myend, Ctemp;
1304 nthreads = fasp_get_num_threads();
1307 INT coarse_mul_nthreads = n_coarse * nthreads;
1308 INT fine_mul_nthreads = n_fine * nthreads;
1309 INT coarse_add_nthreads = n_coarse + nthreads;
1310 INT minus_one_length = coarse_mul_nthreads + fine_mul_nthreads;
1311 INT total_calloc = minus_one_length + coarse_add_nthreads + nthreads;
1314 As_marker = Ps_marker + coarse_mul_nthreads;
1324 INT* RAP_temp = As_marker + fine_mul_nthreads;
1325 INT* part_end = RAP_temp + coarse_add_nthreads;
1328#pragma omp parallel for private(myid, mybegin, myend, Ctemp, P_marker, A_marker, \
1329 jj_counter, ic, jj_row_begining, jj1, i1, jj2, i2, \
1331 for (myid = 0; myid < nthreads; myid++) {
1333 P_marker = Ps_marker + myid * n_coarse;
1334 A_marker = As_marker + myid * n_fine;
1336 for (ic = mybegin; ic < myend; ic++) {
1337 P_marker[ic] = jj_counter;
1338 jj_row_begining = jj_counter;
1341 for (jj1 = R_i[ic]; jj1 < R_i[ic + 1]; jj1++) {
1343 for (jj2 = A_i[i1]; jj2 < A_i[i1 + 1]; jj2++) {
1345 if (A_marker[i2] != ic) {
1347 for (jj3 = P_i[i2]; jj3 < P_i[i2 + 1]; jj3++) {
1349 if (P_marker[i3] < jj_row_begining) {
1350 P_marker[i3] = jj_counter;
1358 RAP_temp[ic + myid] = jj_row_begining;
1360 RAP_temp[myend + myid] = jj_counter;
1362 part_end[myid] = myend + myid + 1;
1365 jj_counter = part_end[0];
1367 for (i1 = 1; i1 < nthreads; i1++) {
1368 Ctemp += RAP_temp[part_end[i1 - 1] - 1];
1369 for (jj1 = part_end[i1 - 1] + 1; jj1 < part_end[i1]; jj1++) {
1370 RAP_i[jj_counter] = RAP_temp[jj1] + Ctemp;
1374 RAP_size = RAP_i[n_coarse];
1380 for (ic = 0; ic < n_coarse; ic++) {
1381 Ps_marker[ic] = jj_counter;
1382 jj_row_begining = jj_counter;
1385 for (jj1 = R_i[ic]; jj1 < R_i[ic + 1]; jj1++) {
1388 for (jj2 = A_i[i1]; jj2 < A_i[i1 + 1]; jj2++) {
1390 if (As_marker[i2] != ic) {
1392 for (jj3 = P_i[i2]; jj3 < P_i[i2 + 1]; jj3++) {
1394 if (Ps_marker[i3] < jj_row_begining) {
1395 Ps_marker[i3] = jj_counter;
1403 RAP_i[ic] = jj_row_begining;
1406 RAP_i[n_coarse] = jj_counter;
1407 RAP_size = jj_counter;
1419#pragma omp parallel for private(myid, mybegin, myend, P_marker, A_marker, jj_counter, \
1420 ic, jj_row_begining, jj1, i1, jj2, i2, jj3, i3)
1421 for (myid = 0; myid < nthreads; myid++) {
1423 P_marker = Ps_marker + myid * n_coarse;
1424 A_marker = As_marker + myid * n_fine;
1425 jj_counter = RAP_i[mybegin];
1426 for (ic = mybegin; ic < myend; ic++) {
1427 P_marker[ic] = jj_counter;
1428 jj_row_begining = jj_counter;
1429 RAP_j[jj_counter] = ic;
1430 RAP_data[jj_counter] = 0.0;
1432 for (jj1 = R_i[ic]; jj1 < R_i[ic + 1]; jj1++) {
1435 for (jj2 = A_i[i1]; jj2 < A_i[i1 + 1]; jj2++) {
1438 if (A_marker[i2] != ic) {
1440 for (jj3 = P_i[i2]; jj3 < P_i[i2 + 1]; jj3++) {
1443 if (P_marker[i3] < jj_row_begining) {
1444 P_marker[i3] = jj_counter;
1445 RAP_data[jj_counter] = A_data[jj2];
1446 RAP_j[jj_counter] = i3;
1449 RAP_data[P_marker[i3]] += A_data[jj2];
1453 for (jj3 = P_i[i2]; jj3 < P_i[i2 + 1]; jj3++) {
1455 RAP_data[P_marker[i3]] += A_data[jj2];
1465 for (ic = 0; ic < n_coarse; ic++) {
1466 Ps_marker[ic] = jj_counter;
1467 jj_row_begining = jj_counter;
1468 RAP_j[jj_counter] = ic;
1469 RAP_data[jj_counter] = 0.0;
1472 for (jj1 = R_i[ic]; jj1 < R_i[ic + 1]; jj1++) {
1474 for (jj2 = A_i[i1]; jj2 < A_i[i1 + 1]; jj2++) {
1476 if (As_marker[i2] != ic) {
1478 for (jj3 = P_i[i2]; jj3 < P_i[i2 + 1]; jj3++) {
1480 if (Ps_marker[i3] < jj_row_begining) {
1481 Ps_marker[i3] = jj_counter;
1482 RAP_data[jj_counter] = A_data[jj2];
1483 RAP_j[jj_counter] = i3;
1486 RAP_data[Ps_marker[i3]] += A_data[jj2];
1490 for (jj3 = P_i[i2]; jj3 < P_i[i2 + 1]; jj3++) {
1492 RAP_data[Ps_marker[i3]] += A_data[jj2];
1502 RAP->
row = n_coarse;
1503 RAP->
col = n_coarse;
1504 RAP->
nnz = RAP_size;
1507 RAP->
val = RAP_data;
1534 const INT * ir = R->
IA, *ia = A->
IA, *ip = P->
IA;
1535 const INT * jr = R->
JA, *ja = A->
JA, *jp = P->
JA;
1545 INT i, i1, j, jj, k, length;
1546 INT begin_row, end_row, begin_rowA, end_rowA, begin_rowR, end_rowR;
1547 INT istart, iistart, count;
1564 for (i = 0; i < row; ++i) {
1573 for (jj = begin_rowR; jj < end_rowR; ++jj) {
1577 end_rowA = ia[j + 1];
1578 for (k = begin_rowA; k < end_rowA; ++k) {
1579 if (index[ja[k]] == -2) {
1580 index[ja[k]] = istart;
1593 for (j = 0; j < count; ++j) {
1595 istart = index[istart];
1600 end_row = ip[jj + 1];
1601 for (k = begin_row; k < end_row; ++k) {
1603 if (iindex[jp[k]] == -2) {
1604 iindex[jp[k]] = iistart;
1612 iac[i1] = iac[i] + length;
1622 for (j = begin_row; j < end_row; ++j) {
1626 iistart = iindex[iistart];
1628 iindex[jac[j]] = -2;
1640 for (i = 0; i < row; ++i) {
1646 for (j = begin_row; j < end_row; ++j) {
1647 BTindex[jac[j]] = j;
1657 for (jj = begin_rowR; jj < end_rowR; ++jj) {
1662 end_rowA = ia[j + 1];
1663 for (k = begin_rowA; k < end_rowA; ++k) {
1664 if (index[ja[k]] == -2) {
1665 index[ja[k]] = istart;
1669 temp[ja[k]] += aj[k];
1675 for (j = 0; j < length; ++j) {
1677 istart = index[istart];
1682 end_row = ip[jj + 1];
1683 for (k = begin_row; k < end_row; ++k) {
1685 acj[BTindex[jp[k]]] += temp[jj];
1742#pragma omp parallel for if (n > OPENMP_HOLDS)
1744 for (i = 0; i <= n; ++i) {
1750#pragma omp parallel for if (nnzA > OPENMP_HOLDS)
1752 for (i = 0; i < nnzA; ++i) {
1757#pragma omp parallel for if (nc > OPENMP_HOLDS)
1759 for (i = 0; i <= nc; ++i) {
1764#pragma omp parallel for if (nnzP > OPENMP_HOLDS)
1766 for (i = 0; i < nnzP; ++i) {
1774 Pt->
JA, Pt->
val, n, nc, &maxrpout, P->
IA, P->
JA);
1785#pragma omp parallel for if (Ac->row > OPENMP_HOLDS)
1787 for (i = 0; i <= Ac->
row; ++i) Ac->
IA[i]--;
1792 for (i = 0; i < Ac->
nnz; ++i) Ac->
JA[i]--;
1797 for (i = 0; i <= n; ++i) A->
IA[i]--;
1802 for (i = 0; i < nnzA; ++i) A->
JA[i]--;
1807 for (i = 0; i <= n; ++i) P->
IA[i]--;
1812 for (i = 0; i < nnzP; ++i) P->
JA[i]--;
1817 for (i = 0; i <= nc; ++i) Pt->
IA[i]--;
1822 for (i = 0; i < nnzP; ++i) Pt->
JA[i]--;
1847 INT n1, nc1, nnzp, maxrp;
1848 INT * ip = NULL, *jp = NULL;
1860 ip = (
INT*)calloc(n1,
sizeof(
INT));
1861 jp = (
INT*)calloc(nnzp,
sizeof(
INT));
1880 ac.
IA = (
INT*)calloc(nc1,
sizeof(
INT));
1887 fasp_sparse_rapms_(ir, jr, ia, ja, ip, jp, &n, &nc, ac.
IA, ac.
JA, &maxrp);
1888 ac.
nnz = ac.
IA[nc] - 1;
1894 fasp_sparse_rapms_(ir, jr, ia, ja, ip, jp, &n, &nc, ac.
IA, ac.
JA, &maxrp);
1901 fasp_sparse_rapcmp_(ir, jr, r, ia, ja, a, ipt, jpt, pt, &n, &nc, ac.
IA, ac.
JA,
1937 nthreads = fasp_get_num_threads();
1943 INT * ir = R->
IA, *ia = A->
IA, *ip = P->
IA;
1944 INT * jr = R->
JA, *ja = A->
JA, *jp = P->
JA;
1946 INT istart, iistart;
1947 INT end_row, end_rowA, end_rowR;
1948 INT i, j, jj, k, length, myid, mybegin, myend;
1949 INT jj_counter, ic, jj_row_begining, jj1, i1, jj2, i2, jj3, i3;
1952 INT* BTindex = NULL;
1954 INT FiveMyid, min_A, min_P, A_pos, P_pos, FiveIc;
1955 INT minus_one_length_A = icor_ysk[5 * nthreads];
1956 INT minus_one_length_P = icor_ysk[5 * nthreads + 1];
1957 INT minus_one_length = minus_one_length_A + minus_one_length_P;
1965 INT* indexs = iindexs + minus_one_length_P;
1966 INT* BTindexs = indexs + minus_one_length_A;
1980 INT* iac_temp = part_end + nthreads;
1987#pragma omp parallel for private(myid, FiveMyid, mybegin, myend, min_A, min_P, index, \
1988 iindex, A_pos, P_pos, ic, FiveIc, jj_counter, \
1989 jj_row_begining, end_rowR, jj1, i1, end_rowA, jj2, \
1990 i2, end_row, jj3, i3)
1992 for (myid = 0; myid < nthreads; myid++) {
1993 FiveMyid = myid * 5;
1994 mybegin = icor_ysk[FiveMyid];
1995 if (myid == nthreads - 1) {
1998 myend = icor_ysk[FiveMyid + 5];
2000 min_A = icor_ysk[FiveMyid + 2];
2001 min_P = icor_ysk[FiveMyid + 4];
2004 for (ic = myid - 1; ic >= 0; ic--) {
2006 A_pos += icor_ysk[FiveIc + 1];
2007 P_pos += icor_ysk[FiveIc + 3];
2009 iindex_array[myid] = iindex = iindexs + P_pos - min_P;
2010 index_array[myid] = index = indexs + A_pos - min_A;
2012 for (ic = mybegin; ic < myend; ic++) {
2013 iindex[ic] = jj_counter;
2014 jj_row_begining = jj_counter;
2016 end_rowR = ir[ic + 1];
2017 for (jj1 = ir[ic]; jj1 < end_rowR; jj1++) {
2019 end_rowA = ia[i1 + 1];
2020 for (jj2 = ia[i1]; jj2 < end_rowA; jj2++) {
2022 if (index[i2] != ic) {
2024 end_row = ip[i2 + 1];
2025 for (jj3 = ip[i2]; jj3 < end_row; jj3++) {
2027 if (iindex[i3] < jj_row_begining) {
2028 iindex[i3] = jj_counter;
2035 iac_temp[ic + myid] = jj_row_begining;
2037 iac_temp[myend + myid] = jj_counter;
2038 part_end[myid] = myend + myid + 1;
2041 jj_counter = part_end[0];
2043 for (i1 = 1; i1 < nthreads; i1++) {
2044 Ctemp += iac_temp[part_end[i1 - 1] - 1];
2045 for (jj1 = part_end[i1 - 1] + 1; jj1 < part_end[i1]; jj1++) {
2046 iac[jj_counter] = iac_temp[jj1] + Ctemp;
2056#pragma omp parallel for private(myid, index, iindex, FiveMyid, mybegin, myend, i, \
2057 istart, length, i1, end_rowR, jj, j, end_rowA, k, \
2060 for (myid = 0; myid < nthreads; myid++) {
2061 iindex = iindex_array[myid];
2062 index = index_array[myid];
2063 FiveMyid = myid * 5;
2064 mybegin = icor_ysk[FiveMyid];
2065 if (myid == nthreads - 1) {
2068 myend = icor_ysk[FiveMyid + 5];
2070 for (i = mybegin; i < myend; ++i) {
2076 for (jj = ir[i]; jj < end_rowR; ++jj) {
2079 end_rowA = ia[j + 1];
2080 for (k = ia[j]; k < end_rowA; ++k) {
2081 if (index[ja[k]] == -2) {
2082 index[ja[k]] = istart;
2094 for (j = 0; j < length; ++j) {
2096 istart = index[istart];
2099 end_row = ip[jj + 1];
2100 for (k = ip[jj]; k < end_row; ++k) {
2102 if (iindex[jp[k]] == -2) {
2103 iindex[jp[k]] = iistart;
2111 for (j = iac[i]; j < end_row; ++j) {
2115 iistart = iindex[iistart];
2117 iindex[jac[j]] = -2;
2132#pragma omp parallel for private( \
2133 myid, index, FiveMyid, mybegin, myend, min_A, min_P, A_pos, P_pos, ic, FiveIc, \
2134 BTindex, temp, i, i1, end_row, j, istart, length, end_rowR, jj, end_rowA, k)
2136 for (myid = 0; myid < nthreads; myid++) {
2137 index = index_array[myid];
2138 FiveMyid = myid * 5;
2139 mybegin = icor_ysk[FiveMyid];
2140 if (myid == nthreads - 1) {
2143 myend = icor_ysk[FiveMyid + 5];
2145 min_A = icor_ysk[FiveMyid + 2];
2146 min_P = icor_ysk[FiveMyid + 4];
2149 for (ic = myid - 1; ic >= 0; ic--) {
2151 A_pos += icor_ysk[FiveIc + 1];
2152 P_pos += icor_ysk[FiveIc + 3];
2154 BTindex = BTindexs + P_pos - min_P;
2155 temp = temps + A_pos - min_A;
2156 for (i = mybegin; i < myend; ++i) {
2160 for (j = iac[i]; j < end_row; ++j) {
2161 BTindex[jac[j]] = j;
2168 for (jj = ir[i]; jj < end_rowR; ++jj) {
2171 end_rowA = ia[j + 1];
2172 for (k = ia[j]; k < end_rowA; ++k) {
2173 if (index[ja[k]] == -2) {
2174 index[ja[k]] = istart;
2178 temp[ja[k]] += rj[jj] * aj[k];
2183 for (j = 0; j < length; ++j) {
2185 istart = index[istart];
2188 end_row = ip[jj + 1];
2189 for (k = ip[jj]; k < end_row; ++k) {
2191 acj[BTindex[jp[k]]] += temp[jj] * pj[k];
2212 iindex_array = NULL;
void fasp_iarray_set(const INT n, INT *x, const INT val)
Set initial value for an array to be x=val.
void fasp_iarray_cp(const INT n, const INT *x, INT *y)
Copy an array to the other y=x.
void fasp_mem_free(void *mem)
Free up previous allocated memory body and set pointer to NULL.
void * fasp_mem_realloc(void *oldmem, const LONGLONG tsize)
Reallocate, initiate, and check memory.
void * fasp_mem_calloc(const unsigned int size, const unsigned int type)
Allocate, initiate, and check memory.
void fasp_get_start_end(const INT procid, const INT nprocs, const INT n, INT *start, INT *end)
Assign Load to each thread.
void fasp_blas_darray_ax(const INT n, const REAL a, REAL *x)
x = a*x
void fasp_dcsr_alloc(const INT m, const INT n, const INT nnz, dCSRmat *A)
Allocate CSR sparse matrix memory space.
void fasp_sparse_rapms_(INT *ir, INT *jr, INT *ia, INT *ja, INT *ip, INT *jp, INT *nin, INT *ncin, INT *iac, INT *jac, INT *maxrout)
Calculates the nonzero structure of R*A*P, if jac is not null. If jac is null only finds num of nonze...
void fasp_sparse_iit_(INT *ia, INT *ja, INT *na, INT *ma, INT *iat, INT *jat)
Transpose a boolean matrix (only given by ia, ja)
void fasp_sparse_rapcmp_(INT *ir, INT *jr, REAL *r, INT *ia, INT *ja, REAL *a, INT *ipt, INT *jpt, REAL *pt, INT *nin, INT *ncin, INT *iac, INT *jac, REAL *ac, INT *idummy)
Calculates R*A*P after the nonzero structure of the result is known. iac,jac,ac have to be allocated ...
void fasp_blas_dcsr_mxv_agg(const dCSRmat *A, const REAL *x, REAL *y)
Matrix-vector multiplication y = A*x (nonzeros of A = 1)
void fasp_blas_ldcsr_aAxpy(const REAL alpha, const dCSRmat *A, const LONGREAL *x, REAL *y)
Matrix-vector multiplication y = alpha*A*x + y.
void fasp_blas_dcsr_axm(dCSRmat *A, const REAL alpha)
Multiply a sparse matrix A in CSR format by a scalar alpha.
void fasp_blas_dcsr_mxv(const dCSRmat *A, const REAL *x, REAL *y)
Matrix-vector multiplication y = A*x.
dCSRmat fasp_blas_dcsr_rap2(INT *ir, INT *jr, REAL *r, INT *ia, INT *ja, REAL *a, INT *ipt, INT *jpt, REAL *pt, INT n, INT nc, INT *maxrpout, INT *ipin, INT *jpin)
Compute R*A*P.
void fasp_blas_dcsr_aAxpy_agg(const REAL alpha, const dCSRmat *A, const REAL *x, REAL *y)
Matrix-vector multiplication y = alpha*A*x + y (nonzeros of A = 1)
unsigned long total_alloc_mem
void fasp_blas_dcsr_mxm(const dCSRmat *A, const dCSRmat *B, dCSRmat *C)
Sparse matrix multiplication C=A*B.
void fasp_blas_dcsr_rap_agg1(const dCSRmat *R, const dCSRmat *A, const dCSRmat *P, dCSRmat *B)
Triple sparse matrix multiplication B=R*A*P (nonzeros of R, P = 1)
void fasp_blas_dcsr_rap_agg(const dCSRmat *R, const dCSRmat *A, const dCSRmat *P, dCSRmat *RAP)
Triple sparse matrix multiplication B=R*A*P (nonzeros of R, P = 1)
REAL fasp_blas_dcsr_vmv(const dCSRmat *A, const REAL *x, const REAL *y)
vector-Matrix-vector multiplication alpha = y'*A*x
SHORT fasp_blas_dcsr_add(const dCSRmat *A, const REAL alpha, const dCSRmat *B, const REAL beta, dCSRmat *C)
compute C = alpha*A + beta*B in CSR format
unsigned long total_alloc_count
void fasp_blas_dcsr_rap4(dCSRmat *R, dCSRmat *A, dCSRmat *P, dCSRmat *B, INT *icor_ysk)
Triple sparse matrix multiplication B=R*A*P.
void fasp_blas_dcsr_rap(const dCSRmat *R, const dCSRmat *A, const dCSRmat *P, dCSRmat *RAP)
Triple sparse matrix multiplication B=R*A*P.
void fasp_blas_dcsr_aAxpy(const REAL alpha, const dCSRmat *A, const REAL *x, REAL *y)
Matrix-vector multiplication y = alpha*A*x + y.
void fasp_blas_dcsr_ptap(const dCSRmat *Pt, const dCSRmat *A, const dCSRmat *P, dCSRmat *Ac)
Triple sparse matrix multiplication B=P'*A*P.
Main header file for the FASP project.
#define SHORT
FASP integer and floating point numbers.
#define FASP_SUCCESS
Definition of return status and error messages.
#define TRUE
Definition of logic type.
Sparse matrix of REAL type in CSR format.
INT col
column of matrix A, n
REAL * val
nonzero entries of A
INT row
row number of matrix A, m
INT * IA
integer array of row pointers, the size is m+1
INT nnz
number of nonzero entries
INT * JA
integer array of column indexes, the size is nnz