22#include "fasp_functs.h"
49 const INT storage_manner)
67 if ( nb > 0 && NNZ > 0) {
103 const INT storage_manner,
205 INT i,j,k,p,inb,jnb,nb2;
228 for ( j=0; j<nnz; ++j ) {
230 if (i<m-1) AT->
IA[i+2]++;
233 for ( i=2; i<=m; ++i ) AT->
IA[i]+=AT->
IA[i-1];
237 for ( i=0; i<n; ++i ) {
238 INT ibegin=A->
IA[i], iend1=A->
IA[i+1];
239 for ( p=ibegin; p<iend1; p++ ) {
243 for ( inb=0; inb<nb; inb++ )
244 for ( jnb=0; jnb<nb; jnb++ )
245 AT->
val[nb2*k + inb*nb + jnb] = A->
val[nb2*p + jnb*nb + inb];
252 for ( i=0; i<n; ++i ) {
253 INT ibegin=A->
IA[i], iend1=A->
IA[i+1];
254 for ( p=ibegin; p<iend1; p++ ) {
299 const INT nb = A->
nb;
303 INT myid, mybegin, stride_i, myend, nthreads;
306 nthreads = fasp_get_num_threads();
319 stride_i = n/nthreads;
320#pragma omp parallel private(myid, mybegin, myend, i) num_threads(nthreads)
322 myid = omp_get_thread_num();
323 mybegin = myid*stride_i;
324 if ( myid < nthreads-1 ) myend = mybegin+stride_i;
326 for ( i = mybegin; i < myend; ++i ) {
333 for ( i=0; i<n; ++i ) col_flag[Js[i]]=i+1;
338 for ( i=0; i<m; ++i ) {
339 for ( k=A->
IA[Is[i]]; k<A->IA[Is[i]+1]; ++k ) {
341 if (col_flag[j]>0) nnz++;
353 for ( i=0; i<m; ++i) {
354 for ( k=A->
IA[Is[i]]; k<A->IA[Is[i]+1]; ++k ) {
356 if ( col_flag[j] > 0 ) {
357 B->
JA[nnz]=col_flag[j]-1;
358 memcpy(B->
val+nnz*nb2, A->
val+k*nb2, nb2*
sizeof(
REAL));
388 const INT num_rowsA = A -> ROW;
389 const INT num_colsA = A -> COL;
390 const INT nb = A->
nb;
391 const INT nb2 = nb*nb;
393 const INT *A_i = A -> IA;
395 REAL *A_data = A -> val;
397 INT i, j, tempi, row_size;
401 INT myid, mybegin, myend, ibegin, iend;
402 INT nthreads = fasp_get_num_threads();
411#pragma omp parallel for private (myid,mybegin,myend,i,j,tempi,ibegin,iend)
412 for (myid = 0; myid < nthreads; myid++) {
414 for (i = mybegin; i < myend; i++) {
415 ibegin = A_i[i+1]; iend = A_i[i];
416 for (j = ibegin; j < iend; j ++) {
421 A_j[ibegin] = A_j[j];
424 memcpy(tempd+myid*nb2, A_data+ibegin*nb2, (nb2)*
sizeof(
REAL));
425 memcpy(A_data+ibegin*nb2, A_data+j*nb2, (nb2)*
sizeof(
REAL));
426 memcpy(A_data+j*nb2, tempd+myid*nb2, (nb2)*
sizeof(
REAL));
443 for (i = 0; i < num_rowsA; i ++) {
444 row_size = A_i[i+1] - A_i[i];
445 for (j = 0; j < row_size; j ++) {
453 memcpy(tempd, A_data, (nb2)*
sizeof(
REAL));
454 memcpy(A_data, A_data+j*nb2, (nb2)*
sizeof(
REAL));
455 memcpy(A_data+j*nb2, tempd, (nb2)*
sizeof(
REAL));
460 if (j == row_size-1)
return -2;
463 A_data += row_size*nb2;
470 if (status < 0)
return status;
490 const INT nb = A->
nb;
491 const INT nb2 = nb*nb;
492 const INT size = ROW*nb2;
493 const INT *IA = A->
IA;
494 const INT *JA = A->
JA;
503 INT myid, mybegin, myend;
508 nthreads = fasp_get_num_threads();
519#pragma omp parallel for private(myid, i, mybegin, myend, k)
521 for (myid = 0; myid < nthreads; myid++) {
523 for (i = mybegin; i < myend; ++i) {
524 for (k = IA[i]; k < IA[i+1]; ++k) {
526 memcpy(diaginv.
val+i*nb2, val+k*nb2, nb2*
sizeof(
REAL));
532 for (i = 0; i < ROW; ++i) {
533 for (k = IA[i]; k < IA[i+1]; ++k) {
535 memcpy(diaginv.
val+i*nb2, val+k*nb2, nb2*
sizeof(
REAL));
542#pragma omp parallel for private(myid, i, mybegin, myend)
544 for (myid = 0; myid < nthreads; myid++) {
547 for (i = mybegin; i < myend; ++i) {
552 for (i = mybegin; i < myend; ++i) {
554 diaginv.
val[i] = 1.0 / diaginv.
val[i];
561 for (i = 0; i < ROW; ++i) {
566 for (i = 0; i < ROW; ++i) {
568 diaginv.
val[i] = 1.0 / diaginv.
val[i];
597 const INT nb = A->
nb;
598 const INT nb2 = nb*nb;
599 const INT size = ROW*nb2;
600 const INT *IA = A->
IA;
601 const INT *JA = A->
JA;
614 INT myid, mybegin, myend;
619 if ( IAb ) memcpy(IAb, IA, (ROW+1)*
sizeof(
INT));
622 if ( JAb ) memcpy(JAb, JA, NNZ*
sizeof(
INT));
628 nthreads = fasp_get_num_threads();
635#pragma omp parallel for private(myid, i, mybegin, myend, k)
637 for (myid = 0; myid < nthreads; myid++) {
639 for (i = mybegin; i < myend; ++i) {
640 for (k = IA[i]; k < IA[i+1]; ++k) {
642 memcpy(diaginv+i*nb2, val+k*nb2, nb2*
sizeof(
REAL));
648 for (i = 0; i < ROW; ++i) {
649 for (k = IA[i]; k < IA[i+1]; ++k) {
651 memcpy(diaginv+i*nb2, val+k*nb2, nb2*
sizeof(
REAL));
659#pragma omp parallel for private(myid, i, mybegin, myend)
661 for (myid = 0; myid < nthreads; myid++) {
664 for (i = mybegin; i < myend; ++i) {
669 for (i = mybegin; i < myend; ++i) {
671 diaginv[i] = 1.0 / diaginv[i];
678 for (i = 0; i < ROW; ++i) {
683 for (i = 0; i < ROW; ++i) {
685 diaginv[i] = 1.0 / diaginv[i];
693#pragma omp parallel for private(myid, mybegin, myend, i, k, m, j, l)
695 for (myid = 0; myid < nthreads; myid++) {
697 for (i = mybegin; i < myend; ++i) {
698 for (k = IA[i]; k < IA[i+1]; ++k) {
703 memset(valb+m, 0X0, nb2*
sizeof(
REAL));
704 for (l = 0; l < nb; l ++) valb[m+l*nb+l] = 1.0;
714 for (i = 0; i < ROW; ++i) {
715 for (k = IA[i]; k < IA[i+1]; ++k) {
720 memset(valb+m, 0X0, nb2*
sizeof(
REAL));
721 for (l = 0; l < nb; l ++) valb[m+l*nb+l] = 1.0;
758 const INT nb = A->
nb, nbp1 = nb+1;
759 const INT nb2 = nb*nb;
770 INT i,k,m,l,ibegin,iend;
774 INT myid, mybegin, myend;
779 nthreads = fasp_get_num_threads();
789 if (IAb) memcpy(IAb, IA, (ROW+1)*
sizeof(
INT));
792 if (JAb) memcpy(JAb, JA, NNZ*
sizeof(
INT));
798#pragma omp parallel for private (myid, i, mybegin, myend, ibegin, iend, k, m, l)
800 for (myid = 0; myid < nthreads; myid++) {
802 for (i = mybegin; i < myend; ++i) {
803 ibegin = IA[i]; iend = IA[i+1];
804 for (k = ibegin; k < iend; ++k) {
811 memset(valb+m, 0X0, nb2*
sizeof(
REAL));
812 for (l = 0; l < nb; l ++) valb[m+l*nbp1] = 1.0;
820 for (i = 0; i < ROW; ++i) {
821 ibegin = IA[i]; iend = IA[i+1];
822 for (k = ibegin; k < iend; ++k) {
829 memset(valb+m, 0X0, nb2*
sizeof(
REAL));
830 for (l = 0; l < nb; l ++) valb[m+l*nbp1] = 1.0;
863 INT ROW_plus_one = ROW+1;
881 INT myid, mybegin, myend, stride_i, nthreads = 1;
884 nthreads = fasp_get_num_threads();
904 stride_i = ROW/nthreads;
905#pragma omp parallel private(myid, mybegin, myend,i,k,m,j) num_threads(nthreads)
907 myid = omp_get_thread_num();
908 mybegin = myid*stride_i;
909 if (myid < nthreads-1) myend = mybegin+stride_i;
911 for (i=mybegin; i < myend; ++i) {
916 memcpy(diaginv+i*4, val+m, 4*
sizeof(
REAL));
922 for (k = IA[i]+1; k < IA[i+1]; ++k)
934 for (i = 0; i < ROW; ++i) {
938 memcpy(diaginv+i*4, val+m, 4*
sizeof(
REAL));
944 for (k = IA[i]+1; k < IA[i+1]; ++k) {
957 stride_i = ROW/nthreads;
958#pragma omp parallel private(myid, mybegin, myend,i,k,m,j) num_threads(nthreads)
960 myid = omp_get_thread_num();
961 mybegin = myid*stride_i;
962 if (myid < nthreads-1) myend = mybegin+stride_i;
964 for (i=mybegin; i < myend; ++i) {
966 for (k = IA[i]; k < IA[i+1]; ++k) {
969 memcpy(diaginv+i*9, val+m, 9*
sizeof(
REAL));
976 for (k = IA[i]; k < IA[i+1]; ++k) {
987 for (i = 0; i < ROW; ++i) {
989 for (k = IA[i]; k < IA[i+1]; ++k) {
992 memcpy(diaginv+i*9, val+m, 9*
sizeof(
REAL));
997 printf(
"### DEBUG: row, col = %d\n", i);
1003 for (k = IA[i]; k < IA[i+1]; ++k) {
1017 stride_i = ROW/nthreads;
1018#pragma omp parallel private(myid, mybegin, myend,i,k,m,j) num_threads(nthreads)
1020 myid = omp_get_thread_num();
1021 mybegin = myid*stride_i;
1022 if (myid < nthreads-1) myend = mybegin+stride_i;
1024 for (i=mybegin; i < myend; ++i) {
1026 for (k = IA[i]; k < IA[i+1]; ++k) {
1029 memcpy(diaginv+i*25, val+m, 25*
sizeof(
REAL));
1038 for (k = IA[i]; k < IA[i+1]; ++k) {
1050 for (i = 0; i < ROW; ++i) {
1052 for (k = IA[i]; k < IA[i+1]; ++k) {
1056 memcpy(diaginv+i*25, val+m, 25*
sizeof(
REAL));
1066 REAL aa[25], bb[25];
1067 for (k = 0; k < 25; k++) bb[k] = diaginv[i * 25 + k];
1068 for (k = 0; k < 25; k++) aa[k] = diaginv[i * 25 + k];
1070 printf(
"### DEBUG: Check inverse matrix...\n");
1071 printf(
"##----------------------------------------------\n");
1072 printf(
"## %12.5e %12.5e %12.5e %12.5e %12.5e\n",
1073 bb[0]* aa[0] + bb[1] * aa[5] + bb[2] * aa[10] + bb[3] * aa[15] + bb[4] * aa[20],
1074 bb[0]* aa[1] + bb[1] * aa[6] + bb[2] * aa[11] + bb[3] * aa[16] + bb[4] * aa[21],
1075 bb[0]* aa[2] + bb[1] * aa[7] + bb[2] * aa[12] + bb[3] * aa[17] + bb[4] * aa[22],
1076 bb[0]* aa[3] + bb[1] * aa[8] + bb[2] * aa[13] + bb[3] * aa[18] + bb[4] * aa[23],
1077 bb[0]* aa[4] + bb[1] * aa[9] + bb[3] * aa[14] + bb[3] * aa[19] + bb[4] * aa[24]);
1078 printf(
"## %12.5e %12.5e %12.5e %12.5e %12.5e\n",
1079 bb[5]* aa[0] + bb[6] * aa[5] + bb[7] * aa[10] + bb[8] * aa[15] + bb[9] * aa[20],
1080 bb[5]* aa[1] + bb[6] * aa[6] + bb[7] * aa[11] + bb[8] * aa[16] + bb[9] * aa[21],
1081 bb[5]* aa[2] + bb[6] * aa[7] + bb[7] * aa[12] + bb[8] * aa[17] + bb[9] * aa[22],
1082 bb[5]* aa[3] + bb[6] * aa[8] + bb[7] * aa[13] + bb[8] * aa[18] + bb[9] * aa[23],
1083 bb[5]* aa[4] + bb[6] * aa[9] + bb[7] * aa[14] + bb[8] * aa[19] + bb[9] * aa[24]);
1084 printf(
"## %12.5e %12.5e %12.5e %12.5e %12.5e\n",
1085 bb[10]* aa[0] + bb[11] * aa[5] + bb[12] * aa[10] + bb[13] * aa[15] + bb[14] * aa[20],
1086 bb[10]* aa[1] + bb[11] * aa[6] + bb[12] * aa[11] + bb[13] * aa[16] + bb[14] * aa[21],
1087 bb[10]* aa[2] + bb[11] * aa[7] + bb[12] * aa[12] + bb[13] * aa[17] + bb[14] * aa[22],
1088 bb[10]* aa[3] + bb[11] * aa[8] + bb[12] * aa[13] + bb[13] * aa[18] + bb[14] * aa[23],
1089 bb[10]* aa[4] + bb[11] * aa[9] + bb[12] * aa[14] + bb[13] * aa[19] + bb[14] * aa[24]);
1090 printf(
"## %12.5e %12.5e %12.5e %12.5e %12.5e\n",
1091 bb[15]* aa[0] + bb[16] * aa[5] + bb[17] * aa[10] + bb[18] * aa[15] + bb[19] * aa[20],
1092 bb[15]* aa[1] + bb[16] * aa[6] + bb[17] * aa[11] + bb[18] * aa[16] + bb[19] * aa[21],
1093 bb[15]* aa[2] + bb[16] * aa[7] + bb[17] * aa[12] + bb[18] * aa[17] + bb[19] * aa[22],
1094 bb[15]* aa[3] + bb[16] * aa[8] + bb[17] * aa[13] + bb[18] * aa[18] + bb[19] * aa[23],
1095 bb[15]* aa[4] + bb[16] * aa[9] + bb[17] * aa[14] + bb[18] * aa[19] + bb[19] * aa[24]);
1096 printf(
"## %12.5e %12.5e %12.5e %12.5e %12.5e\n",
1097 bb[20]* aa[0] + bb[21] * aa[5] + bb[22] * aa[10] + bb[23] * aa[15] + bb[24] * aa[20],
1098 bb[20]* aa[1] + bb[21] * aa[6] + bb[22] * aa[11] + bb[23] * aa[16] + bb[24] * aa[21],
1099 bb[20]* aa[2] + bb[21] * aa[7] + bb[22] * aa[12] + bb[23] * aa[17] + bb[24] * aa[22],
1100 bb[20]* aa[3] + bb[21] * aa[8] + bb[22] * aa[13] + bb[23] * aa[18] + bb[24] * aa[23],
1101 bb[20]* aa[4] + bb[21] * aa[9] + bb[22] * aa[14] + bb[23] * aa[19] + bb[24] * aa[24]);
1102 printf(
"##----------------------------------------------\n");
1106 for (k = IA[i]; k < IA[i+1]; ++k) {
1120 stride_i = ROW/nthreads;
1121#pragma omp parallel private(myid, mybegin, myend,i,k,m,j) num_threads(nthreads)
1123 myid = omp_get_thread_num();
1124 mybegin = myid*stride_i;
1125 if (myid < nthreads-1) myend = mybegin+stride_i;
1127 for (i=mybegin; i < myend; ++i) {
1129 for (k = IA[i]; k < IA[i+1]; ++k) {
1132 memcpy(diaginv+i*49, val+m, 49*
sizeof(
REAL));
1141 for (k = IA[i]; k < IA[i+1]; ++k) {
1152 for (i = 0; i < ROW; ++i) {
1154 for (k = IA[i]; k < IA[i+1]; ++k) {
1157 memcpy(diaginv+i*49, val+m, 49*
sizeof(
REAL));
1167 for (k = IA[i]; k < IA[i+1]; ++k) {
1181 stride_i = ROW/nthreads;
1182#pragma omp parallel private(myid, mybegin, myend,i,k,m,j) num_threads(nthreads)
1184 myid = omp_get_thread_num();
1185 mybegin = myid*stride_i;
1186 if (myid < nthreads-1) myend = mybegin+stride_i;
1188 for (i=mybegin; i < myend; ++i) {
1190 for (k = IA[i]; k < IA[i+1]; ++k) {
1193 memcpy(diaginv+i*nb2, val+m, nb2*
sizeof(
REAL));
1202 for (k = IA[i]; k < IA[i+1]; ++k) {
1212 for (i = 0; i < ROW; ++i) {
1214 for (k = IA[i]; k < IA[i+1]; ++k) {
1217 memcpy(diaginv+i*nb2, val+m, nb2*
sizeof(
REAL));
1227 for (k = IA[i]; k < IA[i+1]; ++k) {
1267 const INT nb = A->
nb;
1268 const INT nb2 = nb*nb;
1269 const INT *IA = A->
IA;
1270 const INT *JA = A->
JA;
1283 INT myid, mybegin, myend;
1288 nthreads = fasp_get_num_threads();
1299 if (IAb) memcpy(IAb, IA, (ROW+1)*
sizeof(
INT));
1302 if (JAb) memcpy(JAb, JA, NNZ*
sizeof(
INT));
1310#pragma omp parallel for private(myid, mybegin, myend, i, ibegin, iend, m, k)
1312 for (myid = 0; myid < nthreads; myid++) {
1314 for (i = mybegin; i < myend; ++i) {
1315 ibegin = IA[i]; iend = IA[i+1];
1318 memcpy(diaginv+i*4, val+m, 4*
sizeof(
REAL));
1325 for (k = ibegin+1; k < iend; ++k) {
1333 for (i = 0; i < ROW; ++i) {
1334 ibegin = IA[i]; iend = IA[i+1];
1337 memcpy(diaginv+i*4, val+m, 4*
sizeof(
REAL));
1344 for (k = ibegin+1; k < iend; ++k) {
1356#pragma omp parallel for private(myid, mybegin, myend, i, ibegin, iend, m, k)
1358 for (myid = 0; myid < nthreads; myid++) {
1360 for (i = mybegin; i < myend; ++i) {
1361 ibegin = IA[i]; iend = IA[i+1];
1364 memcpy(diaginv+i*9, val+m, 9*
sizeof(
REAL));
1369 for (k = ibegin+1; k < iend; ++k) {
1377 for (i = 0; i < ROW; ++i) {
1378 ibegin = IA[i]; iend = IA[i+1];
1381 memcpy(diaginv+i*9, val+m, 9*
sizeof(
REAL));
1388 for (k = ibegin+1; k < iend; ++k) {
1400#pragma omp parallel for private(myid, mybegin, myend, i, ibegin, iend, m, k)
1402 for (myid = 0; myid < nthreads; myid++) {
1404 for (i = mybegin; i < myend; ++i) {
1406 ibegin = IA[i]; iend = IA[i+1];
1408 memcpy(diaginv+i*25, val+m, 25*
sizeof(
REAL));
1415 for (k = ibegin+1; k < iend; ++k) {
1423 for (i = 0; i < ROW; ++i) {
1425 ibegin = IA[i]; iend = IA[i+1];
1427 memcpy(diaginv+i*25, val+m, 25*
sizeof(
REAL));
1434 for (k = ibegin+1; k < iend; ++k) {
1445#pragma omp parallel for private(myid, i, mybegin, myend, ibegin, iend, m, k)
1447 for (myid = 0; myid < nthreads; myid++) {
1449 for (i = mybegin; i < myend; ++i) {
1451 ibegin = IA[i]; iend = IA[i+1];
1453 memcpy(diaginv+i*49, val+m, 49*
sizeof(
REAL));
1460 for (k = ibegin+1; k < iend; ++k) {
1468 for (i = 0; i < ROW; ++i) {
1470 ibegin = IA[i]; iend = IA[i+1];
1472 memcpy(diaginv+i*49, val+m, 49*
sizeof(
REAL));
1479 for (k = ibegin+1; k < iend; ++k) {
1491#pragma omp parallel for private(myid, mybegin, myend, i, ibegin, iend, m, k)
1493 for (myid = 0; myid < nthreads; myid++) {
1495 for (i = mybegin; i < myend; ++i) {
1497 ibegin = IA[i]; iend = IA[i+1];
1499 memcpy(diaginv+i*nb2, val+m, nb2*
sizeof(
REAL));
1506 for (k = ibegin+1; k < iend; ++k) {
1514 for (i = 0; i < ROW; ++i) {
1516 ibegin = IA[i]; iend = IA[i+1];
1518 memcpy(diaginv+i*nb2, val+m, nb2*
sizeof(
REAL));
1525 for (k = ibegin+1; k < iend; ++k) {
1566#pragma omp parallel for private(i,k) if(n>OPENMP_HOLDS)
1568 for (i = 0; i < n; ++i) {
1569 for (k = A->
IA[i]; k < A->IA[i+1]; ++k) {
1570 if (A->
JA[k] == i) {
1571 memcpy(diag+i*nb2, A->
val+k*nb2, nb2*
sizeof(
REAL));
1600 const INT ROW_plus_one = ROW+1;
1603 const INT nb = A->
nb;
1605 const INT *IA = A->
IA;
1606 const INT *JA = A->
JA;
1634 for (i=0; i<ROW; i++){
1636 for (j=IA[i]; j<IA[i+1]; j++){
1640 temp[0] = val[j*nb2];
1641 temp[1] = val[j*nb2+1];
1642 temp[2] = val[j*nb2+2];
1643 temp[3] = val[j*nb2+3];
1648 DL[i*nb2+2] = -temp[2]/temp[0];
1655 DU[i*nb2+1] = -temp[1]/temp[0];
1671 for (i=0; i<ROW; i++){
1673 for (j=IA[i]; j<IA[i+1]; j++){
1677 temp[0] = val[j*nb2];
1678 temp[1] = val[j*nb2+1];
1679 temp[2] = val[j*nb2+2];
1680 temp[3] = val[j*nb2+3];
1681 temp[4] = val[j*nb2+4];
1682 temp[5] = val[j*nb2+5];
1683 temp[6] = val[j*nb2+6];
1684 temp[7] = val[j*nb2+7];
1685 temp[8] = val[j*nb2+8];
1688 REAL s22 = temp[4] - ((temp[1]*temp[3])/temp[0]);
1689 REAL s23 = temp[5] - ((temp[2]*temp[3])/temp[0]);
1690 REAL s32 = temp[7] - ((temp[1]*temp[6])/temp[0]);
1696 DL[i*nb2+3] = -temp[3]/temp[0];
1699 DL[i*nb2+6] = -temp[6]/temp[0] + (temp[3]/temp[0])*(s32/s22);
1700 DL[i*nb2+7] = -s32/s22;
1705 DU[i*nb2+1] = -temp[1]/temp[0];
1706 DU[i*nb2+2] = -temp[2]/temp[0] + (temp[1]/temp[0])*(s23/s22);
1709 DU[i*nb2+5] = -s23/s22;
1725 printf(
"### ERROR: Only works for nb = 2, 3! [%s]\n", __FUNCTION__);
1735 for (i=0; i<ROW; i++){
1737 for (j=IA[i]; j<IA[i+1]; j++){
1750 valb[j*nb2+1] = 0.0;
1751 valb[j*nb2+2] = 0.0;
1764 for (i=0; i<ROW; i++){
1766 for (j=IA[i]; j<IA[i+1]; j++){
1779 valb[j*nb2+1] = 0.0;
1780 valb[j*nb2+2] = 0.0;
1781 valb[j*nb2+3] = 0.0;
1782 valb[j*nb2+5] = 0.0;
1783 valb[j*nb2+6] = 0.0;
1784 valb[j*nb2+7] = 0.0;
1797 printf(
"### ERROR: Only works for nb = 2, 3! [%s]\n", __FUNCTION__);
1829 INT ROW_plus_one = ROW+1;
1844 REAL sqt3, sqt4, sqt8;
1861 for (i=0; i<ROW; i++){
1862 for (j=IA[i]; j<IA[i+1]; j++){
1864 REAL temp0 = val[j*nb2];
1865 REAL temp1 = val[j*nb2+1];
1866 REAL temp2 = val[j*nb2+2];
1867 REAL temp3 = val[j*nb2+3];
1871 sqt3 = sqrt(
ABS(temp3));
1876 DL[i*nb2+2] = -temp2/temp0/sqt3;
1877 DL[i*nb2+3] = 1.0/sqt3;
1881 DU[i*nb2+1] = -temp1/temp0/sqt3;
1883 DU[i*nb2+3] = 1.0/sqt3;
1893 for (i=0; i<ROW; i++){
1894 for (j=IA[i]; j<IA[i+1]; j++){
1896 REAL temp0 = val[j*nb2];
1897 REAL temp1 = val[j*nb2+1];
1898 REAL temp2 = val[j*nb2+2];
1899 REAL temp3 = val[j*nb2+3];
1900 REAL temp4 = val[j*nb2+4];
1901 REAL temp5 = val[j*nb2+5];
1902 REAL temp6 = val[j*nb2+6];
1903 REAL temp7 = val[j*nb2+7];
1904 REAL temp8 = val[j*nb2+8];
1909 sqt4 = sqrt(
ABS(temp4));
1910 sqt8 = sqrt(
ABS(temp8));
1913 REAL s22 = temp4 - ((temp1*temp3)/temp0);
1914 REAL s23 = temp5 - ((temp2*temp3)/temp0);
1915 REAL s32 = temp7 - ((temp1*temp6)/temp0);
1921 DL[i*nb2+3] = -temp3/temp0/sqt4;
1922 DL[i*nb2+4] = 1.0/sqt4;
1924 DL[i*nb2+6] = (-temp6/temp0 + (temp3/temp0)*(s32/s22))/sqt8;
1925 DL[i*nb2+7] = -s32/s22/sqt8;
1926 DL[i*nb2+8] = 1.0/sqt8;
1930 DU[i*nb2+1] = -temp1/temp0/sqt4;
1931 DU[i*nb2+2] = (-temp2/temp0 + (temp1/temp0)*(s23/s22))/sqt8;
1933 DU[i*nb2+4] = 1.0/sqt4;
1934 DU[i*nb2+5] = -s23/s22/sqt8;
1937 DU[i*nb2+8] = 1.0/sqt8;
1948 printf(
"### ERROR: Only works for nb = 2, 3! [%s]\n", __FUNCTION__);
1957 for (i=0; i<ROW; i++){
1958 for (j=IA[i]; j<IA[i+1]; j++){
1966 valb[j*nb2+1] = 0.0;
1967 valb[j*nb2+2] = 0.0;
1976 for (i=0; i<ROW; i++){
1977 for (j=IA[i]; j<IA[i+1]; j++){
1985 valb[j*nb2+1] = 0.0;
1986 valb[j*nb2+2] = 0.0;
1987 valb[j*nb2+3] = 0.0;
1988 valb[j*nb2+5] = 0.0;
1989 valb[j*nb2+6] = 0.0;
1990 valb[j*nb2+7] = 0.0;
1999 printf(
"### ERROR: Only works for nb = 2, 3! [%s]\n", __FUNCTION__);
2027 const INT *ia= A->
IA, *ja = A->
JA;
2029 const INT nb = A->
nb, nb2 = nb*nb;
2033 INT i,j,k,jaj,i1,i2,start,jj;
2038 nthreads = fasp_get_num_threads();
2048 INT myid, mybegin, myend;
2050#pragma omp parallel for private(myid, mybegin, myend, i)
2052 for (myid=0; myid<nthreads; ++myid) {
2054 for (i=mybegin; i<myend; ++i) Pt[P[i]] = i;
2058 for (i=0; i<n; ++i) Pt[P[i]] = i;
2063 for (i=0; i<n; ++i) {
2065 Aperm.
IA[i+1] = Aperm.
IA[i]+(ia[k+1]-ia[k]);
2070 INT myid, mybegin, myend;
2072#pragma omp parallel for private(myid, mybegin, myend, i, i1, i2, k, start, j, jaj, jj)
2074 for (myid=0; myid<nthreads; ++myid) {
2076 for (i=mybegin; i<myend; ++i) {
2077 i1 = Aperm.
IA[i]; i2 = Aperm.
IA[i+1]-1;
2080 for (j=i1; j<=i2; ++j) {
2082 Aperm.
JA[j] = ja[jaj];
2083 for (jj=0; jj<nb2;++jj)
2084 Aperm.
val[j*nb2+jj] = Aval[jaj*nb2+jj];
2090 for (i=0; i<n; ++i) {
2091 i1 = Aperm.
IA[i]; i2 = Aperm.
IA[i+1]-1;
2094 for (j=i1; j<=i2; ++j) {
2096 Aperm.
JA[j] = ja[jaj];
2097 for (jj=0; jj<nb2;++jj)
2098 Aperm.
val[j*nb2+jj] = Aval[jaj*nb2+jj];
2105 INT myid, mybegin, myend;
2107#pragma omp parallel for private(myid, mybegin, myend, k, j)
2109 for (myid=0; myid<nthreads; ++myid) {
2111 for (k=mybegin; k<myend; ++k) {
2113 Aperm.
JA[k] = Pt[j];
2118 for (k=0; k<nnz; ++k) {
2120 Aperm.
JA[k] = Pt[j];
2144 const INT num_rowsA = A -> ROW;
2145 const INT nb = A->
nb;
2146 const INT nb2 = nb*nb;
2149 REAL *A_data = A -> val;
2151 INT i, ii, j, jj, ibegin, iend, iend1;
2155 INT myid, mybegin, myend;
2156 INT nthreads = fasp_get_num_threads();
2161#pragma omp parallel for private (myid,mybegin,myend,i,ii,j,jj,ibegin,iend,iend1)
2162 for (myid = 0; myid < nthreads; myid++) {
2164 for (i = mybegin; i < myend; i++) {
2165 ibegin = A_i[i]; iend = A_i[i+1]; iend1 = iend-1;
2166 for (j = ibegin; j < iend1; j ++) {
2168 for (jj = j+1; jj < iend; jj ++) {
2169 if (A_j[j] == A_j[jj]) {
2171 for ( ii=0; ii <nb2; ii++ )
2172 A_data[j*nb2 +ii] += A_data[ jj*nb2+ii];
2184 for (i = 0; i < num_rowsA; i ++) {
2185 ibegin = A_i[i]; iend = A_i[i+1]; iend1 = iend-1;
2186 for (j = ibegin; j < iend1; j ++) {
2188 for (jj = j+1; jj < iend; jj ++) {
2189 if (A_j[j] == A_j[jj]) {
2191 for ( ii=0; ii <nb2; ii++ )
2192 A_data[j*nb2 +ii] += A_data[ jj*nb2+ii];
2193 printf(
"### WARNING: Same col indices at %d, col %d (%d %d)!\n",
2208 memcpy(tempA_i, A_i, (num_rowsA+1 )*
sizeof(
INT));
2209 jj = 0; A_i[0] = jj;
2210 for (i = 0; i < num_rowsA; i ++) {
2211 ibegin = tempA_i[i]; iend = tempA_i[i+1];
2212 for (j = ibegin; j < iend; j ++) {
2214 memcpy(A_data +jj*nb2, A_data+j*nb2, (nb2)*
sizeof(
REAL));
2224 printf(
"### WARNING: %d col indices have been merged!\n", count);
void fasp_iarray_set(const INT n, INT *x, const INT val)
Set initial value for an array to be x=val.
void fasp_iarray_cp(const INT n, const INT *x, INT *y)
Copy an array to the other y=x.
void fasp_mem_free(void *mem)
Free up previous allocated memory body and set pointer to NULL.
void * fasp_mem_calloc(const unsigned int size, const unsigned int type)
Allocate, initiate, and check memory.
void fasp_get_start_end(const INT procid, const INT nprocs, const INT n, INT *start, INT *end)
Assign Load to each thread.
void fasp_smat_inv_nc3(REAL *a)
Compute the inverse matrix of a 3*3 full matrix A (in place)
void fasp_smat_identity_nc7(REAL *a)
Set a 7*7 full matrix to be a identity.
void fasp_smat_identity_nc3(REAL *a)
Set a 3*3 full matrix to be a identity.
SHORT fasp_smat_invp_nc(REAL *a, const INT n)
Compute the inverse of a matrix using Gauss Elimination with Pivoting.
void fasp_smat_inv_nc7(REAL *a)
Compute the inverse matrix of a 7*7 matrix a.
void fasp_smat_inv_nc2(REAL *a)
Compute the inverse matrix of a 2*2 full matrix A (in place)
SHORT fasp_smat_inv(REAL *a, const INT n)
Compute the inverse matrix of a small full matrix a.
void fasp_smat_identity_nc2(REAL *a)
Set a 2*2 full matrix to be a identity.
void fasp_smat_inv_nc5(REAL *a)
Compute the inverse matrix of a 5*5 full matrix A (in place)
void fasp_smat_identity_nc5(REAL *a)
Set a 5*5 full matrix to be a identity.
void fasp_smat_identity(REAL *a, const INT n, const INT n2)
Set a n*n full matrix to be a identity.
void fasp_blas_smat_mul(const REAL *a, const REAL *b, REAL *c, const INT n)
Compute the matrix product of two small full matrices a and b, stored in c.
void fasp_blas_smat_mul_nc2(const REAL *a, const REAL *b, REAL *c)
Compute the matrix product of two 2* matrices a and b, stored in c.
void fasp_blas_smat_mul_nc3(const REAL *a, const REAL *b, REAL *c)
Compute the matrix product of two 3*3 matrices a and b, stored in c.
void fasp_blas_smat_mul_nc7(const REAL *a, const REAL *b, REAL *c)
Compute the matrix product of two 7*7 matrices a and b, stored in c.
void fasp_blas_smat_mul_nc5(const REAL *a, const REAL *b, REAL *c)
Compute the matrix product of two 5*5 matrices a and b, stored in c.
dBSRmat fasp_dbsr_create(const INT ROW, const INT COL, const INT NNZ, const INT nb, const INT storage_manner)
Create BSR sparse matrix data memory space.
dBSRmat fasp_dbsr_diaginv(const dBSRmat *A)
Compute B := D^{-1}*A, where 'D' is the block diagonal part of A.
INT fasp_dbsr_trans(const dBSRmat *A, dBSRmat *AT)
Find A^T from given dBSRmat matrix A.
SHORT fasp_dbsr_getblk(const dBSRmat *A, const INT *Is, const INT *Js, const INT m, const INT n, dBSRmat *B)
Get a sub BSR matrix of A with specified rows and columns.
void fasp_dbsr_getdiag(INT n, const dBSRmat *A, REAL *diag)
Abstract the diagonal blocks of a BSR matrix.
dBSRmat fasp_dbsr_diagLU(const dBSRmat *A, REAL *DL, REAL *DU)
Compute B := DL*A*DU. We decompose each diagonal block of A into LDU form and DL = diag(L^{-1}) and D...
void fasp_dbsr_cp(const dBSRmat *A, dBSRmat *B)
copy a dCSRmat to a new one B=A
dBSRmat fasp_dbsr_diagLU2(dBSRmat *A, REAL *DL, REAL *DU)
Compute B := DL*A*DU. We decompose each diagonal block of A into LDU form and DL = diag(L^{-1}) and D...
dBSRmat fasp_dbsr_perm(const dBSRmat *A, const INT *P)
Apply permutation of A, i.e. Aperm=PAP' by the orders given in P.
void fasp_dbsr_alloc(const INT ROW, const INT COL, const INT NNZ, const INT nb, const INT storage_manner, dBSRmat *A)
Allocate memory space for BSR format sparse matrix.
void fasp_dbsr_free(dBSRmat *A)
Free memory space for BSR format sparse matrix.
dBSRmat fasp_dbsr_diaginv2(const dBSRmat *A, REAL *diaginv)
Compute B := D^{-1}*A, where 'D' is the block diagonal part of A.
SHORT fasp_dbsr_diagpref(dBSRmat *A)
Reorder the column and data arrays of a square BSR matrix, so that the first entry in each row is the...
dBSRmat fasp_dbsr_diaginv4(const dBSRmat *A, REAL *diaginv)
Compute B := D^{-1}*A, where 'D' is the block diagonal part of A.
INT fasp_dbsr_merge_col(dBSRmat *A)
Check and merge some same col index in one row.
dBSRmat fasp_dbsr_diaginv3(const dBSRmat *A, REAL *diaginv)
Compute B := D^{-1}*A, where 'D' is the block diagonal part of A.
dvector fasp_dbsr_getdiaginv(const dBSRmat *A)
Get D^{-1} of matrix A.
Main header file for the FASP project.
#define SHORT
FASP integer and floating point numbers.
#define FASP_SUCCESS
Definition of return status and error messages.
#define TRUE
Definition of logic type.
Block sparse row storage matrix of REAL type.
INT COL
number of cols of sub-blocks in matrix A, N
INT NNZ
number of nonzero sub-blocks in matrix A, NNZ
INT nb
dimension of each sub-block
INT * IA
integer array of row pointers, the size is ROW+1
INT ROW
number of rows of sub-blocks in matrix A, M
INT storage_manner
storage manner for each sub-block
Vector with n entries of REAL type.
REAL * val
actual vector entries