Fast Auxiliary Space Preconditioning 2.7.7 Aug/28/2022
BlaSpmvBSR.c
Go to the documentation of this file.
1
14#include <math.h>
15
16#ifdef _OPENMP
17#include <omp.h>
18#endif
19
20#include "fasp.h"
21#include "fasp_functs.h"
22
23/*---------------------------------*/
24/*-- Public Functions --*/
25/*---------------------------------*/
26
39 const REAL alpha)
40{
41 const INT nnz = A->NNZ;
42 const INT nb = A->nb;
43
44 // A direct calculation can be written as:
45 fasp_blas_darray_ax(nnz*nb*nb, alpha, A->val);
46}
47
67void fasp_blas_dbsr_aAxpby (const REAL alpha,
68 dBSRmat *A,
69 REAL *x,
70 const REAL beta,
71 REAL *y )
72{
73 /* members of A */
74 INT ROW = A->ROW;
75 INT nb = A->nb;
76 INT *IA = A->IA;
77 INT *JA = A->JA;
78 REAL *val = A->val;
79
80 /* local variables */
81 INT size = ROW*nb;
82 INT jump = nb*nb;
83 INT i,j,k,iend;
84 REAL temp;
85 REAL *pA = NULL;
86 REAL *px0 = NULL;
87 REAL *py0 = NULL;
88 REAL *py = NULL;
89
90 SHORT nthreads = 1, use_openmp = FALSE;
91
92#ifdef _OPENMP
93 if ( ROW > OPENMP_HOLDS ) {
94 use_openmp = TRUE;
95 nthreads = fasp_get_num_threads();
96 }
97#endif
98
99 //----------------------------------------------
100 // Treat (alpha == 0.0) computation
101 //----------------------------------------------
102
103 if (alpha == 0.0) {
104 fasp_blas_darray_ax(size, beta, y);
105 return;
106 }
107
108 //-------------------------------------------------
109 // y = (beta/alpha)*y
110 //-------------------------------------------------
111
112 temp = beta / alpha;
113 if (temp != 1.0) {
114 if (temp == 0.0) {
115 memset(y, 0X0, size*sizeof(REAL));
116 }
117 else {
118 //for (i = size; i--; ) y[i] *= temp; // modified by Xiaozhe, 03/11/2011
119 fasp_blas_darray_ax(size, temp, y);
120 }
121 }
122
123 //-----------------------------------------------------------------
124 // y += A*x (Core Computation)
125 // each non-zero block elements are stored in row-major order
126 //-----------------------------------------------------------------
127
128 switch (nb)
129 {
130 case 2:
131 {
132 if (use_openmp) {
133 INT myid, mybegin, myend;
134#ifdef _OPENMP
135#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py,iend)
136#endif
137 for (myid =0; myid < nthreads; myid++) {
138 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
139 for (i=mybegin; i < myend; ++i) {
140 py0 = &y[i*2];
141 iend = IA[i+1];
142 for (k = IA[i]; k < iend; ++k) {
143 j = JA[k];
144 pA = val+k*4; // &val[k*jump];
145 px0 = x+j*2; // &x[j*nb];
146 py = py0;
147 fasp_blas_smat_ypAx_nc2( pA, px0, py );
148 }
149 }
150 }
151 }
152 else {
153 for (i = 0; i < ROW; ++i) {
154 py0 = &y[i*2];
155 iend = IA[i+1];
156 for (k = IA[i]; k < iend; ++k) {
157 j = JA[k];
158 pA = val+k*4; // &val[k*jump];
159 px0 = x+j*2; // &x[j*nb];
160 py = py0;
161 fasp_blas_smat_ypAx_nc2( pA, px0, py );
162 }
163 }
164 }
165 }
166 break;
167
168 case 3:
169 {
170 if (use_openmp) {
171 INT myid, mybegin, myend;
172#ifdef _OPENMP
173#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py,iend )
174#endif
175 for (myid =0; myid < nthreads; myid++) {
176 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
177 for (i=mybegin; i < myend; ++i) {
178 py0 = &y[i*3];
179 iend = IA[i+1];
180 for (k = IA[i]; k < iend; ++k) {
181 j = JA[k];
182 pA = val+k*9; // &val[k*jump];
183 px0 = x+j*3; // &x[j*nb];
184 py = py0;
185 fasp_blas_smat_ypAx_nc3( pA, px0, py );
186 }
187 }
188 }
189 }
190 else {
191 for (i = 0; i < ROW; ++i) {
192 py0 = &y[i*3];
193 iend = IA[i+1];
194 for (k = IA[i]; k < iend; ++k) {
195 j = JA[k];
196 pA = val+k*9; // &val[k*jump];
197 px0 = x+j*3; // &x[j*nb];
198 py = py0;
199 fasp_blas_smat_ypAx_nc3( pA, px0, py );
200 }
201 }
202 }
203 }
204 break;
205
206 case 5:
207 {
208 if (use_openmp) {
209 INT myid, mybegin, myend;
210#ifdef _OPENMP
211#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py,iend )
212#endif
213 for (myid =0; myid < nthreads; myid++) {
214 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
215 for (i=mybegin; i < myend; ++i) {
216 py0 = &y[i*5];
217 iend = IA[i+1];
218 for (k = IA[i]; k < iend; ++k) {
219 j = JA[k];
220 pA = val+k*25; // &val[k*jump];
221 px0 = x+j*5; // &x[j*nb];
222 py = py0;
223 fasp_blas_smat_ypAx_nc5( pA, px0, py );
224 }
225 }
226 }
227 }
228 else {
229 for (i = 0; i < ROW; ++i) {
230 py0 = &y[i*5];
231 iend = IA[i+1];
232 for (k = IA[i]; k < iend; ++k) {
233 j = JA[k];
234 pA = val+k*25; // &val[k*jump];
235 px0 = x+j*5; // &x[j*nb];
236 py = py0;
237 fasp_blas_smat_ypAx_nc5( pA, px0, py );
238 }
239 }
240 }
241 }
242 break;
243
244 case 7:
245 {
246 if (use_openmp) {
247 INT myid, mybegin, myend;
248#ifdef _OPENMP
249#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py,iend )
250#endif
251 for (myid =0; myid < nthreads; myid++) {
252 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
253 for (i=mybegin; i < myend; ++i) {
254 py0 = &y[i*7];
255 iend = IA[i+1];
256 for (k = IA[i]; k < iend; ++k) {
257 j = JA[k];
258 pA = val+k*49; // &val[k*jump];
259 px0 = x+j*7; // &x[j*nb]��
260 py = py0;
261 fasp_blas_smat_ypAx_nc7( pA, px0, py );
262 }
263 }
264 }
265 }
266 else {
267 for (i = 0; i < ROW; ++i) {
268 py0 = &y[i*7];
269 iend = IA[i+1];
270 for (k = IA[i]; k < iend; ++k) {
271 j = JA[k];
272 pA = val+k*49; // &val[k*jump];
273 px0 = x+j*7; // &x[j*nb];
274 py = py0;
275 fasp_blas_smat_ypAx_nc7( pA, px0, py );
276 }
277 }
278 }
279 }
280 break;
281
282 default:
283 {
284 if (use_openmp) {
285 INT myid, mybegin, myend;
286#ifdef _OPENMP
287#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py,iend)
288#endif
289 for (myid =0; myid < nthreads; myid++) {
290 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
291 for (i=mybegin; i < myend; ++i) {
292 py0 = &y[i*nb];
293 iend = IA[i+1];
294 for (k = IA[i]; k < iend; ++k) {
295 j = JA[k];
296 pA = val+k*jump; // &val[k*jump];
297 px0 = x+j*nb; // &x[j*nb];
298 py = py0;
299 fasp_blas_smat_ypAx( pA, px0, py, nb );
300 }
301 }
302 }
303 }
304 else {
305 for (i = 0; i < ROW; ++i) {
306 py0 = &y[i*nb];
307 iend = IA[i+1];
308 for (k = IA[i]; k < iend; ++k) {
309 j = JA[k];
310 pA = val+k*jump; // &val[k*jump];
311 px0 = x+j*nb; // &x[j*nb];
312 py = py0;
313 fasp_blas_smat_ypAx( pA, px0, py, nb );
314 }
315 }
316 }
317 }
318 break;
319 }
320
321 //------------------------------------------
322 // y = alpha*y
323 //------------------------------------------
324
325 if (alpha != 1.0) {
326 fasp_blas_darray_ax(size, alpha, y);
327 }
328}
329
348void fasp_blas_dbsr_aAxpy (const REAL alpha,
349 const dBSRmat *A,
350 const REAL *x,
351 REAL *y)
352{
353 /* members of A */
354 const INT ROW = A->ROW;
355 const INT nb = A->nb;
356 const INT *IA = A->IA;
357 const INT *JA = A->JA;
358 const REAL *val = A->val;
359
360 /* local variables */
361 const REAL *pA = NULL;
362 const REAL *px0 = NULL;
363 REAL *py0 = NULL;
364 REAL *py = NULL;
365
366 REAL temp = 0.0;
367 INT size = ROW*nb;
368 INT jump = nb*nb;
369 INT i, j, k, iend;
370
371 SHORT nthreads = 1, use_openmp = FALSE;
372
373#ifdef _OPENMP
374 if ( ROW > OPENMP_HOLDS ) {
375 use_openmp = TRUE;
376 nthreads = fasp_get_num_threads();
377 }
378#endif
379
380 //----------------------------------------------
381 // Treat (alpha == 0.0) computation
382 //----------------------------------------------
383
384 if (alpha == 0.0){
385 return; // Nothing to compute
386 }
387
388 //-------------------------------------------------
389 // y = (1.0/alpha)*y
390 //-------------------------------------------------
391
392 if (alpha != 1.0){
393 temp = 1.0 / alpha;
394 fasp_blas_darray_ax(size, temp, y);
395 }
396
397 //-----------------------------------------------------------------
398 // y += A*x (Core Computation)
399 // each non-zero block elements are stored in row-major order
400 //-----------------------------------------------------------------
401
402 switch (nb)
403 {
404 case 2:
405 {
406 if (use_openmp) {
407 INT myid, mybegin, myend;
408#ifdef _OPENMP
409#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py,iend)
410#endif
411 for (myid =0; myid < nthreads; myid++) {
412 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
413 for (i=mybegin; i < myend; ++i) {
414 py0 = &y[i*2];
415 iend = IA[i+1];
416 for (k = IA[i]; k < iend; ++k) {
417 j = JA[k];
418 pA = val+k*4; // &val[k*jump];
419 px0 = x+j*2; // &x[j*nb];
420 py = py0;
421 fasp_blas_smat_ypAx_nc2( pA, px0, py );
422 }
423 }
424 }
425 }
426 else {
427 for (i = 0; i < ROW; ++i) {
428 py0 = &y[i*2];
429 iend = IA[i+1];
430 for (k = IA[i]; k < iend; ++k) {
431 j = JA[k];
432 pA = val+k*4; // &val[k*jump];
433 px0 = x+j*2; // &x[j*nb];
434 py = py0;
435 fasp_blas_smat_ypAx_nc2( pA, px0, py );
436 }
437 }
438 }
439 }
440 break;
441
442 case 3:
443 {
444 if (use_openmp) {
445 INT myid, mybegin, myend;
446#ifdef _OPENMP
447#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py,iend )
448#endif
449 for (myid =0; myid < nthreads; myid++) {
450 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
451 for (i=mybegin; i < myend; ++i) {
452 py0 = &y[i*3];
453 iend = IA[i+1];
454 for (k = IA[i]; k < iend; ++k) {
455 j = JA[k];
456 pA = val+k*9; // &val[k*jump];
457 px0 = x+j*3; // &x[j*nb];
458 py = py0;
459 fasp_blas_smat_ypAx_nc3( pA, px0, py );
460 }
461 }
462 }
463 }
464 else {
465 for (i = 0; i < ROW; ++i){
466 py0 = &y[i*3];
467 iend = IA[i+1];
468 for (k = IA[i]; k < iend; ++k) {
469 j = JA[k];
470 pA = val+k*9; // &val[k*jump];
471 px0 = x+j*3; // &x[j*nb];
472 py = py0;
473 fasp_blas_smat_ypAx_nc3( pA, px0, py );
474 }
475 }
476 }
477 }
478 break;
479
480 case 5:
481 {
482 if (use_openmp) {
483 INT myid, mybegin, myend;
484#ifdef _OPENMP
485#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py,iend )
486#endif
487 for (myid =0; myid < nthreads; myid++) {
488 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
489 for (i=mybegin; i < myend; ++i) {
490 py0 = &y[i*5];
491 iend = IA[i+1];
492 for (k = IA[i]; k < iend; ++k) {
493 j = JA[k];
494 pA = val+k*25; // &val[k*jump];
495 px0 = x+j*5; // &x[j*nb];
496 py = py0;
497 fasp_blas_smat_ypAx_nc5( pA, px0, py );
498 }
499 }
500 }
501 }
502 else {
503 for (i = 0; i < ROW; ++i){
504 py0 = &y[i*5];
505 iend = IA[i+1];
506 for (k = IA[i]; k < iend; ++k) {
507 j = JA[k];
508 pA = val+k*25; // &val[k*jump];
509 px0 = x+j*5; // &x[j*nb];
510 py = py0;
511 fasp_blas_smat_ypAx_nc5( pA, px0, py );
512 }
513 }
514 }
515 }
516 break;
517
518 case 7:
519 {
520 if (use_openmp) {
521 INT myid, mybegin, myend;
522#ifdef _OPENMP
523#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py,iend )
524#endif
525 for (myid =0; myid < nthreads; myid++) {
526 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
527 for (i=mybegin; i < myend; ++i) {
528 py0 = &y[i*7];
529 iend = IA[i+1];
530 for (k = IA[i]; k < iend; ++k) {
531 j = JA[k];
532 pA = val+k*49; // &val[k*jump];
533 px0 = x+j*7; // &x[j*nb];
534 py = py0;
535 fasp_blas_smat_ypAx_nc7( pA, px0, py );
536 }
537 }
538 }
539 }
540 else {
541 for (i = 0; i < ROW; ++i) {
542 py0 = &y[i*7];
543 iend = IA[i+1];
544 for (k = IA[i]; k < iend; ++k) {
545 j = JA[k];
546 pA = val+k*49; // &val[k*jump];
547 px0 = x+j*7; // &x[j*nb];
548 py = py0;
549 fasp_blas_smat_ypAx_nc7( pA, px0, py );
550 }
551
552 }
553 }
554 }
555 break;
556
557 default:
558 {
559 if (use_openmp) {
560 INT myid, mybegin, myend;
561#ifdef _OPENMP
562#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, pA, px0, py,iend)
563#endif
564 for (myid =0; myid < nthreads; myid++) {
565 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
566 for (i=mybegin; i < myend; ++i) {
567 py0 = &y[i*nb];
568 iend = IA[i+1];
569 for (k = IA[i]; k < iend; ++k) {
570 j = JA[k];
571 pA = val+k*jump; // &val[k*jump];
572 px0 = x+j*nb; // &x[j*nb];
573 py = py0;
574 fasp_blas_smat_ypAx( pA, px0, py, nb );
575 }
576
577 }
578 }
579 }
580 else {
581 for (i = 0; i < ROW; ++i) {
582 py0 = &y[i*nb];
583 iend = IA[i+1];
584 for (k = IA[i]; k < iend; ++k) {
585 j = JA[k];
586 pA = val+k*jump; // &val[k*jump];
587 px0 = x+j*nb; // &x[j*nb];
588 py = py0;
589 fasp_blas_smat_ypAx( pA, px0, py, nb );
590 }
591
592 }
593 }
594 }
595 break;
596 }
597
598 //------------------------------------------
599 // y = alpha*y
600 //------------------------------------------
601
602 if (alpha != 1.0){
603 fasp_blas_darray_ax(size, alpha, y);
604 }
605 return;
606}
607
625 const dBSRmat *A,
626 const REAL *x,
627 REAL *y)
628{
629 /* members of A */
630 const INT ROW = A->ROW;
631 const INT nb = A->nb;
632 const INT *IA = A->IA;
633 const INT *JA = A->JA;
634
635 /* local variables */
636 const REAL *px0 = NULL;
637 REAL *py0 = NULL, *py = NULL;
638 SHORT nthreads = 1, use_openmp = FALSE;
639
640 INT size = ROW*nb;
641 INT i, j, k, iend;
642 REAL temp = 0.0;
643
644#ifdef _OPENMP
645 if ( ROW > OPENMP_HOLDS ) {
646 use_openmp = TRUE;
647 nthreads = fasp_get_num_threads();
648 }
649#endif
650
651 //----------------------------------------------
652 // Treat (alpha == 0.0) computation
653 //----------------------------------------------
654
655 if (alpha == 0.0){
656 return; // Nothing to compute
657 }
658
659 //-------------------------------------------------
660 // y = (1.0/alpha)*y
661 //-------------------------------------------------
662
663 if (alpha != 1.0){
664 temp = 1.0 / alpha;
665 fasp_blas_darray_ax(size, temp, y);
666 }
667
668 //-----------------------------------------------------------------
669 // y += A*x (Core Computation)
670 // each non-zero block elements are stored in row-major order
671 //-----------------------------------------------------------------
672
673 switch (nb)
674 {
675 case 2:
676 {
677 if (use_openmp) {
678 INT myid, mybegin, myend;
679#ifdef _OPENMP
680#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, px0, py,iend)
681#endif
682 for (myid =0; myid < nthreads; myid++) {
683 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
684 for (i=mybegin; i < myend; ++i) {
685 py0 = &y[i*2];
686 iend = IA[i+1];
687 for (k = IA[i]; k < iend; ++k) {
688 j = JA[k];
689 px0 = x+j*2; // &x[j*nb];
690 py = py0;
691 py[0] += px0[0];
692 py[1] += px0[1];
693 }
694 }
695 }
696 }
697 else {
698 for (i = 0; i < ROW; ++i) {
699 py0 = &y[i*2];
700 iend = IA[i+1];
701 for (k = IA[i]; k < iend; ++k) {
702 j = JA[k];
703 px0 = x+j*2; // &x[j*nb];
704 py = py0;
705 py[0] += px0[0];
706 py[1] += px0[1];
707 }
708 }
709 }
710 }
711 break;
712
713 case 3:
714 {
715 if (use_openmp) {
716 INT myid, mybegin, myend;
717#ifdef _OPENMP
718#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, px0, py,iend )
719#endif
720 for (myid =0; myid < nthreads; myid++) {
721 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
722 for (i=mybegin; i < myend; ++i) {
723 py0 = &y[i*3];
724 iend = IA[i+1];
725 for (k = IA[i]; k < iend; ++k) {
726 j = JA[k];
727 px0 = x+j*3; // &x[j*nb];
728 py = py0;
729 py[0] += px0[0];
730 py[1] += px0[1];
731 py[2] += px0[2];
732 }
733 }
734 }
735 }
736 else {
737 for (i = 0; i < ROW; ++i){
738 py0 = &y[i*3];
739 iend = IA[i+1];
740 for (k = IA[i]; k < iend; ++k) {
741 j = JA[k];
742 px0 = x+j*3; // &x[j*nb];
743 py = py0;
744 py[0] += px0[0];
745 py[1] += px0[1];
746 py[2] += px0[2];
747 }
748 }
749 }
750 }
751 break;
752
753 case 5:
754 {
755 if (use_openmp) {
756 INT myid, mybegin, myend;
757#ifdef _OPENMP
758#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, px0, py,iend )
759#endif
760 for (myid =0; myid < nthreads; myid++) {
761 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
762 for (i=mybegin; i < myend; ++i) {
763 py0 = &y[i*5];
764 iend = IA[i+1];
765 for (k = IA[i]; k < iend; ++k) {
766 j = JA[k];
767 px0 = x+j*5; // &x[j*nb];
768 py = py0;
769 py[0] += px0[0];
770 py[1] += px0[1];
771 py[2] += px0[2];
772 py[3] += px0[3];
773 py[4] += px0[4];
774 }
775 }
776 }
777 }
778 else {
779 for (i = 0; i < ROW; ++i){
780 py0 = &y[i*5];
781 iend = IA[i+1];
782 for (k = IA[i]; k < iend; ++k) {
783 j = JA[k];
784 px0 = x+j*5; // &x[j*nb];
785 py = py0;
786 py[0] += px0[0];
787 py[1] += px0[1];
788 py[2] += px0[2];
789 py[3] += px0[3];
790 py[4] += px0[4];
791 }
792 }
793 }
794 }
795 break;
796
797 case 7:
798 {
799 if (use_openmp) {
800 INT myid, mybegin, myend;
801#ifdef _OPENMP
802#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, px0, py,iend )
803#endif
804 for (myid =0; myid < nthreads; myid++) {
805 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
806 for (i=mybegin; i < myend; ++i) {
807 py0 = &y[i*7];
808 iend = IA[i+1];
809 for (k = IA[i]; k < iend; ++k) {
810 j = JA[k];
811 px0 = x+j*7; // &x[j*nb];
812 py = py0;
813 py[0] += px0[0];
814 py[1] += px0[1];
815 py[2] += px0[2];
816 py[3] += px0[3];
817 py[4] += px0[4];
818 py[5] += px0[5];
819 py[6] += px0[6];
820 }
821 }
822 }
823 }
824 else {
825 for (i = 0; i < ROW; ++i) {
826 py0 = &y[i*7];
827 iend = IA[i+1];
828 for (k = IA[i]; k < iend; ++k) {
829 j = JA[k];
830 px0 = x+j*7; // &x[j*nb];
831 py = py0;
832 py[0] += px0[0];
833 py[1] += px0[1];
834 py[2] += px0[2];
835 py[3] += px0[3];
836 py[4] += px0[4];
837 py[5] += px0[5];
838 py[6] += px0[6];
839 }
840
841 }
842 }
843 }
844 break;
845
846 default:
847 {
848 if (use_openmp) {
849 INT myid, mybegin, myend;
850#ifdef _OPENMP
851#pragma omp parallel for private(myid, mybegin, myend, i, py0, k, j, px0, py,iend)
852#endif
853 for (myid =0; myid < nthreads; myid++) {
854 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
855 for (i=mybegin; i < myend; ++i) {
856 py0 = &y[i*nb];
857 iend = IA[i+1];
858 for (k = IA[i]; k < iend; ++k) {
859 j = JA[k];
860 px0 = x+j*nb; // &x[j*nb];
861 py = py0;
862 fasp_blas_darray_axpy(nb, 1.0, px0, py);
863 }
864
865 }
866 }
867 }
868 else {
869 for (i = 0; i < ROW; ++i) {
870 py0 = &y[i*nb];
871 iend = IA[i+1];
872 for (k = IA[i]; k < iend; ++k) {
873 j = JA[k];
874 px0 = x+j*nb; // &x[j*nb];
875 py = py0;
876 fasp_blas_darray_axpy(nb, 1.0, px0, py);
877 }
878
879 }
880 }
881 }
882 break;
883 }
884
885 //------------------------------------------
886 // y = alpha*y
887 //------------------------------------------
888
889 if ( alpha != 1.0 ) fasp_blas_darray_ax(size, alpha, y);
890
891 return;
892}
893
911 const REAL *x,
912 REAL *y)
913{
914 /* members of A */
915 const INT ROW = A->ROW;
916 const INT nb = A->nb;
917 const INT *IA = A->IA;
918 const INT *JA = A->JA;
919 const REAL *val = A->val;
920
921 /* local variables */
922 INT size = ROW*nb;
923 INT jump = nb*nb;
924 INT i,j,k, num_nnz_row;
925
926 const REAL *pA = NULL;
927 const REAL *px0 = NULL;
928 REAL *py0 = NULL;
929 REAL *py = NULL;
930
931 SHORT use_openmp = FALSE;
932
933#ifdef _OPENMP
934 INT myid, mybegin, myend, nthreads;
935 if ( ROW > OPENMP_HOLDS ) {
936 use_openmp = TRUE;
937 nthreads = fasp_get_num_threads();
938 }
939#endif
940
941 //-----------------------------------------------------------------
942 // zero out 'y'
943 //-----------------------------------------------------------------
944 fasp_darray_set(size, y, 0.0);
945
946 //-----------------------------------------------------------------
947 // y = A*x (Core Computation)
948 // each non-zero block elements are stored in row-major order
949 //-----------------------------------------------------------------
950
951 switch (nb)
952 {
953 case 3:
954 {
955 if (use_openmp) {
956#ifdef _OPENMP
957#pragma omp parallel private(myid, mybegin, myend, i, py0, num_nnz_row, k, j, pA, px0, py)
958 {
959 myid = omp_get_thread_num();
960 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
961 for (i=mybegin; i < myend; ++i)
962 {
963 py0 = &y[i*3];
964 num_nnz_row = IA[i+1] - IA[i];
965 switch(num_nnz_row)
966 {
967 case 3:
968 k = IA[i];
969 j = JA[k];
970 pA = val+k*9;
971 px0 = x+j*3;
972 py = py0;
973 fasp_blas_smat_ypAx_nc3( pA, px0, py );
974
975 k ++;
976 j = JA[k];
977 pA = val+k*9;
978 px0 = x+j*3;
979 py = py0;
980 fasp_blas_smat_ypAx_nc3( pA, px0, py );
981
982 k ++;
983 j = JA[k];
984 pA = val+k*9;
985 px0 = x+j*3;
986 py = py0;
987 fasp_blas_smat_ypAx_nc3( pA, px0, py );
988
989 break;
990
991 case 4:
992 k = IA[i];
993 j = JA[k];
994 pA = val+k*9;
995 px0 = x+j*3;
996 py = py0;
997 fasp_blas_smat_ypAx_nc3( pA, px0, py );
998
999 k ++;
1000 j = JA[k];
1001 pA = val+k*9;
1002 px0 = x+j*3;
1003 py = py0;
1004 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1005
1006 k ++;
1007 j = JA[k];
1008 pA = val+k*9;
1009 px0 = x+j*3;
1010 py = py0;
1011 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1012
1013 k ++;
1014 j = JA[k];
1015 pA = val+k*9;
1016 px0 = x+j*3;
1017 py = py0;
1018 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1019
1020 break;
1021
1022 case 5:
1023 k = IA[i];
1024 j = JA[k];
1025 pA = val+k*9;
1026 px0 = x+j*3;
1027 py = py0;
1028 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1029
1030 k ++;
1031 j = JA[k];
1032 pA = val+k*9;
1033 px0 = x+j*3;
1034 py = py0;
1035 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1036
1037 k ++;
1038 j = JA[k];
1039 pA = val+k*9;
1040 px0 = x+j*3;
1041 py = py0;
1042 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1043
1044 k ++;
1045 j = JA[k];
1046 pA = val+k*9;
1047 px0 = x+j*3;
1048 py = py0;
1049 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1050
1051 k ++;
1052 j = JA[k];
1053 pA = val+k*9;
1054 px0 = x+j*3;
1055 py = py0;
1056 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1057
1058 break;
1059
1060 case 6:
1061 k = IA[i];
1062 j = JA[k];
1063 pA = val+k*9;
1064 px0 = x+j*3;
1065 py = py0;
1066 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1067
1068 k ++;
1069 j = JA[k];
1070 pA = val+k*9;
1071 px0 = x+j*3;
1072 py = py0;
1073 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1074
1075 k ++;
1076 j = JA[k];
1077 pA = val+k*9;
1078 px0 = x+j*3;
1079 py = py0;
1080 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1081
1082 k ++;
1083 j = JA[k];
1084 pA = val+k*9;
1085 px0 = x+j*3;
1086 py = py0;
1087 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1088
1089 k ++;
1090 j = JA[k];
1091 pA = val+k*9;
1092 px0 = x+j*3;
1093 py = py0;
1094 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1095
1096 k ++;
1097 j = JA[k];
1098 pA = val+k*9;
1099 px0 = x+j*3;
1100 py = py0;
1101 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1102
1103 break;
1104
1105 case 7:
1106 k = IA[i];
1107 j = JA[k];
1108 pA = val+k*9;
1109 px0 = x+j*3;
1110 py = py0;
1111 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1112
1113 k ++;
1114 j = JA[k];
1115 pA = val+k*9;
1116 px0 = x+j*3;
1117 py = py0;
1118 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1119
1120 k ++;
1121 j = JA[k];
1122 pA = val+k*9;
1123 px0 = x+j*3;
1124 py = py0;
1125 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1126
1127 k ++;
1128 j = JA[k];
1129 pA = val+k*9;
1130 px0 = x+j*3;
1131 py = py0;
1132 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1133
1134 k ++;
1135 j = JA[k];
1136 pA = val+k*9;
1137 px0 = x+j*3;
1138 py = py0;
1139 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1140
1141 k ++;
1142 j = JA[k];
1143 pA = val+k*9;
1144 px0 = x+j*3;
1145 py = py0;
1146 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1147
1148 k ++;
1149 j = JA[k];
1150 pA = val+k*9;
1151 px0 = x+j*3;
1152 py = py0;
1153 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1154
1155 break;
1156
1157 default:
1158 for (k = IA[i]; k < IA[i+1]; ++k)
1159 {
1160 j = JA[k];
1161 pA = val+k*9;
1162 px0 = x+j*3;
1163 py = py0;
1164 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1165 }
1166 break;
1167 }
1168 }
1169 }
1170#endif
1171 }
1172 else {
1173 for (i = 0; i < ROW; ++i)
1174 {
1175 py0 = &y[i*3];
1176 num_nnz_row = IA[i+1] - IA[i];
1177 switch(num_nnz_row)
1178 {
1179 case 3:
1180 k = IA[i];
1181 j = JA[k];
1182 pA = val+k*9; // &val[k*jump];
1183 px0 = x+j*3; // &x[j*nb];
1184 py = py0;
1185 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1186
1187 k ++;
1188 j = JA[k];
1189 pA = val+k*9; // &val[k*jump];
1190 px0 = x+j*3; // &x[j*nb];
1191 py = py0;
1192 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1193
1194 k ++;
1195 j = JA[k];
1196 pA = val+k*9; // &val[k*jump];
1197 px0 = x+j*3; // &x[j*nb];
1198 py = py0;
1199 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1200
1201 break;
1202
1203 case 4:
1204 k = IA[i];
1205 j = JA[k];
1206 pA = val+k*9; // &val[k*jump];
1207 px0 = x+j*3; // &x[j*nb];
1208 py = py0;
1209 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1210
1211 k ++;
1212 j = JA[k];
1213 pA = val+k*9; // &val[k*jump];
1214 px0 = x+j*3; // &x[j*nb];
1215 py = py0;
1216 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1217
1218 k ++;
1219 j = JA[k];
1220 pA = val+k*9; // &val[k*jump];
1221 px0 = x+j*3; // &x[j*nb];
1222 py = py0;
1223 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1224
1225 k ++;
1226 j = JA[k];
1227 pA = val+k*9; // &val[k*jump];
1228 px0 = x+j*3; // &x[j*nb];
1229 py = py0;
1230 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1231
1232 break;
1233
1234 case 5:
1235 k = IA[i];
1236 j = JA[k];
1237 pA = val+k*9; // &val[k*jump];
1238 px0 = x+j*3; // &x[j*nb];
1239 py = py0;
1240 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1241
1242 k ++;
1243 j = JA[k];
1244 pA = val+k*9; // &val[k*jump];
1245 px0 = x+j*3; // &x[j*nb];
1246 py = py0;
1247 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1248
1249 k ++;
1250 j = JA[k];
1251 pA = val+k*9; // &val[k*jump];
1252 px0 = x+j*3; // &x[j*nb];
1253 py = py0;
1254 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1255
1256 k ++;
1257 j = JA[k];
1258 pA = val+k*9; // &val[k*jump];
1259 px0 = x+j*3; // &x[j*nb];
1260 py = py0;
1261 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1262
1263 k ++;
1264 j = JA[k];
1265 pA = val+k*9; // &val[k*jump];
1266 px0 = x+j*3; // &x[j*nb];
1267 py = py0;
1268 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1269
1270 break;
1271
1272 case 6:
1273 k = IA[i];
1274 j = JA[k];
1275 pA = val+k*9; // &val[k*jump];
1276 px0 = x+j*3; // &x[j*nb];
1277 py = py0;
1278 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1279
1280 k ++;
1281 j = JA[k];
1282 pA = val+k*9; // &val[k*jump];
1283 px0 = x+j*3; // &x[j*nb];
1284 py = py0;
1285 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1286
1287 k ++;
1288 j = JA[k];
1289 pA = val+k*9; // &val[k*jump];
1290 px0 = x+j*3; // &x[j*nb];
1291 py = py0;
1292 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1293
1294 k ++;
1295 j = JA[k];
1296 pA = val+k*9; // &val[k*jump];
1297 px0 = x+j*3; // &x[j*nb];
1298 py = py0;
1299 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1300
1301 k ++;
1302 j = JA[k];
1303 pA = val+k*9; // &val[k*jump];
1304 px0 = x+j*3; // &x[j*nb];
1305 py = py0;
1306 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1307
1308 k ++;
1309 j = JA[k];
1310 pA = val+k*9; // &val[k*jump];
1311 px0 = x+j*3; // &x[j*nb];
1312 py = py0;
1313 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1314
1315 break;
1316
1317 case 7:
1318 k = IA[i];
1319 j = JA[k];
1320 pA = val+k*9; // &val[k*jump];
1321 px0 = x+j*3; // &x[j*nb];
1322 py = py0;
1323 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1324
1325 k ++;
1326 j = JA[k];
1327 pA = val+k*9; // &val[k*jump];
1328 px0 = x+j*3; // &x[j*nb];
1329 py = py0;
1330 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1331
1332 k ++;
1333 j = JA[k];
1334 pA = val+k*9; // &val[k*jump];
1335 px0 = x+j*3; // &x[j*nb];
1336 py = py0;
1337 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1338
1339 k ++;
1340 j = JA[k];
1341 pA = val+k*9; // &val[k*jump];
1342 px0 = x+j*3; // &x[j*nb];
1343 py = py0;
1344 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1345
1346 k ++;
1347 j = JA[k];
1348 pA = val+k*9; // &val[k*jump];
1349 px0 = x+j*3; // &x[j*nb];
1350 py = py0;
1351 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1352
1353 k ++;
1354 j = JA[k];
1355 pA = val+k*9; // &val[k*jump];
1356 px0 = x+j*3; // &x[j*nb];
1357 py = py0;
1358 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1359
1360 k ++;
1361 j = JA[k];
1362 pA = val+k*9; // &val[k*jump];
1363 px0 = x+j*3; // &x[j*nb];
1364 py = py0;
1365 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1366
1367 break;
1368
1369 default:
1370 for (k = IA[i]; k < IA[i+1]; ++k)
1371 {
1372 j = JA[k];
1373 pA = val+k*9; // &val[k*jump];
1374 px0 = x+j*3; // &x[j*nb];
1375 py = py0;
1376 fasp_blas_smat_ypAx_nc3( pA, px0, py );
1377 }
1378 break;
1379 }
1380 }
1381 }
1382 }
1383 break;
1384
1385 case 5:
1386 {
1387 if (use_openmp) {
1388#ifdef _OPENMP
1389#pragma omp parallel private(myid, mybegin, myend, i, py0, num_nnz_row, k, j, pA, px0, py)
1390 {
1391 myid = omp_get_thread_num();
1392 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
1393 for (i=mybegin; i < myend; ++i)
1394 {
1395 py0 = &y[i*5];
1396 num_nnz_row = IA[i+1] - IA[i];
1397 switch(num_nnz_row)
1398 {
1399 case 3:
1400 k = IA[i];
1401 j = JA[k];
1402 pA = val+k*25; // &val[k*jump];
1403 px0 = x+j*5; // &x[j*nb];
1404 py = py0;
1405 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1406
1407 k ++;
1408 j = JA[k];
1409 pA = val+k*25; // &val[k*jump];
1410 px0 = x+j*5; // &x[j*nb];
1411 py = py0;
1412 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1413
1414 k ++;
1415 j = JA[k];
1416 pA = val+k*25; // &val[k*jump];
1417 px0 = x+j*5; // &x[j*nb];
1418 py = py0;
1419 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1420
1421 break;
1422
1423 case 4:
1424 k = IA[i];
1425 j = JA[k];
1426 pA = val+k*25; // &val[k*jump];
1427 px0 = x+j*5; // &x[j*nb];
1428 py = py0;
1429 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1430
1431 k ++;
1432 j = JA[k];
1433 pA = val+k*25; // &val[k*jump];
1434 px0 = x+j*5; // &x[j*nb];
1435 py = py0;
1436 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1437
1438 k ++;
1439 j = JA[k];
1440 pA = val+k*25; // &val[k*jump];
1441 px0 = x+j*5; // &x[j*nb];
1442 py = py0;
1443 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1444
1445 k ++;
1446 j = JA[k];
1447 pA = val+k*25; // &val[k*jump];
1448 px0 = x+j*5; // &x[j*nb];
1449 py = py0;
1450 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1451
1452 break;
1453
1454 case 5:
1455 k = IA[i];
1456 j = JA[k];
1457 pA = val+k*25; // &val[k*jump];
1458 px0 = x+j*5; // &x[j*nb];
1459 py = py0;
1460 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1461
1462 k ++;
1463 j = JA[k];
1464 pA = val+k*25; // &val[k*jump];
1465 px0 = x+j*5; // &x[j*nb];
1466 py = py0;
1467 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1468
1469 k ++;
1470 j = JA[k];
1471 pA = val+k*25; // &val[k*jump];
1472 px0 = x+j*5; // &x[j*nb];
1473 py = py0;
1474 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1475
1476 k ++;
1477 j = JA[k];
1478 pA = val+k*25; // &val[k*jump];
1479 px0 = x+j*5; // &x[j*nb];
1480 py = py0;
1481 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1482
1483 k ++;
1484 j = JA[k];
1485 pA = val+k*25; // &val[k*jump];
1486 px0 = x+j*5; // &x[j*nb];
1487 py = py0;
1488 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1489
1490 break;
1491
1492 case 6:
1493 k = IA[i];
1494 j = JA[k];
1495 pA = val+k*25; // &val[k*jump];
1496 px0 = x+j*5; // &x[j*nb];
1497 py = py0;
1498 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1499
1500 k ++;
1501 j = JA[k];
1502 pA = val+k*25; // &val[k*jump];
1503 px0 = x+j*5; // &x[j*nb];
1504 py = py0;
1505 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1506
1507 k ++;
1508 j = JA[k];
1509 pA = val+k*25; // &val[k*jump];
1510 px0 = x+j*5; // &x[j*nb];
1511 py = py0;
1512 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1513
1514 k ++;
1515 j = JA[k];
1516 pA = val+k*25; // &val[k*jump];
1517 px0 = x+j*5; // &x[j*nb];
1518 py = py0;
1519 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1520
1521 k ++;
1522 j = JA[k];
1523 pA = val+k*25; // &val[k*jump];
1524 px0 = x+j*5; // &x[j*nb];
1525 py = py0;
1526 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1527
1528 k ++;
1529 j = JA[k];
1530 pA = val+k*25; // &val[k*jump];
1531 px0 = x+j*5; // &x[j*nb];
1532 py = py0;
1533 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1534
1535 break;
1536
1537 case 7:
1538 k = IA[i];
1539 j = JA[k];
1540 pA = val+k*25; // &val[k*jump];
1541 px0 = x+j*5; // &x[j*nb];
1542 py = py0;
1543 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1544
1545 k ++;
1546 j = JA[k];
1547 pA = val+k*25; // &val[k*jump];
1548 px0 = x+j*5; // &x[j*nb];
1549 py = py0;
1550 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1551
1552 k ++;
1553 j = JA[k];
1554 pA = val+k*25; // &val[k*jump];
1555 px0 = x+j*5; // &x[j*nb];
1556 py = py0;
1557 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1558
1559 k ++;
1560 j = JA[k];
1561 pA = val+k*25; // &val[k*jump];
1562 px0 = x+j*5; // &x[j*nb];
1563 py = py0;
1564 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1565
1566 k ++;
1567 j = JA[k];
1568 pA = val+k*25; // &val[k*jump];
1569 px0 = x+j*5; // &x[j*nb];
1570 py = py0;
1571 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1572
1573 k ++;
1574 j = JA[k];
1575 pA = val+k*25; // &val[k*jump];
1576 px0 = x+j*5; // &x[j*nb];
1577 py = py0;
1578 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1579
1580 k ++;
1581 j = JA[k];
1582 pA = val+k*25; // &val[k*jump];
1583 px0 = x+j*5; // &x[j*nb];
1584 py = py0;
1585 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1586
1587 break;
1588
1589 default:
1590 for (k = IA[i]; k < IA[i+1]; ++k)
1591 {
1592 j = JA[k];
1593 pA = val+k*25; // &val[k*jump];
1594 px0 = x+j*5; // &x[j*nb];
1595 py = py0;
1596 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1597 }
1598 break;
1599 }
1600 }
1601 }
1602#endif
1603 }
1604 else {
1605 for (i = 0; i < ROW; ++i)
1606 {
1607 py0 = &y[i*5];
1608 num_nnz_row = IA[i+1] - IA[i];
1609 switch(num_nnz_row)
1610 {
1611 case 3:
1612 k = IA[i];
1613 j = JA[k];
1614 pA = val+k*25; // &val[k*jump];
1615 px0 = x+j*5; // &x[j*nb];
1616 py = py0;
1617 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1618
1619 k ++;
1620 j = JA[k];
1621 pA = val+k*25; // &val[k*jump];
1622 px0 = x+j*5; // &x[j*nb];
1623 py = py0;
1624 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1625
1626 k ++;
1627 j = JA[k];
1628 pA = val+k*25; // &val[k*jump];
1629 px0 = x+j*5; // &x[j*nb];
1630 py = py0;
1631 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1632
1633 break;
1634
1635 case 4:
1636 k = IA[i];
1637 j = JA[k];
1638 pA = val+k*25; // &val[k*jump];
1639 px0 = x+j*5; // &x[j*nb];
1640 py = py0;
1641 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1642
1643 k ++;
1644 j = JA[k];
1645 pA = val+k*25; // &val[k*jump];
1646 px0 = x+j*5; // &x[j*nb];
1647 py = py0;
1648 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1649
1650 k ++;
1651 j = JA[k];
1652 pA = val+k*25; // &val[k*jump];
1653 px0 = x+j*5; // &x[j*nb];
1654 py = py0;
1655 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1656
1657 k ++;
1658 j = JA[k];
1659 pA = val+k*25; // &val[k*jump];
1660 px0 = x+j*5; // &x[j*nb];
1661 py = py0;
1662 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1663
1664 break;
1665
1666 case 5:
1667 k = IA[i];
1668 j = JA[k];
1669 pA = val+k*25; // &val[k*jump];
1670 px0 = x+j*5; // &x[j*nb];
1671 py = py0;
1672 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1673
1674 k ++;
1675 j = JA[k];
1676 pA = val+k*25; // &val[k*jump];
1677 px0 = x+j*5; // &x[j*nb];
1678 py = py0;
1679 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1680
1681 k ++;
1682 j = JA[k];
1683 pA = val+k*25; // &val[k*jump];
1684 px0 = x+j*5; // &x[j*nb];
1685 py = py0;
1686 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1687
1688 k ++;
1689 j = JA[k];
1690 pA = val+k*25; // &val[k*jump];
1691 px0 = x+j*5; // &x[j*nb];
1692 py = py0;
1693 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1694
1695 k ++;
1696 j = JA[k];
1697 pA = val+k*25; // &val[k*jump];
1698 px0 = x+j*5; // &x[j*nb];
1699 py = py0;
1700 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1701
1702 break;
1703
1704 case 6:
1705 k = IA[i];
1706 j = JA[k];
1707 pA = val+k*25; // &val[k*jump];
1708 px0 = x+j*5; // &x[j*nb];
1709 py = py0;
1710 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1711
1712 k ++;
1713 j = JA[k];
1714 pA = val+k*25; // &val[k*jump];
1715 px0 = x+j*5; // &x[j*nb];
1716 py = py0;
1717 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1718
1719 k ++;
1720 j = JA[k];
1721 pA = val+k*25; // &val[k*jump];
1722 px0 = x+j*5; // &x[j*nb];
1723 py = py0;
1724 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1725
1726 k ++;
1727 j = JA[k];
1728 pA = val+k*25; // &val[k*jump];
1729 px0 = x+j*5; // &x[j*nb];
1730 py = py0;
1731 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1732
1733 k ++;
1734 j = JA[k];
1735 pA = val+k*25; // &val[k*jump];
1736 px0 = x+j*5; // &x[j*nb];
1737 py = py0;
1738 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1739
1740 k ++;
1741 j = JA[k];
1742 pA = val+k*25; // &val[k*jump];
1743 px0 = x+j*5; // &x[j*nb];
1744 py = py0;
1745 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1746
1747 break;
1748
1749 case 7:
1750 k = IA[i];
1751 j = JA[k];
1752 pA = val+k*25; // &val[k*jump];
1753 px0 = x+j*5; // &x[j*nb];
1754 py = py0;
1755 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1756
1757 k ++;
1758 j = JA[k];
1759 pA = val+k*25; // &val[k*jump];
1760 px0 = x+j*5; // &x[j*nb];
1761 py = py0;
1762 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1763
1764 k ++;
1765 j = JA[k];
1766 pA = val+k*25; // &val[k*jump];
1767 px0 = x+j*5; // &x[j*nb];
1768 py = py0;
1769 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1770
1771 k ++;
1772 j = JA[k];
1773 pA = val+k*25; // &val[k*jump];
1774 px0 = x+j*5; // &x[j*nb];
1775 py = py0;
1776 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1777
1778 k ++;
1779 j = JA[k];
1780 pA = val+k*25; // &val[k*jump];
1781 px0 = x+j*5; // &x[j*nb];
1782 py = py0;
1783 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1784
1785 k ++;
1786 j = JA[k];
1787 pA = val+k*25; // &val[k*jump];
1788 px0 = x+j*5; // &x[j*nb];
1789 py = py0;
1790 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1791
1792 k ++;
1793 j = JA[k];
1794 pA = val+k*25; // &val[k*jump];
1795 px0 = x+j*5; // &x[j*nb];
1796 py = py0;
1797 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1798
1799 break;
1800
1801 default:
1802 for (k = IA[i]; k < IA[i+1]; ++k)
1803 {
1804 j = JA[k];
1805 pA = val+k*25; // &val[k*jump];
1806 px0 = x+j*5; // &x[j*nb];
1807 py = py0;
1808 fasp_blas_smat_ypAx_nc5( pA, px0, py );
1809 }
1810 break;
1811 }
1812 }
1813 }
1814 }
1815 break;
1816
1817 case 7:
1818 {
1819 if (use_openmp) {
1820#ifdef _OPENMP
1821#pragma omp parallel private(myid, mybegin, myend, i, py0, num_nnz_row, k, j, pA, px0, py)
1822 {
1823 myid = omp_get_thread_num();
1824 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
1825 for (i=mybegin; i < myend; ++i)
1826 {
1827 py0 = &y[i*7];
1828 num_nnz_row = IA[i+1] - IA[i];
1829 switch(num_nnz_row)
1830 {
1831 case 3:
1832 k = IA[i];
1833 j = JA[k];
1834 pA = val+k*49; // &val[k*jump];
1835 px0 = x+j*7; // &x[j*nb];
1836 py = py0;
1837 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1838
1839 k ++;
1840 j = JA[k];
1841 pA = val+k*49; // &val[k*jump];
1842 px0 = x+j*7; // &x[j*nb];
1843 py = py0;
1844 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1845
1846 k ++;
1847 j = JA[k];
1848 pA = val+k*49; // &val[k*jump];
1849 px0 = x+j*7; // &x[j*nb];
1850 py = py0;
1851 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1852
1853 break;
1854
1855 case 4:
1856 k = IA[i];
1857 j = JA[k];
1858 pA = val+k*49; // &val[k*jump];
1859 px0 = x+j*7; // &x[j*nb];
1860 py = py0;
1861 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1862
1863 k ++;
1864 j = JA[k];
1865 pA = val+k*49; // &val[k*jump];
1866 px0 = x+j*7; // &x[j*nb];
1867 py = py0;
1868 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1869
1870 k ++;
1871 j = JA[k];
1872 pA = val+k*49; // &val[k*jump];
1873 px0 = x+j*7; // &x[j*nb];
1874 py = py0;
1875 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1876
1877 k ++;
1878 j = JA[k];
1879 pA = val+k*49; // &val[k*jump];
1880 px0 = x+j*7; // &x[j*nb];
1881 py = py0;
1882 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1883
1884 break;
1885
1886 case 5:
1887 k = IA[i];
1888 j = JA[k];
1889 pA = val+k*49; // &val[k*jump];
1890 px0 = x+j*7; // &x[j*nb];
1891 py = py0;
1892 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1893
1894 k ++;
1895 j = JA[k];
1896 pA = val+k*49; // &val[k*jump];
1897 px0 = x+j*7; // &x[j*nb];
1898 py = py0;
1899 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1900
1901 k ++;
1902 j = JA[k];
1903 pA = val+k*49; // &val[k*jump];
1904 px0 = x+j*7; // &x[j*nb];
1905 py = py0;
1906 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1907
1908 k ++;
1909 j = JA[k];
1910 pA = val+k*49; // &val[k*jump];
1911 px0 = x+j*7; // &x[j*nb];
1912 py = py0;
1913 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1914
1915 k ++;
1916 j = JA[k];
1917 pA = val+k*49; // &val[k*jump];
1918 px0 = x+j*7; // &x[j*nb];
1919 py = py0;
1920 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1921
1922 break;
1923
1924 case 6:
1925 k = IA[i];
1926 j = JA[k];
1927 pA = val+k*49; // &val[k*jump];
1928 px0 = x+j*7; // &x[j*nb];
1929 py = py0;
1930 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1931
1932 k ++;
1933 j = JA[k];
1934 pA = val+k*49; // &val[k*jump];
1935 px0 = x+j*7; // &x[j*nb];
1936 py = py0;
1937 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1938
1939 k ++;
1940 j = JA[k];
1941 pA = val+k*49; // &val[k*jump];
1942 px0 = x+j*7; // &x[j*nb];
1943 py = py0;
1944 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1945
1946 k ++;
1947 j = JA[k];
1948 pA = val+k*49; // &val[k*jump];
1949 px0 = x+j*7; // &x[j*nb];
1950 py = py0;
1951 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1952
1953 k ++;
1954 j = JA[k];
1955 pA = val+k*49; // &val[k*jump];
1956 px0 = x+j*7; // &x[j*nb];
1957 py = py0;
1958 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1959
1960 k ++;
1961 j = JA[k];
1962 pA = val+k*49; // &val[k*jump];
1963 px0 = x+j*7; // &x[j*nb];
1964 py = py0;
1965 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1966
1967 break;
1968
1969 case 7:
1970 k = IA[i];
1971 j = JA[k];
1972 pA = val+k*49; // &val[k*jump];
1973 px0 = x+j*7; // &x[j*nb];
1974 py = py0;
1975 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1976
1977 k ++;
1978 j = JA[k];
1979 pA = val+k*49; // &val[k*jump];
1980 px0 = x+j*7; // &x[j*nb];
1981 py = py0;
1982 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1983
1984 k ++;
1985 j = JA[k];
1986 pA = val+k*49; // &val[k*jump];
1987 px0 = x+j*7; // &x[j*nb];
1988 py = py0;
1989 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1990
1991 k ++;
1992 j = JA[k];
1993 pA = val+k*49; // &val[k*jump];
1994 px0 = x+j*7; // &x[j*nb];
1995 py = py0;
1996 fasp_blas_smat_ypAx_nc7( pA, px0, py );
1997
1998 k ++;
1999 j = JA[k];
2000 pA = val+k*49; // &val[k*jump];
2001 px0 = x+j*7; // &x[j*nb];
2002 py = py0;
2003 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2004
2005 k ++;
2006 j = JA[k];
2007 pA = val+k*49; // &val[k*jump];
2008 px0 = x+j*7; // &x[j*nb];
2009 py = py0;
2010 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2011
2012 k ++;
2013 j = JA[k];
2014 pA = val+k*49; // &val[k*jump];
2015 px0 = x+j*7; // &x[j*nb];
2016 py = py0;
2017 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2018
2019 break;
2020
2021 default:
2022 for (k = IA[i]; k < IA[i+1]; ++k)
2023 {
2024 j = JA[k];
2025 pA = val+k*49; // &val[k*jump];
2026 px0 = x+j*7; // &x[j*nb];
2027 py = py0;
2028 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2029 }
2030 break;
2031 }
2032 }
2033 }
2034#endif
2035 }
2036 else {
2037 for (i = 0; i < ROW; ++i)
2038 {
2039 py0 = &y[i*7];
2040 num_nnz_row = IA[i+1] - IA[i];
2041 switch(num_nnz_row)
2042 {
2043 case 3:
2044 k = IA[i];
2045 j = JA[k];
2046 pA = val+k*49; // &val[k*jump];
2047 px0 = x+j*7; // &x[j*nb];
2048 py = py0;
2049 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2050
2051 k ++;
2052 j = JA[k];
2053 pA = val+k*49; // &val[k*jump];
2054 px0 = x+j*7; // &x[j*nb];
2055 py = py0;
2056 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2057
2058 k ++;
2059 j = JA[k];
2060 pA = val+k*49; // &val[k*jump];
2061 px0 = x+j*7; // &x[j*nb];
2062 py = py0;
2063 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2064
2065 break;
2066
2067 case 4:
2068 k = IA[i];
2069 j = JA[k];
2070 pA = val+k*49; // &val[k*jump];
2071 px0 = x+j*7; // &x[j*nb];
2072 py = py0;
2073 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2074
2075 k ++;
2076 j = JA[k];
2077 pA = val+k*49; // &val[k*jump];
2078 px0 = x+j*7; // &x[j*nb];
2079 py = py0;
2080 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2081
2082 k ++;
2083 j = JA[k];
2084 pA = val+k*49; // &val[k*jump];
2085 px0 = x+j*7; // &x[j*nb];
2086 py = py0;
2087 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2088
2089 k ++;
2090 j = JA[k];
2091 pA = val+k*49; // &val[k*jump];
2092 px0 = x+j*7; // &x[j*nb];
2093 py = py0;
2094 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2095
2096 break;
2097
2098 case 5:
2099 k = IA[i];
2100 j = JA[k];
2101 pA = val+k*49; // &val[k*jump];
2102 px0 = x+j*7; // &x[j*nb];
2103 py = py0;
2104 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2105
2106 k ++;
2107 j = JA[k];
2108 pA = val+k*49; // &val[k*jump];
2109 px0 = x+j*7; // &x[j*nb];
2110 py = py0;
2111 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2112
2113 k ++;
2114 j = JA[k];
2115 pA = val+k*49; // &val[k*jump];
2116 px0 = x+j*7; // &x[j*nb];
2117 py = py0;
2118 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2119
2120 k ++;
2121 j = JA[k];
2122 pA = val+k*49; // &val[k*jump];
2123 px0 = x+j*7; // &x[j*nb];
2124 py = py0;
2125 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2126
2127 k ++;
2128 j = JA[k];
2129 pA = val+k*49; // &val[k*jump];
2130 px0 = x+j*7; // &x[j*nb];
2131 py = py0;
2132 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2133
2134 break;
2135
2136 case 6:
2137 k = IA[i];
2138 j = JA[k];
2139 pA = val+k*49; // &val[k*jump];
2140 px0 = x+j*7; // &x[j*nb];
2141 py = py0;
2142 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2143
2144 k ++;
2145 j = JA[k];
2146 pA = val+k*49; // &val[k*jump];
2147 px0 = x+j*7; // &x[j*nb];
2148 py = py0;
2149 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2150
2151 k ++;
2152 j = JA[k];
2153 pA = val+k*49; // &val[k*jump];
2154 px0 = x+j*7; // &x[j*nb];
2155 py = py0;
2156 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2157
2158 k ++;
2159 j = JA[k];
2160 pA = val+k*49; // &val[k*jump];
2161 px0 = x+j*7; // &x[j*nb];
2162 py = py0;
2163 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2164
2165 k ++;
2166 j = JA[k];
2167 pA = val+k*49; // &val[k*jump];
2168 px0 = x+j*7; // &x[j*nb];
2169 py = py0;
2170 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2171
2172 k ++;
2173 j = JA[k];
2174 pA = val+k*49; // &val[k*jump];
2175 px0 = x+j*7; // &x[j*nb];
2176 py = py0;
2177 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2178
2179 break;
2180
2181 case 7:
2182 k = IA[i];
2183 j = JA[k];
2184 pA = val+k*49; // &val[k*jump];
2185 px0 = x+j*7; // &x[j*nb];
2186 py = py0;
2187 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2188
2189 k ++;
2190 j = JA[k];
2191 pA = val+k*49; // &val[k*jump];
2192 px0 = x+j*7; // &x[j*nb];
2193 py = py0;
2194 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2195
2196 k ++;
2197 j = JA[k];
2198 pA = val+k*49; // &val[k*jump];
2199 px0 = x+j*7; // &x[j*nb];
2200 py = py0;
2201 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2202
2203 k ++;
2204 j = JA[k];
2205 pA = val+k*49; // &val[k*jump];
2206 px0 = x+j*7; // &x[j*nb];
2207 py = py0;
2208 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2209
2210 k ++;
2211 j = JA[k];
2212 pA = val+k*49; // &val[k*jump];
2213 px0 = x+j*7; // &x[j*nb];
2214 py = py0;
2215 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2216
2217 k ++;
2218 j = JA[k];
2219 pA = val+k*49; // &val[k*jump];
2220 px0 = x+j*7; // &x[j*nb];
2221 py = py0;
2222 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2223
2224 k ++;
2225 j = JA[k];
2226 pA = val+k*49; // &val[k*jump];
2227 px0 = x+j*7; // &x[j*nb];
2228 py = py0;
2229 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2230
2231 break;
2232
2233 default:
2234 for (k = IA[i]; k < IA[i+1]; ++k)
2235 {
2236 j = JA[k];
2237 pA = val+k*49; // &val[k*jump];
2238 px0 = x+j*7; // &x[j*nb];
2239 py = py0;
2240 fasp_blas_smat_ypAx_nc7( pA, px0, py );
2241 }
2242 break;
2243 }
2244 }
2245 }
2246 }
2247 break;
2248
2249 default:
2250 {
2251 if (use_openmp) {
2252#ifdef _OPENMP
2253#pragma omp parallel private(myid, mybegin, myend, i, py0, num_nnz_row, k, j, pA, px0, py)
2254 {
2255 myid = omp_get_thread_num();
2256 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
2257 for (i=mybegin; i < myend; ++i)
2258 {
2259 py0 = &y[i*nb];
2260 num_nnz_row = IA[i+1] - IA[i];
2261 switch(num_nnz_row)
2262 {
2263 case 3:
2264 k = IA[i];
2265 j = JA[k];
2266 pA = val+k*jump; // &val[k*jump];
2267 px0 = x+j*nb; // &x[j*nb];
2268 py = py0;
2269 fasp_blas_smat_ypAx( pA, px0, py, nb );
2270
2271 k ++;
2272 j = JA[k];
2273 pA = val+k*jump; // &val[k*jump];
2274 px0 = x+j*nb; // &x[j*nb];
2275 py = py0;
2276 fasp_blas_smat_ypAx( pA, px0, py, nb );
2277
2278 k ++;
2279 j = JA[k];
2280 pA = val+k*jump; // &val[k*jump];
2281 px0 = x+j*nb; // &x[j*nb];
2282 py = py0;
2283 fasp_blas_smat_ypAx( pA, px0, py, nb );
2284
2285 break;
2286
2287 case 4:
2288 k = IA[i];
2289 j = JA[k];
2290 pA = val+k*jump; // &val[k*jump];
2291 px0 = x+j*nb; // &x[j*nb];
2292 py = py0;
2293 fasp_blas_smat_ypAx( pA, px0, py, nb );
2294
2295 k ++;
2296 j = JA[k];
2297 pA = val+k*jump; // &val[k*jump];
2298 px0 = x+j*nb; // &x[j*nb];
2299 py = py0;
2300 fasp_blas_smat_ypAx( pA, px0, py, nb );
2301
2302 k ++;
2303 j = JA[k];
2304 pA = val+k*jump; // &val[k*jump];
2305 px0 = x+j*nb; // &x[j*nb];
2306 py = py0;
2307 fasp_blas_smat_ypAx( pA, px0, py, nb );
2308
2309 k ++;
2310 j = JA[k];
2311 pA = val+k*jump; // &val[k*jump];
2312 px0 = x+j*nb; // &x[j*nb];
2313 py = py0;
2314 fasp_blas_smat_ypAx( pA, px0, py, nb );
2315
2316 break;
2317
2318 case 5:
2319 k = IA[i];
2320 j = JA[k];
2321 pA = val+k*jump; // &val[k*jump];
2322 px0 = x+j*nb; // &x[j*nb];
2323 py = py0;
2324 fasp_blas_smat_ypAx( pA, px0, py, nb );
2325
2326 k ++;
2327 j = JA[k];
2328 pA = val+k*jump; // &val[k*jump];
2329 px0 = x+j*nb; // &x[j*nb];
2330 py = py0;
2331 fasp_blas_smat_ypAx( pA, px0, py, nb );
2332
2333 k ++;
2334 j = JA[k];
2335 pA = val+k*jump; // &val[k*jump];
2336 px0 = x+j*nb; // &x[j*nb];
2337 py = py0;
2338 fasp_blas_smat_ypAx( pA, px0, py, nb );
2339
2340 k ++;
2341 j = JA[k];
2342 pA = val+k*jump; // &val[k*jump];
2343 px0 = x+j*nb; // &x[j*nb];
2344 py = py0;
2345 fasp_blas_smat_ypAx( pA, px0, py, nb );
2346
2347 k ++;
2348 j = JA[k];
2349 pA = val+k*jump; // &val[k*jump];
2350 px0 = x+j*nb; // &x[j*nb];
2351 py = py0;
2352 fasp_blas_smat_ypAx( pA, px0, py, nb );
2353
2354 break;
2355
2356 case 6:
2357 k = IA[i];
2358 j = JA[k];
2359 pA = val+k*jump; // &val[k*jump];
2360 px0 = x+j*nb; // &x[j*nb];
2361 py = py0;
2362 fasp_blas_smat_ypAx( pA, px0, py, nb );
2363
2364 k ++;
2365 j = JA[k];
2366 pA = val+k*jump; // &val[k*jump];
2367 px0 = x+j*nb; // &x[j*nb];
2368 py = py0;
2369 fasp_blas_smat_ypAx( pA, px0, py, nb );
2370
2371 k ++;
2372 j = JA[k];
2373 pA = val+k*jump; // &val[k*jump];
2374 px0 = x+j*nb; // &x[j*nb];
2375 py = py0;
2376 fasp_blas_smat_ypAx( pA, px0, py, nb );
2377
2378 k ++;
2379 j = JA[k];
2380 pA = val+k*jump; // &val[k*jump];
2381 px0 = x+j*nb; // &x[j*nb];
2382 py = py0;
2383 fasp_blas_smat_ypAx( pA, px0, py, nb );
2384
2385 k ++;
2386 j = JA[k];
2387 pA = val+k*jump; // &val[k*jump];
2388 px0 = x+j*nb; // &x[j*nb];
2389 py = py0;
2390 fasp_blas_smat_ypAx( pA, px0, py, nb );
2391
2392 k ++;
2393 j = JA[k];
2394 pA = val+k*jump; // &val[k*jump];
2395 px0 = x+j*nb; // &x[j*nb];
2396 py = py0;
2397 fasp_blas_smat_ypAx( pA, px0, py, nb );
2398
2399 break;
2400
2401 case 7:
2402 k = IA[i];
2403 j = JA[k];
2404 pA = val+k*jump; // &val[k*jump];
2405 px0 = x+j*nb; // &x[j*nb];
2406 py = py0;
2407 fasp_blas_smat_ypAx( pA, px0, py, nb );
2408
2409 k ++;
2410 j = JA[k];
2411 pA = val+k*jump; // &val[k*jump];
2412 px0 = x+j*nb; // &x[j*nb];
2413 py = py0;
2414 fasp_blas_smat_ypAx( pA, px0, py, nb );
2415
2416 k ++;
2417 j = JA[k];
2418 pA = val+k*jump; // &val[k*jump];
2419 px0 = x+j*nb; // &x[j*nb];
2420 py = py0;
2421 fasp_blas_smat_ypAx( pA, px0, py, nb );
2422
2423 k ++;
2424 j = JA[k];
2425 pA = val+k*jump; // &val[k*jump];
2426 px0 = x+j*nb; // &x[j*nb];
2427 py = py0;
2428 fasp_blas_smat_ypAx( pA, px0, py, nb );
2429
2430 k ++;
2431 j = JA[k];
2432 pA = val+k*jump; // &val[k*jump];
2433 px0 = x+j*nb; // &x[j*nb];
2434 py = py0;
2435 fasp_blas_smat_ypAx( pA, px0, py, nb );
2436
2437 k ++;
2438 j = JA[k];
2439 pA = val+k*jump; // &val[k*jump];
2440 px0 = x+j*nb; // &x[j*nb];
2441 py = py0;
2442 fasp_blas_smat_ypAx( pA, px0, py, nb );
2443
2444 k ++;
2445 j = JA[k];
2446 pA = val+k*jump; // &val[k*jump];
2447 px0 = x+j*nb; // &x[j*nb];
2448 py = py0;
2449 fasp_blas_smat_ypAx( pA, px0, py, nb );
2450
2451 break;
2452
2453 default:
2454 for (k = IA[i]; k < IA[i+1]; ++k)
2455 {
2456 j = JA[k];
2457 pA = val+k*jump; // &val[k*jump];
2458 px0 = x+j*nb; // &x[j*nb];
2459 py = py0;
2460 fasp_blas_smat_ypAx( pA, px0, py, nb );
2461 }
2462 break;
2463 }
2464 }
2465 }
2466#endif
2467 }
2468 else {
2469 for (i = 0; i < ROW; ++i)
2470 {
2471 py0 = &y[i*nb];
2472 num_nnz_row = IA[i+1] - IA[i];
2473 switch(num_nnz_row)
2474 {
2475 case 3:
2476 k = IA[i];
2477 j = JA[k];
2478 pA = val+k*jump; // &val[k*jump];
2479 px0 = x+j*nb; // &x[j*nb];
2480 py = py0;
2481 fasp_blas_smat_ypAx( pA, px0, py, nb );
2482
2483 k ++;
2484 j = JA[k];
2485 pA = val+k*jump; // &val[k*jump];
2486 px0 = x+j*nb; // &x[j*nb];
2487 py = py0;
2488 fasp_blas_smat_ypAx( pA, px0, py, nb );
2489
2490 k ++;
2491 j = JA[k];
2492 pA = val+k*jump; // &val[k*jump];
2493 px0 = x+j*nb; // &x[j*nb];
2494 py = py0;
2495 fasp_blas_smat_ypAx( pA, px0, py, nb );
2496
2497 break;
2498
2499 case 4:
2500 k = IA[i];
2501 j = JA[k];
2502 pA = val+k*jump; // &val[k*jump];
2503 px0 = x+j*nb; // &x[j*nb];
2504 py = py0;
2505 fasp_blas_smat_ypAx( pA, px0, py, nb );
2506
2507 k ++;
2508 j = JA[k];
2509 pA = val+k*jump; // &val[k*jump];
2510 px0 = x+j*nb; // &x[j*nb];
2511 py = py0;
2512 fasp_blas_smat_ypAx( pA, px0, py, nb );
2513
2514 k ++;
2515 j = JA[k];
2516 pA = val+k*jump; // &val[k*jump];
2517 px0 = x+j*nb; // &x[j*nb];
2518 py = py0;
2519 fasp_blas_smat_ypAx( pA, px0, py, nb );
2520
2521 k ++;
2522 j = JA[k];
2523 pA = val+k*jump; // &val[k*jump];
2524 px0 = x+j*nb; // &x[j*nb];
2525 py = py0;
2526 fasp_blas_smat_ypAx( pA, px0, py, nb );
2527
2528 break;
2529
2530 case 5:
2531 k = IA[i];
2532 j = JA[k];
2533 pA = val+k*jump; // &val[k*jump];
2534 px0 = x+j*nb; // &x[j*nb];
2535 py = py0;
2536 fasp_blas_smat_ypAx( pA, px0, py, nb );
2537
2538 k ++;
2539 j = JA[k];
2540 pA = val+k*jump; // &val[k*jump];
2541 px0 = x+j*nb; // &x[j*nb];
2542 py = py0;
2543 fasp_blas_smat_ypAx( pA, px0, py, nb );
2544
2545 k ++;
2546 j = JA[k];
2547 pA = val+k*jump; // &val[k*jump];
2548 px0 = x+j*nb; // &x[j*nb];
2549 py = py0;
2550 fasp_blas_smat_ypAx( pA, px0, py, nb );
2551
2552 k ++;
2553 j = JA[k];
2554 pA = val+k*jump; // &val[k*jump];
2555 px0 = x+j*nb; // &x[j*nb];
2556 py = py0;
2557 fasp_blas_smat_ypAx( pA, px0, py, nb );
2558
2559 k ++;
2560 j = JA[k];
2561 pA = val+k*jump; // &val[k*jump];
2562 px0 = x+j*nb; // &x[j*nb];
2563 py = py0;
2564 fasp_blas_smat_ypAx( pA, px0, py, nb );
2565
2566 break;
2567
2568 case 6:
2569 k = IA[i];
2570 j = JA[k];
2571 pA = val+k*jump; // &val[k*jump];
2572 px0 = x+j*nb; // &x[j*nb];
2573 py = py0;
2574 fasp_blas_smat_ypAx( pA, px0, py, nb );
2575
2576 k ++;
2577 j = JA[k];
2578 pA = val+k*jump; // &val[k*jump];
2579 px0 = x+j*nb; // &x[j*nb];
2580 py = py0;
2581 fasp_blas_smat_ypAx( pA, px0, py, nb );
2582
2583 k ++;
2584 j = JA[k];
2585 pA = val+k*jump; // &val[k*jump];
2586 px0 = x+j*nb; // &x[j*nb];
2587 py = py0;
2588 fasp_blas_smat_ypAx( pA, px0, py, nb );
2589
2590 k ++;
2591 j = JA[k];
2592 pA = val+k*jump; // &val[k*jump];
2593 px0 = x+j*nb; // &x[j*nb];
2594 py = py0;
2595 fasp_blas_smat_ypAx( pA, px0, py, nb );
2596
2597 k ++;
2598 j = JA[k];
2599 pA = val+k*jump; // &val[k*jump];
2600 px0 = x+j*nb; // &x[j*nb];
2601 py = py0;
2602 fasp_blas_smat_ypAx( pA, px0, py, nb );
2603
2604 k ++;
2605 j = JA[k];
2606 pA = val+k*jump; // &val[k*jump];
2607 px0 = x+j*nb; // &x[j*nb];
2608 py = py0;
2609 fasp_blas_smat_ypAx( pA, px0, py, nb );
2610
2611 break;
2612
2613 case 7:
2614 k = IA[i];
2615 j = JA[k];
2616 pA = val+k*jump; // &val[k*jump];
2617 px0 = x+j*nb; // &x[j*nb];
2618 py = py0;
2619 fasp_blas_smat_ypAx( pA, px0, py, nb );
2620
2621 k ++;
2622 j = JA[k];
2623 pA = val+k*jump; // &val[k*jump];
2624 px0 = x+j*nb; // &x[j*nb];
2625 py = py0;
2626 fasp_blas_smat_ypAx( pA, px0, py, nb );
2627
2628 k ++;
2629 j = JA[k];
2630 pA = val+k*jump; // &val[k*jump];
2631 px0 = x+j*nb; // &x[j*nb];
2632 py = py0;
2633 fasp_blas_smat_ypAx( pA, px0, py, nb );
2634
2635 k ++;
2636 j = JA[k];
2637 pA = val+k*jump; // &val[k*jump];
2638 px0 = x+j*nb; // &x[j*nb];
2639 py = py0;
2640 fasp_blas_smat_ypAx( pA, px0, py, nb );
2641
2642 k ++;
2643 j = JA[k];
2644 pA = val+k*jump; // &val[k*jump];
2645 px0 = x+j*nb; // &x[j*nb];
2646 py = py0;
2647 fasp_blas_smat_ypAx( pA, px0, py, nb );
2648
2649 k ++;
2650 j = JA[k];
2651 pA = val+k*jump; // &val[k*jump];
2652 px0 = x+j*nb; // &x[j*nb];
2653 py = py0;
2654 fasp_blas_smat_ypAx( pA, px0, py, nb );
2655
2656 k ++;
2657 j = JA[k];
2658 pA = val+k*jump; // &val[k*jump];
2659 px0 = x+j*nb; // &x[j*nb];
2660 py = py0;
2661 fasp_blas_smat_ypAx( pA, px0, py, nb );
2662
2663 break;
2664
2665 default:
2666 for (k = IA[i]; k < IA[i+1]; ++k)
2667 {
2668 j = JA[k];
2669 pA = val+k*jump; // &val[k*jump];
2670 px0 = x+j*nb; // &x[j*nb];
2671 py = py0;
2672 fasp_blas_smat_ypAx( pA, px0, py, nb );
2673 }
2674 break;
2675 }
2676 }
2677 }
2678 }
2679 break;
2680 }
2681}
2682
2698 const REAL *x,
2699 REAL *y)
2700{
2701 /* members of A */
2702 const INT ROW = A->ROW;
2703 const INT nb = A->nb;
2704 const INT size = ROW*nb;
2705 const INT *IA = A->IA;
2706 const INT *JA = A->JA;
2707
2708 /* local variables */
2709 const REAL *px0 = NULL;
2710 REAL *py0 = NULL, *py = NULL;
2711 INT i,j,k, num_nnz_row;
2712 SHORT use_openmp = FALSE;
2713
2714#ifdef _OPENMP
2715 const REAL *val = A->val;
2716 const REAL *pA;
2717 INT myid, mybegin, myend, nthreads;
2718 if ( ROW > OPENMP_HOLDS ) {
2719 use_openmp = TRUE;
2720 nthreads = fasp_get_num_threads();
2721 }
2722#endif
2723
2724 //-----------------------------------------------------------------
2725 // zero out 'y'
2726 //-----------------------------------------------------------------
2727 fasp_darray_set(size, y, 0.0);
2728
2729 //-----------------------------------------------------------------
2730 // y = A*x (Core Computation)
2731 // each non-zero block elements are stored in row-major order
2732 //-----------------------------------------------------------------
2733
2734 switch (nb)
2735 {
2736 case 3:
2737 {
2738 if (use_openmp) {
2739#ifdef _OPENMP
2740#pragma omp parallel private(myid, mybegin, myend, i, py0, num_nnz_row, k, j, pA, px0, py)
2741 {
2742 myid = omp_get_thread_num();
2743 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
2744 for (i=mybegin; i < myend; ++i) {
2745 py0 = &y[i*3];
2746 num_nnz_row = IA[i+1] - IA[i];
2747 switch(num_nnz_row) {
2748 case 3:
2749 k = IA[i];
2750 j = JA[k];
2751 pA = val+k*9;
2752 px0 = x+j*3;
2753 py = py0;
2754 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2755
2756 k ++;
2757 j = JA[k];
2758 pA = val+k*9;
2759 px0 = x+j*3;
2760 py = py0;
2761 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2762
2763 k ++;
2764 j = JA[k];
2765 pA = val+k*9;
2766 px0 = x+j*3;
2767 py = py0;
2768 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2769
2770 break;
2771
2772 case 4:
2773 k = IA[i];
2774 j = JA[k];
2775 pA = val+k*9;
2776 px0 = x+j*3;
2777 py = py0;
2778 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2779
2780 k ++;
2781 j = JA[k];
2782 pA = val+k*9;
2783 px0 = x+j*3;
2784 py = py0;
2785 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2786
2787 k ++;
2788 j = JA[k];
2789 pA = val+k*9;
2790 px0 = x+j*3;
2791 py = py0;
2792 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2793
2794 k ++;
2795 j = JA[k];
2796 pA = val+k*9;
2797 px0 = x+j*3;
2798 py = py0;
2799 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2800
2801 break;
2802
2803 case 5:
2804 k = IA[i];
2805 j = JA[k];
2806 pA = val+k*9;
2807 px0 = x+j*3;
2808 py = py0;
2809 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2810
2811 k ++;
2812 j = JA[k];
2813 pA = val+k*9;
2814 px0 = x+j*3;
2815 py = py0;
2816 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2817
2818 k ++;
2819 j = JA[k];
2820 pA = val+k*9;
2821 px0 = x+j*3;
2822 py = py0;
2823 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2824
2825 k ++;
2826 j = JA[k];
2827 pA = val+k*9;
2828 px0 = x+j*3;
2829 py = py0;
2830 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2831
2832 k ++;
2833 j = JA[k];
2834 pA = val+k*9;
2835 px0 = x+j*3;
2836 py = py0;
2837 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2838
2839 break;
2840
2841 case 6:
2842 k = IA[i];
2843 j = JA[k];
2844 pA = val+k*9;
2845 px0 = x+j*3;
2846 py = py0;
2847 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2848
2849 k ++;
2850 j = JA[k];
2851 pA = val+k*9;
2852 px0 = x+j*3;
2853 py = py0;
2854 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2855
2856 k ++;
2857 j = JA[k];
2858 pA = val+k*9;
2859 px0 = x+j*3;
2860 py = py0;
2861 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2862
2863 k ++;
2864 j = JA[k];
2865 pA = val+k*9;
2866 px0 = x+j*3;
2867 py = py0;
2868 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2869
2870 k ++;
2871 j = JA[k];
2872 pA = val+k*9;
2873 px0 = x+j*3;
2874 py = py0;
2875 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2876
2877 k ++;
2878 j = JA[k];
2879 pA = val+k*9;
2880 px0 = x+j*3;
2881 py = py0;
2882 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2883
2884 break;
2885
2886 case 7:
2887 k = IA[i];
2888 j = JA[k];
2889 pA = val+k*9;
2890 px0 = x+j*3;
2891 py = py0;
2892 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2893
2894 k ++;
2895 j = JA[k];
2896 pA = val+k*9;
2897 px0 = x+j*3;
2898 py = py0;
2899 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2900
2901 k ++;
2902 j = JA[k];
2903 pA = val+k*9;
2904 px0 = x+j*3;
2905 py = py0;
2906 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2907
2908 k ++;
2909 j = JA[k];
2910 pA = val+k*9;
2911 px0 = x+j*3;
2912 py = py0;
2913 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2914
2915 k ++;
2916 j = JA[k];
2917 pA = val+k*9;
2918 px0 = x+j*3;
2919 py = py0;
2920 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2921
2922 k ++;
2923 j = JA[k];
2924 pA = val+k*9;
2925 px0 = x+j*3;
2926 py = py0;
2927 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2928
2929 k ++;
2930 j = JA[k];
2931 pA = val+k*9;
2932 px0 = x+j*3;
2933 py = py0;
2934 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2935
2936 break;
2937
2938 default:
2939 for (k = IA[i]; k < IA[i+1]; ++k)
2940 {
2941 j = JA[k];
2942 pA = val+k*9;
2943 px0 = x+j*3;
2944 py = py0;
2945 fasp_blas_smat_ypAx_nc3( pA, px0, py );
2946 }
2947 break;
2948 }
2949 }
2950 }
2951#endif
2952 }
2953 else {
2954 for (i = 0; i < ROW; ++i) {
2955 py0 = &y[i*3];
2956 num_nnz_row = IA[i+1] - IA[i];
2957 switch(num_nnz_row) {
2958 case 3:
2959 k = IA[i];
2960 j = JA[k];
2961 px0 = x+j*3; // &x[j*nb];
2962 py = py0;
2963 py[0] += px0[0];
2964 py[1] += px0[1];
2965 py[2] += px0[2];
2966
2967 k ++;
2968 j = JA[k];
2969 px0 = x+j*3; // &x[j*nb];
2970 py = py0;
2971 py[0] += px0[0];
2972 py[1] += px0[1];
2973 py[2] += px0[2];
2974
2975 k ++;
2976 j = JA[k];
2977 px0 = x+j*3; // &x[j*nb];
2978 py = py0;
2979 py[0] += px0[0];
2980 py[1] += px0[1];
2981 py[2] += px0[2];
2982
2983 break;
2984
2985 case 4:
2986 k = IA[i];
2987 j = JA[k];
2988 px0 = x+j*3; // &x[j*nb];
2989 py = py0;
2990 py[0] += px0[0];
2991 py[1] += px0[1];
2992 py[2] += px0[2];
2993
2994 k ++;
2995 j = JA[k];
2996 px0 = x+j*3; // &x[j*nb];
2997 py = py0;
2998 py[0] += px0[0];
2999 py[1] += px0[1];
3000 py[2] += px0[2];
3001
3002 k ++;
3003 j = JA[k];
3004 px0 = x+j*3; // &x[j*nb];
3005 py = py0;
3006 py[0] += px0[0];
3007 py[1] += px0[1];
3008 py[2] += px0[2];
3009
3010 k ++;
3011 j = JA[k];
3012 px0 = x+j*3; // &x[j*nb];
3013 py = py0;
3014 py[0] += px0[0];
3015 py[1] += px0[1];
3016 py[2] += px0[2];
3017
3018 break;
3019
3020 case 5:
3021 k = IA[i];
3022 j = JA[k];
3023 px0 = x+j*3; // &x[j*nb];
3024 py = py0;
3025 py[0] += px0[0];
3026 py[1] += px0[1];
3027 py[2] += px0[2];
3028
3029 k ++;
3030 j = JA[k];
3031 px0 = x+j*3; // &x[j*nb];
3032 py = py0;
3033 py[0] += px0[0];
3034 py[1] += px0[1];
3035 py[2] += px0[2];
3036
3037 k ++;
3038 j = JA[k];
3039 px0 = x+j*3; // &x[j*nb];
3040 py = py0;
3041 py[0] += px0[0];
3042 py[1] += px0[1];
3043 py[2] += px0[2];
3044
3045 k ++;
3046 j = JA[k];
3047 px0 = x+j*3; // &x[j*nb];
3048 py = py0;
3049 py[0] += px0[0];
3050 py[1] += px0[1];
3051 py[2] += px0[2];
3052
3053 k ++;
3054 j = JA[k];
3055 px0 = x+j*3; // &x[j*nb];
3056 py = py0;
3057 py[0] += px0[0];
3058 py[1] += px0[1];
3059 py[2] += px0[2];
3060
3061 break;
3062
3063 case 6:
3064 k = IA[i];
3065 j = JA[k];
3066 px0 = x+j*3; // &x[j*nb];
3067 py = py0;
3068 py[0] += px0[0];
3069 py[1] += px0[1];
3070 py[2] += px0[2];
3071
3072 k ++;
3073 j = JA[k];
3074 px0 = x+j*3; // &x[j*nb];
3075 py = py0;
3076 py[0] += px0[0];
3077 py[1] += px0[1];
3078 py[2] += px0[2];
3079
3080 k ++;
3081 j = JA[k];
3082 px0 = x+j*3; // &x[j*nb];
3083 py = py0;
3084 py[0] += px0[0];
3085 py[1] += px0[1];
3086 py[2] += px0[2];
3087
3088 k ++;
3089 j = JA[k];
3090 px0 = x+j*3; // &x[j*nb];
3091 py = py0;
3092 py[0] += px0[0];
3093 py[1] += px0[1];
3094 py[2] += px0[2];
3095
3096 k ++;
3097 j = JA[k];
3098 px0 = x+j*3; // &x[j*nb];
3099 py = py0;
3100 py[0] += px0[0];
3101 py[1] += px0[1];
3102 py[2] += px0[2];
3103
3104 k ++;
3105 j = JA[k];
3106 px0 = x+j*3; // &x[j*nb];
3107 py = py0;
3108 py[0] += px0[0];
3109 py[1] += px0[1];
3110 py[2] += px0[2];
3111
3112 break;
3113
3114 case 7:
3115 k = IA[i];
3116 j = JA[k];
3117 px0 = x+j*3; // &x[j*nb];
3118 py = py0;
3119 py[0] += px0[0];
3120 py[1] += px0[1];
3121 py[2] += px0[2];
3122
3123 k ++;
3124 j = JA[k];
3125 px0 = x+j*3; // &x[j*nb];
3126 py = py0;
3127 py[0] += px0[0];
3128 py[1] += px0[1];
3129 py[2] += px0[2];
3130
3131 k ++;
3132 j = JA[k];
3133 px0 = x+j*3; // &x[j*nb];
3134 py = py0;
3135 py[0] += px0[0];
3136 py[1] += px0[1];
3137 py[2] += px0[2];
3138
3139 k ++;
3140 j = JA[k];
3141 px0 = x+j*3; // &x[j*nb];
3142 py = py0;
3143 py[0] += px0[0];
3144 py[1] += px0[1];
3145 py[2] += px0[2];
3146
3147 k ++;
3148 j = JA[k];
3149 px0 = x+j*3; // &x[j*nb];
3150 py = py0;
3151 py[0] += px0[0];
3152 py[1] += px0[1];
3153 py[2] += px0[2];
3154
3155 k ++;
3156 j = JA[k];
3157 px0 = x+j*3; // &x[j*nb];
3158 py = py0;
3159 py[0] += px0[0];
3160 py[1] += px0[1];
3161 py[2] += px0[2];
3162
3163 k ++;
3164 j = JA[k];
3165 px0 = x+j*3; // &x[j*nb];
3166 py = py0;
3167 py[0] += px0[0];
3168 py[1] += px0[1];
3169 py[2] += px0[2];
3170
3171 break;
3172
3173 default:
3174 for (k = IA[i]; k < IA[i+1]; ++k) {
3175 j = JA[k];
3176 px0 = x+j*3; // &x[j*nb];
3177 py = py0;
3178 py[0] += px0[0];
3179 py[1] += px0[1];
3180 py[2] += px0[2];
3181 }
3182 break;
3183 }
3184 }
3185 }
3186 }
3187 break;
3188
3189 case 5:
3190 {
3191 if (use_openmp) {
3192#ifdef _OPENMP
3193#pragma omp parallel private(myid, mybegin, myend, i, py0, num_nnz_row, k, j, pA, px0, py)
3194 {
3195 myid = omp_get_thread_num();
3196 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
3197 for (i=mybegin; i < myend; ++i) {
3198 py0 = &y[i*5];
3199 num_nnz_row = IA[i+1] - IA[i];
3200 switch(num_nnz_row) {
3201
3202 case 3:
3203 k = IA[i];
3204 j = JA[k];
3205 pA = val+k*25; // &val[k*jump];
3206 px0 = x+j*5; // &x[j*nb];
3207 py = py0;
3208 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3209
3210 k ++;
3211 j = JA[k];
3212 pA = val+k*25; // &val[k*jump];
3213 px0 = x+j*5; // &x[j*nb];
3214 py = py0;
3215 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3216
3217 k ++;
3218 j = JA[k];
3219 pA = val+k*25; // &val[k*jump];
3220 px0 = x+j*5; // &x[j*nb];
3221 py = py0;
3222 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3223
3224 break;
3225
3226 case 4:
3227 k = IA[i];
3228 j = JA[k];
3229 pA = val+k*25; // &val[k*jump];
3230 px0 = x+j*5; // &x[j*nb];
3231 py = py0;
3232 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3233
3234 k ++;
3235 j = JA[k];
3236 pA = val+k*25; // &val[k*jump];
3237 px0 = x+j*5; // &x[j*nb];
3238 py = py0;
3239 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3240
3241 k ++;
3242 j = JA[k];
3243 pA = val+k*25; // &val[k*jump];
3244 px0 = x+j*5; // &x[j*nb];
3245 py = py0;
3246 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3247
3248 k ++;
3249 j = JA[k];
3250 pA = val+k*25; // &val[k*jump];
3251 px0 = x+j*5; // &x[j*nb];
3252 py = py0;
3253 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3254
3255 break;
3256
3257 case 5:
3258 k = IA[i];
3259 j = JA[k];
3260 pA = val+k*25; // &val[k*jump];
3261 px0 = x+j*5; // &x[j*nb];
3262 py = py0;
3263 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3264
3265 k ++;
3266 j = JA[k];
3267 pA = val+k*25; // &val[k*jump];
3268 px0 = x+j*5; // &x[j*nb];
3269 py = py0;
3270 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3271
3272 k ++;
3273 j = JA[k];
3274 pA = val+k*25; // &val[k*jump];
3275 px0 = x+j*5; // &x[j*nb];
3276 py = py0;
3277 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3278
3279 k ++;
3280 j = JA[k];
3281 pA = val+k*25; // &val[k*jump];
3282 px0 = x+j*5; // &x[j*nb];
3283 py = py0;
3284 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3285
3286 k ++;
3287 j = JA[k];
3288 pA = val+k*25; // &val[k*jump];
3289 px0 = x+j*5; // &x[j*nb];
3290 py = py0;
3291 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3292
3293 break;
3294
3295 case 6:
3296 k = IA[i];
3297 j = JA[k];
3298 pA = val+k*25; // &val[k*jump];
3299 px0 = x+j*5; // &x[j*nb];
3300 py = py0;
3301 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3302
3303 k ++;
3304 j = JA[k];
3305 pA = val+k*25; // &val[k*jump];
3306 px0 = x+j*5; // &x[j*nb];
3307 py = py0;
3308 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3309
3310 k ++;
3311 j = JA[k];
3312 pA = val+k*25; // &val[k*jump];
3313 px0 = x+j*5; // &x[j*nb];
3314 py = py0;
3315 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3316
3317 k ++;
3318 j = JA[k];
3319 pA = val+k*25; // &val[k*jump];
3320 px0 = x+j*5; // &x[j*nb];
3321 py = py0;
3322 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3323
3324 k ++;
3325 j = JA[k];
3326 pA = val+k*25; // &val[k*jump];
3327 px0 = x+j*5; // &x[j*nb];
3328 py = py0;
3329 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3330
3331 k ++;
3332 j = JA[k];
3333 pA = val+k*25; // &val[k*jump];
3334 px0 = x+j*5; // &x[j*nb];
3335 py = py0;
3336 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3337
3338 break;
3339
3340 case 7:
3341 k = IA[i];
3342 j = JA[k];
3343 pA = val+k*25; // &val[k*jump];
3344 px0 = x+j*5; // &x[j*nb];
3345 py = py0;
3346 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3347
3348 k ++;
3349 j = JA[k];
3350 pA = val+k*25; // &val[k*jump];
3351 px0 = x+j*5; // &x[j*nb];
3352 py = py0;
3353 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3354
3355 k ++;
3356 j = JA[k];
3357 pA = val+k*25; // &val[k*jump];
3358 px0 = x+j*5; // &x[j*nb];
3359 py = py0;
3360 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3361
3362 k ++;
3363 j = JA[k];
3364 pA = val+k*25; // &val[k*jump];
3365 px0 = x+j*5; // &x[j*nb];
3366 py = py0;
3367 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3368
3369 k ++;
3370 j = JA[k];
3371 pA = val+k*25; // &val[k*jump];
3372 px0 = x+j*5; // &x[j*nb];
3373 py = py0;
3374 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3375
3376 k ++;
3377 j = JA[k];
3378 pA = val+k*25; // &val[k*jump];
3379 px0 = x+j*5; // &x[j*nb];
3380 py = py0;
3381 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3382
3383 k ++;
3384 j = JA[k];
3385 pA = val+k*25; // &val[k*jump];
3386 px0 = x+j*5; // &x[j*nb];
3387 py = py0;
3388 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3389
3390 break;
3391
3392 default:
3393 for (k = IA[i]; k < IA[i+1]; ++k)
3394 {
3395 j = JA[k];
3396 pA = val+k*25; // &val[k*jump];
3397 px0 = x+j*5; // &x[j*nb];
3398 py = py0;
3399 fasp_blas_smat_ypAx_nc5( pA, px0, py );
3400 }
3401 break;
3402 }
3403 }
3404 }
3405#endif
3406 }
3407 else {
3408 for (i = 0; i < ROW; ++i) {
3409 py0 = &y[i*5];
3410 num_nnz_row = IA[i+1] - IA[i];
3411 switch(num_nnz_row) {
3412
3413 case 3:
3414 k = IA[i];
3415 j = JA[k];
3416 px0 = x+j*5; // &x[j*nb];
3417 py = py0;
3418 py[0] += px0[0];
3419 py[1] += px0[1];
3420 py[2] += px0[2];
3421 py[3] += px0[3];
3422 py[4] += px0[4];
3423
3424 k ++;
3425 j = JA[k];
3426 px0 = x+j*5; // &x[j*nb];
3427 py = py0;
3428 py[0] += px0[0];
3429 py[1] += px0[1];
3430 py[2] += px0[2];
3431 py[3] += px0[3];
3432 py[4] += px0[4];
3433
3434 k ++;
3435 j = JA[k];
3436 px0 = x+j*5; // &x[j*nb];
3437 py = py0;
3438 py[0] += px0[0];
3439 py[1] += px0[1];
3440 py[2] += px0[2];
3441 py[3] += px0[3];
3442 py[4] += px0[4];
3443
3444 break;
3445
3446 case 4:
3447 k = IA[i];
3448 j = JA[k];
3449 px0 = x+j*5; // &x[j*nb];
3450 py = py0;
3451 py[0] += px0[0];
3452 py[1] += px0[1];
3453 py[2] += px0[2];
3454 py[3] += px0[3];
3455 py[4] += px0[4];
3456
3457 k ++;
3458 j = JA[k];
3459 px0 = x+j*5; // &x[j*nb];
3460 py = py0;
3461 py[0] += px0[0];
3462 py[1] += px0[1];
3463 py[2] += px0[2];
3464 py[3] += px0[3];
3465 py[4] += px0[4];
3466
3467 k ++;
3468 j = JA[k];
3469 px0 = x+j*5; // &x[j*nb];
3470 py = py0;
3471 py[0] += px0[0];
3472 py[1] += px0[1];
3473 py[2] += px0[2];
3474 py[3] += px0[3];
3475 py[4] += px0[4];
3476
3477 k ++;
3478 j = JA[k];
3479 px0 = x+j*5; // &x[j*nb];
3480 py = py0;
3481 py[0] += px0[0];
3482 py[1] += px0[1];
3483 py[2] += px0[2];
3484 py[3] += px0[3];
3485 py[4] += px0[4];
3486
3487 break;
3488
3489 case 5:
3490 k = IA[i];
3491 j = JA[k];
3492 px0 = x+j*5; // &x[j*nb];
3493 py = py0;
3494 py[0] += px0[0];
3495 py[1] += px0[1];
3496 py[2] += px0[2];
3497 py[3] += px0[3];
3498 py[4] += px0[4];
3499
3500 k ++;
3501 j = JA[k];
3502 px0 = x+j*5; // &x[j*nb];
3503 py = py0;
3504 py[0] += px0[0];
3505 py[1] += px0[1];
3506 py[2] += px0[2];
3507 py[3] += px0[3];
3508 py[4] += px0[4];
3509
3510 k ++;
3511 j = JA[k];
3512 px0 = x+j*5; // &x[j*nb];
3513 py = py0;
3514 py[0] += px0[0];
3515 py[1] += px0[1];
3516 py[2] += px0[2];
3517 py[3] += px0[3];
3518 py[4] += px0[4];
3519
3520 k ++;
3521 j = JA[k];
3522 px0 = x+j*5; // &x[j*nb];
3523 py = py0;
3524 py[0] += px0[0];
3525 py[1] += px0[1];
3526 py[2] += px0[2];
3527 py[3] += px0[3];
3528 py[4] += px0[4];
3529
3530 k ++;
3531 j = JA[k];
3532 px0 = x+j*5; // &x[j*nb];
3533 py = py0;
3534 py[0] += px0[0];
3535 py[1] += px0[1];
3536 py[2] += px0[2];
3537 py[3] += px0[3];
3538 py[4] += px0[4];
3539
3540 break;
3541
3542 case 6:
3543 k = IA[i];
3544 j = JA[k];
3545 px0 = x+j*5; // &x[j*nb];
3546 py = py0;
3547 py[0] += px0[0];
3548 py[1] += px0[1];
3549 py[2] += px0[2];
3550 py[3] += px0[3];
3551 py[4] += px0[4];
3552
3553 k ++;
3554 j = JA[k];
3555 px0 = x+j*5; // &x[j*nb];
3556 py = py0;
3557 py[0] += px0[0];
3558 py[1] += px0[1];
3559 py[2] += px0[2];
3560 py[3] += px0[3];
3561 py[4] += px0[4];
3562
3563 k ++;
3564 j = JA[k];
3565 px0 = x+j*5; // &x[j*nb];
3566 py = py0;
3567 py[0] += px0[0];
3568 py[1] += px0[1];
3569 py[2] += px0[2];
3570 py[3] += px0[3];
3571 py[4] += px0[4];
3572
3573 k ++;
3574 j = JA[k];
3575 px0 = x+j*5; // &x[j*nb];
3576 py = py0;
3577 py[0] += px0[0];
3578 py[1] += px0[1];
3579 py[2] += px0[2];
3580 py[3] += px0[3];
3581 py[4] += px0[4];
3582
3583 k ++;
3584 j = JA[k];
3585 px0 = x+j*5; // &x[j*nb];
3586 py = py0;
3587 py[0] += px0[0];
3588 py[1] += px0[1];
3589 py[2] += px0[2];
3590 py[3] += px0[3];
3591 py[4] += px0[4];
3592
3593 k ++;
3594 j = JA[k];
3595 px0 = x+j*5; // &x[j*nb];
3596 py = py0;
3597 py[0] += px0[0];
3598 py[1] += px0[1];
3599 py[2] += px0[2];
3600 py[3] += px0[3];
3601 py[4] += px0[4];
3602
3603 break;
3604
3605 case 7:
3606 k = IA[i];
3607 j = JA[k];
3608 px0 = x+j*5; // &x[j*nb];
3609 py = py0;
3610 py[0] += px0[0];
3611 py[1] += px0[1];
3612 py[2] += px0[2];
3613 py[3] += px0[3];
3614 py[4] += px0[4];
3615
3616 k ++;
3617 j = JA[k];
3618 px0 = x+j*5; // &x[j*nb];
3619 py = py0;
3620 py[0] += px0[0];
3621 py[1] += px0[1];
3622 py[2] += px0[2];
3623 py[3] += px0[3];
3624 py[4] += px0[4];
3625
3626 k ++;
3627 j = JA[k];
3628 px0 = x+j*5; // &x[j*nb];
3629 py = py0;
3630 py[0] += px0[0];
3631 py[1] += px0[1];
3632 py[2] += px0[2];
3633 py[3] += px0[3];
3634 py[4] += px0[4];
3635
3636 k ++;
3637 j = JA[k];
3638 px0 = x+j*5; // &x[j*nb];
3639 py = py0;
3640 py[0] += px0[0];
3641 py[1] += px0[1];
3642 py[2] += px0[2];
3643 py[3] += px0[3];
3644 py[4] += px0[4];
3645
3646 k ++;
3647 j = JA[k];
3648 px0 = x+j*5; // &x[j*nb];
3649 py = py0;
3650 py[0] += px0[0];
3651 py[1] += px0[1];
3652 py[2] += px0[2];
3653 py[3] += px0[3];
3654 py[4] += px0[4];
3655
3656 k ++;
3657 j = JA[k];
3658 px0 = x+j*5; // &x[j*nb];
3659 py = py0;
3660 py[0] += px0[0];
3661 py[1] += px0[1];
3662 py[2] += px0[2];
3663 py[3] += px0[3];
3664 py[4] += px0[4];
3665
3666 k ++;
3667 j = JA[k];
3668 px0 = x+j*5; // &x[j*nb];
3669 py = py0;
3670 py[0] += px0[0];
3671 py[1] += px0[1];
3672 py[2] += px0[2];
3673 py[3] += px0[3];
3674 py[4] += px0[4];
3675
3676 break;
3677
3678 default:
3679 for (k = IA[i]; k < IA[i+1]; ++k) {
3680 j = JA[k];
3681 px0 = x+j*5; // &x[j*nb];
3682 py = py0;
3683 py[0] += px0[0];
3684 py[1] += px0[1];
3685 py[2] += px0[2];
3686 py[3] += px0[3];
3687 py[4] += px0[4];
3688 }
3689 break;
3690 }
3691 }
3692 }
3693 }
3694 break;
3695
3696 case 7:
3697 {
3698 if (use_openmp) {
3699#ifdef _OPENMP
3700#pragma omp parallel private(myid, mybegin, myend, i, py0, num_nnz_row, k, j, pA, px0, py)
3701 {
3702 myid = omp_get_thread_num();
3703 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
3704 for (i=mybegin; i < myend; ++i) {
3705 py0 = &y[i*7];
3706 num_nnz_row = IA[i+1] - IA[i];
3707 switch(num_nnz_row) {
3708
3709 case 3:
3710 k = IA[i];
3711 j = JA[k];
3712 pA = val+k*49; // &val[k*jump];
3713 px0 = x+j*7; // &x[j*nb];
3714 py = py0;
3715 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3716
3717 k ++;
3718 j = JA[k];
3719 pA = val+k*49; // &val[k*jump];
3720 px0 = x+j*7; // &x[j*nb];
3721 py = py0;
3722 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3723
3724 k ++;
3725 j = JA[k];
3726 pA = val+k*49; // &val[k*jump];
3727 px0 = x+j*7; // &x[j*nb];
3728 py = py0;
3729 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3730
3731 break;
3732
3733 case 4:
3734 k = IA[i];
3735 j = JA[k];
3736 pA = val+k*49; // &val[k*jump];
3737 px0 = x+j*7; // &x[j*nb];
3738 py = py0;
3739 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3740
3741 k ++;
3742 j = JA[k];
3743 pA = val+k*49; // &val[k*jump];
3744 px0 = x+j*7; // &x[j*nb];
3745 py = py0;
3746 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3747
3748 k ++;
3749 j = JA[k];
3750 pA = val+k*49; // &val[k*jump];
3751 px0 = x+j*7; // &x[j*nb];
3752 py = py0;
3753 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3754
3755 k ++;
3756 j = JA[k];
3757 pA = val+k*49; // &val[k*jump];
3758 px0 = x+j*7; // &x[j*nb];
3759 py = py0;
3760 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3761
3762 break;
3763
3764 case 5:
3765 k = IA[i];
3766 j = JA[k];
3767 pA = val+k*49; // &val[k*jump];
3768 px0 = x+j*7; // &x[j*nb];
3769 py = py0;
3770 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3771
3772 k ++;
3773 j = JA[k];
3774 pA = val+k*49; // &val[k*jump];
3775 px0 = x+j*7; // &x[j*nb];
3776 py = py0;
3777 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3778
3779 k ++;
3780 j = JA[k];
3781 pA = val+k*49; // &val[k*jump];
3782 px0 = x+j*7; // &x[j*nb];
3783 py = py0;
3784 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3785
3786 k ++;
3787 j = JA[k];
3788 pA = val+k*49; // &val[k*jump];
3789 px0 = x+j*7; // &x[j*nb];
3790 py = py0;
3791 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3792
3793 k ++;
3794 j = JA[k];
3795 pA = val+k*49; // &val[k*jump];
3796 px0 = x+j*7; // &x[j*nb];
3797 py = py0;
3798 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3799
3800 break;
3801
3802 case 6:
3803 k = IA[i];
3804 j = JA[k];
3805 pA = val+k*49; // &val[k*jump];
3806 px0 = x+j*7; // &x[j*nb];
3807 py = py0;
3808 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3809
3810 k ++;
3811 j = JA[k];
3812 pA = val+k*49; // &val[k*jump];
3813 px0 = x+j*7; // &x[j*nb];
3814 py = py0;
3815 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3816
3817 k ++;
3818 j = JA[k];
3819 pA = val+k*49; // &val[k*jump];
3820 px0 = x+j*7; // &x[j*nb];
3821 py = py0;
3822 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3823
3824 k ++;
3825 j = JA[k];
3826 pA = val+k*49; // &val[k*jump];
3827 px0 = x+j*7; // &x[j*nb];
3828 py = py0;
3829 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3830
3831 k ++;
3832 j = JA[k];
3833 pA = val+k*49; // &val[k*jump];
3834 px0 = x+j*7; // &x[j*nb];
3835 py = py0;
3836 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3837
3838 k ++;
3839 j = JA[k];
3840 pA = val+k*49; // &val[k*jump];
3841 px0 = x+j*7; // &x[j*nb];
3842 py = py0;
3843 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3844
3845 break;
3846
3847 case 7:
3848 k = IA[i];
3849 j = JA[k];
3850 pA = val+k*49; // &val[k*jump];
3851 px0 = x+j*7; // &x[j*nb];
3852 py = py0;
3853 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3854
3855 k ++;
3856 j = JA[k];
3857 pA = val+k*49; // &val[k*jump];
3858 px0 = x+j*7; // &x[j*nb];
3859 py = py0;
3860 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3861
3862 k ++;
3863 j = JA[k];
3864 pA = val+k*49; // &val[k*jump];
3865 px0 = x+j*7; // &x[j*nb];
3866 py = py0;
3867 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3868
3869 k ++;
3870 j = JA[k];
3871 pA = val+k*49; // &val[k*jump];
3872 px0 = x+j*7; // &x[j*nb];
3873 py = py0;
3874 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3875
3876 k ++;
3877 j = JA[k];
3878 pA = val+k*49; // &val[k*jump];
3879 px0 = x+j*7; // &x[j*nb];
3880 py = py0;
3881 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3882
3883 k ++;
3884 j = JA[k];
3885 pA = val+k*49; // &val[k*jump];
3886 px0 = x+j*7; // &x[j*nb];
3887 py = py0;
3888 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3889
3890 k ++;
3891 j = JA[k];
3892 pA = val+k*49; // &val[k*jump];
3893 px0 = x+j*7; // &x[j*nb];
3894 py = py0;
3895 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3896
3897 break;
3898
3899 default:
3900 for (k = IA[i]; k < IA[i+1]; ++k) {
3901 j = JA[k];
3902 pA = val+k*49; // &val[k*jump];
3903 px0 = x+j*7; // &x[j*nb];
3904 py = py0;
3905 fasp_blas_smat_ypAx_nc7( pA, px0, py );
3906 }
3907 break;
3908 }
3909 }
3910 }
3911#endif
3912 }
3913 else {
3914 for (i = 0; i < ROW; ++i) {
3915 py0 = &y[i*7];
3916 num_nnz_row = IA[i+1] - IA[i];
3917 switch(num_nnz_row) {
3918
3919 case 3:
3920 k = IA[i];
3921 j = JA[k];
3922 px0 = x+j*7; // &x[j*nb];
3923 py = py0;
3924 py[0] += px0[0];
3925 py[1] += px0[1];
3926 py[2] += px0[2];
3927 py[3] += px0[3];
3928 py[4] += px0[4];
3929 py[5] += px0[5];
3930 py[6] += px0[6];
3931
3932 k ++;
3933 j = JA[k];
3934 px0 = x+j*7; // &x[j*nb];
3935 py = py0;
3936 py[0] += px0[0];
3937 py[1] += px0[1];
3938 py[2] += px0[2];
3939 py[3] += px0[3];
3940 py[4] += px0[4];
3941 py[5] += px0[5];
3942 py[6] += px0[6];
3943
3944 k ++;
3945 j = JA[k];
3946 px0 = x+j*7; // &x[j*nb];
3947 py = py0;
3948 py[0] += px0[0];
3949 py[1] += px0[1];
3950 py[2] += px0[2];
3951 py[3] += px0[3];
3952 py[4] += px0[4];
3953 py[5] += px0[5];
3954 py[6] += px0[6];
3955
3956 break;
3957
3958 case 4:
3959 k = IA[i];
3960 j = JA[k];
3961 px0 = x+j*7; // &x[j*nb];
3962 py = py0;
3963 py[0] += px0[0];
3964 py[1] += px0[1];
3965 py[2] += px0[2];
3966 py[3] += px0[3];
3967 py[4] += px0[4];
3968 py[5] += px0[5];
3969 py[6] += px0[6];
3970
3971 k ++;
3972 j = JA[k];
3973 px0 = x+j*7; // &x[j*nb];
3974 py = py0;
3975 py[0] += px0[0];
3976 py[1] += px0[1];
3977 py[2] += px0[2];
3978 py[3] += px0[3];
3979 py[4] += px0[4];
3980 py[5] += px0[5];
3981 py[6] += px0[6];
3982
3983 k ++;
3984 j = JA[k];
3985 px0 = x+j*7; // &x[j*nb];
3986 py = py0;
3987 py[0] += px0[0];
3988 py[1] += px0[1];
3989 py[2] += px0[2];
3990 py[3] += px0[3];
3991 py[4] += px0[4];
3992 py[5] += px0[5];
3993 py[6] += px0[6];
3994
3995 k ++;
3996 j = JA[k];
3997 px0 = x+j*7; // &x[j*nb];
3998 py = py0;
3999 py[0] += px0[0];
4000 py[1] += px0[1];
4001 py[2] += px0[2];
4002 py[3] += px0[3];
4003 py[4] += px0[4];
4004 py[5] += px0[5];
4005 py[6] += px0[6];
4006
4007 break;
4008
4009 case 5:
4010 k = IA[i];
4011 j = JA[k];
4012 px0 = x+j*7; // &x[j*nb];
4013 py = py0;
4014 py[0] += px0[0];
4015 py[1] += px0[1];
4016 py[2] += px0[2];
4017 py[3] += px0[3];
4018 py[4] += px0[4];
4019 py[5] += px0[5];
4020 py[6] += px0[6];
4021
4022 k ++;
4023 j = JA[k];
4024 px0 = x+j*7; // &x[j*nb];
4025 py = py0;
4026 py[0] += px0[0];
4027 py[1] += px0[1];
4028 py[2] += px0[2];
4029 py[3] += px0[3];
4030 py[4] += px0[4];
4031 py[5] += px0[5];
4032 py[6] += px0[6];
4033
4034 k ++;
4035 j = JA[k];
4036 px0 = x+j*7; // &x[j*nb];
4037 py = py0;
4038 py[0] += px0[0];
4039 py[1] += px0[1];
4040 py[2] += px0[2];
4041 py[3] += px0[3];
4042 py[4] += px0[4];
4043 py[5] += px0[5];
4044 py[6] += px0[6];
4045
4046 k ++;
4047 j = JA[k];
4048 px0 = x+j*7; // &x[j*nb];
4049 py = py0;
4050 py[0] += px0[0];
4051 py[1] += px0[1];
4052 py[2] += px0[2];
4053 py[3] += px0[3];
4054 py[4] += px0[4];
4055 py[5] += px0[5];
4056 py[6] += px0[6];
4057
4058 k ++;
4059 j = JA[k];
4060 px0 = x+j*7; // &x[j*nb];
4061 py = py0;
4062 py[0] += px0[0];
4063 py[1] += px0[1];
4064 py[2] += px0[2];
4065 py[3] += px0[3];
4066 py[4] += px0[4];
4067 py[5] += px0[5];
4068 py[6] += px0[6];
4069
4070 break;
4071
4072 case 6:
4073 k = IA[i];
4074 j = JA[k];
4075 px0 = x+j*7; // &x[j*nb];
4076 py = py0;
4077 py[0] += px0[0];
4078 py[1] += px0[1];
4079 py[2] += px0[2];
4080 py[3] += px0[3];
4081 py[4] += px0[4];
4082 py[5] += px0[5];
4083 py[6] += px0[6];
4084
4085 k ++;
4086 j = JA[k];
4087 px0 = x+j*7; // &x[j*nb];
4088 py = py0;
4089 py[0] += px0[0];
4090 py[1] += px0[1];
4091 py[2] += px0[2];
4092 py[3] += px0[3];
4093 py[4] += px0[4];
4094 py[5] += px0[5];
4095 py[6] += px0[6];
4096
4097 k ++;
4098 j = JA[k];
4099 px0 = x+j*7; // &x[j*nb];
4100 py = py0;
4101 py[0] += px0[0];
4102 py[1] += px0[1];
4103 py[2] += px0[2];
4104 py[3] += px0[3];
4105 py[4] += px0[4];
4106 py[5] += px0[5];
4107 py[6] += px0[6];
4108
4109 k ++;
4110 j = JA[k];
4111 px0 = x+j*7; // &x[j*nb];
4112 py = py0;
4113 py[0] += px0[0];
4114 py[1] += px0[1];
4115 py[2] += px0[2];
4116 py[3] += px0[3];
4117 py[4] += px0[4];
4118 py[5] += px0[5];
4119 py[6] += px0[6];
4120
4121 k ++;
4122 j = JA[k];
4123 px0 = x+j*7; // &x[j*nb];
4124 py = py0;
4125 py[0] += px0[0];
4126 py[1] += px0[1];
4127 py[2] += px0[2];
4128 py[3] += px0[3];
4129 py[4] += px0[4];
4130 py[5] += px0[5];
4131 py[6] += px0[6];
4132
4133 k ++;
4134 j = JA[k];
4135 px0 = x+j*7; // &x[j*nb];
4136 py = py0;
4137 py[0] += px0[0];
4138 py[1] += px0[1];
4139 py[2] += px0[2];
4140 py[3] += px0[3];
4141 py[4] += px0[4];
4142 py[5] += px0[5];
4143 py[6] += px0[6];
4144
4145 break;
4146
4147 case 7:
4148 k = IA[i];
4149 j = JA[k];
4150 px0 = x+j*7; // &x[j*nb];
4151 py = py0;
4152 py[0] += px0[0];
4153 py[1] += px0[1];
4154 py[2] += px0[2];
4155 py[3] += px0[3];
4156 py[4] += px0[4];
4157 py[5] += px0[5];
4158 py[6] += px0[6];
4159
4160 k ++;
4161 j = JA[k];
4162 px0 = x+j*7; // &x[j*nb];
4163 py = py0;
4164 py[0] += px0[0];
4165 py[1] += px0[1];
4166 py[2] += px0[2];
4167 py[3] += px0[3];
4168 py[4] += px0[4];
4169 py[5] += px0[5];
4170 py[6] += px0[6];
4171
4172 k ++;
4173 j = JA[k];
4174 px0 = x+j*7; // &x[j*nb];
4175 py = py0;
4176 py[0] += px0[0];
4177 py[1] += px0[1];
4178 py[2] += px0[2];
4179 py[3] += px0[3];
4180 py[4] += px0[4];
4181 py[5] += px0[5];
4182 py[6] += px0[6];
4183
4184 k ++;
4185 j = JA[k];
4186 px0 = x+j*7; // &x[j*nb];
4187 py = py0;
4188 py[0] += px0[0];
4189 py[1] += px0[1];
4190 py[2] += px0[2];
4191 py[3] += px0[3];
4192 py[4] += px0[4];
4193 py[5] += px0[5];
4194 py[6] += px0[6];
4195
4196 k ++;
4197 j = JA[k];
4198 px0 = x+j*7; // &x[j*nb];
4199 py = py0;
4200 py[0] += px0[0];
4201 py[1] += px0[1];
4202 py[2] += px0[2];
4203 py[3] += px0[3];
4204 py[4] += px0[4];
4205 py[5] += px0[5];
4206 py[6] += px0[6];
4207
4208 k ++;
4209 j = JA[k];
4210 px0 = x+j*7; // &x[j*nb];
4211 py = py0;
4212 py[0] += px0[0];
4213 py[1] += px0[1];
4214 py[2] += px0[2];
4215 py[3] += px0[3];
4216 py[4] += px0[4];
4217 py[5] += px0[5];
4218 py[6] += px0[6];
4219
4220 k ++;
4221 j = JA[k];
4222 px0 = x+j*7; // &x[j*nb];
4223 py = py0;
4224 py[0] += px0[0];
4225 py[1] += px0[1];
4226 py[2] += px0[2];
4227 py[3] += px0[3];
4228 py[4] += px0[4];
4229 py[5] += px0[5];
4230 py[6] += px0[6];
4231
4232 break;
4233
4234 default:
4235 for (k = IA[i]; k < IA[i+1]; ++k) {
4236 j = JA[k];
4237 px0 = x+j*7; // &x[j*nb];
4238 py = py0;
4239 py[0] += px0[0];
4240 py[1] += px0[1];
4241 py[2] += px0[2];
4242 py[3] += px0[3];
4243 py[4] += px0[4];
4244 py[5] += px0[5];
4245 py[6] += px0[6];
4246 }
4247 break;
4248 }
4249 }
4250 }
4251 }
4252 break;
4253
4254 default:
4255 {
4256 if (use_openmp) {
4257#ifdef _OPENMP
4258#pragma omp parallel private(myid, mybegin, myend, i, py0, num_nnz_row, k, j, pA, px0, py)
4259 {
4260 myid = omp_get_thread_num();
4261 fasp_get_start_end(myid, nthreads, ROW, &mybegin, &myend);
4262 for (i=mybegin; i < myend; ++i) {
4263 py0 = &y[i*nb];
4264 num_nnz_row = IA[i+1] - IA[i];
4265 switch(num_nnz_row) {
4266
4267 case 3:
4268 k = IA[i];
4269 j = JA[k];
4270 px0 = x+j*nb; // &x[j*nb];
4271 py = py0;
4272 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4273
4274 k ++;
4275 j = JA[k];
4276 px0 = x+j*nb; // &x[j*nb];
4277 py = py0;
4278 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4279
4280 k ++;
4281 j = JA[k];
4282 px0 = x+j*nb; // &x[j*nb];
4283 py = py0;
4284 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4285
4286 break;
4287
4288 case 4:
4289 k = IA[i];
4290 j = JA[k];
4291 px0 = x+j*nb; // &x[j*nb];
4292 py = py0;
4293 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4294
4295 k ++;
4296 j = JA[k];
4297 px0 = x+j*nb; // &x[j*nb];
4298 py = py0;
4299 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4300
4301 k ++;
4302 j = JA[k];
4303 px0 = x+j*nb; // &x[j*nb];
4304 py = py0;
4305 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4306
4307 k ++;
4308 j = JA[k];
4309 px0 = x+j*nb; // &x[j*nb];
4310 py = py0;
4311 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4312
4313 break;
4314
4315 case 5:
4316 k = IA[i];
4317 j = JA[k];
4318 px0 = x+j*nb; // &x[j*nb];
4319 py = py0;
4320 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4321
4322 k ++;
4323 j = JA[k];
4324 px0 = x+j*nb; // &x[j*nb];
4325 py = py0;
4326 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4327
4328 k ++;
4329 j = JA[k];
4330 px0 = x+j*nb; // &x[j*nb];
4331 py = py0;
4332 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4333
4334 k ++;
4335 j = JA[k];
4336 px0 = x+j*nb; // &x[j*nb];
4337 py = py0;
4338 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4339
4340 k ++;
4341 j = JA[k];
4342 px0 = x+j*nb; // &x[j*nb];
4343 py = py0;
4344 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4345
4346 break;
4347
4348 case 6:
4349 k = IA[i];
4350 j = JA[k];
4351 px0 = x+j*nb; // &x[j*nb];
4352 py = py0;
4353 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4354
4355 k ++;
4356 j = JA[k];
4357 px0 = x+j*nb; // &x[j*nb];
4358 py = py0;
4359 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4360
4361 k ++;
4362 j = JA[k];
4363 px0 = x+j*nb; // &x[j*nb];
4364 py = py0;
4365 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4366
4367 k ++;
4368 j = JA[k];
4369 px0 = x+j*nb; // &x[j*nb];
4370 py = py0;
4371 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4372
4373 k ++;
4374 j = JA[k];
4375 px0 = x+j*nb; // &x[j*nb];
4376 py = py0;
4377 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4378
4379 k ++;
4380 j = JA[k];
4381 px0 = x+j*nb; // &x[j*nb];
4382 py = py0;
4383 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4384
4385 break;
4386
4387 case 7:
4388 k = IA[i];
4389 j = JA[k];
4390 px0 = x+j*nb; // &x[j*nb];
4391 py = py0;
4392 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4393
4394 k ++;
4395 j = JA[k];
4396 px0 = x+j*nb; // &x[j*nb];
4397 py = py0;
4398 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4399
4400 k ++;
4401 j = JA[k];
4402 px0 = x+j*nb; // &x[j*nb];
4403 py = py0;
4404 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4405
4406 k ++;
4407 j = JA[k];
4408 px0 = x+j*nb; // &x[j*nb];
4409 py = py0;
4410 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4411
4412 k ++;
4413 j = JA[k];
4414 px0 = x+j*nb; // &x[j*nb];
4415 py = py0;
4416 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4417
4418 k ++;
4419 j = JA[k];
4420 px0 = x+j*nb; // &x[j*nb];
4421 py = py0;
4422 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4423
4424 k ++;
4425 j = JA[k];
4426 px0 = x+j*nb; // &x[j*nb];
4427 py = py0;
4428 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4429
4430 break;
4431
4432 default:
4433 for (k = IA[i]; k < IA[i+1]; ++k) {
4434 j = JA[k];
4435 px0 = x+j*nb; // &x[j*nb];
4436 py = py0;
4437 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4438 }
4439 break;
4440 }
4441 }
4442 }
4443#endif
4444 }
4445 else {
4446 for (i = 0; i < ROW; ++i) {
4447 py0 = &y[i*nb];
4448 num_nnz_row = IA[i+1] - IA[i];
4449 switch(num_nnz_row) {
4450
4451 case 3:
4452 k = IA[i];
4453 j = JA[k];
4454 px0 = x+j*nb; // &x[j*nb];
4455 py = py0;
4456 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4457
4458 k ++;
4459 j = JA[k];
4460 px0 = x+j*nb; // &x[j*nb];
4461 py = py0;
4462 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4463
4464 k ++;
4465 j = JA[k];
4466 px0 = x+j*nb; // &x[j*nb];
4467 py = py0;
4468 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4469
4470 break;
4471
4472 case 4:
4473 k = IA[i];
4474 j = JA[k];
4475 px0 = x+j*nb; // &x[j*nb];
4476 py = py0;
4477 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4478
4479 k ++;
4480 j = JA[k];
4481 px0 = x+j*nb; // &x[j*nb];
4482 py = py0;
4483 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4484
4485 k ++;
4486 j = JA[k];
4487 px0 = x+j*nb; // &x[j*nb];
4488 py = py0;
4489 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4490
4491 k ++;
4492 j = JA[k];
4493 px0 = x+j*nb; // &x[j*nb];
4494 py = py0;
4495 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4496
4497 break;
4498
4499 case 5:
4500 k = IA[i];
4501 j = JA[k];
4502 px0 = x+j*nb; // &x[j*nb];
4503 py = py0;
4504 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4505
4506 k ++;
4507 j = JA[k];
4508 px0 = x+j*nb; // &x[j*nb];
4509 py = py0;
4510 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4511
4512 k ++;
4513 j = JA[k];
4514 px0 = x+j*nb; // &x[j*nb];
4515 py = py0;
4516 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4517
4518 k ++;
4519 j = JA[k];
4520 px0 = x+j*nb; // &x[j*nb];
4521 py = py0;
4522 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4523
4524 k ++;
4525 j = JA[k];
4526 px0 = x+j*nb; // &x[j*nb];
4527 py = py0;
4528 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4529
4530 break;
4531
4532 case 6:
4533 k = IA[i];
4534 j = JA[k];
4535 px0 = x+j*nb; // &x[j*nb];
4536 py = py0;
4537 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4538
4539 k ++;
4540 j = JA[k];
4541 px0 = x+j*nb; // &x[j*nb];
4542 py = py0;
4543 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4544
4545 k ++;
4546 j = JA[k];
4547 px0 = x+j*nb; // &x[j*nb];
4548 py = py0;
4549 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4550
4551 k ++;
4552 j = JA[k];
4553 px0 = x+j*nb; // &x[j*nb];
4554 py = py0;
4555 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4556
4557 k ++;
4558 j = JA[k];
4559 px0 = x+j*nb; // &x[j*nb];
4560 py = py0;
4561 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4562
4563 k ++;
4564 j = JA[k];
4565 px0 = x+j*nb; // &x[j*nb];
4566 py = py0;
4567 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4568
4569 break;
4570
4571 case 7:
4572 k = IA[i];
4573 j = JA[k];
4574 px0 = x+j*nb; // &x[j*nb];
4575 py = py0;
4576 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4577
4578 k ++;
4579 j = JA[k];
4580 px0 = x+j*nb; // &x[j*nb];
4581 py = py0;
4582 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4583
4584 k ++;
4585 j = JA[k];
4586 px0 = x+j*nb; // &x[j*nb];
4587 py = py0;
4588 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4589
4590 k ++;
4591 j = JA[k];
4592 px0 = x+j*nb; // &x[j*nb];
4593 py = py0;
4594 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4595
4596 k ++;
4597 j = JA[k];
4598 px0 = x+j*nb; // &x[j*nb];
4599 py = py0;
4600 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4601
4602 k ++;
4603 j = JA[k];
4604 px0 = x+j*nb; // &x[j*nb];
4605 py = py0;
4606 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4607
4608 k ++;
4609 j = JA[k];
4610 px0 = x+j*nb; // &x[j*nb];
4611 py = py0;
4612 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4613
4614 break;
4615
4616 default:
4617 for (k = IA[i]; k < IA[i+1]; ++k) {
4618 j = JA[k];
4619 px0 = x+j*nb; // &x[j*nb];
4620 py = py0;
4621 fasp_blas_darray_axpy (nb, 1.0, px0, py);
4622 }
4623 break;
4624 }
4625 }
4626 }
4627 }
4628 break;
4629 }
4630}
4631
4647 const dBSRmat *B,
4648 dBSRmat *C)
4649{
4650
4651 INT i,j,k,l,count;
4652 INT *JD = (INT *)fasp_mem_calloc(B->COL,sizeof(INT));
4653
4654 const INT nb = A->nb;
4655 const INT nb2 = nb*nb;
4656
4657 // check A and B see if there are compatible for multiplication
4658 if ( (A->COL != B->ROW) && (A->nb != B->nb ) ) {
4659 printf("### ERROR: Matrix sizes do not match!\n");
4660 fasp_chkerr(ERROR_MAT_SIZE, __FUNCTION__);
4661 }
4662
4663 C->ROW = A->ROW;
4664 C->COL = B->COL;
4665 C->nb = A->nb;
4667
4668 C->val = NULL;
4669 C->JA = NULL;
4670 C->IA = (INT*)fasp_mem_calloc(C->ROW+1,sizeof(INT));
4671
4672 REAL *temp = (REAL *)fasp_mem_calloc(nb2, sizeof(REAL));
4673
4674 for (i=0;i<B->COL;++i) JD[i]=-1;
4675
4676 // step 1: Find first the structure IA of C
4677 for (i=0;i<C->ROW;++i) {
4678 count=0;
4679
4680 for (k=A->IA[i];k<A->IA[i+1];++k) {
4681 for (j=B->IA[A->JA[k]];j<B->IA[A->JA[k]+1];++j) {
4682 for (l=0;l<count;l++) {
4683 if (JD[l]==B->JA[j]) break;
4684 }
4685
4686 if (l==count) {
4687 JD[count]=B->JA[j];
4688 count++;
4689 }
4690 }
4691 }
4692 C->IA[i+1]=count;
4693 for (j=0;j<count;++j) {
4694 JD[j]=-1;
4695 }
4696 }
4697
4698 for (i=0;i<C->ROW;++i) C->IA[i+1]+=C->IA[i];
4699
4700 // step 2: Find the structure JA of C
4701 INT countJD;
4702
4703 C->JA=(INT*)fasp_mem_calloc(C->IA[C->ROW],sizeof(INT));
4704
4705 for (i=0;i<C->ROW;++i) {
4706 countJD=0;
4707 count=C->IA[i];
4708 for (k=A->IA[i];k<A->IA[i+1];++k) {
4709 for (j=B->IA[A->JA[k]];j<B->IA[A->JA[k]+1];++j) {
4710 for (l=0;l<countJD;l++) {
4711 if (JD[l]==B->JA[j]) break;
4712 }
4713
4714 if (l==countJD) {
4715 C->JA[count]=B->JA[j];
4716 JD[countJD]=B->JA[j];
4717 count++;
4718 countJD++;
4719 }
4720 }
4721 }
4722
4723 //for (j=0;j<countJD;++j) JD[j]=-1;
4724 fasp_iarray_set(countJD, JD, -1);
4725 }
4726
4727 fasp_mem_free(JD); JD = NULL;
4728
4729 // step 3: Find the structure A of C
4730 C->val=(REAL*)fasp_mem_calloc((C->IA[C->ROW])*nb2,sizeof(REAL));
4731
4732 for (i=0;i<C->ROW;++i) {
4733 for (j=C->IA[i];j<C->IA[i+1];++j) {
4734
4735 fasp_darray_set(nb2, C->val+(j*nb2), 0x0);
4736
4737 for (k=A->IA[i];k<A->IA[i+1];++k) {
4738 for (l=B->IA[A->JA[k]];l<B->IA[A->JA[k]+1];l++) {
4739 if (B->JA[l]==C->JA[j]) {
4740 fasp_blas_smat_mul (A->val+(k*nb2), B->val+(l*nb2), temp, nb);
4741 fasp_blas_darray_axpy (nb2, 1.0, temp, C->val+(j*nb2));
4742 } // end if
4743 } // end for l
4744 } // end for k
4745 } // end for j
4746 } // end for i
4747
4748 C->NNZ = C->IA[C->ROW]-C->IA[0];
4749
4750 fasp_mem_free(temp); temp = NULL;
4751
4752}
4753
4772 const dBSRmat *A,
4773 const dBSRmat *P,
4774 dBSRmat *B)
4775{
4776 const INT row=R->ROW, col=P->COL, nb=A->nb, nb2=A->nb*A->nb;
4777
4778 const REAL *rj=R->val, *aj=A->val, *pj=P->val;
4779 const INT *ir=R->IA, *ia=A->IA, *ip=P->IA;
4780 const INT *jr=R->JA, *ja=A->JA, *jp=P->JA;
4781
4782 REAL *acj;
4783 INT *iac, *jac;
4784
4785 INT nB=A->NNZ;
4786 INT i,i1,j,jj,k,length;
4787 INT begin_row,end_row,begin_rowA,end_rowA,begin_rowR,end_rowR;
4788 INT istart,iistart,count;
4789
4790 INT *index=(INT *)fasp_mem_calloc(A->COL,sizeof(INT));
4791
4792 REAL *smat_tmp=(REAL *)fasp_mem_calloc(nb2,sizeof(REAL));
4793
4794 INT *iindex=(INT *)fasp_mem_calloc(col,sizeof(INT));
4795
4796 for (i=0; i<A->COL; ++i) index[i] = -2;
4797
4798 memcpy(iindex,index,col*sizeof(INT));
4799
4800 jac=(INT*)fasp_mem_calloc(nB,sizeof(INT));
4801
4802 iac=(INT*)fasp_mem_calloc(row+1,sizeof(INT));
4803
4804 REAL *temp=(REAL*)fasp_mem_calloc(A->COL*nb2,sizeof(REAL));
4805
4806 iac[0] = 0;
4807
4808 // First loop: form sparsity pattern of R*A*P
4809 for (i=0; i < row; ++i) {
4810 // reset istart and length at the beginning of each loop
4811 istart = -1; length = 0; i1 = i+1;
4812
4813 // go across the rows in R
4814 begin_rowR=ir[i]; end_rowR=ir[i1];
4815 for (jj=begin_rowR; jj<end_rowR; ++jj) {
4816 j = jr[jj];
4817 // for each column in A
4818 begin_rowA=ia[j]; end_rowA=ia[j+1];
4819 for (k=begin_rowA; k<end_rowA; ++k) {
4820 if (index[ja[k]] == -2) {
4821 index[ja[k]] = istart;
4822 istart = ja[k];
4823 ++length;
4824 }
4825 }
4826 }
4827
4828 // book-keeping [resetting length and setting iistart]
4829 count = length; iistart = -1; length = 0;
4830
4831 // use each column that would have resulted from R*A
4832 for (j=0; j < count; ++j) {
4833 jj = istart;
4834 istart = index[istart];
4835 index[jj] = -2;
4836
4837 // go across the row of P
4838 begin_row=ip[jj]; end_row=ip[jj+1];
4839 for (k=begin_row; k<end_row; ++k) {
4840 // pull out the appropriate columns of P
4841 if (iindex[jp[k]] == -2){
4842 iindex[jp[k]] = iistart;
4843 iistart = jp[k];
4844 ++length;
4845 }
4846 } // end for k
4847 } // end for j
4848
4849 // set B->IA
4850 iac[i1]=iac[i]+length;
4851
4852 if (iac[i1]>nB) {
4853 nB=nB*2;
4854 jac=(INT*)fasp_mem_realloc(jac, nB*sizeof(INT));
4855 }
4856
4857 // put the correct columns of p into the column list of the products
4858 begin_row=iac[i]; end_row=iac[i1];
4859 for (j=begin_row; j<end_row; ++j) {
4860 // put the value in B->JA
4861 jac[j] = iistart;
4862 // set istart to the next value
4863 iistart = iindex[iistart];
4864 // set the iindex spot to 0
4865 iindex[jac[j]] = -2;
4866 } // end j
4867 } // end i: First loop
4868
4869 jac=(INT*)fasp_mem_realloc(jac,(iac[row])*sizeof(INT));
4870
4871 acj=(REAL*)fasp_mem_calloc(iac[row]*nb2,sizeof(REAL));
4872
4873 INT *BTindex=(INT*)fasp_mem_calloc(col,sizeof(INT));
4874
4875 // Second loop: compute entries of R*A*P
4876 for (i=0; i<row; ++i) {
4877 i1 = i+1;
4878 // each col of B
4879 begin_row=iac[i]; end_row=iac[i1];
4880 for (j=begin_row; j<end_row; ++j) {
4881 BTindex[jac[j]]=j;
4882 }
4883 // reset istart and length at the beginning of each loop
4884 istart = -1; length = 0;
4885
4886 // go across the rows in R
4887 begin_rowR=ir[i]; end_rowR=ir[i1];
4888 for ( jj=begin_rowR; jj<end_rowR; ++jj ) {
4889 j = jr[jj];
4890 // for each column in A
4891 begin_rowA=ia[j]; end_rowA=ia[j+1];
4892 for (k=begin_rowA; k<end_rowA; ++k) {
4893 if (index[ja[k]] == -2) {
4894 index[ja[k]] = istart;
4895 istart = ja[k];
4896 ++length;
4897 }
4898 fasp_blas_smat_mul(&rj[jj*nb2],&aj[k*nb2],smat_tmp,nb);
4899 //fasp_darray_xpy(nb2,&temp[ja[k]*nb2], smat_tmp );
4900 fasp_blas_darray_axpy (nb2, 1.0, smat_tmp, &temp[ja[k]*nb2]);
4901
4902 //temp[ja[k]]+=rj[jj]*aj[k];
4903 // change to X = X+Y*Z
4904 }
4905 }
4906
4907 // book-keeping [resetting length and setting iistart]
4908 // use each column that would have resulted from R*A
4909 for (j=0; j<length; ++j) {
4910 jj = istart;
4911 istart = index[istart];
4912 index[jj] = -2;
4913
4914 // go across the row of P
4915 begin_row=ip[jj]; end_row=ip[jj+1];
4916 for (k=begin_row; k<end_row; ++k) {
4917 // pull out the appropriate columns of P
4918 //acj[BTindex[jp[k]]]+=temp[jj]*pj[k];
4919 fasp_blas_smat_mul(&temp[jj*nb2],&pj[k*nb2],smat_tmp,nb);
4920 //fasp_darray_xpy(nb2,&acj[BTindex[jp[k]]*nb2], smat_tmp );
4921 fasp_blas_darray_axpy(nb2, 1.0, smat_tmp, &acj[BTindex[jp[k]]*nb2]);
4922
4923 // change to X = X+Y*Z
4924 }
4925 //temp[jj]=0.0; // change to X[nb,nb] = 0;
4926 fasp_darray_set(nb2,&temp[jj*nb2],0.0);
4927 }
4928 } // end for i: Second loop
4929 // setup coarse matrix B
4930 B->ROW=row; B->COL=col;
4931 B->IA=iac; B->JA=jac; B->val=acj;
4932 B->NNZ=B->IA[B->ROW]-B->IA[0];
4933
4934 B->nb=A->nb;
4936
4937 fasp_mem_free(temp); temp = NULL;
4938 fasp_mem_free(index); index = NULL;
4939 fasp_mem_free(iindex); iindex = NULL;
4940 fasp_mem_free(BTindex); BTindex = NULL;
4941 fasp_mem_free(smat_tmp); smat_tmp = NULL;
4942}
4943
4962 const dBSRmat *A,
4963 const dBSRmat *P,
4964 dBSRmat *B)
4965{
4966 const INT row=R->ROW, col=P->COL, nb=A->nb, nb2=A->nb*A->nb;
4967
4968 const REAL *rj=R->val, *aj=A->val, *pj=P->val;
4969 const INT *ir=R->IA, *ia=A->IA, *ip=P->IA;
4970 const INT *jr=R->JA, *ja=A->JA, *jp=P->JA;
4971
4972 REAL *acj;
4973 INT *iac, *jac;
4974
4975 INT *Ps_marker = NULL;
4976 INT *As_marker = NULL;
4977
4978#ifdef _OPENMP
4979 INT *P_marker = NULL;
4980 INT *A_marker = NULL;
4981 REAL *smat_tmp = NULL;
4982#endif
4983
4984 INT i, i1, i2, i3, jj1, jj2, jj3;
4985 INT counter, jj_row_begining;
4986
4987 INT nthreads = 1;
4988
4989#ifdef _OPENMP
4990 INT myid, mybegin, myend, Ctemp;
4991 nthreads = fasp_get_num_threads();
4992#endif
4993
4994 INT n_coarse = row;
4995 INT n_fine = A->ROW;
4996 INT coarse_mul_nthreads = n_coarse * nthreads;
4997 INT fine_mul_nthreads = n_fine * nthreads;
4998 INT coarse_add_nthreads = n_coarse + nthreads;
4999 INT minus_one_length = coarse_mul_nthreads + fine_mul_nthreads;
5000 INT total_calloc = minus_one_length + coarse_add_nthreads + nthreads;
5001
5002 Ps_marker = (INT *)fasp_mem_calloc(total_calloc, sizeof(INT));
5003 As_marker = Ps_marker + coarse_mul_nthreads;
5004
5005 /*------------------------------------------------------*
5006 * First Pass: Determine size of B and set up B_i *
5007 *------------------------------------------------------*/
5008 iac = (INT *)fasp_mem_calloc(n_coarse+1, sizeof(INT));
5009
5010 fasp_iarray_set(minus_one_length, Ps_marker, -1);
5011
5012 REAL *tmp=(REAL *)fasp_mem_calloc(2*nthreads*nb2, sizeof(REAL));
5013
5014#ifdef _OPENMP
5015 INT * RAP_temp = As_marker + fine_mul_nthreads;
5016 INT * part_end = RAP_temp + coarse_add_nthreads;
5017
5018 if (n_coarse > OPENMP_HOLDS) {
5019#pragma omp parallel for private(myid, mybegin, myend, Ctemp, P_marker, A_marker, \
5020counter, i, jj_row_begining, jj1, i1, jj2, i2, jj3, i3)
5021 for (myid = 0; myid < nthreads; myid++) {
5022 fasp_get_start_end(myid, nthreads, n_coarse, &mybegin, &myend);
5023 P_marker = Ps_marker + myid * n_coarse;
5024 A_marker = As_marker + myid * n_fine;
5025 counter = 0;
5026 for (i = mybegin; i < myend; ++i) {
5027 P_marker[i] = counter;
5028 jj_row_begining = counter;
5029 counter ++;
5030 for (jj1 = ir[i]; jj1 < ir[i+1]; ++jj1) {
5031 i1 = jr[jj1];
5032 for (jj2 = ia[i1]; jj2 < ia[i1+1]; ++jj2) {
5033 i2 = ja[jj2];
5034 if (A_marker[i2] != i) {
5035 A_marker[i2] = i;
5036 for (jj3 = ip[i2]; jj3 < ip[i2+1]; ++jj3) {
5037 i3 = jp[jj3];
5038 if (P_marker[i3] < jj_row_begining) {
5039 P_marker[i3] = counter;
5040 counter ++;
5041 }
5042 }
5043 }
5044 }
5045 }
5046 RAP_temp[i+myid] = jj_row_begining;
5047 }
5048 RAP_temp[myend+myid] = counter;
5049 part_end[myid] = myend + myid + 1;
5050 }
5051 fasp_iarray_cp(part_end[0], RAP_temp, iac);
5052 counter = part_end[0];
5053 Ctemp = 0;
5054 for (i1 = 1; i1 < nthreads; i1 ++) {
5055 Ctemp += RAP_temp[part_end[i1-1]-1];
5056 for (jj1 = part_end[i1-1]+1; jj1 < part_end[i1]; jj1 ++) {
5057 iac[counter] = RAP_temp[jj1] + Ctemp;
5058 counter ++;
5059 }
5060 }
5061 }
5062 else {
5063#endif
5064 counter = 0;
5065 for (i = 0; i < row; ++ i) {
5066 Ps_marker[i] = counter;
5067 jj_row_begining = counter;
5068 counter ++;
5069
5070 for (jj1 = ir[i]; jj1 < ir[i+1]; ++jj1) {
5071 i1 = jr[jj1];
5072 for (jj2 = ia[i1]; jj2 < ia[i1+1]; ++jj2) {
5073 i2 = ja[jj2];
5074 if (As_marker[i2] != i) {
5075 As_marker[i2] = i;
5076 for (jj3 = ip[i2]; jj3 < ip[i2+1]; ++jj3) {
5077 i3 = jp[jj3];
5078 if (Ps_marker[i3] < jj_row_begining) {
5079 Ps_marker[i3] = counter;
5080 counter ++;
5081 }
5082 }
5083 }
5084 }
5085 }
5086 iac[i] = jj_row_begining;
5087 }
5088#ifdef _OPENMP
5089 }
5090#endif
5091
5092 iac[row] = counter;
5093
5094 jac=(INT*)fasp_mem_calloc(iac[row], sizeof(INT));
5095
5096 acj=(REAL*)fasp_mem_calloc(iac[row]*nb2, sizeof(REAL));
5097
5098 fasp_iarray_set(minus_one_length, Ps_marker, -1);
5099
5100 /*------------------------------------------------------*
5101 * Second Pass: compute entries of B=R*A*P *
5102 *------------------------------------------------------*/
5103#ifdef _OPENMP
5104 if (n_coarse > OPENMP_HOLDS) {
5105#pragma omp parallel for private(myid, mybegin, myend, Ctemp, P_marker, A_marker, \
5106counter, i, jj_row_begining, jj1, i1, jj2, i2, \
5107jj3, i3, smat_tmp)
5108 for (myid = 0; myid < nthreads; myid++) {
5109 fasp_get_start_end(myid, nthreads, n_coarse, &mybegin, &myend);
5110 P_marker = Ps_marker + myid * n_coarse;
5111 A_marker = As_marker + myid * n_fine;
5112 smat_tmp = tmp + myid*2*nb2;
5113 counter = iac[mybegin];
5114 for (i = mybegin; i < myend; ++i) {
5115 P_marker[i] = counter;
5116 jj_row_begining = counter;
5117 jac[counter] = i;
5118 fasp_darray_set(nb2, &acj[counter*nb2], 0x0);
5119 counter ++;
5120
5121 for (jj1 = ir[i]; jj1 < ir[i+1]; ++jj1) {
5122 i1 = jr[jj1];
5123 for (jj2 = ia[i1]; jj2 < ia[i1+1]; ++jj2) {
5124 fasp_blas_smat_mul(&rj[jj1*nb2],&aj[jj2*nb2], smat_tmp, nb);
5125 i2 = ja[jj2];
5126 if (A_marker[i2] != i) {
5127 A_marker[i2] = i;
5128 for (jj3 = ip[i2]; jj3 < ip[i2+1]; ++jj3) {
5129 i3 = jp[jj3];
5130 fasp_blas_smat_mul(smat_tmp, &pj[jj3*nb2], smat_tmp+nb2, nb);
5131 if (P_marker[i3] < jj_row_begining) {
5132 P_marker[i3] = counter;
5133 fasp_darray_cp(nb2, smat_tmp+nb2, &acj[counter*nb2]);
5134 jac[counter] = i3;
5135 counter ++;
5136 }
5137 else {
5138 fasp_blas_darray_axpy(nb2, 1.0, smat_tmp+nb2,
5139 &acj[P_marker[i3]*nb2]);
5140 }
5141 }
5142 }
5143 else {
5144 for (jj3 = ip[i2]; jj3 < ip[i2+1]; jj3 ++) {
5145 i3 = jp[jj3];
5146 fasp_blas_smat_mul(smat_tmp, &pj[jj3*nb2], smat_tmp+nb2, nb);
5147 fasp_blas_darray_axpy(nb2, 1.0, smat_tmp+nb2,
5148 &acj[P_marker[i3]*nb2]);
5149 }
5150 }
5151 }
5152 }
5153 }
5154 }
5155 }
5156 else {
5157#endif
5158 counter = 0;
5159 for (i = 0; i < row; ++i) {
5160 Ps_marker[i] = counter;
5161 jj_row_begining = counter;
5162 jac[counter] = i;
5163 fasp_darray_set(nb2, &acj[counter*nb2], 0x0);
5164 counter ++;
5165
5166 for (jj1 = ir[i]; jj1 < ir[i+1]; ++jj1) {
5167 i1 = jr[jj1];
5168 for (jj2 = ia[i1]; jj2 < ia[i1+1]; ++jj2) {
5169 fasp_blas_smat_mul(&rj[jj1*nb2],&aj[jj2*nb2], tmp, nb);
5170 i2 = ja[jj2];
5171 if (As_marker[i2] != i) {
5172 As_marker[i2] = i;
5173 for (jj3 = ip[i2]; jj3 < ip[i2+1]; ++jj3) {
5174 i3 = jp[jj3];
5175 fasp_blas_smat_mul(tmp, &pj[jj3*nb2], tmp+nb2, nb);
5176 if (Ps_marker[i3] < jj_row_begining) {
5177 Ps_marker[i3] = counter;
5178 fasp_darray_cp(nb2, tmp+nb2, &acj[counter*nb2]);
5179 jac[counter] = i3;
5180 counter ++;
5181 }
5182 else {
5183 fasp_blas_darray_axpy(nb2, 1.0, tmp+nb2,
5184 &acj[Ps_marker[i3]*nb2]);
5185 }
5186 }
5187 }
5188 else {
5189 for (jj3 = ip[i2]; jj3 < ip[i2+1]; jj3 ++) {
5190 i3 = jp[jj3];
5191 fasp_blas_smat_mul(tmp, &pj[jj3*nb2], tmp+nb2, nb);
5192 fasp_blas_darray_axpy(nb2, 1.0, tmp+nb2, &acj[Ps_marker[i3]*nb2]);
5193 }
5194 }
5195 }
5196 }
5197 }
5198#ifdef _OPENMP
5199 }
5200#endif
5201 // setup coarse matrix B
5202 B->ROW=row; B->COL=col;
5203 B->IA=iac; B->JA=jac; B->val=acj;
5204 B->NNZ=B->IA[B->ROW]-B->IA[0];
5205 B->nb=A->nb;
5207
5208 fasp_mem_free(Ps_marker); Ps_marker = NULL;
5209 fasp_mem_free(tmp); tmp = NULL;
5210}
5211
5228 const dBSRmat *A,
5229 const dBSRmat *P,
5230 dBSRmat *B)
5231{
5232 const INT row=R->ROW, col=P->COL, nb2=A->nb*A->nb;
5233
5234 const REAL *aj=A->val;
5235 const INT *ir=R->IA, *ia=A->IA, *ip=P->IA;
5236 const INT *jr=R->JA, *ja=A->JA, *jp=P->JA;
5237
5238 INT *iac, *jac;
5239 REAL *acj;
5240 INT *Ps_marker = NULL;
5241 INT *As_marker = NULL;
5242
5243#ifdef _OPENMP
5244 INT *P_marker = NULL;
5245 INT *A_marker = NULL;
5246#endif
5247
5248 INT i, i1, i2, i3, jj1, jj2, jj3;
5249 INT counter, jj_row_begining;
5250
5251 INT nthreads = 1;
5252
5253#ifdef _OPENMP
5254 INT myid, mybegin, myend, Ctemp;
5255 nthreads = fasp_get_num_threads();
5256#endif
5257
5258 INT n_coarse = row;
5259 INT n_fine = A->ROW;
5260 INT coarse_mul_nthreads = n_coarse * nthreads;
5261 INT fine_mul_nthreads = n_fine * nthreads;
5262 INT coarse_add_nthreads = n_coarse + nthreads;
5263 INT minus_one_length = coarse_mul_nthreads + fine_mul_nthreads;
5264 INT total_calloc = minus_one_length + coarse_add_nthreads + nthreads;
5265
5266 Ps_marker = (INT *)fasp_mem_calloc(total_calloc, sizeof(INT));
5267 As_marker = Ps_marker + coarse_mul_nthreads;
5268
5269 /*------------------------------------------------------*
5270 * First Pass: Determine size of B and set up B_i *
5271 *------------------------------------------------------*/
5272 iac = (INT *)fasp_mem_calloc(n_coarse+1, sizeof(INT));
5273
5274 fasp_iarray_set(minus_one_length, Ps_marker, -1);
5275
5276#ifdef _OPENMP
5277 INT * RAP_temp = As_marker + fine_mul_nthreads;
5278 INT * part_end = RAP_temp + coarse_add_nthreads;
5279
5280 if (n_coarse > OPENMP_HOLDS) {
5281#pragma omp parallel for private(myid, mybegin, myend, Ctemp, P_marker, A_marker, \
5282counter, i, jj_row_begining, jj1, i1, jj2, i2, jj3, i3)
5283 for (myid = 0; myid < nthreads; myid++) {
5284 fasp_get_start_end(myid, nthreads, n_coarse, &mybegin, &myend);
5285 P_marker = Ps_marker + myid * n_coarse;
5286 A_marker = As_marker + myid * n_fine;
5287 counter = 0;
5288 for (i = mybegin; i < myend; ++i) {
5289 P_marker[i] = counter;
5290 jj_row_begining = counter;
5291 counter ++;
5292 for (jj1 = ir[i]; jj1 < ir[i+1]; ++jj1) {
5293 i1 = jr[jj1];
5294 for (jj2 = ia[i1]; jj2 < ia[i1+1]; ++jj2) {
5295 i2 = ja[jj2];
5296 if (A_marker[i2] != i) {
5297 A_marker[i2] = i;
5298 for (jj3 = ip[i2]; jj3 < ip[i2+1]; ++jj3) {
5299 i3 = jp[jj3];
5300 if (P_marker[i3] < jj_row_begining) {
5301 P_marker[i3] = counter;
5302 counter ++;
5303 }
5304 }
5305 }
5306 }
5307 }
5308 RAP_temp[i+myid] = jj_row_begining;
5309 }
5310 RAP_temp[myend+myid] = counter;
5311 part_end[myid] = myend + myid + 1;
5312 }
5313 fasp_iarray_cp(part_end[0], RAP_temp, iac);
5314 counter = part_end[0];
5315 Ctemp = 0;
5316 for (i1 = 1; i1 < nthreads; i1 ++) {
5317 Ctemp += RAP_temp[part_end[i1-1]-1];
5318 for (jj1 = part_end[i1-1]+1; jj1 < part_end[i1]; jj1 ++) {
5319 iac[counter] = RAP_temp[jj1] + Ctemp;
5320 counter ++;
5321 }
5322 }
5323 }
5324 else {
5325#endif
5326 counter = 0;
5327 for (i = 0; i < row; ++ i) {
5328 Ps_marker[i] = counter;
5329 jj_row_begining = counter;
5330 counter ++;
5331
5332 for (jj1 = ir[i]; jj1 < ir[i+1]; ++jj1) {
5333 i1 = jr[jj1];
5334 for (jj2 = ia[i1]; jj2 < ia[i1+1]; ++jj2) {
5335 i2 = ja[jj2];
5336 if (As_marker[i2] != i) {
5337 As_marker[i2] = i;
5338 for (jj3 = ip[i2]; jj3 < ip[i2+1]; ++jj3) {
5339 i3 = jp[jj3];
5340 if (Ps_marker[i3] < jj_row_begining) {
5341 Ps_marker[i3] = counter;
5342 counter ++;
5343 }
5344 }
5345 }
5346 }
5347 }
5348 iac[i] = jj_row_begining;
5349 }
5350#ifdef _OPENMP
5351 }
5352#endif
5353
5354 iac[row] = counter;
5355
5356 jac=(INT*)fasp_mem_calloc(iac[row], sizeof(INT));
5357
5358 acj=(REAL*)fasp_mem_calloc(iac[row]*nb2, sizeof(REAL));
5359
5360 fasp_iarray_set(minus_one_length, Ps_marker, -1);
5361
5362 /*------------------------------------------------------*
5363 * Second Pass: compute entries of B=R*A*P *
5364 *------------------------------------------------------*/
5365#ifdef _OPENMP
5366 if (n_coarse > OPENMP_HOLDS) {
5367#pragma omp parallel for private(myid, mybegin, myend, Ctemp, P_marker, A_marker, \
5368counter, i, jj_row_begining, jj1, i1, jj2, i2, jj3, i3)
5369 for (myid = 0; myid < nthreads; myid++) {
5370 fasp_get_start_end(myid, nthreads, n_coarse, &mybegin, &myend);
5371 P_marker = Ps_marker + myid * n_coarse;
5372 A_marker = As_marker + myid * n_fine;
5373 counter = iac[mybegin];
5374 for (i = mybegin; i < myend; ++i) {
5375 P_marker[i] = counter;
5376 jj_row_begining = counter;
5377 jac[counter] = i;
5378 fasp_darray_set(nb2, &acj[counter*nb2], 0x0);
5379 counter ++;
5380
5381 for (jj1 = ir[i]; jj1 < ir[i+1]; ++jj1) {
5382 i1 = jr[jj1];
5383 for (jj2 = ia[i1]; jj2 < ia[i1+1]; ++jj2) {
5384
5385 i2 = ja[jj2];
5386 if (A_marker[i2] != i) {
5387 A_marker[i2] = i;
5388 for (jj3 = ip[i2]; jj3 < ip[i2+1]; ++jj3) {
5389 i3 = jp[jj3];
5390
5391 if (P_marker[i3] < jj_row_begining) {
5392 P_marker[i3] = counter;
5393 fasp_darray_cp(nb2, &aj[jj2*nb2], &acj[counter*nb2]);
5394 jac[counter] = i3;
5395 counter ++;
5396 }
5397 else {
5398 fasp_blas_darray_axpy(nb2, 1.0, &aj[jj2*nb2],
5399 &acj[P_marker[i3]*nb2]);
5400 }
5401 }
5402 }
5403 else {
5404 for (jj3 = ip[i2]; jj3 < ip[i2+1]; jj3 ++) {
5405 i3 = jp[jj3];
5406 fasp_blas_darray_axpy(nb2, 1.0, &aj[jj2*nb2],
5407 &acj[P_marker[i3]*nb2]);
5408 }
5409 }
5410 }
5411 }
5412 }
5413 }
5414 }
5415 else {
5416#endif
5417 counter = 0;
5418 for (i = 0; i < row; ++i) {
5419 Ps_marker[i] = counter;
5420 jj_row_begining = counter;
5421 jac[counter] = i;
5422 fasp_darray_set(nb2, &acj[counter*nb2], 0x0);
5423 counter ++;
5424
5425 for (jj1 = ir[i]; jj1 < ir[i+1]; ++jj1) {
5426 i1 = jr[jj1];
5427 for (jj2 = ia[i1]; jj2 < ia[i1+1]; ++jj2) {
5428
5429 i2 = ja[jj2];
5430 if (As_marker[i2] != i) {
5431 As_marker[i2] = i;
5432 for (jj3 = ip[i2]; jj3 < ip[i2+1]; ++jj3) {
5433 i3 = jp[jj3];
5434 if (Ps_marker[i3] < jj_row_begining) {
5435 Ps_marker[i3] = counter;
5436 fasp_darray_cp(nb2, &aj[jj2*nb2], &acj[counter*nb2]);
5437 jac[counter] = i3;
5438 counter ++;
5439 }
5440 else {
5441 fasp_blas_darray_axpy(nb2, 1.0, &aj[jj2*nb2],
5442 &acj[Ps_marker[i3]*nb2]);
5443 }
5444 }
5445 }
5446 else {
5447 for (jj3 = ip[i2]; jj3 < ip[i2+1]; jj3 ++) {
5448 i3 = jp[jj3];
5449 fasp_blas_darray_axpy(nb2, 1.0, &aj[jj2*nb2],
5450 &acj[Ps_marker[i3]*nb2]);
5451 }
5452 }
5453 }
5454 }
5455 }
5456#ifdef _OPENMP
5457 }
5458#endif
5459
5460 // setup coarse matrix B
5461 B->ROW=row; B->COL=col;
5462 B->IA=iac; B->JA=jac; B->val=acj;
5463 B->NNZ=B->IA[B->ROW]-B->IA[0];
5464 B->nb=A->nb;
5466
5467 fasp_mem_free(Ps_marker); Ps_marker = NULL;
5468}
5469
5470/*---------------------------------*/
5471/*-- End of File --*/
5472/*---------------------------------*/
void fasp_darray_set(const INT n, REAL *x, const REAL val)
Set initial value for an array to be x=val.
Definition: AuxArray.c:41
void fasp_iarray_set(const INT n, INT *x, const INT val)
Set initial value for an array to be x=val.
Definition: AuxArray.c:103
void fasp_darray_cp(const INT n, const REAL *x, REAL *y)
Copy an array to the other y=x.
Definition: AuxArray.c:164
void fasp_iarray_cp(const INT n, const INT *x, INT *y)
Copy an array to the other y=x.
Definition: AuxArray.c:184
void fasp_mem_free(void *mem)
Free up previous allocated memory body and set pointer to NULL.
Definition: AuxMemory.c:155
void * fasp_mem_realloc(void *oldmem, const LONGLONG tsize)
Reallocate, initiate, and check memory.
Definition: AuxMemory.c:114
void * fasp_mem_calloc(const unsigned int size, const unsigned int type)
Allocate, initiate, and check memory.
Definition: AuxMemory.c:65
void fasp_chkerr(const SHORT status, const char *fctname)
Check error status and print out error messages before quit.
Definition: AuxMessage.c:213
void fasp_get_start_end(const INT procid, const INT nprocs, const INT n, INT *start, INT *end)
Assign Load to each thread.
Definition: AuxThreads.c:92
void fasp_blas_darray_ax(const INT n, const REAL a, REAL *x)
x = a*x
Definition: BlaArray.c:43
void fasp_blas_darray_axpy(const INT n, const REAL a, const REAL *x, REAL *y)
y = a*x + y
Definition: BlaArray.c:93
void fasp_blas_smat_ypAx(const REAL *A, const REAL *x, REAL *y, const INT n)
Compute y := y + Ax, where 'A' is a n*n dense matrix.
Definition: BlaSmallMat.c:720
void fasp_blas_smat_ypAx_nc3(const REAL *A, const REAL *x, REAL *y)
Compute y := y + Ax, where 'A' is a 3*3 dense matrix.
Definition: BlaSmallMat.c:613
void fasp_blas_smat_ypAx_nc2(const REAL *A, const REAL *x, REAL *y)
Compute y := y + Ax, where 'A' is a 2*2 dense matrix.
Definition: BlaSmallMat.c:589
void fasp_blas_smat_mul(const REAL *a, const REAL *b, REAL *c, const INT n)
Compute the matrix product of two small full matrices a and b, stored in c.
Definition: BlaSmallMat.c:540
void fasp_blas_smat_ypAx_nc5(const REAL *A, const REAL *x, REAL *y)
Compute y := y + Ax, where 'A' is a 5*5 dense matrix.
Definition: BlaSmallMat.c:662
void fasp_blas_smat_ypAx_nc7(const REAL *A, const REAL *x, REAL *y)
Compute y := y + Ax, where 'A' is a 7*7 dense matrix.
Definition: BlaSmallMat.c:688
void fasp_blas_dbsr_rap1(const dBSRmat *R, const dBSRmat *A, const dBSRmat *P, dBSRmat *B)
dBSRmat sparse matrix multiplication B=R*A*P
Definition: BlaSpmvBSR.c:4771
void fasp_blas_dbsr_axm(dBSRmat *A, const REAL alpha)
Multiply a sparse matrix A in BSR format by a scalar alpha.
Definition: BlaSpmvBSR.c:38
void fasp_blas_dbsr_mxm(const dBSRmat *A, const dBSRmat *B, dBSRmat *C)
Sparse matrix multiplication C=A*B.
Definition: BlaSpmvBSR.c:4646
void fasp_blas_dbsr_rap_agg(const dBSRmat *R, const dBSRmat *A, const dBSRmat *P, dBSRmat *B)
dBSRmat sparse matrix multiplication B=R*A*P, where small block matrices in P and R are identity matr...
Definition: BlaSpmvBSR.c:5227
void fasp_blas_dbsr_aAxpby(const REAL alpha, dBSRmat *A, REAL *x, const REAL beta, REAL *y)
Compute y := alpha*A*x + beta*y.
Definition: BlaSpmvBSR.c:67
void fasp_blas_dbsr_mxv_agg(const dBSRmat *A, const REAL *x, REAL *y)
Compute y := A*x, where each small block matrices of A is an identity.
Definition: BlaSpmvBSR.c:2697
void fasp_blas_dbsr_aAxpy(const REAL alpha, const dBSRmat *A, const REAL *x, REAL *y)
Compute y := alpha*A*x + y.
Definition: BlaSpmvBSR.c:348
void fasp_blas_dbsr_mxv(const dBSRmat *A, const REAL *x, REAL *y)
Compute y := A*x.
Definition: BlaSpmvBSR.c:910
void fasp_blas_dbsr_aAxpy_agg(const REAL alpha, const dBSRmat *A, const REAL *x, REAL *y)
Compute y := alpha*A*x + y where each small block matrix is an identity matrix.
Definition: BlaSpmvBSR.c:624
void fasp_blas_dbsr_rap(const dBSRmat *R, const dBSRmat *A, const dBSRmat *P, dBSRmat *B)
dBSRmat sparse matrix multiplication B=R*A*P
Definition: BlaSpmvBSR.c:4961
Main header file for the FASP project.
#define REAL
Definition: fasp.h:67
#define SHORT
FASP integer and floating point numbers.
Definition: fasp.h:63
#define INT
Definition: fasp.h:64
#define OPENMP_HOLDS
Definition: fasp_const.h:260
#define TRUE
Definition of logic type.
Definition: fasp_const.h:61
#define FALSE
Definition: fasp_const.h:62
#define ERROR_MAT_SIZE
Definition: fasp_const.h:26
Block sparse row storage matrix of REAL type.
Definition: fasp_block.h:34
INT COL
number of cols of sub-blocks in matrix A, N
Definition: fasp_block.h:40
INT NNZ
number of nonzero sub-blocks in matrix A, NNZ
Definition: fasp_block.h:43
REAL * val
Definition: fasp_block.h:57
INT nb
dimension of each sub-block
Definition: fasp_block.h:46
INT * IA
integer array of row pointers, the size is ROW+1
Definition: fasp_block.h:60
INT ROW
number of rows of sub-blocks in matrix A, M
Definition: fasp_block.h:37
INT * JA
Definition: fasp_block.h:64
INT storage_manner
storage manner for each sub-block
Definition: fasp_block.h:49