Actual source code: baij2.c
petsc-3.8.4 2018-03-24
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <petsc/private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <petscblaslapack.h>
7: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
8: {
9: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
11: PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival;
12: const PetscInt *idx;
13: PetscInt start,end,*ai,*aj,bs,*nidx2;
14: PetscBT table;
17: m = a->mbs;
18: ai = a->i;
19: aj = a->j;
20: bs = A->rmap->bs;
22: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24: PetscBTCreate(m,&table);
25: PetscMalloc1(m+1,&nidx);
26: PetscMalloc1(A->rmap->N+1,&nidx2);
28: for (i=0; i<is_max; i++) {
29: /* Initialise the two local arrays */
30: isz = 0;
31: PetscBTMemzero(m,table);
33: /* Extract the indices, assume there can be duplicate entries */
34: ISGetIndices(is[i],&idx);
35: ISGetLocalSize(is[i],&n);
37: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
38: for (j=0; j<n; ++j) {
39: ival = idx[j]/bs; /* convert the indices into block indices */
40: if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
41: if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
42: }
43: ISRestoreIndices(is[i],&idx);
44: ISDestroy(&is[i]);
46: k = 0;
47: for (j=0; j<ov; j++) { /* for each overlap*/
48: n = isz;
49: for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */
50: row = nidx[k];
51: start = ai[row];
52: end = ai[row+1];
53: for (l = start; l<end; l++) {
54: val = aj[l];
55: if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
56: }
57: }
58: }
59: /* expand the Index Set */
60: for (j=0; j<isz; j++) {
61: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
62: }
63: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
64: }
65: PetscBTDestroy(&table);
66: PetscFree(nidx);
67: PetscFree(nidx2);
68: return(0);
69: }
71: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
72: {
73: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
75: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
76: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
77: const PetscInt *irow,*icol;
78: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
79: PetscInt *aj = a->j,*ai = a->i;
80: MatScalar *mat_a;
81: Mat C;
82: PetscBool flag;
85: ISGetIndices(isrow,&irow);
86: ISGetIndices(iscol,&icol);
87: ISGetLocalSize(isrow,&nrows);
88: ISGetLocalSize(iscol,&ncols);
90: PetscCalloc1(1+oldcols,&smap);
91: ssmap = smap;
92: PetscMalloc1(1+nrows,&lens);
93: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
94: /* determine lens of each row */
95: for (i=0; i<nrows; i++) {
96: kstart = ai[irow[i]];
97: kend = kstart + a->ilen[irow[i]];
98: lens[i] = 0;
99: for (k=kstart; k<kend; k++) {
100: if (ssmap[aj[k]]) lens[i]++;
101: }
102: }
103: /* Create and fill new matrix */
104: if (scall == MAT_REUSE_MATRIX) {
105: c = (Mat_SeqBAIJ*)((*B)->data);
107: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
108: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
109: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
110: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
111: C = *B;
112: } else {
113: MatCreate(PetscObjectComm((PetscObject)A),&C);
114: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
115: MatSetType(C,((PetscObject)A)->type_name);
116: MatSeqBAIJSetPreallocation(C,bs,0,lens);
117: }
118: c = (Mat_SeqBAIJ*)(C->data);
119: for (i=0; i<nrows; i++) {
120: row = irow[i];
121: kstart = ai[row];
122: kend = kstart + a->ilen[row];
123: mat_i = c->i[i];
124: mat_j = c->j + mat_i;
125: mat_a = c->a + mat_i*bs2;
126: mat_ilen = c->ilen + i;
127: for (k=kstart; k<kend; k++) {
128: if ((tcol=ssmap[a->j[k]])) {
129: *mat_j++ = tcol - 1;
130: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
131: mat_a += bs2;
132: (*mat_ilen)++;
133: }
134: }
135: }
136: /* sort */
137: {
138: MatScalar *work;
139: PetscMalloc1(bs2,&work);
140: for (i=0; i<nrows; i++) {
141: PetscInt ilen;
142: mat_i = c->i[i];
143: mat_j = c->j + mat_i;
144: mat_a = c->a + mat_i*bs2;
145: ilen = c->ilen[i];
146: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
147: }
148: PetscFree(work);
149: }
151: /* Free work space */
152: ISRestoreIndices(iscol,&icol);
153: PetscFree(smap);
154: PetscFree(lens);
155: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
156: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
158: ISRestoreIndices(isrow,&irow);
159: *B = C;
160: return(0);
161: }
163: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
164: {
165: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
166: IS is1,is2;
168: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
169: const PetscInt *irow,*icol;
172: ISGetIndices(isrow,&irow);
173: ISGetIndices(iscol,&icol);
174: ISGetLocalSize(isrow,&nrows);
175: ISGetLocalSize(iscol,&ncols);
177: /* Verify if the indices corespond to each element in a block
178: and form the IS with compressed IS */
179: maxmnbs = PetscMax(a->mbs,a->nbs);
180: PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
181: PetscMemzero(vary,a->mbs*sizeof(PetscInt));
182: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
183: for (i=0; i<a->mbs; i++) {
184: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
185: }
186: count = 0;
187: for (i=0; i<nrows; i++) {
188: j = irow[i] / bs;
189: if ((vary[j]--)==bs) iary[count++] = j;
190: }
191: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
193: PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));
194: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
195: for (i=0; i<a->nbs; i++) {
196: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
197: }
198: count = 0;
199: for (i=0; i<ncols; i++) {
200: j = icol[i] / bs;
201: if ((vary[j]--)==bs) iary[count++] = j;
202: }
203: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
204: ISRestoreIndices(isrow,&irow);
205: ISRestoreIndices(iscol,&icol);
206: PetscFree2(vary,iary);
208: MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
209: ISDestroy(&is1);
210: ISDestroy(&is2);
211: return(0);
212: }
214: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
215: {
217: Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data;
218: Mat_SubSppt *submatj = c->submatis1;
221: submatj->destroy(C);
222: MatDestroySubMatrix_Private(submatj);
223: return(0);
224: }
226: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
227: {
229: PetscInt i;
230: Mat C;
231: Mat_SeqBAIJ *c;
232: Mat_SubSppt *submatj;
235: for (i=0; i<n; i++) {
236: C = (*mat)[i];
237: c = (Mat_SeqBAIJ*)C->data;
238: submatj = c->submatis1;
239: if (submatj) {
240: submatj->destroy(C);
241: MatDestroySubMatrix_Private(submatj);
242: PetscLayoutDestroy(&C->rmap);
243: PetscLayoutDestroy(&C->cmap);
244: PetscHeaderDestroy(&C);
245: } else {
246: MatDestroy(&C);
247: }
248: }
249: PetscFree(*mat);
250: return(0);
251: }
253: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
254: {
256: PetscInt i;
259: if (scall == MAT_INITIAL_MATRIX) {
260: PetscCalloc1(n+1,B);
261: }
263: for (i=0; i<n; i++) {
264: MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
265: }
266: return(0);
267: }
269: /* -------------------------------------------------------*/
270: /* Should check that shapes of vectors and matrices match */
271: /* -------------------------------------------------------*/
273: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
274: {
275: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
276: PetscScalar *z,sum;
277: const PetscScalar *x;
278: const MatScalar *v;
279: PetscErrorCode ierr;
280: PetscInt mbs,i,n;
281: const PetscInt *idx,*ii,*ridx=NULL;
282: PetscBool usecprow=a->compressedrow.use;
285: VecGetArrayRead(xx,&x);
286: VecGetArray(zz,&z);
288: if (usecprow) {
289: mbs = a->compressedrow.nrows;
290: ii = a->compressedrow.i;
291: ridx = a->compressedrow.rindex;
292: PetscMemzero(z,a->mbs*sizeof(PetscScalar));
293: } else {
294: mbs = a->mbs;
295: ii = a->i;
296: }
298: for (i=0; i<mbs; i++) {
299: n = ii[1] - ii[0];
300: v = a->a + ii[0];
301: idx = a->j + ii[0];
302: ii++;
303: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
304: PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
305: sum = 0.0;
306: PetscSparseDensePlusDot(sum,x,v,idx,n);
307: if (usecprow) {
308: z[ridx[i]] = sum;
309: } else {
310: z[i] = sum;
311: }
312: }
313: VecRestoreArrayRead(xx,&x);
314: VecRestoreArray(zz,&z);
315: PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
316: return(0);
317: }
319: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
320: {
321: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
322: PetscScalar *z = 0,sum1,sum2,*zarray;
323: const PetscScalar *x,*xb;
324: PetscScalar x1,x2;
325: const MatScalar *v;
326: PetscErrorCode ierr;
327: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
328: PetscBool usecprow=a->compressedrow.use;
331: VecGetArrayRead(xx,&x);
332: VecGetArray(zz,&zarray);
334: idx = a->j;
335: v = a->a;
336: if (usecprow) {
337: mbs = a->compressedrow.nrows;
338: ii = a->compressedrow.i;
339: ridx = a->compressedrow.rindex;
340: PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));
341: } else {
342: mbs = a->mbs;
343: ii = a->i;
344: z = zarray;
345: }
347: for (i=0; i<mbs; i++) {
348: n = ii[1] - ii[0]; ii++;
349: sum1 = 0.0; sum2 = 0.0;
350: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
351: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
352: for (j=0; j<n; j++) {
353: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
354: sum1 += v[0]*x1 + v[2]*x2;
355: sum2 += v[1]*x1 + v[3]*x2;
356: v += 4;
357: }
358: if (usecprow) z = zarray + 2*ridx[i];
359: z[0] = sum1; z[1] = sum2;
360: if (!usecprow) z += 2;
361: }
362: VecRestoreArrayRead(xx,&x);
363: VecRestoreArray(zz,&zarray);
364: PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
365: return(0);
366: }
368: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
369: {
370: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
371: PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
372: const PetscScalar *x,*xb;
373: const MatScalar *v;
374: PetscErrorCode ierr;
375: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
376: PetscBool usecprow=a->compressedrow.use;
378: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379: #pragma disjoint(*v,*z,*xb)
380: #endif
383: VecGetArrayRead(xx,&x);
384: VecGetArray(zz,&zarray);
386: idx = a->j;
387: v = a->a;
388: if (usecprow) {
389: mbs = a->compressedrow.nrows;
390: ii = a->compressedrow.i;
391: ridx = a->compressedrow.rindex;
392: PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));
393: } else {
394: mbs = a->mbs;
395: ii = a->i;
396: z = zarray;
397: }
399: for (i=0; i<mbs; i++) {
400: n = ii[1] - ii[0]; ii++;
401: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
402: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
403: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
404: for (j=0; j<n; j++) {
405: xb = x + 3*(*idx++);
406: x1 = xb[0];
407: x2 = xb[1];
408: x3 = xb[2];
410: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
411: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
412: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
413: v += 9;
414: }
415: if (usecprow) z = zarray + 3*ridx[i];
416: z[0] = sum1; z[1] = sum2; z[2] = sum3;
417: if (!usecprow) z += 3;
418: }
419: VecRestoreArrayRead(xx,&x);
420: VecRestoreArray(zz,&zarray);
421: PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
422: return(0);
423: }
425: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
426: {
427: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
428: PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
429: const PetscScalar *x,*xb;
430: const MatScalar *v;
431: PetscErrorCode ierr;
432: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
433: PetscBool usecprow=a->compressedrow.use;
436: VecGetArrayRead(xx,&x);
437: VecGetArray(zz,&zarray);
439: idx = a->j;
440: v = a->a;
441: if (usecprow) {
442: mbs = a->compressedrow.nrows;
443: ii = a->compressedrow.i;
444: ridx = a->compressedrow.rindex;
445: PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));
446: } else {
447: mbs = a->mbs;
448: ii = a->i;
449: z = zarray;
450: }
452: for (i=0; i<mbs; i++) {
453: n = ii[1] - ii[0];
454: ii++;
455: sum1 = 0.0;
456: sum2 = 0.0;
457: sum3 = 0.0;
458: sum4 = 0.0;
460: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
461: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
462: for (j=0; j<n; j++) {
463: xb = x + 4*(*idx++);
464: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
465: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
466: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
467: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
468: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
469: v += 16;
470: }
471: if (usecprow) z = zarray + 4*ridx[i];
472: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
473: if (!usecprow) z += 4;
474: }
475: VecRestoreArrayRead(xx,&x);
476: VecRestoreArray(zz,&zarray);
477: PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
478: return(0);
479: }
481: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
482: {
483: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
484: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
485: const PetscScalar *xb,*x;
486: const MatScalar *v;
487: PetscErrorCode ierr;
488: const PetscInt *idx,*ii,*ridx=NULL;
489: PetscInt mbs,i,j,n;
490: PetscBool usecprow=a->compressedrow.use;
493: VecGetArrayRead(xx,&x);
494: VecGetArray(zz,&zarray);
496: idx = a->j;
497: v = a->a;
498: if (usecprow) {
499: mbs = a->compressedrow.nrows;
500: ii = a->compressedrow.i;
501: ridx = a->compressedrow.rindex;
502: PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));
503: } else {
504: mbs = a->mbs;
505: ii = a->i;
506: z = zarray;
507: }
509: for (i=0; i<mbs; i++) {
510: n = ii[1] - ii[0]; ii++;
511: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
512: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
513: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
514: for (j=0; j<n; j++) {
515: xb = x + 5*(*idx++);
516: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
517: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
518: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
519: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
520: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
521: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
522: v += 25;
523: }
524: if (usecprow) z = zarray + 5*ridx[i];
525: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
526: if (!usecprow) z += 5;
527: }
528: VecRestoreArrayRead(xx,&x);
529: VecRestoreArray(zz,&zarray);
530: PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
531: return(0);
532: }
536: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
537: {
538: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
539: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
540: const PetscScalar *x,*xb;
541: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
542: const MatScalar *v;
543: PetscErrorCode ierr;
544: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
545: PetscBool usecprow=a->compressedrow.use;
548: VecGetArrayRead(xx,&x);
549: VecGetArray(zz,&zarray);
551: idx = a->j;
552: v = a->a;
553: if (usecprow) {
554: mbs = a->compressedrow.nrows;
555: ii = a->compressedrow.i;
556: ridx = a->compressedrow.rindex;
557: PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));
558: } else {
559: mbs = a->mbs;
560: ii = a->i;
561: z = zarray;
562: }
564: for (i=0; i<mbs; i++) {
565: n = ii[1] - ii[0];
566: ii++;
567: sum1 = 0.0;
568: sum2 = 0.0;
569: sum3 = 0.0;
570: sum4 = 0.0;
571: sum5 = 0.0;
572: sum6 = 0.0;
574: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
575: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
576: for (j=0; j<n; j++) {
577: xb = x + 6*(*idx++);
578: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
579: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
580: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
581: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
582: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
583: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
584: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
585: v += 36;
586: }
587: if (usecprow) z = zarray + 6*ridx[i];
588: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
589: if (!usecprow) z += 6;
590: }
592: VecRestoreArrayRead(xx,&x);
593: VecRestoreArray(zz,&zarray);
594: PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
595: return(0);
596: }
598: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
599: {
600: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
601: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
602: const PetscScalar *x,*xb;
603: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
604: const MatScalar *v;
605: PetscErrorCode ierr;
606: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
607: PetscBool usecprow=a->compressedrow.use;
610: VecGetArrayRead(xx,&x);
611: VecGetArray(zz,&zarray);
613: idx = a->j;
614: v = a->a;
615: if (usecprow) {
616: mbs = a->compressedrow.nrows;
617: ii = a->compressedrow.i;
618: ridx = a->compressedrow.rindex;
619: PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));
620: } else {
621: mbs = a->mbs;
622: ii = a->i;
623: z = zarray;
624: }
626: for (i=0; i<mbs; i++) {
627: n = ii[1] - ii[0];
628: ii++;
629: sum1 = 0.0;
630: sum2 = 0.0;
631: sum3 = 0.0;
632: sum4 = 0.0;
633: sum5 = 0.0;
634: sum6 = 0.0;
635: sum7 = 0.0;
637: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
638: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
639: for (j=0; j<n; j++) {
640: xb = x + 7*(*idx++);
641: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
642: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
643: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
644: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
645: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
646: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
647: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
648: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
649: v += 49;
650: }
651: if (usecprow) z = zarray + 7*ridx[i];
652: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
653: if (!usecprow) z += 7;
654: }
656: VecRestoreArrayRead(xx,&x);
657: VecRestoreArray(zz,&zarray);
658: PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
659: return(0);
660: }
662: PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
663: {
664: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
665: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
666: const PetscScalar *x,*xb;
667: PetscScalar *zarray,xv;
668: const MatScalar *v;
669: PetscErrorCode ierr;
670: const PetscInt *ii,*ij=a->j,*idx;
671: PetscInt mbs,i,j,k,n,*ridx=NULL;
672: PetscBool usecprow=a->compressedrow.use;
675: VecGetArrayRead(xx,&x);
676: VecGetArray(zz,&zarray);
678: v = a->a;
679: if (usecprow) {
680: mbs = a->compressedrow.nrows;
681: ii = a->compressedrow.i;
682: ridx = a->compressedrow.rindex;
683: PetscMemzero(zarray,11*a->mbs*sizeof(PetscScalar));
684: } else {
685: mbs = a->mbs;
686: ii = a->i;
687: z = zarray;
688: }
690: for (i=0; i<mbs; i++) {
691: n = ii[i+1] - ii[i];
692: idx = ij + ii[i];
693: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
694: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
696: for (j=0; j<n; j++) {
697: xb = x + 11*(idx[j]);
699: for (k=0; k<11; k++) {
700: xv = xb[k];
701: sum1 += v[0]*xv;
702: sum2 += v[1]*xv;
703: sum3 += v[2]*xv;
704: sum4 += v[3]*xv;
705: sum5 += v[4]*xv;
706: sum6 += v[5]*xv;
707: sum7 += v[6]*xv;
708: sum8 += v[7]*xv;
709: sum9 += v[8]*xv;
710: sum10 += v[9]*xv;
711: sum11 += v[10]*xv;
712: v += 11;
713: }
714: }
715: if (usecprow) z = zarray + 11*ridx[i];
716: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
717: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
719: if (!usecprow) z += 11;
720: }
722: VecRestoreArrayRead(xx,&x);
723: VecRestoreArray(zz,&zarray);
724: PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);
725: return(0);
726: }
728: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
729: /* Default MatMult for block size 15 */
731: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
732: {
733: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
734: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
735: const PetscScalar *x,*xb;
736: PetscScalar *zarray,xv;
737: const MatScalar *v;
738: PetscErrorCode ierr;
739: const PetscInt *ii,*ij=a->j,*idx;
740: PetscInt mbs,i,j,k,n,*ridx=NULL;
741: PetscBool usecprow=a->compressedrow.use;
744: VecGetArrayRead(xx,&x);
745: VecGetArray(zz,&zarray);
747: v = a->a;
748: if (usecprow) {
749: mbs = a->compressedrow.nrows;
750: ii = a->compressedrow.i;
751: ridx = a->compressedrow.rindex;
752: PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
753: } else {
754: mbs = a->mbs;
755: ii = a->i;
756: z = zarray;
757: }
759: for (i=0; i<mbs; i++) {
760: n = ii[i+1] - ii[i];
761: idx = ij + ii[i];
762: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
763: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
765: for (j=0; j<n; j++) {
766: xb = x + 15*(idx[j]);
768: for (k=0; k<15; k++) {
769: xv = xb[k];
770: sum1 += v[0]*xv;
771: sum2 += v[1]*xv;
772: sum3 += v[2]*xv;
773: sum4 += v[3]*xv;
774: sum5 += v[4]*xv;
775: sum6 += v[5]*xv;
776: sum7 += v[6]*xv;
777: sum8 += v[7]*xv;
778: sum9 += v[8]*xv;
779: sum10 += v[9]*xv;
780: sum11 += v[10]*xv;
781: sum12 += v[11]*xv;
782: sum13 += v[12]*xv;
783: sum14 += v[13]*xv;
784: sum15 += v[14]*xv;
785: v += 15;
786: }
787: }
788: if (usecprow) z = zarray + 15*ridx[i];
789: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
790: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
792: if (!usecprow) z += 15;
793: }
795: VecRestoreArrayRead(xx,&x);
796: VecRestoreArray(zz,&zarray);
797: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
798: return(0);
799: }
801: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
802: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
803: {
804: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
805: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
806: const PetscScalar *x,*xb;
807: PetscScalar x1,x2,x3,x4,*zarray;
808: const MatScalar *v;
809: PetscErrorCode ierr;
810: const PetscInt *ii,*ij=a->j,*idx;
811: PetscInt mbs,i,j,n,*ridx=NULL;
812: PetscBool usecprow=a->compressedrow.use;
815: VecGetArrayRead(xx,&x);
816: VecGetArray(zz,&zarray);
818: v = a->a;
819: if (usecprow) {
820: mbs = a->compressedrow.nrows;
821: ii = a->compressedrow.i;
822: ridx = a->compressedrow.rindex;
823: PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
824: } else {
825: mbs = a->mbs;
826: ii = a->i;
827: z = zarray;
828: }
830: for (i=0; i<mbs; i++) {
831: n = ii[i+1] - ii[i];
832: idx = ij + ii[i];
833: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
834: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
836: for (j=0; j<n; j++) {
837: xb = x + 15*(idx[j]);
838: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
840: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
841: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
842: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
843: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
844: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
845: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
846: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
847: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
848: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
849: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
850: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
851: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
852: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
853: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
854: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
856: v += 60;
858: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
860: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
861: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
862: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
863: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
864: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
865: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
866: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
867: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
868: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
869: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
870: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
871: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
872: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
873: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
874: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
875: v += 60;
877: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
878: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
879: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
880: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
881: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
882: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
883: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
884: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
885: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
886: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
887: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
888: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
889: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
890: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
891: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
892: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
893: v += 60;
895: x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
896: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3;
897: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3;
898: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3;
899: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3;
900: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3;
901: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3;
902: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3;
903: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3;
904: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3;
905: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
906: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
907: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
908: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
909: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
910: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
911: v += 45;
912: }
913: if (usecprow) z = zarray + 15*ridx[i];
914: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
915: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
917: if (!usecprow) z += 15;
918: }
920: VecRestoreArrayRead(xx,&x);
921: VecRestoreArray(zz,&zarray);
922: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
923: return(0);
924: }
926: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
927: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
928: {
929: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
930: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
931: const PetscScalar *x,*xb;
932: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
933: const MatScalar *v;
934: PetscErrorCode ierr;
935: const PetscInt *ii,*ij=a->j,*idx;
936: PetscInt mbs,i,j,n,*ridx=NULL;
937: PetscBool usecprow=a->compressedrow.use;
940: VecGetArrayRead(xx,&x);
941: VecGetArray(zz,&zarray);
943: v = a->a;
944: if (usecprow) {
945: mbs = a->compressedrow.nrows;
946: ii = a->compressedrow.i;
947: ridx = a->compressedrow.rindex;
948: PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
949: } else {
950: mbs = a->mbs;
951: ii = a->i;
952: z = zarray;
953: }
955: for (i=0; i<mbs; i++) {
956: n = ii[i+1] - ii[i];
957: idx = ij + ii[i];
958: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
959: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
961: for (j=0; j<n; j++) {
962: xb = x + 15*(idx[j]);
963: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
964: x8 = xb[7];
966: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
967: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
968: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
969: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
970: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
971: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
972: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
973: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
974: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
975: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
976: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
977: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
978: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
979: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
980: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
981: v += 120;
983: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
985: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
986: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
987: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
988: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
989: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
990: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
991: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
992: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
993: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
994: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
995: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
996: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
997: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
998: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
999: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1000: v += 105;
1001: }
1002: if (usecprow) z = zarray + 15*ridx[i];
1003: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1004: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1006: if (!usecprow) z += 15;
1007: }
1009: VecRestoreArrayRead(xx,&x);
1010: VecRestoreArray(zz,&zarray);
1011: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1012: return(0);
1013: }
1015: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1017: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1018: {
1019: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1020: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1021: const PetscScalar *x,*xb;
1022: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1023: const MatScalar *v;
1024: PetscErrorCode ierr;
1025: const PetscInt *ii,*ij=a->j,*idx;
1026: PetscInt mbs,i,j,n,*ridx=NULL;
1027: PetscBool usecprow=a->compressedrow.use;
1030: VecGetArrayRead(xx,&x);
1031: VecGetArray(zz,&zarray);
1033: v = a->a;
1034: if (usecprow) {
1035: mbs = a->compressedrow.nrows;
1036: ii = a->compressedrow.i;
1037: ridx = a->compressedrow.rindex;
1038: PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
1039: } else {
1040: mbs = a->mbs;
1041: ii = a->i;
1042: z = zarray;
1043: }
1045: for (i=0; i<mbs; i++) {
1046: n = ii[i+1] - ii[i];
1047: idx = ij + ii[i];
1048: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1049: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1051: for (j=0; j<n; j++) {
1052: xb = x + 15*(idx[j]);
1053: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1054: x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1056: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1057: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1058: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1059: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1060: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1061: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1062: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1063: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1064: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1065: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1066: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1067: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1068: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1069: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1070: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1071: v += 225;
1072: }
1073: if (usecprow) z = zarray + 15*ridx[i];
1074: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1075: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1077: if (!usecprow) z += 15;
1078: }
1080: VecRestoreArrayRead(xx,&x);
1081: VecRestoreArray(zz,&zarray);
1082: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1083: return(0);
1084: }
1087: /*
1088: This will not work with MatScalar == float because it calls the BLAS
1089: */
1090: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1091: {
1092: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1093: PetscScalar *z = 0,*work,*workt,*zarray;
1094: const PetscScalar *x,*xb;
1095: const MatScalar *v;
1096: PetscErrorCode ierr;
1097: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1098: const PetscInt *idx,*ii,*ridx=NULL;
1099: PetscInt ncols,k;
1100: PetscBool usecprow=a->compressedrow.use;
1103: VecGetArrayRead(xx,&x);
1104: VecGetArray(zz,&zarray);
1106: idx = a->j;
1107: v = a->a;
1108: if (usecprow) {
1109: mbs = a->compressedrow.nrows;
1110: ii = a->compressedrow.i;
1111: ridx = a->compressedrow.rindex;
1112: PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));
1113: } else {
1114: mbs = a->mbs;
1115: ii = a->i;
1116: z = zarray;
1117: }
1119: if (!a->mult_work) {
1120: k = PetscMax(A->rmap->n,A->cmap->n);
1121: PetscMalloc1(k+1,&a->mult_work);
1122: }
1123: work = a->mult_work;
1124: for (i=0; i<mbs; i++) {
1125: n = ii[1] - ii[0]; ii++;
1126: ncols = n*bs;
1127: workt = work;
1128: for (j=0; j<n; j++) {
1129: xb = x + bs*(*idx++);
1130: for (k=0; k<bs; k++) workt[k] = xb[k];
1131: workt += bs;
1132: }
1133: if (usecprow) z = zarray + bs*ridx[i];
1134: PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1135: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1136: v += n*bs2;
1137: if (!usecprow) z += bs;
1138: }
1139: VecRestoreArrayRead(xx,&x);
1140: VecRestoreArray(zz,&zarray);
1141: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1142: return(0);
1143: }
1145: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1146: {
1147: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1148: const PetscScalar *x;
1149: PetscScalar *y,*z,sum;
1150: const MatScalar *v;
1151: PetscErrorCode ierr;
1152: PetscInt mbs=a->mbs,i,n,*ridx=NULL;
1153: const PetscInt *idx,*ii;
1154: PetscBool usecprow=a->compressedrow.use;
1157: VecGetArrayRead(xx,&x);
1158: VecGetArrayPair(yy,zz,&y,&z);
1160: idx = a->j;
1161: v = a->a;
1162: if (usecprow) {
1163: if (zz != yy) {
1164: PetscMemcpy(z,y,mbs*sizeof(PetscScalar));
1165: }
1166: mbs = a->compressedrow.nrows;
1167: ii = a->compressedrow.i;
1168: ridx = a->compressedrow.rindex;
1169: } else {
1170: ii = a->i;
1171: }
1173: for (i=0; i<mbs; i++) {
1174: n = ii[1] - ii[0];
1175: ii++;
1176: if (!usecprow) {
1177: sum = y[i];
1178: } else {
1179: sum = y[ridx[i]];
1180: }
1181: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1182: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1183: PetscSparseDensePlusDot(sum,x,v,idx,n);
1184: v += n;
1185: idx += n;
1186: if (usecprow) {
1187: z[ridx[i]] = sum;
1188: } else {
1189: z[i] = sum;
1190: }
1191: }
1192: VecRestoreArrayRead(xx,&x);
1193: VecRestoreArrayPair(yy,zz,&y,&z);
1194: PetscLogFlops(2.0*a->nz);
1195: return(0);
1196: }
1198: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1199: {
1200: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1201: PetscScalar *y = 0,*z = 0,sum1,sum2;
1202: const PetscScalar *x,*xb;
1203: PetscScalar x1,x2,*yarray,*zarray;
1204: const MatScalar *v;
1205: PetscErrorCode ierr;
1206: PetscInt mbs = a->mbs,i,n,j;
1207: const PetscInt *idx,*ii,*ridx = NULL;
1208: PetscBool usecprow = a->compressedrow.use;
1211: VecGetArrayRead(xx,&x);
1212: VecGetArrayPair(yy,zz,&yarray,&zarray);
1214: idx = a->j;
1215: v = a->a;
1216: if (usecprow) {
1217: if (zz != yy) {
1218: PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
1219: }
1220: mbs = a->compressedrow.nrows;
1221: ii = a->compressedrow.i;
1222: ridx = a->compressedrow.rindex;
1223: } else {
1224: ii = a->i;
1225: y = yarray;
1226: z = zarray;
1227: }
1229: for (i=0; i<mbs; i++) {
1230: n = ii[1] - ii[0]; ii++;
1231: if (usecprow) {
1232: z = zarray + 2*ridx[i];
1233: y = yarray + 2*ridx[i];
1234: }
1235: sum1 = y[0]; sum2 = y[1];
1236: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1237: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1238: for (j=0; j<n; j++) {
1239: xb = x + 2*(*idx++);
1240: x1 = xb[0];
1241: x2 = xb[1];
1243: sum1 += v[0]*x1 + v[2]*x2;
1244: sum2 += v[1]*x1 + v[3]*x2;
1245: v += 4;
1246: }
1247: z[0] = sum1; z[1] = sum2;
1248: if (!usecprow) {
1249: z += 2; y += 2;
1250: }
1251: }
1252: VecRestoreArrayRead(xx,&x);
1253: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1254: PetscLogFlops(4.0*a->nz);
1255: return(0);
1256: }
1258: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1259: {
1260: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1261: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1262: const PetscScalar *x,*xb;
1263: const MatScalar *v;
1264: PetscErrorCode ierr;
1265: PetscInt mbs = a->mbs,i,j,n;
1266: const PetscInt *idx,*ii,*ridx = NULL;
1267: PetscBool usecprow = a->compressedrow.use;
1270: VecGetArrayRead(xx,&x);
1271: VecGetArrayPair(yy,zz,&yarray,&zarray);
1273: idx = a->j;
1274: v = a->a;
1275: if (usecprow) {
1276: if (zz != yy) {
1277: PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
1278: }
1279: mbs = a->compressedrow.nrows;
1280: ii = a->compressedrow.i;
1281: ridx = a->compressedrow.rindex;
1282: } else {
1283: ii = a->i;
1284: y = yarray;
1285: z = zarray;
1286: }
1288: for (i=0; i<mbs; i++) {
1289: n = ii[1] - ii[0]; ii++;
1290: if (usecprow) {
1291: z = zarray + 3*ridx[i];
1292: y = yarray + 3*ridx[i];
1293: }
1294: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1295: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1296: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1297: for (j=0; j<n; j++) {
1298: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1299: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1300: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1301: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1302: v += 9;
1303: }
1304: z[0] = sum1; z[1] = sum2; z[2] = sum3;
1305: if (!usecprow) {
1306: z += 3; y += 3;
1307: }
1308: }
1309: VecRestoreArrayRead(xx,&x);
1310: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1311: PetscLogFlops(18.0*a->nz);
1312: return(0);
1313: }
1315: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1316: {
1317: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1318: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1319: const PetscScalar *x,*xb;
1320: const MatScalar *v;
1321: PetscErrorCode ierr;
1322: PetscInt mbs = a->mbs,i,j,n;
1323: const PetscInt *idx,*ii,*ridx=NULL;
1324: PetscBool usecprow=a->compressedrow.use;
1327: VecGetArrayRead(xx,&x);
1328: VecGetArrayPair(yy,zz,&yarray,&zarray);
1330: idx = a->j;
1331: v = a->a;
1332: if (usecprow) {
1333: if (zz != yy) {
1334: PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
1335: }
1336: mbs = a->compressedrow.nrows;
1337: ii = a->compressedrow.i;
1338: ridx = a->compressedrow.rindex;
1339: } else {
1340: ii = a->i;
1341: y = yarray;
1342: z = zarray;
1343: }
1345: for (i=0; i<mbs; i++) {
1346: n = ii[1] - ii[0]; ii++;
1347: if (usecprow) {
1348: z = zarray + 4*ridx[i];
1349: y = yarray + 4*ridx[i];
1350: }
1351: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1352: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1353: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1354: for (j=0; j<n; j++) {
1355: xb = x + 4*(*idx++);
1356: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1357: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1358: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1359: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1360: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1361: v += 16;
1362: }
1363: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1364: if (!usecprow) {
1365: z += 4; y += 4;
1366: }
1367: }
1368: VecRestoreArrayRead(xx,&x);
1369: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1370: PetscLogFlops(32.0*a->nz);
1371: return(0);
1372: }
1374: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1375: {
1376: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1377: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1378: const PetscScalar *x,*xb;
1379: PetscScalar *yarray,*zarray;
1380: const MatScalar *v;
1381: PetscErrorCode ierr;
1382: PetscInt mbs = a->mbs,i,j,n;
1383: const PetscInt *idx,*ii,*ridx = NULL;
1384: PetscBool usecprow=a->compressedrow.use;
1387: VecGetArrayRead(xx,&x);
1388: VecGetArrayPair(yy,zz,&yarray,&zarray);
1390: idx = a->j;
1391: v = a->a;
1392: if (usecprow) {
1393: if (zz != yy) {
1394: PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
1395: }
1396: mbs = a->compressedrow.nrows;
1397: ii = a->compressedrow.i;
1398: ridx = a->compressedrow.rindex;
1399: } else {
1400: ii = a->i;
1401: y = yarray;
1402: z = zarray;
1403: }
1405: for (i=0; i<mbs; i++) {
1406: n = ii[1] - ii[0]; ii++;
1407: if (usecprow) {
1408: z = zarray + 5*ridx[i];
1409: y = yarray + 5*ridx[i];
1410: }
1411: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1412: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1413: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1414: for (j=0; j<n; j++) {
1415: xb = x + 5*(*idx++);
1416: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1417: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1418: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1419: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1420: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1421: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1422: v += 25;
1423: }
1424: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1425: if (!usecprow) {
1426: z += 5; y += 5;
1427: }
1428: }
1429: VecRestoreArrayRead(xx,&x);
1430: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1431: PetscLogFlops(50.0*a->nz);
1432: return(0);
1433: }
1434: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1435: {
1436: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1437: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1438: const PetscScalar *x,*xb;
1439: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1440: const MatScalar *v;
1441: PetscErrorCode ierr;
1442: PetscInt mbs = a->mbs,i,j,n;
1443: const PetscInt *idx,*ii,*ridx=NULL;
1444: PetscBool usecprow=a->compressedrow.use;
1447: VecGetArrayRead(xx,&x);
1448: VecGetArrayPair(yy,zz,&yarray,&zarray);
1450: idx = a->j;
1451: v = a->a;
1452: if (usecprow) {
1453: if (zz != yy) {
1454: PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
1455: }
1456: mbs = a->compressedrow.nrows;
1457: ii = a->compressedrow.i;
1458: ridx = a->compressedrow.rindex;
1459: } else {
1460: ii = a->i;
1461: y = yarray;
1462: z = zarray;
1463: }
1465: for (i=0; i<mbs; i++) {
1466: n = ii[1] - ii[0]; ii++;
1467: if (usecprow) {
1468: z = zarray + 6*ridx[i];
1469: y = yarray + 6*ridx[i];
1470: }
1471: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1472: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1473: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1474: for (j=0; j<n; j++) {
1475: xb = x + 6*(*idx++);
1476: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1477: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1478: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1479: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1480: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1481: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1482: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1483: v += 36;
1484: }
1485: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1486: if (!usecprow) {
1487: z += 6; y += 6;
1488: }
1489: }
1490: VecRestoreArrayRead(xx,&x);
1491: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1492: PetscLogFlops(72.0*a->nz);
1493: return(0);
1494: }
1496: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1497: {
1498: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1499: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1500: const PetscScalar *x,*xb;
1501: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1502: const MatScalar *v;
1503: PetscErrorCode ierr;
1504: PetscInt mbs = a->mbs,i,j,n;
1505: const PetscInt *idx,*ii,*ridx = NULL;
1506: PetscBool usecprow=a->compressedrow.use;
1509: VecGetArrayRead(xx,&x);
1510: VecGetArrayPair(yy,zz,&yarray,&zarray);
1512: idx = a->j;
1513: v = a->a;
1514: if (usecprow) {
1515: if (zz != yy) {
1516: PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1517: }
1518: mbs = a->compressedrow.nrows;
1519: ii = a->compressedrow.i;
1520: ridx = a->compressedrow.rindex;
1521: } else {
1522: ii = a->i;
1523: y = yarray;
1524: z = zarray;
1525: }
1527: for (i=0; i<mbs; i++) {
1528: n = ii[1] - ii[0]; ii++;
1529: if (usecprow) {
1530: z = zarray + 7*ridx[i];
1531: y = yarray + 7*ridx[i];
1532: }
1533: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1534: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1535: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1536: for (j=0; j<n; j++) {
1537: xb = x + 7*(*idx++);
1538: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1539: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1540: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1541: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1542: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1543: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1544: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1545: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1546: v += 49;
1547: }
1548: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1549: if (!usecprow) {
1550: z += 7; y += 7;
1551: }
1552: }
1553: VecRestoreArrayRead(xx,&x);
1554: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1555: PetscLogFlops(98.0*a->nz);
1556: return(0);
1557: }
1559: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
1560: {
1561: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1562: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
1563: const PetscScalar *x,*xb;
1564: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
1565: const MatScalar *v;
1566: PetscErrorCode ierr;
1567: PetscInt mbs = a->mbs,i,j,n;
1568: const PetscInt *idx,*ii,*ridx = NULL;
1569: PetscBool usecprow=a->compressedrow.use;
1572: VecGetArrayRead(xx,&x);
1573: VecGetArrayPair(yy,zz,&yarray,&zarray);
1575: idx = a->j;
1576: v = a->a;
1577: if (usecprow) {
1578: if (zz != yy) {
1579: PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1580: }
1581: mbs = a->compressedrow.nrows;
1582: ii = a->compressedrow.i;
1583: ridx = a->compressedrow.rindex;
1584: } else {
1585: ii = a->i;
1586: y = yarray;
1587: z = zarray;
1588: }
1590: for (i=0; i<mbs; i++) {
1591: n = ii[1] - ii[0]; ii++;
1592: if (usecprow) {
1593: z = zarray + 11*ridx[i];
1594: y = yarray + 11*ridx[i];
1595: }
1596: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1597: sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
1598: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1599: PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1600: for (j=0; j<n; j++) {
1601: xb = x + 11*(*idx++);
1602: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
1603: sum1 += v[0]*x1 + v[11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11;
1604: sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11;
1605: sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11;
1606: sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11;
1607: sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11;
1608: sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11;
1609: sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11;
1610: sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11;
1611: sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11;
1612: sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11;
1613: sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11;
1614: v += 121;
1615: }
1616: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1617: z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
1618: if (!usecprow) {
1619: z += 11; y += 11;
1620: }
1621: }
1622: VecRestoreArrayRead(xx,&x);
1623: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1624: PetscLogFlops(242.0*a->nz);
1625: return(0);
1626: }
1628: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1629: {
1630: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1631: PetscScalar *z = 0,*work,*workt,*zarray;
1632: const PetscScalar *x,*xb;
1633: const MatScalar *v;
1634: PetscErrorCode ierr;
1635: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1636: PetscInt ncols,k;
1637: const PetscInt *ridx = NULL,*idx,*ii;
1638: PetscBool usecprow = a->compressedrow.use;
1641: VecCopy(yy,zz);
1642: VecGetArrayRead(xx,&x);
1643: VecGetArray(zz,&zarray);
1645: idx = a->j;
1646: v = a->a;
1647: if (usecprow) {
1648: mbs = a->compressedrow.nrows;
1649: ii = a->compressedrow.i;
1650: ridx = a->compressedrow.rindex;
1651: } else {
1652: mbs = a->mbs;
1653: ii = a->i;
1654: z = zarray;
1655: }
1657: if (!a->mult_work) {
1658: k = PetscMax(A->rmap->n,A->cmap->n);
1659: PetscMalloc1(k+1,&a->mult_work);
1660: }
1661: work = a->mult_work;
1662: for (i=0; i<mbs; i++) {
1663: n = ii[1] - ii[0]; ii++;
1664: ncols = n*bs;
1665: workt = work;
1666: for (j=0; j<n; j++) {
1667: xb = x + bs*(*idx++);
1668: for (k=0; k<bs; k++) workt[k] = xb[k];
1669: workt += bs;
1670: }
1671: if (usecprow) z = zarray + bs*ridx[i];
1672: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1673: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1674: v += n*bs2;
1675: if (!usecprow) z += bs;
1676: }
1677: VecRestoreArrayRead(xx,&x);
1678: VecRestoreArray(zz,&zarray);
1679: PetscLogFlops(2.0*a->nz*bs2);
1680: return(0);
1681: }
1683: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1684: {
1685: PetscScalar zero = 0.0;
1689: VecSet(zz,zero);
1690: MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1691: return(0);
1692: }
1694: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1695: {
1696: PetscScalar zero = 0.0;
1700: VecSet(zz,zero);
1701: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1702: return(0);
1703: }
1705: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1706: {
1707: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1708: PetscScalar *z,x1,x2,x3,x4,x5;
1709: const PetscScalar *x,*xb = NULL;
1710: const MatScalar *v;
1711: PetscErrorCode ierr;
1712: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n;
1713: const PetscInt *idx,*ii,*ib,*ridx = NULL;
1714: Mat_CompressedRow cprow = a->compressedrow;
1715: PetscBool usecprow = cprow.use;
1718: if (yy != zz) { VecCopy(yy,zz); }
1719: VecGetArrayRead(xx,&x);
1720: VecGetArray(zz,&z);
1722: idx = a->j;
1723: v = a->a;
1724: if (usecprow) {
1725: mbs = cprow.nrows;
1726: ii = cprow.i;
1727: ridx = cprow.rindex;
1728: } else {
1729: mbs=a->mbs;
1730: ii = a->i;
1731: xb = x;
1732: }
1734: switch (bs) {
1735: case 1:
1736: for (i=0; i<mbs; i++) {
1737: if (usecprow) xb = x + ridx[i];
1738: x1 = xb[0];
1739: ib = idx + ii[0];
1740: n = ii[1] - ii[0]; ii++;
1741: for (j=0; j<n; j++) {
1742: rval = ib[j];
1743: z[rval] += PetscConj(*v) * x1;
1744: v++;
1745: }
1746: if (!usecprow) xb++;
1747: }
1748: break;
1749: case 2:
1750: for (i=0; i<mbs; i++) {
1751: if (usecprow) xb = x + 2*ridx[i];
1752: x1 = xb[0]; x2 = xb[1];
1753: ib = idx + ii[0];
1754: n = ii[1] - ii[0]; ii++;
1755: for (j=0; j<n; j++) {
1756: rval = ib[j]*2;
1757: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1758: z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1759: v += 4;
1760: }
1761: if (!usecprow) xb += 2;
1762: }
1763: break;
1764: case 3:
1765: for (i=0; i<mbs; i++) {
1766: if (usecprow) xb = x + 3*ridx[i];
1767: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1768: ib = idx + ii[0];
1769: n = ii[1] - ii[0]; ii++;
1770: for (j=0; j<n; j++) {
1771: rval = ib[j]*3;
1772: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1773: z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1774: z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1775: v += 9;
1776: }
1777: if (!usecprow) xb += 3;
1778: }
1779: break;
1780: case 4:
1781: for (i=0; i<mbs; i++) {
1782: if (usecprow) xb = x + 4*ridx[i];
1783: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1784: ib = idx + ii[0];
1785: n = ii[1] - ii[0]; ii++;
1786: for (j=0; j<n; j++) {
1787: rval = ib[j]*4;
1788: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4;
1789: z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4;
1790: z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1791: z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1792: v += 16;
1793: }
1794: if (!usecprow) xb += 4;
1795: }
1796: break;
1797: case 5:
1798: for (i=0; i<mbs; i++) {
1799: if (usecprow) xb = x + 5*ridx[i];
1800: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1801: x4 = xb[3]; x5 = xb[4];
1802: ib = idx + ii[0];
1803: n = ii[1] - ii[0]; ii++;
1804: for (j=0; j<n; j++) {
1805: rval = ib[j]*5;
1806: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5;
1807: z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5;
1808: z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1809: z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1810: z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1811: v += 25;
1812: }
1813: if (!usecprow) xb += 5;
1814: }
1815: break;
1816: default: /* block sizes larger than 5 by 5 are handled by BLAS */
1817: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1818: #if 0
1819: {
1820: PetscInt ncols,k,bs2=a->bs2;
1821: PetscScalar *work,*workt,zb;
1822: const PetscScalar *xtmp;
1823: if (!a->mult_work) {
1824: k = PetscMax(A->rmap->n,A->cmap->n);
1825: PetscMalloc1(k+1,&a->mult_work);
1826: }
1827: work = a->mult_work;
1828: xtmp = x;
1829: for (i=0; i<mbs; i++) {
1830: n = ii[1] - ii[0]; ii++;
1831: ncols = n*bs;
1832: PetscMemzero(work,ncols*sizeof(PetscScalar));
1833: if (usecprow) xtmp = x + bs*ridx[i];
1834: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1835: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1836: v += n*bs2;
1837: if (!usecprow) xtmp += bs;
1838: workt = work;
1839: for (j=0; j<n; j++) {
1840: zb = z + bs*(*idx++);
1841: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1842: workt += bs;
1843: }
1844: }
1845: }
1846: #endif
1847: }
1848: VecRestoreArrayRead(xx,&x);
1849: VecRestoreArray(zz,&z);
1850: PetscLogFlops(2.0*a->nz*a->bs2);
1851: return(0);
1852: }
1854: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1855: {
1856: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1857: PetscScalar *zb,*z,x1,x2,x3,x4,x5;
1858: const PetscScalar *x,*xb = 0;
1859: const MatScalar *v;
1860: PetscErrorCode ierr;
1861: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
1862: const PetscInt *idx,*ii,*ib,*ridx = NULL;
1863: Mat_CompressedRow cprow = a->compressedrow;
1864: PetscBool usecprow=cprow.use;
1867: if (yy != zz) { VecCopy(yy,zz); }
1868: VecGetArrayRead(xx,&x);
1869: VecGetArray(zz,&z);
1871: idx = a->j;
1872: v = a->a;
1873: if (usecprow) {
1874: mbs = cprow.nrows;
1875: ii = cprow.i;
1876: ridx = cprow.rindex;
1877: } else {
1878: mbs=a->mbs;
1879: ii = a->i;
1880: xb = x;
1881: }
1883: switch (bs) {
1884: case 1:
1885: for (i=0; i<mbs; i++) {
1886: if (usecprow) xb = x + ridx[i];
1887: x1 = xb[0];
1888: ib = idx + ii[0];
1889: n = ii[1] - ii[0]; ii++;
1890: for (j=0; j<n; j++) {
1891: rval = ib[j];
1892: z[rval] += *v * x1;
1893: v++;
1894: }
1895: if (!usecprow) xb++;
1896: }
1897: break;
1898: case 2:
1899: for (i=0; i<mbs; i++) {
1900: if (usecprow) xb = x + 2*ridx[i];
1901: x1 = xb[0]; x2 = xb[1];
1902: ib = idx + ii[0];
1903: n = ii[1] - ii[0]; ii++;
1904: for (j=0; j<n; j++) {
1905: rval = ib[j]*2;
1906: z[rval++] += v[0]*x1 + v[1]*x2;
1907: z[rval++] += v[2]*x1 + v[3]*x2;
1908: v += 4;
1909: }
1910: if (!usecprow) xb += 2;
1911: }
1912: break;
1913: case 3:
1914: for (i=0; i<mbs; i++) {
1915: if (usecprow) xb = x + 3*ridx[i];
1916: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1917: ib = idx + ii[0];
1918: n = ii[1] - ii[0]; ii++;
1919: for (j=0; j<n; j++) {
1920: rval = ib[j]*3;
1921: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1922: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1923: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1924: v += 9;
1925: }
1926: if (!usecprow) xb += 3;
1927: }
1928: break;
1929: case 4:
1930: for (i=0; i<mbs; i++) {
1931: if (usecprow) xb = x + 4*ridx[i];
1932: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1933: ib = idx + ii[0];
1934: n = ii[1] - ii[0]; ii++;
1935: for (j=0; j<n; j++) {
1936: rval = ib[j]*4;
1937: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1938: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1939: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1940: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1941: v += 16;
1942: }
1943: if (!usecprow) xb += 4;
1944: }
1945: break;
1946: case 5:
1947: for (i=0; i<mbs; i++) {
1948: if (usecprow) xb = x + 5*ridx[i];
1949: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1950: x4 = xb[3]; x5 = xb[4];
1951: ib = idx + ii[0];
1952: n = ii[1] - ii[0]; ii++;
1953: for (j=0; j<n; j++) {
1954: rval = ib[j]*5;
1955: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1956: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1957: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1958: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1959: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1960: v += 25;
1961: }
1962: if (!usecprow) xb += 5;
1963: }
1964: break;
1965: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
1966: PetscInt ncols,k;
1967: PetscScalar *work,*workt;
1968: const PetscScalar *xtmp;
1969: if (!a->mult_work) {
1970: k = PetscMax(A->rmap->n,A->cmap->n);
1971: PetscMalloc1(k+1,&a->mult_work);
1972: }
1973: work = a->mult_work;
1974: xtmp = x;
1975: for (i=0; i<mbs; i++) {
1976: n = ii[1] - ii[0]; ii++;
1977: ncols = n*bs;
1978: PetscMemzero(work,ncols*sizeof(PetscScalar));
1979: if (usecprow) xtmp = x + bs*ridx[i];
1980: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1981: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1982: v += n*bs2;
1983: if (!usecprow) xtmp += bs;
1984: workt = work;
1985: for (j=0; j<n; j++) {
1986: zb = z + bs*(*idx++);
1987: for (k=0; k<bs; k++) zb[k] += workt[k];
1988: workt += bs;
1989: }
1990: }
1991: }
1992: }
1993: VecRestoreArrayRead(xx,&x);
1994: VecRestoreArray(zz,&z);
1995: PetscLogFlops(2.0*a->nz*a->bs2);
1996: return(0);
1997: }
1999: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2000: {
2001: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
2002: PetscInt totalnz = a->bs2*a->nz;
2003: PetscScalar oalpha = alpha;
2005: PetscBLASInt one = 1,tnz;
2008: PetscBLASIntCast(totalnz,&tnz);
2009: PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2010: PetscLogFlops(totalnz);
2011: return(0);
2012: }
2014: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2015: {
2017: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2018: MatScalar *v = a->a;
2019: PetscReal sum = 0.0;
2020: PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2023: if (type == NORM_FROBENIUS) {
2024: #if defined(PETSC_USE_REAL___FP16)
2025: PetscBLASInt one = 1,cnt = bs2*nz;
2026: *norm = BLASnrm2_(&cnt,v,&one);
2027: #else
2028: for (i=0; i<bs2*nz; i++) {
2029: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2030: }
2031: #endif
2032: *norm = PetscSqrtReal(sum);
2033: PetscLogFlops(2*bs2*nz);
2034: } else if (type == NORM_1) { /* maximum column sum */
2035: PetscReal *tmp;
2036: PetscInt *bcol = a->j;
2037: PetscCalloc1(A->cmap->n+1,&tmp);
2038: for (i=0; i<nz; i++) {
2039: for (j=0; j<bs; j++) {
2040: k1 = bs*(*bcol) + j; /* column index */
2041: for (k=0; k<bs; k++) {
2042: tmp[k1] += PetscAbsScalar(*v); v++;
2043: }
2044: }
2045: bcol++;
2046: }
2047: *norm = 0.0;
2048: for (j=0; j<A->cmap->n; j++) {
2049: if (tmp[j] > *norm) *norm = tmp[j];
2050: }
2051: PetscFree(tmp);
2052: PetscLogFlops(PetscMax(bs2*nz-1,0));
2053: } else if (type == NORM_INFINITY) { /* maximum row sum */
2054: *norm = 0.0;
2055: for (k=0; k<bs; k++) {
2056: for (j=0; j<a->mbs; j++) {
2057: v = a->a + bs2*a->i[j] + k;
2058: sum = 0.0;
2059: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2060: for (k1=0; k1<bs; k1++) {
2061: sum += PetscAbsScalar(*v);
2062: v += bs;
2063: }
2064: }
2065: if (sum > *norm) *norm = sum;
2066: }
2067: }
2068: PetscLogFlops(PetscMax(bs2*nz-1,0));
2069: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2070: return(0);
2071: }
2074: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2075: {
2076: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2080: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
2081: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2082: *flg = PETSC_FALSE;
2083: return(0);
2084: }
2086: /* if the a->i are the same */
2087: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
2088: if (!*flg) return(0);
2090: /* if a->j are the same */
2091: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
2092: if (!*flg) return(0);
2094: /* if a->a are the same */
2095: PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);
2096: return(0);
2098: }
2100: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2101: {
2102: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2104: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2105: PetscScalar *x,zero = 0.0;
2106: MatScalar *aa,*aa_j;
2109: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2110: bs = A->rmap->bs;
2111: aa = a->a;
2112: ai = a->i;
2113: aj = a->j;
2114: ambs = a->mbs;
2115: bs2 = a->bs2;
2117: VecSet(v,zero);
2118: VecGetArray(v,&x);
2119: VecGetLocalSize(v,&n);
2120: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2121: for (i=0; i<ambs; i++) {
2122: for (j=ai[i]; j<ai[i+1]; j++) {
2123: if (aj[j] == i) {
2124: row = i*bs;
2125: aa_j = aa+j*bs2;
2126: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2127: break;
2128: }
2129: }
2130: }
2131: VecRestoreArray(v,&x);
2132: return(0);
2133: }
2135: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2136: {
2137: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2138: const PetscScalar *l,*r,*li,*ri;
2139: PetscScalar x;
2140: MatScalar *aa, *v;
2141: PetscErrorCode ierr;
2142: PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2143: const PetscInt *ai,*aj;
2146: ai = a->i;
2147: aj = a->j;
2148: aa = a->a;
2149: m = A->rmap->n;
2150: n = A->cmap->n;
2151: bs = A->rmap->bs;
2152: mbs = a->mbs;
2153: bs2 = a->bs2;
2154: if (ll) {
2155: VecGetArrayRead(ll,&l);
2156: VecGetLocalSize(ll,&lm);
2157: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2158: for (i=0; i<mbs; i++) { /* for each block row */
2159: M = ai[i+1] - ai[i];
2160: li = l + i*bs;
2161: v = aa + bs2*ai[i];
2162: for (j=0; j<M; j++) { /* for each block */
2163: for (k=0; k<bs2; k++) {
2164: (*v++) *= li[k%bs];
2165: }
2166: }
2167: }
2168: VecRestoreArrayRead(ll,&l);
2169: PetscLogFlops(a->nz);
2170: }
2172: if (rr) {
2173: VecGetArrayRead(rr,&r);
2174: VecGetLocalSize(rr,&rn);
2175: if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2176: for (i=0; i<mbs; i++) { /* for each block row */
2177: iai = ai[i];
2178: M = ai[i+1] - iai;
2179: v = aa + bs2*iai;
2180: for (j=0; j<M; j++) { /* for each block */
2181: ri = r + bs*aj[iai+j];
2182: for (k=0; k<bs; k++) {
2183: x = ri[k];
2184: for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2185: v += bs;
2186: }
2187: }
2188: }
2189: VecRestoreArrayRead(rr,&r);
2190: PetscLogFlops(a->nz);
2191: }
2192: return(0);
2193: }
2196: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2197: {
2198: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2201: info->block_size = a->bs2;
2202: info->nz_allocated = a->bs2*a->maxnz;
2203: info->nz_used = a->bs2*a->nz;
2204: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
2205: info->assemblies = A->num_ass;
2206: info->mallocs = A->info.mallocs;
2207: info->memory = ((PetscObject)A)->mem;
2208: if (A->factortype) {
2209: info->fill_ratio_given = A->info.fill_ratio_given;
2210: info->fill_ratio_needed = A->info.fill_ratio_needed;
2211: info->factor_mallocs = A->info.factor_mallocs;
2212: } else {
2213: info->fill_ratio_given = 0;
2214: info->fill_ratio_needed = 0;
2215: info->factor_mallocs = 0;
2216: }
2217: return(0);
2218: }
2220: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2221: {
2222: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2226: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
2227: return(0);
2228: }