Actual source code: baij2.c
petsc-3.4.5 2014-06-29
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <petsc-private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <petscblaslapack.h>
9: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10: {
11: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
13: PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival;
14: const PetscInt *idx;
15: PetscInt start,end,*ai,*aj,bs,*nidx2;
16: PetscBT table;
19: m = a->mbs;
20: ai = a->i;
21: aj = a->j;
22: bs = A->rmap->bs;
24: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26: PetscBTCreate(m,&table);
27: PetscMalloc((m+1)*sizeof(PetscInt),&nidx);
28: PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);
30: for (i=0; i<is_max; i++) {
31: /* Initialise the two local arrays */
32: isz = 0;
33: PetscBTMemzero(m,table);
35: /* Extract the indices, assume there can be duplicate entries */
36: ISGetIndices(is[i],&idx);
37: ISGetLocalSize(is[i],&n);
39: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40: for (j=0; j<n; ++j) {
41: ival = idx[j]/bs; /* convert the indices into block indices */
42: if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43: if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
44: }
45: ISRestoreIndices(is[i],&idx);
46: ISDestroy(&is[i]);
48: k = 0;
49: for (j=0; j<ov; j++) { /* for each overlap*/
50: n = isz;
51: for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */
52: row = nidx[k];
53: start = ai[row];
54: end = ai[row+1];
55: for (l = start; l<end; l++) {
56: val = aj[l];
57: if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
58: }
59: }
60: }
61: /* expand the Index Set */
62: for (j=0; j<isz; j++) {
63: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
64: }
65: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
66: }
67: PetscBTDestroy(&table);
68: PetscFree(nidx);
69: PetscFree(nidx2);
70: return(0);
71: }
75: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
76: {
77: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
79: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
80: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
81: const PetscInt *irow,*icol;
82: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
83: PetscInt *aj = a->j,*ai = a->i;
84: MatScalar *mat_a;
85: Mat C;
86: PetscBool flag,sorted;
89: ISSorted(iscol,&sorted);
90: if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
92: ISGetIndices(isrow,&irow);
93: ISGetIndices(iscol,&icol);
94: ISGetLocalSize(isrow,&nrows);
95: ISGetLocalSize(iscol,&ncols);
97: PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);
98: ssmap = smap;
99: PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
100: PetscMemzero(smap,oldcols*sizeof(PetscInt));
101: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
102: /* determine lens of each row */
103: for (i=0; i<nrows; i++) {
104: kstart = ai[irow[i]];
105: kend = kstart + a->ilen[irow[i]];
106: lens[i] = 0;
107: for (k=kstart; k<kend; k++) {
108: if (ssmap[aj[k]]) lens[i]++;
109: }
110: }
111: /* Create and fill new matrix */
112: if (scall == MAT_REUSE_MATRIX) {
113: c = (Mat_SeqBAIJ*)((*B)->data);
115: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
116: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
117: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
118: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
119: C = *B;
120: } else {
121: MatCreate(PetscObjectComm((PetscObject)A),&C);
122: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
123: MatSetType(C,((PetscObject)A)->type_name);
124: MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
125: }
126: c = (Mat_SeqBAIJ*)(C->data);
127: for (i=0; i<nrows; i++) {
128: row = irow[i];
129: kstart = ai[row];
130: kend = kstart + a->ilen[row];
131: mat_i = c->i[i];
132: mat_j = c->j + mat_i;
133: mat_a = c->a + mat_i*bs2;
134: mat_ilen = c->ilen + i;
135: for (k=kstart; k<kend; k++) {
136: if ((tcol=ssmap[a->j[k]])) {
137: *mat_j++ = tcol - 1;
138: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
139: mat_a += bs2;
140: (*mat_ilen)++;
141: }
142: }
143: }
145: /* Free work space */
146: ISRestoreIndices(iscol,&icol);
147: PetscFree(smap);
148: PetscFree(lens);
149: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
150: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
152: ISRestoreIndices(isrow,&irow);
153: *B = C;
154: return(0);
155: }
159: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
160: {
161: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
162: IS is1,is2;
164: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
165: const PetscInt *irow,*icol;
168: ISGetIndices(isrow,&irow);
169: ISGetIndices(iscol,&icol);
170: ISGetLocalSize(isrow,&nrows);
171: ISGetLocalSize(iscol,&ncols);
173: /* Verify if the indices corespond to each element in a block
174: and form the IS with compressed IS */
175: PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);
176: PetscMemzero(vary,a->mbs*sizeof(PetscInt));
177: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
178: count = 0;
179: for (i=0; i<a->mbs; i++) {
180: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
181: if (vary[i]==bs) iary[count++] = i;
182: }
183: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
185: PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
186: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
187: count = 0;
188: for (i=0; i<a->mbs; i++) {
189: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
190: if (vary[i]==bs) iary[count++] = i;
191: }
192: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
193: ISRestoreIndices(isrow,&irow);
194: ISRestoreIndices(iscol,&icol);
195: PetscFree2(vary,iary);
197: MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
198: ISDestroy(&is1);
199: ISDestroy(&is2);
200: return(0);
201: }
205: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
206: {
208: PetscInt i;
211: if (scall == MAT_INITIAL_MATRIX) {
212: PetscMalloc((n+1)*sizeof(Mat),B);
213: }
215: for (i=0; i<n; i++) {
216: MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
217: }
218: return(0);
219: }
222: /* -------------------------------------------------------*/
223: /* Should check that shapes of vectors and matrices match */
224: /* -------------------------------------------------------*/
228: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
229: {
230: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
231: PetscScalar *z,sum;
232: const PetscScalar *x;
233: const MatScalar *v;
234: PetscErrorCode ierr;
235: PetscInt mbs,i,n,nonzerorow=0;
236: const PetscInt *idx,*ii,*ridx=NULL;
237: PetscBool usecprow=a->compressedrow.use;
240: VecGetArrayRead(xx,&x);
241: VecGetArray(zz,&z);
243: if (usecprow) {
244: mbs = a->compressedrow.nrows;
245: ii = a->compressedrow.i;
246: ridx = a->compressedrow.rindex;
247: PetscMemzero(z,mbs*sizeof(PetscScalar));
248: } else {
249: mbs = a->mbs;
250: ii = a->i;
251: }
253: for (i=0; i<mbs; i++) {
254: n = ii[1] - ii[0];
255: v = a->a + ii[0];
256: idx = a->j + ii[0];
257: ii++;
258: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
259: PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
260: sum = 0.0;
261: PetscSparseDensePlusDot(sum,x,v,idx,n);
262: if (usecprow) {
263: z[ridx[i]] = sum;
264: } else {
265: nonzerorow += (n>0);
266: z[i] = sum;
267: }
268: }
269: VecRestoreArrayRead(xx,&x);
270: VecRestoreArray(zz,&z);
271: PetscLogFlops(2.0*a->nz - nonzerorow);
272: return(0);
273: }
277: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
278: {
279: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
280: PetscScalar *z = 0,sum1,sum2,*zarray;
281: const PetscScalar *x,*xb;
282: PetscScalar x1,x2;
283: const MatScalar *v;
284: PetscErrorCode ierr;
285: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL,nonzerorow=0;
286: PetscBool usecprow=a->compressedrow.use;
289: VecGetArrayRead(xx,&x);
290: VecGetArray(zz,&zarray);
292: idx = a->j;
293: v = a->a;
294: if (usecprow) {
295: mbs = a->compressedrow.nrows;
296: ii = a->compressedrow.i;
297: ridx = a->compressedrow.rindex;
298: } else {
299: mbs = a->mbs;
300: ii = a->i;
301: z = zarray;
302: }
304: for (i=0; i<mbs; i++) {
305: n = ii[1] - ii[0]; ii++;
306: sum1 = 0.0; sum2 = 0.0;
307: nonzerorow += (n>0);
308: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
309: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
310: for (j=0; j<n; j++) {
311: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
312: sum1 += v[0]*x1 + v[2]*x2;
313: sum2 += v[1]*x1 + v[3]*x2;
314: v += 4;
315: }
316: if (usecprow) z = zarray + 2*ridx[i];
317: z[0] = sum1; z[1] = sum2;
318: if (!usecprow) z += 2;
319: }
320: VecRestoreArrayRead(xx,&x);
321: VecRestoreArray(zz,&zarray);
322: PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);
323: return(0);
324: }
328: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
329: {
330: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
331: PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
332: const PetscScalar *x,*xb;
333: const MatScalar *v;
334: PetscErrorCode ierr;
335: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL,nonzerorow=0;
336: PetscBool usecprow=a->compressedrow.use;
339: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
340: #pragma disjoint(*v,*z,*xb)
341: #endif
344: VecGetArrayRead(xx,&x);
345: VecGetArray(zz,&zarray);
347: idx = a->j;
348: v = a->a;
349: if (usecprow) {
350: mbs = a->compressedrow.nrows;
351: ii = a->compressedrow.i;
352: ridx = a->compressedrow.rindex;
353: } else {
354: mbs = a->mbs;
355: ii = a->i;
356: z = zarray;
357: }
359: for (i=0; i<mbs; i++) {
360: n = ii[1] - ii[0]; ii++;
361: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
362: nonzerorow += (n>0);
363: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
364: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
365: for (j=0; j<n; j++) {
366: xb = x + 3*(*idx++);
367: x1 = xb[0];
368: x2 = xb[1];
369: x3 = xb[2];
371: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
372: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
373: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
374: v += 9;
375: }
376: if (usecprow) z = zarray + 3*ridx[i];
377: z[0] = sum1; z[1] = sum2; z[2] = sum3;
378: if (!usecprow) z += 3;
379: }
380: VecRestoreArrayRead(xx,&x);
381: VecRestoreArray(zz,&zarray);
382: PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);
383: return(0);
384: }
388: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
389: {
390: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
391: PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
392: const PetscScalar *x,*xb;
393: const MatScalar *v;
394: PetscErrorCode ierr;
395: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL,nonzerorow=0;
396: PetscBool usecprow=a->compressedrow.use;
399: VecGetArrayRead(xx,&x);
400: VecGetArray(zz,&zarray);
402: idx = a->j;
403: v = a->a;
404: if (usecprow) {
405: mbs = a->compressedrow.nrows;
406: ii = a->compressedrow.i;
407: ridx = a->compressedrow.rindex;
408: } else {
409: mbs = a->mbs;
410: ii = a->i;
411: z = zarray;
412: }
414: for (i=0; i<mbs; i++) {
415: n = ii[1] - ii[0];
416: ii++;
417: sum1 = 0.0;
418: sum2 = 0.0;
419: sum3 = 0.0;
420: sum4 = 0.0;
422: nonzerorow += (n>0);
423: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
424: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
425: for (j=0; j<n; j++) {
426: xb = x + 4*(*idx++);
427: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
428: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
429: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
430: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
431: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
432: v += 16;
433: }
434: if (usecprow) z = zarray + 4*ridx[i];
435: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
436: if (!usecprow) z += 4;
437: }
438: VecRestoreArrayRead(xx,&x);
439: VecRestoreArray(zz,&zarray);
440: PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);
441: return(0);
442: }
446: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
447: {
448: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
449: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
450: const PetscScalar *xb,*x;
451: const MatScalar *v;
452: PetscErrorCode ierr;
453: const PetscInt *idx,*ii,*ridx=NULL;
454: PetscInt mbs,i,j,n,nonzerorow=0;
455: PetscBool usecprow=a->compressedrow.use;
458: VecGetArrayRead(xx,&x);
459: VecGetArray(zz,&zarray);
461: idx = a->j;
462: v = a->a;
463: if (usecprow) {
464: mbs = a->compressedrow.nrows;
465: ii = a->compressedrow.i;
466: ridx = a->compressedrow.rindex;
467: } else {
468: mbs = a->mbs;
469: ii = a->i;
470: z = zarray;
471: }
473: for (i=0; i<mbs; i++) {
474: n = ii[1] - ii[0]; ii++;
475: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
476: nonzerorow += (n>0);
477: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
478: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
479: for (j=0; j<n; j++) {
480: xb = x + 5*(*idx++);
481: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
482: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
483: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
484: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
485: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
486: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
487: v += 25;
488: }
489: if (usecprow) z = zarray + 5*ridx[i];
490: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
491: if (!usecprow) z += 5;
492: }
493: VecRestoreArrayRead(xx,&x);
494: VecRestoreArray(zz,&zarray);
495: PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);
496: return(0);
497: }
502: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
503: {
504: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
505: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
506: const PetscScalar *x,*xb;
507: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
508: const MatScalar *v;
509: PetscErrorCode ierr;
510: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL,nonzerorow=0;
511: PetscBool usecprow=a->compressedrow.use;
514: VecGetArrayRead(xx,&x);
515: VecGetArray(zz,&zarray);
517: idx = a->j;
518: v = a->a;
519: if (usecprow) {
520: mbs = a->compressedrow.nrows;
521: ii = a->compressedrow.i;
522: ridx = a->compressedrow.rindex;
523: } else {
524: mbs = a->mbs;
525: ii = a->i;
526: z = zarray;
527: }
529: for (i=0; i<mbs; i++) {
530: n = ii[1] - ii[0];
531: ii++;
532: sum1 = 0.0;
533: sum2 = 0.0;
534: sum3 = 0.0;
535: sum4 = 0.0;
536: sum5 = 0.0;
537: sum6 = 0.0;
539: nonzerorow += (n>0);
540: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
541: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
542: for (j=0; j<n; j++) {
543: xb = x + 6*(*idx++);
544: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
545: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
546: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
547: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
548: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
549: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
550: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
551: v += 36;
552: }
553: if (usecprow) z = zarray + 6*ridx[i];
554: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
555: if (!usecprow) z += 6;
556: }
558: VecRestoreArrayRead(xx,&x);
559: VecRestoreArray(zz,&zarray);
560: PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);
561: return(0);
562: }
566: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
567: {
568: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
569: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
570: const PetscScalar *x,*xb;
571: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
572: const MatScalar *v;
573: PetscErrorCode ierr;
574: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL,nonzerorow=0;
575: PetscBool usecprow=a->compressedrow.use;
578: VecGetArrayRead(xx,&x);
579: VecGetArray(zz,&zarray);
581: idx = a->j;
582: v = a->a;
583: if (usecprow) {
584: mbs = a->compressedrow.nrows;
585: ii = a->compressedrow.i;
586: ridx = a->compressedrow.rindex;
587: } else {
588: mbs = a->mbs;
589: ii = a->i;
590: z = zarray;
591: }
593: for (i=0; i<mbs; i++) {
594: n = ii[1] - ii[0];
595: ii++;
596: sum1 = 0.0;
597: sum2 = 0.0;
598: sum3 = 0.0;
599: sum4 = 0.0;
600: sum5 = 0.0;
601: sum6 = 0.0;
602: sum7 = 0.0;
604: nonzerorow += (n>0);
605: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
606: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
607: for (j=0; j<n; j++) {
608: xb = x + 7*(*idx++);
609: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
610: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
611: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
612: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
613: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
614: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
615: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
616: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
617: v += 49;
618: }
619: if (usecprow) z = zarray + 7*ridx[i];
620: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
621: if (!usecprow) z += 7;
622: }
624: VecRestoreArrayRead(xx,&x);
625: VecRestoreArray(zz,&zarray);
626: PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);
627: return(0);
628: }
630: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
631: /* Default MatMult for block size 15 */
635: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
636: {
637: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
638: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
639: const PetscScalar *x,*xb;
640: PetscScalar *zarray,xv;
641: const MatScalar *v;
642: PetscErrorCode ierr;
643: const PetscInt *ii,*ij=a->j,*idx;
644: PetscInt mbs,i,j,k,n,*ridx=NULL,nonzerorow=0;
645: PetscBool usecprow=a->compressedrow.use;
648: VecGetArrayRead(xx,&x);
649: VecGetArray(zz,&zarray);
651: v = a->a;
652: if (usecprow) {
653: mbs = a->compressedrow.nrows;
654: ii = a->compressedrow.i;
655: ridx = a->compressedrow.rindex;
656: } else {
657: mbs = a->mbs;
658: ii = a->i;
659: z = zarray;
660: }
662: for (i=0; i<mbs; i++) {
663: n = ii[i+1] - ii[i];
664: idx = ij + ii[i];
665: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
666: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
668: nonzerorow += (n>0);
669: for (j=0; j<n; j++) {
670: xb = x + 15*(idx[j]);
672: for (k=0; k<15; k++) {
673: xv = xb[k];
674: sum1 += v[0]*xv;
675: sum2 += v[1]*xv;
676: sum3 += v[2]*xv;
677: sum4 += v[3]*xv;
678: sum5 += v[4]*xv;
679: sum6 += v[5]*xv;
680: sum7 += v[6]*xv;
681: sum8 += v[7]*xv;
682: sum9 += v[8]*xv;
683: sum10 += v[9]*xv;
684: sum11 += v[10]*xv;
685: sum12 += v[11]*xv;
686: sum13 += v[12]*xv;
687: sum14 += v[13]*xv;
688: sum15 += v[14]*xv;
689: v += 15;
690: }
691: }
692: if (usecprow) z = zarray + 15*ridx[i];
693: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
694: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
696: if (!usecprow) z += 15;
697: }
699: VecRestoreArrayRead(xx,&x);
700: VecRestoreArray(zz,&zarray);
701: PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
702: return(0);
703: }
705: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
708: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
709: {
710: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
711: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
712: const PetscScalar *x,*xb;
713: PetscScalar x1,x2,x3,x4,*zarray;
714: const MatScalar *v;
715: PetscErrorCode ierr;
716: const PetscInt *ii,*ij=a->j,*idx;
717: PetscInt mbs,i,j,n,*ridx=NULL,nonzerorow=0;
718: PetscBool usecprow=a->compressedrow.use;
721: VecGetArrayRead(xx,&x);
722: VecGetArray(zz,&zarray);
724: v = a->a;
725: if (usecprow) {
726: mbs = a->compressedrow.nrows;
727: ii = a->compressedrow.i;
728: ridx = a->compressedrow.rindex;
729: } else {
730: mbs = a->mbs;
731: ii = a->i;
732: z = zarray;
733: }
735: for (i=0; i<mbs; i++) {
736: n = ii[i+1] - ii[i];
737: idx = ij + ii[i];
738: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
739: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
741: nonzerorow += (n>0);
742: for (j=0; j<n; j++) {
743: xb = x + 15*(idx[j]);
744: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
746: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
747: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
748: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
749: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
750: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
751: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
752: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
753: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
754: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
755: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
756: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
757: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
758: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
759: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
760: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
762: v += 60;
764: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
766: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
767: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
768: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
769: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
770: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
771: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
772: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
773: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
774: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
775: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
776: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
777: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
778: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
779: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
780: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
781: v += 60;
783: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
784: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
785: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
786: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
787: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
788: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
789: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
790: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
791: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
792: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
793: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
794: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
795: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
796: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
797: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
798: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
799: v += 60;
801: x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
802: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3;
803: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3;
804: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3;
805: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3;
806: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3;
807: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3;
808: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3;
809: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3;
810: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3;
811: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
812: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
813: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
814: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
815: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
816: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
817: v += 45;
818: }
819: if (usecprow) z = zarray + 15*ridx[i];
820: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
821: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
823: if (!usecprow) z += 15;
824: }
826: VecRestoreArrayRead(xx,&x);
827: VecRestoreArray(zz,&zarray);
828: PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
829: return(0);
830: }
832: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
835: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
836: {
837: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
838: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
839: const PetscScalar *x,*xb;
840: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
841: const MatScalar *v;
842: PetscErrorCode ierr;
843: const PetscInt *ii,*ij=a->j,*idx;
844: PetscInt mbs,i,j,n,*ridx=NULL,nonzerorow=0;
845: PetscBool usecprow=a->compressedrow.use;
848: VecGetArrayRead(xx,&x);
849: VecGetArray(zz,&zarray);
851: v = a->a;
852: if (usecprow) {
853: mbs = a->compressedrow.nrows;
854: ii = a->compressedrow.i;
855: ridx = a->compressedrow.rindex;
856: } else {
857: mbs = a->mbs;
858: ii = a->i;
859: z = zarray;
860: }
862: for (i=0; i<mbs; i++) {
863: n = ii[i+1] - ii[i];
864: idx = ij + ii[i];
865: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
866: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
868: nonzerorow += (n>0);
869: for (j=0; j<n; j++) {
870: xb = x + 15*(idx[j]);
871: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
872: x8 = xb[7];
874: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
875: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
876: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
877: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
878: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
879: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
880: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
881: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
882: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
883: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
884: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
885: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
886: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
887: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
888: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
889: v += 120;
891: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
893: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
894: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
895: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
896: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
897: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
898: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
899: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
900: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
901: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
902: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
903: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
904: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
905: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
906: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
907: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
908: v += 105;
909: }
910: if (usecprow) z = zarray + 15*ridx[i];
911: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
912: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
914: if (!usecprow) z += 15;
915: }
917: VecRestoreArrayRead(xx,&x);
918: VecRestoreArray(zz,&zarray);
919: PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
920: return(0);
921: }
923: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
927: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
928: {
929: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
930: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
931: const PetscScalar *x,*xb;
932: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
933: const MatScalar *v;
934: PetscErrorCode ierr;
935: const PetscInt *ii,*ij=a->j,*idx;
936: PetscInt mbs,i,j,n,*ridx=NULL,nonzerorow=0;
937: PetscBool usecprow=a->compressedrow.use;
940: VecGetArrayRead(xx,&x);
941: VecGetArray(zz,&zarray);
943: v = a->a;
944: if (usecprow) {
945: mbs = a->compressedrow.nrows;
946: ii = a->compressedrow.i;
947: ridx = a->compressedrow.rindex;
948: } else {
949: mbs = a->mbs;
950: ii = a->i;
951: z = zarray;
952: }
954: for (i=0; i<mbs; i++) {
955: n = ii[i+1] - ii[i];
956: idx = ij + ii[i];
957: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
958: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
960: nonzerorow += (n>0);
961: for (j=0; j<n; j++) {
962: xb = x + 15*(idx[j]);
963: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
964: x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
966: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
967: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
968: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
969: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
970: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
971: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
972: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
973: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
974: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
975: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
976: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
977: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
978: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
979: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
980: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
981: v += 225;
982: }
983: if (usecprow) z = zarray + 15*ridx[i];
984: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
985: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
987: if (!usecprow) z += 15;
988: }
990: VecRestoreArrayRead(xx,&x);
991: VecRestoreArray(zz,&zarray);
992: PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
993: return(0);
994: }
997: /*
998: This will not work with MatScalar == float because it calls the BLAS
999: */
1002: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1003: {
1004: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1005: PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray;
1006: MatScalar *v;
1008: PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1009: PetscInt ncols,k,*ridx=NULL,nonzerorow=0;
1010: PetscBool usecprow=a->compressedrow.use;
1013: VecGetArray(xx,&x);
1014: VecGetArray(zz,&zarray);
1016: idx = a->j;
1017: v = a->a;
1018: if (usecprow) {
1019: mbs = a->compressedrow.nrows;
1020: ii = a->compressedrow.i;
1021: ridx = a->compressedrow.rindex;
1022: } else {
1023: mbs = a->mbs;
1024: ii = a->i;
1025: z = zarray;
1026: }
1028: if (!a->mult_work) {
1029: k = PetscMax(A->rmap->n,A->cmap->n);
1030: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1031: }
1032: work = a->mult_work;
1033: for (i=0; i<mbs; i++) {
1034: n = ii[1] - ii[0]; ii++;
1035: ncols = n*bs;
1036: workt = work;
1037: nonzerorow += (n>0);
1038: for (j=0; j<n; j++) {
1039: xb = x + bs*(*idx++);
1040: for (k=0; k<bs; k++) workt[k] = xb[k];
1041: workt += bs;
1042: }
1043: if (usecprow) z = zarray + bs*ridx[i];
1044: PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1045: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1046: v += n*bs2;
1047: if (!usecprow) z += bs;
1048: }
1049: VecRestoreArray(xx,&x);
1050: VecRestoreArray(zz,&zarray);
1051: PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);
1052: return(0);
1053: }
1057: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1058: {
1059: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1060: const PetscScalar *x;
1061: PetscScalar *y,*z,sum;
1062: const MatScalar *v;
1063: PetscErrorCode ierr;
1064: PetscInt mbs=a->mbs,i,n,*ridx=NULL,nonzerorow=0;
1065: const PetscInt *idx,*ii;
1066: PetscBool usecprow=a->compressedrow.use;
1069: VecGetArrayRead(xx,&x);
1070: VecGetArray(yy,&y);
1071: if (zz != yy) {
1072: VecGetArray(zz,&z);
1073: } else {
1074: z = y;
1075: }
1077: idx = a->j;
1078: v = a->a;
1079: if (usecprow) {
1080: if (zz != yy) {
1081: PetscMemcpy(z,y,mbs*sizeof(PetscScalar));
1082: }
1083: mbs = a->compressedrow.nrows;
1084: ii = a->compressedrow.i;
1085: ridx = a->compressedrow.rindex;
1086: } else {
1087: ii = a->i;
1088: }
1090: for (i=0; i<mbs; i++) {
1091: n = ii[1] - ii[0];
1092: ii++;
1093: if (!usecprow) {
1094: nonzerorow += (n>0);
1095: sum = y[i];
1096: } else {
1097: sum = y[ridx[i]];
1098: }
1099: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1100: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1101: PetscSparseDensePlusDot(sum,x,v,idx,n);
1102: v += n;
1103: idx += n;
1104: if (usecprow) {
1105: z[ridx[i]] = sum;
1106: } else {
1107: z[i] = sum;
1108: }
1109: }
1110: VecRestoreArrayRead(xx,&x);
1111: VecRestoreArray(yy,&y);
1112: if (zz != yy) {
1113: VecRestoreArray(zz,&z);
1114: }
1115: PetscLogFlops(2.0*a->nz - nonzerorow);
1116: return(0);
1117: }
1121: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1122: {
1123: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1124: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2;
1125: PetscScalar x1,x2,*yarray,*zarray;
1126: MatScalar *v;
1128: PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1129: PetscBool usecprow=a->compressedrow.use;
1132: VecGetArray(xx,&x);
1133: VecGetArray(yy,&yarray);
1134: if (zz != yy) {
1135: VecGetArray(zz,&zarray);
1136: } else {
1137: zarray = yarray;
1138: }
1140: idx = a->j;
1141: v = a->a;
1142: if (usecprow) {
1143: if (zz != yy) {
1144: PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
1145: }
1146: mbs = a->compressedrow.nrows;
1147: ii = a->compressedrow.i;
1148: ridx = a->compressedrow.rindex;
1149: if (zz != yy) {
1150: PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));
1151: }
1152: } else {
1153: ii = a->i;
1154: y = yarray;
1155: z = zarray;
1156: }
1158: for (i=0; i<mbs; i++) {
1159: n = ii[1] - ii[0]; ii++;
1160: if (usecprow) {
1161: z = zarray + 2*ridx[i];
1162: y = yarray + 2*ridx[i];
1163: }
1164: sum1 = y[0]; sum2 = y[1];
1165: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1166: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1167: for (j=0; j<n; j++) {
1168: xb = x + 2*(*idx++);
1169: x1 = xb[0];
1170: x2 = xb[1];
1172: sum1 += v[0]*x1 + v[2]*x2;
1173: sum2 += v[1]*x1 + v[3]*x2;
1174: v += 4;
1175: }
1176: z[0] = sum1; z[1] = sum2;
1177: if (!usecprow) {
1178: z += 2; y += 2;
1179: }
1180: }
1181: VecRestoreArray(xx,&x);
1182: VecRestoreArray(yy,&yarray);
1183: if (zz != yy) {
1184: VecRestoreArray(zz,&zarray);
1185: }
1186: PetscLogFlops(4.0*a->nz);
1187: return(0);
1188: }
1192: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1193: {
1194: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1195: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1196: MatScalar *v;
1198: PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1199: PetscBool usecprow=a->compressedrow.use;
1202: VecGetArray(xx,&x);
1203: VecGetArray(yy,&yarray);
1204: if (zz != yy) {
1205: VecGetArray(zz,&zarray);
1206: } else {
1207: zarray = yarray;
1208: }
1210: idx = a->j;
1211: v = a->a;
1212: if (usecprow) {
1213: if (zz != yy) {
1214: PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
1215: }
1216: mbs = a->compressedrow.nrows;
1217: ii = a->compressedrow.i;
1218: ridx = a->compressedrow.rindex;
1219: } else {
1220: ii = a->i;
1221: y = yarray;
1222: z = zarray;
1223: }
1225: for (i=0; i<mbs; i++) {
1226: n = ii[1] - ii[0]; ii++;
1227: if (usecprow) {
1228: z = zarray + 3*ridx[i];
1229: y = yarray + 3*ridx[i];
1230: }
1231: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1232: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1233: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1234: for (j=0; j<n; j++) {
1235: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1236: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1237: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1238: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1239: v += 9;
1240: }
1241: z[0] = sum1; z[1] = sum2; z[2] = sum3;
1242: if (!usecprow) {
1243: z += 3; y += 3;
1244: }
1245: }
1246: VecRestoreArray(xx,&x);
1247: VecRestoreArray(yy,&yarray);
1248: if (zz != yy) {
1249: VecRestoreArray(zz,&zarray);
1250: }
1251: PetscLogFlops(18.0*a->nz);
1252: return(0);
1253: }
1257: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1258: {
1259: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1260: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1261: MatScalar *v;
1263: PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1264: PetscBool usecprow=a->compressedrow.use;
1267: VecGetArray(xx,&x);
1268: VecGetArray(yy,&yarray);
1269: if (zz != yy) {
1270: VecGetArray(zz,&zarray);
1271: } else {
1272: zarray = yarray;
1273: }
1275: idx = a->j;
1276: v = a->a;
1277: if (usecprow) {
1278: if (zz != yy) {
1279: PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
1280: }
1281: mbs = a->compressedrow.nrows;
1282: ii = a->compressedrow.i;
1283: ridx = a->compressedrow.rindex;
1284: } else {
1285: ii = a->i;
1286: y = yarray;
1287: z = zarray;
1288: }
1290: for (i=0; i<mbs; i++) {
1291: n = ii[1] - ii[0]; ii++;
1292: if (usecprow) {
1293: z = zarray + 4*ridx[i];
1294: y = yarray + 4*ridx[i];
1295: }
1296: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1297: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1298: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1299: for (j=0; j<n; j++) {
1300: xb = x + 4*(*idx++);
1301: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1302: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1303: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1304: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1305: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1306: v += 16;
1307: }
1308: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1309: if (!usecprow) {
1310: z += 4; y += 4;
1311: }
1312: }
1313: VecRestoreArray(xx,&x);
1314: VecRestoreArray(yy,&yarray);
1315: if (zz != yy) {
1316: VecRestoreArray(zz,&zarray);
1317: }
1318: PetscLogFlops(32.0*a->nz);
1319: return(0);
1320: }
1324: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1325: {
1326: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1327: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1328: PetscScalar *yarray,*zarray;
1329: MatScalar *v;
1331: PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1332: PetscBool usecprow=a->compressedrow.use;
1335: VecGetArray(xx,&x);
1336: VecGetArray(yy,&yarray);
1337: if (zz != yy) {
1338: VecGetArray(zz,&zarray);
1339: } else {
1340: zarray = yarray;
1341: }
1343: idx = a->j;
1344: v = a->a;
1345: if (usecprow) {
1346: if (zz != yy) {
1347: PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
1348: }
1349: mbs = a->compressedrow.nrows;
1350: ii = a->compressedrow.i;
1351: ridx = a->compressedrow.rindex;
1352: } else {
1353: ii = a->i;
1354: y = yarray;
1355: z = zarray;
1356: }
1358: for (i=0; i<mbs; i++) {
1359: n = ii[1] - ii[0]; ii++;
1360: if (usecprow) {
1361: z = zarray + 5*ridx[i];
1362: y = yarray + 5*ridx[i];
1363: }
1364: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1365: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1366: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1367: for (j=0; j<n; j++) {
1368: xb = x + 5*(*idx++);
1369: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1370: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1371: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1372: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1373: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1374: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1375: v += 25;
1376: }
1377: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1378: if (!usecprow) {
1379: z += 5; y += 5;
1380: }
1381: }
1382: VecRestoreArray(xx,&x);
1383: VecRestoreArray(yy,&yarray);
1384: if (zz != yy) {
1385: VecRestoreArray(zz,&zarray);
1386: }
1387: PetscLogFlops(50.0*a->nz);
1388: return(0);
1389: }
1392: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1393: {
1394: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1395: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
1396: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1397: MatScalar *v;
1399: PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1400: PetscBool usecprow=a->compressedrow.use;
1403: VecGetArray(xx,&x);
1404: VecGetArray(yy,&yarray);
1405: if (zz != yy) {
1406: VecGetArray(zz,&zarray);
1407: } else {
1408: zarray = yarray;
1409: }
1411: idx = a->j;
1412: v = a->a;
1413: if (usecprow) {
1414: if (zz != yy) {
1415: PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
1416: }
1417: mbs = a->compressedrow.nrows;
1418: ii = a->compressedrow.i;
1419: ridx = a->compressedrow.rindex;
1420: } else {
1421: ii = a->i;
1422: y = yarray;
1423: z = zarray;
1424: }
1426: for (i=0; i<mbs; i++) {
1427: n = ii[1] - ii[0]; ii++;
1428: if (usecprow) {
1429: z = zarray + 6*ridx[i];
1430: y = yarray + 6*ridx[i];
1431: }
1432: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1433: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1434: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1435: for (j=0; j<n; j++) {
1436: xb = x + 6*(*idx++);
1437: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1438: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1439: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1440: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1441: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1442: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1443: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1444: v += 36;
1445: }
1446: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1447: if (!usecprow) {
1448: z += 6; y += 6;
1449: }
1450: }
1451: VecRestoreArray(xx,&x);
1452: VecRestoreArray(yy,&yarray);
1453: if (zz != yy) {
1454: VecRestoreArray(zz,&zarray);
1455: }
1456: PetscLogFlops(72.0*a->nz);
1457: return(0);
1458: }
1462: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1463: {
1464: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1465: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1466: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1467: MatScalar *v;
1469: PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1470: PetscBool usecprow=a->compressedrow.use;
1473: VecGetArray(xx,&x);
1474: VecGetArray(yy,&yarray);
1475: if (zz != yy) {
1476: VecGetArray(zz,&zarray);
1477: } else {
1478: zarray = yarray;
1479: }
1481: idx = a->j;
1482: v = a->a;
1483: if (usecprow) {
1484: if (zz != yy) {
1485: PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1486: }
1487: mbs = a->compressedrow.nrows;
1488: ii = a->compressedrow.i;
1489: ridx = a->compressedrow.rindex;
1490: } else {
1491: ii = a->i;
1492: y = yarray;
1493: z = zarray;
1494: }
1496: for (i=0; i<mbs; i++) {
1497: n = ii[1] - ii[0]; ii++;
1498: if (usecprow) {
1499: z = zarray + 7*ridx[i];
1500: y = yarray + 7*ridx[i];
1501: }
1502: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1503: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1504: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1505: for (j=0; j<n; j++) {
1506: xb = x + 7*(*idx++);
1507: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1508: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1509: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1510: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1511: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1512: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1513: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1514: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1515: v += 49;
1516: }
1517: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1518: if (!usecprow) {
1519: z += 7; y += 7;
1520: }
1521: }
1522: VecRestoreArray(xx,&x);
1523: VecRestoreArray(yy,&yarray);
1524: if (zz != yy) {
1525: VecRestoreArray(zz,&zarray);
1526: }
1527: PetscLogFlops(98.0*a->nz);
1528: return(0);
1529: }
1533: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1534: {
1535: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1536: PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray;
1537: MatScalar *v;
1539: PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1540: PetscInt ncols,k,*ridx=NULL;
1541: PetscBool usecprow=a->compressedrow.use;
1544: VecCopy(yy,zz);
1545: VecGetArray(xx,&x);
1546: VecGetArray(zz,&zarray);
1548: idx = a->j;
1549: v = a->a;
1550: if (usecprow) {
1551: mbs = a->compressedrow.nrows;
1552: ii = a->compressedrow.i;
1553: ridx = a->compressedrow.rindex;
1554: } else {
1555: mbs = a->mbs;
1556: ii = a->i;
1557: z = zarray;
1558: }
1560: if (!a->mult_work) {
1561: k = PetscMax(A->rmap->n,A->cmap->n);
1562: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1563: }
1564: work = a->mult_work;
1565: for (i=0; i<mbs; i++) {
1566: n = ii[1] - ii[0]; ii++;
1567: ncols = n*bs;
1568: workt = work;
1569: for (j=0; j<n; j++) {
1570: xb = x + bs*(*idx++);
1571: for (k=0; k<bs; k++) workt[k] = xb[k];
1572: workt += bs;
1573: }
1574: if (usecprow) z = zarray + bs*ridx[i];
1575: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1576: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1577: v += n*bs2;
1578: if (!usecprow) z += bs;
1579: }
1580: VecRestoreArray(xx,&x);
1581: VecRestoreArray(zz,&zarray);
1582: PetscLogFlops(2.0*a->nz*bs2);
1583: return(0);
1584: }
1588: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1589: {
1590: PetscScalar zero = 0.0;
1594: VecSet(zz,zero);
1595: MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1596: return(0);
1597: }
1601: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1602: {
1603: PetscScalar zero = 0.0;
1607: VecSet(zz,zero);
1608: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1609: return(0);
1610: }
1614: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1615: {
1616: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1617: PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1618: MatScalar *v;
1619: PetscErrorCode ierr;
1620: PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL;
1621: Mat_CompressedRow cprow = a->compressedrow;
1622: PetscBool usecprow=cprow.use;
1625: if (yy != zz) { VecCopy(yy,zz); }
1626: VecGetArray(xx,&x);
1627: VecGetArray(zz,&z);
1629: idx = a->j;
1630: v = a->a;
1631: if (usecprow) {
1632: mbs = cprow.nrows;
1633: ii = cprow.i;
1634: ridx = cprow.rindex;
1635: } else {
1636: mbs=a->mbs;
1637: ii = a->i;
1638: xb = x;
1639: }
1641: switch (bs) {
1642: case 1:
1643: for (i=0; i<mbs; i++) {
1644: if (usecprow) xb = x + ridx[i];
1645: x1 = xb[0];
1646: ib = idx + ii[0];
1647: n = ii[1] - ii[0]; ii++;
1648: for (j=0; j<n; j++) {
1649: rval = ib[j];
1650: z[rval] += PetscConj(*v) * x1;
1651: v++;
1652: }
1653: if (!usecprow) xb++;
1654: }
1655: break;
1656: case 2:
1657: for (i=0; i<mbs; i++) {
1658: if (usecprow) xb = x + 2*ridx[i];
1659: x1 = xb[0]; x2 = xb[1];
1660: ib = idx + ii[0];
1661: n = ii[1] - ii[0]; ii++;
1662: for (j=0; j<n; j++) {
1663: rval = ib[j]*2;
1664: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1665: z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1666: v += 4;
1667: }
1668: if (!usecprow) xb += 2;
1669: }
1670: break;
1671: case 3:
1672: for (i=0; i<mbs; i++) {
1673: if (usecprow) xb = x + 3*ridx[i];
1674: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1675: ib = idx + ii[0];
1676: n = ii[1] - ii[0]; ii++;
1677: for (j=0; j<n; j++) {
1678: rval = ib[j]*3;
1679: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1680: z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1681: z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1682: v += 9;
1683: }
1684: if (!usecprow) xb += 3;
1685: }
1686: break;
1687: case 4:
1688: for (i=0; i<mbs; i++) {
1689: if (usecprow) xb = x + 4*ridx[i];
1690: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1691: ib = idx + ii[0];
1692: n = ii[1] - ii[0]; ii++;
1693: for (j=0; j<n; j++) {
1694: rval = ib[j]*4;
1695: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4;
1696: z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4;
1697: z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1698: z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1699: v += 16;
1700: }
1701: if (!usecprow) xb += 4;
1702: }
1703: break;
1704: case 5:
1705: for (i=0; i<mbs; i++) {
1706: if (usecprow) xb = x + 5*ridx[i];
1707: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1708: x4 = xb[3]; x5 = xb[4];
1709: ib = idx + ii[0];
1710: n = ii[1] - ii[0]; ii++;
1711: for (j=0; j<n; j++) {
1712: rval = ib[j]*5;
1713: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5;
1714: z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5;
1715: z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1716: z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1717: z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1718: v += 25;
1719: }
1720: if (!usecprow) xb += 5;
1721: }
1722: break;
1723: default: { /* block sizes larger than 5 by 5 are handled by BLAS */
1724: PetscInt ncols,k;
1725: PetscScalar *work,*workt,*xtmp;
1727: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1728: if (!a->mult_work) {
1729: k = PetscMax(A->rmap->n,A->cmap->n);
1730: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1731: }
1732: work = a->mult_work;
1733: xtmp = x;
1734: for (i=0; i<mbs; i++) {
1735: n = ii[1] - ii[0]; ii++;
1736: ncols = n*bs;
1737: PetscMemzero(work,ncols*sizeof(PetscScalar));
1738: if (usecprow) xtmp = x + bs*ridx[i];
1739: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1740: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1741: v += n*bs2;
1742: if (!usecprow) xtmp += bs;
1743: workt = work;
1744: for (j=0; j<n; j++) {
1745: zb = z + bs*(*idx++);
1746: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1747: workt += bs;
1748: }
1749: }
1750: }
1751: }
1752: VecRestoreArray(xx,&x);
1753: VecRestoreArray(zz,&z);
1754: PetscLogFlops(2.0*a->nz*a->bs2);
1755: return(0);
1756: }
1760: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1761: {
1762: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1763: PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1764: MatScalar *v;
1765: PetscErrorCode ierr;
1766: PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL;
1767: Mat_CompressedRow cprow = a->compressedrow;
1768: PetscBool usecprow=cprow.use;
1771: if (yy != zz) { VecCopy(yy,zz); }
1772: VecGetArray(xx,&x);
1773: VecGetArray(zz,&z);
1775: idx = a->j;
1776: v = a->a;
1777: if (usecprow) {
1778: mbs = cprow.nrows;
1779: ii = cprow.i;
1780: ridx = cprow.rindex;
1781: } else {
1782: mbs=a->mbs;
1783: ii = a->i;
1784: xb = x;
1785: }
1787: switch (bs) {
1788: case 1:
1789: for (i=0; i<mbs; i++) {
1790: if (usecprow) xb = x + ridx[i];
1791: x1 = xb[0];
1792: ib = idx + ii[0];
1793: n = ii[1] - ii[0]; ii++;
1794: for (j=0; j<n; j++) {
1795: rval = ib[j];
1796: z[rval] += *v * x1;
1797: v++;
1798: }
1799: if (!usecprow) xb++;
1800: }
1801: break;
1802: case 2:
1803: for (i=0; i<mbs; i++) {
1804: if (usecprow) xb = x + 2*ridx[i];
1805: x1 = xb[0]; x2 = xb[1];
1806: ib = idx + ii[0];
1807: n = ii[1] - ii[0]; ii++;
1808: for (j=0; j<n; j++) {
1809: rval = ib[j]*2;
1810: z[rval++] += v[0]*x1 + v[1]*x2;
1811: z[rval++] += v[2]*x1 + v[3]*x2;
1812: v += 4;
1813: }
1814: if (!usecprow) xb += 2;
1815: }
1816: break;
1817: case 3:
1818: for (i=0; i<mbs; i++) {
1819: if (usecprow) xb = x + 3*ridx[i];
1820: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1821: ib = idx + ii[0];
1822: n = ii[1] - ii[0]; ii++;
1823: for (j=0; j<n; j++) {
1824: rval = ib[j]*3;
1825: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1826: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1827: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1828: v += 9;
1829: }
1830: if (!usecprow) xb += 3;
1831: }
1832: break;
1833: case 4:
1834: for (i=0; i<mbs; i++) {
1835: if (usecprow) xb = x + 4*ridx[i];
1836: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1837: ib = idx + ii[0];
1838: n = ii[1] - ii[0]; ii++;
1839: for (j=0; j<n; j++) {
1840: rval = ib[j]*4;
1841: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1842: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1843: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1844: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1845: v += 16;
1846: }
1847: if (!usecprow) xb += 4;
1848: }
1849: break;
1850: case 5:
1851: for (i=0; i<mbs; i++) {
1852: if (usecprow) xb = x + 5*ridx[i];
1853: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1854: x4 = xb[3]; x5 = xb[4];
1855: ib = idx + ii[0];
1856: n = ii[1] - ii[0]; ii++;
1857: for (j=0; j<n; j++) {
1858: rval = ib[j]*5;
1859: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1860: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1861: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1862: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1863: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1864: v += 25;
1865: }
1866: if (!usecprow) xb += 5;
1867: }
1868: break;
1869: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
1870: PetscInt ncols,k;
1871: PetscScalar *work,*workt,*xtmp;
1873: if (!a->mult_work) {
1874: k = PetscMax(A->rmap->n,A->cmap->n);
1875: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1876: }
1877: work = a->mult_work;
1878: xtmp = x;
1879: for (i=0; i<mbs; i++) {
1880: n = ii[1] - ii[0]; ii++;
1881: ncols = n*bs;
1882: PetscMemzero(work,ncols*sizeof(PetscScalar));
1883: if (usecprow) xtmp = x + bs*ridx[i];
1884: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1885: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1886: v += n*bs2;
1887: if (!usecprow) xtmp += bs;
1888: workt = work;
1889: for (j=0; j<n; j++) {
1890: zb = z + bs*(*idx++);
1891: for (k=0; k<bs; k++) zb[k] += workt[k];
1892: workt += bs;
1893: }
1894: }
1895: }
1896: }
1897: VecRestoreArray(xx,&x);
1898: VecRestoreArray(zz,&z);
1899: PetscLogFlops(2.0*a->nz*a->bs2);
1900: return(0);
1901: }
1905: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1906: {
1907: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1908: PetscInt totalnz = a->bs2*a->nz;
1909: PetscScalar oalpha = alpha;
1911: PetscBLASInt one = 1,tnz;
1914: PetscBLASIntCast(totalnz,&tnz);
1915: PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
1916: PetscLogFlops(totalnz);
1917: return(0);
1918: }
1922: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1923: {
1925: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1926: MatScalar *v = a->a;
1927: PetscReal sum = 0.0;
1928: PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1931: if (type == NORM_FROBENIUS) {
1932: for (i=0; i< bs2*nz; i++) {
1933: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1934: }
1935: *norm = PetscSqrtReal(sum);
1936: } else if (type == NORM_1) { /* maximum column sum */
1937: PetscReal *tmp;
1938: PetscInt *bcol = a->j;
1939: PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);
1940: PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));
1941: for (i=0; i<nz; i++) {
1942: for (j=0; j<bs; j++) {
1943: k1 = bs*(*bcol) + j; /* column index */
1944: for (k=0; k<bs; k++) {
1945: tmp[k1] += PetscAbsScalar(*v); v++;
1946: }
1947: }
1948: bcol++;
1949: }
1950: *norm = 0.0;
1951: for (j=0; j<A->cmap->n; j++) {
1952: if (tmp[j] > *norm) *norm = tmp[j];
1953: }
1954: PetscFree(tmp);
1955: } else if (type == NORM_INFINITY) { /* maximum row sum */
1956: *norm = 0.0;
1957: for (k=0; k<bs; k++) {
1958: for (j=0; j<a->mbs; j++) {
1959: v = a->a + bs2*a->i[j] + k;
1960: sum = 0.0;
1961: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1962: for (k1=0; k1<bs; k1++) {
1963: sum += PetscAbsScalar(*v);
1964: v += bs;
1965: }
1966: }
1967: if (sum > *norm) *norm = sum;
1968: }
1969: }
1970: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1971: return(0);
1972: }
1977: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
1978: {
1979: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
1983: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1984: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1985: *flg = PETSC_FALSE;
1986: return(0);
1987: }
1989: /* if the a->i are the same */
1990: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1991: if (!*flg) return(0);
1993: /* if a->j are the same */
1994: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1995: if (!*flg) return(0);
1997: /* if a->a are the same */
1998: PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);
1999: return(0);
2001: }
2005: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2006: {
2007: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2009: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2010: PetscScalar *x,zero = 0.0;
2011: MatScalar *aa,*aa_j;
2014: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2015: bs = A->rmap->bs;
2016: aa = a->a;
2017: ai = a->i;
2018: aj = a->j;
2019: ambs = a->mbs;
2020: bs2 = a->bs2;
2022: VecSet(v,zero);
2023: VecGetArray(v,&x);
2024: VecGetLocalSize(v,&n);
2025: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2026: for (i=0; i<ambs; i++) {
2027: for (j=ai[i]; j<ai[i+1]; j++) {
2028: if (aj[j] == i) {
2029: row = i*bs;
2030: aa_j = aa+j*bs2;
2031: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2032: break;
2033: }
2034: }
2035: }
2036: VecRestoreArray(v,&x);
2037: return(0);
2038: }
2042: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2043: {
2044: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2045: const PetscScalar *l,*r,*li,*ri;
2046: PetscScalar x;
2047: MatScalar *aa, *v;
2048: PetscErrorCode ierr;
2049: PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2050: const PetscInt *ai,*aj;
2053: ai = a->i;
2054: aj = a->j;
2055: aa = a->a;
2056: m = A->rmap->n;
2057: n = A->cmap->n;
2058: bs = A->rmap->bs;
2059: mbs = a->mbs;
2060: bs2 = a->bs2;
2061: if (ll) {
2062: VecGetArrayRead(ll,&l);
2063: VecGetLocalSize(ll,&lm);
2064: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2065: for (i=0; i<mbs; i++) { /* for each block row */
2066: M = ai[i+1] - ai[i];
2067: li = l + i*bs;
2068: v = aa + bs2*ai[i];
2069: for (j=0; j<M; j++) { /* for each block */
2070: for (k=0; k<bs2; k++) {
2071: (*v++) *= li[k%bs];
2072: }
2073: }
2074: }
2075: VecRestoreArrayRead(ll,&l);
2076: PetscLogFlops(a->nz);
2077: }
2079: if (rr) {
2080: VecGetArrayRead(rr,&r);
2081: VecGetLocalSize(rr,&rn);
2082: if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2083: for (i=0; i<mbs; i++) { /* for each block row */
2084: iai = ai[i];
2085: M = ai[i+1] - iai;
2086: v = aa + bs2*iai;
2087: for (j=0; j<M; j++) { /* for each block */
2088: ri = r + bs*aj[iai+j];
2089: for (k=0; k<bs; k++) {
2090: x = ri[k];
2091: for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2092: v += bs;
2093: }
2094: }
2095: }
2096: VecRestoreArrayRead(rr,&r);
2097: PetscLogFlops(a->nz);
2098: }
2099: return(0);
2100: }
2105: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2106: {
2107: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2110: info->block_size = a->bs2;
2111: info->nz_allocated = a->bs2*a->maxnz;
2112: info->nz_used = a->bs2*a->nz;
2113: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
2114: info->assemblies = A->num_ass;
2115: info->mallocs = A->info.mallocs;
2116: info->memory = ((PetscObject)A)->mem;
2117: if (A->factortype) {
2118: info->fill_ratio_given = A->info.fill_ratio_given;
2119: info->fill_ratio_needed = A->info.fill_ratio_needed;
2120: info->factor_mallocs = A->info.factor_mallocs;
2121: } else {
2122: info->fill_ratio_given = 0;
2123: info->fill_ratio_needed = 0;
2124: info->factor_mallocs = 0;
2125: }
2126: return(0);
2127: }
2132: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2133: {
2134: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2138: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
2139: return(0);
2140: }