Actual source code: sbaij2.c
petsc-3.7.3 2016-08-01
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <petsc/private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <../src/mat/impls/sbaij/seq/sbaij.h>
6: #include <petscblaslapack.h>
10: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11: {
12: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
14: PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
15: const PetscInt *idx;
16: PetscBT table_out,table_in;
19: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20: mbs = a->mbs;
21: ai = a->i;
22: aj = a->j;
23: bs = A->rmap->bs;
24: PetscBTCreate(mbs,&table_out);
25: PetscMalloc1(mbs+1,&nidx);
26: PetscMalloc1(A->rmap->N+1,&nidx2);
27: PetscBTCreate(mbs,&table_in);
29: for (i=0; i<is_max; i++) { /* for each is */
30: isz = 0;
31: PetscBTMemzero(mbs,table_out);
33: /* Extract the indices, assume there can be duplicate entries */
34: ISGetIndices(is[i],&idx);
35: ISGetLocalSize(is[i],&n);
37: /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
38: bcol_max = 0;
39: for (j=0; j<n; ++j) {
40: brow = idx[j]/bs; /* convert the indices into block indices */
41: if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42: if (!PetscBTLookupSet(table_out,brow)) {
43: nidx[isz++] = brow;
44: if (bcol_max < brow) bcol_max = brow;
45: }
46: }
47: ISRestoreIndices(is[i],&idx);
48: ISDestroy(&is[i]);
50: k = 0;
51: for (j=0; j<ov; j++) { /* for each overlap */
52: /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53: PetscBTMemzero(mbs,table_in);
54: for (l=k; l<isz; l++) { PetscBTSet(table_in,nidx[l]); }
56: n = isz; /* length of the updated is[i] */
57: for (brow=0; brow<mbs; brow++) {
58: start = ai[brow]; end = ai[brow+1];
59: if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
60: for (l = start; l<end; l++) {
61: bcol = aj[l];
62: if (!PetscBTLookupSet(table_out,bcol)) {
63: nidx[isz++] = bcol;
64: if (bcol_max < bcol) bcol_max = bcol;
65: }
66: }
67: k++;
68: if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
69: } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
70: for (l = start; l<end; l++) {
71: bcol = aj[l];
72: if (bcol > bcol_max) break;
73: if (PetscBTLookup(table_in,bcol)) {
74: if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
75: break; /* for l = start; l<end ; l++) */
76: }
77: }
78: }
79: }
80: } /* for each overlap */
82: /* expand the Index Set */
83: for (j=0; j<isz; j++) {
84: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
85: }
86: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
87: }
88: PetscBTDestroy(&table_out);
89: PetscFree(nidx);
90: PetscFree(nidx2);
91: PetscBTDestroy(&table_in);
92: return(0);
93: }
95: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
96: Zero some ops' to avoid invalid usse */
99: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
100: {
104: MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);
105: Bseq->ops->mult = 0;
106: Bseq->ops->multadd = 0;
107: Bseq->ops->multtranspose = 0;
108: Bseq->ops->multtransposeadd = 0;
109: Bseq->ops->lufactor = 0;
110: Bseq->ops->choleskyfactor = 0;
111: Bseq->ops->lufactorsymbolic = 0;
112: Bseq->ops->choleskyfactorsymbolic = 0;
113: Bseq->ops->getinertia = 0;
114: return(0);
115: }
117: /* same as MatGetSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
120: PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
121: {
122: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c;
124: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
125: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
126: const PetscInt *irow,*icol;
127: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
128: PetscInt *aj = a->j,*ai = a->i;
129: MatScalar *mat_a;
130: Mat C;
131: PetscBool flag;
135: ISGetIndices(isrow,&irow);
136: ISGetIndices(iscol,&icol);
137: ISGetLocalSize(isrow,&nrows);
138: ISGetLocalSize(iscol,&ncols);
140: PetscCalloc1(1+oldcols,&smap);
141: ssmap = smap;
142: PetscMalloc1(1+nrows,&lens);
143: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
144: /* determine lens of each row */
145: for (i=0; i<nrows; i++) {
146: kstart = ai[irow[i]];
147: kend = kstart + a->ilen[irow[i]];
148: lens[i] = 0;
149: for (k=kstart; k<kend; k++) {
150: if (ssmap[aj[k]]) lens[i]++;
151: }
152: }
153: /* Create and fill new matrix */
154: if (scall == MAT_REUSE_MATRIX) {
155: c = (Mat_SeqSBAIJ*)((*B)->data);
157: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
158: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
159: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
160: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
161: C = *B;
162: } else {
163: MatCreate(PetscObjectComm((PetscObject)A),&C);
164: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
165: MatSetType(C,((PetscObject)A)->type_name);
166: MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);
167: }
168: c = (Mat_SeqSBAIJ*)(C->data);
169: for (i=0; i<nrows; i++) {
170: row = irow[i];
171: kstart = ai[row];
172: kend = kstart + a->ilen[row];
173: mat_i = c->i[i];
174: mat_j = c->j + mat_i;
175: mat_a = c->a + mat_i*bs2;
176: mat_ilen = c->ilen + i;
177: for (k=kstart; k<kend; k++) {
178: if ((tcol=ssmap[a->j[k]])) {
179: *mat_j++ = tcol - 1;
180: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
181: mat_a += bs2;
182: (*mat_ilen)++;
183: }
184: }
185: }
186: /* sort */
187: {
188: MatScalar *work;
190: PetscMalloc1(bs2,&work);
191: for (i=0; i<nrows; i++) {
192: PetscInt ilen;
193: mat_i = c->i[i];
194: mat_j = c->j + mat_i;
195: mat_a = c->a + mat_i*bs2;
196: ilen = c->ilen[i];
197: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
198: }
199: PetscFree(work);
200: }
202: /* Free work space */
203: ISRestoreIndices(iscol,&icol);
204: PetscFree(smap);
205: PetscFree(lens);
206: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
207: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
209: ISRestoreIndices(isrow,&irow);
210: *B = C;
211: return(0);
212: }
216: PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
217: {
218: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
219: IS is1,is2;
221: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
222: const PetscInt *irow,*icol;
225: ISGetIndices(isrow,&irow);
226: ISGetIndices(iscol,&icol);
227: ISGetLocalSize(isrow,&nrows);
228: ISGetLocalSize(iscol,&ncols);
230: /* Verify if the indices corespond to each element in a block
231: and form the IS with compressed IS */
232: maxmnbs = PetscMax(a->mbs,a->nbs);
233: PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
234: PetscMemzero(vary,a->mbs*sizeof(PetscInt));
235: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
236: for (i=0; i<a->mbs; i++) {
237: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
238: }
239: count = 0;
240: for (i=0; i<nrows; i++) {
241: PetscInt j = irow[i] / bs;
242: if ((vary[j]--)==bs) iary[count++] = j;
243: }
244: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
246: PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));
247: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
248: for (i=0; i<a->nbs; i++) {
249: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
250: }
251: count = 0;
252: for (i=0; i<ncols; i++) {
253: PetscInt j = icol[i] / bs;
254: if ((vary[j]--)==bs) iary[count++] = j;
255: }
256: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
257: ISRestoreIndices(isrow,&irow);
258: ISRestoreIndices(iscol,&icol);
259: PetscFree2(vary,iary);
261: MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);
262: ISDestroy(&is1);
263: ISDestroy(&is2);
265: if (isrow != iscol) {
266: PetscBool isequal;
267: ISEqual(isrow,iscol,&isequal);
268: if (!isequal) {
269: MatSeqSBAIJZeroOps_Private(*B);
270: }
271: }
272: return(0);
273: }
277: PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
278: {
280: PetscInt i;
283: if (scall == MAT_INITIAL_MATRIX) {
284: PetscMalloc1(n+1,B);
285: }
287: for (i=0; i<n; i++) {
288: MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
289: }
290: return(0);
291: }
293: /* -------------------------------------------------------*/
294: /* Should check that shapes of vectors and matrices match */
295: /* -------------------------------------------------------*/
299: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
300: {
301: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
302: PetscScalar *z,x1,x2,zero=0.0;
303: const PetscScalar *x,*xb;
304: const MatScalar *v;
305: PetscErrorCode ierr;
306: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
307: const PetscInt *aj=a->j,*ai=a->i,*ib;
308: PetscInt nonzerorow=0;
311: VecSet(zz,zero);
312: VecGetArrayRead(xx,&x);
313: VecGetArray(zz,&z);
315: v = a->a;
316: xb = x;
318: for (i=0; i<mbs; i++) {
319: n = ai[1] - ai[0]; /* length of i_th block row of A */
320: x1 = xb[0]; x2 = xb[1];
321: ib = aj + *ai;
322: jmin = 0;
323: nonzerorow += (n>0);
324: if (*ib == i) { /* (diag of A)*x */
325: z[2*i] += v[0]*x1 + v[2]*x2;
326: z[2*i+1] += v[2]*x1 + v[3]*x2;
327: v += 4; jmin++;
328: }
329: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
330: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
331: for (j=jmin; j<n; j++) {
332: /* (strict lower triangular part of A)*x */
333: cval = ib[j]*2;
334: z[cval] += v[0]*x1 + v[1]*x2;
335: z[cval+1] += v[2]*x1 + v[3]*x2;
336: /* (strict upper triangular part of A)*x */
337: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
338: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
339: v += 4;
340: }
341: xb +=2; ai++;
342: }
344: VecRestoreArrayRead(xx,&x);
345: VecRestoreArray(zz,&z);
346: PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
347: return(0);
348: }
352: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
353: {
354: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
355: PetscScalar *z,x1,x2,x3,zero=0.0;
356: const PetscScalar *x,*xb;
357: const MatScalar *v;
358: PetscErrorCode ierr;
359: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
360: const PetscInt *aj = a->j,*ai = a->i,*ib;
361: PetscInt nonzerorow=0;
364: VecSet(zz,zero);
365: VecGetArrayRead(xx,&x);
366: VecGetArray(zz,&z);
368: v = a->a;
369: xb = x;
371: for (i=0; i<mbs; i++) {
372: n = ai[1] - ai[0]; /* length of i_th block row of A */
373: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
374: ib = aj + *ai;
375: jmin = 0;
376: nonzerorow += (n>0);
377: if (*ib == i) { /* (diag of A)*x */
378: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
379: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
380: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
381: v += 9; jmin++;
382: }
383: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
384: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
385: for (j=jmin; j<n; j++) {
386: /* (strict lower triangular part of A)*x */
387: cval = ib[j]*3;
388: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
389: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
390: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
391: /* (strict upper triangular part of A)*x */
392: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
393: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
394: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
395: v += 9;
396: }
397: xb +=3; ai++;
398: }
400: VecRestoreArrayRead(xx,&x);
401: VecRestoreArray(zz,&z);
402: PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
403: return(0);
404: }
408: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
409: {
410: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
411: PetscScalar *z,x1,x2,x3,x4,zero=0.0;
412: const PetscScalar *x,*xb;
413: const MatScalar *v;
414: PetscErrorCode ierr;
415: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
416: const PetscInt *aj = a->j,*ai = a->i,*ib;
417: PetscInt nonzerorow = 0;
420: VecSet(zz,zero);
421: VecGetArrayRead(xx,&x);
422: VecGetArray(zz,&z);
424: v = a->a;
425: xb = x;
427: for (i=0; i<mbs; i++) {
428: n = ai[1] - ai[0]; /* length of i_th block row of A */
429: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
430: ib = aj + *ai;
431: jmin = 0;
432: nonzerorow += (n>0);
433: if (*ib == i) { /* (diag of A)*x */
434: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
435: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
436: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
437: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
438: v += 16; jmin++;
439: }
440: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
441: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
442: for (j=jmin; j<n; j++) {
443: /* (strict lower triangular part of A)*x */
444: cval = ib[j]*4;
445: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
446: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
447: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
448: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
449: /* (strict upper triangular part of A)*x */
450: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
451: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
452: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
453: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
454: v += 16;
455: }
456: xb +=4; ai++;
457: }
459: VecRestoreArrayRead(xx,&x);
460: VecRestoreArray(zz,&z);
461: PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
462: return(0);
463: }
467: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
468: {
469: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
470: PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0;
471: const PetscScalar *x,*xb;
472: const MatScalar *v;
473: PetscErrorCode ierr;
474: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
475: const PetscInt *aj = a->j,*ai = a->i,*ib;
476: PetscInt nonzerorow=0;
479: VecSet(zz,zero);
480: VecGetArrayRead(xx,&x);
481: VecGetArray(zz,&z);
483: v = a->a;
484: xb = x;
486: for (i=0; i<mbs; i++) {
487: n = ai[1] - ai[0]; /* length of i_th block row of A */
488: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
489: ib = aj + *ai;
490: jmin = 0;
491: nonzerorow += (n>0);
492: if (*ib == i) { /* (diag of A)*x */
493: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
494: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
495: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
496: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
497: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
498: v += 25; jmin++;
499: }
500: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
501: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
502: for (j=jmin; j<n; j++) {
503: /* (strict lower triangular part of A)*x */
504: cval = ib[j]*5;
505: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
506: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
507: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
508: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
509: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
510: /* (strict upper triangular part of A)*x */
511: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
512: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
513: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
514: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
515: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
516: v += 25;
517: }
518: xb +=5; ai++;
519: }
521: VecRestoreArrayRead(xx,&x);
522: VecRestoreArray(zz,&z);
523: PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
524: return(0);
525: }
530: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
531: {
532: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
533: PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0;
534: const PetscScalar *x,*xb;
535: const MatScalar *v;
536: PetscErrorCode ierr;
537: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
538: const PetscInt *aj=a->j,*ai=a->i,*ib;
539: PetscInt nonzerorow=0;
542: VecSet(zz,zero);
543: VecGetArrayRead(xx,&x);
544: VecGetArray(zz,&z);
546: v = a->a;
547: xb = x;
549: for (i=0; i<mbs; i++) {
550: n = ai[1] - ai[0]; /* length of i_th block row of A */
551: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
552: ib = aj + *ai;
553: jmin = 0;
554: nonzerorow += (n>0);
555: if (*ib == i) { /* (diag of A)*x */
556: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
557: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
558: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
559: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
560: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
561: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
562: v += 36; jmin++;
563: }
564: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
565: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
566: for (j=jmin; j<n; j++) {
567: /* (strict lower triangular part of A)*x */
568: cval = ib[j]*6;
569: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
570: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
571: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
572: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
573: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
574: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
575: /* (strict upper triangular part of A)*x */
576: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
577: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
578: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
579: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
580: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
581: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
582: v += 36;
583: }
584: xb +=6; ai++;
585: }
587: VecRestoreArrayRead(xx,&x);
588: VecRestoreArray(zz,&z);
589: PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
590: return(0);
591: }
594: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
595: {
596: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
597: PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
598: const PetscScalar *x,*xb;
599: const MatScalar *v;
600: PetscErrorCode ierr;
601: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
602: const PetscInt *aj=a->j,*ai=a->i,*ib;
603: PetscInt nonzerorow=0;
606: VecSet(zz,zero);
607: VecGetArrayRead(xx,&x);
608: VecGetArray(zz,&z);
610: v = a->a;
611: xb = x;
613: for (i=0; i<mbs; i++) {
614: n = ai[1] - ai[0]; /* length of i_th block row of A */
615: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
616: ib = aj + *ai;
617: jmin = 0;
618: nonzerorow += (n>0);
619: if (*ib == i) { /* (diag of A)*x */
620: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
621: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
622: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
623: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
624: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
625: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
626: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
627: v += 49; jmin++;
628: }
629: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
630: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
631: for (j=jmin; j<n; j++) {
632: /* (strict lower triangular part of A)*x */
633: cval = ib[j]*7;
634: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
635: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
636: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
637: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
638: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
639: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
640: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
641: /* (strict upper triangular part of A)*x */
642: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
643: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
644: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
645: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
646: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
647: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
648: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
649: v += 49;
650: }
651: xb +=7; ai++;
652: }
653: VecRestoreArrayRead(xx,&x);
654: VecRestoreArray(zz,&z);
655: PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
656: return(0);
657: }
659: /*
660: This will not work with MatScalar == float because it calls the BLAS
661: */
664: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
665: {
666: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
667: PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0;
668: const PetscScalar *x,*x_ptr,*xb;
669: const MatScalar *v;
670: PetscErrorCode ierr;
671: PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
672: const PetscInt *idx,*aj,*ii;
673: PetscInt nonzerorow=0;
676: VecSet(zz,zero);
677: VecGetArrayRead(xx,&x);x_ptr = x;
678: VecGetArray(zz,&z); z_ptr=z;
680: aj = a->j;
681: v = a->a;
682: ii = a->i;
684: if (!a->mult_work) {
685: PetscMalloc1(A->rmap->N+1,&a->mult_work);
686: }
687: work = a->mult_work;
689: for (i=0; i<mbs; i++) {
690: n = ii[1] - ii[0]; ncols = n*bs;
691: workt = work; idx=aj+ii[0];
692: nonzerorow += (n>0);
694: /* upper triangular part */
695: for (j=0; j<n; j++) {
696: xb = x_ptr + bs*(*idx++);
697: for (k=0; k<bs; k++) workt[k] = xb[k];
698: workt += bs;
699: }
700: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
701: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
703: /* strict lower triangular part */
704: idx = aj+ii[0];
705: if (*idx == i) {
706: ncols -= bs; v += bs2; idx++; n--;
707: }
709: if (ncols > 0) {
710: workt = work;
711: PetscMemzero(workt,ncols*sizeof(PetscScalar));
712: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
713: for (j=0; j<n; j++) {
714: zb = z_ptr + bs*(*idx++);
715: for (k=0; k<bs; k++) zb[k] += workt[k];
716: workt += bs;
717: }
718: }
719: x += bs; v += n*bs2; z += bs; ii++;
720: }
722: VecRestoreArrayRead(xx,&x);
723: VecRestoreArray(zz,&z);
724: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);
725: return(0);
726: }
730: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
731: {
732: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
733: PetscScalar *z,x1;
734: const PetscScalar *x,*xb;
735: const MatScalar *v;
736: PetscErrorCode ierr;
737: PetscInt mbs =a->mbs,i,n,cval,j,jmin;
738: const PetscInt *aj=a->j,*ai=a->i,*ib;
739: PetscInt nonzerorow=0;
742: VecCopy(yy,zz);
743: VecGetArrayRead(xx,&x);
744: VecGetArray(zz,&z);
745: v = a->a;
746: xb = x;
748: for (i=0; i<mbs; i++) {
749: n = ai[1] - ai[0]; /* length of i_th row of A */
750: x1 = xb[0];
751: ib = aj + *ai;
752: jmin = 0;
753: nonzerorow += (n>0);
754: if (*ib == i) { /* (diag of A)*x */
755: z[i] += *v++ * x[*ib++]; jmin++;
756: }
757: for (j=jmin; j<n; j++) {
758: cval = *ib;
759: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
760: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
761: }
762: xb++; ai++;
763: }
765: VecRestoreArrayRead(xx,&x);
766: VecRestoreArray(zz,&z);
768: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
769: return(0);
770: }
774: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
775: {
776: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
777: PetscScalar *z,x1,x2;
778: const PetscScalar *x,*xb;
779: const MatScalar *v;
780: PetscErrorCode ierr;
781: PetscInt mbs =a->mbs,i,n,cval,j,jmin;
782: const PetscInt *aj=a->j,*ai=a->i,*ib;
783: PetscInt nonzerorow=0;
786: VecCopy(yy,zz);
787: VecGetArrayRead(xx,&x);
788: VecGetArray(zz,&z);
790: v = a->a;
791: xb = x;
793: for (i=0; i<mbs; i++) {
794: n = ai[1] - ai[0]; /* length of i_th block row of A */
795: x1 = xb[0]; x2 = xb[1];
796: ib = aj + *ai;
797: jmin = 0;
798: nonzerorow += (n>0);
799: if (*ib == i) { /* (diag of A)*x */
800: z[2*i] += v[0]*x1 + v[2]*x2;
801: z[2*i+1] += v[2]*x1 + v[3]*x2;
802: v += 4; jmin++;
803: }
804: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
805: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
806: for (j=jmin; j<n; j++) {
807: /* (strict lower triangular part of A)*x */
808: cval = ib[j]*2;
809: z[cval] += v[0]*x1 + v[1]*x2;
810: z[cval+1] += v[2]*x1 + v[3]*x2;
811: /* (strict upper triangular part of A)*x */
812: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
813: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
814: v += 4;
815: }
816: xb +=2; ai++;
817: }
818: VecRestoreArrayRead(xx,&x);
819: VecRestoreArray(zz,&z);
821: PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));
822: return(0);
823: }
827: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
828: {
829: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
830: PetscScalar *z,x1,x2,x3;
831: const PetscScalar *x,*xb;
832: const MatScalar *v;
833: PetscErrorCode ierr;
834: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
835: const PetscInt *aj=a->j,*ai=a->i,*ib;
836: PetscInt nonzerorow=0;
839: VecCopy(yy,zz);
840: VecGetArrayRead(xx,&x);
841: VecGetArray(zz,&z);
843: v = a->a;
844: xb = x;
846: for (i=0; i<mbs; i++) {
847: n = ai[1] - ai[0]; /* length of i_th block row of A */
848: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
849: ib = aj + *ai;
850: jmin = 0;
851: nonzerorow += (n>0);
852: if (*ib == i) { /* (diag of A)*x */
853: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
854: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
855: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
856: v += 9; jmin++;
857: }
858: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
859: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
860: for (j=jmin; j<n; j++) {
861: /* (strict lower triangular part of A)*x */
862: cval = ib[j]*3;
863: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
864: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
865: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
866: /* (strict upper triangular part of A)*x */
867: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
868: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
869: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
870: v += 9;
871: }
872: xb +=3; ai++;
873: }
875: VecRestoreArrayRead(xx,&x);
876: VecRestoreArray(zz,&z);
878: PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));
879: return(0);
880: }
884: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
885: {
886: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
887: PetscScalar *z,x1,x2,x3,x4;
888: const PetscScalar *x,*xb;
889: const MatScalar *v;
890: PetscErrorCode ierr;
891: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
892: const PetscInt *aj=a->j,*ai=a->i,*ib;
893: PetscInt nonzerorow=0;
896: VecCopy(yy,zz);
897: VecGetArrayRead(xx,&x);
898: VecGetArray(zz,&z);
900: v = a->a;
901: xb = x;
903: for (i=0; i<mbs; i++) {
904: n = ai[1] - ai[0]; /* length of i_th block row of A */
905: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
906: ib = aj + *ai;
907: jmin = 0;
908: nonzerorow += (n>0);
909: if (*ib == i) { /* (diag of A)*x */
910: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
911: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
912: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
913: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
914: v += 16; jmin++;
915: }
916: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
917: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
918: for (j=jmin; j<n; j++) {
919: /* (strict lower triangular part of A)*x */
920: cval = ib[j]*4;
921: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
922: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
923: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
924: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
925: /* (strict upper triangular part of A)*x */
926: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
927: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
928: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
929: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
930: v += 16;
931: }
932: xb +=4; ai++;
933: }
935: VecRestoreArrayRead(xx,&x);
936: VecRestoreArray(zz,&z);
938: PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));
939: return(0);
940: }
944: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
945: {
946: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
947: PetscScalar *z,x1,x2,x3,x4,x5;
948: const PetscScalar *x,*xb;
949: const MatScalar *v;
950: PetscErrorCode ierr;
951: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
952: const PetscInt *aj=a->j,*ai=a->i,*ib;
953: PetscInt nonzerorow=0;
956: VecCopy(yy,zz);
957: VecGetArrayRead(xx,&x);
958: VecGetArray(zz,&z);
960: v = a->a;
961: xb = x;
963: for (i=0; i<mbs; i++) {
964: n = ai[1] - ai[0]; /* length of i_th block row of A */
965: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
966: ib = aj + *ai;
967: jmin = 0;
968: nonzerorow += (n>0);
969: if (*ib == i) { /* (diag of A)*x */
970: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
971: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
972: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
973: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
974: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
975: v += 25; jmin++;
976: }
977: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
978: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
979: for (j=jmin; j<n; j++) {
980: /* (strict lower triangular part of A)*x */
981: cval = ib[j]*5;
982: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
983: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
984: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
985: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
986: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
987: /* (strict upper triangular part of A)*x */
988: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
989: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
990: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
991: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
992: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
993: v += 25;
994: }
995: xb +=5; ai++;
996: }
998: VecRestoreArrayRead(xx,&x);
999: VecRestoreArray(zz,&z);
1001: PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));
1002: return(0);
1003: }
1006: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1007: {
1008: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1009: PetscScalar *z,x1,x2,x3,x4,x5,x6;
1010: const PetscScalar *x,*xb;
1011: const MatScalar *v;
1012: PetscErrorCode ierr;
1013: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
1014: const PetscInt *aj=a->j,*ai=a->i,*ib;
1015: PetscInt nonzerorow=0;
1018: VecCopy(yy,zz);
1019: VecGetArrayRead(xx,&x);
1020: VecGetArray(zz,&z);
1022: v = a->a;
1023: xb = x;
1025: for (i=0; i<mbs; i++) {
1026: n = ai[1] - ai[0]; /* length of i_th block row of A */
1027: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1028: ib = aj + *ai;
1029: jmin = 0;
1030: nonzerorow += (n>0);
1031: if (*ib == i) { /* (diag of A)*x */
1032: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1033: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1034: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1035: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1036: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1037: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1038: v += 36; jmin++;
1039: }
1040: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1041: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1042: for (j=jmin; j<n; j++) {
1043: /* (strict lower triangular part of A)*x */
1044: cval = ib[j]*6;
1045: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1046: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1047: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1048: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1049: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1050: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1051: /* (strict upper triangular part of A)*x */
1052: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1053: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1054: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1055: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1056: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1057: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1058: v += 36;
1059: }
1060: xb +=6; ai++;
1061: }
1063: VecRestoreArrayRead(xx,&x);
1064: VecRestoreArray(zz,&z);
1066: PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));
1067: return(0);
1068: }
1072: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1073: {
1074: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1075: PetscScalar *z,x1,x2,x3,x4,x5,x6,x7;
1076: const PetscScalar *x,*xb;
1077: const MatScalar *v;
1078: PetscErrorCode ierr;
1079: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
1080: const PetscInt *aj=a->j,*ai=a->i,*ib;
1081: PetscInt nonzerorow=0;
1084: VecCopy(yy,zz);
1085: VecGetArrayRead(xx,&x);
1086: VecGetArray(zz,&z);
1088: v = a->a;
1089: xb = x;
1091: for (i=0; i<mbs; i++) {
1092: n = ai[1] - ai[0]; /* length of i_th block row of A */
1093: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1094: ib = aj + *ai;
1095: jmin = 0;
1096: nonzerorow += (n>0);
1097: if (*ib == i) { /* (diag of A)*x */
1098: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1099: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1100: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1101: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1102: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1103: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1104: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1105: v += 49; jmin++;
1106: }
1107: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1108: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1109: for (j=jmin; j<n; j++) {
1110: /* (strict lower triangular part of A)*x */
1111: cval = ib[j]*7;
1112: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1113: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1114: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1115: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1116: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1117: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1118: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1119: /* (strict upper triangular part of A)*x */
1120: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1121: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1122: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1123: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1124: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1125: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1126: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1127: v += 49;
1128: }
1129: xb +=7; ai++;
1130: }
1132: VecRestoreArrayRead(xx,&x);
1133: VecRestoreArray(zz,&z);
1135: PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));
1136: return(0);
1137: }
1141: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1142: {
1143: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1144: PetscScalar *z,*z_ptr=0,*zb,*work,*workt;
1145: const PetscScalar *x,*x_ptr,*xb;
1146: const MatScalar *v;
1147: PetscErrorCode ierr;
1148: PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1149: const PetscInt *idx,*aj,*ii;
1150: PetscInt nonzerorow=0;
1153: VecCopy(yy,zz);
1154: VecGetArrayRead(xx,&x); x_ptr=x;
1155: VecGetArray(zz,&z); z_ptr=z;
1157: aj = a->j;
1158: v = a->a;
1159: ii = a->i;
1161: if (!a->mult_work) {
1162: PetscMalloc1(A->rmap->n+1,&a->mult_work);
1163: }
1164: work = a->mult_work;
1167: for (i=0; i<mbs; i++) {
1168: n = ii[1] - ii[0]; ncols = n*bs;
1169: workt = work; idx=aj+ii[0];
1170: nonzerorow += (n>0);
1172: /* upper triangular part */
1173: for (j=0; j<n; j++) {
1174: xb = x_ptr + bs*(*idx++);
1175: for (k=0; k<bs; k++) workt[k] = xb[k];
1176: workt += bs;
1177: }
1178: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1179: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1181: /* strict lower triangular part */
1182: idx = aj+ii[0];
1183: if (*idx == i) {
1184: ncols -= bs; v += bs2; idx++; n--;
1185: }
1186: if (ncols > 0) {
1187: workt = work;
1188: PetscMemzero(workt,ncols*sizeof(PetscScalar));
1189: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1190: for (j=0; j<n; j++) {
1191: zb = z_ptr + bs*(*idx++);
1192: for (k=0; k<bs; k++) zb[k] += workt[k];
1193: workt += bs;
1194: }
1195: }
1197: x += bs; v += n*bs2; z += bs; ii++;
1198: }
1200: VecRestoreArrayRead(xx,&x);
1201: VecRestoreArray(zz,&z);
1203: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
1204: return(0);
1205: }
1209: PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1210: {
1211: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1212: PetscScalar oalpha = alpha;
1214: PetscBLASInt one = 1,totalnz;
1217: PetscBLASIntCast(a->bs2*a->nz,&totalnz);
1218: PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1219: PetscLogFlops(totalnz);
1220: return(0);
1221: }
1225: PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1226: {
1227: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1228: const MatScalar *v = a->a;
1229: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1230: PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1231: PetscErrorCode ierr;
1232: const PetscInt *aj=a->j,*col;
1235: if (type == NORM_FROBENIUS) {
1236: for (k=0; k<mbs; k++) {
1237: jmin = a->i[k]; jmax = a->i[k+1];
1238: col = aj + jmin;
1239: if (*col == k) { /* diagonal block */
1240: for (i=0; i<bs2; i++) {
1241: sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1242: }
1243: jmin++;
1244: }
1245: for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */
1246: for (i=0; i<bs2; i++) {
1247: sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1248: }
1249: }
1250: }
1251: *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1252: PetscLogFlops(2*bs2*a->nz);
1253: } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1254: PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);
1255: for (i=0; i<mbs; i++) jl[i] = mbs;
1256: il[0] = 0;
1258: *norm = 0.0;
1259: for (k=0; k<mbs; k++) { /* k_th block row */
1260: for (j=0; j<bs; j++) sum[j]=0.0;
1261: /*-- col sum --*/
1262: i = jl[k]; /* first |A(i,k)| to be added */
1263: /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1264: at step k */
1265: while (i<mbs) {
1266: nexti = jl[i]; /* next block row to be added */
1267: ik = il[i]; /* block index of A(i,k) in the array a */
1268: for (j=0; j<bs; j++) {
1269: v = a->a + ik*bs2 + j*bs;
1270: for (k1=0; k1<bs; k1++) {
1271: sum[j] += PetscAbsScalar(*v); v++;
1272: }
1273: }
1274: /* update il, jl */
1275: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1276: jmax = a->i[i+1];
1277: if (jmin < jmax) {
1278: il[i] = jmin;
1279: j = a->j[jmin];
1280: jl[i] = jl[j]; jl[j]=i;
1281: }
1282: i = nexti;
1283: }
1284: /*-- row sum --*/
1285: jmin = a->i[k]; jmax = a->i[k+1];
1286: for (i=jmin; i<jmax; i++) {
1287: for (j=0; j<bs; j++) {
1288: v = a->a + i*bs2 + j;
1289: for (k1=0; k1<bs; k1++) {
1290: sum[j] += PetscAbsScalar(*v); v += bs;
1291: }
1292: }
1293: }
1294: /* add k_th block row to il, jl */
1295: col = aj+jmin;
1296: if (*col == k) jmin++;
1297: if (jmin < jmax) {
1298: il[k] = jmin;
1299: j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1300: }
1301: for (j=0; j<bs; j++) {
1302: if (sum[j] > *norm) *norm = sum[j];
1303: }
1304: }
1305: PetscFree3(sum,il,jl);
1306: PetscLogFlops(PetscMax(mbs*a->nz-1,0));
1307: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1308: return(0);
1309: }
1313: PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1314: {
1315: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1319: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1320: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1321: *flg = PETSC_FALSE;
1322: return(0);
1323: }
1325: /* if the a->i are the same */
1326: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1327: if (!*flg) return(0);
1329: /* if a->j are the same */
1330: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1331: if (!*flg) return(0);
1333: /* if a->a are the same */
1334: PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);
1335: return(0);
1336: }
1340: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1341: {
1342: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1343: PetscErrorCode ierr;
1344: PetscInt i,j,k,row,bs,ambs,bs2;
1345: const PetscInt *ai,*aj;
1346: PetscScalar *x,zero = 0.0;
1347: const MatScalar *aa,*aa_j;
1350: bs = A->rmap->bs;
1351: if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1353: aa = a->a;
1354: ambs = a->mbs;
1356: if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1357: PetscInt *diag=a->diag;
1358: aa = a->a;
1359: ambs = a->mbs;
1360: VecGetArray(v,&x);
1361: for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1362: VecRestoreArray(v,&x);
1363: return(0);
1364: }
1366: ai = a->i;
1367: aj = a->j;
1368: bs2 = a->bs2;
1369: VecSet(v,zero);
1370: VecGetArray(v,&x);
1371: for (i=0; i<ambs; i++) {
1372: j=ai[i];
1373: if (aj[j] == i) { /* if this is a diagonal element */
1374: row = i*bs;
1375: aa_j = aa + j*bs2;
1376: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1377: }
1378: }
1379: VecRestoreArray(v,&x);
1380: return(0);
1381: }
1385: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1386: {
1387: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1388: PetscScalar x;
1389: const PetscScalar *l,*li,*ri;
1390: MatScalar *aa,*v;
1391: PetscErrorCode ierr;
1392: PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1393: PetscBool flg;
1396: if (ll != rr) {
1397: VecEqual(ll,rr,&flg);
1398: if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1399: }
1400: if (!ll) return(0);
1401: ai = a->i;
1402: aj = a->j;
1403: aa = a->a;
1404: m = A->rmap->N;
1405: bs = A->rmap->bs;
1406: mbs = a->mbs;
1407: bs2 = a->bs2;
1409: VecGetArrayRead(ll,&l);
1410: VecGetLocalSize(ll,&lm);
1411: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1412: for (i=0; i<mbs; i++) { /* for each block row */
1413: M = ai[i+1] - ai[i];
1414: li = l + i*bs;
1415: v = aa + bs2*ai[i];
1416: for (j=0; j<M; j++) { /* for each block */
1417: ri = l + bs*aj[ai[i]+j];
1418: for (k=0; k<bs; k++) {
1419: x = ri[k];
1420: for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1421: }
1422: }
1423: }
1424: VecRestoreArrayRead(ll,&l);
1425: PetscLogFlops(2.0*a->nz);
1426: return(0);
1427: }
1431: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1432: {
1433: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1436: info->block_size = a->bs2;
1437: info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */
1438: info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1439: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
1440: info->assemblies = A->num_ass;
1441: info->mallocs = A->info.mallocs;
1442: info->memory = ((PetscObject)A)->mem;
1443: if (A->factortype) {
1444: info->fill_ratio_given = A->info.fill_ratio_given;
1445: info->fill_ratio_needed = A->info.fill_ratio_needed;
1446: info->factor_mallocs = A->info.factor_mallocs;
1447: } else {
1448: info->fill_ratio_given = 0;
1449: info->fill_ratio_needed = 0;
1450: info->factor_mallocs = 0;
1451: }
1452: return(0);
1453: }
1458: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1459: {
1460: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1464: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1465: return(0);
1466: }
1470: /*
1471: This code does not work since it only checks the upper triangular part of
1472: the matrix. Hence it is not listed in the function table.
1473: */
1474: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1475: {
1476: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1477: PetscErrorCode ierr;
1478: PetscInt i,j,n,row,col,bs,mbs;
1479: const PetscInt *ai,*aj;
1480: PetscReal atmp;
1481: const MatScalar *aa;
1482: PetscScalar *x;
1483: PetscInt ncols,brow,bcol,krow,kcol;
1486: if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1487: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1488: bs = A->rmap->bs;
1489: aa = a->a;
1490: ai = a->i;
1491: aj = a->j;
1492: mbs = a->mbs;
1494: VecSet(v,0.0);
1495: VecGetArray(v,&x);
1496: VecGetLocalSize(v,&n);
1497: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1498: for (i=0; i<mbs; i++) {
1499: ncols = ai[1] - ai[0]; ai++;
1500: brow = bs*i;
1501: for (j=0; j<ncols; j++) {
1502: bcol = bs*(*aj);
1503: for (kcol=0; kcol<bs; kcol++) {
1504: col = bcol + kcol; /* col index */
1505: for (krow=0; krow<bs; krow++) {
1506: atmp = PetscAbsScalar(*aa); aa++;
1507: row = brow + krow; /* row index */
1508: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1509: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1510: }
1511: }
1512: aj++;
1513: }
1514: }
1515: VecRestoreArray(v,&x);
1516: return(0);
1517: }