Actual source code: sbaij2.c
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <../src/mat/impls/dense/seq/dense.h>
4: #include <../src/mat/impls/sbaij/seq/sbaij.h>
5: #include <petsc/private/kernels/blockinvert.h>
6: #include <petscbt.h>
7: #include <petscblaslapack.h>
9: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10: {
11: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
12: PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
13: const PetscInt *idx;
14: PetscBT table_out,table_in;
17: mbs = a->mbs;
18: ai = a->i;
19: aj = a->j;
20: bs = A->rmap->bs;
21: PetscBTCreate(mbs,&table_out);
22: PetscMalloc1(mbs+1,&nidx);
23: PetscMalloc1(A->rmap->N+1,&nidx2);
24: PetscBTCreate(mbs,&table_in);
26: for (i=0; i<is_max; i++) { /* for each is */
27: isz = 0;
28: PetscBTMemzero(mbs,table_out);
30: /* Extract the indices, assume there can be duplicate entries */
31: ISGetIndices(is[i],&idx);
32: ISGetLocalSize(is[i],&n);
34: /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
35: bcol_max = 0;
36: for (j=0; j<n; ++j) {
37: brow = idx[j]/bs; /* convert the indices into block indices */
39: if (!PetscBTLookupSet(table_out,brow)) {
40: nidx[isz++] = brow;
41: if (bcol_max < brow) bcol_max = brow;
42: }
43: }
44: ISRestoreIndices(is[i],&idx);
45: ISDestroy(&is[i]);
47: k = 0;
48: for (j=0; j<ov; j++) { /* for each overlap */
49: /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
50: PetscBTMemzero(mbs,table_in);
51: for (l=k; l<isz; l++) PetscBTSet(table_in,nidx[l]);
53: n = isz; /* length of the updated is[i] */
54: for (brow=0; brow<mbs; brow++) {
55: start = ai[brow]; end = ai[brow+1];
56: if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
57: for (l = start; l<end; l++) {
58: bcol = aj[l];
59: if (!PetscBTLookupSet(table_out,bcol)) {
60: nidx[isz++] = bcol;
61: if (bcol_max < bcol) bcol_max = bcol;
62: }
63: }
64: k++;
65: if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66: } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
67: for (l = start; l<end; l++) {
68: bcol = aj[l];
69: if (bcol > bcol_max) break;
70: if (PetscBTLookup(table_in,bcol)) {
71: if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
72: break; /* for l = start; l<end ; l++) */
73: }
74: }
75: }
76: }
77: } /* for each overlap */
79: /* expand the Index Set */
80: for (j=0; j<isz; j++) {
81: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
82: }
83: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
84: }
85: PetscBTDestroy(&table_out);
86: PetscFree(nidx);
87: PetscFree(nidx2);
88: PetscBTDestroy(&table_in);
89: return 0;
90: }
92: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
93: Zero some ops' to avoid invalid usse */
94: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
95: {
96: MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);
97: Bseq->ops->mult = NULL;
98: Bseq->ops->multadd = NULL;
99: Bseq->ops->multtranspose = NULL;
100: Bseq->ops->multtransposeadd = NULL;
101: Bseq->ops->lufactor = NULL;
102: Bseq->ops->choleskyfactor = NULL;
103: Bseq->ops->lufactorsymbolic = NULL;
104: Bseq->ops->choleskyfactorsymbolic = NULL;
105: Bseq->ops->getinertia = NULL;
106: return 0;
107: }
109: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
110: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
111: {
112: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c;
113: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
114: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
115: const PetscInt *irow,*icol;
116: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
117: PetscInt *aj = a->j,*ai = a->i;
118: MatScalar *mat_a;
119: Mat C;
120: PetscBool flag;
123: ISGetIndices(isrow,&irow);
124: ISGetIndices(iscol,&icol);
125: ISGetLocalSize(isrow,&nrows);
126: ISGetLocalSize(iscol,&ncols);
128: PetscCalloc1(1+oldcols,&smap);
129: ssmap = smap;
130: PetscMalloc1(1+nrows,&lens);
131: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
132: /* determine lens of each row */
133: for (i=0; i<nrows; i++) {
134: kstart = ai[irow[i]];
135: kend = kstart + a->ilen[irow[i]];
136: lens[i] = 0;
137: for (k=kstart; k<kend; k++) {
138: if (ssmap[aj[k]]) lens[i]++;
139: }
140: }
141: /* Create and fill new matrix */
142: if (scall == MAT_REUSE_MATRIX) {
143: c = (Mat_SeqSBAIJ*)((*B)->data);
146: PetscArraycmp(c->ilen,lens,c->mbs,&flag);
148: PetscArrayzero(c->ilen,c->mbs);
149: C = *B;
150: } else {
151: MatCreate(PetscObjectComm((PetscObject)A),&C);
152: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
153: MatSetType(C,((PetscObject)A)->type_name);
154: MatSeqSBAIJSetPreallocation(C,bs,0,lens);
155: }
156: c = (Mat_SeqSBAIJ*)(C->data);
157: for (i=0; i<nrows; i++) {
158: row = irow[i];
159: kstart = ai[row];
160: kend = kstart + a->ilen[row];
161: mat_i = c->i[i];
162: mat_j = c->j + mat_i;
163: mat_a = c->a + mat_i*bs2;
164: mat_ilen = c->ilen + i;
165: for (k=kstart; k<kend; k++) {
166: if ((tcol=ssmap[a->j[k]])) {
167: *mat_j++ = tcol - 1;
168: PetscArraycpy(mat_a,a->a+k*bs2,bs2);
169: mat_a += bs2;
170: (*mat_ilen)++;
171: }
172: }
173: }
174: /* sort */
175: {
176: MatScalar *work;
178: PetscMalloc1(bs2,&work);
179: for (i=0; i<nrows; i++) {
180: PetscInt ilen;
181: mat_i = c->i[i];
182: mat_j = c->j + mat_i;
183: mat_a = c->a + mat_i*bs2;
184: ilen = c->ilen[i];
185: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
186: }
187: PetscFree(work);
188: }
190: /* Free work space */
191: ISRestoreIndices(iscol,&icol);
192: PetscFree(smap);
193: PetscFree(lens);
194: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
195: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
197: ISRestoreIndices(isrow,&irow);
198: *B = C;
199: return 0;
200: }
202: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
203: {
204: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
205: IS is1,is2;
206: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
207: const PetscInt *irow,*icol;
209: ISGetIndices(isrow,&irow);
210: ISGetIndices(iscol,&icol);
211: ISGetLocalSize(isrow,&nrows);
212: ISGetLocalSize(iscol,&ncols);
214: /* Verify if the indices corespond to each element in a block
215: and form the IS with compressed IS */
216: maxmnbs = PetscMax(a->mbs,a->nbs);
217: PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
218: PetscArrayzero(vary,a->mbs);
219: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
220: for (i=0; i<a->mbs; i++) {
222: }
223: count = 0;
224: for (i=0; i<nrows; i++) {
225: PetscInt j = irow[i] / bs;
226: if ((vary[j]--)==bs) iary[count++] = j;
227: }
228: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
230: PetscArrayzero(vary,a->nbs);
231: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
232: for (i=0; i<a->nbs; i++) {
234: }
235: count = 0;
236: for (i=0; i<ncols; i++) {
237: PetscInt j = icol[i] / bs;
238: if ((vary[j]--)==bs) iary[count++] = j;
239: }
240: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
241: ISRestoreIndices(isrow,&irow);
242: ISRestoreIndices(iscol,&icol);
243: PetscFree2(vary,iary);
245: MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);
246: ISDestroy(&is1);
247: ISDestroy(&is2);
249: if (isrow != iscol) {
250: PetscBool isequal;
251: ISEqual(isrow,iscol,&isequal);
252: if (!isequal) {
253: MatSeqSBAIJZeroOps_Private(*B);
254: }
255: }
256: return 0;
257: }
259: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
260: {
261: PetscInt i;
263: if (scall == MAT_INITIAL_MATRIX) {
264: PetscCalloc1(n+1,B);
265: }
267: for (i=0; i<n; i++) {
268: MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
269: }
270: return 0;
271: }
273: /* -------------------------------------------------------*/
274: /* Should check that shapes of vectors and matrices match */
275: /* -------------------------------------------------------*/
277: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
278: {
279: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
280: PetscScalar *z,x1,x2,zero=0.0;
281: const PetscScalar *x,*xb;
282: const MatScalar *v;
283: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
284: const PetscInt *aj=a->j,*ai=a->i,*ib;
285: PetscInt nonzerorow=0;
287: VecSet(zz,zero);
288: if (!a->nz) return 0;
289: VecGetArrayRead(xx,&x);
290: VecGetArray(zz,&z);
292: v = a->a;
293: xb = x;
295: for (i=0; i<mbs; i++) {
296: n = ai[1] - ai[0]; /* length of i_th block row of A */
297: x1 = xb[0]; x2 = xb[1];
298: ib = aj + *ai;
299: jmin = 0;
300: nonzerorow += (n>0);
301: if (*ib == i) { /* (diag of A)*x */
302: z[2*i] += v[0]*x1 + v[2]*x2;
303: z[2*i+1] += v[2]*x1 + v[3]*x2;
304: v += 4; jmin++;
305: }
306: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
307: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
308: for (j=jmin; j<n; j++) {
309: /* (strict lower triangular part of A)*x */
310: cval = ib[j]*2;
311: z[cval] += v[0]*x1 + v[1]*x2;
312: z[cval+1] += v[2]*x1 + v[3]*x2;
313: /* (strict upper triangular part of A)*x */
314: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
315: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
316: v += 4;
317: }
318: xb +=2; ai++;
319: }
321: VecRestoreArrayRead(xx,&x);
322: VecRestoreArray(zz,&z);
323: PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
324: return 0;
325: }
327: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
328: {
329: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
330: PetscScalar *z,x1,x2,x3,zero=0.0;
331: const PetscScalar *x,*xb;
332: const MatScalar *v;
333: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
334: const PetscInt *aj = a->j,*ai = a->i,*ib;
335: PetscInt nonzerorow=0;
337: VecSet(zz,zero);
338: if (!a->nz) return 0;
339: VecGetArrayRead(xx,&x);
340: VecGetArray(zz,&z);
342: v = a->a;
343: xb = x;
345: for (i=0; i<mbs; i++) {
346: n = ai[1] - ai[0]; /* length of i_th block row of A */
347: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
348: ib = aj + *ai;
349: jmin = 0;
350: nonzerorow += (n>0);
351: if (*ib == i) { /* (diag of A)*x */
352: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
353: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
354: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
355: v += 9; jmin++;
356: }
357: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
358: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
359: for (j=jmin; j<n; j++) {
360: /* (strict lower triangular part of A)*x */
361: cval = ib[j]*3;
362: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
363: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
364: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
365: /* (strict upper triangular part of A)*x */
366: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
367: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
368: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
369: v += 9;
370: }
371: xb +=3; ai++;
372: }
374: VecRestoreArrayRead(xx,&x);
375: VecRestoreArray(zz,&z);
376: PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
377: return 0;
378: }
380: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
381: {
382: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
383: PetscScalar *z,x1,x2,x3,x4,zero=0.0;
384: const PetscScalar *x,*xb;
385: const MatScalar *v;
386: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
387: const PetscInt *aj = a->j,*ai = a->i,*ib;
388: PetscInt nonzerorow = 0;
390: VecSet(zz,zero);
391: if (!a->nz) return 0;
392: VecGetArrayRead(xx,&x);
393: VecGetArray(zz,&z);
395: v = a->a;
396: xb = x;
398: for (i=0; i<mbs; i++) {
399: n = ai[1] - ai[0]; /* length of i_th block row of A */
400: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
401: ib = aj + *ai;
402: jmin = 0;
403: nonzerorow += (n>0);
404: if (*ib == i) { /* (diag of A)*x */
405: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
406: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
407: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
408: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
409: v += 16; jmin++;
410: }
411: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
412: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
413: for (j=jmin; j<n; j++) {
414: /* (strict lower triangular part of A)*x */
415: cval = ib[j]*4;
416: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
417: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
418: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
419: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
420: /* (strict upper triangular part of A)*x */
421: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
422: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
423: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
424: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
425: v += 16;
426: }
427: xb +=4; ai++;
428: }
430: VecRestoreArrayRead(xx,&x);
431: VecRestoreArray(zz,&z);
432: PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
433: return 0;
434: }
436: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
437: {
438: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
439: PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0;
440: const PetscScalar *x,*xb;
441: const MatScalar *v;
442: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
443: const PetscInt *aj = a->j,*ai = a->i,*ib;
444: PetscInt nonzerorow=0;
446: VecSet(zz,zero);
447: if (!a->nz) return 0;
448: VecGetArrayRead(xx,&x);
449: VecGetArray(zz,&z);
451: v = a->a;
452: xb = x;
454: for (i=0; i<mbs; i++) {
455: n = ai[1] - ai[0]; /* length of i_th block row of A */
456: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
457: ib = aj + *ai;
458: jmin = 0;
459: nonzerorow += (n>0);
460: if (*ib == i) { /* (diag of A)*x */
461: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
462: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
463: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
464: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
465: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
466: v += 25; jmin++;
467: }
468: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
469: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
470: for (j=jmin; j<n; j++) {
471: /* (strict lower triangular part of A)*x */
472: cval = ib[j]*5;
473: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
474: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
475: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
476: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
477: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
478: /* (strict upper triangular part of A)*x */
479: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
480: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
481: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
482: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
483: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
484: v += 25;
485: }
486: xb +=5; ai++;
487: }
489: VecRestoreArrayRead(xx,&x);
490: VecRestoreArray(zz,&z);
491: PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
492: return 0;
493: }
495: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
496: {
497: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
498: PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0;
499: const PetscScalar *x,*xb;
500: const MatScalar *v;
501: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
502: const PetscInt *aj=a->j,*ai=a->i,*ib;
503: PetscInt nonzerorow=0;
505: VecSet(zz,zero);
506: if (!a->nz) return 0;
507: VecGetArrayRead(xx,&x);
508: VecGetArray(zz,&z);
510: v = a->a;
511: xb = x;
513: for (i=0; i<mbs; i++) {
514: n = ai[1] - ai[0]; /* length of i_th block row of A */
515: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
516: ib = aj + *ai;
517: jmin = 0;
518: nonzerorow += (n>0);
519: if (*ib == i) { /* (diag of A)*x */
520: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
521: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
522: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
523: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
524: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
525: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
526: v += 36; jmin++;
527: }
528: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
529: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
530: for (j=jmin; j<n; j++) {
531: /* (strict lower triangular part of A)*x */
532: cval = ib[j]*6;
533: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
534: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
535: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
536: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
537: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
538: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
539: /* (strict upper triangular part of A)*x */
540: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
541: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
542: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
543: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
544: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
545: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
546: v += 36;
547: }
548: xb +=6; ai++;
549: }
551: VecRestoreArrayRead(xx,&x);
552: VecRestoreArray(zz,&z);
553: PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
554: return 0;
555: }
557: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
558: {
559: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
560: PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
561: const PetscScalar *x,*xb;
562: const MatScalar *v;
563: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
564: const PetscInt *aj=a->j,*ai=a->i,*ib;
565: PetscInt nonzerorow=0;
567: VecSet(zz,zero);
568: if (!a->nz) return 0;
569: VecGetArrayRead(xx,&x);
570: VecGetArray(zz,&z);
572: v = a->a;
573: xb = x;
575: for (i=0; i<mbs; i++) {
576: n = ai[1] - ai[0]; /* length of i_th block row of A */
577: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
578: ib = aj + *ai;
579: jmin = 0;
580: nonzerorow += (n>0);
581: if (*ib == i) { /* (diag of A)*x */
582: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
583: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
584: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
585: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
586: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
587: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
588: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
589: v += 49; jmin++;
590: }
591: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
592: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
593: for (j=jmin; j<n; j++) {
594: /* (strict lower triangular part of A)*x */
595: cval = ib[j]*7;
596: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
597: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
598: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
599: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
600: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
601: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
602: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
603: /* (strict upper triangular part of A)*x */
604: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
605: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
606: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
607: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
608: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
609: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
610: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
611: v += 49;
612: }
613: xb +=7; ai++;
614: }
615: VecRestoreArrayRead(xx,&x);
616: VecRestoreArray(zz,&z);
617: PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
618: return 0;
619: }
621: /*
622: This will not work with MatScalar == float because it calls the BLAS
623: */
624: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
625: {
626: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
627: PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0;
628: const PetscScalar *x,*x_ptr,*xb;
629: const MatScalar *v;
630: PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
631: const PetscInt *idx,*aj,*ii;
632: PetscInt nonzerorow=0;
634: VecSet(zz,zero);
635: if (!a->nz) return 0;
636: VecGetArrayRead(xx,&x);
637: VecGetArray(zz,&z);
639: x_ptr = x;
640: z_ptr = z;
642: aj = a->j;
643: v = a->a;
644: ii = a->i;
646: if (!a->mult_work) {
647: PetscMalloc1(A->rmap->N+1,&a->mult_work);
648: }
649: work = a->mult_work;
651: for (i=0; i<mbs; i++) {
652: n = ii[1] - ii[0]; ncols = n*bs;
653: workt = work; idx=aj+ii[0];
654: nonzerorow += (n>0);
656: /* upper triangular part */
657: for (j=0; j<n; j++) {
658: xb = x_ptr + bs*(*idx++);
659: for (k=0; k<bs; k++) workt[k] = xb[k];
660: workt += bs;
661: }
662: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
663: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
665: /* strict lower triangular part */
666: idx = aj+ii[0];
667: if (n && *idx == i) {
668: ncols -= bs; v += bs2; idx++; n--;
669: }
671: if (ncols > 0) {
672: workt = work;
673: PetscArrayzero(workt,ncols);
674: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
675: for (j=0; j<n; j++) {
676: zb = z_ptr + bs*(*idx++);
677: for (k=0; k<bs; k++) zb[k] += workt[k];
678: workt += bs;
679: }
680: }
681: x += bs; v += n*bs2; z += bs; ii++;
682: }
684: VecRestoreArrayRead(xx,&x);
685: VecRestoreArray(zz,&z);
686: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);
687: return 0;
688: }
690: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
691: {
692: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
693: PetscScalar *z,x1;
694: const PetscScalar *x,*xb;
695: const MatScalar *v;
696: PetscInt mbs =a->mbs,i,n,cval,j,jmin;
697: const PetscInt *aj=a->j,*ai=a->i,*ib;
698: PetscInt nonzerorow=0;
699: #if defined(PETSC_USE_COMPLEX)
700: const int aconj = A->hermitian;
701: #else
702: const int aconj = 0;
703: #endif
705: VecCopy(yy,zz);
706: if (!a->nz) return 0;
707: VecGetArrayRead(xx,&x);
708: VecGetArray(zz,&z);
709: v = a->a;
710: xb = x;
712: for (i=0; i<mbs; i++) {
713: n = ai[1] - ai[0]; /* length of i_th row of A */
714: x1 = xb[0];
715: ib = aj + *ai;
716: jmin = 0;
717: nonzerorow += (n>0);
718: if (n && *ib == i) { /* (diag of A)*x */
719: z[i] += *v++ * x[*ib++]; jmin++;
720: }
721: if (aconj) {
722: for (j=jmin; j<n; j++) {
723: cval = *ib;
724: z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */
725: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
726: }
727: } else {
728: for (j=jmin; j<n; j++) {
729: cval = *ib;
730: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
731: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
732: }
733: }
734: xb++; ai++;
735: }
737: VecRestoreArrayRead(xx,&x);
738: VecRestoreArray(zz,&z);
740: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
741: return 0;
742: }
744: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
745: {
746: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
747: PetscScalar *z,x1,x2;
748: const PetscScalar *x,*xb;
749: const MatScalar *v;
750: PetscInt mbs =a->mbs,i,n,cval,j,jmin;
751: const PetscInt *aj=a->j,*ai=a->i,*ib;
752: PetscInt nonzerorow=0;
754: VecCopy(yy,zz);
755: if (!a->nz) return 0;
756: VecGetArrayRead(xx,&x);
757: VecGetArray(zz,&z);
759: v = a->a;
760: xb = x;
762: for (i=0; i<mbs; i++) {
763: n = ai[1] - ai[0]; /* length of i_th block row of A */
764: x1 = xb[0]; x2 = xb[1];
765: ib = aj + *ai;
766: jmin = 0;
767: nonzerorow += (n>0);
768: if (n && *ib == i) { /* (diag of A)*x */
769: z[2*i] += v[0]*x1 + v[2]*x2;
770: z[2*i+1] += v[2]*x1 + v[3]*x2;
771: v += 4; jmin++;
772: }
773: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
774: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
775: for (j=jmin; j<n; j++) {
776: /* (strict lower triangular part of A)*x */
777: cval = ib[j]*2;
778: z[cval] += v[0]*x1 + v[1]*x2;
779: z[cval+1] += v[2]*x1 + v[3]*x2;
780: /* (strict upper triangular part of A)*x */
781: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
782: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
783: v += 4;
784: }
785: xb +=2; ai++;
786: }
787: VecRestoreArrayRead(xx,&x);
788: VecRestoreArray(zz,&z);
790: PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));
791: return 0;
792: }
794: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
795: {
796: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
797: PetscScalar *z,x1,x2,x3;
798: const PetscScalar *x,*xb;
799: const MatScalar *v;
800: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
801: const PetscInt *aj=a->j,*ai=a->i,*ib;
802: PetscInt nonzerorow=0;
804: VecCopy(yy,zz);
805: if (!a->nz) return 0;
806: VecGetArrayRead(xx,&x);
807: VecGetArray(zz,&z);
809: v = a->a;
810: xb = x;
812: for (i=0; i<mbs; i++) {
813: n = ai[1] - ai[0]; /* length of i_th block row of A */
814: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
815: ib = aj + *ai;
816: jmin = 0;
817: nonzerorow += (n>0);
818: if (n && *ib == i) { /* (diag of A)*x */
819: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
820: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
821: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
822: v += 9; jmin++;
823: }
824: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
825: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
826: for (j=jmin; j<n; j++) {
827: /* (strict lower triangular part of A)*x */
828: cval = ib[j]*3;
829: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
830: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
831: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
832: /* (strict upper triangular part of A)*x */
833: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
834: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
835: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
836: v += 9;
837: }
838: xb +=3; ai++;
839: }
841: VecRestoreArrayRead(xx,&x);
842: VecRestoreArray(zz,&z);
844: PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));
845: return 0;
846: }
848: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
849: {
850: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
851: PetscScalar *z,x1,x2,x3,x4;
852: const PetscScalar *x,*xb;
853: const MatScalar *v;
854: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
855: const PetscInt *aj=a->j,*ai=a->i,*ib;
856: PetscInt nonzerorow=0;
858: VecCopy(yy,zz);
859: if (!a->nz) return 0;
860: VecGetArrayRead(xx,&x);
861: VecGetArray(zz,&z);
863: v = a->a;
864: xb = x;
866: for (i=0; i<mbs; i++) {
867: n = ai[1] - ai[0]; /* length of i_th block row of A */
868: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
869: ib = aj + *ai;
870: jmin = 0;
871: nonzerorow += (n>0);
872: if (n && *ib == i) { /* (diag of A)*x */
873: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
874: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
875: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
876: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
877: v += 16; jmin++;
878: }
879: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
880: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
881: for (j=jmin; j<n; j++) {
882: /* (strict lower triangular part of A)*x */
883: cval = ib[j]*4;
884: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
885: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
886: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
887: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
888: /* (strict upper triangular part of A)*x */
889: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
890: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
891: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
892: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
893: v += 16;
894: }
895: xb +=4; ai++;
896: }
898: VecRestoreArrayRead(xx,&x);
899: VecRestoreArray(zz,&z);
901: PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));
902: return 0;
903: }
905: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
906: {
907: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
908: PetscScalar *z,x1,x2,x3,x4,x5;
909: const PetscScalar *x,*xb;
910: const MatScalar *v;
911: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
912: const PetscInt *aj=a->j,*ai=a->i,*ib;
913: PetscInt nonzerorow=0;
915: VecCopy(yy,zz);
916: if (!a->nz) return 0;
917: VecGetArrayRead(xx,&x);
918: VecGetArray(zz,&z);
920: v = a->a;
921: xb = x;
923: for (i=0; i<mbs; i++) {
924: n = ai[1] - ai[0]; /* length of i_th block row of A */
925: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
926: ib = aj + *ai;
927: jmin = 0;
928: nonzerorow += (n>0);
929: if (n && *ib == i) { /* (diag of A)*x */
930: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
931: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
932: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
933: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
934: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
935: v += 25; jmin++;
936: }
937: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
938: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
939: for (j=jmin; j<n; j++) {
940: /* (strict lower triangular part of A)*x */
941: cval = ib[j]*5;
942: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
943: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
944: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
945: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
946: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
947: /* (strict upper triangular part of A)*x */
948: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
949: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
950: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
951: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
952: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
953: v += 25;
954: }
955: xb +=5; ai++;
956: }
958: VecRestoreArrayRead(xx,&x);
959: VecRestoreArray(zz,&z);
961: PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));
962: return 0;
963: }
965: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
966: {
967: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
968: PetscScalar *z,x1,x2,x3,x4,x5,x6;
969: const PetscScalar *x,*xb;
970: const MatScalar *v;
971: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
972: const PetscInt *aj=a->j,*ai=a->i,*ib;
973: PetscInt nonzerorow=0;
975: VecCopy(yy,zz);
976: if (!a->nz) return 0;
977: VecGetArrayRead(xx,&x);
978: VecGetArray(zz,&z);
980: v = a->a;
981: xb = x;
983: for (i=0; i<mbs; i++) {
984: n = ai[1] - ai[0]; /* length of i_th block row of A */
985: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
986: ib = aj + *ai;
987: jmin = 0;
988: nonzerorow += (n>0);
989: if (n && *ib == i) { /* (diag of A)*x */
990: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
991: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
992: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
993: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
994: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
995: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
996: v += 36; jmin++;
997: }
998: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
999: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1000: for (j=jmin; j<n; j++) {
1001: /* (strict lower triangular part of A)*x */
1002: cval = ib[j]*6;
1003: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1004: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1005: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1006: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1007: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1008: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1009: /* (strict upper triangular part of A)*x */
1010: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1011: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1012: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1013: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1014: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1015: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1016: v += 36;
1017: }
1018: xb +=6; ai++;
1019: }
1021: VecRestoreArrayRead(xx,&x);
1022: VecRestoreArray(zz,&z);
1024: PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));
1025: return 0;
1026: }
1028: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1029: {
1030: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1031: PetscScalar *z,x1,x2,x3,x4,x5,x6,x7;
1032: const PetscScalar *x,*xb;
1033: const MatScalar *v;
1034: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
1035: const PetscInt *aj=a->j,*ai=a->i,*ib;
1036: PetscInt nonzerorow=0;
1038: VecCopy(yy,zz);
1039: if (!a->nz) return 0;
1040: VecGetArrayRead(xx,&x);
1041: VecGetArray(zz,&z);
1043: v = a->a;
1044: xb = x;
1046: for (i=0; i<mbs; i++) {
1047: n = ai[1] - ai[0]; /* length of i_th block row of A */
1048: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1049: ib = aj + *ai;
1050: jmin = 0;
1051: nonzerorow += (n>0);
1052: if (n && *ib == i) { /* (diag of A)*x */
1053: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1054: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1055: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1056: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1057: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1058: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1059: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1060: v += 49; jmin++;
1061: }
1062: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1063: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1064: for (j=jmin; j<n; j++) {
1065: /* (strict lower triangular part of A)*x */
1066: cval = ib[j]*7;
1067: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1068: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1069: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1070: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1071: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1072: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1073: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1074: /* (strict upper triangular part of A)*x */
1075: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1076: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1077: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1078: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1079: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1080: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1081: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1082: v += 49;
1083: }
1084: xb +=7; ai++;
1085: }
1087: VecRestoreArrayRead(xx,&x);
1088: VecRestoreArray(zz,&z);
1090: PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));
1091: return 0;
1092: }
1094: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1095: {
1096: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1097: PetscScalar *z,*z_ptr=NULL,*zb,*work,*workt;
1098: const PetscScalar *x,*x_ptr,*xb;
1099: const MatScalar *v;
1100: PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1101: const PetscInt *idx,*aj,*ii;
1102: PetscInt nonzerorow=0;
1104: VecCopy(yy,zz);
1105: if (!a->nz) return 0;
1106: VecGetArrayRead(xx,&x); x_ptr=x;
1107: VecGetArray(zz,&z); z_ptr=z;
1109: aj = a->j;
1110: v = a->a;
1111: ii = a->i;
1113: if (!a->mult_work) {
1114: PetscMalloc1(A->rmap->n+1,&a->mult_work);
1115: }
1116: work = a->mult_work;
1118: for (i=0; i<mbs; i++) {
1119: n = ii[1] - ii[0]; ncols = n*bs;
1120: workt = work; idx=aj+ii[0];
1121: nonzerorow += (n>0);
1123: /* upper triangular part */
1124: for (j=0; j<n; j++) {
1125: xb = x_ptr + bs*(*idx++);
1126: for (k=0; k<bs; k++) workt[k] = xb[k];
1127: workt += bs;
1128: }
1129: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1130: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1132: /* strict lower triangular part */
1133: idx = aj+ii[0];
1134: if (n && *idx == i) {
1135: ncols -= bs; v += bs2; idx++; n--;
1136: }
1137: if (ncols > 0) {
1138: workt = work;
1139: PetscArrayzero(workt,ncols);
1140: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1141: for (j=0; j<n; j++) {
1142: zb = z_ptr + bs*(*idx++);
1143: for (k=0; k<bs; k++) zb[k] += workt[k];
1144: workt += bs;
1145: }
1146: }
1148: x += bs; v += n*bs2; z += bs; ii++;
1149: }
1151: VecRestoreArrayRead(xx,&x);
1152: VecRestoreArray(zz,&z);
1154: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
1155: return 0;
1156: }
1158: PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1159: {
1160: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1161: PetscScalar oalpha = alpha;
1162: PetscBLASInt one = 1,totalnz;
1164: PetscBLASIntCast(a->bs2*a->nz,&totalnz);
1165: PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1166: PetscLogFlops(totalnz);
1167: return 0;
1168: }
1170: PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1171: {
1172: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1173: const MatScalar *v = a->a;
1174: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1175: PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1176: const PetscInt *aj=a->j,*col;
1178: if (!a->nz) {
1179: *norm = 0.0;
1180: return 0;
1181: }
1182: if (type == NORM_FROBENIUS) {
1183: for (k=0; k<mbs; k++) {
1184: jmin = a->i[k];
1185: jmax = a->i[k+1];
1186: col = aj + jmin;
1187: if (jmax-jmin > 0 && *col == k) { /* diagonal block */
1188: for (i=0; i<bs2; i++) {
1189: sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1190: }
1191: jmin++;
1192: }
1193: for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */
1194: for (i=0; i<bs2; i++) {
1195: sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1196: }
1197: }
1198: }
1199: *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1200: PetscLogFlops(2.0*bs2*a->nz);
1201: } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1202: PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);
1203: for (i=0; i<mbs; i++) jl[i] = mbs;
1204: il[0] = 0;
1206: *norm = 0.0;
1207: for (k=0; k<mbs; k++) { /* k_th block row */
1208: for (j=0; j<bs; j++) sum[j]=0.0;
1209: /*-- col sum --*/
1210: i = jl[k]; /* first |A(i,k)| to be added */
1211: /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1212: at step k */
1213: while (i<mbs) {
1214: nexti = jl[i]; /* next block row to be added */
1215: ik = il[i]; /* block index of A(i,k) in the array a */
1216: for (j=0; j<bs; j++) {
1217: v = a->a + ik*bs2 + j*bs;
1218: for (k1=0; k1<bs; k1++) {
1219: sum[j] += PetscAbsScalar(*v); v++;
1220: }
1221: }
1222: /* update il, jl */
1223: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1224: jmax = a->i[i+1];
1225: if (jmin < jmax) {
1226: il[i] = jmin;
1227: j = a->j[jmin];
1228: jl[i] = jl[j]; jl[j]=i;
1229: }
1230: i = nexti;
1231: }
1232: /*-- row sum --*/
1233: jmin = a->i[k];
1234: jmax = a->i[k+1];
1235: for (i=jmin; i<jmax; i++) {
1236: for (j=0; j<bs; j++) {
1237: v = a->a + i*bs2 + j;
1238: for (k1=0; k1<bs; k1++) {
1239: sum[j] += PetscAbsScalar(*v); v += bs;
1240: }
1241: }
1242: }
1243: /* add k_th block row to il, jl */
1244: col = aj+jmin;
1245: if (jmax - jmin > 0 && *col == k) jmin++;
1246: if (jmin < jmax) {
1247: il[k] = jmin;
1248: j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1249: }
1250: for (j=0; j<bs; j++) {
1251: if (sum[j] > *norm) *norm = sum[j];
1252: }
1253: }
1254: PetscFree3(sum,il,jl);
1255: PetscLogFlops(PetscMax(mbs*a->nz-1,0));
1256: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1257: return 0;
1258: }
1260: PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1261: {
1262: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1264: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1265: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1266: *flg = PETSC_FALSE;
1267: return 0;
1268: }
1270: /* if the a->i are the same */
1271: PetscArraycmp(a->i,b->i,a->mbs+1,flg);
1272: if (!*flg) return 0;
1274: /* if a->j are the same */
1275: PetscArraycmp(a->j,b->j,a->nz,flg);
1276: if (!*flg) return 0;
1278: /* if a->a are the same */
1279: PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);
1280: return 0;
1281: }
1283: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1284: {
1285: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1286: PetscInt i,j,k,row,bs,ambs,bs2;
1287: const PetscInt *ai,*aj;
1288: PetscScalar *x,zero = 0.0;
1289: const MatScalar *aa,*aa_j;
1291: bs = A->rmap->bs;
1294: aa = a->a;
1295: ambs = a->mbs;
1297: if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1298: PetscInt *diag=a->diag;
1299: aa = a->a;
1300: ambs = a->mbs;
1301: VecGetArray(v,&x);
1302: for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1303: VecRestoreArray(v,&x);
1304: return 0;
1305: }
1307: ai = a->i;
1308: aj = a->j;
1309: bs2 = a->bs2;
1310: VecSet(v,zero);
1311: if (!a->nz) return 0;
1312: VecGetArray(v,&x);
1313: for (i=0; i<ambs; i++) {
1314: j = ai[i];
1315: if (aj[j] == i) { /* if this is a diagonal element */
1316: row = i*bs;
1317: aa_j = aa + j*bs2;
1318: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1319: }
1320: }
1321: VecRestoreArray(v,&x);
1322: return 0;
1323: }
1325: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1326: {
1327: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1328: PetscScalar x;
1329: const PetscScalar *l,*li,*ri;
1330: MatScalar *aa,*v;
1331: PetscInt i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1332: const PetscInt *ai,*aj;
1333: PetscBool flg;
1335: if (ll != rr) {
1336: VecEqual(ll,rr,&flg);
1338: }
1339: if (!ll) return 0;
1340: ai = a->i;
1341: aj = a->j;
1342: aa = a->a;
1343: m = A->rmap->N;
1344: bs = A->rmap->bs;
1345: mbs = a->mbs;
1346: bs2 = a->bs2;
1348: VecGetArrayRead(ll,&l);
1349: VecGetLocalSize(ll,&lm);
1351: for (i=0; i<mbs; i++) { /* for each block row */
1352: M = ai[i+1] - ai[i];
1353: li = l + i*bs;
1354: v = aa + bs2*ai[i];
1355: for (j=0; j<M; j++) { /* for each block */
1356: ri = l + bs*aj[ai[i]+j];
1357: for (k=0; k<bs; k++) {
1358: x = ri[k];
1359: for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1360: }
1361: }
1362: }
1363: VecRestoreArrayRead(ll,&l);
1364: PetscLogFlops(2.0*a->nz);
1365: return 0;
1366: }
1368: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1369: {
1370: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1372: info->block_size = a->bs2;
1373: info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */
1374: info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1375: info->nz_unneeded = info->nz_allocated - info->nz_used;
1376: info->assemblies = A->num_ass;
1377: info->mallocs = A->info.mallocs;
1378: info->memory = ((PetscObject)A)->mem;
1379: if (A->factortype) {
1380: info->fill_ratio_given = A->info.fill_ratio_given;
1381: info->fill_ratio_needed = A->info.fill_ratio_needed;
1382: info->factor_mallocs = A->info.factor_mallocs;
1383: } else {
1384: info->fill_ratio_given = 0;
1385: info->fill_ratio_needed = 0;
1386: info->factor_mallocs = 0;
1387: }
1388: return 0;
1389: }
1391: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1392: {
1393: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1395: PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);
1396: return 0;
1397: }
1399: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1400: {
1401: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1402: PetscInt i,j,n,row,col,bs,mbs;
1403: const PetscInt *ai,*aj;
1404: PetscReal atmp;
1405: const MatScalar *aa;
1406: PetscScalar *x;
1407: PetscInt ncols,brow,bcol,krow,kcol;
1411: bs = A->rmap->bs;
1412: aa = a->a;
1413: ai = a->i;
1414: aj = a->j;
1415: mbs = a->mbs;
1417: VecSet(v,0.0);
1418: VecGetArray(v,&x);
1419: VecGetLocalSize(v,&n);
1421: for (i=0; i<mbs; i++) {
1422: ncols = ai[1] - ai[0]; ai++;
1423: brow = bs*i;
1424: for (j=0; j<ncols; j++) {
1425: bcol = bs*(*aj);
1426: for (kcol=0; kcol<bs; kcol++) {
1427: col = bcol + kcol; /* col index */
1428: for (krow=0; krow<bs; krow++) {
1429: atmp = PetscAbsScalar(*aa); aa++;
1430: row = brow + krow; /* row index */
1431: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1432: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1433: }
1434: }
1435: aj++;
1436: }
1437: }
1438: VecRestoreArray(v,&x);
1439: return 0;
1440: }
1442: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1443: {
1444: MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
1445: C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1446: return 0;
1447: }
1449: PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1450: {
1451: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1452: PetscScalar *z = c;
1453: const PetscScalar *xb;
1454: PetscScalar x1;
1455: const MatScalar *v = a->a,*vv;
1456: PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1457: #if defined(PETSC_USE_COMPLEX)
1458: const int aconj = A->hermitian;
1459: #else
1460: const int aconj = 0;
1461: #endif
1463: for (i=0; i<mbs; i++) {
1464: n = ii[1] - ii[0]; ii++;
1465: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1466: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1467: jj = idx;
1468: vv = v;
1469: for (k=0; k<cn; k++) {
1470: idx = jj;
1471: v = vv;
1472: for (j=0; j<n; j++) {
1473: xb = b + (*idx); x1 = xb[0+k*bm];
1474: z[0+k*cm] += v[0]*x1;
1475: if (*idx != i) c[(*idx)+k*cm] += (aconj ? PetscConj(v[0]) : v[0])*b[i+k*bm];
1476: v += 1;
1477: ++idx;
1478: }
1479: }
1480: z += 1;
1481: }
1482: return 0;
1483: }
1485: PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1486: {
1487: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1488: PetscScalar *z = c;
1489: const PetscScalar *xb;
1490: PetscScalar x1,x2;
1491: const MatScalar *v = a->a,*vv;
1492: PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1494: for (i=0; i<mbs; i++) {
1495: n = ii[1] - ii[0]; ii++;
1496: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1497: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1498: jj = idx;
1499: vv = v;
1500: for (k=0; k<cn; k++) {
1501: idx = jj;
1502: v = vv;
1503: for (j=0; j<n; j++) {
1504: xb = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
1505: z[0+k*cm] += v[0]*x1 + v[2]*x2;
1506: z[1+k*cm] += v[1]*x1 + v[3]*x2;
1507: if (*idx != i) {
1508: c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm];
1509: c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm];
1510: }
1511: v += 4;
1512: ++idx;
1513: }
1514: }
1515: z += 2;
1516: }
1517: return 0;
1518: }
1520: PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1521: {
1522: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1523: PetscScalar *z = c;
1524: const PetscScalar *xb;
1525: PetscScalar x1,x2,x3;
1526: const MatScalar *v = a->a,*vv;
1527: PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1529: for (i=0; i<mbs; i++) {
1530: n = ii[1] - ii[0]; ii++;
1531: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1532: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1533: jj = idx;
1534: vv = v;
1535: for (k=0; k<cn; k++) {
1536: idx = jj;
1537: v = vv;
1538: for (j=0; j<n; j++) {
1539: xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
1540: z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3;
1541: z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3;
1542: z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3;
1543: if (*idx != i) {
1544: c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm];
1545: c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm];
1546: c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm];
1547: }
1548: v += 9;
1549: ++idx;
1550: }
1551: }
1552: z += 3;
1553: }
1554: return 0;
1555: }
1557: PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1558: {
1559: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1560: PetscScalar *z = c;
1561: const PetscScalar *xb;
1562: PetscScalar x1,x2,x3,x4;
1563: const MatScalar *v = a->a,*vv;
1564: PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1566: for (i=0; i<mbs; i++) {
1567: n = ii[1] - ii[0]; ii++;
1568: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1569: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1570: jj = idx;
1571: vv = v;
1572: for (k=0; k<cn; k++) {
1573: idx = jj;
1574: v = vv;
1575: for (j=0; j<n; j++) {
1576: xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
1577: z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1578: z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1579: z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1580: z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1581: if (*idx != i) {
1582: c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm] + v[12]*b[4*i+3+k*bm];
1583: c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm] + v[13]*b[4*i+3+k*bm];
1584: c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm];
1585: c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm];
1586: }
1587: v += 16;
1588: ++idx;
1589: }
1590: }
1591: z += 4;
1592: }
1593: return 0;
1594: }
1596: PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1597: {
1598: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1599: PetscScalar *z = c;
1600: const PetscScalar *xb;
1601: PetscScalar x1,x2,x3,x4,x5;
1602: const MatScalar *v = a->a,*vv;
1603: PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1605: for (i=0; i<mbs; i++) {
1606: n = ii[1] - ii[0]; ii++;
1607: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1608: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1609: jj = idx;
1610: vv = v;
1611: for (k=0; k<cn; k++) {
1612: idx = jj;
1613: v = vv;
1614: for (j=0; j<n; j++) {
1615: xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm];
1616: z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1617: z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1618: z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1619: z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1620: z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1621: if (*idx != i) {
1622: c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm];
1623: c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm];
1624: c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm];
1625: c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm];
1626: c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm];
1627: }
1628: v += 25;
1629: ++idx;
1630: }
1631: }
1632: z += 5;
1633: }
1634: return 0;
1635: }
1637: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)
1638: {
1639: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1640: Mat_SeqDense *bd = (Mat_SeqDense*)B->data;
1641: Mat_SeqDense *cd = (Mat_SeqDense*)C->data;
1642: PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
1643: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1644: PetscBLASInt bbs,bcn,bbm,bcm;
1645: PetscScalar *z = NULL;
1646: PetscScalar *c,*b;
1647: const MatScalar *v;
1648: const PetscInt *idx,*ii;
1649: PetscScalar _DOne=1.0;
1651: if (!cm || !cn) return 0;
1655: b = bd->v;
1656: MatZeroEntries(C);
1657: MatDenseGetArray(C,&c);
1658: switch (bs) {
1659: case 1:
1660: MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn);
1661: break;
1662: case 2:
1663: MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn);
1664: break;
1665: case 3:
1666: MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn);
1667: break;
1668: case 4:
1669: MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn);
1670: break;
1671: case 5:
1672: MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn);
1673: break;
1674: default: /* block sizes larger than 5 by 5 are handled by BLAS */
1675: PetscBLASIntCast(bs,&bbs);
1676: PetscBLASIntCast(cn,&bcn);
1677: PetscBLASIntCast(bm,&bbm);
1678: PetscBLASIntCast(cm,&bcm);
1679: idx = a->j;
1680: v = a->a;
1681: mbs = a->mbs;
1682: ii = a->i;
1683: z = c;
1684: for (i=0; i<mbs; i++) {
1685: n = ii[1] - ii[0]; ii++;
1686: for (j=0; j<n; j++) {
1687: if (*idx != i) PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm));
1688: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
1689: v += bs2;
1690: }
1691: z += bs;
1692: }
1693: }
1694: MatDenseRestoreArray(C,&c);
1695: PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn);
1696: return 0;
1697: }