Actual source code: sbaijfact.c
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <../src/mat/impls/sbaij/seq/sbaij.h>
4: #include <petsc/private/kernels/blockinvert.h>
5: #include <petscis.h>
7: PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,PetscInt *nneg,PetscInt *nzero,PetscInt *npos)
8: {
9: Mat_SeqSBAIJ *fact=(Mat_SeqSBAIJ*)F->data;
10: MatScalar *dd=fact->a;
11: PetscInt mbs=fact->mbs,bs=F->rmap->bs,i,nneg_tmp,npos_tmp,*fi=fact->diag;
15: nneg_tmp = 0; npos_tmp = 0;
16: for (i=0; i<mbs; i++) {
17: if (PetscRealPart(dd[*fi]) > 0.0) npos_tmp++;
18: else if (PetscRealPart(dd[*fi]) < 0.0) nneg_tmp++;
19: fi++;
20: }
21: if (nneg) *nneg = nneg_tmp;
22: if (npos) *npos = npos_tmp;
23: if (nzero) *nzero = mbs - nneg_tmp - npos_tmp;
24: return 0;
25: }
27: /*
28: Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
29: Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
30: */
31: PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat F,Mat A,IS perm,const MatFactorInfo *info)
32: {
33: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
34: const PetscInt *rip,*ai,*aj;
35: PetscInt i,mbs = a->mbs,*jutmp,bs = A->rmap->bs,bs2=a->bs2;
36: PetscInt m,reallocs = 0,prow;
37: PetscInt *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
38: PetscReal f = info->fill;
39: PetscBool perm_identity;
41: /* check whether perm is the identity mapping */
42: ISIdentity(perm,&perm_identity);
43: ISGetIndices(perm,&rip);
45: if (perm_identity) { /* without permutation */
46: a->permute = PETSC_FALSE;
48: ai = a->i; aj = a->j;
49: } else { /* non-trivial permutation */
50: a->permute = PETSC_TRUE;
52: MatReorderingSeqSBAIJ(A,perm);
54: ai = a->inew; aj = a->jnew;
55: }
57: /* initialization */
58: PetscMalloc1(mbs+1,&iu);
59: umax = (PetscInt)(f*ai[mbs] + 1); umax += mbs + 1;
60: PetscMalloc1(umax,&ju);
61: iu[0] = mbs+1;
62: juidx = mbs + 1; /* index for ju */
63: /* jl linked list for pivot row -- linked list for col index */
64: PetscMalloc2(mbs,&jl,mbs,&q);
65: for (i=0; i<mbs; i++) {
66: jl[i] = mbs;
67: q[i] = 0;
68: }
70: /* for each row k */
71: for (k=0; k<mbs; k++) {
72: for (i=0; i<mbs; i++) q[i] = 0; /* to be removed! */
73: nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
74: q[k] = mbs;
75: /* initialize nonzero structure of k-th row to row rip[k] of A */
76: jmin = ai[rip[k]] +1; /* exclude diag[k] */
77: jmax = ai[rip[k]+1];
78: for (j=jmin; j<jmax; j++) {
79: vj = rip[aj[j]]; /* col. value */
80: if (vj > k) {
81: qm = k;
82: do {
83: m = qm; qm = q[m];
84: } while (qm < vj);
86: nzk++;
87: q[m] = vj;
88: q[vj] = qm;
89: } /* if (vj > k) */
90: } /* for (j=jmin; j<jmax; j++) */
92: /* modify nonzero structure of k-th row by computing fill-in
93: for each row i to be merged in */
94: prow = k;
95: prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
97: while (prow < k) {
98: /* merge row prow into k-th row */
99: jmin = iu[prow] + 1; jmax = iu[prow+1];
100: qm = k;
101: for (j=jmin; j<jmax; j++) {
102: vj = ju[j];
103: do {
104: m = qm; qm = q[m];
105: } while (qm < vj);
106: if (qm != vj) {
107: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
108: }
109: }
110: prow = jl[prow]; /* next pivot row */
111: }
113: /* add k to row list for first nonzero element in k-th row */
114: if (nzk > 0) {
115: i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
116: jl[k] = jl[i]; jl[i] = k;
117: }
118: iu[k+1] = iu[k] + nzk;
120: /* allocate more space to ju if needed */
121: if (iu[k+1] > umax) {
122: /* estimate how much additional space we will need */
123: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
124: /* just double the memory each time */
125: maxadd = umax;
126: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
127: umax += maxadd;
129: /* allocate a longer ju */
130: PetscMalloc1(umax,&jutmp);
131: PetscArraycpy(jutmp,ju,iu[k]);
132: PetscFree(ju);
133: ju = jutmp;
134: reallocs++; /* count how many times we realloc */
135: }
137: /* save nonzero structure of k-th row in ju */
138: i=k;
139: while (nzk--) {
140: i = q[i];
141: ju[juidx++] = i;
142: }
143: }
145: #if defined(PETSC_USE_INFO)
146: if (ai[mbs] != 0) {
147: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
148: PetscInfo(A,"Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);
149: PetscInfo(A,"Run with -pc_factor_fill %g or use \n",(double)af);
150: PetscInfo(A,"PCFactorSetFill(pc,%g);\n",(double)af);
151: PetscInfo(A,"for best performance.\n");
152: } else {
153: PetscInfo(A,"Empty matrix\n");
154: }
155: #endif
157: ISRestoreIndices(perm,&rip);
158: PetscFree2(jl,q);
160: /* put together the new matrix */
161: MatSeqSBAIJSetPreallocation(F,bs,MAT_SKIP_ALLOCATION,NULL);
163: /* PetscLogObjectParent((PetscObject)B,(PetscObject)iperm); */
164: b = (Mat_SeqSBAIJ*)(F)->data;
165: b->singlemalloc = PETSC_FALSE;
166: b->free_a = PETSC_TRUE;
167: b->free_ij = PETSC_TRUE;
169: PetscMalloc1((iu[mbs]+1)*bs2,&b->a);
170: b->j = ju;
171: b->i = iu;
172: b->diag = NULL;
173: b->ilen = NULL;
174: b->imax = NULL;
175: b->row = perm;
177: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
179: PetscObjectReference((PetscObject)perm);
181: b->icol = perm;
182: PetscObjectReference((PetscObject)perm);
183: PetscMalloc1(bs*mbs+bs,&b->solve_work);
184: /* In b structure: Free imax, ilen, old a, old j.
185: Allocate idnew, solve_work, new a, new j */
186: PetscLogObjectMemory((PetscObject)F,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
187: b->maxnz = b->nz = iu[mbs];
189: (F)->info.factor_mallocs = reallocs;
190: (F)->info.fill_ratio_given = f;
191: if (ai[mbs] != 0) {
192: (F)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
193: } else {
194: (F)->info.fill_ratio_needed = 0.0;
195: }
196: MatSeqSBAIJSetNumericFactorization_inplace(F,perm_identity);
197: return 0;
198: }
199: /*
200: Symbolic U^T*D*U factorization for SBAIJ format.
201: See MatICCFactorSymbolic_SeqAIJ() for description of its data structure.
202: */
203: #include <petscbt.h>
204: #include <../src/mat/utils/freespace.h>
205: PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
206: {
207: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
208: Mat_SeqSBAIJ *b;
209: PetscBool perm_identity,missing;
210: PetscReal fill = info->fill;
211: const PetscInt *rip,*ai=a->i,*aj=a->j;
212: PetscInt i,mbs=a->mbs,bs=A->rmap->bs,reallocs=0,prow;
213: PetscInt *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
214: PetscInt nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr,*udiag;
215: PetscFreeSpaceList free_space=NULL,current_space=NULL;
216: PetscBT lnkbt;
219: MatMissingDiagonal(A,&missing,&i);
221: if (bs > 1) {
222: MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
223: return 0;
224: }
226: /* check whether perm is the identity mapping */
227: ISIdentity(perm,&perm_identity);
229: a->permute = PETSC_FALSE;
230: ISGetIndices(perm,&rip);
232: /* initialization */
233: PetscMalloc1(mbs+1,&ui);
234: PetscMalloc1(mbs+1,&udiag);
235: ui[0] = 0;
237: /* jl: linked list for storing indices of the pivot rows
238: il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
239: PetscMalloc4(mbs,&ui_ptr,mbs,&il,mbs,&jl,mbs,&cols);
240: for (i=0; i<mbs; i++) {
241: jl[i] = mbs; il[i] = 0;
242: }
244: /* create and initialize a linked list for storing column indices of the active row k */
245: nlnk = mbs + 1;
246: PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);
248: /* initial FreeSpace size is fill*(ai[mbs]+1) */
249: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[mbs]+1),&free_space);
250: current_space = free_space;
252: for (k=0; k<mbs; k++) { /* for each active row k */
253: /* initialize lnk by the column indices of row rip[k] of A */
254: nzk = 0;
255: ncols = ai[k+1] - ai[k];
257: for (j=0; j<ncols; j++) {
258: i = *(aj + ai[k] + j);
259: cols[j] = i;
260: }
261: PetscLLAdd(ncols,cols,mbs,&nlnk,lnk,lnkbt);
262: nzk += nlnk;
264: /* update lnk by computing fill-in for each pivot row to be merged in */
265: prow = jl[k]; /* 1st pivot row */
267: while (prow < k) {
268: nextprow = jl[prow];
269: /* merge prow into k-th row */
270: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
271: jmax = ui[prow+1];
272: ncols = jmax-jmin;
273: uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
274: PetscLLAddSorted(ncols,uj_ptr,mbs,&nlnk,lnk,lnkbt);
275: nzk += nlnk;
277: /* update il and jl for prow */
278: if (jmin < jmax) {
279: il[prow] = jmin;
280: j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
281: }
282: prow = nextprow;
283: }
285: /* if free space is not available, make more free space */
286: if (current_space->local_remaining<nzk) {
287: i = mbs - k + 1; /* num of unfactored rows */
288: i = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
289: PetscFreeSpaceGet(i,¤t_space);
290: reallocs++;
291: }
293: /* copy data into free space, then initialize lnk */
294: PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);
296: /* add the k-th row into il and jl */
297: if (nzk > 1) {
298: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
299: jl[k] = jl[i]; jl[i] = k;
300: il[k] = ui[k] + 1;
301: }
302: ui_ptr[k] = current_space->array;
304: current_space->array += nzk;
305: current_space->local_used += nzk;
306: current_space->local_remaining -= nzk;
308: ui[k+1] = ui[k] + nzk;
309: }
311: ISRestoreIndices(perm,&rip);
312: PetscFree4(ui_ptr,il,jl,cols);
314: /* destroy list of free space and other temporary array(s) */
315: PetscMalloc1(ui[mbs]+1,&uj);
316: PetscFreeSpaceContiguous_Cholesky(&free_space,uj,mbs,ui,udiag); /* store matrix factor */
317: PetscLLDestroy(lnk,lnkbt);
319: /* put together the new matrix in MATSEQSBAIJ format */
320: MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);
322: b = (Mat_SeqSBAIJ*)fact->data;
323: b->singlemalloc = PETSC_FALSE;
324: b->free_a = PETSC_TRUE;
325: b->free_ij = PETSC_TRUE;
327: PetscMalloc1(ui[mbs]+1,&b->a);
329: b->j = uj;
330: b->i = ui;
331: b->diag = udiag;
332: b->free_diag = PETSC_TRUE;
333: b->ilen = NULL;
334: b->imax = NULL;
335: b->row = perm;
336: b->icol = perm;
338: PetscObjectReference((PetscObject)perm);
339: PetscObjectReference((PetscObject)perm);
341: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
343: PetscMalloc1(mbs+1,&b->solve_work);
344: PetscLogObjectMemory((PetscObject)fact,ui[mbs]*(sizeof(PetscInt)+sizeof(MatScalar)));
346: b->maxnz = b->nz = ui[mbs];
348: fact->info.factor_mallocs = reallocs;
349: fact->info.fill_ratio_given = fill;
350: if (ai[mbs] != 0) {
351: fact->info.fill_ratio_needed = ((PetscReal)ui[mbs])/ai[mbs];
352: } else {
353: fact->info.fill_ratio_needed = 0.0;
354: }
355: #if defined(PETSC_USE_INFO)
356: if (ai[mbs] != 0) {
357: PetscReal af = fact->info.fill_ratio_needed;
358: PetscInfo(A,"Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
359: PetscInfo(A,"Run with -pc_factor_fill %g or use \n",(double)af);
360: PetscInfo(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
361: } else {
362: PetscInfo(A,"Empty matrix\n");
363: }
364: #endif
365: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
366: return 0;
367: }
369: PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
370: {
371: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
372: Mat_SeqSBAIJ *b;
373: PetscBool perm_identity,missing;
374: PetscReal fill = info->fill;
375: const PetscInt *rip,*ai,*aj;
376: PetscInt i,mbs=a->mbs,bs=A->rmap->bs,reallocs=0,prow,d;
377: PetscInt *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
378: PetscInt nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr;
379: PetscFreeSpaceList free_space=NULL,current_space=NULL;
380: PetscBT lnkbt;
382: MatMissingDiagonal(A,&missing,&d);
385: /*
386: This code originally uses Modified Sparse Row (MSR) storage
387: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choise!
388: Then it is rewritten so the factor B takes seqsbaij format. However the associated
389: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
390: thus the original code in MSR format is still used for these cases.
391: The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
392: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
393: */
394: if (bs > 1) {
395: MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
396: return 0;
397: }
399: /* check whether perm is the identity mapping */
400: ISIdentity(perm,&perm_identity);
402: a->permute = PETSC_FALSE;
403: ai = a->i; aj = a->j;
404: ISGetIndices(perm,&rip);
406: /* initialization */
407: PetscMalloc1(mbs+1,&ui);
408: ui[0] = 0;
410: /* jl: linked list for storing indices of the pivot rows
411: il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
412: PetscMalloc4(mbs,&ui_ptr,mbs,&il,mbs,&jl,mbs,&cols);
413: for (i=0; i<mbs; i++) {
414: jl[i] = mbs; il[i] = 0;
415: }
417: /* create and initialize a linked list for storing column indices of the active row k */
418: nlnk = mbs + 1;
419: PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);
421: /* initial FreeSpace size is fill*(ai[mbs]+1) */
422: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[mbs]+1),&free_space);
423: current_space = free_space;
425: for (k=0; k<mbs; k++) { /* for each active row k */
426: /* initialize lnk by the column indices of row rip[k] of A */
427: nzk = 0;
428: ncols = ai[rip[k]+1] - ai[rip[k]];
429: for (j=0; j<ncols; j++) {
430: i = *(aj + ai[rip[k]] + j);
431: cols[j] = rip[i];
432: }
433: PetscLLAdd(ncols,cols,mbs,&nlnk,lnk,lnkbt);
434: nzk += nlnk;
436: /* update lnk by computing fill-in for each pivot row to be merged in */
437: prow = jl[k]; /* 1st pivot row */
439: while (prow < k) {
440: nextprow = jl[prow];
441: /* merge prow into k-th row */
442: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
443: jmax = ui[prow+1];
444: ncols = jmax-jmin;
445: uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
446: PetscLLAddSorted(ncols,uj_ptr,mbs,&nlnk,lnk,lnkbt);
447: nzk += nlnk;
449: /* update il and jl for prow */
450: if (jmin < jmax) {
451: il[prow] = jmin;
453: j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
454: }
455: prow = nextprow;
456: }
458: /* if free space is not available, make more free space */
459: if (current_space->local_remaining<nzk) {
460: i = mbs - k + 1; /* num of unfactored rows */
461: i = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
462: PetscFreeSpaceGet(i,¤t_space);
463: reallocs++;
464: }
466: /* copy data into free space, then initialize lnk */
467: PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);
469: /* add the k-th row into il and jl */
470: if (nzk-1 > 0) {
471: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
472: jl[k] = jl[i]; jl[i] = k;
473: il[k] = ui[k] + 1;
474: }
475: ui_ptr[k] = current_space->array;
477: current_space->array += nzk;
478: current_space->local_used += nzk;
479: current_space->local_remaining -= nzk;
481: ui[k+1] = ui[k] + nzk;
482: }
484: ISRestoreIndices(perm,&rip);
485: PetscFree4(ui_ptr,il,jl,cols);
487: /* destroy list of free space and other temporary array(s) */
488: PetscMalloc1(ui[mbs]+1,&uj);
489: PetscFreeSpaceContiguous(&free_space,uj);
490: PetscLLDestroy(lnk,lnkbt);
492: /* put together the new matrix in MATSEQSBAIJ format */
493: MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);
495: b = (Mat_SeqSBAIJ*)fact->data;
496: b->singlemalloc = PETSC_FALSE;
497: b->free_a = PETSC_TRUE;
498: b->free_ij = PETSC_TRUE;
500: PetscMalloc1(ui[mbs]+1,&b->a);
502: b->j = uj;
503: b->i = ui;
504: b->diag = NULL;
505: b->ilen = NULL;
506: b->imax = NULL;
507: b->row = perm;
509: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
511: PetscObjectReference((PetscObject)perm);
512: b->icol = perm;
513: PetscObjectReference((PetscObject)perm);
514: PetscMalloc1(mbs+1,&b->solve_work);
515: PetscLogObjectMemory((PetscObject)fact,(ui[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
516: b->maxnz = b->nz = ui[mbs];
518: fact->info.factor_mallocs = reallocs;
519: fact->info.fill_ratio_given = fill;
520: if (ai[mbs] != 0) {
521: fact->info.fill_ratio_needed = ((PetscReal)ui[mbs])/ai[mbs];
522: } else {
523: fact->info.fill_ratio_needed = 0.0;
524: }
525: #if defined(PETSC_USE_INFO)
526: if (ai[mbs] != 0) {
527: PetscReal af = fact->info.fill_ratio_needed;
528: PetscInfo(A,"Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
529: PetscInfo(A,"Run with -pc_factor_fill %g or use \n",(double)af);
530: PetscInfo(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
531: } else {
532: PetscInfo(A,"Empty matrix\n");
533: }
534: #endif
535: MatSeqSBAIJSetNumericFactorization_inplace(fact,perm_identity);
536: return 0;
537: }
539: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat C,Mat A,const MatFactorInfo *info)
540: {
541: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
542: IS perm = b->row;
543: const PetscInt *ai,*aj,*perm_ptr,mbs=a->mbs,*bi=b->i,*bj=b->j;
544: PetscInt i,j;
545: PetscInt *a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
546: PetscInt bs =A->rmap->bs,bs2 = a->bs2;
547: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
548: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
549: MatScalar *work;
550: PetscInt *pivots;
551: PetscBool allowzeropivot,zeropivotdetected;
553: /* initialization */
554: PetscCalloc1(bs2*mbs,&rtmp);
555: PetscMalloc2(mbs,&il,mbs,&jl);
556: allowzeropivot = PetscNot(A->erroriffailure);
558: il[0] = 0;
559: for (i=0; i<mbs; i++) jl[i] = mbs;
561: PetscMalloc3(bs2,&dk,bs2,&uik,bs,&work);
562: PetscMalloc1(bs,&pivots);
564: ISGetIndices(perm,&perm_ptr);
566: /* check permutation */
567: if (!a->permute) {
568: ai = a->i; aj = a->j; aa = a->a;
569: } else {
570: ai = a->inew; aj = a->jnew;
571: PetscMalloc1(bs2*ai[mbs],&aa);
572: PetscArraycpy(aa,a->a,bs2*ai[mbs]);
573: PetscMalloc1(ai[mbs],&a2anew);
574: PetscArraycpy(a2anew,a->a2anew,ai[mbs]);
576: for (i=0; i<mbs; i++) {
577: jmin = ai[i]; jmax = ai[i+1];
578: for (j=jmin; j<jmax; j++) {
579: while (a2anew[j] != j) {
580: k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
581: for (k1=0; k1<bs2; k1++) {
582: dk[k1] = aa[k*bs2+k1];
583: aa[k*bs2+k1] = aa[j*bs2+k1];
584: aa[j*bs2+k1] = dk[k1];
585: }
586: }
587: /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
588: if (i > aj[j]) {
589: ap = aa + j*bs2; /* ptr to the beginning of j-th block of aa */
590: for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
591: for (k=0; k<bs; k++) { /* j-th block of aa <- dk^T */
592: for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
593: }
594: }
595: }
596: }
597: PetscFree(a2anew);
598: }
600: /* for each row k */
601: for (k = 0; k<mbs; k++) {
603: /*initialize k-th row with elements nonzero in row perm(k) of A */
604: jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
606: ap = aa + jmin*bs2;
607: for (j = jmin; j < jmax; j++) {
608: vj = perm_ptr[aj[j]]; /* block col. index */
609: rtmp_ptr = rtmp + vj*bs2;
610: for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
611: }
613: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
614: PetscArraycpy(dk,rtmp+k*bs2,bs2);
615: i = jl[k]; /* first row to be added to k_th row */
617: while (i < k) {
618: nexti = jl[i]; /* next row to be added to k_th row */
620: /* compute multiplier */
621: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
623: /* uik = -inv(Di)*U_bar(i,k) */
624: diag = ba + i*bs2;
625: u = ba + ili*bs2;
626: PetscArrayzero(uik,bs2);
627: PetscKernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
629: /* update D(k) += -U(i,k)^T * U_bar(i,k) */
630: PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
631: PetscLogFlops(4.0*bs*bs2);
633: /* update -U(i,k) */
634: PetscArraycpy(ba+ili*bs2,uik,bs2);
636: /* add multiple of row i to k-th row ... */
637: jmin = ili + 1; jmax = bi[i+1];
638: if (jmin < jmax) {
639: for (j=jmin; j<jmax; j++) {
640: /* rtmp += -U(i,k)^T * U_bar(i,j) */
641: rtmp_ptr = rtmp + bj[j]*bs2;
642: u = ba + j*bs2;
643: PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
644: }
645: PetscLogFlops(2.0*bs*bs2*(jmax-jmin));
647: /* ... add i to row list for next nonzero entry */
648: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
649: j = bj[jmin];
650: jl[i] = jl[j]; jl[j] = i; /* update jl */
651: }
652: i = nexti;
653: }
655: /* save nonzero entries in k-th row of U ... */
657: /* invert diagonal block */
658: diag = ba+k*bs2;
659: PetscArraycpy(diag,dk,bs2);
661: PetscKernel_A_gets_inverse_A(bs,diag,pivots,work,allowzeropivot,&zeropivotdetected);
662: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
664: jmin = bi[k]; jmax = bi[k+1];
665: if (jmin < jmax) {
666: for (j=jmin; j<jmax; j++) {
667: vj = bj[j]; /* block col. index of U */
668: u = ba + j*bs2;
669: rtmp_ptr = rtmp + vj*bs2;
670: for (k1=0; k1<bs2; k1++) {
671: *u++ = *rtmp_ptr;
672: *rtmp_ptr++ = 0.0;
673: }
674: }
676: /* ... add k to row list for first nonzero entry in k-th row */
677: il[k] = jmin;
678: i = bj[jmin];
679: jl[k] = jl[i]; jl[i] = k;
680: }
681: }
683: PetscFree(rtmp);
684: PetscFree2(il,jl);
685: PetscFree3(dk,uik,work);
686: PetscFree(pivots);
687: if (a->permute) {
688: PetscFree(aa);
689: }
691: ISRestoreIndices(perm,&perm_ptr);
693: C->ops->solve = MatSolve_SeqSBAIJ_N_inplace;
694: C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_inplace;
695: C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_N_inplace;
696: C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_N_inplace;
698: C->assembled = PETSC_TRUE;
699: C->preallocated = PETSC_TRUE;
701: PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
702: return 0;
703: }
705: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
706: {
707: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
708: PetscInt i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
709: PetscInt *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
710: PetscInt bs =A->rmap->bs,bs2 = a->bs2;
711: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
712: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
713: MatScalar *work;
714: PetscInt *pivots;
715: PetscBool allowzeropivot,zeropivotdetected;
717: PetscCalloc1(bs2*mbs,&rtmp);
718: PetscMalloc2(mbs,&il,mbs,&jl);
719: il[0] = 0;
720: for (i=0; i<mbs; i++) jl[i] = mbs;
722: PetscMalloc3(bs2,&dk,bs2,&uik,bs,&work);
723: PetscMalloc1(bs,&pivots);
724: allowzeropivot = PetscNot(A->erroriffailure);
726: ai = a->i; aj = a->j; aa = a->a;
728: /* for each row k */
729: for (k = 0; k<mbs; k++) {
731: /*initialize k-th row with elements nonzero in row k of A */
732: jmin = ai[k]; jmax = ai[k+1];
733: ap = aa + jmin*bs2;
734: for (j = jmin; j < jmax; j++) {
735: vj = aj[j]; /* block col. index */
736: rtmp_ptr = rtmp + vj*bs2;
737: for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
738: }
740: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
741: PetscArraycpy(dk,rtmp+k*bs2,bs2);
742: i = jl[k]; /* first row to be added to k_th row */
744: while (i < k) {
745: nexti = jl[i]; /* next row to be added to k_th row */
747: /* compute multiplier */
748: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
750: /* uik = -inv(Di)*U_bar(i,k) */
751: diag = ba + i*bs2;
752: u = ba + ili*bs2;
753: PetscArrayzero(uik,bs2);
754: PetscKernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
756: /* update D(k) += -U(i,k)^T * U_bar(i,k) */
757: PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
758: PetscLogFlops(2.0*bs*bs2);
760: /* update -U(i,k) */
761: PetscArraycpy(ba+ili*bs2,uik,bs2);
763: /* add multiple of row i to k-th row ... */
764: jmin = ili + 1; jmax = bi[i+1];
765: if (jmin < jmax) {
766: for (j=jmin; j<jmax; j++) {
767: /* rtmp += -U(i,k)^T * U_bar(i,j) */
768: rtmp_ptr = rtmp + bj[j]*bs2;
769: u = ba + j*bs2;
770: PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
771: }
772: PetscLogFlops(2.0*bs*bs2*(jmax-jmin));
774: /* ... add i to row list for next nonzero entry */
775: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
776: j = bj[jmin];
777: jl[i] = jl[j]; jl[j] = i; /* update jl */
778: }
779: i = nexti;
780: }
782: /* save nonzero entries in k-th row of U ... */
784: /* invert diagonal block */
785: diag = ba+k*bs2;
786: PetscArraycpy(diag,dk,bs2);
788: PetscKernel_A_gets_inverse_A(bs,diag,pivots,work,allowzeropivot,&zeropivotdetected);
789: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
791: jmin = bi[k]; jmax = bi[k+1];
792: if (jmin < jmax) {
793: for (j=jmin; j<jmax; j++) {
794: vj = bj[j]; /* block col. index of U */
795: u = ba + j*bs2;
796: rtmp_ptr = rtmp + vj*bs2;
797: for (k1=0; k1<bs2; k1++) {
798: *u++ = *rtmp_ptr;
799: *rtmp_ptr++ = 0.0;
800: }
801: }
803: /* ... add k to row list for first nonzero entry in k-th row */
804: il[k] = jmin;
805: i = bj[jmin];
806: jl[k] = jl[i]; jl[i] = k;
807: }
808: }
810: PetscFree(rtmp);
811: PetscFree2(il,jl);
812: PetscFree3(dk,uik,work);
813: PetscFree(pivots);
815: C->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
816: C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
817: C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
818: C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace;
819: C->assembled = PETSC_TRUE;
820: C->preallocated = PETSC_TRUE;
822: PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
823: return 0;
824: }
826: /*
827: Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
828: Version for blocks 2 by 2.
829: */
830: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat C,Mat A,const MatFactorInfo *info)
831: {
832: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
833: IS perm = b->row;
834: const PetscInt *ai,*aj,*perm_ptr;
835: PetscInt i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
836: PetscInt *a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
837: MatScalar *ba = b->a,*aa,*ap;
838: MatScalar *u,*diag,*rtmp,*rtmp_ptr,dk[4],uik[4];
839: PetscReal shift = info->shiftamount;
840: PetscBool allowzeropivot,zeropivotdetected;
842: allowzeropivot = PetscNot(A->erroriffailure);
844: /* initialization */
845: /* il and jl record the first nonzero element in each row of the accessing
846: window U(0:k, k:mbs-1).
847: jl: list of rows to be added to uneliminated rows
848: i>= k: jl(i) is the first row to be added to row i
849: i< k: jl(i) is the row following row i in some list of rows
850: jl(i) = mbs indicates the end of a list
851: il(i): points to the first nonzero element in columns k,...,mbs-1 of
852: row i of U */
853: PetscCalloc1(4*mbs,&rtmp);
854: PetscMalloc2(mbs,&il,mbs,&jl);
855: il[0] = 0;
856: for (i=0; i<mbs; i++) jl[i] = mbs;
858: ISGetIndices(perm,&perm_ptr);
860: /* check permutation */
861: if (!a->permute) {
862: ai = a->i; aj = a->j; aa = a->a;
863: } else {
864: ai = a->inew; aj = a->jnew;
865: PetscMalloc1(4*ai[mbs],&aa);
866: PetscArraycpy(aa,a->a,4*ai[mbs]);
867: PetscMalloc1(ai[mbs],&a2anew);
868: PetscArraycpy(a2anew,a->a2anew,ai[mbs]);
870: for (i=0; i<mbs; i++) {
871: jmin = ai[i]; jmax = ai[i+1];
872: for (j=jmin; j<jmax; j++) {
873: while (a2anew[j] != j) {
874: k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
875: for (k1=0; k1<4; k1++) {
876: dk[k1] = aa[k*4+k1];
877: aa[k*4+k1] = aa[j*4+k1];
878: aa[j*4+k1] = dk[k1];
879: }
880: }
881: /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
882: if (i > aj[j]) {
883: ap = aa + j*4; /* ptr to the beginning of the block */
884: dk[1] = ap[1]; /* swap ap[1] and ap[2] */
885: ap[1] = ap[2];
886: ap[2] = dk[1];
887: }
888: }
889: }
890: PetscFree(a2anew);
891: }
893: /* for each row k */
894: for (k = 0; k<mbs; k++) {
896: /*initialize k-th row with elements nonzero in row perm(k) of A */
897: jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
898: ap = aa + jmin*4;
899: for (j = jmin; j < jmax; j++) {
900: vj = perm_ptr[aj[j]]; /* block col. index */
901: rtmp_ptr = rtmp + vj*4;
902: for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
903: }
905: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
906: PetscArraycpy(dk,rtmp+k*4,4);
907: i = jl[k]; /* first row to be added to k_th row */
909: while (i < k) {
910: nexti = jl[i]; /* next row to be added to k_th row */
912: /* compute multiplier */
913: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
915: /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
916: diag = ba + i*4;
917: u = ba + ili*4;
918: uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
919: uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
920: uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
921: uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
923: /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
924: dk[0] += uik[0]*u[0] + uik[1]*u[1];
925: dk[1] += uik[2]*u[0] + uik[3]*u[1];
926: dk[2] += uik[0]*u[2] + uik[1]*u[3];
927: dk[3] += uik[2]*u[2] + uik[3]*u[3];
929: PetscLogFlops(16.0*2.0);
931: /* update -U(i,k): ba[ili] = uik */
932: PetscArraycpy(ba+ili*4,uik,4);
934: /* add multiple of row i to k-th row ... */
935: jmin = ili + 1; jmax = bi[i+1];
936: if (jmin < jmax) {
937: for (j=jmin; j<jmax; j++) {
938: /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
939: rtmp_ptr = rtmp + bj[j]*4;
940: u = ba + j*4;
941: rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
942: rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
943: rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
944: rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
945: }
946: PetscLogFlops(16.0*(jmax-jmin));
948: /* ... add i to row list for next nonzero entry */
949: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
950: j = bj[jmin];
951: jl[i] = jl[j]; jl[j] = i; /* update jl */
952: }
953: i = nexti;
954: }
956: /* save nonzero entries in k-th row of U ... */
958: /* invert diagonal block */
959: diag = ba+k*4;
960: PetscArraycpy(diag,dk,4);
961: PetscKernel_A_gets_inverse_A_2(diag,shift,allowzeropivot,&zeropivotdetected);
962: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
964: jmin = bi[k]; jmax = bi[k+1];
965: if (jmin < jmax) {
966: for (j=jmin; j<jmax; j++) {
967: vj = bj[j]; /* block col. index of U */
968: u = ba + j*4;
969: rtmp_ptr = rtmp + vj*4;
970: for (k1=0; k1<4; k1++) {
971: *u++ = *rtmp_ptr;
972: *rtmp_ptr++ = 0.0;
973: }
974: }
976: /* ... add k to row list for first nonzero entry in k-th row */
977: il[k] = jmin;
978: i = bj[jmin];
979: jl[k] = jl[i]; jl[i] = k;
980: }
981: }
983: PetscFree(rtmp);
984: PetscFree2(il,jl);
985: if (a->permute) {
986: PetscFree(aa);
987: }
988: ISRestoreIndices(perm,&perm_ptr);
990: C->ops->solve = MatSolve_SeqSBAIJ_2_inplace;
991: C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_inplace;
992: C->assembled = PETSC_TRUE;
993: C->preallocated = PETSC_TRUE;
995: PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
996: return 0;
997: }
999: /*
1000: Version for when blocks are 2 by 2 Using natural ordering
1001: */
1002: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
1003: {
1004: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)C->data;
1005: PetscInt i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1006: PetscInt *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1007: MatScalar *ba = b->a,*aa,*ap,dk[8],uik[8];
1008: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
1009: PetscReal shift = info->shiftamount;
1010: PetscBool allowzeropivot,zeropivotdetected;
1012: allowzeropivot = PetscNot(A->erroriffailure);
1014: /* initialization */
1015: /* il and jl record the first nonzero element in each row of the accessing
1016: window U(0:k, k:mbs-1).
1017: jl: list of rows to be added to uneliminated rows
1018: i>= k: jl(i) is the first row to be added to row i
1019: i< k: jl(i) is the row following row i in some list of rows
1020: jl(i) = mbs indicates the end of a list
1021: il(i): points to the first nonzero element in columns k,...,mbs-1 of
1022: row i of U */
1023: PetscCalloc1(4*mbs,&rtmp);
1024: PetscMalloc2(mbs,&il,mbs,&jl);
1025: il[0] = 0;
1026: for (i=0; i<mbs; i++) jl[i] = mbs;
1028: ai = a->i; aj = a->j; aa = a->a;
1030: /* for each row k */
1031: for (k = 0; k<mbs; k++) {
1033: /*initialize k-th row with elements nonzero in row k of A */
1034: jmin = ai[k]; jmax = ai[k+1];
1035: ap = aa + jmin*4;
1036: for (j = jmin; j < jmax; j++) {
1037: vj = aj[j]; /* block col. index */
1038: rtmp_ptr = rtmp + vj*4;
1039: for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
1040: }
1042: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1043: PetscArraycpy(dk,rtmp+k*4,4);
1044: i = jl[k]; /* first row to be added to k_th row */
1046: while (i < k) {
1047: nexti = jl[i]; /* next row to be added to k_th row */
1049: /* compute multiplier */
1050: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1052: /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
1053: diag = ba + i*4;
1054: u = ba + ili*4;
1055: uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
1056: uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
1057: uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
1058: uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
1060: /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
1061: dk[0] += uik[0]*u[0] + uik[1]*u[1];
1062: dk[1] += uik[2]*u[0] + uik[3]*u[1];
1063: dk[2] += uik[0]*u[2] + uik[1]*u[3];
1064: dk[3] += uik[2]*u[2] + uik[3]*u[3];
1066: PetscLogFlops(16.0*2.0);
1068: /* update -U(i,k): ba[ili] = uik */
1069: PetscArraycpy(ba+ili*4,uik,4);
1071: /* add multiple of row i to k-th row ... */
1072: jmin = ili + 1; jmax = bi[i+1];
1073: if (jmin < jmax) {
1074: for (j=jmin; j<jmax; j++) {
1075: /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1076: rtmp_ptr = rtmp + bj[j]*4;
1077: u = ba + j*4;
1078: rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
1079: rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
1080: rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
1081: rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
1082: }
1083: PetscLogFlops(16.0*(jmax-jmin));
1085: /* ... add i to row list for next nonzero entry */
1086: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
1087: j = bj[jmin];
1088: jl[i] = jl[j]; jl[j] = i; /* update jl */
1089: }
1090: i = nexti;
1091: }
1093: /* save nonzero entries in k-th row of U ... */
1095: /* invert diagonal block */
1096: diag = ba+k*4;
1097: PetscArraycpy(diag,dk,4);
1098: PetscKernel_A_gets_inverse_A_2(diag,shift,allowzeropivot,&zeropivotdetected);
1099: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1101: jmin = bi[k]; jmax = bi[k+1];
1102: if (jmin < jmax) {
1103: for (j=jmin; j<jmax; j++) {
1104: vj = bj[j]; /* block col. index of U */
1105: u = ba + j*4;
1106: rtmp_ptr = rtmp + vj*4;
1107: for (k1=0; k1<4; k1++) {
1108: *u++ = *rtmp_ptr;
1109: *rtmp_ptr++ = 0.0;
1110: }
1111: }
1113: /* ... add k to row list for first nonzero entry in k-th row */
1114: il[k] = jmin;
1115: i = bj[jmin];
1116: jl[k] = jl[i]; jl[i] = k;
1117: }
1118: }
1120: PetscFree(rtmp);
1121: PetscFree2(il,jl);
1123: C->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1124: C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1125: C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1126: C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace;
1127: C->assembled = PETSC_TRUE;
1128: C->preallocated = PETSC_TRUE;
1130: PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
1131: return 0;
1132: }
1134: /*
1135: Numeric U^T*D*U factorization for SBAIJ format.
1136: Version for blocks are 1 by 1.
1137: */
1138: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace(Mat C,Mat A,const MatFactorInfo *info)
1139: {
1140: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ*)C->data;
1141: IS ip=b->row;
1142: const PetscInt *ai,*aj,*rip;
1143: PetscInt *a2anew,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j,*bcol;
1144: PetscInt k,jmin,jmax,*jl,*il,col,nexti,ili,nz;
1145: MatScalar *rtmp,*ba=b->a,*bval,*aa,dk,uikdi;
1146: PetscReal rs;
1147: FactorShiftCtx sctx;
1149: /* MatPivotSetUp(): initialize shift context sctx */
1150: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1152: ISGetIndices(ip,&rip);
1153: if (!a->permute) {
1154: ai = a->i; aj = a->j; aa = a->a;
1155: } else {
1156: ai = a->inew; aj = a->jnew;
1157: nz = ai[mbs];
1158: PetscMalloc1(nz,&aa);
1159: a2anew = a->a2anew;
1160: bval = a->a;
1161: for (j=0; j<nz; j++) {
1162: aa[a2anew[j]] = *(bval++);
1163: }
1164: }
1166: /* initialization */
1167: /* il and jl record the first nonzero element in each row of the accessing
1168: window U(0:k, k:mbs-1).
1169: jl: list of rows to be added to uneliminated rows
1170: i>= k: jl(i) is the first row to be added to row i
1171: i< k: jl(i) is the row following row i in some list of rows
1172: jl(i) = mbs indicates the end of a list
1173: il(i): points to the first nonzero element in columns k,...,mbs-1 of
1174: row i of U */
1175: PetscMalloc3(mbs,&rtmp,mbs,&il,mbs,&jl);
1177: do {
1178: sctx.newshift = PETSC_FALSE;
1179: il[0] = 0;
1180: for (i=0; i<mbs; i++) {
1181: rtmp[i] = 0.0; jl[i] = mbs;
1182: }
1184: for (k = 0; k<mbs; k++) {
1185: /*initialize k-th row by the perm[k]-th row of A */
1186: jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
1187: bval = ba + bi[k];
1188: for (j = jmin; j < jmax; j++) {
1189: col = rip[aj[j]];
1190: rtmp[col] = aa[j];
1191: *bval++ = 0.0; /* for in-place factorization */
1192: }
1194: /* shift the diagonal of the matrix */
1195: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1197: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1198: dk = rtmp[k];
1199: i = jl[k]; /* first row to be added to k_th row */
1201: while (i < k) {
1202: nexti = jl[i]; /* next row to be added to k_th row */
1204: /* compute multiplier, update diag(k) and U(i,k) */
1205: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1206: uikdi = -ba[ili]*ba[bi[i]]; /* diagonal(k) */
1207: dk += uikdi*ba[ili];
1208: ba[ili] = uikdi; /* -U(i,k) */
1210: /* add multiple of row i to k-th row */
1211: jmin = ili + 1; jmax = bi[i+1];
1212: if (jmin < jmax) {
1213: for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1214: PetscLogFlops(2.0*(jmax-jmin));
1216: /* update il and jl for row i */
1217: il[i] = jmin;
1218: j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1219: }
1220: i = nexti;
1221: }
1223: /* shift the diagonals when zero pivot is detected */
1224: /* compute rs=sum of abs(off-diagonal) */
1225: rs = 0.0;
1226: jmin = bi[k]+1;
1227: nz = bi[k+1] - jmin;
1228: if (nz) {
1229: bcol = bj + jmin;
1230: while (nz--) {
1231: rs += PetscAbsScalar(rtmp[*bcol]);
1232: bcol++;
1233: }
1234: }
1236: sctx.rs = rs;
1237: sctx.pv = dk;
1238: MatPivotCheck(C,A,info,&sctx,k);
1239: if (sctx.newshift) break; /* sctx.shift_amount is updated */
1240: dk = sctx.pv;
1242: /* copy data into U(k,:) */
1243: ba[bi[k]] = 1.0/dk; /* U(k,k) */
1244: jmin = bi[k]+1; jmax = bi[k+1];
1245: if (jmin < jmax) {
1246: for (j=jmin; j<jmax; j++) {
1247: col = bj[j]; ba[j] = rtmp[col]; rtmp[col] = 0.0;
1248: }
1249: /* add the k-th row into il and jl */
1250: il[k] = jmin;
1251: i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1252: }
1253: }
1254: } while (sctx.newshift);
1255: PetscFree3(rtmp,il,jl);
1256: if (a->permute) PetscFree(aa);
1258: ISRestoreIndices(ip,&rip);
1260: C->ops->solve = MatSolve_SeqSBAIJ_1_inplace;
1261: C->ops->solves = MatSolves_SeqSBAIJ_1_inplace;
1262: C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_inplace;
1263: C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_1_inplace;
1264: C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_1_inplace;
1265: C->assembled = PETSC_TRUE;
1266: C->preallocated = PETSC_TRUE;
1268: PetscLogFlops(C->rmap->N);
1269: if (sctx.nshift) {
1270: if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1271: PetscInfo(A,"number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1272: } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1273: PetscInfo(A,"number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1274: }
1275: }
1276: return 0;
1277: }
1279: /*
1280: Version for when blocks are 1 by 1 Using natural ordering under new datastructure
1281: Modified from MatCholeskyFactorNumeric_SeqAIJ()
1282: */
1283: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
1284: {
1285: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1286: Mat_SeqSBAIJ *b=(Mat_SeqSBAIJ*)B->data;
1287: PetscInt i,j,mbs=A->rmap->n,*bi=b->i,*bj=b->j,*bdiag=b->diag,*bjtmp;
1288: PetscInt *ai=a->i,*aj=a->j,*ajtmp;
1289: PetscInt k,jmin,jmax,*c2r,*il,col,nexti,ili,nz;
1290: MatScalar *rtmp,*ba=b->a,*bval,*aa=a->a,dk,uikdi;
1291: FactorShiftCtx sctx;
1292: PetscReal rs;
1293: MatScalar d,*v;
1295: PetscMalloc3(mbs,&rtmp,mbs,&il,mbs,&c2r);
1297: /* MatPivotSetUp(): initialize shift context sctx */
1298: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1300: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1301: sctx.shift_top = info->zeropivot;
1303: PetscArrayzero(rtmp,mbs);
1305: for (i=0; i<mbs; i++) {
1306: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1307: d = (aa)[a->diag[i]];
1308: rtmp[i] += -PetscRealPart(d); /* diagonal entry */
1309: ajtmp = aj + ai[i] + 1; /* exclude diagonal */
1310: v = aa + ai[i] + 1;
1311: nz = ai[i+1] - ai[i] - 1;
1312: for (j=0; j<nz; j++) {
1313: rtmp[i] += PetscAbsScalar(v[j]);
1314: rtmp[ajtmp[j]] += PetscAbsScalar(v[j]);
1315: }
1316: if (PetscRealPart(rtmp[i]) > sctx.shift_top) sctx.shift_top = PetscRealPart(rtmp[i]);
1317: }
1318: sctx.shift_top *= 1.1;
1319: sctx.nshift_max = 5;
1320: sctx.shift_lo = 0.;
1321: sctx.shift_hi = 1.;
1322: }
1324: /* allocate working arrays
1325: c2r: linked list, keep track of pivot rows for a given column. c2r[col]: head of the list for a given col
1326: il: for active k row, il[i] gives the index of the 1st nonzero entry in U[i,k:n-1] in bj and ba arrays
1327: */
1328: do {
1329: sctx.newshift = PETSC_FALSE;
1331: for (i=0; i<mbs; i++) c2r[i] = mbs;
1332: if (mbs) il[0] = 0;
1334: for (k = 0; k<mbs; k++) {
1335: /* zero rtmp */
1336: nz = bi[k+1] - bi[k];
1337: bjtmp = bj + bi[k];
1338: for (j=0; j<nz; j++) rtmp[bjtmp[j]] = 0.0;
1340: /* load in initial unfactored row */
1341: bval = ba + bi[k];
1342: jmin = ai[k]; jmax = ai[k+1];
1343: for (j = jmin; j < jmax; j++) {
1344: col = aj[j];
1345: rtmp[col] = aa[j];
1346: *bval++ = 0.0; /* for in-place factorization */
1347: }
1348: /* shift the diagonal of the matrix: ZeropivotApply() */
1349: rtmp[k] += sctx.shift_amount; /* shift the diagonal of the matrix */
1351: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1352: dk = rtmp[k];
1353: i = c2r[k]; /* first row to be added to k_th row */
1355: while (i < k) {
1356: nexti = c2r[i]; /* next row to be added to k_th row */
1358: /* compute multiplier, update diag(k) and U(i,k) */
1359: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1360: uikdi = -ba[ili]*ba[bdiag[i]]; /* diagonal(k) */
1361: dk += uikdi*ba[ili]; /* update diag[k] */
1362: ba[ili] = uikdi; /* -U(i,k) */
1364: /* add multiple of row i to k-th row */
1365: jmin = ili + 1; jmax = bi[i+1];
1366: if (jmin < jmax) {
1367: for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1368: /* update il and c2r for row i */
1369: il[i] = jmin;
1370: j = bj[jmin]; c2r[i] = c2r[j]; c2r[j] = i;
1371: }
1372: i = nexti;
1373: }
1375: /* copy data into U(k,:) */
1376: rs = 0.0;
1377: jmin = bi[k]; jmax = bi[k+1]-1;
1378: if (jmin < jmax) {
1379: for (j=jmin; j<jmax; j++) {
1380: col = bj[j]; ba[j] = rtmp[col]; rs += PetscAbsScalar(ba[j]);
1381: }
1382: /* add the k-th row into il and c2r */
1383: il[k] = jmin;
1384: i = bj[jmin]; c2r[k] = c2r[i]; c2r[i] = k;
1385: }
1387: sctx.rs = rs;
1388: sctx.pv = dk;
1389: MatPivotCheck(B,A,info,&sctx,k);
1390: if (sctx.newshift) break;
1391: dk = sctx.pv;
1393: ba[bdiag[k]] = 1.0/dk; /* U(k,k) */
1394: }
1395: } while (sctx.newshift);
1397: PetscFree3(rtmp,il,c2r);
1399: B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1400: B->ops->solves = MatSolves_SeqSBAIJ_1;
1401: B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1402: B->ops->matsolve = MatMatSolve_SeqSBAIJ_1_NaturalOrdering;
1403: B->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering;
1404: B->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering;
1406: B->assembled = PETSC_TRUE;
1407: B->preallocated = PETSC_TRUE;
1409: PetscLogFlops(B->rmap->n);
1411: /* MatPivotView() */
1412: if (sctx.nshift) {
1413: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1414: PetscInfo(A,"number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);
1415: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1416: PetscInfo(A,"number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1417: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1418: PetscInfo(A,"number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);
1419: }
1420: }
1421: return 0;
1422: }
1424: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
1425: {
1426: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ*)C->data;
1427: PetscInt i,j,mbs = a->mbs;
1428: PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
1429: PetscInt k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz;
1430: MatScalar *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
1431: PetscReal rs;
1432: FactorShiftCtx sctx;
1434: /* MatPivotSetUp(): initialize shift context sctx */
1435: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1437: /* initialization */
1438: /* il and jl record the first nonzero element in each row of the accessing
1439: window U(0:k, k:mbs-1).
1440: jl: list of rows to be added to uneliminated rows
1441: i>= k: jl(i) is the first row to be added to row i
1442: i< k: jl(i) is the row following row i in some list of rows
1443: jl(i) = mbs indicates the end of a list
1444: il(i): points to the first nonzero element in U(i,k:mbs-1)
1445: */
1446: PetscMalloc1(mbs,&rtmp);
1447: PetscMalloc2(mbs,&il,mbs,&jl);
1449: do {
1450: sctx.newshift = PETSC_FALSE;
1451: il[0] = 0;
1452: for (i=0; i<mbs; i++) {
1453: rtmp[i] = 0.0; jl[i] = mbs;
1454: }
1456: for (k = 0; k<mbs; k++) {
1457: /*initialize k-th row with elements nonzero in row perm(k) of A */
1458: nz = ai[k+1] - ai[k];
1459: acol = aj + ai[k];
1460: aval = aa + ai[k];
1461: bval = ba + bi[k];
1462: while (nz--) {
1463: rtmp[*acol++] = *aval++;
1464: *bval++ = 0.0; /* for in-place factorization */
1465: }
1467: /* shift the diagonal of the matrix */
1468: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1470: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1471: dk = rtmp[k];
1472: i = jl[k]; /* first row to be added to k_th row */
1474: while (i < k) {
1475: nexti = jl[i]; /* next row to be added to k_th row */
1476: /* compute multiplier, update D(k) and U(i,k) */
1477: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1478: uikdi = -ba[ili]*ba[bi[i]];
1479: dk += uikdi*ba[ili];
1480: ba[ili] = uikdi; /* -U(i,k) */
1482: /* add multiple of row i to k-th row ... */
1483: jmin = ili + 1;
1484: nz = bi[i+1] - jmin;
1485: if (nz > 0) {
1486: bcol = bj + jmin;
1487: bval = ba + jmin;
1488: PetscLogFlops(2.0*nz);
1489: while (nz--) rtmp[*bcol++] += uikdi*(*bval++);
1491: /* update il and jl for i-th row */
1492: il[i] = jmin;
1493: j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1494: }
1495: i = nexti;
1496: }
1498: /* shift the diagonals when zero pivot is detected */
1499: /* compute rs=sum of abs(off-diagonal) */
1500: rs = 0.0;
1501: jmin = bi[k]+1;
1502: nz = bi[k+1] - jmin;
1503: if (nz) {
1504: bcol = bj + jmin;
1505: while (nz--) {
1506: rs += PetscAbsScalar(rtmp[*bcol]);
1507: bcol++;
1508: }
1509: }
1511: sctx.rs = rs;
1512: sctx.pv = dk;
1513: MatPivotCheck(C,A,info,&sctx,k);
1514: if (sctx.newshift) break; /* sctx.shift_amount is updated */
1515: dk = sctx.pv;
1517: /* copy data into U(k,:) */
1518: ba[bi[k]] = 1.0/dk;
1519: jmin = bi[k]+1;
1520: nz = bi[k+1] - jmin;
1521: if (nz) {
1522: bcol = bj + jmin;
1523: bval = ba + jmin;
1524: while (nz--) {
1525: *bval++ = rtmp[*bcol];
1526: rtmp[*bcol++] = 0.0;
1527: }
1528: /* add k-th row into il and jl */
1529: il[k] = jmin;
1530: i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1531: }
1532: } /* end of for (k = 0; k<mbs; k++) */
1533: } while (sctx.newshift);
1534: PetscFree(rtmp);
1535: PetscFree2(il,jl);
1537: C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1538: C->ops->solves = MatSolves_SeqSBAIJ_1_inplace;
1539: C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1540: C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1541: C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1543: C->assembled = PETSC_TRUE;
1544: C->preallocated = PETSC_TRUE;
1546: PetscLogFlops(C->rmap->N);
1547: if (sctx.nshift) {
1548: if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1549: PetscInfo(A,"number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1550: } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1551: PetscInfo(A,"number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);
1552: }
1553: }
1554: return 0;
1555: }
1557: PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,const MatFactorInfo *info)
1558: {
1559: Mat C;
1561: MatGetFactor(A,"petsc",MAT_FACTOR_CHOLESKY,&C);
1562: MatCholeskyFactorSymbolic(C,A,perm,info);
1563: MatCholeskyFactorNumeric(C,A,info);
1565: A->ops->solve = C->ops->solve;
1566: A->ops->solvetranspose = C->ops->solvetranspose;
1568: MatHeaderMerge(A,&C);
1569: return 0;
1570: }