Actual source code: baijfact2.c
petsc-3.6.1 2015-08-06
2: /*
3: Factorization code for BAIJ format.
4: */
6: #include <../src/mat/impls/baij/seq/baij.h>
7: #include <petsc/private/kernels/blockinvert.h>
8: #include <petscbt.h>
9: #include <../src/mat/utils/freespace.h>
11: /* ----------------------------------------------------------------*/
12: extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool);
16: /*
17: This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
18: */
19: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
20: {
21: Mat C =B;
22: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
23: PetscErrorCode ierr;
24: PetscInt i,j,k,ipvt[15];
25: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
26: PetscInt nz,nzL,row;
27: MatScalar *rtmp,*pc,*mwork,*pv,*vv,work[225];
28: const MatScalar *v,*aa=a->a;
29: PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg;
30: PetscInt sol_ver;
33: PetscOptionsGetInt(((PetscObject)A)->prefix,"-sol_ver",&sol_ver,NULL);
35: /* generate work space needed by the factorization */
36: PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);
37: PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));
39: for (i=0; i<n; i++) {
40: /* zero rtmp */
41: /* L part */
42: nz = bi[i+1] - bi[i];
43: bjtmp = bj + bi[i];
44: for (j=0; j<nz; j++) {
45: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
46: }
48: /* U part */
49: nz = bdiag[i] - bdiag[i+1];
50: bjtmp = bj + bdiag[i+1]+1;
51: for (j=0; j<nz; j++) {
52: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
53: }
55: /* load in initial (unfactored row) */
56: nz = ai[i+1] - ai[i];
57: ajtmp = aj + ai[i];
58: v = aa + bs2*ai[i];
59: for (j=0; j<nz; j++) {
60: PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));
61: }
63: /* elimination */
64: bjtmp = bj + bi[i];
65: nzL = bi[i+1] - bi[i];
66: for (k=0; k < nzL; k++) {
67: row = bjtmp[k];
68: pc = rtmp + bs2*row;
69: for (flg=0,j=0; j<bs2; j++) {
70: if (pc[j]!=0.0) {
71: flg = 1;
72: break;
73: }
74: }
75: if (flg) {
76: pv = b->a + bs2*bdiag[row];
77: PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork);
78: /*PetscKernel_A_gets_A_times_B_15(pc,pv,mwork);*/
79: pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
80: pv = b->a + bs2*(bdiag[row+1]+1);
81: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
82: for (j=0; j<nz; j++) {
83: vv = rtmp + bs2*pj[j];
84: PetscKernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
85: /* PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv); */
86: pv += bs2;
87: }
88: PetscLogFlops(2*bs2*bs*(nz+1)-bs2); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
89: }
90: }
92: /* finished row so stick it into b->a */
93: /* L part */
94: pv = b->a + bs2*bi[i];
95: pj = b->j + bi[i];
96: nz = bi[i+1] - bi[i];
97: for (j=0; j<nz; j++) {
98: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
99: }
101: /* Mark diagonal and invert diagonal for simplier triangular solves */
102: pv = b->a + bs2*bdiag[i];
103: pj = b->j + bdiag[i];
104: PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
105: /* PetscKernel_A_gets_inverse_A(bs,pv,pivots,work); */
106: PetscKernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);
108: /* U part */
109: pv = b->a + bs2*(bdiag[i+1]+1);
110: pj = b->j + bdiag[i+1]+1;
111: nz = bdiag[i] - bdiag[i+1] - 1;
112: for (j=0; j<nz; j++) {
113: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
114: }
115: }
117: PetscFree2(rtmp,mwork);
119: C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
120: C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
121: C->assembled = PETSC_TRUE;
123: PetscLogFlops(1.333333333333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
124: return(0);
125: }
129: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
130: {
131: Mat C =B;
132: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
133: IS isrow = b->row,isicol = b->icol;
135: const PetscInt *r,*ic;
136: PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
137: PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
138: MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
139: PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
140: MatScalar *v_work;
141: PetscBool col_identity,row_identity,both_identity;
144: ISGetIndices(isrow,&r);
145: ISGetIndices(isicol,&ic);
147: PetscMalloc1(bs2*n,&rtmp);
148: PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));
150: /* generate work space needed by dense LU factorization */
151: PetscMalloc3(bs,&v_work,bs2,&mwork,bs,&v_pivots);
153: for (i=0; i<n; i++) {
154: /* zero rtmp */
155: /* L part */
156: nz = bi[i+1] - bi[i];
157: bjtmp = bj + bi[i];
158: for (j=0; j<nz; j++) {
159: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
160: }
162: /* U part */
163: nz = bdiag[i] - bdiag[i+1];
164: bjtmp = bj + bdiag[i+1]+1;
165: for (j=0; j<nz; j++) {
166: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
167: }
169: /* load in initial (unfactored row) */
170: nz = ai[r[i]+1] - ai[r[i]];
171: ajtmp = aj + ai[r[i]];
172: v = aa + bs2*ai[r[i]];
173: for (j=0; j<nz; j++) {
174: PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));
175: }
177: /* elimination */
178: bjtmp = bj + bi[i];
179: nzL = bi[i+1] - bi[i];
180: for (k=0; k < nzL; k++) {
181: row = bjtmp[k];
182: pc = rtmp + bs2*row;
183: for (flg=0,j=0; j<bs2; j++) {
184: if (pc[j]!=0.0) {
185: flg = 1;
186: break;
187: }
188: }
189: if (flg) {
190: pv = b->a + bs2*bdiag[row];
191: PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
192: pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
193: pv = b->a + bs2*(bdiag[row+1]+1);
194: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
195: for (j=0; j<nz; j++) {
196: PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
197: }
198: PetscLogFlops(2*bs2*bs*(nz+1)-bs2); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
199: }
200: }
202: /* finished row so stick it into b->a */
203: /* L part */
204: pv = b->a + bs2*bi[i];
205: pj = b->j + bi[i];
206: nz = bi[i+1] - bi[i];
207: for (j=0; j<nz; j++) {
208: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
209: }
211: /* Mark diagonal and invert diagonal for simplier triangular solves */
212: pv = b->a + bs2*bdiag[i];
213: pj = b->j + bdiag[i];
214: /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
215: PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
216: PetscKernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);
218: /* U part */
219: pv = b->a + bs2*(bdiag[i+1]+1);
220: pj = b->j + bdiag[i+1]+1;
221: nz = bdiag[i] - bdiag[i+1] - 1;
222: for (j=0; j<nz; j++) {
223: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
224: }
225: }
227: PetscFree(rtmp);
228: PetscFree3(v_work,mwork,v_pivots);
229: ISRestoreIndices(isicol,&ic);
230: ISRestoreIndices(isrow,&r);
232: ISIdentity(isrow,&row_identity);
233: ISIdentity(isicol,&col_identity);
235: both_identity = (PetscBool) (row_identity && col_identity);
236: if (both_identity) {
237: C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
238: } else {
239: C->ops->solve = MatSolve_SeqBAIJ_N;
240: }
241: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
243: C->assembled = PETSC_TRUE;
245: PetscLogFlops(1.333333333333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
246: return(0);
247: }
249: /*
250: ilu(0) with natural ordering under new data structure.
251: See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
252: because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
253: */
257: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
258: {
260: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
262: PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
263: PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp;
266: MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);
267: b = (Mat_SeqBAIJ*)(fact)->data;
269: /* allocate matrix arrays for new data structure */
270: PetscMalloc3(bs2*ai[n]+1,&b->a,ai[n]+1,&b->j,n+1,&b->i);
271: PetscLogObjectMemory((PetscObject)fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));
273: b->singlemalloc = PETSC_TRUE;
274: b->free_a = PETSC_TRUE;
275: b->free_ij = PETSC_TRUE;
276: fact->preallocated = PETSC_TRUE;
277: fact->assembled = PETSC_TRUE;
278: if (!b->diag) {
279: PetscMalloc1(n+1,&b->diag);
280: PetscLogObjectMemory((PetscObject)fact,(n+1)*sizeof(PetscInt));
281: }
282: bdiag = b->diag;
284: if (n > 0) {
285: PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));
286: }
288: /* set bi and bj with new data structure */
289: bi = b->i;
290: bj = b->j;
292: /* L part */
293: bi[0] = 0;
294: for (i=0; i<n; i++) {
295: nz = adiag[i] - ai[i];
296: bi[i+1] = bi[i] + nz;
297: aj = a->j + ai[i];
298: for (j=0; j<nz; j++) {
299: *bj = aj[j]; bj++;
300: }
301: }
303: /* U part */
304: bi_temp = bi[n];
305: bdiag[n] = bi[n]-1;
306: for (i=n-1; i>=0; i--) {
307: nz = ai[i+1] - adiag[i] - 1;
308: bi_temp = bi_temp + nz + 1;
309: aj = a->j + adiag[i] + 1;
310: for (j=0; j<nz; j++) {
311: *bj = aj[j]; bj++;
312: }
313: /* diag[i] */
314: *bj = i; bj++;
315: bdiag[i] = bi_temp - 1;
316: }
317: return(0);
318: }
322: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
323: {
324: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
325: IS isicol;
326: PetscErrorCode ierr;
327: const PetscInt *r,*ic;
328: PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d;
329: PetscInt *bi,*cols,nnz,*cols_lvl;
330: PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
331: PetscInt i,levels,diagonal_fill;
332: PetscBool col_identity,row_identity,both_identity;
333: PetscReal f;
334: PetscInt nlnk,*lnk,*lnk_lvl=NULL;
335: PetscBT lnkbt;
336: PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr;
337: PetscFreeSpaceList free_space =NULL,current_space=NULL;
338: PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
339: PetscBool missing;
340: PetscInt bs=A->rmap->bs,bs2=a->bs2;
343: if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
344: if (bs>1) { /* check shifttype */
345: if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE)
346: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
347: }
349: MatMissingDiagonal(A,&missing,&d);
350: if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
352: f = info->fill;
353: levels = (PetscInt)info->levels;
354: diagonal_fill = (PetscInt)info->diagonal_fill;
356: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
358: ISIdentity(isrow,&row_identity);
359: ISIdentity(iscol,&col_identity);
361: both_identity = (PetscBool) (row_identity && col_identity);
363: if (!levels && both_identity) {
364: /* special case: ilu(0) with natural ordering */
365: MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);
366: MatSeqBAIJSetNumericFactorization(fact,both_identity);
368: fact->factortype = MAT_FACTOR_ILU;
369: (fact)->info.factor_mallocs = 0;
370: (fact)->info.fill_ratio_given = info->fill;
371: (fact)->info.fill_ratio_needed = 1.0;
373: b = (Mat_SeqBAIJ*)(fact)->data;
374: b->row = isrow;
375: b->col = iscol;
376: b->icol = isicol;
377: PetscObjectReference((PetscObject)isrow);
378: PetscObjectReference((PetscObject)iscol);
379: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
381: PetscMalloc1((n+1)*bs,&b->solve_work);
382: return(0);
383: }
385: ISGetIndices(isrow,&r);
386: ISGetIndices(isicol,&ic);
388: /* get new row pointers */
389: PetscMalloc1(n+1,&bi);
390: bi[0] = 0;
391: /* bdiag is location of diagonal in factor */
392: PetscMalloc1(n+1,&bdiag);
393: bdiag[0] = 0;
395: PetscMalloc2(n,&bj_ptr,n,&bjlvl_ptr);
397: /* create a linked list for storing column indices of the active row */
398: nlnk = n + 1;
399: PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);
401: /* initial FreeSpace size is f*(ai[n]+1) */
402: PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);
403: current_space = free_space;
404: PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);
405: current_space_lvl = free_space_lvl;
407: for (i=0; i<n; i++) {
408: nzi = 0;
409: /* copy current row into linked list */
410: nnz = ai[r[i]+1] - ai[r[i]];
411: if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
412: cols = aj + ai[r[i]];
413: lnk[i] = -1; /* marker to indicate if diagonal exists */
414: PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);
415: nzi += nlnk;
417: /* make sure diagonal entry is included */
418: if (diagonal_fill && lnk[i] == -1) {
419: fm = n;
420: while (lnk[fm] < i) fm = lnk[fm];
421: lnk[i] = lnk[fm]; /* insert diagonal into linked list */
422: lnk[fm] = i;
423: lnk_lvl[i] = 0;
424: nzi++; dcount++;
425: }
427: /* add pivot rows into the active row */
428: nzbd = 0;
429: prow = lnk[n];
430: while (prow < i) {
431: nnz = bdiag[prow];
432: cols = bj_ptr[prow] + nnz + 1;
433: cols_lvl = bjlvl_ptr[prow] + nnz + 1;
434: nnz = bi[prow+1] - bi[prow] - nnz - 1;
436: PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);
437: nzi += nlnk;
438: prow = lnk[prow];
439: nzbd++;
440: }
441: bdiag[i] = nzbd;
442: bi[i+1] = bi[i] + nzi;
444: /* if free space is not available, make more free space */
445: if (current_space->local_remaining<nzi) {
446: nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
447: PetscFreeSpaceGet(nnz,¤t_space);
448: PetscFreeSpaceGet(nnz,¤t_space_lvl);
449: reallocs++;
450: }
452: /* copy data into free_space and free_space_lvl, then initialize lnk */
453: PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
455: bj_ptr[i] = current_space->array;
456: bjlvl_ptr[i] = current_space_lvl->array;
458: /* make sure the active row i has diagonal entry */
459: if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
461: current_space->array += nzi;
462: current_space->local_used += nzi;
463: current_space->local_remaining -= nzi;
465: current_space_lvl->array += nzi;
466: current_space_lvl->local_used += nzi;
467: current_space_lvl->local_remaining -= nzi;
468: }
470: ISRestoreIndices(isrow,&r);
471: ISRestoreIndices(isicol,&ic);
473: /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
474: PetscMalloc1(bi[n]+1,&bj);
475: PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);
477: PetscIncompleteLLDestroy(lnk,lnkbt);
478: PetscFreeSpaceDestroy(free_space_lvl);
479: PetscFree2(bj_ptr,bjlvl_ptr);
481: #if defined(PETSC_USE_INFO)
482: {
483: PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
484: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);
485: PetscInfo1(A,"Run with -[sub_]pc_factor_fill %g or use \n",(double)af);
486: PetscInfo1(A,"PCFactorSetFill([sub]pc,%g);\n",(double)af);
487: PetscInfo(A,"for best performance.\n");
488: if (diagonal_fill) {
489: PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);
490: }
491: }
492: #endif
494: /* put together the new matrix */
495: MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);
496: PetscLogObjectParent((PetscObject)fact,(PetscObject)isicol);
498: b = (Mat_SeqBAIJ*)(fact)->data;
499: b->free_a = PETSC_TRUE;
500: b->free_ij = PETSC_TRUE;
501: b->singlemalloc = PETSC_FALSE;
503: PetscMalloc1(bs2*(bdiag[0]+1),&b->a);
505: b->j = bj;
506: b->i = bi;
507: b->diag = bdiag;
508: b->free_diag = PETSC_TRUE;
509: b->ilen = 0;
510: b->imax = 0;
511: b->row = isrow;
512: b->col = iscol;
513: PetscObjectReference((PetscObject)isrow);
514: PetscObjectReference((PetscObject)iscol);
515: b->icol = isicol;
517: PetscMalloc1(bs*n+bs,&b->solve_work);
518: /* In b structure: Free imax, ilen, old a, old j.
519: Allocate bdiag, solve_work, new a, new j */
520: PetscLogObjectMemory((PetscObject)fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));
521: b->maxnz = b->nz = bdiag[0]+1;
523: fact->info.factor_mallocs = reallocs;
524: fact->info.fill_ratio_given = f;
525: fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
527: MatSeqBAIJSetNumericFactorization(fact,both_identity);
528: return(0);
529: }
531: /*
532: This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
533: except that the data structure of Mat_SeqAIJ is slightly different.
534: Not a good example of code reuse.
535: */
538: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
539: {
540: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
541: IS isicol;
543: const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
544: PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
545: PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
546: PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
547: PetscBool col_identity,row_identity,both_identity,flg;
548: PetscReal f;
551: MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);
552: if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
554: f = info->fill;
555: levels = (PetscInt)info->levels;
556: diagonal_fill = (PetscInt)info->diagonal_fill;
558: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
560: ISIdentity(isrow,&row_identity);
561: ISIdentity(iscol,&col_identity);
562: both_identity = (PetscBool) (row_identity && col_identity);
564: if (!levels && both_identity) { /* special case copy the nonzero structure */
565: MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);
566: MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);
568: fact->factortype = MAT_FACTOR_ILU;
569: b = (Mat_SeqBAIJ*)fact->data;
570: b->row = isrow;
571: b->col = iscol;
572: PetscObjectReference((PetscObject)isrow);
573: PetscObjectReference((PetscObject)iscol);
574: b->icol = isicol;
575: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
577: PetscMalloc1((n+1)*bs,&b->solve_work);
578: return(0);
579: }
581: /* general case perform the symbolic factorization */
582: ISGetIndices(isrow,&r);
583: ISGetIndices(isicol,&ic);
585: /* get new row pointers */
586: PetscMalloc1(n+1,&ainew);
587: ainew[0] = 0;
588: /* don't know how many column pointers are needed so estimate */
589: jmax = (PetscInt)(f*ai[n] + 1);
590: PetscMalloc1(jmax,&ajnew);
591: /* ajfill is level of fill for each fill entry */
592: PetscMalloc1(jmax,&ajfill);
593: /* fill is a linked list of nonzeros in active row */
594: PetscMalloc1(n+1,&fill);
595: /* im is level for each filled value */
596: PetscMalloc1(n+1,&im);
597: /* dloc is location of diagonal in factor */
598: PetscMalloc1(n+1,&dloc);
599: dloc[0] = 0;
600: for (prow=0; prow<n; prow++) {
602: /* copy prow into linked list */
603: nzf = nz = ai[r[prow]+1] - ai[r[prow]];
604: if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
605: xi = aj + ai[r[prow]];
606: fill[n] = n;
607: fill[prow] = -1; /* marker for diagonal entry */
608: while (nz--) {
609: fm = n;
610: idx = ic[*xi++];
611: do {
612: m = fm;
613: fm = fill[m];
614: } while (fm < idx);
615: fill[m] = idx;
616: fill[idx] = fm;
617: im[idx] = 0;
618: }
620: /* make sure diagonal entry is included */
621: if (diagonal_fill && fill[prow] == -1) {
622: fm = n;
623: while (fill[fm] < prow) fm = fill[fm];
624: fill[prow] = fill[fm]; /* insert diagonal into linked list */
625: fill[fm] = prow;
626: im[prow] = 0;
627: nzf++;
628: dcount++;
629: }
631: nzi = 0;
632: row = fill[n];
633: while (row < prow) {
634: incrlev = im[row] + 1;
635: nz = dloc[row];
636: xi = ajnew + ainew[row] + nz + 1;
637: flev = ajfill + ainew[row] + nz + 1;
638: nnz = ainew[row+1] - ainew[row] - nz - 1;
639: fm = row;
640: while (nnz-- > 0) {
641: idx = *xi++;
642: if (*flev + incrlev > levels) {
643: flev++;
644: continue;
645: }
646: do {
647: m = fm;
648: fm = fill[m];
649: } while (fm < idx);
650: if (fm != idx) {
651: im[idx] = *flev + incrlev;
652: fill[m] = idx;
653: fill[idx] = fm;
654: fm = idx;
655: nzf++;
656: } else if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
657: flev++;
658: }
659: row = fill[row];
660: nzi++;
661: }
662: /* copy new filled row into permanent storage */
663: ainew[prow+1] = ainew[prow] + nzf;
664: if (ainew[prow+1] > jmax) {
666: /* estimate how much additional space we will need */
667: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
668: /* just double the memory each time */
669: PetscInt maxadd = jmax;
670: /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
671: if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
672: jmax += maxadd;
674: /* allocate a longer ajnew and ajfill */
675: PetscMalloc1(jmax,&xitmp);
676: PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));
677: PetscFree(ajnew);
678: ajnew = xitmp;
679: PetscMalloc1(jmax,&xitmp);
680: PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));
681: PetscFree(ajfill);
682: ajfill = xitmp;
683: reallocate++; /* count how many reallocations are needed */
684: }
685: xitmp = ajnew + ainew[prow];
686: flev = ajfill + ainew[prow];
687: dloc[prow] = nzi;
688: fm = fill[n];
689: while (nzf--) {
690: *xitmp++ = fm;
691: *flev++ = im[fm];
692: fm = fill[fm];
693: }
694: /* make sure row has diagonal entry */
695: if (ajnew[ainew[prow]+dloc[prow]] != prow) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
696: try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
697: }
698: PetscFree(ajfill);
699: ISRestoreIndices(isrow,&r);
700: ISRestoreIndices(isicol,&ic);
701: PetscFree(fill);
702: PetscFree(im);
704: #if defined(PETSC_USE_INFO)
705: {
706: PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
707: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocate,(double)f,(double)af);
708: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
709: PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);
710: PetscInfo(A,"for best performance.\n");
711: if (diagonal_fill) {
712: PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);
713: }
714: }
715: #endif
717: /* put together the new matrix */
718: MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);
719: PetscLogObjectParent((PetscObject)fact,(PetscObject)isicol);
720: b = (Mat_SeqBAIJ*)fact->data;
722: b->free_a = PETSC_TRUE;
723: b->free_ij = PETSC_TRUE;
724: b->singlemalloc = PETSC_FALSE;
726: PetscMalloc1(bs2*ainew[n],&b->a);
728: b->j = ajnew;
729: b->i = ainew;
730: for (i=0; i<n; i++) dloc[i] += ainew[i];
731: b->diag = dloc;
732: b->free_diag = PETSC_TRUE;
733: b->ilen = 0;
734: b->imax = 0;
735: b->row = isrow;
736: b->col = iscol;
737: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
739: PetscObjectReference((PetscObject)isrow);
740: PetscObjectReference((PetscObject)iscol);
741: b->icol = isicol;
742: PetscMalloc1(bs*n+bs,&b->solve_work);
743: /* In b structure: Free imax, ilen, old a, old j.
744: Allocate dloc, solve_work, new a, new j */
745: PetscLogObjectMemory((PetscObject)fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));
746: b->maxnz = b->nz = ainew[n];
748: fact->info.factor_mallocs = reallocate;
749: fact->info.fill_ratio_given = f;
750: fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
752: MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);
753: return(0);
754: }
758: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
759: {
760: /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; */
761: /* int i,*AJ=a->j,nz=a->nz; */
764: /* Undo Column scaling */
765: /* while (nz--) { */
766: /* AJ[i] = AJ[i]/4; */
767: /* } */
768: /* This should really invoke a push/pop logic, but we don't have that yet. */
769: A->ops->setunfactored = NULL;
770: return(0);
771: }
775: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
776: {
777: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
778: PetscInt *AJ=a->j,nz=a->nz;
779: unsigned short *aj=(unsigned short*)AJ;
782: /* Is this really necessary? */
783: while (nz--) {
784: AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
785: }
786: A->ops->setunfactored = NULL;
787: return(0);
788: }