Actual source code: matptap.c
petsc-3.3-p7 2013-05-11
/*
  Defines projective product routines where A is a SeqAIJ matrix
          C = P^T * A * P
*/

#include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <petscbt.h>
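
/*
   Usage sketch (illustrative, not part of this source): these kernels are
   normally reached through the public MatPtAP() interface rather than called
   directly. Assuming A and P are assembled MATSEQAIJ matrices:

       Mat C;
       ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);   symbolic + numeric product
       ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);CHKERRQ(ierr);     reuse the structure of C
       ierr = MatDestroy(&C);CHKERRQ(ierr);

   MatPtAP() selects the kernels below through the operations tables of A and P.
*/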

PetscErrorCode MatPtAPSymbolic_SeqAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  if (!P->ops->ptapsymbolic_seqaij) SETERRQ2(((PetscObject)A)->comm,PETSC_ERR_SUP,"Not implemented for A=%s and P=%s",((PetscObject)A)->type_name,((PetscObject)P)->type_name);
  (*P->ops->ptapsymbolic_seqaij)(A,P,fill,C);
  return(0);
}

PetscErrorCode MatPtAPNumeric_SeqAIJ(Mat A,Mat P,Mat C)
{
  if (!P->ops->ptapnumeric_seqaij) SETERRQ2(((PetscObject)A)->comm,PETSC_ERR_SUP,"Not implemented for A=%s and P=%s",((PetscObject)A)->type_name,((PetscObject)P)->type_name);
  (*P->ops->ptapnumeric_seqaij)(A,P,C);
  return(0);
}

PetscErrorCode MatDestroy_SeqAIJ_PtAP(Mat A)
{
  Mat_SeqAIJ *a    = (Mat_SeqAIJ*)A->data;
  Mat_PtAP   *ptap = a->ptap;

  /* free ptap, then A */
  PetscFree(ptap->apa);
  PetscFree(ptap->api);
  PetscFree(ptap->apj);
  (ptap->destroy)(A);
  PetscFree(ptap);
  return(0);
}
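
/*
   Note: Mat_PtAP is the support struct attached to the product matrix by
   MatPtAPSymbolic_SeqAIJ_SeqAIJ() below. As used in this file it carries
   apa (a dense work row of A*P for the numeric phase), api/apj (the
   row-pointer/column-index structure of A*P), and destroy (the product's
   original destroy routine, invoked above after the cached data is freed).
*/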

PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqAIJ_SparseAxpy2(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*p = (Mat_SeqAIJ*)P->data,*c;
  PetscInt           *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
  PetscInt           *ci,*cj,*ptadenserow,*ptasparserow,*ptaj,nspacedouble=0;
  PetscInt           an=A->cmap->N,am=A->rmap->N,pn=P->cmap->N;
  PetscInt           i,j,k,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi,nlnk,*lnk;
  MatScalar          *ca;
  PetscBT            lnkbt;
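
  /*
     Work arrays: ptadenserow is a dense flag array over the an columns of A
     (a nonzero entry marks a column already seen in the current row of Pt*A),
     while ptasparserow records those same columns in compressed form so the
     flags can be reset cheaply after each row; both live in one allocation below.
  */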
  /* Get ij structure of P^T */
  MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);
  ptJ = ptj;

  /* Allocate ci array, arrays for fill computation and */
  /* free space for accumulating nonzero column info */
  PetscMalloc((pn+1)*sizeof(PetscInt),&ci);
  ci[0] = 0;

  PetscMalloc((2*an+1)*sizeof(PetscInt),&ptadenserow);
  PetscMemzero(ptadenserow,(2*an+1)*sizeof(PetscInt));
  ptasparserow = ptadenserow + an;

  /* create and initialize a linked list */
  nlnk = pn+1;
  PetscLLCreate(pn,pn,nlnk,lnk,lnkbt);
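
  /*
     lnk is a PETSc linked list over the pn possible columns of C, backed by
     the bit table lnkbt for constant-time membership tests; PetscLLAddSorted()
     below merges each sorted row of P into it so that duplicate column
     indices are counted only once per row of C.
  */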
  /* Set initial free space to be fill*nnz(A). */
  /* This should be reasonable if sparsity of PtAP is similar to that of A. */
  PetscFreeSpaceGet((PetscInt)(fill*ai[am]),&free_space);
  current_space = free_space;

  /* Determine symbolic info for each row of C: */
  for (i=0; i<pn; i++) {
    ptnzi  = pti[i+1] - pti[i];
    ptanzi = 0;
    /* Determine symbolic row of PtA: */
    for (j=0; j<ptnzi; j++) {
      arow = *ptJ++;
      anzj = ai[arow+1] - ai[arow];
      ajj  = aj + ai[arow];
      for (k=0; k<anzj; k++) {
        if (!ptadenserow[ajj[k]]) {
          ptadenserow[ajj[k]]    = -1;
          ptasparserow[ptanzi++] = ajj[k];
        }
      }
    }
    /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
    ptaj = ptasparserow;
    cnzi = 0;
    for (j=0; j<ptanzi; j++) {
      prow = *ptaj++;
      pnzj = pi[prow+1] - pi[prow];
      pjj  = pj + pi[prow];
      /* add non-zero cols of P into the sorted linked list lnk */
      PetscLLAddSorted(pnzj,pjj,pn,nlnk,lnk,lnkbt);
      cnzi += nlnk;
    }

    /* If free space is not available, make more free space */
    /* Double the amount of total space in the list */
    if (current_space->local_remaining < cnzi) {
      PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);
      nspacedouble++;
    }

    /* Copy data into free space, and zero out denserows */
    PetscLLClean(pn,pn,cnzi,lnk,current_space->array,lnkbt);
    current_space->array           += cnzi;
    current_space->local_used      += cnzi;
    current_space->local_remaining -= cnzi;

    for (j=0; j<ptanzi; j++) {
      ptadenserow[ptasparserow[j]] = 0;
    }
    /* Aside: Perhaps we should save the pta info for the numerical factorization. */
    /* For now, we will recompute what is needed. */
    ci[i+1] = ci[i] + cnzi;
  }
  /* nnz is now stored in ci[pn], column indices are in the list of free space */
  /* Allocate space for cj, initialize cj, and */
  /* destroy list of free space and other temporary array(s) */
  PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);
  PetscFreeSpaceContiguous(&free_space,cj);
  PetscFree(ptadenserow);
  PetscLLDestroy(lnk,lnkbt);

  /* Allocate space for ca */
  PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);
  PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));

  /* put together the new matrix */
  MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,pn,pn,ci,cj,ca,C);
  (*C)->rmap->bs = P->cmap->bs;
  (*C)->cmap->bs = P->cmap->bs;
  /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
  /* Since these are PETSc arrays, change flags to free them as necessary. */
  c = (Mat_SeqAIJ*)((*C)->data);
  c->free_a  = PETSC_TRUE;
  c->free_ij = PETSC_TRUE;
  c->nonew   = 0;
  A->ops->ptapnumeric = MatPtAPNumeric_SeqAIJ_SeqAIJ_SparseAxpy2; /* should use (*C)->ops until the PtAP interface is updated to use double dispatch as MatMatMult() does */

  /* Clean up. */
  MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);
#if defined(PETSC_USE_INFO)
  if (ci[pn] != 0) {
    PetscReal afill = ((PetscReal)ci[pn])/ai[am];
    if (afill < 1.0) afill = 1.0;
    PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);
    PetscInfo1((*C),"Use MatPtAP(A,P,MatReuse,%G,&C) for best performance.\n",afill);
  } else {
    PetscInfo((*C),"Empty matrix product\n");
  }
#endif
  return(0);
}
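
/*
   Numeric phase for sparse_axpy==2: each row AP[i,:] = A[i,:]*P is rebuilt on
   the fly in a dense work row (the structure of A*P is not stored between
   calls), then scattered into C with one sparse axpy per nonzero of P[i,:].
*/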
PetscErrorCode MatPtAPNumeric_SeqAIJ_SeqAIJ_SparseAxpy2(Mat A,Mat P,Mat C)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJ *p = (Mat_SeqAIJ*)P->data;
  Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
  PetscInt   *ai=a->i,*aj=a->j,*apj,*apjdense,*pi=p->i,*pj=p->j,*pJ=p->j,*pjj;
  PetscInt   *ci=c->i,*cj=c->j,*cjj;
  PetscInt   am=A->rmap->N,cn=C->cmap->N,cm=C->rmap->N;
  PetscInt   i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
  MatScalar  *aa=a->a,*apa,*pa=p->a,*pA=p->a,*paj,*ca=c->a,*caj;

  /* Allocate temporary array for storage of one row of A*P */
  PetscMalloc(cn*(sizeof(MatScalar)+sizeof(PetscInt))+c->rmax*sizeof(PetscInt),&apa);
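  /* A single allocation packs three work arrays: apa (cn scalars, dense values
     of the current row of A*P), apjdense (cn flags marking occupied columns),
     and apj (up to c->rmax compressed column indices for that row). */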
  apjdense = (PetscInt*)(apa + cn);
  apj      = apjdense + cn;
  PetscMemzero(apa,cn*(sizeof(MatScalar)+sizeof(PetscInt)));

  /* Clear old values in C */
  PetscMemzero(ca,ci[cm]*sizeof(MatScalar));

  for (i=0; i<am; i++) {
    /* Form sparse row of A*P */
    anzi  = ai[i+1] - ai[i];
    apnzj = 0;
    for (j=0; j<anzi; j++) {
      prow = *aj++;
      pnzj = pi[prow+1] - pi[prow];
      pjj  = pj + pi[prow];
      paj  = pa + pi[prow];
      for (k=0; k<pnzj; k++) {
        if (!apjdense[pjj[k]]) {
          apjdense[pjj[k]] = -1;
          apj[apnzj++]     = pjj[k];
        }
        apa[pjj[k]] += (*aa)*paj[k];
      }
      PetscLogFlops(2.0*pnzj);
      aa++;
    }

    /* Sort the j index array for quick sparse axpy. */
    /* Note: a array does not need sorting as it is in dense storage locations. */
    PetscSortInt(apnzj,apj);

    /* Compute P^T*A*P using outer product (P^T)[:,j]*(A*P)[j,:]. */
    pnzi = pi[i+1] - pi[i];
    for (j=0; j<pnzi; j++) {
      nextap = 0;
      crow   = *pJ++;
      cjj    = cj + ci[crow];
      caj    = ca + ci[crow];
      /* Perform sparse axpy operation. Note cjj includes apj. */
      for (k=0; nextap<apnzj; k++) {
#if defined(PETSC_USE_DEBUG)
        if (k >= ci[crow+1] - ci[crow]) {
          SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"k too large k %d, crow %d",k,crow);
        }
#endif
        if (cjj[k] == apj[nextap]) {
          caj[k] += (*pA)*apa[apj[nextap++]];
        }
      }
      PetscLogFlops(2.0*apnzj);
      pA++;
    }

    /* Zero the current row info for A*P */
    for (j=0; j<apnzj; j++) {
      apa[apj[j]]      = 0.;
      apjdense[apj[j]] = 0;
    }
  }

  /* Assemble the final matrix and clean up */
  MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
  MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
  PetscFree(apa);
  return(0);
}

PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *ap,*c;
  PetscInt       *api,*apj,*ci,pn=P->cmap->N,sparse_axpy=0;
  MatScalar      *ca;
  Mat_PtAP       *ptap;
  Mat            Pt,AP;
  /* The flag 'sparse_axpy' determines which implementation is used:
     0: do dense axpy in MatPtAPNumeric() - fastest, but requires storing the structure of A*P (default);
     1: do one sparse axpy - uses the same memory as sparse_axpy=0 and may execute fewer flops
        (apnz vs. cnz in the outer product), but is slower than case '0' unless cnz is much larger than apnz;
     2: do two sparse axpys in MatPtAPNumeric() - slowest, but does not store the structure of A*P. */
  PetscOptionsGetInt(PETSC_NULL,"-matptap_sparseaxpy",&sparse_axpy,PETSC_NULL);
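  /* e.g., run with '-matptap_sparseaxpy 2' to select the variant that does not store the structure of A*P */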
  if (sparse_axpy == 2) {
    MatPtAPSymbolic_SeqAIJ_SeqAIJ_SparseAxpy2(A,P,fill,C);
    return(0);
  }

  /* Get symbolic Pt = P^T */
  MatTransposeSymbolic_SeqAIJ(P,&Pt);

  /* Get symbolic AP = A*P */
  MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,P,fill,&AP);
  ap  = (Mat_SeqAIJ*)AP->data;
  api = ap->i;
  apj = ap->j;
  ap->free_ij = PETSC_FALSE; /* api and apj are kept in struct ptap, so they must not be freed when AP is destroyed */

  /* Get C = Pt*AP */
  MatMatMultSymbolic_SeqAIJ_SeqAIJ(Pt,AP,fill,C);
  c  = (Mat_SeqAIJ*)(*C)->data;
  ci = c->i;
  PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);
  PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));
  c->a      = ca;
  c->free_a = PETSC_TRUE;

  /* Create a supporting struct for reuse by MatPtAPNumeric() */
  PetscNew(Mat_PtAP,&ptap);
  c->ptap       = ptap;
  ptap->destroy = (*C)->ops->destroy;
  (*C)->ops->destroy = MatDestroy_SeqAIJ_PtAP;

  /* Allocate temporary array for storage of one row of A*P */
  PetscMalloc((pn+1)*sizeof(PetscScalar),&ptap->apa);
  PetscMemzero(ptap->apa,(pn+1)*sizeof(PetscScalar));
  if (sparse_axpy == 1) {
    A->ops->ptapnumeric = MatPtAPNumeric_SeqAIJ_SeqAIJ_SparseAxpy;
  } else {
    A->ops->ptapnumeric = MatPtAPNumeric_SeqAIJ_SeqAIJ;
  }
  ptap->api = api;
  ptap->apj = apj;

  /* Clean up. */
  MatDestroy(&Pt);
  MatDestroy(&AP);
  return(0);
}

/* #define PROFILE_MatPtAPNumeric */
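/* Enabling the define above makes the numeric routines below time and count
   flops separately for the A*P formation and the Pt*(A*P) accumulation. */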

PetscErrorCode MatPtAPNumeric_SeqAIJ_SeqAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJ        *p = (Mat_SeqAIJ*)P->data;
  Mat_SeqAIJ        *c = (Mat_SeqAIJ*)C->data;
  const PetscInt    *ai=a->i,*aj=a->j,*pi=p->i,*pj=p->j,*ci=c->i,*cj=c->j;
  const PetscScalar *aa=a->a,*pa=p->a,*pval;
  const PetscInt    *apj,*pcol,*cjj;
  const PetscInt    am=A->rmap->N,cm=C->rmap->N;
  PetscInt          i,j,k,anz,apnz,pnz,prow,crow,cnz;
  PetscScalar       *apa,*ca=c->a,*caj,pvalj;
  Mat_PtAP          *ptap = c->ptap;
#if defined(PROFILE_MatPtAPNumeric)
  PetscLogDouble t0,tf,time_Cseq0=0.0,time_Cseq1=0.0;
  PetscInt       flops0=0,flops1=0;
#endif

  /* Get temporary array for storage of one row of A*P */
  apa = ptap->apa;

  /* Clear old values in C */
  PetscMemzero(ca,ci[cm]*sizeof(MatScalar));
  for (i=0; i<am; i++) {
    /* Form sparse row of AP[i,:] = A[i,:]*P */
#if defined(PROFILE_MatPtAPNumeric)
    PetscGetTime(&t0);
#endif
    anz  = ai[i+1] - ai[i];
    apnz = 0;
    for (j=0; j<anz; j++) {
      prow = aj[j];
      pnz  = pi[prow+1] - pi[prow];
      pcol = pj + pi[prow];
      pval = pa + pi[prow];
      for (k=0; k<pnz; k++) {
        apa[pcol[k]] += aa[j]*pval[k];
      }
      PetscLogFlops(2.0*pnz);
#if defined(PROFILE_MatPtAPNumeric)
      flops0 += 2.0*pnz;
#endif
    }
    aj += anz; aa += anz;
#if defined(PROFILE_MatPtAPNumeric)
    PetscGetTime(&tf);
    time_Cseq0 += tf - t0;
#endif

    /* Compute P^T*A*P using outer product P[i,:]^T*AP[i,:]. */
#if defined(PROFILE_MatPtAPNumeric)
    PetscGetTime(&t0);
#endif
    apj  = ptap->apj + ptap->api[i];
    apnz = ptap->api[i+1] - ptap->api[i];
    pnz  = pi[i+1] - pi[i];
    pcol = pj + pi[i];
    pval = pa + pi[i];

    /* Perform dense axpy */
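    /* The whole row C[crow,:] is updated by indexing the dense buffer apa with
       that row's column indices cjj; columns absent from AP[i,:] hold zeros in
       apa and contribute nothing. */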
    for (j=0; j<pnz; j++) {
      crow  = pcol[j];
      cjj   = cj + ci[crow];
      caj   = ca + ci[crow];
      pvalj = pval[j];
      cnz   = ci[crow+1] - ci[crow];
      for (k=0; k<cnz; k++) {
        caj[k] += pvalj*apa[cjj[k]];
      }
      PetscLogFlops(2.0*cnz);
#if defined(PROFILE_MatPtAPNumeric)
      flops1 += 2.0*cnz;
#endif
    }
#if defined(PROFILE_MatPtAPNumeric)
    PetscGetTime(&tf);
    time_Cseq1 += tf - t0;
#endif

    /* Zero the current row info for A*P */
    for (j=0; j<apnz; j++) apa[apj[j]] = 0.0;
  }

  /* Assemble the final matrix and clean up */
  MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
  MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
#if defined(PROFILE_MatPtAPNumeric)
  printf("PtAPNumeric_SeqAIJ time %g + %g, flops %d %d\n",time_Cseq0,time_Cseq1,flops0,flops1);
#endif
  return(0);
}

PetscErrorCode MatPtAPNumeric_SeqAIJ_SeqAIJ_SparseAxpy(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJ        *p = (Mat_SeqAIJ*)P->data;
  Mat_SeqAIJ        *c = (Mat_SeqAIJ*)C->data;
  const PetscInt    *ai=a->i,*aj=a->j,*pi=p->i,*pj=p->j,*ci=c->i,*cj=c->j;
  const PetscScalar *aa=a->a,*pa=p->a,*pval;
  const PetscInt    *apj,*pcol,*cjj;
  const PetscInt    am=A->rmap->N,cm=C->rmap->N;
  PetscInt          i,j,k,anz,apnz,pnz,prow,crow,apcol,nextap;
  PetscScalar       *apa,*ca=c->a,*caj,pvalj;
  Mat_PtAP          *ptap = c->ptap;
#if defined(PROFILE_MatPtAPNumeric)
  PetscLogDouble t0,tf,time_Cseq0=0.0,time_Cseq1=0.0;
  PetscInt       flops0=0,flops1=0;
#endif

  /* Get temporary array for storage of one row of A*P */
  apa = ptap->apa;

  /* Clear old values in C */
  PetscMemzero(ca,ci[cm]*sizeof(MatScalar));
  for (i=0; i<am; i++) {
    /* Form sparse row of AP[i,:] = A[i,:]*P */
#if defined(PROFILE_MatPtAPNumeric)
    PetscGetTime(&t0);
#endif
    anz  = ai[i+1] - ai[i];
    apnz = 0;
    for (j=0; j<anz; j++) {
      prow = aj[j];
      pnz  = pi[prow+1] - pi[prow];
      pcol = pj + pi[prow];
      pval = pa + pi[prow];
      for (k=0; k<pnz; k++) {
        apa[pcol[k]] += aa[j]*pval[k];
      }
      PetscLogFlops(2.0*pnz);
#if defined(PROFILE_MatPtAPNumeric)
      flops0 += 2.0*pnz;
#endif
    }
    aj += anz; aa += anz;
#if defined(PROFILE_MatPtAPNumeric)
    PetscGetTime(&tf);
    time_Cseq0 += tf - t0;
#endif

    /* Compute P^T*A*P using outer product P[i,:]^T*AP[i,:]. */
#if defined(PROFILE_MatPtAPNumeric)
    PetscGetTime(&t0);
#endif
    apj  = ptap->apj + ptap->api[i];
    apnz = ptap->api[i+1] - ptap->api[i];
    pnz  = pi[i+1] - pi[i];
    pcol = pj + pi[i];
    pval = pa + pi[i];

    /* Perform sparse axpy */
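    /* Walk the row C[crow,:] and the column list apj of AP[i,:] together; both
       are sorted and C[crow,:] contains the structure of AP[i,:], so each of
       the apnz values is added exactly once. */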
    for (j=0; j<pnz; j++) {
      crow   = pcol[j];
      cjj    = cj + ci[crow];
      caj    = ca + ci[crow];
      pvalj  = pval[j];
      nextap = 0;                 /* start at the first column of AP[i,:]; starting at 1 would skip apj[0] */
      apcol  = apj[nextap];
      for (k=0; nextap<apnz; k++) {
        if (cjj[k] == apcol) {
          caj[k] += pvalj*apa[apcol];
          nextap++;
          if (nextap < apnz) apcol = apj[nextap]; /* guard against reading past the end of apj on the last match */
        }
      }
      PetscLogFlops(2.0*apnz);
#if defined(PROFILE_MatPtAPNumeric)
      flops1 += 2.0*apnz;
#endif
    }
#if defined(PROFILE_MatPtAPNumeric)
    PetscGetTime(&tf);
    time_Cseq1 += tf - t0;
#endif

    /* Zero the current row info for A*P */
    for (j=0; j<apnz; j++) {
      apcol      = apj[j];
      apa[apcol] = 0.;
    }
  }

  /* Assemble the final matrix and clean up */
  MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
  MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
#if defined(PROFILE_MatPtAPNumeric)
  printf("MatPtAPNumeric_SeqAIJ_SeqAIJ_SparseAxpy time %g + %g, flops %d %d\n",time_Cseq0,time_Cseq1,flops0,flops1);
#endif
  return(0);
}