Actual source code: mpimatmatmult.c
petsc-3.6.1 2015-08-06
2: /*
3: Defines matrix-matrix product routines for pairs of MPIAIJ matrices
4: C = A * B
5: */
6: #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7: #include <../src/mat/utils/freespace.h>
8: #include <../src/mat/impls/aij/mpi/mpiaij.h>
9: #include <petscbt.h>
10: #include <../src/mat/impls/dense/mpi/mpidense.h>
11: #include <petsc/private/vecimpl.h>
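/*
   Editor's note (illustrative usage sketch, not part of the PETSc source): the routines in
   this file are normally reached through the user-level MatMatMult() interface when both
   operands have type MATMPIAIJ.  The matrix handles A, B, C below are hypothetical.

     Mat C;
     MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);  // symbolic + numeric product
     // ... change numerical values of A and/or B, keeping their nonzero patterns ...
     MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);    // reuses the symbolic data stored in C
     MatDestroy(&C);
*/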
15: PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
16: {
18: const char *algTypes[2] = {"scalable","nonscalable"};
19: PetscInt alg=0; /* set default algorithm */
22: if (scall == MAT_INITIAL_MATRIX) {
23: PetscObjectOptionsBegin((PetscObject)A);
24: PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,2,algTypes[0],&alg,NULL);
25: PetscOptionsEnd();
27: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
28: switch (alg) {
29: case 1:
30: MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);
31: break;
32: default:
33: MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);
34: break;
35: }
36: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
37: }
38: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
39: (*(*C)->ops->matmultnumeric)(A,B,*C);
40: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
41: return(0);
42: }
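/*
   Editor's note (hedged sketch): the switch above is driven by the runtime option
   -matmatmult_via, so the two symbolic algorithms can be compared without recompiling.
   The executable name "app" is a placeholder.

     ./app -matmatmult_via scalable      // default: condensed linked-list variant
     ./app -matmatmult_via nonscalable   // dense work row of global length B->cmap->N
*/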
46: PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
47: {
49: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
50: Mat_PtAPMPI *ptap = a->ptap;
53: PetscFree2(ptap->startsj_s,ptap->startsj_r);
54: PetscFree(ptap->bufa);
55: MatDestroy(&ptap->P_loc);
56: MatDestroy(&ptap->P_oth);
57: MatDestroy(&ptap->Pt);
58: PetscFree(ptap->api);
59: PetscFree(ptap->apj);
60: PetscFree(ptap->apa);
61: ptap->destroy(A);
62: PetscFree(ptap);
63: return(0);
64: }
68: PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
69: {
71: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
72: Mat_PtAPMPI *ptap = a->ptap;
75: (*ptap->duplicate)(A,op,M);
77: (*M)->ops->destroy = ptap->destroy; /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
78: (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
79: return(0);
80: }
84: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,Mat C)
85: {
87: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
88: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
89: Mat_SeqAIJ *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
90: PetscInt *adi=ad->i,*adj,*aoi=ao->i,*aoj;
91: PetscScalar *ada,*aoa,*cda=cd->a,*coa=co->a;
92: Mat_SeqAIJ *p_loc,*p_oth;
93: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
94: PetscScalar *pa_loc,*pa_oth,*pa,*apa,valtmp,*ca;
95: PetscInt cm =C->rmap->n,anz,pnz;
96: Mat_PtAPMPI *ptap=c->ptap;
97: PetscInt *api,*apj,*apJ,i,j,k,row;
98: PetscInt cstart=C->cmap->rstart;
99: PetscInt cdnz,conz,k0,k1;
100: MPI_Comm comm;
101: PetscMPIInt size;
104: PetscObjectGetComm((PetscObject)A,&comm);
105: MPI_Comm_size(comm,&size);
107: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
108: /*-----------------------------------------------------*/
109: /* update numerical values of P_oth and P_loc */
110: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
111: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
113: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
114: /*----------------------------------------------------------*/
115: /* get data from symbolic products */
116: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
117: pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
118: if (size >1) {
119: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
120: pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
121: } else {
122: pi_oth=NULL; pj_oth=NULL; pa_oth=NULL;
123: }
125: /* get apa for storing dense row A[i,:]*P */
126: apa = ptap->apa;
128: api = ptap->api;
129: apj = ptap->apj;
130: for (i=0; i<cm; i++) {
131: /* diagonal portion of A */
132: anz = adi[i+1] - adi[i];
133: adj = ad->j + adi[i];
134: ada = ad->a + adi[i];
135: for (j=0; j<anz; j++) {
136: row = adj[j];
137: pnz = pi_loc[row+1] - pi_loc[row];
138: pj = pj_loc + pi_loc[row];
139: pa = pa_loc + pi_loc[row];
141: /* perform dense axpy */
142: valtmp = ada[j];
143: for (k=0; k<pnz; k++) {
144: apa[pj[k]] += valtmp*pa[k];
145: }
146: PetscLogFlops(2.0*pnz);
147: }
149: /* off-diagonal portion of A */
150: anz = aoi[i+1] - aoi[i];
151: aoj = ao->j + aoi[i];
152: aoa = ao->a + aoi[i];
153: for (j=0; j<anz; j++) {
154: row = aoj[j];
155: pnz = pi_oth[row+1] - pi_oth[row];
156: pj = pj_oth + pi_oth[row];
157: pa = pa_oth + pi_oth[row];
159: /* perform dense axpy */
160: valtmp = aoa[j];
161: for (k=0; k<pnz; k++) {
162: apa[pj[k]] += valtmp*pa[k];
163: }
164: PetscLogFlops(2.0*pnz);
165: }
167: /* set values in C */
168: apJ = apj + api[i];
169: cdnz = cd->i[i+1] - cd->i[i];
170: conz = co->i[i+1] - co->i[i];
172: /* 1st off-diagonal part of C */
173: ca = coa + co->i[i];
174: k = 0;
175: for (k0=0; k0<conz; k0++) {
176: if (apJ[k] >= cstart) break;
177: ca[k0] = apa[apJ[k]];
178: apa[apJ[k]] = 0.0;
179: k++;
180: }
182: /* diagonal part of C */
183: ca = cda + cd->i[i];
184: for (k1=0; k1<cdnz; k1++) {
185: ca[k1] = apa[apJ[k]];
186: apa[apJ[k]] = 0.0;
187: k++;
188: }
190: /* 2nd off-diagonal part of C */
191: ca = coa + co->i[i];
192: for (; k0<conz; k0++) {
193: ca[k0] = apa[apJ[k]];
194: apa[apJ[k]] = 0.0;
195: k++;
196: }
197: }
198: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
199: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
200: return(0);
201: }
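/*
   Editor's note (illustrative sketch, not PETSc code): stripped of the MPI bookkeeping,
   the numeric loop above is the classical "sparse row times CSR, accumulate into a dense
   buffer, then gather into the row of C" kernel.  All names below are hypothetical, and
   dense[] is assumed to be zero on entry (it is re-zeroed while gathering).

     static void RowKernel(PetscInt anz,const PetscInt *aj,const PetscScalar *aa,
                           const PetscInt *pi,const PetscInt *pj,const PetscScalar *pa,
                           PetscInt cnz,const PetscInt *cj,PetscScalar *cvals,
                           PetscScalar *dense)
     {
       PetscInt j,k;
       for (j=0; j<anz; j++) {                          // A[i,aj[j]] scales row aj[j] of P
         PetscScalar v = aa[j];
         PetscInt    r = aj[j];
         for (k=pi[r]; k<pi[r+1]; k++) dense[pj[k]] += v*pa[k];
       }
       for (k=0; k<cnz; k++) {                          // gather C[i,:] and reset the buffer
         cvals[k]     = dense[cj[k]];
         dense[cj[k]] = 0.0;
       }
     }
*/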
205: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
206: {
207: PetscErrorCode ierr;
208: MPI_Comm comm;
209: PetscMPIInt size;
210: Mat Cmpi;
211: Mat_PtAPMPI *ptap;
212: PetscFreeSpaceList free_space=NULL,current_space=NULL;
213: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*c;
214: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
215: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
216: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
217: PetscInt *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
218: PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
219: PetscBT lnkbt;
220: PetscScalar *apa;
221: PetscReal afill;
222: PetscInt nlnk_max,armax,prmax;
225: PetscObjectGetComm((PetscObject)A,&comm);
226: MPI_Comm_size(comm,&size);
228: if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
229: SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
230: }
231:
232: /* create struct Mat_PtAPMPI and attach it to C later */
233: PetscNew(&ptap);
235: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
236: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
238: /* get P_loc by taking all local rows of P */
239: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
241: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
242: pi_loc = p_loc->i; pj_loc = p_loc->j;
243: if (size > 1) {
244: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
245: pi_oth = p_oth->i; pj_oth = p_oth->j;
246: } else {
247: p_oth = NULL;
248: pi_oth = NULL; pj_oth = NULL;
249: }
251: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
252: /*-------------------------------------------------------------------*/
253: PetscMalloc1(am+2,&api);
254: ptap->api = api;
255: api[0] = 0;
257: /* create and initialize a linked list */
258: armax = ad->rmax+ao->rmax;
259: if (size >1) {
260: prmax = PetscMax(p_loc->rmax,p_oth->rmax);
261: } else {
262: prmax = p_loc->rmax;
263: }
264: nlnk_max = armax*prmax;
265: if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
266: PetscLLCondensedCreate(nlnk_max,pN,&lnk,&lnkbt);
268: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
269: PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);
271: current_space = free_space;
273: MatPreallocateInitialize(comm,am,pn,dnz,onz);
274: for (i=0; i<am; i++) {
275: /* diagonal portion of A */
276: nzi = adi[i+1] - adi[i];
277: for (j=0; j<nzi; j++) {
278: row = *adj++;
279: pnz = pi_loc[row+1] - pi_loc[row];
280: Jptr = pj_loc + pi_loc[row];
281: /* add non-zero cols of P into the sorted linked list lnk */
282: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
283: }
284: /* off-diagonal portion of A */
285: nzi = aoi[i+1] - aoi[i];
286: for (j=0; j<nzi; j++) {
287: row = *aoj++;
288: pnz = pi_oth[row+1] - pi_oth[row];
289: Jptr = pj_oth + pi_oth[row];
290: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
291: }
293: apnz = lnk[0];
294: api[i+1] = api[i] + apnz;
296: /* if free space is not available, double the total space in the list */
297: if (current_space->local_remaining<apnz) {
298: PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);
299: nspacedouble++;
300: }
302: /* Copy data into free space, then initialize lnk */
303: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
304: MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);
306: current_space->array += apnz;
307: current_space->local_used += apnz;
308: current_space->local_remaining -= apnz;
309: }
311: /* Allocate space for apj, initialize apj, and */
312: /* destroy list of free space and other temporary array(s) */
313: PetscMalloc1(api[am]+1,&ptap->apj);
314: apj = ptap->apj;
315: PetscFreeSpaceContiguous(&free_space,ptap->apj);
316: PetscLLDestroy(lnk,lnkbt);
318: /* malloc apa to store dense row A[i,:]*P */
319: PetscCalloc1(pN,&apa);
321: ptap->apa = apa;
323: /* create and assemble symbolic parallel matrix Cmpi */
324: /*----------------------------------------------------*/
325: MatCreate(comm,&Cmpi);
326: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
327: MatSetBlockSizesFromMats(Cmpi,A,P);
329: MatSetType(Cmpi,MATMPIAIJ);
330: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
331: MatPreallocateFinalize(dnz,onz);
332: for (i=0; i<am; i++) {
333: row = i + rstart;
334: apnz = api[i+1] - api[i];
335: MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);
336: apj += apnz;
337: }
338: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
339: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
341: ptap->destroy = Cmpi->ops->destroy;
342: ptap->duplicate = Cmpi->ops->duplicate;
343: Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
344: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
345: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
347: /* attach the supporting struct to Cmpi for reuse */
348: c = (Mat_MPIAIJ*)Cmpi->data;
349: c->ptap = ptap;
351: *C = Cmpi;
353: /* set MatInfo */
354: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
355: if (afill < 1.0) afill = 1.0;
356: Cmpi->info.mallocs = nspacedouble;
357: Cmpi->info.fill_ratio_given = fill;
358: Cmpi->info.fill_ratio_needed = afill;
360: #if defined(PETSC_USE_INFO)
361: if (api[am]) {
362: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
363: PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
364: } else {
365: PetscInfo(Cmpi,"Empty matrix product\n");
366: }
367: #endif
368: return(0);
369: }
373: PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
374: {
378: if (scall == MAT_INITIAL_MATRIX) {
379: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
380: MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);
381: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
382: }
383: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
384: MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);
385: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
386: return(0);
387: }
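/*
   Editor's note (hedged usage sketch): MatMatMult() dispatches to this routine when A is
   MATMPIAIJ and B is MATMPIDENSE; the product C is created as MATMPIDENSE.  The handles
   A, B, C are hypothetical.

     Mat C;
     MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);  // symbolic: creates C and the "workB" container
     MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);    // numeric only, reusing the scatter buffers
     MatDestroy(&C);
*/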
389: typedef struct {
390: Mat workB;
391: PetscScalar *rvalues,*svalues;
392: MPI_Request *rwaits,*swaits;
393: } MPIAIJ_MPIDense;
397: PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
398: {
399: MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
400: PetscErrorCode ierr;
403: MatDestroy(&contents->workB);
404: PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);
405: PetscFree(contents);
406: return(0);
407: }
411: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
412: {
413: PetscErrorCode ierr;
414: Mat_MPIAIJ *aij = (Mat_MPIAIJ*) A->data;
415: PetscInt nz = aij->B->cmap->n;
416: PetscContainer container;
417: MPIAIJ_MPIDense *contents;
418: VecScatter ctx = aij->Mvctx;
419: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
420: VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
421: PetscInt m = A->rmap->n,n=B->cmap->n;
424: MatCreate(PetscObjectComm((PetscObject)B),C);
425: MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);
426: MatSetBlockSizesFromMats(*C,A,B);
427: MatSetType(*C,MATMPIDENSE);
428: MatMPIDenseSetPreallocation(*C,NULL);
429: MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);
430: MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);
432: (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
434: PetscNew(&contents);
435: /* Create work matrix used to store off processor rows of B needed for local product */
436: MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);
437: /* Create work arrays needed */
438: PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
439: B->cmap->N*to->starts[to->n],&contents->svalues,
440: from->n,&contents->rwaits,
441: to->n,&contents->swaits);
443: PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);
444: PetscContainerSetPointer(container,contents);
445: PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);
446: PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);
447: PetscContainerDestroy(&container);
448: return(0);
449: }
453: /*
454: Performs an efficient scatter on the rows of B needed by this process; this is
455: a modification of the VecScatterBegin_() routines.
456: */
457: PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
458: {
459: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
460: PetscErrorCode ierr;
461: PetscScalar *b,*w,*svalues,*rvalues;
462: VecScatter ctx = aij->Mvctx;
463: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
464: VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
465: PetscInt i,j,k;
466: PetscInt *sindices,*sstarts,*rindices,*rstarts;
467: PetscMPIInt *sprocs,*rprocs,nrecvs;
468: MPI_Request *swaits,*rwaits;
469: MPI_Comm comm;
470: PetscMPIInt tag = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
471: MPI_Status status;
472: MPIAIJ_MPIDense *contents;
473: PetscContainer container;
474: Mat workB;
477: PetscObjectGetComm((PetscObject)A,&comm);
478: PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);
479: if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
480: PetscContainerGetPointer(container,(void**)&contents);
482: workB = *outworkB = contents->workB;
483: if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",workB->rmap->n,nrows);
484: sindices = to->indices;
485: sstarts = to->starts;
486: sprocs = to->procs;
487: swaits = contents->swaits;
488: svalues = contents->svalues;
490: rindices = from->indices;
491: rstarts = from->starts;
492: rprocs = from->procs;
493: rwaits = contents->rwaits;
494: rvalues = contents->rvalues;
496: MatDenseGetArray(B,&b);
497: MatDenseGetArray(workB,&w);
499: for (i=0; i<from->n; i++) {
500: MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
501: }
503: for (i=0; i<to->n; i++) {
504: /* pack a message at a time */
505: for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
506: for (k=0; k<ncols; k++) {
507: svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
508: }
509: }
510: MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
511: }
513: nrecvs = from->n;
514: while (nrecvs) {
515: MPI_Waitany(from->n,rwaits,&imdex,&status);
516: nrecvs--;
517: /* unpack a message at a time */
518: for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
519: for (k=0; k<ncols; k++) {
520: w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
521: }
522: }
523: }
524: if (to->n) {MPI_Waitall(to->n,swaits,to->sstatus);}
526: MatDenseRestoreArray(B,&b);
527: MatDenseRestoreArray(workB,&w);
528: MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);
529: MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);
530: return(0);
531: }
532: extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);
536: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
537: {
539: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
540: Mat_MPIDense *bdense = (Mat_MPIDense*)B->data;
541: Mat_MPIDense *cdense = (Mat_MPIDense*)C->data;
542: Mat workB;
545: /* diagonal block of A times all local rows of B*/
546: MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);
548: /* get off processor parts of B needed to complete the product */
549: MatMPIDenseScatter(A,B,C,&workB);
551: /* off-diagonal block of A times nonlocal rows of B */
552: MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);
553: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
554: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
555: return(0);
556: }
560: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
561: {
563: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
564: Mat_SeqAIJ *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
565: Mat_SeqAIJ *cd = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
566: PetscInt *adi = ad->i,*adj,*aoi=ao->i,*aoj;
567: PetscScalar *ada,*aoa,*cda=cd->a,*coa=co->a;
568: Mat_SeqAIJ *p_loc,*p_oth;
569: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
570: PetscScalar *pa_loc,*pa_oth,*pa,valtmp,*ca;
571: PetscInt cm = C->rmap->n,anz,pnz;
572: Mat_PtAPMPI *ptap = c->ptap;
573: PetscScalar *apa_sparse = ptap->apa;
574: PetscInt *api,*apj,*apJ,i,j,k,row;
575: PetscInt cstart = C->cmap->rstart;
576: PetscInt cdnz,conz,k0,k1,nextp;
577: MPI_Comm comm;
578: PetscMPIInt size;
581: PetscObjectGetComm((PetscObject)A,&comm);
582: MPI_Comm_size(comm,&size);
584: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
585: /*-----------------------------------------------------*/
586: /* update numerical values of P_oth and P_loc */
587: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
588: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
590: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
591: /*----------------------------------------------------------*/
592: /* get data from symbolic products */
593: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
594: pi_loc = p_loc->i; pj_loc = p_loc->j; pa_loc = p_loc->a;
595: if (size >1) {
596: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
597: pi_oth = p_oth->i; pj_oth = p_oth->j; pa_oth = p_oth->a;
598: } else {
599: p_oth = NULL; pi_oth = NULL; pj_oth = NULL; pa_oth = NULL;
600: }
602: api = ptap->api;
603: apj = ptap->apj;
604: for (i=0; i<cm; i++) {
605: apJ = apj + api[i];
607: /* diagonal portion of A */
608: anz = adi[i+1] - adi[i];
609: adj = ad->j + adi[i];
610: ada = ad->a + adi[i];
611: for (j=0; j<anz; j++) {
612: row = adj[j];
613: pnz = pi_loc[row+1] - pi_loc[row];
614: pj = pj_loc + pi_loc[row];
615: pa = pa_loc + pi_loc[row];
616: /* perform sparse axpy */
617: valtmp = ada[j];
618: nextp = 0;
619: for (k=0; nextp<pnz; k++) {
620: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
621: apa_sparse[k] += valtmp*pa[nextp++];
622: }
623: }
624: PetscLogFlops(2.0*pnz);
625: }
627: /* off-diagonal portion of A */
628: anz = aoi[i+1] - aoi[i];
629: aoj = ao->j + aoi[i];
630: aoa = ao->a + aoi[i];
631: for (j=0; j<anz; j++) {
632: row = aoj[j];
633: pnz = pi_oth[row+1] - pi_oth[row];
634: pj = pj_oth + pi_oth[row];
635: pa = pa_oth + pi_oth[row];
636: /* perform sparse axpy */
637: valtmp = aoa[j];
638: nextp = 0;
639: for (k=0; nextp<pnz; k++) {
640: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
641: apa_sparse[k] += valtmp*pa[nextp++];
642: }
643: }
644: PetscLogFlops(2.0*pnz);
645: }
647: /* set values in C */
648: cdnz = cd->i[i+1] - cd->i[i];
649: conz = co->i[i+1] - co->i[i];
651: /* 1st off-diagonal part of C */
652: ca = coa + co->i[i];
653: k = 0;
654: for (k0=0; k0<conz; k0++) {
655: if (apJ[k] >= cstart) break;
656: ca[k0] = apa_sparse[k];
657: apa_sparse[k] = 0.0;
658: k++;
659: }
661: /* diagonal part of C */
662: ca = cda + cd->i[i];
663: for (k1=0; k1<cdnz; k1++) {
664: ca[k1] = apa_sparse[k];
665: apa_sparse[k] = 0.0;
666: k++;
667: }
669: /* 2nd off-diagonal part of C */
670: ca = coa + co->i[i];
671: for (; k0<conz; k0++) {
672: ca[k0] = apa_sparse[k];
673: apa_sparse[k] = 0.0;
674: k++;
675: }
676: }
677: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
678: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
679: return(0);
680: }
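/*
   Editor's note (illustrative sketch, not PETSc code): unlike the nonscalable variant, the
   loop above accumulates into apa_sparse, whose length is only nnz(C[i,:]), by merging each
   contributing row of P against the precomputed sorted column pattern apJ of row i.  A
   stripped-down version of that merge, with hypothetical names, is:

     // add v*P(row,:) into the compressed accumulator acc[], laid out like cols[];
     // cols[] is sorted and contains every index of pj[] (guaranteed by the symbolic phase)
     static void SparseAXPY(PetscScalar v,PetscInt pnz,const PetscInt *pj,const PetscScalar *pa,
                            const PetscInt *cols,PetscScalar *acc)
     {
       PetscInt k = 0,nextp;
       for (nextp=0; nextp<pnz; nextp++) {
         while (cols[k] != pj[nextp]) k++;
         acc[k] += v*pa[nextp];
       }
     }
*/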
682: /* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except using LLCondensed to avoid O(BN) memory requirement */
685: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
686: {
687: PetscErrorCode ierr;
688: MPI_Comm comm;
689: PetscMPIInt size;
690: Mat Cmpi;
691: Mat_PtAPMPI *ptap;
692: PetscFreeSpaceList free_space = NULL,current_space=NULL;
693: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data,*c;
694: Mat_SeqAIJ *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
695: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
696: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
697: PetscInt i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0;
698: PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
699: PetscInt nlnk_max,armax,prmax;
700: PetscReal afill;
701: PetscScalar *apa;
704: PetscObjectGetComm((PetscObject)A,&comm);
705: MPI_Comm_size(comm,&size);
707: /* create struct Mat_PtAPMPI and attach it to C later */
708: PetscNew(&ptap);
710: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
711: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
712:
713: /* get P_loc by taking all local rows of P */
714: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
716: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
717: pi_loc = p_loc->i; pj_loc = p_loc->j;
718: if (size > 1) {
719: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
720: pi_oth = p_oth->i; pj_oth = p_oth->j;
721: } else {
722: p_oth = NULL;
723: pi_oth = NULL; pj_oth = NULL;
724: }
726: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
727: /*-------------------------------------------------------------------*/
728: PetscMalloc1(am+2,&api);
729: ptap->api = api;
730: api[0] = 0;
732: /* create and initialize a linked list */
733: armax = ad->rmax+ao->rmax;
734: if (size >1) {
735: prmax = PetscMax(p_loc->rmax,p_oth->rmax);
736: } else {
737: prmax = p_loc->rmax;
738: }
739: nlnk_max = armax*prmax;
740: if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
741: PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);
743: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
744: PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);
746: current_space = free_space;
748: MatPreallocateInitialize(comm,am,pn,dnz,onz);
749: for (i=0; i<am; i++) {
750: /* diagonal portion of A */
751: nzi = adi[i+1] - adi[i];
752: for (j=0; j<nzi; j++) {
753: row = *adj++;
754: pnz = pi_loc[row+1] - pi_loc[row];
755: Jptr = pj_loc + pi_loc[row];
756: /* add non-zero cols of P into the sorted linked list lnk */
757: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
758: }
759: /* off-diagonal portion of A */
760: nzi = aoi[i+1] - aoi[i];
761: for (j=0; j<nzi; j++) {
762: row = *aoj++;
763: pnz = pi_oth[row+1] - pi_oth[row];
764: Jptr = pj_oth + pi_oth[row];
765: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
766: }
768: apnz = *lnk;
769: api[i+1] = api[i] + apnz;
770: if (apnz > apnz_max) apnz_max = apnz;
772: /* if free space is not available, double the total space in the list */
773: if (current_space->local_remaining<apnz) {
774: PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);
775: nspacedouble++;
776: }
778: /* Copy data into free space, then initialize lnk */
779: PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
780: MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);
782: current_space->array += apnz;
783: current_space->local_used += apnz;
784: current_space->local_remaining -= apnz;
785: }
787: /* Allocate space for apj, initialize apj, and */
788: /* destroy list of free space and other temporary array(s) */
789: PetscMalloc1(api[am]+1,&ptap->apj);
790: apj = ptap->apj;
791: PetscFreeSpaceContiguous(&free_space,ptap->apj);
792: PetscLLCondensedDestroy_Scalable(lnk);
794: /* create and assemble symbolic parallel matrix Cmpi */
795: /*----------------------------------------------------*/
796: MatCreate(comm,&Cmpi);
797: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
798: MatSetBlockSizesFromMats(Cmpi,A,P);
799: MatSetType(Cmpi,MATMPIAIJ);
800: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
801: MatPreallocateFinalize(dnz,onz);
803: /* malloc apa for assembly Cmpi */
804: PetscCalloc1(apnz_max,&apa);
806: ptap->apa = apa;
807: for (i=0; i<am; i++) {
808: row = i + rstart;
809: apnz = api[i+1] - api[i];
810: MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);
811: apj += apnz;
812: }
813: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
814: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
816: ptap->destroy = Cmpi->ops->destroy;
817: ptap->duplicate = Cmpi->ops->duplicate;
818: Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
819: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
820: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
822: /* attach the supporting struct to Cmpi for reuse */
823: c = (Mat_MPIAIJ*)Cmpi->data;
824: c->ptap = ptap;
826: *C = Cmpi;
828: /* set MatInfo */
829: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
830: if (afill < 1.0) afill = 1.0;
831: Cmpi->info.mallocs = nspacedouble;
832: Cmpi->info.fill_ratio_given = fill;
833: Cmpi->info.fill_ratio_needed = afill;
835: #if defined(PETSC_USE_INFO)
836: if (api[am]) {
837: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
838: PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
839: } else {
840: PetscInfo(Cmpi,"Empty matrix product\n");
841: }
842: #endif
843: return(0);
844: }
846: /*-------------------------------------------------------------------------*/
849: PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
850: {
852: const char *algTypes[3] = {"scalable","nonscalable","matmatmult"};
853: PetscInt alg=0; /* set default algorithm */
856: if (scall == MAT_INITIAL_MATRIX) {
857: PetscObjectOptionsBegin((PetscObject)A);
858: PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);
859: PetscOptionsEnd();
861: PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);
862: switch (alg) {
863: case 1:
864: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);
865: break;
866: case 2:
867: {
868: Mat Pt;
869: Mat_PtAPMPI *ptap;
870: Mat_MPIAIJ *c;
871: MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);
872: MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);
873: c = (Mat_MPIAIJ*)(*C)->data;
874: ptap = c->ptap;
875: ptap->Pt = Pt;
876: (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
877: return(0);
878: }
879: break;
880: default:
881: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);
882: break;
883: }
884: PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);
885: }
886: PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);
887: (*(*C)->ops->mattransposemultnumeric)(P,A,*C);
888: PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);
889: return(0);
890: }
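/*
   Editor's note (hedged usage sketch): this dispatch is reached through the user-level
   MatTransposeMatMult() interface, which computes C = P^T * A, and the symbolic algorithm
   can be chosen at runtime.  The handles and the executable name below are placeholders.

     Mat C;
     MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);
     MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);

     ./app -mattransposematmult_via scalable      // default
     ./app -mattransposematmult_via nonscalable
     ./app -mattransposematmult_via matmatmult    // explicit transpose, then MatMatMult
*/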
892: /* This routine only works when scall=MAT_REUSE_MATRIX! */
895: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
896: {
898: Mat_MPIAIJ *c=(Mat_MPIAIJ*)C->data;
899: Mat_PtAPMPI *ptap= c->ptap;
900: Mat Pt=ptap->Pt;
903: MatTranspose(P,MAT_REUSE_MATRIX,&Pt);
904: MatMatMultNumeric(Pt,A,C);
905: return(0);
906: }
908: /* Non-scalable version, use dense axpy */
911: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
912: {
913: PetscErrorCode ierr;
914: Mat_Merge_SeqsToMPI *merge;
915: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
916: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
917: Mat_PtAPMPI *ptap;
918: PetscInt *adj,*aJ;
919: PetscInt i,j,k,anz,pnz,row,*cj;
920: MatScalar *ada,*aval,*ca,valtmp;
921: PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
922: MPI_Comm comm;
923: PetscMPIInt size,rank,taga,*len_s;
924: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
925: PetscInt **buf_ri,**buf_rj;
926: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
927: MPI_Request *s_waits,*r_waits;
928: MPI_Status *status;
929: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
930: PetscInt *ai,*aj,*coi,*coj;
931: PetscInt *poJ,*pdJ;
932: Mat A_loc;
933: Mat_SeqAIJ *a_loc;
936: PetscObjectGetComm((PetscObject)C,&comm);
937: MPI_Comm_size(comm,&size);
938: MPI_Comm_rank(comm,&rank);
940: ptap = c->ptap;
941: merge = ptap->merge;
943: /* 2) compute numeric C_seq = P_loc^T*A_loc - dominating part */
944: /*--------------------------------------------------------------*/
945: /* get data from symbolic products */
946: coi = merge->coi; coj = merge->coj;
947: PetscCalloc1(coi[pon]+1,&coa);
949: bi = merge->bi; bj = merge->bj;
950: owners = merge->rowmap->range;
951: PetscCalloc1(bi[cm]+1,&ba);
953: /* get A_loc by taking all local rows of A */
954: A_loc = ptap->A_loc;
955: MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
956: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
957: ai = a_loc->i;
958: aj = a_loc->j;
960: PetscCalloc1(A->cmap->N,&aval); /* non-scalable!!! */
962: for (i=0; i<am; i++) {
963: /* 2-a) put A[i,:] to dense array aval */
964: anz = ai[i+1] - ai[i];
965: adj = aj + ai[i];
966: ada = a_loc->a + ai[i];
967: for (j=0; j<anz; j++) {
968: aval[adj[j]] = ada[j];
969: }
971: /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
972: /*--------------------------------------------------------------*/
973: /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
974: pnz = po->i[i+1] - po->i[i];
975: poJ = po->j + po->i[i];
976: pA = po->a + po->i[i];
977: for (j=0; j<pnz; j++) {
978: row = poJ[j];
979: cnz = coi[row+1] - coi[row];
980: cj = coj + coi[row];
981: ca = coa + coi[row];
982: /* perform dense axpy */
983: valtmp = pA[j];
984: for (k=0; k<cnz; k++) {
985: ca[k] += valtmp*aval[cj[k]];
986: }
987: PetscLogFlops(2.0*cnz);
988: }
990: /* put the value into Cd (diagonal part) */
991: pnz = pd->i[i+1] - pd->i[i];
992: pdJ = pd->j + pd->i[i];
993: pA = pd->a + pd->i[i];
994: for (j=0; j<pnz; j++) {
995: row = pdJ[j];
996: cnz = bi[row+1] - bi[row];
997: cj = bj + bi[row];
998: ca = ba + bi[row];
999: /* perform dense axpy */
1000: valtmp = pA[j];
1001: for (k=0; k<cnz; k++) {
1002: ca[k] += valtmp*aval[cj[k]];
1003: }
1004: PetscLogFlops(2.0*cnz);
1005: }
1007: /* zero the current row of Pt*A */
1008: aJ = aj + ai[i];
1009: for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
1010: }
1012: /* 3) send and recv matrix values coa */
1013: /*------------------------------------*/
1014: buf_ri = merge->buf_ri;
1015: buf_rj = merge->buf_rj;
1016: len_s = merge->len_s;
1017: PetscCommGetNewTag(comm,&taga);
1018: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
1020: PetscMalloc2(merge->nsend+1,&s_waits,size,&status);
1021: for (proc=0,k=0; proc<size; proc++) {
1022: if (!len_s[proc]) continue;
1023: i = merge->owners_co[proc];
1024: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1025: k++;
1026: }
1027: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1028: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
1030: PetscFree2(s_waits,status);
1031: PetscFree(r_waits);
1032: PetscFree(coa);
1034: /* 4) insert local Cseq and received values into Cmpi */
1035: /*----------------------------------------------------*/
1036: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1037: for (k=0; k<merge->nrecv; k++) {
1038: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1039: nrows = *(buf_ri_k[k]);
1040: nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th received i-structure */
1041: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1042: }
1044: for (i=0; i<cm; i++) {
1045: row = owners[rank] + i; /* global row index of C_seq */
1046: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
1047: ba_i = ba + bi[i];
1048: bnz = bi[i+1] - bi[i];
1049: /* add received vals into ba */
1050: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1051: /* i-th row */
1052: if (i == *nextrow[k]) {
1053: cnz = *(nextci[k]+1) - *nextci[k];
1054: cj = buf_rj[k] + *(nextci[k]);
1055: ca = abuf_r[k] + *(nextci[k]);
1056: nextcj = 0;
1057: for (j=0; nextcj<cnz; j++) {
1058: if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1059: ba_i[j] += ca[nextcj++];
1060: }
1061: }
1062: nextrow[k]++; nextci[k]++;
1063: PetscLogFlops(2.0*cnz);
1064: }
1065: }
1066: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1067: }
1068: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1069: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1071: PetscFree(ba);
1072: PetscFree(abuf_r[0]);
1073: PetscFree(abuf_r);
1074: PetscFree3(buf_ri_k,nextrow,nextci);
1075: PetscFree(aval);
1076: return(0);
1077: }
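/*
   Editor's note (clarifying sketch): the numeric kernel above forms C = P^T * A one local
   row of A at a time, using the outer-product decomposition

     P^T A = sum_i P[i,:]^T A[i,:]

   Each row A[i,:] is first expanded into the dense buffer aval of length A->cmap->N (hence
   the "non-scalable" label).  The diagonal block of P then contributes to the locally owned
   rows of C (array ba), while the off-diagonal block contributes to rows owned by other
   processes (array coa), which are sent and merged in steps 3) and 4).
*/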
1079: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1080: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1083: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
1084: {
1085: PetscErrorCode ierr;
1086: Mat Cmpi,A_loc,POt,PDt;
1087: Mat_PtAPMPI *ptap;
1088: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1089: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c;
1090: PetscInt *pdti,*pdtj,*poti,*potj,*ptJ;
1091: PetscInt nnz;
1092: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1093: PetscInt am=A->rmap->n,pn=P->cmap->n;
1094: PetscBT lnkbt;
1095: MPI_Comm comm;
1096: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1097: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1098: PetscInt len,proc,*dnz,*onz,*owners;
1099: PetscInt nzi,*bi,*bj;
1100: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1101: MPI_Request *swaits,*rwaits;
1102: MPI_Status *sstatus,rstatus;
1103: Mat_Merge_SeqsToMPI *merge;
1104: PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1105: PetscReal afill =1.0,afill_tmp;
1106: PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1107: PetscScalar *vals;
1108: Mat_SeqAIJ *a_loc, *pdt,*pot;
1111: PetscObjectGetComm((PetscObject)A,&comm);
1112: /* check if matrix local sizes are compatible */
1113: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1114: SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1115: }
1117: MPI_Comm_size(comm,&size);
1118: MPI_Comm_rank(comm,&rank);
1120: /* create struct Mat_PtAPMPI and attach it to C later */
1121: PetscNew(&ptap);
1123: /* get A_loc by taking all local rows of A */
1124: MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);
1126: ptap->A_loc = A_loc;
1128: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1129: ai = a_loc->i;
1130: aj = a_loc->j;
1132: /* determine symbolic Co=(p->B)^T*A - send to others */
1133: /*----------------------------------------------------*/
1134: MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1135: pdt = (Mat_SeqAIJ*)PDt->data;
1136: pdti = pdt->i; pdtj = pdt->j;
1138: MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1139: pot = (Mat_SeqAIJ*)POt->data;
1140: poti = pot->i; potj = pot->j;
1142: /* then, compute symbolic Co = (p->B)^T*A */
1143: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
1144: PetscMalloc1(pon+1,&coi);
1145: coi[0] = 0;
1147: /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1148: nnz = fill*(poti[pon] + ai[am]);
1149: PetscFreeSpaceGet(nnz,&free_space);
1150: current_space = free_space;
1152: /* create and initialize a linked list */
1153: i = PetscMax(pdt->rmax,pot->rmax);
1154: Crmax = i*a_loc->rmax*size;
1155: if (!Crmax || Crmax > aN) Crmax = aN;
1156: PetscLLCondensedCreate(Crmax,aN,&lnk,&lnkbt);
1158: for (i=0; i<pon; i++) {
1159: pnz = poti[i+1] - poti[i];
1160: ptJ = potj + poti[i];
1161: for (j=0; j<pnz; j++) {
1162: row = ptJ[j]; /* row of A_loc == col of Pot */
1163: anz = ai[row+1] - ai[row];
1164: Jptr = aj + ai[row];
1165: /* add non-zero cols of AP into the sorted linked list lnk */
1166: PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);
1167: }
1168: nnz = lnk[0];
1170: /* If free space is not available, double the total space in the list */
1171: if (current_space->local_remaining<nnz) {
1172: PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1173: nspacedouble++;
1174: }
1176: /* Copy data into free space, and zero out dense rows */
1177: PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);
1179: current_space->array += nnz;
1180: current_space->local_used += nnz;
1181: current_space->local_remaining -= nnz;
1183: coi[i+1] = coi[i] + nnz;
1184: }
1186: PetscMalloc1(coi[pon]+1,&coj);
1187: PetscFreeSpaceContiguous(&free_space,coj);
1189: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1190: if (afill_tmp > afill) afill = afill_tmp;
1192: /* send j-array (coj) of Co to other processors */
1193: /*----------------------------------------------*/
1194: /* determine row ownership */
1195: PetscNew(&merge);
1196: PetscLayoutCreate(comm,&merge->rowmap);
1198: merge->rowmap->n = pn;
1199: merge->rowmap->bs = 1;
1201: PetscLayoutSetUp(merge->rowmap);
1202: owners = merge->rowmap->range;
1204: /* determine the number of messages to send, their lengths */
1205: PetscCalloc1(size,&len_si);
1206: PetscMalloc1(size,&merge->len_s);
1208: len_s = merge->len_s;
1209: merge->nsend = 0;
1211: PetscMalloc1(size+2,&owners_co);
1212: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
1214: proc = 0;
1215: for (i=0; i<pon; i++) {
1216: while (prmap[i] >= owners[proc+1]) proc++;
1217: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
1218: len_s[proc] += coi[i+1] - coi[i];
1219: }
1221: len = 0; /* max length of buf_si[] */
1222: owners_co[0] = 0;
1223: for (proc=0; proc<size; proc++) {
1224: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1225: if (len_si[proc]) {
1226: merge->nsend++;
1227: len_si[proc] = 2*(len_si[proc] + 1);
1228: len += len_si[proc];
1229: }
1230: }
1232: /* determine the number and length of messages to receive for coi and coj */
1233: PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1234: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
1236: /* post the Irecv and Isend of coj */
1237: PetscCommGetNewTag(comm,&tagj);
1238: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1239: PetscMalloc1(merge->nsend+1,&swaits);
1240: for (proc=0, k=0; proc<size; proc++) {
1241: if (!len_s[proc]) continue;
1242: i = owners_co[proc];
1243: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1244: k++;
1245: }
1247: /* receives and sends of coj are complete */
1248: PetscMalloc1(size,&sstatus);
1249: for (i=0; i<merge->nrecv; i++) {
1250: PetscMPIInt icompleted;
1251: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1252: }
1253: PetscFree(rwaits);
1254: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1256: /* send and recv coi */
1257: /*-------------------*/
1258: PetscCommGetNewTag(comm,&tagi);
1259: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1260: PetscMalloc1(len+1,&buf_s);
1261: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1262: for (proc=0,k=0; proc<size; proc++) {
1263: if (!len_s[proc]) continue;
1264: /* form outgoing message for i-structure:
1265: buf_si[0]: nrows to be sent
1266: [1:nrows]: row index (global)
1267: [nrows+1:2*nrows+1]: i-structure index
1268: */
1269: /*-------------------------------------------*/
1270: nrows = len_si[proc]/2 - 1;
1271: buf_si_i = buf_si + nrows+1;
1272: buf_si[0] = nrows;
1273: buf_si_i[0] = 0;
1274: nrows = 0;
1275: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1276: nzi = coi[i+1] - coi[i];
1277: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1278: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1279: nrows++;
1280: }
1281: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1282: k++;
1283: buf_si += len_si[proc];
1284: }
1285: i = merge->nrecv;
1286: while (i--) {
1287: PetscMPIInt icompleted;
1288: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1289: }
1290: PetscFree(rwaits);
1291: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1292: PetscFree(len_si);
1293: PetscFree(len_ri);
1294: PetscFree(swaits);
1295: PetscFree(sstatus);
1296: PetscFree(buf_s);
1298: /* compute the local portion of C (mpi mat) */
1299: /*------------------------------------------*/
1300: /* allocate bi array and free space for accumulating nonzero column info */
1301: PetscMalloc1(pn+1,&bi);
1302: bi[0] = 0;
1304: /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1305: nnz = fill*(pdti[pn] + poti[pon] + ai[am]);
1306: PetscFreeSpaceGet(nnz,&free_space);
1307: current_space = free_space;
1309: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1310: for (k=0; k<merge->nrecv; k++) {
1311: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1312: nrows = *buf_ri_k[k];
1313: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th received i-structure */
1314: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1315: }
1317: MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
1318: rmax = 0;
1319: for (i=0; i<pn; i++) {
1320: /* add pdt[i,:]*AP into lnk */
1321: pnz = pdti[i+1] - pdti[i];
1322: ptJ = pdtj + pdti[i];
1323: for (j=0; j<pnz; j++) {
1324: row = ptJ[j]; /* row of AP == col of Pt */
1325: anz = ai[row+1] - ai[row];
1326: Jptr = aj + ai[row];
1327: /* add non-zero cols of AP into the sorted linked list lnk */
1328: PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);
1329: }
1331: /* add received col data into lnk */
1332: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1333: if (i == *nextrow[k]) { /* i-th row */
1334: nzi = *(nextci[k]+1) - *nextci[k];
1335: Jptr = buf_rj[k] + *nextci[k];
1336: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1337: nextrow[k]++; nextci[k]++;
1338: }
1339: }
1340: nnz = lnk[0];
1342: /* if free space is not available, make more free space */
1343: if (current_space->local_remaining<nnz) {
1344: PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1345: nspacedouble++;
1346: }
1347: /* copy data into free space, then initialize lnk */
1348: PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);
1349: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
1351: current_space->array += nnz;
1352: current_space->local_used += nnz;
1353: current_space->local_remaining -= nnz;
1355: bi[i+1] = bi[i] + nnz;
1356: if (nnz > rmax) rmax = nnz;
1357: }
1358: PetscFree3(buf_ri_k,nextrow,nextci);
1360: PetscMalloc1(bi[pn]+1,&bj);
1361: PetscFreeSpaceContiguous(&free_space,bj);
1363: afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1364: if (afill_tmp > afill) afill = afill_tmp;
1365: PetscLLCondensedDestroy(lnk,lnkbt);
1366: MatDestroy(&POt);
1367: MatDestroy(&PDt);
1369: /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part */
1370: /*----------------------------------------------------------------------------------*/
1371: PetscCalloc1(rmax+1,&vals);
1373: MatCreate(comm,&Cmpi);
1374: MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1375: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));
1376: MatSetType(Cmpi,MATMPIAIJ);
1377: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1378: MatPreallocateFinalize(dnz,onz);
1379: MatSetBlockSize(Cmpi,1);
1380: for (i=0; i<pn; i++) {
1381: row = i + rstart;
1382: nnz = bi[i+1] - bi[i];
1383: Jptr = bj + bi[i];
1384: MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);
1385: }
1386: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1387: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1388: PetscFree(vals);
1390: merge->bi = bi;
1391: merge->bj = bj;
1392: merge->coi = coi;
1393: merge->coj = coj;
1394: merge->buf_ri = buf_ri;
1395: merge->buf_rj = buf_rj;
1396: merge->owners_co = owners_co;
1397: merge->destroy = Cmpi->ops->destroy;
1398: merge->duplicate = Cmpi->ops->duplicate;
1400: Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1401: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1402: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
1404: /* attach the supporting struct to Cmpi for reuse */
1405: c = (Mat_MPIAIJ*)Cmpi->data;
1406: c->ptap = ptap;
1407: ptap->api = NULL;
1408: ptap->apj = NULL;
1409: ptap->merge = merge;
1410: ptap->rmax = rmax;
1412: *C = Cmpi;
1413: #if defined(PETSC_USE_INFO)
1414: if (bi[pn] != 0) {
1415: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
1416: PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
1417: } else {
1418: PetscInfo(Cmpi,"Empty matrix product\n");
1419: }
1420: #endif
1421: return(0);
1422: }
1426: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
1427: {
1428: PetscErrorCode ierr;
1429: Mat_Merge_SeqsToMPI *merge;
1430: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1431: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1432: Mat_PtAPMPI *ptap;
1433: PetscInt *adj;
1434: PetscInt i,j,k,anz,pnz,row,*cj,nexta;
1435: MatScalar *ada,*ca,valtmp;
1436: PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1437: MPI_Comm comm;
1438: PetscMPIInt size,rank,taga,*len_s;
1439: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1440: PetscInt **buf_ri,**buf_rj;
1441: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
1442: MPI_Request *s_waits,*r_waits;
1443: MPI_Status *status;
1444: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
1445: PetscInt *ai,*aj,*coi,*coj;
1446: PetscInt *poJ,*pdJ;
1447: Mat A_loc;
1448: Mat_SeqAIJ *a_loc;
1451: PetscObjectGetComm((PetscObject)C,&comm);
1452: MPI_Comm_size(comm,&size);
1453: MPI_Comm_rank(comm,&rank);
1455: ptap = c->ptap;
1456: merge = ptap->merge;
1458: /* 2) compute numeric C_seq = P_loc^T*A_loc */
1459: /*------------------------------------------*/
1460: /* get data from symbolic products */
1461: coi = merge->coi; coj = merge->coj;
1462: PetscCalloc1(coi[pon]+1,&coa);
1463: bi = merge->bi; bj = merge->bj;
1464: owners = merge->rowmap->range;
1465: PetscCalloc1(bi[cm]+1,&ba);
1467: /* get A_loc by taking all local rows of A */
1468: A_loc = ptap->A_loc;
1469: MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
1470: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1471: ai = a_loc->i;
1472: aj = a_loc->j;
1474: for (i=0; i<am; i++) {
1475: anz = ai[i+1] - ai[i];
1476: adj = aj + ai[i];
1477: ada = a_loc->a + ai[i];
1479: /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1480: /*-------------------------------------------------------------*/
1481: /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1482: pnz = po->i[i+1] - po->i[i];
1483: poJ = po->j + po->i[i];
1484: pA = po->a + po->i[i];
1485: for (j=0; j<pnz; j++) {
1486: row = poJ[j];
1487: cj = coj + coi[row];
1488: ca = coa + coi[row];
1489: /* perform sparse axpy */
1490: nexta = 0;
1491: valtmp = pA[j];
1492: for (k=0; nexta<anz; k++) {
1493: if (cj[k] == adj[nexta]) {
1494: ca[k] += valtmp*ada[nexta];
1495: nexta++;
1496: }
1497: }
1498: PetscLogFlops(2.0*anz);
1499: }
1501: /* put the value into Cd (diagonal part) */
1502: pnz = pd->i[i+1] - pd->i[i];
1503: pdJ = pd->j + pd->i[i];
1504: pA = pd->a + pd->i[i];
1505: for (j=0; j<pnz; j++) {
1506: row = pdJ[j];
1507: cj = bj + bi[row];
1508: ca = ba + bi[row];
1509: /* perform sparse axpy */
1510: nexta = 0;
1511: valtmp = pA[j];
1512: for (k=0; nexta<anz; k++) {
1513: if (cj[k] == adj[nexta]) {
1514: ca[k] += valtmp*ada[nexta];
1515: nexta++;
1516: }
1517: }
1518: PetscLogFlops(2.0*anz);
1519: }
1520: }
1522: /* 3) send and recv matrix values coa */
1523: /*------------------------------------*/
1524: buf_ri = merge->buf_ri;
1525: buf_rj = merge->buf_rj;
1526: len_s = merge->len_s;
1527: PetscCommGetNewTag(comm,&taga);
1528: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
1530: PetscMalloc2(merge->nsend+1,&s_waits,size,&status);
1531: for (proc=0,k=0; proc<size; proc++) {
1532: if (!len_s[proc]) continue;
1533: i = merge->owners_co[proc];
1534: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1535: k++;
1536: }
1537: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1538: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
1540: PetscFree2(s_waits,status);
1541: PetscFree(r_waits);
1542: PetscFree(coa);
1544: /* 4) insert local Cseq and received values into Cmpi */
1545: /*----------------------------------------------------*/
1546: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1547: for (k=0; k<merge->nrecv; k++) {
1548: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1549: nrows = *(buf_ri_k[k]);
1550: nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th received i-structure */
1551: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1552: }
1554: for (i=0; i<cm; i++) {
1555: row = owners[rank] + i; /* global row index of C_seq */
1556: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
1557: ba_i = ba + bi[i];
1558: bnz = bi[i+1] - bi[i];
1559: /* add received vals into ba */
1560: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1561: /* i-th row */
1562: if (i == *nextrow[k]) {
1563: cnz = *(nextci[k]+1) - *nextci[k];
1564: cj = buf_rj[k] + *(nextci[k]);
1565: ca = abuf_r[k] + *(nextci[k]);
1566: nextcj = 0;
1567: for (j=0; nextcj<cnz; j++) {
1568: if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1569: ba_i[j] += ca[nextcj++];
1570: }
1571: }
1572: nextrow[k]++; nextci[k]++;
1573: PetscLogFlops(2.0*cnz);
1574: }
1575: }
1576: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1577: }
1578: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1579: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1581: PetscFree(ba);
1582: PetscFree(abuf_r[0]);
1583: PetscFree(abuf_r);
1584: PetscFree3(buf_ri_k,nextrow,nextci);
1585: return(0);
1586: }
1588: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1589: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ();
1590: differs from MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable in using LLCondensedCreate_Scalable() */
1593: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1594: {
1595: PetscErrorCode ierr;
1596: Mat Cmpi,A_loc,POt,PDt;
1597: Mat_PtAPMPI *ptap;
1598: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1599: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c;
1600: PetscInt *pdti,*pdtj,*poti,*potj,*ptJ;
1601: PetscInt nnz;
1602: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1603: PetscInt am =A->rmap->n,pn=P->cmap->n;
1604: MPI_Comm comm;
1605: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1606: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1607: PetscInt len,proc,*dnz,*onz,*owners;
1608: PetscInt nzi,*bi,*bj;
1609: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1610: MPI_Request *swaits,*rwaits;
1611: MPI_Status *sstatus,rstatus;
1612: Mat_Merge_SeqsToMPI *merge;
1613: PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1614: PetscReal afill =1.0,afill_tmp;
1615: PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1616: PetscScalar *vals;
1617: Mat_SeqAIJ *a_loc, *pdt,*pot;
1620: PetscObjectGetComm((PetscObject)A,&comm);
1621: /* check if matrix local sizes are compatible */
1622: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1623: SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1624: }
1626: MPI_Comm_size(comm,&size);
1627: MPI_Comm_rank(comm,&rank);
1629: /* create struct Mat_PtAPMPI and attach it to C later */
1630: PetscNew(&ptap);
1632: /* get A_loc by taking all local rows of A */
1633: MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);
1635: ptap->A_loc = A_loc;
1636: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1637: ai = a_loc->i;
1638: aj = a_loc->j;
1640: /* determine symbolic Co=(p->B)^T*A - send to others */
1641: /*----------------------------------------------------*/
1642: MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1643: pdt = (Mat_SeqAIJ*)PDt->data;
1644: pdti = pdt->i; pdtj = pdt->j;
1646: MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1647: pot = (Mat_SeqAIJ*)POt->data;
1648: poti = pot->i; potj = pot->j;
1650: /* then, compute symbolic Co = (p->B)^T*A */
1651: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1652: >= (num of nonzero rows of C_seq) - pn */
1653: PetscMalloc1(pon+1,&coi);
1654: coi[0] = 0;
1656: /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1657: nnz = fill*(poti[pon] + ai[am]);
1658: PetscFreeSpaceGet(nnz,&free_space);
1659: current_space = free_space;
1661: /* create and initialize a linked list */
1662: i = PetscMax(pdt->rmax,pot->rmax);
1663: Crmax = i*a_loc->rmax*size; /* non-scalable! */
1664: if (!Crmax || Crmax > aN) Crmax = aN;
1665: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
1667: for (i=0; i<pon; i++) {
1668: pnz = poti[i+1] - poti[i];
1669: ptJ = potj + poti[i];
1670: for (j=0; j<pnz; j++) {
1671: row = ptJ[j]; /* row of A_loc == col of Pot */
1672: anz = ai[row+1] - ai[row];
1673: Jptr = aj + ai[row];
1674: /* add non-zero cols of AP into the sorted linked list lnk */
1675: PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1676: }
1677: nnz = lnk[0];
1679: /* If free space is not available, double the total space in the list */
1680: if (current_space->local_remaining<nnz) {
1681: PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1682: nspacedouble++;
1683: }
1685: /* Copy data into free space, and zero out dense rows */
1686: PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
1688: current_space->array += nnz;
1689: current_space->local_used += nnz;
1690: current_space->local_remaining -= nnz;
1692: coi[i+1] = coi[i] + nnz;
1693: }
1695: PetscMalloc1(coi[pon]+1,&coj);
1696: PetscFreeSpaceContiguous(&free_space,coj);
1698: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1699: if (afill_tmp > afill) afill = afill_tmp;
1701: /* send j-array (coj) of Co to other processors */
1702: /*----------------------------------------------*/
1703: /* determine row ownership */
1704: PetscNew(&merge);
1705: PetscLayoutCreate(comm,&merge->rowmap);
1707: merge->rowmap->n = pn;
1708: merge->rowmap->bs = 1;
1710: PetscLayoutSetUp(merge->rowmap);
1711: owners = merge->rowmap->range;
1713: /* determine the number of messages to send, their lengths */
1714: PetscCalloc1(size,&len_si);
1715: PetscMalloc1(size,&merge->len_s);
1717: len_s = merge->len_s;
1718: merge->nsend = 0;
1720: PetscMalloc1(size+2,&owners_co);
1721: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
1723: proc = 0;
1724: for (i=0; i<pon; i++) {
1725: while (prmap[i] >= owners[proc+1]) proc++;
1726: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
1727: len_s[proc] += coi[i+1] - coi[i];
1728: }
1730: len = 0; /* max length of buf_si[] */
1731: owners_co[0] = 0;
1732: for (proc=0; proc<size; proc++) {
1733: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1734: if (len_si[proc]) {
1735: merge->nsend++;
1736: len_si[proc] = 2*(len_si[proc] + 1);
1737: len += len_si[proc];
1738: }
1739: }
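      /* len_si[proc] = 2*(nrows+1) because each i-structure message carries the row
         count, nrows local row indices and nrows+1 row offsets (see the message
         layout described in the send loop below); len_s[proc] counts the coj column
         indices going to proc */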
1741: /* determine the number and length of messages to receive for coi and coj */
1742: PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1743: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
1745: /* post the Irecv and Isend of coj */
1746: PetscCommGetNewTag(comm,&tagj);
1747: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1748: PetscMalloc1(merge->nsend+1,&swaits);
1749: for (proc=0, k=0; proc<size; proc++) {
1750: if (!len_s[proc]) continue;
1751: i = owners_co[proc];
1752: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1753: k++;
1754: }
1756: /* receives and sends of coj are complete */
1757: PetscMalloc1(size,&sstatus);
1758: for (i=0; i<merge->nrecv; i++) {
1759: PetscMPIInt icompleted;
1760: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1761: }
1762: PetscFree(rwaits);
1763: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
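      /* at this point buf_rj[k] holds the coj entries (global column indices of C)
         received from the k-th sending process; the matching row indices and row
         offsets are exchanged next */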
1765: /* send and recv coi */
1766: /*-------------------*/
1767: PetscCommGetNewTag(comm,&tagi);
1768: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1769: PetscMalloc1(len+1,&buf_s);
1770: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1771: for (proc=0,k=0; proc<size; proc++) {
1772: if (!len_s[proc]) continue;
1773: /* form outgoing message for i-structure:
1774: buf_si[0]: nrows to be sent
1775:        [1:nrows]: row index (local to the receiving process)
1776: [nrows+1:2*nrows+1]: i-structure index
1777: */
1778: /*-------------------------------------------*/
1779: nrows = len_si[proc]/2 - 1;
1780: buf_si_i = buf_si + nrows+1;
1781: buf_si[0] = nrows;
1782: buf_si_i[0] = 0;
1783: nrows = 0;
1784: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1785: nzi = coi[i+1] - coi[i];
1786: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1787:       buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
1788: nrows++;
1789: }
1790: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1791: k++;
1792: buf_si += len_si[proc];
1793: }
1794: i = merge->nrecv;
1795: while (i--) {
1796: PetscMPIInt icompleted;
1797: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1798: }
1799: PetscFree(rwaits);
1800: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1801: PetscFree(len_si);
1802: PetscFree(len_ri);
1803: PetscFree(swaits);
1804: PetscFree(sstatus);
1805: PetscFree(buf_s);
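      /* each buf_ri[k] now contains one received i-structure: nrows, the nrows local
         row indices, and the nrows+1 offsets into buf_rj[k] */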
1807: /* compute the local portion of C (mpi mat) */
1808: /*------------------------------------------*/
1809: /* allocate bi array and free space for accumulating nonzero column info */
1810: PetscMalloc1(pn+1,&bi);
1811: bi[0] = 0;
1813:   /* set initial free space to be fill*(nnz(P) + nnz(A)), using the local parts of P and A */
1814: nnz = fill*(pdti[pn] + poti[pon] + ai[am]);
1815: PetscFreeSpaceGet(nnz,&free_space);
1816: current_space = free_space;
1818: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1819: for (k=0; k<merge->nrecv; k++) {
1820:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1821:     nrows = *buf_ri_k[k];
1822:     nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th received i-structure */
1823:     nextci[k] = buf_ri_k[k] + (nrows + 1); /* points into the row offsets of the k-th received i-structure */
1824: }
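      /* nextrow[k]/nextci[k] are cursors into the k-th received i-structure; they are
         advanced as the row loop below consumes the received rows in increasing order */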
1826: MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
1827: rmax = 0;
1828: for (i=0; i<pn; i++) {
1829: /* add pdt[i,:]*AP into lnk */
1830: pnz = pdti[i+1] - pdti[i];
1831: ptJ = pdtj + pdti[i];
1832: for (j=0; j<pnz; j++) {
1833: row = ptJ[j]; /* row of AP == col of Pt */
1834: anz = ai[row+1] - ai[row];
1835: Jptr = aj + ai[row];
1836: /* add non-zero cols of AP into the sorted linked list lnk */
1837: PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1838: }
1840: /* add received col data into lnk */
1841: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1842: if (i == *nextrow[k]) { /* i-th row */
1843: nzi = *(nextci[k]+1) - *nextci[k];
1844: Jptr = buf_rj[k] + *nextci[k];
1845: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
1846: nextrow[k]++; nextci[k]++;
1847: }
1848: }
1849: nnz = lnk[0];
1851: /* if free space is not available, make more free space */
1852: if (current_space->local_remaining<nnz) {
1853:       PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1854: nspacedouble++;
1855: }
1856: /* copy data into free space, then initialize lnk */
1857: PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
1858: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
1860: current_space->array += nnz;
1861: current_space->local_used += nnz;
1862: current_space->local_remaining -= nnz;
1864: bi[i+1] = bi[i] + nnz;
1865: if (nnz > rmax) rmax = nnz;
1866: }
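      /* bi[] now gives the row offsets of the owned part of C = P^T*A, dnz/onz the
         per-row diagonal/off-diagonal nonzero counts used for preallocation, and
         rmax the longest row, used to size the temporary zero-value buffer below */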
1867: PetscFree3(buf_ri_k,nextrow,nextci);
1869: PetscMalloc1(bi[pn]+1,&bj);
1870: PetscFreeSpaceContiguous(&free_space,bj);
1871: afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1872: if (afill_tmp > afill) afill = afill_tmp;
1873: PetscLLCondensedDestroy_Scalable(lnk);
1874: MatDestroy(&POt);
1875: MatDestroy(&PDt);
1877:   /* create symbolic parallel matrix Cmpi - why can it not be assembled in the Numeric part? */
1878: /*----------------------------------------------------------------------------------*/
1879: PetscCalloc1(rmax+1,&vals);
1881: MatCreate(comm,&Cmpi);
1882: MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1883: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));
1884: MatSetType(Cmpi,MATMPIAIJ);
1885: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1886: MatPreallocateFinalize(dnz,onz);
1887: MatSetBlockSize(Cmpi,1);
1888: for (i=0; i<pn; i++) {
1889: row = i + rstart;
1890: nnz = bi[i+1] - bi[i];
1891: Jptr = bj + bi[i];
1892: MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);
1893: }
1894: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1895: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1896: PetscFree(vals);
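      /* the nonzero pattern of Cmpi has been inserted with explicit zeros and
         assembled; the numeric routine only fills values into this fixed pattern */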
1898: merge->bi = bi;
1899: merge->bj = bj;
1900: merge->coi = coi;
1901: merge->coj = coj;
1902: merge->buf_ri = buf_ri;
1903: merge->buf_rj = buf_rj;
1904: merge->owners_co = owners_co;
1905: merge->destroy = Cmpi->ops->destroy;
1906: merge->duplicate = Cmpi->ops->duplicate;
1908: Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1909: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1910: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
1912: /* attach the supporting struct to Cmpi for reuse */
1913: c = (Mat_MPIAIJ*)Cmpi->data;
1915: c->ptap = ptap;
1916: ptap->api = NULL;
1917: ptap->apj = NULL;
1918: ptap->merge = merge;
1919: ptap->rmax = rmax;
1920: ptap->apa = NULL;
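      /* merge keeps the received i/j structures and communication metadata, ptap keeps
         A_loc; both stay attached to Cmpi so MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ
         (set above) can reuse them, and the destroy/duplicate overrides ensure they
         are freed together with Cmpi rather than inherited by a duplicate */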
1922: *C = Cmpi;
1923: #if defined(PETSC_USE_INFO)
1924: if (bi[pn] != 0) {
1925: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
1926: PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
1927: } else {
1928: PetscInfo(Cmpi,"Empty matrix product\n");
1929: }
1930: #endif
1931: return(0);
1932: }
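/*
   Usage sketch (not part of the PETSc source; the matrix setup is elided and the
   fill value 2.0 is an arbitrary example):

       Mat A,P,C;
       ... create and assemble MPIAIJ matrices A and P with identical row layouts ...
       MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,2.0,&C);    symbolic + numeric C = P^T*A
       MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,2.0,&C);      recompute values, reuse pattern
       MatDestroy(&C);

   For a pair of MPIAIJ matrices the MAT_INITIAL_MATRIX call reaches the symbolic
   routine above; the MAT_REUSE_MATRIX call reuses the merge/ptap data attached to C.
*/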