Actual source code: mpimatmatmult.c
petsc-3.7.3 2016-08-01
2: /*
3: Defines matrix-matrix product routines for pairs of MPIAIJ matrices
4: C = A * B
5: */
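/*
   Usage sketch (assumes A and B are MATMPIAIJ matrices on the same communicator with
   compatible layouts; not a complete program):

     Mat A,B,C;
     MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);   symbolic + numeric product
     MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);     reuse the symbolic data after the
                                                            numerical values of A or B change
     MatDestroy(&C);
*/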
6: #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7: #include <../src/mat/utils/freespace.h>
8: #include <../src/mat/impls/aij/mpi/mpiaij.h>
9: #include <petscbt.h>
10: #include <../src/mat/impls/dense/mpi/mpidense.h>
11: #include <petsc/private/vecimpl.h>
15: PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
16: {
18: const char *algTypes[2] = {"scalable","nonscalable"};
19: PetscInt alg=1; /* set default algorithm */
20: MPI_Comm comm;
23: if (scall == MAT_INITIAL_MATRIX) {
24: PetscObjectGetComm((PetscObject)A,&comm);
25: if (A->cmap->rstart != B->rmap->rstart || A->cmap->rend != B->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,B->rmap->rstart,B->rmap->rend);
27: PetscObjectOptionsBegin((PetscObject)A);
28: PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,2,algTypes[1],&alg,NULL);
29: PetscOptionsEnd();
31: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
32: switch (alg) {
33: case 1:
34: MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);
35: break;
36: default:
37: MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);
38: break;
39: }
40: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
41: }
42: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
43: (*(*C)->ops->matmultnumeric)(A,B,*C);
44: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
45: return(0);
46: }
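/*
   The symbolic algorithm used above can be selected at runtime, e.g.

     -matmatmult_via nonscalable   (default; accumulates each row of C in a dense work
                                    array of length B->cmap->N)
     -matmatmult_via scalable      (condensed linked-list variant with a sparse accumulator)
*/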
50: PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
51: {
53: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
54: Mat_PtAPMPI *ptap = a->ptap;
57: PetscFree2(ptap->startsj_s,ptap->startsj_r);
58: PetscFree(ptap->bufa);
59: MatDestroy(&ptap->P_loc);
60: MatDestroy(&ptap->P_oth);
61: MatDestroy(&ptap->Pt);
62: PetscFree(ptap->api);
63: PetscFree(ptap->apj);
64: PetscFree(ptap->apa);
65: ptap->destroy(A);
66: PetscFree(ptap);
67: return(0);
68: }
72: PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
73: {
75: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
76: Mat_PtAPMPI *ptap = a->ptap;
79: (*ptap->duplicate)(A,op,M);
81: (*M)->ops->destroy = ptap->destroy; /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
82: (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
83: return(0);
84: }
88: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,Mat C)
89: {
91: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
92: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
93: Mat_SeqAIJ *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
94: PetscScalar *cda=cd->a,*coa=co->a;
95: Mat_SeqAIJ *p_loc,*p_oth;
96: PetscScalar *apa,*ca;
97: PetscInt cm =C->rmap->n;
98: Mat_PtAPMPI *ptap=c->ptap;
99: PetscInt *api,*apj,*apJ,i,k;
100: PetscInt cstart=C->cmap->rstart;
101: PetscInt cdnz,conz,k0,k1;
102: MPI_Comm comm;
103: PetscMPIInt size;
106: PetscObjectGetComm((PetscObject)A,&comm);
107: MPI_Comm_size(comm,&size);
109: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
110: /*-----------------------------------------------------*/
111: /* update numerical values of P_oth and P_loc */
112: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
113: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
115: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
116: /*----------------------------------------------------------*/
117: /* get data from symbolic products */
118: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
119: p_oth = NULL;
120: if (size >1) {
121: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
122: }
124: /* get apa for storing dense row A[i,:]*P */
125: apa = ptap->apa;
127: api = ptap->api;
128: apj = ptap->apj;
129: for (i=0; i<cm; i++) {
130: /* compute apa = A[i,:]*P */
131: AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
133: /* set values in C */
134: apJ = apj + api[i];
135: cdnz = cd->i[i+1] - cd->i[i];
136: conz = co->i[i+1] - co->i[i];
138: /* 1st off-diagonal part of C */
139: ca = coa + co->i[i];
140: k = 0;
141: for (k0=0; k0<conz; k0++) {
142: if (apJ[k] >= cstart) break;
143: ca[k0] = apa[apJ[k]];
144: apa[apJ[k]] = 0.0;
145: k++;
146: }
148: /* diagonal part of C */
149: ca = cda + cd->i[i];
150: for (k1=0; k1<cdnz; k1++) {
151: ca[k1] = apa[apJ[k]];
152: apa[apJ[k]] = 0.0;
153: k++;
154: }
156: /* 2nd off-diagonal part of C */
157: ca = coa + co->i[i];
158: for (; k0<conz; k0++) {
159: ca[k0] = apa[apJ[k]];
160: apa[apJ[k]] = 0.0;
161: k++;
162: }
163: }
164: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
165: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
166: return(0);
167: }
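/*
   Note on the row loop above: AProw_nonscalable() accumulates row i of A*P into the dense
   work array apa (length P->cmap->N, allocated in the symbolic phase), and the three copy
   loops scatter apa into the off-diagonal (columns < cstart), diagonal, and remaining
   off-diagonal blocks of C, zeroing apa as they go so it can be reused for the next row.
*/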
171: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
172: {
173: PetscErrorCode ierr;
174: MPI_Comm comm;
175: PetscMPIInt size;
176: Mat Cmpi;
177: Mat_PtAPMPI *ptap;
178: PetscFreeSpaceList free_space=NULL,current_space=NULL;
179: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*c;
180: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
181: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
182: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
183: PetscInt *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
184: PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n,Crmax;
185: PetscBT lnkbt;
186: PetscScalar *apa;
187: PetscReal afill;
188: PetscTable ta;
191: PetscObjectGetComm((PetscObject)A,&comm);
192: MPI_Comm_size(comm,&size);
194: /* create struct Mat_PtAPMPI and attach it to C later */
195: PetscNew(&ptap);
197: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
198: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
200: /* get P_loc by taking all local rows of P */
201: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
203: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
204: pi_loc = p_loc->i; pj_loc = p_loc->j;
205: if (size > 1) {
206: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
207: pi_oth = p_oth->i; pj_oth = p_oth->j;
208: } else {
209: p_oth = NULL;
210: pi_oth = NULL; pj_oth = NULL;
211: }
213: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
214: /*-------------------------------------------------------------------*/
215: PetscMalloc1(am+2,&api);
216: ptap->api = api;
217: api[0] = 0;
219: /* create and initialize a linked list */
220: PetscTableCreate(pN,pN,&ta);
221: MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
222: MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
223: PetscTableGetCount(ta,&Crmax);
224: PetscTableDestroy(&ta);
226: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
228: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
229: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);
230: current_space = free_space;
232: MatPreallocateInitialize(comm,am,pn,dnz,onz);
233: for (i=0; i<am; i++) {
234: /* diagonal portion of A */
235: nzi = adi[i+1] - adi[i];
236: for (j=0; j<nzi; j++) {
237: row = *adj++;
238: pnz = pi_loc[row+1] - pi_loc[row];
239: Jptr = pj_loc + pi_loc[row];
240: /* add non-zero cols of P into the sorted linked list lnk */
241: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
242: }
243: /* off-diagonal portion of A */
244: nzi = aoi[i+1] - aoi[i];
245: for (j=0; j<nzi; j++) {
246: row = *aoj++;
247: pnz = pi_oth[row+1] - pi_oth[row];
248: Jptr = pj_oth + pi_oth[row];
249: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
250: }
252: apnz = lnk[0];
253: api[i+1] = api[i] + apnz;
255: /* if free space is not available, double the total space in the list */
256: if (current_space->local_remaining<apnz) {
257: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
258: nspacedouble++;
259: }
261: /* Copy data into free space, then initialize lnk */
262: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
263: MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);
265: current_space->array += apnz;
266: current_space->local_used += apnz;
267: current_space->local_remaining -= apnz;
268: }
270: /* Allocate space for apj, initialize apj, and */
271: /* destroy list of free space and other temporary array(s) */
272: PetscMalloc1(api[am]+1,&ptap->apj);
273: apj = ptap->apj;
274: PetscFreeSpaceContiguous(&free_space,ptap->apj);
275: PetscLLDestroy(lnk,lnkbt);
277: /* malloc apa to store dense row A[i,:]*P */
278: PetscCalloc1(pN,&apa);
280: ptap->apa = apa;
282: /* create and assemble symbolic parallel matrix Cmpi */
283: /*----------------------------------------------------*/
284: MatCreate(comm,&Cmpi);
285: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
286: MatSetBlockSizesFromMats(Cmpi,A,P);
288: MatSetType(Cmpi,MATMPIAIJ);
289: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
290: MatPreallocateFinalize(dnz,onz);
291: for (i=0; i<am; i++) {
292: row = i + rstart;
293: apnz = api[i+1] - api[i];
294: MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);
295: apj += apnz;
296: }
297: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
298: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
300: ptap->destroy = Cmpi->ops->destroy;
301: ptap->duplicate = Cmpi->ops->duplicate;
302: Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
303: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
304: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
306: /* attach the supporting struct to Cmpi for reuse */
307: c = (Mat_MPIAIJ*)Cmpi->data;
308: c->ptap = ptap;
310: *C = Cmpi;
312: /* set MatInfo */
313: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
314: if (afill < 1.0) afill = 1.0;
315: Cmpi->info.mallocs = nspacedouble;
316: Cmpi->info.fill_ratio_given = fill;
317: Cmpi->info.fill_ratio_needed = afill;
319: #if defined(PETSC_USE_INFO)
320: if (api[am]) {
321: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
322: PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
323: } else {
324: PetscInfo(Cmpi,"Empty matrix product\n");
325: }
326: #endif
327: return(0);
328: }
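/*
   The PetscInfo output above reports the fill ratio actually needed; passing that value as
   the 'fill' argument in subsequent MatMatMult() calls with the same sparsity patterns should
   avoid the reallocations counted in nspacedouble.
*/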
332: PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
333: {
337: if (scall == MAT_INITIAL_MATRIX) {
338: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
339: MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);
340: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
341: }
342: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
343: MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);
344: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
345: return(0);
346: }
348: typedef struct {
349: Mat workB;
350: PetscScalar *rvalues,*svalues;
351: MPI_Request *rwaits,*swaits;
352: } MPIAIJ_MPIDense;
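/*
   Contents of the "workB" container attached to C by the MPIAIJ*MPIDense routines below:
     workB            - sequential dense matrix holding the off-process rows of B needed for
                        the local product (aij->B->cmap->n rows, B->cmap->N columns)
     rvalues, svalues - receive/send buffers for those rows
     rwaits,  swaits  - the corresponding MPI request arrays
*/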
356: PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
357: {
358: MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
359: PetscErrorCode ierr;
362: MatDestroy(&contents->workB);
363: PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);
364: PetscFree(contents);
365: return(0);
366: }
370: /*
371: This is a "dummy function" that handles the case where matrix C was created as a dense matrix
372: directly by the user and passed to MatMatMult() with the MAT_REUSE_MATRIX option
374: It is the same as MatMatMultSymbolic_MPIAIJ_MPIDense() except that it does not create C
375: */
376: PetscErrorCode MatMatMultNumeric_MPIDense(Mat A,Mat B,Mat C)
377: {
378: PetscErrorCode ierr;
379: PetscBool flg;
380: Mat_MPIAIJ *aij = (Mat_MPIAIJ*) A->data;
381: PetscInt nz = aij->B->cmap->n;
382: PetscContainer container;
383: MPIAIJ_MPIDense *contents;
384: VecScatter ctx = aij->Mvctx;
385: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
386: VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
389: PetscObjectTypeCompare((PetscObject)B,MATMPIDENSE,&flg);
390: if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Second matrix must be mpidense");
392: /* Handle the case where the user provided the final C matrix rather than calling MatMatMult() with MAT_INITIAL_MATRIX */
393: PetscObjectTypeCompare((PetscObject)A,MATMPIAIJ,&flg);
394: if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"First matrix must be MPIAIJ");
396: C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
398: PetscNew(&contents);
399: /* Create work matrix used to store off processor rows of B needed for local product */
400: MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);
401: /* Create work arrays needed */
402: PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
403: B->cmap->N*to->starts[to->n],&contents->svalues,
404: from->n,&contents->rwaits,
405: to->n,&contents->swaits);
407: PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);
408: PetscContainerSetPointer(container,contents);
409: PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);
410: PetscObjectCompose((PetscObject)C,"workB",(PetscObject)container);
411: PetscContainerDestroy(&container);
413: (*C->ops->matmultnumeric)(A,B,C);
414: return(0);
415: }
419: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
420: {
421: PetscErrorCode ierr;
422: Mat_MPIAIJ *aij = (Mat_MPIAIJ*) A->data;
423: PetscInt nz = aij->B->cmap->n;
424: PetscContainer container;
425: MPIAIJ_MPIDense *contents;
426: VecScatter ctx = aij->Mvctx;
427: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
428: VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
429: PetscInt m = A->rmap->n,n=B->cmap->n;
432: MatCreate(PetscObjectComm((PetscObject)B),C);
433: MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);
434: MatSetBlockSizesFromMats(*C,A,B);
435: MatSetType(*C,MATMPIDENSE);
436: MatMPIDenseSetPreallocation(*C,NULL);
437: MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);
438: MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);
440: (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
442: PetscNew(&contents);
443: /* Create work matrix used to store off processor rows of B needed for local product */
444: MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);
445: /* Create work arrays needed */
446: PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
447: B->cmap->N*to->starts[to->n],&contents->svalues,
448: from->n,&contents->rwaits,
449: to->n,&contents->swaits);
451: PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);
452: PetscContainerSetPointer(container,contents);
453: PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);
454: PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);
455: PetscContainerDestroy(&container);
456: return(0);
457: }
461: /*
462: Performs an efficient scatter on the rows of B needed by this process; this is
463: a modification of the VecScatterBegin_() routines.
464: */
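/*
   Both B and workB are stored by columns (PETSc dense format), so the pack/unpack loops
   below address entry (row,col) as array[row + nrows*col].
*/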
465: PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
466: {
467: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
468: PetscErrorCode ierr;
469: PetscScalar *b,*w,*svalues,*rvalues;
470: VecScatter ctx = aij->Mvctx;
471: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
472: VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
473: PetscInt i,j,k;
474: PetscInt *sindices,*sstarts,*rindices,*rstarts;
475: PetscMPIInt *sprocs,*rprocs,nrecvs;
476: MPI_Request *swaits,*rwaits;
477: MPI_Comm comm;
478: PetscMPIInt tag = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
479: MPI_Status status;
480: MPIAIJ_MPIDense *contents;
481: PetscContainer container;
482: Mat workB;
485: PetscObjectGetComm((PetscObject)A,&comm);
486: PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);
487: if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
488: PetscContainerGetPointer(container,(void**)&contents);
490: workB = *outworkB = contents->workB;
491: if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",workB->rmap->n,nrows);
492: sindices = to->indices;
493: sstarts = to->starts;
494: sprocs = to->procs;
495: swaits = contents->swaits;
496: svalues = contents->svalues;
498: rindices = from->indices;
499: rstarts = from->starts;
500: rprocs = from->procs;
501: rwaits = contents->rwaits;
502: rvalues = contents->rvalues;
504: MatDenseGetArray(B,&b);
505: MatDenseGetArray(workB,&w);
507: for (i=0; i<from->n; i++) {
508: MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
509: }
511: for (i=0; i<to->n; i++) {
512: /* pack a message at a time */
513: for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
514: for (k=0; k<ncols; k++) {
515: svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
516: }
517: }
518: MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
519: }
521: nrecvs = from->n;
522: while (nrecvs) {
523: MPI_Waitany(from->n,rwaits,&imdex,&status);
524: nrecvs--;
525: /* unpack a message at a time */
526: for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
527: for (k=0; k<ncols; k++) {
528: w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
529: }
530: }
531: }
532: if (to->n) {MPI_Waitall(to->n,swaits,to->sstatus);}
534: MatDenseRestoreArray(B,&b);
535: MatDenseRestoreArray(workB,&w);
536: MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);
537: MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);
538: return(0);
539: }
540: extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);
544: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
545: {
547: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
548: Mat_MPIDense *bdense = (Mat_MPIDense*)B->data;
549: Mat_MPIDense *cdense = (Mat_MPIDense*)C->data;
550: Mat workB;
553: /* diagonal block of A times all local rows of B*/
554: MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);
556: /* get off processor parts of B needed to complete the product */
557: MatMPIDenseScatter(A,B,C,&workB);
559: /* off-diagonal block of A times nonlocal rows of B */
560: MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);
561: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
562: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
563: return(0);
564: }
568: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
569: {
571: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
572: Mat_SeqAIJ *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
573: Mat_SeqAIJ *cd = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
574: PetscInt *adi = ad->i,*adj,*aoi=ao->i,*aoj;
575: PetscScalar *ada,*aoa,*cda=cd->a,*coa=co->a;
576: Mat_SeqAIJ *p_loc,*p_oth;
577: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
578: PetscScalar *pa_loc,*pa_oth,*pa,valtmp,*ca;
579: PetscInt cm = C->rmap->n,anz,pnz;
580: Mat_PtAPMPI *ptap = c->ptap;
581: PetscScalar *apa_sparse = ptap->apa;
582: PetscInt *api,*apj,*apJ,i,j,k,row;
583: PetscInt cstart = C->cmap->rstart;
584: PetscInt cdnz,conz,k0,k1,nextp;
585: MPI_Comm comm;
586: PetscMPIInt size;
589: PetscObjectGetComm((PetscObject)A,&comm);
590: MPI_Comm_size(comm,&size);
592: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
593: /*-----------------------------------------------------*/
594: /* update numerical values of P_oth and P_loc */
595: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
596: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
598: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
599: /*----------------------------------------------------------*/
600: /* get data from symbolic products */
601: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
602: pi_loc = p_loc->i; pj_loc = p_loc->j; pa_loc = p_loc->a;
603: if (size >1) {
604: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
605: pi_oth = p_oth->i; pj_oth = p_oth->j; pa_oth = p_oth->a;
606: } else {
607: p_oth = NULL; pi_oth = NULL; pj_oth = NULL; pa_oth = NULL;
608: }
610: api = ptap->api;
611: apj = ptap->apj;
612: for (i=0; i<cm; i++) {
613: apJ = apj + api[i];
615: /* diagonal portion of A */
616: anz = adi[i+1] - adi[i];
617: adj = ad->j + adi[i];
618: ada = ad->a + adi[i];
619: for (j=0; j<anz; j++) {
620: row = adj[j];
621: pnz = pi_loc[row+1] - pi_loc[row];
622: pj = pj_loc + pi_loc[row];
623: pa = pa_loc + pi_loc[row];
624: /* perform sparse axpy */
625: valtmp = ada[j];
626: nextp = 0;
627: for (k=0; nextp<pnz; k++) {
628: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
629: apa_sparse[k] += valtmp*pa[nextp++];
630: }
631: }
632: PetscLogFlops(2.0*pnz);
633: }
635: /* off-diagonal portion of A */
636: anz = aoi[i+1] - aoi[i];
637: aoj = ao->j + aoi[i];
638: aoa = ao->a + aoi[i];
639: for (j=0; j<anz; j++) {
640: row = aoj[j];
641: pnz = pi_oth[row+1] - pi_oth[row];
642: pj = pj_oth + pi_oth[row];
643: pa = pa_oth + pi_oth[row];
644: /* perform sparse axpy */
645: valtmp = aoa[j];
646: nextp = 0;
647: for (k=0; nextp<pnz; k++) {
648: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
649: apa_sparse[k] += valtmp*pa[nextp++];
650: }
651: }
652: PetscLogFlops(2.0*pnz);
653: }
655: /* set values in C */
656: cdnz = cd->i[i+1] - cd->i[i];
657: conz = co->i[i+1] - co->i[i];
659: /* 1st off-diagonal part of C */
660: ca = coa + co->i[i];
661: k = 0;
662: for (k0=0; k0<conz; k0++) {
663: if (apJ[k] >= cstart) break;
664: ca[k0] = apa_sparse[k];
665: apa_sparse[k] = 0.0;
666: k++;
667: }
669: /* diagonal part of C */
670: ca = cda + cd->i[i];
671: for (k1=0; k1<cdnz; k1++) {
672: ca[k1] = apa_sparse[k];
673: apa_sparse[k] = 0.0;
674: k++;
675: }
677: /* 2nd off-diagonal part of C */
678: ca = coa + co->i[i];
679: for (; k0<conz; k0++) {
680: ca[k0] = apa_sparse[k];
681: apa_sparse[k] = 0.0;
682: k++;
683: }
684: }
685: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
686: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
687: return(0);
688: }
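/*
   MatMatMultNumeric_MPIAIJ_MPIAIJ() above accumulates each row of A*P in the sparse work
   array apa_sparse (length apnz_max, allocated in MatMatMultSymbolic_MPIAIJ_MPIAIJ() below),
   merging against the precomputed column pattern apj instead of indexing a dense row of
   length P->cmap->N as the nonscalable kernel does.
*/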
690: /* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except that it uses a scalable condensed linked list to avoid the O(B->cmap->N) memory requirement */
693: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
694: {
695: PetscErrorCode ierr;
696: MPI_Comm comm;
697: PetscMPIInt size;
698: Mat Cmpi;
699: Mat_PtAPMPI *ptap;
700: PetscFreeSpaceList free_space = NULL,current_space=NULL;
701: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data,*c;
702: Mat_SeqAIJ *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
703: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
704: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
705: PetscInt i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max;
706: PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
707: PetscReal afill;
708: PetscScalar *apa;
709: PetscTable ta;
712: PetscObjectGetComm((PetscObject)A,&comm);
713: MPI_Comm_size(comm,&size);
715: /* create struct Mat_PtAPMPI and attach it to C later */
716: PetscNew(&ptap);
718: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
719: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
720:
721: /* get P_loc by taking all local rows of P */
722: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
724: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
725: pi_loc = p_loc->i; pj_loc = p_loc->j;
726: if (size > 1) {
727: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
728: pi_oth = p_oth->i; pj_oth = p_oth->j;
729: } else {
730: p_oth = NULL;
731: pi_oth = NULL; pj_oth = NULL;
732: }
734: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
735: /*-------------------------------------------------------------------*/
736: PetscMalloc1(am+2,&api);
737: ptap->api = api;
738: api[0] = 0;
740: /* create and initialize a linked list */
741: apnz_max = 6*(p_loc->rmax + (PetscInt)(1.e-2*pN)); /* expected apnz_max */
742: if (apnz_max > pN) apnz_max = pN;
743: PetscTableCreate(apnz_max,pN,&ta);
745: /* Calculate apnz_max */
746: apnz_max = 0;
747: for (i=0; i<am; i++) {
748: PetscTableRemoveAll(ta);
749: /* diagonal portion of A */
750: nzi = adi[i+1] - adi[i];
751: Jptr = adj+adi[i]; /* cols of A_diag */
752: MatMergeRows_SeqAIJ(p_loc,nzi,Jptr,ta);
753: PetscTableGetCount(ta,&apnz);
754: if (apnz_max < apnz) apnz_max = apnz;
756: /* off-diagonal portion of A */
757: nzi = aoi[i+1] - aoi[i];
758: Jptr = aoj+aoi[i]; /* cols of A_off */
759: MatMergeRows_SeqAIJ(p_oth,nzi,Jptr,ta);
760: PetscTableGetCount(ta,&apnz);
761: if (apnz_max < apnz) apnz_max = apnz;
762: }
763: PetscTableDestroy(&ta);
764:
765: PetscLLCondensedCreate_Scalable(apnz_max,&lnk);
767: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
768: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);
769: current_space = free_space;
770: MatPreallocateInitialize(comm,am,pn,dnz,onz);
771: for (i=0; i<am; i++) {
772: /* diagonal portion of A */
773: nzi = adi[i+1] - adi[i];
774: for (j=0; j<nzi; j++) {
775: row = *adj++;
776: pnz = pi_loc[row+1] - pi_loc[row];
777: Jptr = pj_loc + pi_loc[row];
778: /* add non-zero cols of P into the sorted linked list lnk */
779: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
780: }
781: /* off-diagonal portion of A */
782: nzi = aoi[i+1] - aoi[i];
783: for (j=0; j<nzi; j++) {
784: row = *aoj++;
785: pnz = pi_oth[row+1] - pi_oth[row];
786: Jptr = pj_oth + pi_oth[row];
787: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
788: }
790: apnz = *lnk;
791: api[i+1] = api[i] + apnz;
793: /* if free space is not available, double the total space in the list */
794: if (current_space->local_remaining<apnz) {
795: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
796: nspacedouble++;
797: }
799: /* Copy data into free space, then initialize lnk */
800: PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
801: MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);
803: current_space->array += apnz;
804: current_space->local_used += apnz;
805: current_space->local_remaining -= apnz;
806: }
808: /* Allocate space for apj, initialize apj, and */
809: /* destroy list of free space and other temporary array(s) */
810: PetscMalloc1(api[am]+1,&ptap->apj);
811: apj = ptap->apj;
812: PetscFreeSpaceContiguous(&free_space,ptap->apj);
813: PetscLLCondensedDestroy_Scalable(lnk);
815: /* create and assemble symbolic parallel matrix Cmpi */
816: /*----------------------------------------------------*/
817: MatCreate(comm,&Cmpi);
818: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
819: MatSetBlockSizesFromMats(Cmpi,A,P);
820: MatSetType(Cmpi,MATMPIAIJ);
821: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
822: MatPreallocateFinalize(dnz,onz);
824: /* malloc apa for assembling Cmpi */
825: PetscCalloc1(apnz_max,&apa);
827: ptap->apa = apa;
828: for (i=0; i<am; i++) {
829: row = i + rstart;
830: apnz = api[i+1] - api[i];
831: MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);
832: apj += apnz;
833: }
834: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
835: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
837: ptap->destroy = Cmpi->ops->destroy;
838: ptap->duplicate = Cmpi->ops->duplicate;
839: Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
840: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
841: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
843: /* attach the supporting struct to Cmpi for reuse */
844: c = (Mat_MPIAIJ*)Cmpi->data;
845: c->ptap = ptap;
847: *C = Cmpi;
849: /* set MatInfo */
850: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
851: if (afill < 1.0) afill = 1.0;
852: Cmpi->info.mallocs = nspacedouble;
853: Cmpi->info.fill_ratio_given = fill;
854: Cmpi->info.fill_ratio_needed = afill;
856: #if defined(PETSC_USE_INFO)
857: if (api[am]) {
858: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
859: PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
860: } else {
861: PetscInfo(Cmpi,"Empty matrix product\n");
862: }
863: #endif
864: return(0);
865: }
867: /*-------------------------------------------------------------------------*/
870: PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
871: {
873: const char *algTypes[3] = {"scalable","nonscalable","matmatmult"};
874: PetscInt alg=0; /* set default algorithm */
877: if (scall == MAT_INITIAL_MATRIX) {
878: PetscObjectOptionsBegin((PetscObject)A);
879: PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);
880: PetscOptionsEnd();
882: PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);
883: switch (alg) {
884: case 1:
885: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);
886: break;
887: case 2:
888: {
889: Mat Pt;
890: Mat_PtAPMPI *ptap;
891: Mat_MPIAIJ *c;
892: MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);
893: MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);
894: c = (Mat_MPIAIJ*)(*C)->data;
895: ptap = c->ptap;
896: ptap->Pt = Pt;
897: (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
898: return(0);
899: }
900: break;
901: default:
902: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);
903: break;
904: }
905: PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);
906: }
907: PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);
908: (*(*C)->ops->mattransposemultnumeric)(P,A,*C);
909: PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);
910: return(0);
911: }
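/*
   Algorithm selection for C = P^T*A can be made at runtime, e.g.

     -mattransposematmult_via scalable      (default)
     -mattransposematmult_via nonscalable   (dense axpy in the numeric phase)
     -mattransposematmult_via matmatmult    (explicitly forms Pt = P^T and calls MatMatMult)
*/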
913: /* This routine only works when scall=MAT_REUSE_MATRIX! */
916: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
917: {
919: Mat_MPIAIJ *c=(Mat_MPIAIJ*)C->data;
920: Mat_PtAPMPI *ptap= c->ptap;
921: Mat Pt=ptap->Pt;
924: MatTranspose(P,MAT_REUSE_MATRIX,&Pt);
925: MatMatMultNumeric(Pt,A,C);
926: return(0);
927: }
929: /* Non-scalable version, uses dense axpy */
932: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
933: {
934: PetscErrorCode ierr;
935: Mat_Merge_SeqsToMPI *merge;
936: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
937: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
938: Mat_PtAPMPI *ptap;
939: PetscInt *adj,*aJ;
940: PetscInt i,j,k,anz,pnz,row,*cj;
941: MatScalar *ada,*aval,*ca,valtmp;
942: PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
943: MPI_Comm comm;
944: PetscMPIInt size,rank,taga,*len_s;
945: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
946: PetscInt **buf_ri,**buf_rj;
947: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
948: MPI_Request *s_waits,*r_waits;
949: MPI_Status *status;
950: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
951: PetscInt *ai,*aj,*coi,*coj;
952: PetscInt *poJ,*pdJ;
953: Mat A_loc;
954: Mat_SeqAIJ *a_loc;
957: PetscObjectGetComm((PetscObject)C,&comm);
958: MPI_Comm_size(comm,&size);
959: MPI_Comm_rank(comm,&rank);
961: ptap = c->ptap;
962: merge = ptap->merge;
964: /* 2) compute numeric C_seq = P_loc^T*A_loc - dominating part */
965: /*--------------------------------------------------------------*/
966: /* get data from symbolic products */
967: coi = merge->coi; coj = merge->coj;
968: PetscCalloc1(coi[pon]+1,&coa);
970: bi = merge->bi; bj = merge->bj;
971: owners = merge->rowmap->range;
972: PetscCalloc1(bi[cm]+1,&ba);
974: /* get A_loc by taking all local rows of A */
975: A_loc = ptap->A_loc;
976: MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
977: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
978: ai = a_loc->i;
979: aj = a_loc->j;
981: PetscCalloc1(A->cmap->N,&aval); /* non-scalable!!! */
983: for (i=0; i<am; i++) {
984: /* 2-a) put A[i,:] to dense array aval */
985: anz = ai[i+1] - ai[i];
986: adj = aj + ai[i];
987: ada = a_loc->a + ai[i];
988: for (j=0; j<anz; j++) {
989: aval[adj[j]] = ada[j];
990: }
992: /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
993: /*--------------------------------------------------------------*/
994: /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
995: pnz = po->i[i+1] - po->i[i];
996: poJ = po->j + po->i[i];
997: pA = po->a + po->i[i];
998: for (j=0; j<pnz; j++) {
999: row = poJ[j];
1000: cnz = coi[row+1] - coi[row];
1001: cj = coj + coi[row];
1002: ca = coa + coi[row];
1003: /* perform dense axpy */
1004: valtmp = pA[j];
1005: for (k=0; k<cnz; k++) {
1006: ca[k] += valtmp*aval[cj[k]];
1007: }
1008: PetscLogFlops(2.0*cnz);
1009: }
1011: /* put the value into Cd (diagonal part) */
1012: pnz = pd->i[i+1] - pd->i[i];
1013: pdJ = pd->j + pd->i[i];
1014: pA = pd->a + pd->i[i];
1015: for (j=0; j<pnz; j++) {
1016: row = pdJ[j];
1017: cnz = bi[row+1] - bi[row];
1018: cj = bj + bi[row];
1019: ca = ba + bi[row];
1020: /* perform dense axpy */
1021: valtmp = pA[j];
1022: for (k=0; k<cnz; k++) {
1023: ca[k] += valtmp*aval[cj[k]];
1024: }
1025: PetscLogFlops(2.0*cnz);
1026: }
1028: /* reset the dense scratch array aval for the next row of A */
1029: aJ = aj + ai[i];
1030: for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
1031: }
1033: /* 3) send and recv matrix values coa */
1034: /*------------------------------------*/
1035: buf_ri = merge->buf_ri;
1036: buf_rj = merge->buf_rj;
1037: len_s = merge->len_s;
1038: PetscCommGetNewTag(comm,&taga);
1039: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
1041: PetscMalloc2(merge->nsend+1,&s_waits,size,&status);
1042: for (proc=0,k=0; proc<size; proc++) {
1043: if (!len_s[proc]) continue;
1044: i = merge->owners_co[proc];
1045: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1046: k++;
1047: }
1048: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1049: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
1051: PetscFree2(s_waits,status);
1052: PetscFree(r_waits);
1053: PetscFree(coa);
1055: /* 4) insert local Cseq and received values into Cmpi */
1056: /*----------------------------------------------------*/
1057: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1058: for (k=0; k<merge->nrecv; k++) {
1059: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1060: nrows = *(buf_ri_k[k]);
1061: nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */
1062: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1063: }
1065: for (i=0; i<cm; i++) {
1066: row = owners[rank] + i; /* global row index of C_seq */
1067: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
1068: ba_i = ba + bi[i];
1069: bnz = bi[i+1] - bi[i];
1070: /* add received vals into ba */
1071: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1072: /* i-th row */
1073: if (i == *nextrow[k]) {
1074: cnz = *(nextci[k]+1) - *nextci[k];
1075: cj = buf_rj[k] + *(nextci[k]);
1076: ca = abuf_r[k] + *(nextci[k]);
1077: nextcj = 0;
1078: for (j=0; nextcj<cnz; j++) {
1079: if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1080: ba_i[j] += ca[nextcj++];
1081: }
1082: }
1083: nextrow[k]++; nextci[k]++;
1084: PetscLogFlops(2.0*cnz);
1085: }
1086: }
1087: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1088: }
1089: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1090: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1092: PetscFree(ba);
1093: PetscFree(abuf_r[0]);
1094: PetscFree(abuf_r);
1095: PetscFree3(buf_ri_k,nextrow,nextci);
1096: PetscFree(aval);
1097: return(0);
1098: }
1100: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1101: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1104: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
1105: {
1106: PetscErrorCode ierr;
1107: Mat Cmpi,A_loc,POt,PDt;
1108: Mat_PtAPMPI *ptap;
1109: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1110: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c;
1111: PetscInt *pdti,*pdtj,*poti,*potj,*ptJ;
1112: PetscInt nnz;
1113: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1114: PetscInt am=A->rmap->n,pn=P->cmap->n;
1115: PetscBT lnkbt;
1116: MPI_Comm comm;
1117: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1118: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1119: PetscInt len,proc,*dnz,*onz,*owners;
1120: PetscInt nzi,*bi,*bj;
1121: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1122: MPI_Request *swaits,*rwaits;
1123: MPI_Status *sstatus,rstatus;
1124: Mat_Merge_SeqsToMPI *merge;
1125: PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1126: PetscReal afill =1.0,afill_tmp;
1127: PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N;
1128: PetscScalar *vals;
1129: Mat_SeqAIJ *a_loc, *pdt,*pot;
1132: PetscObjectGetComm((PetscObject)A,&comm);
1133: /* check if matrix local sizes are compatible */
1134: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1136: MPI_Comm_size(comm,&size);
1137: MPI_Comm_rank(comm,&rank);
1139: /* create struct Mat_PtAPMPI and attach it to C later */
1140: PetscNew(&ptap);
1142: /* get A_loc by taking all local rows of A */
1143: MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);
1145: ptap->A_loc = A_loc;
1147: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1148: ai = a_loc->i;
1149: aj = a_loc->j;
1151: /* determine symbolic Co=(p->B)^T*A - send to others */
1152: /*----------------------------------------------------*/
1153: MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1154: pdt = (Mat_SeqAIJ*)PDt->data;
1155: pdti = pdt->i; pdtj = pdt->j;
1157: MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1158: pot = (Mat_SeqAIJ*)POt->data;
1159: poti = pot->i; potj = pot->j;
1161: /* then, compute symbolic Co = (p->B)^T*A */
1162: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
1163: PetscMalloc1(pon+1,&coi);
1164: coi[0] = 0;
1166: /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1167: nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
1168: PetscFreeSpaceGet(nnz,&free_space);
1169: current_space = free_space;
1171: /* create and initialize a linked list */
1172: PetscLLCondensedCreate(aN,aN,&lnk,&lnkbt);
1174: for (i=0; i<pon; i++) {
1175: pnz = poti[i+1] - poti[i];
1176: ptJ = potj + poti[i];
1177: for (j=0; j<pnz; j++) {
1178: row = ptJ[j]; /* row of A_loc == col of Pot */
1179: anz = ai[row+1] - ai[row];
1180: Jptr = aj + ai[row];
1181: /* add non-zero cols of AP into the sorted linked list lnk */
1182: PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);
1183: }
1184: nnz = lnk[0];
1186: /* If free space is not available, double the total space in the list */
1187: if (current_space->local_remaining<nnz) {
1188: PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1189: nspacedouble++;
1190: }
1192: /* Copy data into free space, and zero out denserows */
1193: PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);
1195: current_space->array += nnz;
1196: current_space->local_used += nnz;
1197: current_space->local_remaining -= nnz;
1199: coi[i+1] = coi[i] + nnz;
1200: }
1202: PetscMalloc1(coi[pon]+1,&coj);
1203: PetscFreeSpaceContiguous(&free_space,coj);
1205: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1206: if (afill_tmp > afill) afill = afill_tmp;
1208: /* send j-array (coj) of Co to other processors */
1209: /*----------------------------------------------*/
1210: /* determine row ownership */
1211: PetscNew(&merge);
1212: PetscLayoutCreate(comm,&merge->rowmap);
1214: merge->rowmap->n = pn;
1215: merge->rowmap->bs = 1;
1217: PetscLayoutSetUp(merge->rowmap);
1218: owners = merge->rowmap->range;
1220: /* determine the number of messages to send, their lengths */
1221: PetscCalloc1(size,&len_si);
1222: PetscMalloc1(size,&merge->len_s);
1224: len_s = merge->len_s;
1225: merge->nsend = 0;
1227: PetscMalloc1(size+2,&owners_co);
1228: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
1230: proc = 0;
1231: for (i=0; i<pon; i++) {
1232: while (prmap[i] >= owners[proc+1]) proc++;
1233: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
1234: len_s[proc] += coi[i+1] - coi[i];
1235: }
1237: len = 0; /* total length of buf_s[] */
1238: owners_co[0] = 0;
1239: for (proc=0; proc<size; proc++) {
1240: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1241: if (len_si[proc]) {
1242: merge->nsend++;
1243: len_si[proc] = 2*(len_si[proc] + 1);
1244: len += len_si[proc];
1245: }
1246: }
1248: /* determine the number and length of messages to receive for coi and coj */
1249: PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1250: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
1252: /* post the Irecv and Isend of coj */
1253: PetscCommGetNewTag(comm,&tagj);
1254: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1255: PetscMalloc1(merge->nsend+1,&swaits);
1256: for (proc=0, k=0; proc<size; proc++) {
1257: if (!len_s[proc]) continue;
1258: i = owners_co[proc];
1259: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1260: k++;
1261: }
1263: /* receives and sends of coj are complete */
1264: PetscMalloc1(size,&sstatus);
1265: for (i=0; i<merge->nrecv; i++) {
1266: PetscMPIInt icompleted;
1267: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1268: }
1269: PetscFree(rwaits);
1270: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1272: /* send and recv coi */
1273: /*-------------------*/
1274: PetscCommGetNewTag(comm,&tagi);
1275: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1276: PetscMalloc1(len+1,&buf_s);
1277: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1278: for (proc=0,k=0; proc<size; proc++) {
1279: if (!len_s[proc]) continue;
1280: /* form outgoing message for i-structure:
1281: buf_si[0]: nrows to be sent
1282: [1:nrows]: row index (global)
1283: [nrows+1:2*nrows+1]: i-structure index
1284: */
1285: /*-------------------------------------------*/
1286: nrows = len_si[proc]/2 - 1;
1287: buf_si_i = buf_si + nrows+1;
1288: buf_si[0] = nrows;
1289: buf_si_i[0] = 0;
1290: nrows = 0;
1291: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1292: nzi = coi[i+1] - coi[i];
1293: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1294: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1295: nrows++;
1296: }
1297: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1298: k++;
1299: buf_si += len_si[proc];
1300: }
1301: i = merge->nrecv;
1302: while (i--) {
1303: PetscMPIInt icompleted;
1304: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1305: }
1306: PetscFree(rwaits);
1307: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1308: PetscFree(len_si);
1309: PetscFree(len_ri);
1310: PetscFree(swaits);
1311: PetscFree(sstatus);
1312: PetscFree(buf_s);
1314: /* compute the local portion of C (mpi mat) */
1315: /*------------------------------------------*/
1316: /* allocate bi array and free space for accumulating nonzero column info */
1317: PetscMalloc1(pn+1,&bi);
1318: bi[0] = 0;
1320: /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1321: nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
1322: PetscFreeSpaceGet(nnz,&free_space);
1323: current_space = free_space;
1325: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1326: for (k=0; k<merge->nrecv; k++) {
1327: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1328: nrows = *buf_ri_k[k];
1329: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1330: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1331: }
1333: MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
1334: rmax = 0;
1335: for (i=0; i<pn; i++) {
1336: /* add pdt[i,:]*AP into lnk */
1337: pnz = pdti[i+1] - pdti[i];
1338: ptJ = pdtj + pdti[i];
1339: for (j=0; j<pnz; j++) {
1340: row = ptJ[j]; /* row of AP == col of Pt */
1341: anz = ai[row+1] - ai[row];
1342: Jptr = aj + ai[row];
1343: /* add non-zero cols of AP into the sorted linked list lnk */
1344: PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);
1345: }
1347: /* add received col data into lnk */
1348: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1349: if (i == *nextrow[k]) { /* i-th row */
1350: nzi = *(nextci[k]+1) - *nextci[k];
1351: Jptr = buf_rj[k] + *nextci[k];
1352: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1353: nextrow[k]++; nextci[k]++;
1354: }
1355: }
1356: nnz = lnk[0];
1358: /* if free space is not available, make more free space */
1359: if (current_space->local_remaining<nnz) {
1360: PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1361: nspacedouble++;
1362: }
1363: /* copy data into free space, then initialize lnk */
1364: PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);
1365: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
1367: current_space->array += nnz;
1368: current_space->local_used += nnz;
1369: current_space->local_remaining -= nnz;
1371: bi[i+1] = bi[i] + nnz;
1372: if (nnz > rmax) rmax = nnz;
1373: }
1374: PetscFree3(buf_ri_k,nextrow,nextci);
1376: PetscMalloc1(bi[pn]+1,&bj);
1377: PetscFreeSpaceContiguous(&free_space,bj);
1379: afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1380: if (afill_tmp > afill) afill = afill_tmp;
1381: PetscLLCondensedDestroy(lnk,lnkbt);
1382: MatDestroy(&POt);
1383: MatDestroy(&PDt);
1385: /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part */
1386: /*----------------------------------------------------------------------------------*/
1387: PetscCalloc1(rmax+1,&vals);
1389: MatCreate(comm,&Cmpi);
1390: MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1391: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));
1392: MatSetType(Cmpi,MATMPIAIJ);
1393: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1394: MatPreallocateFinalize(dnz,onz);
1395: MatSetBlockSize(Cmpi,1);
1396: for (i=0; i<pn; i++) {
1397: row = i + rstart;
1398: nnz = bi[i+1] - bi[i];
1399: Jptr = bj + bi[i];
1400: MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);
1401: }
1402: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1403: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1404: PetscFree(vals);
1406: merge->bi = bi;
1407: merge->bj = bj;
1408: merge->coi = coi;
1409: merge->coj = coj;
1410: merge->buf_ri = buf_ri;
1411: merge->buf_rj = buf_rj;
1412: merge->owners_co = owners_co;
1414: /* attach the supporting struct to Cmpi for reuse */
1415: c = (Mat_MPIAIJ*)Cmpi->data;
1416: c->ptap = ptap;
1417: ptap->api = NULL;
1418: ptap->apj = NULL;
1419: ptap->merge = merge;
1420: ptap->destroy = Cmpi->ops->destroy;
1421: ptap->duplicate = Cmpi->ops->duplicate;
1423: Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1424: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1425: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
1427: *C = Cmpi;
1428: #if defined(PETSC_USE_INFO)
1429: if (bi[pn] != 0) {
1430: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
1431: PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
1432: } else {
1433: PetscInfo(Cmpi,"Empty matrix product\n");
1434: }
1435: #endif
1436: return(0);
1437: }
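/*
   Summary of the symbolic phase above: the off-diagonal contribution Co = (p->B)^T*A is
   computed row by row, its row structure (coi,coj) is sent to the owning processes, and the
   received pieces are merged with the local Pd^T*A pattern into (bi,bj); everything needed
   to repeat the communication in the numeric phase is kept in the Mat_Merge_SeqsToMPI struct
   attached to C through c->ptap->merge.
*/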
1441: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
1442: {
1443: PetscErrorCode ierr;
1444: Mat_Merge_SeqsToMPI *merge;
1445: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1446: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1447: Mat_PtAPMPI *ptap;
1448: PetscInt *adj;
1449: PetscInt i,j,k,anz,pnz,row,*cj,nexta;
1450: MatScalar *ada,*ca,valtmp;
1451: PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1452: MPI_Comm comm;
1453: PetscMPIInt size,rank,taga,*len_s;
1454: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1455: PetscInt **buf_ri,**buf_rj;
1456: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
1457: MPI_Request *s_waits,*r_waits;
1458: MPI_Status *status;
1459: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
1460: PetscInt *ai,*aj,*coi,*coj;
1461: PetscInt *poJ,*pdJ;
1462: Mat A_loc;
1463: Mat_SeqAIJ *a_loc;
1466: PetscObjectGetComm((PetscObject)C,&comm);
1467: MPI_Comm_size(comm,&size);
1468: MPI_Comm_rank(comm,&rank);
1470: ptap = c->ptap;
1471: merge = ptap->merge;
1473: /* 2) compute numeric C_seq = P_loc^T*A_loc */
1474: /*------------------------------------------*/
1475: /* get data from symbolic products */
1476: coi = merge->coi; coj = merge->coj;
1477: PetscCalloc1(coi[pon]+1,&coa);
1478: bi = merge->bi; bj = merge->bj;
1479: owners = merge->rowmap->range;
1480: PetscCalloc1(bi[cm]+1,&ba);
1482: /* get A_loc by taking all local rows of A */
1483: A_loc = ptap->A_loc;
1484: MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
1485: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1486: ai = a_loc->i;
1487: aj = a_loc->j;
1489: for (i=0; i<am; i++) {
1490: anz = ai[i+1] - ai[i];
1491: adj = aj + ai[i];
1492: ada = a_loc->a + ai[i];
1494: /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1495: /*-------------------------------------------------------------*/
1496: /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1497: pnz = po->i[i+1] - po->i[i];
1498: poJ = po->j + po->i[i];
1499: pA = po->a + po->i[i];
1500: for (j=0; j<pnz; j++) {
1501: row = poJ[j];
1502: cj = coj + coi[row];
1503: ca = coa + coi[row];
1504: /* perform sparse axpy */
1505: nexta = 0;
1506: valtmp = pA[j];
1507: for (k=0; nexta<anz; k++) {
1508: if (cj[k] == adj[nexta]) {
1509: ca[k] += valtmp*ada[nexta];
1510: nexta++;
1511: }
1512: }
1513: PetscLogFlops(2.0*anz);
1514: }
1516: /* put the value into Cd (diagonal part) */
1517: pnz = pd->i[i+1] - pd->i[i];
1518: pdJ = pd->j + pd->i[i];
1519: pA = pd->a + pd->i[i];
1520: for (j=0; j<pnz; j++) {
1521: row = pdJ[j];
1522: cj = bj + bi[row];
1523: ca = ba + bi[row];
1524: /* perform sparse axpy */
1525: nexta = 0;
1526: valtmp = pA[j];
1527: for (k=0; nexta<anz; k++) {
1528: if (cj[k] == adj[nexta]) {
1529: ca[k] += valtmp*ada[nexta];
1530: nexta++;
1531: }
1532: }
1533: PetscLogFlops(2.0*anz);
1534: }
1535: }
1537: /* 3) send and recv matrix values coa */
1538: /*------------------------------------*/
1539: buf_ri = merge->buf_ri;
1540: buf_rj = merge->buf_rj;
1541: len_s = merge->len_s;
1542: PetscCommGetNewTag(comm,&taga);
1543: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
1545: PetscMalloc2(merge->nsend+1,&s_waits,size,&status);
1546: for (proc=0,k=0; proc<size; proc++) {
1547: if (!len_s[proc]) continue;
1548: i = merge->owners_co[proc];
1549: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1550: k++;
1551: }
1552: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1553: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
1555: PetscFree2(s_waits,status);
1556: PetscFree(r_waits);
1557: PetscFree(coa);
1559: /* 4) insert local Cseq and received values into Cmpi */
1560: /*----------------------------------------------------*/
1561: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1562: for (k=0; k<merge->nrecv; k++) {
1563: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1564: nrows = *(buf_ri_k[k]);
1565: nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */
1566: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1567: }
1569: for (i=0; i<cm; i++) {
1570: row = owners[rank] + i; /* global row index of C_seq */
1571: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
1572: ba_i = ba + bi[i];
1573: bnz = bi[i+1] - bi[i];
1574: /* add received vals into ba */
1575: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1576: /* i-th row */
1577: if (i == *nextrow[k]) {
1578: cnz = *(nextci[k]+1) - *nextci[k];
1579: cj = buf_rj[k] + *(nextci[k]);
1580: ca = abuf_r[k] + *(nextci[k]);
1581: nextcj = 0;
1582: for (j=0; nextcj<cnz; j++) {
1583: if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1584: ba_i[j] += ca[nextcj++];
1585: }
1586: }
1587: nextrow[k]++; nextci[k]++;
1588: PetscLogFlops(2.0*cnz);
1589: }
1590: }
1591: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1592: }
1593: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1594: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1596: PetscFree(ba);
1597: PetscFree(abuf_r[0]);
1598: PetscFree(abuf_r);
1599: PetscFree3(buf_ri_k,nextrow,nextci);
1600: return(0);
1601: }
1603: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1604: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ();
1605: it differs from MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable() in using PetscLLCondensedCreate_Scalable() */
1608: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1609: {
1610: PetscErrorCode ierr;
1611: Mat Cmpi,A_loc,POt,PDt;
1612: Mat_PtAPMPI *ptap;
1613: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1614: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c;
1615: PetscInt *pdti,*pdtj,*poti,*potj,*ptJ;
1616: PetscInt nnz;
1617: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1618: PetscInt am =A->rmap->n,pn=P->cmap->n;
1619: MPI_Comm comm;
1620: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1621: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1622: PetscInt len,proc,*dnz,*onz,*owners;
1623: PetscInt nzi,*bi,*bj;
1624: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1625: MPI_Request *swaits,*rwaits;
1626: MPI_Status *sstatus,rstatus;
1627: Mat_Merge_SeqsToMPI *merge;
1628: PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1629: PetscReal afill =1.0,afill_tmp;
1630: PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Armax;
1631: PetscScalar *vals;
1632: Mat_SeqAIJ *a_loc,*pdt,*pot;
1633: PetscTable ta;
1636: PetscObjectGetComm((PetscObject)A,&comm);
1637: /* check if matrix local sizes are compatible */
1638: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1640: MPI_Comm_size(comm,&size);
1641: MPI_Comm_rank(comm,&rank);
1643: /* create struct Mat_PtAPMPI and attach it to C later */
1644: PetscNew(&ptap);
1646: /* get A_loc by taking all local rows of A */
1647: MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);
1649: ptap->A_loc = A_loc;
1650: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1651: ai = a_loc->i;
1652: aj = a_loc->j;
1654: /* determine symbolic Co=(p->B)^T*A - send to others */
1655: /*----------------------------------------------------*/
1656: MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1657: pdt = (Mat_SeqAIJ*)PDt->data;
1658: pdti = pdt->i; pdtj = pdt->j;
1660: MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1661: pot = (Mat_SeqAIJ*)POt->data;
1662: poti = pot->i; potj = pot->j;
1664: /* then, compute symbolic Co = (p->B)^T*A */
1665: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1666: >= (num of nonzero rows of C_seq) - pn */
1667: PetscMalloc1(pon+1,&coi);
1668: coi[0] = 0;
1670: /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1671: nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
1672: PetscFreeSpaceGet(nnz,&free_space);
1673: current_space = free_space;
1675: /* create and initialize a linked list */
1676: PetscTableCreate(aN,aN,&ta);
1677: MatRowMergeMax_SeqAIJ(a_loc,am,ta);
1678: PetscTableGetCount(ta,&Armax);
1679: PetscLLCondensedCreate_Scalable(Armax,&lnk);
1681: for (i=0; i<pon; i++) {
1682: pnz = poti[i+1] - poti[i];
1683: ptJ = potj + poti[i];
1684: for (j=0; j<pnz; j++) {
1685: row = ptJ[j]; /* row of A_loc == col of Pot */
1686: anz = ai[row+1] - ai[row];
1687: Jptr = aj + ai[row];
1688: /* add non-zero cols of AP into the sorted linked list lnk */
1689: PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1690: }
1691: nnz = lnk[0];
1693: /* If free space is not available, double the total space in the list */
1694: if (current_space->local_remaining<nnz) {
1695: PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1696: nspacedouble++;
1697: }
1699: /* Copy data into free space, then reinitialize lnk */
1700: PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
1702: current_space->array += nnz;
1703: current_space->local_used += nnz;
1704: current_space->local_remaining -= nnz;
1706: coi[i+1] = coi[i] + nnz;
1707: }
1709: PetscMalloc1(coi[pon]+1,&coj);
1710: PetscFreeSpaceContiguous(&free_space,coj);
1711: PetscLLCondensedDestroy_Scalable(lnk); /* must destroy to get a new one for C */
1713: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1714: if (afill_tmp > afill) afill = afill_tmp;
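/* Co is now stored in CSR form: the column indices of its i-th row (the local contribution
   to global row prmap[i] of C) are coj[coi[i]] .. coj[coi[i+1]-1]; afill tracks the largest
   fill ratio actually needed and is reported below when PETSC_USE_INFO is set */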
1716: /* send j-array (coj) of Co to other processors */
1717: /*----------------------------------------------*/
1718: /* determine row ownership */
1719: PetscNew(&merge);
1720: PetscLayoutCreate(comm,&merge->rowmap);
1722: merge->rowmap->n = pn;
1723: merge->rowmap->bs = 1;
1725: PetscLayoutSetUp(merge->rowmap);
1726: owners = merge->rowmap->range;
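/* merge->rowmap is the row layout of C: pn = P->cmap->n local rows per process, i.e. the
   rows of C are distributed like the columns of P; owners[] gives the ownership ranges used
   to decide which process each row of Co must be sent to */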
1728: /* determine the number of messages to send, their lengths */
1729: PetscCalloc1(size,&len_si);
1730: PetscMalloc1(size,&merge->len_s);
1732: len_s = merge->len_s;
1733: merge->nsend = 0;
1735: PetscMalloc1(size+2,&owners_co);
1736: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
1738: proc = 0;
1739: for (i=0; i<pon; i++) {
1740: while (prmap[i] >= owners[proc+1]) proc++;
1741: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
1742: len_s[proc] += coi[i+1] - coi[i];
1743: }
1745: len = 0; /* max length of buf_si[] */
1746: owners_co[0] = 0;
1747: for (proc=0; proc<size; proc++) {
1748: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1749: if (len_si[proc]) {
1750: merge->nsend++;
1751: len_si[proc] = 2*(len_si[proc] + 1);
1752: len += len_si[proc];
1753: }
1754: }
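/* each destination proc receives two messages: len_s[proc] column indices (a slice of coj)
   and an i-structure of length len_si[proc] = 2*(nrows+1) laid out as
   { nrows, row_0, ..., row_{nrows-1}, offset_0, ..., offset_nrows };
   e.g. two rows with 3 and 5 nonzeros give { 2, row_0, row_1, 0, 3, 8 } */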
1756: /* determine the number and length of messages to receive for coi and coj */
1757: PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1758: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
1760: /* post the Irecv and Isend of coj */
1761: PetscCommGetNewTag(comm,&tagj);
1762: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1763: PetscMalloc1(merge->nsend+1,&swaits);
1764: for (proc=0, k=0; proc<size; proc++) {
1765: if (!len_s[proc]) continue;
1766: i = owners_co[proc];
1767: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1768: k++;
1769: }
1771: /* receives and sends of coj are complete */
1772: PetscMalloc1(size,&sstatus);
1773: for (i=0; i<merge->nrecv; i++) {
1774: PetscMPIInt icompleted;
1775: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1776: }
1777: PetscFree(rwaits);
1778: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1780: /* add received column indices into table to update Armax */
1781: for (k=0; k<merge->nrecv; k++) {/* k-th received message */
1782: Jptr = buf_rj[k];
1783: for (j=0; j<merge->len_r[k]; j++) {
1784: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
1785: }
1786: }
1787: PetscTableGetCount(ta,&Armax);
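/* the received column indices are added to ta so that Armax also bounds the merged rows of
   the locally owned part of C (local contribution plus received contributions); this bound
   sizes the second condensed linked list created below */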
1789: /* send and recv coi */
1790: /*-------------------*/
1791: PetscCommGetNewTag(comm,&tagi);
1792: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1793: PetscMalloc1(len+1,&buf_s);
1794: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1795: for (proc=0,k=0; proc<size; proc++) {
1796: if (!len_s[proc]) continue;
1797: /* form outgoing message for i-structure:
1798: buf_si[0]: nrows to be sent
1799: [1:nrows]: row index (local to the receiving process)
1800: [nrows+1:2*nrows+1]: i-structure index
1801: */
1802: /*-------------------------------------------*/
1803: nrows = len_si[proc]/2 - 1;
1804: buf_si_i = buf_si + nrows+1;
1805: buf_si[0] = nrows;
1806: buf_si_i[0] = 0;
1807: nrows = 0;
1808: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1809: nzi = coi[i+1] - coi[i];
1810: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1811: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1812: nrows++;
1813: }
1814: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1815: k++;
1816: buf_si += len_si[proc];
1817: }
1818: i = merge->nrecv;
1819: while (i--) {
1820: PetscMPIInt icompleted;
1821: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1822: }
1823: PetscFree(rwaits);
1824: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1825: PetscFree(len_si);
1826: PetscFree(len_ri);
1827: PetscFree(swaits);
1828: PetscFree(sstatus);
1829: PetscFree(buf_s);
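/* communication of the structure of Co is complete: buf_ri[k] holds the i-structure and
   buf_rj[k] the column indices received from the k-th sender; they are merged into the
   locally owned rows of C below */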
1831: /* compute the local portion of C (mpi mat) */
1832: /*------------------------------------------*/
1833: /* allocate bi array and free space for accumulating nonzero column info */
1834: PetscMalloc1(pn+1,&bi);
1835: bi[0] = 0;
1837: /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
1838: nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
1839: PetscFreeSpaceGet(nnz,&free_space);
1840: current_space = free_space;
1842: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1843: for (k=0; k<merge->nrecv; k++) {
1844: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1845: nrows = *buf_ri_k[k];
1846: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1847: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure entry of the k-th received i-structure */
1848: }
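/* nextrow[k] and nextci[k] walk through the k-th received i-structure; the rows in each
   message arrive in increasing local row order, so each received row is consumed exactly
   once as i sweeps over the locally owned rows of C */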
1850: PetscLLCondensedCreate_Scalable(Armax,&lnk);
1851: MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
1852: rmax = 0;
1853: for (i=0; i<pn; i++) {
1854: /* add pdt[i,:]*AP into lnk */
1855: pnz = pdti[i+1] - pdti[i];
1856: ptJ = pdtj + pdti[i];
1857: for (j=0; j<pnz; j++) {
1858: row = ptJ[j]; /* row of AP == col of Pt */
1859: anz = ai[row+1] - ai[row];
1860: Jptr = aj + ai[row];
1861: /* add non-zero cols of AP into the sorted linked list lnk */
1862: PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1863: }
1865: /* add received col data into lnk */
1866: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1867: if (i == *nextrow[k]) { /* i-th row */
1868: nzi = *(nextci[k]+1) - *nextci[k];
1869: Jptr = buf_rj[k] + *nextci[k];
1870: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
1871: nextrow[k]++; nextci[k]++;
1872: }
1873: }
1874: nnz = lnk[0];
1876: /* if free space is not available, make more free space */
1877: if (current_space->local_remaining<nnz) {
1878: PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1879: nspacedouble++;
1880: }
1881: /* copy data into free space, then initialize lnk */
1882: PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
1883: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
1885: current_space->array += nnz;
1886: current_space->local_used += nnz;
1887: current_space->local_remaining -= nnz;
1889: bi[i+1] = bi[i] + nnz;
1890: if (nnz > rmax) rmax = nnz;
1891: }
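/* bi/bj now hold the complete symbolic pattern of the locally owned rows of C; dnz/onz are
   the per-row diagonal and off-diagonal nonzero counts used for preallocation, and rmax is
   the longest row, used to size the scratch array of zero values below */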
1892: PetscFree3(buf_ri_k,nextrow,nextci);
1894: PetscMalloc1(bi[pn]+1,&bj);
1895: PetscFreeSpaceContiguous(&free_space,bj);
1896: afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1897: if (afill_tmp > afill) afill = afill_tmp;
1898: PetscLLCondensedDestroy_Scalable(lnk);
1899: PetscTableDestroy(&ta);
1901: MatDestroy(&POt);
1902: MatDestroy(&PDt);
1904: /* create symbolic parallel matrix Cmpi - why can this not be assembled in the numeric part? */
1905: /*----------------------------------------------------------------------------------*/
1906: PetscCalloc1(rmax+1,&vals);
1908: MatCreate(comm,&Cmpi);
1909: MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1910: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));
1911: MatSetType(Cmpi,MATMPIAIJ);
1912: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1913: MatPreallocateFinalize(dnz,onz);
1914: MatSetBlockSize(Cmpi,1);
1915: for (i=0; i<pn; i++) {
1916: row = i + rstart;
1917: nnz = bi[i+1] - bi[i];
1918: Jptr = bj + bi[i];
1919: MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);
1920: }
1921: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1922: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1923: PetscFree(vals);
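/* inserting explicit zeros for every symbolic nonzero fixes the nonzero pattern of Cmpi at
   assembly time; the numeric routine then only has to set values into this preset pattern */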
1925: merge->bi = bi;
1926: merge->bj = bj;
1927: merge->coi = coi;
1928: merge->coj = coj;
1929: merge->buf_ri = buf_ri;
1930: merge->buf_rj = buf_rj;
1931: merge->owners_co = owners_co;
1933: /* attach the supporting struct to Cmpi for reuse */
1934: c = (Mat_MPIAIJ*)Cmpi->data;
1936: c->ptap = ptap;
1937: ptap->api = NULL;
1938: ptap->apj = NULL;
1939: ptap->merge = merge;
1940: ptap->apa = NULL;
1941: ptap->destroy = Cmpi->ops->destroy;
1942: ptap->duplicate = Cmpi->ops->duplicate;
1944: Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1945: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1946: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
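/* the original destroy/duplicate function pointers are saved in ptap and replaced, so that
   destroying or duplicating C also takes care of the attached merge data; keeping A_loc,
   coi/coj and the receive buffers in ptap/merge is what lets the numeric phase and
   MAT_REUSE_MATRIX calls skip this symbolic work */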
1948: *C = Cmpi;
1949: #if defined(PETSC_USE_INFO)
1950: if (bi[pn] != 0) {
1951: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
1952: PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
1953: } else {
1954: PetscInfo(Cmpi,"Empty matrix product\n");
1955: }
1956: #endif
1957: return(0);
1958: }