Actual source code: mpiptap.c
petsc-3.12.5 2020-03-29
2: /*
3: Defines projective product routines where A is an MPIAIJ matrix
4: C = P^T * A * P
5: */
7: #include <../src/mat/impls/aij/seq/aij.h>
8: #include <../src/mat/utils/freespace.h>
9: #include <../src/mat/impls/aij/mpi/mpiaij.h>
10: #include <petscbt.h>
11: #include <petsctime.h>
12: #include <petsc/private/hashmapiv.h>
13: #include <petsc/private/hashseti.h>
14: #include <petscsf.h>
17: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
18: {
19: PetscErrorCode ierr;
20: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
21: Mat_APMPI *ptap=a->ap;
22: PetscBool iascii;
23: PetscViewerFormat format;
26: if (!ptap) {
27: /* hack: MatDuplicate() copies oldmat->ops->view to newmat, which is a base mat class with a NULL ptap! */
28: A->ops->view = MatView_MPIAIJ;
29: (A->ops->view)(A,viewer);
30: return(0);
31: }
33: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
34: if (iascii) {
35: PetscViewerGetFormat(viewer,&format);
36: if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
37: if (ptap->algType == 0) {
38: PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");
39: } else if (ptap->algType == 1) {
40: PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");
41: } else if (ptap->algType == 2) {
42: PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n");
43: } else if (ptap->algType == 3) {
44: PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n");
45: }
46: }
47: }
48: (ptap->view)(A,viewer);
49: return(0);
50: }
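/* Illustrative usage sketch (not part of mpiptap.c, an assumption for documentation):
   viewing a PtAP product with an ASCII_INFO-format viewer triggers the branch above and
   reports which MatPtAP() implementation built the matrix. Assumes C was produced by
   MatPtAP() on MPIAIJ matrices; the same output is available via -mat_view ::ascii_info. */
#include <petscmat.h>

static PetscErrorCode ReportPtAPAlgorithm(Mat C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscViewerPushFormat(PETSC_VIEWER_STDOUT_WORLD,PETSC_VIEWER_ASCII_INFO);CHKERRQ(ierr);
  ierr = MatView(C,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);   /* prints "using ... MatPtAP() implementation" */
  ierr = PetscViewerPopFormat(PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}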
52: PetscErrorCode MatFreeIntermediateDataStructures_MPIAIJ_AP(Mat A)
53: {
54: PetscErrorCode ierr;
55: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
56: Mat_APMPI *ptap=a->ap;
57: Mat_Merge_SeqsToMPI *merge;
60: if (!ptap) return(0);
62: PetscFree2(ptap->startsj_s,ptap->startsj_r);
63: PetscFree(ptap->bufa);
64: MatDestroy(&ptap->P_loc);
65: MatDestroy(&ptap->P_oth);
66: MatDestroy(&ptap->A_loc); /* used by MatTransposeMatMult() */
67: MatDestroy(&ptap->Rd);
68: MatDestroy(&ptap->Ro);
69: if (ptap->AP_loc) { /* used by alg_rap */
70: Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
71: PetscFree(ap->i);
72: PetscFree2(ap->j,ap->a);
73: MatDestroy(&ptap->AP_loc);
74: } else { /* used by alg_ptap */
75: PetscFree(ptap->api);
76: PetscFree(ptap->apj);
77: }
78: MatDestroy(&ptap->C_loc);
79: MatDestroy(&ptap->C_oth);
80: if (ptap->apa) {PetscFree(ptap->apa);}
82: MatDestroy(&ptap->Pt);
84: merge=ptap->merge;
85: if (merge) { /* used by alg_ptap */
86: PetscFree(merge->id_r);
87: PetscFree(merge->len_s);
88: PetscFree(merge->len_r);
89: PetscFree(merge->bi);
90: PetscFree(merge->bj);
91: PetscFree(merge->buf_ri[0]);
92: PetscFree(merge->buf_ri);
93: PetscFree(merge->buf_rj[0]);
94: PetscFree(merge->buf_rj);
95: PetscFree(merge->coi);
96: PetscFree(merge->coj);
97: PetscFree(merge->owners_co);
98: PetscLayoutDestroy(&merge->rowmap);
99: PetscFree(ptap->merge);
100: }
101: ISLocalToGlobalMappingDestroy(&ptap->ltog);
103: PetscSFDestroy(&ptap->sf);
104: PetscFree(ptap->c_othi);
105: PetscFree(ptap->c_rmti);
106: return(0);
107: }
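/* Hedged usage sketch (not part of mpiptap.c): the public MatFreeIntermediateDataStructures()
   entry point ends up in the routine above for MPIAIJ PtAP products. A caller that is done
   updating C can release the support data to save memory, at the price of not being able to
   call MatPtAP(A,P,MAT_REUSE_MATRIX,...) on C afterwards. */
#include <petscmat.h>

static PetscErrorCode DiscardPtAPSupport(Mat C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatFreeIntermediateDataStructures(C);CHKERRQ(ierr); /* frees P_loc, P_oth, AP_loc, ... */
  /* C itself stays valid; only in-place reuse of the symbolic PtAP data is lost */
  PetscFunctionReturn(0);
}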
109: PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
110: {
112: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
113: Mat_APMPI *ptap=a->ap;
116: (*A->ops->freeintermediatedatastructures)(A);
117: (*ptap->destroy)(A); /* MatDestroy_MPIAIJ(A) */
118: PetscFree(ptap);
119: return(0);
120: }
122: PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
123: {
125: PetscBool flg;
126: MPI_Comm comm;
127: #if !defined(PETSC_HAVE_HYPRE)
128: const char *algTypes[4] = {"scalable","nonscalable","allatonce","allatonce_merged"};
129: PetscInt nalg=4;
130: #else
131: const char *algTypes[5] = {"scalable","nonscalable","allatonce","allatonce_merged","hypre"};
132: PetscInt nalg=5;
133: #endif
134: PetscInt pN=P->cmap->N,alg=1; /* set default algorithm: nonscalable */
137: /* check if matrix local sizes are compatible */
138: PetscObjectGetComm((PetscObject)A,&comm);
139: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
140: if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
142: if (scall == MAT_INITIAL_MATRIX) {
143: /* pick an algorithm */
144: PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
145: PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
146: PetscOptionsEnd();
148: if (!flg && pN > 100000) { /* may switch to scalable algorithm as default */
149: MatInfo Ainfo,Pinfo;
150: PetscInt nz_local;
151: PetscBool alg_scalable_loc=PETSC_FALSE,alg_scalable;
153: MatGetInfo(A,MAT_LOCAL,&Ainfo);
154: MatGetInfo(P,MAT_LOCAL,&Pinfo);
155: nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);
157: if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
158: MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);
160: if (alg_scalable) {
161: alg = 0; /* the scalable algorithm would be about 50% slower than the nonscalable algorithm */
162: }
163: }
165: switch (alg) {
166: case 1:
167: /* do R=P^T locally, then C=R*A*P -- nonscalable */
168: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
169: MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);
170: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
171: break;
172: case 2:
173: /* compute C=P^T*A*P allatonce */
174: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
175: MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C);
176: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
177: break;
178: case 3:
179: /* compute C=P^T*A*P allatonce */
180: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
181: MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C);
182: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
183: break;
184: #if defined(PETSC_HAVE_HYPRE)
185: case 4:
186: /* Use boomerAMGBuildCoarseOperator */
187: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
188: MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);
189: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
190: break;
191: #endif
192: default:
193: /* do R=P^T locally, then C=R*A*P */
194: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
195: MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);
196: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
197: break;
198: }
200: if (alg == 0 || alg == 1 || alg == 2 || alg == 3) {
201: Mat_MPIAIJ *c = (Mat_MPIAIJ*)(*C)->data;
202: Mat_APMPI *ap = c->ap;
203: PetscOptionsBegin(PetscObjectComm((PetscObject)(*C)),((PetscObject)(*C))->prefix,"MatFreeIntermediateDataStructures","Mat");
204: ap->freestruct = PETSC_FALSE;
205: PetscOptionsBool("-mat_freeintermediatedatastructures","Free intermediate data structures", "MatFreeIntermediateDataStructures",ap->freestruct,&ap->freestruct, NULL);
206: PetscOptionsEnd();
207: }
208: }
210: PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);
211: (*(*C)->ops->ptapnumeric)(A,P,*C);
212: PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);
213: return(0);
214: }
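/* Illustrative caller sketch (an assumption, not part of mpiptap.c): how the dispatcher above
   is typically reached. A and P are assembled MPIAIJ matrices with compatible layouts; the
   algorithm can be switched at run time with
   -matptap_via scalable|nonscalable|allatonce|allatonce_merged. */
#include <petscmat.h>

static PetscErrorCode FormGalerkinProduct(Mat A,Mat P,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,C);CHKERRQ(ierr); /* symbolic + numeric C = P^T*A*P */
  /* ... A changes numerically but keeps its nonzero pattern ... */
  ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,C);CHKERRQ(ierr);   /* numeric phase only, reuses the ptap data */
  PetscFunctionReturn(0);
}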
216: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
217: {
218: PetscErrorCode ierr;
219: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
220: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
221: Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq;
222: Mat_APMPI *ptap = c->ap;
223: Mat AP_loc,C_loc,C_oth;
224: PetscInt i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
225: PetscScalar *apa;
226: const PetscInt *cols;
227: const PetscScalar *vals;
230: if (!ptap->AP_loc) {
231: MPI_Comm comm;
232: PetscObjectGetComm((PetscObject)C,&comm);
233: SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
234: }
236: MatZeroEntries(C);
238: /* 1) get R = Pd^T,Ro = Po^T */
239: if (ptap->reuse == MAT_REUSE_MATRIX) {
240: MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
241: MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
242: }
244: /* 2) get AP_loc */
245: AP_loc = ptap->AP_loc;
246: ap = (Mat_SeqAIJ*)AP_loc->data;
248: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
249: /*-----------------------------------------------------*/
250: if (ptap->reuse == MAT_REUSE_MATRIX) {
251: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
252: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
253: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
254: }
256: /* 2-2) compute numeric A_loc*P - dominating part */
257: /* ---------------------------------------------- */
258: /* get data from symbolic products */
259: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
260: if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
262: api = ap->i;
263: apj = ap->j;
264: ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj);
265: for (i=0; i<am; i++) {
266: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
267: apnz = api[i+1] - api[i];
268: apa = ap->a + api[i];
269: PetscArrayzero(apa,apnz);
270: AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
271: }
272: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj);
273: if (api[AP_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",api[AP_loc->rmap->n],nout);
275: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
276: /* Always use the scalable version since this is the scalable MPI implementation */
277: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);
278: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);
280: C_loc = ptap->C_loc;
281: C_oth = ptap->C_oth;
283: /* add C_loc and C_oth to C */
284: MatGetOwnershipRange(C,&rstart,&rend);
286: /* C_loc -> C */
287: cm = C_loc->rmap->N;
288: c_seq = (Mat_SeqAIJ*)C_loc->data;
289: cols = c_seq->j;
290: vals = c_seq->a;
291: ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j);
293: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
294: /* when there are no off-processor parts. */
295: /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
296: /* is no longer valid. In that case the more general MatSetValues_MPIAIJ() has to be used, where the column index is looked up in */
297: /* a table and additional bookkeeping is done. */
298: if (C->assembled) {
299: C->was_assembled = PETSC_TRUE;
300: C->assembled = PETSC_FALSE;
301: }
302: if (C->was_assembled) {
303: for (i=0; i<cm; i++) {
304: ncols = c_seq->i[i+1] - c_seq->i[i];
305: row = rstart + i;
306: MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
307: cols += ncols; vals += ncols;
308: }
309: } else {
310: MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
311: }
312: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j);
313: if (c_seq->i[C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_loc->rmap->n],nout);
315: /* Co -> C, off-processor part */
316: cm = C_oth->rmap->N;
317: c_seq = (Mat_SeqAIJ*)C_oth->data;
318: cols = c_seq->j;
319: vals = c_seq->a;
320: ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j);
321: for (i=0; i<cm; i++) {
322: ncols = c_seq->i[i+1] - c_seq->i[i];
323: row = p->garray[i];
324: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
325: cols += ncols; vals += ncols;
326: }
327: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
328: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
330: ptap->reuse = MAT_REUSE_MATRIX;
332: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j);
333: if (c_seq->i[C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_oth->rmap->n],nout);
335: /* supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated by A and P */
336: if (ptap->freestruct) {
337: MatFreeIntermediateDataStructures(C);
338: }
339: return(0);
340: }
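/* Conceptual sketch (an assumption, not the actual AProw_scalable kernel): how one row of
   AP = A*P is accumulated above. Row i of A selects rows of P, scales them, and adds them
   into the value array apa, whose column pattern apj was fixed in the symbolic phase.
   ai/aj/aa and pi/pj/pa denote the CSR arrays of A and P respectively. */
#include <petscsys.h>

static void AccumulateAPRow(PetscInt i,const PetscInt *ai,const PetscInt *aj,const PetscScalar *aa,
                            const PetscInt *pi,const PetscInt *pj,const PetscScalar *pa,
                            PetscInt apnz,const PetscInt *apj,PetscScalar *apa)
{
  PetscInt j,k,l;

  for (j = ai[i]; j < ai[i+1]; j++) {         /* nonzeros of A[i,:] */
    PetscInt    row = aj[j];
    PetscScalar v   = aa[j];
    for (k = pi[row]; k < pi[row+1]; k++) {   /* add v*P[row,:] into AP[i,:] */
      for (l = 0; l < apnz; l++) {            /* naive search; the real kernel avoids this */
        if (apj[l] == pj[k]) { apa[l] += v*pa[k]; break; }
      }
    }
  }
}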
342: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
343: {
344: PetscErrorCode ierr;
345: Mat_APMPI *ptap;
346: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
347: MPI_Comm comm;
348: PetscMPIInt size,rank;
349: Mat Cmpi,P_loc,P_oth;
350: PetscFreeSpaceList free_space=NULL,current_space=NULL;
351: PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
352: PetscInt *lnk,i,k,pnz,row,nsend;
353: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
354: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
355: PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
356: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
357: MPI_Request *swaits,*rwaits;
358: MPI_Status *sstatus,rstatus;
359: PetscLayout rowmap;
360: PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */
361: PetscMPIInt *len_r,*id_r; /* ranks and lengths of the messages to be received */
362: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
363: Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
364: PetscScalar *apv;
365: PetscTable ta;
366: MatType mtype;
367: const char *prefix;
368: #if defined(PETSC_USE_INFO)
369: PetscReal apfill;
370: #endif
373: PetscObjectGetComm((PetscObject)A,&comm);
374: MPI_Comm_size(comm,&size);
375: MPI_Comm_rank(comm,&rank);
377: if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
379: /* create symbolic parallel matrix Cmpi */
380: MatCreate(comm,&Cmpi);
381: MatGetType(A,&mtype);
382: MatSetType(Cmpi,mtype);
384: /* create struct Mat_APMPI and attach it to C later */
385: PetscNew(&ptap);
386: ptap->reuse = MAT_INITIAL_MATRIX;
387: ptap->algType = 0;
389: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
390: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);
391: /* get P_loc by taking all local rows of P */
392: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);
394: ptap->P_loc = P_loc;
395: ptap->P_oth = P_oth;
397: /* (0) compute Rd = Pd^T, Ro = Po^T */
398: /* --------------------------------- */
399: MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
400: MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
402: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
403: /* ----------------------------------------------------------------- */
404: p_loc = (Mat_SeqAIJ*)P_loc->data;
405: if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;
407: /* create and initialize a linked list */
408: PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
409: MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
410: MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
411: PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
413: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
415: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
416: if (ao) {
417: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
418: } else {
419: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
420: }
421: current_space = free_space;
422: nspacedouble = 0;
424: PetscMalloc1(am+1,&api);
425: api[0] = 0;
426: for (i=0; i<am; i++) {
427: /* diagonal portion: Ad[i,:]*P */
428: ai = ad->i; pi = p_loc->i;
429: nzi = ai[i+1] - ai[i];
430: aj = ad->j + ai[i];
431: for (j=0; j<nzi; j++) {
432: row = aj[j];
433: pnz = pi[row+1] - pi[row];
434: Jptr = p_loc->j + pi[row];
435: /* add non-zero cols of P into the sorted linked list lnk */
436: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
437: }
438: /* off-diagonal portion: Ao[i,:]*P */
439: if (ao) {
440: ai = ao->i; pi = p_oth->i;
441: nzi = ai[i+1] - ai[i];
442: aj = ao->j + ai[i];
443: for (j=0; j<nzi; j++) {
444: row = aj[j];
445: pnz = pi[row+1] - pi[row];
446: Jptr = p_oth->j + pi[row];
447: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
448: }
449: }
450: apnz = lnk[0];
451: api[i+1] = api[i] + apnz;
453: /* if free space is not available, double the total space in the list */
454: if (current_space->local_remaining<apnz) {
455: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
456: nspacedouble++;
457: }
459: /* Copy data into free space, then initialize lnk */
460: PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
462: current_space->array += apnz;
463: current_space->local_used += apnz;
464: current_space->local_remaining -= apnz;
465: }
466: /* Allocate space for apj and apv, initialize apj, and */
467: /* destroy list of free space and other temporary array(s) */
468: PetscCalloc2(api[am],&apj,api[am],&apv);
469: PetscFreeSpaceContiguous(&free_space,apj);
470: PetscLLCondensedDestroy_Scalable(lnk);
472: /* Create AP_loc for reuse */
473: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
474: MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog);
476: #if defined(PETSC_USE_INFO)
477: if (ao) {
478: apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
479: } else {
480: apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
481: }
482: ptap->AP_loc->info.mallocs = nspacedouble;
483: ptap->AP_loc->info.fill_ratio_given = fill;
484: ptap->AP_loc->info.fill_ratio_needed = apfill;
486: if (api[am]) {
487: PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
488: PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
489: } else {
490: PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");
491: }
492: #endif
494: /* (2-1) compute symbolic Co = Ro*AP_loc */
495: /* ------------------------------------ */
496: MatGetOptionsPrefix(A,&prefix);
497: MatSetOptionsPrefix(ptap->Ro,prefix);
498: MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");
499: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);
501: /* (3) send coj of C_oth to other processors */
502: /* ------------------------------------------ */
503: /* determine row ownership */
504: PetscLayoutCreate(comm,&rowmap);
505: rowmap->n = pn;
506: rowmap->bs = 1;
507: PetscLayoutSetUp(rowmap);
508: owners = rowmap->range;
510: /* determine the number of messages to send, their lengths */
511: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
512: PetscArrayzero(len_s,size);
513: PetscArrayzero(len_si,size);
515: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
516: coi = c_oth->i; coj = c_oth->j;
517: con = ptap->C_oth->rmap->n;
518: proc = 0;
519: ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj);
520: for (i=0; i<con; i++) {
521: while (prmap[i] >= owners[proc+1]) proc++;
522: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
523: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
524: }
526: len = 0; /* max length of buf_si[], see (4) */
527: owners_co[0] = 0;
528: nsend = 0;
529: for (proc=0; proc<size; proc++) {
530: owners_co[proc+1] = owners_co[proc] + len_si[proc];
531: if (len_s[proc]) {
532: nsend++;
533: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
534: len += len_si[proc];
535: }
536: }
538: /* determine the number and length of messages to receive for coi and coj */
539: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
540: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
542: /* post the Irecv and Isend of coj */
543: PetscCommGetNewTag(comm,&tagj);
544: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
545: PetscMalloc1(nsend+1,&swaits);
546: for (proc=0, k=0; proc<size; proc++) {
547: if (!len_s[proc]) continue;
548: i = owners_co[proc];
549: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
550: k++;
551: }
553: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
554: /* ---------------------------------------- */
555: MatSetOptionsPrefix(ptap->Rd,prefix);
556: MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");
557: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
558: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
559: ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j);
561: /* receives coj are complete */
562: for (i=0; i<nrecv; i++) {
563: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
564: }
565: PetscFree(rwaits);
566: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
568: /* add received column indices into ta to update Crmax */
569: for (k=0; k<nrecv; k++) {/* k-th received message */
570: Jptr = buf_rj[k];
571: for (j=0; j<len_r[k]; j++) {
572: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
573: }
574: }
575: PetscTableGetCount(ta,&Crmax);
576: PetscTableDestroy(&ta);
578: /* (4) send and recv coi */
579: /*-----------------------*/
580: PetscCommGetNewTag(comm,&tagi);
581: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
582: PetscMalloc1(len+1,&buf_s);
583: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
584: for (proc=0,k=0; proc<size; proc++) {
585: if (!len_s[proc]) continue;
586: /* form outgoing message for i-structure:
587: buf_si[0]: nrows to be sent
588: [1:nrows]: row index (global)
589: [nrows+1:2*nrows+1]: i-structure index
590: */
591: /*-------------------------------------------*/
592: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
593: buf_si_i = buf_si + nrows+1;
594: buf_si[0] = nrows;
595: buf_si_i[0] = 0;
596: nrows = 0;
597: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
598: nzi = coi[i+1] - coi[i];
599: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
600: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
601: nrows++;
602: }
603: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
604: k++;
605: buf_si += len_si[proc];
606: }
607: for (i=0; i<nrecv; i++) {
608: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
609: }
610: PetscFree(rwaits);
611: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
613: PetscFree4(len_s,len_si,sstatus,owners_co);
614: PetscFree(len_ri);
615: PetscFree(swaits);
616: PetscFree(buf_s);
618: /* (5) compute the local portion of Cmpi */
619: /* ------------------------------------------ */
620: /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
621: PetscFreeSpaceGet(Crmax,&free_space);
622: current_space = free_space;
624: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
625: for (k=0; k<nrecv; k++) {
626: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
627: nrows = *buf_ri_k[k];
628: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th received i-structure */
629: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
630: }
632: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
633: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
634: for (i=0; i<pn; i++) {
635: /* add C_loc into Cmpi */
636: nzi = c_loc->i[i+1] - c_loc->i[i];
637: Jptr = c_loc->j + c_loc->i[i];
638: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
640: /* add received col data into lnk */
641: for (k=0; k<nrecv; k++) { /* k-th received message */
642: if (i == *nextrow[k]) { /* i-th row */
643: nzi = *(nextci[k]+1) - *nextci[k];
644: Jptr = buf_rj[k] + *nextci[k];
645: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
646: nextrow[k]++; nextci[k]++;
647: }
648: }
649: nzi = lnk[0];
651: /* copy data into free space, then initialize lnk */
652: PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);
653: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
654: }
655: PetscFree3(buf_ri_k,nextrow,nextci);
656: PetscLLCondensedDestroy_Scalable(lnk);
657: PetscFreeSpaceDestroy(free_space);
659: /* local sizes and preallocation */
660: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
661: if (P->cmap->bs > 0) {
662: PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);
663: PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);
664: }
665: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
666: MatPreallocateFinalize(dnz,onz);
668: /* members in merge */
669: PetscFree(id_r);
670: PetscFree(len_r);
671: PetscFree(buf_ri[0]);
672: PetscFree(buf_ri);
673: PetscFree(buf_rj[0]);
674: PetscFree(buf_rj);
675: PetscLayoutDestroy(&rowmap);
677: /* attach the supporting struct to Cmpi for reuse */
678: c = (Mat_MPIAIJ*)Cmpi->data;
679: c->ap = ptap;
680: ptap->duplicate = Cmpi->ops->duplicate;
681: ptap->destroy = Cmpi->ops->destroy;
682: ptap->view = Cmpi->ops->view;
684: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
685: Cmpi->assembled = PETSC_FALSE;
686: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
687: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
688: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
689: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
690: *C = Cmpi;
692: nout = 0;
693: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j);
694: if (c_oth->i[ptap->C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_oth->i[ptap->C_oth->rmap->n],nout);
695: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j);
696: if (c_loc->i[ptap->C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_loc->i[ptap->C_loc->rmap->n],nout);
698: return(0);
699: }
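/* Sketch (an assumption for illustration, not code from this file): the layout of the
   i-structure message buf_si assembled in step (4) above. Each message carries the number
   of rows, the row indices, and a prefix-sum giving where each row's column data starts in
   the companion coj message. */
#include <petscsys.h>

static void UnpackIStructure(const PetscInt *buf_si,PetscInt *nrows,const PetscInt **rows,const PetscInt **rowptr)
{
  *nrows  = buf_si[0];            /* number of rows in this message           */
  *rows   = buf_si + 1;           /* rows[0..nrows-1]: row indices            */
  *rowptr = buf_si + 1 + *nrows;  /* rowptr[0..nrows]: offsets into coj data  */
}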
701: PETSC_STATIC_INLINE PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHSetI dht,PetscHSetI oht)
702: {
703: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
704: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
705: PetscInt *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
706: PetscInt pcstart,pcend,column,offset;
707: PetscErrorCode ierr;
710: pcstart = P->cmap->rstart;
711: pcstart *= dof;
712: pcend = P->cmap->rend;
713: pcend *= dof;
714: /* diagonal portion: Ad[i,:]*P */
715: ai = ad->i;
716: nzi = ai[i+1] - ai[i];
717: aj = ad->j + ai[i];
718: for (j=0; j<nzi; j++) {
719: row = aj[j];
720: offset = row%dof;
721: row /= dof;
722: nzpi = pd->i[row+1] - pd->i[row];
723: pj = pd->j + pd->i[row];
724: for (k=0; k<nzpi; k++) {
725: PetscHSetIAdd(dht,pj[k]*dof+offset+pcstart);
726: }
727: }
728: /* off diag P */
729: for (j=0; j<nzi; j++) {
730: row = aj[j];
731: offset = row%dof;
732: row /= dof;
733: nzpi = po->i[row+1] - po->i[row];
734: pj = po->j + po->i[row];
735: for (k=0; k<nzpi; k++) {
736: PetscHSetIAdd(oht,p->garray[pj[k]]*dof+offset);
737: }
738: }
740: /* off diagonal part: Ao[i, :]*P_oth */
741: if (ao) {
742: ai = ao->i;
743: pi = p_oth->i;
744: nzi = ai[i+1] - ai[i];
745: aj = ao->j + ai[i];
746: for (j=0; j<nzi; j++) {
747: row = aj[j];
748: offset = a->garray[row]%dof;
749: row = map[row];
750: pnz = pi[row+1] - pi[row];
751: p_othcols = p_oth->j + pi[row];
752: for (col=0; col<pnz; col++) {
753: column = p_othcols[col] * dof + offset;
754: if (column>=pcstart && column<pcend) {
755: PetscHSetIAdd(dht,column);
756: } else {
757: PetscHSetIAdd(oht,column);
758: }
759: }
760: }
761: } /* end if (ao) */
762: return(0);
763: }
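/* Minimal sketch of the hash-set pattern used above (assumes the private header
   petsc/private/hashseti.h, as included at the top of this file): merge the column indices
   of several sparse rows into one set, then pull them out as a flat array. */
#include <petscsys.h>
#include <petsc/private/hashseti.h>

static PetscErrorCode MergeTwoRows(PetscInt n1,const PetscInt *cols1,PetscInt n2,const PetscInt *cols2,
                                   PetscInt *nmerged,PetscInt **merged)
{
  PetscHSetI     ht;
  PetscInt       i,off = 0;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscHSetICreate(&ht);CHKERRQ(ierr);
  for (i=0; i<n1; i++) {ierr = PetscHSetIAdd(ht,cols1[i]);CHKERRQ(ierr);}
  for (i=0; i<n2; i++) {ierr = PetscHSetIAdd(ht,cols2[i]);CHKERRQ(ierr);}
  ierr = PetscHSetIGetSize(ht,nmerged);CHKERRQ(ierr);
  ierr = PetscMalloc1(*nmerged,merged);CHKERRQ(ierr);
  ierr = PetscHSetIGetElems(ht,&off,*merged);CHKERRQ(ierr); /* unordered; sort if needed */
  ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}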
765: PETSC_STATIC_INLINE PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHMapIV hmap)
766: {
767: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
768: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
769: PetscInt *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi,offset;
770: PetscScalar ra,*aa,*pa;
771: PetscErrorCode ierr;
774: pcstart = P->cmap->rstart;
775: pcstart *= dof;
777: /* diagonal portion: Ad[i,:]*P */
778: ai = ad->i;
779: nzi = ai[i+1] - ai[i];
780: aj = ad->j + ai[i];
781: aa = ad->a + ai[i];
782: for (j=0; j<nzi; j++) {
783: ra = aa[j];
784: row = aj[j];
785: offset = row%dof;
786: row /= dof;
787: nzpi = pd->i[row+1] - pd->i[row];
788: pj = pd->j + pd->i[row];
789: pa = pd->a + pd->i[row];
790: for (k=0; k<nzpi; k++) {
791: PetscHMapIVAddValue(hmap,pj[k]*dof+offset+pcstart,ra*pa[k]);
792: }
793: PetscLogFlops(2.0*nzpi);
794: }
795: for (j=0; j<nzi; j++) {
796: ra = aa[j];
797: row = aj[j];
798: offset = row%dof;
799: row /= dof;
800: nzpi = po->i[row+1] - po->i[row];
801: pj = po->j + po->i[row];
802: pa = po->a + po->i[row];
803: for (k=0; k<nzpi; k++) {
804: PetscHMapIVAddValue(hmap,p->garray[pj[k]]*dof+offset,ra*pa[k]);
805: }
806: PetscLogFlops(2.0*nzpi);
807: }
809: /* off diagonal part: Ao[i, :]*P_oth */
810: if (ao) {
811: ai = ao->i;
812: pi = p_oth->i;
813: nzi = ai[i+1] - ai[i];
814: aj = ao->j + ai[i];
815: aa = ao->a + ai[i];
816: for (j=0; j<nzi; j++) {
817: row = aj[j];
818: offset = a->garray[row]%dof;
819: row = map[row];
820: ra = aa[j];
821: pnz = pi[row+1] - pi[row];
822: p_othcols = p_oth->j + pi[row];
823: pa = p_oth->a + pi[row];
824: for (col=0; col<pnz; col++) {
825: PetscHMapIVAddValue(hmap,p_othcols[col]*dof+offset,ra*pa[col]);
826: }
827: PetscLogFlops(2.0*pnz);
828: }
829: } /* end if (ao) */
831: return(0);
832: }
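/* Minimal sketch of the integer-to-scalar hash-map pattern used above (the same PetscHMapIV
   calls this file relies on): accumulate duplicate column contributions, then extract the
   (column,value) pairs in one shot. */
#include <petscsys.h>
#include <petsc/private/hashmapiv.h>

static PetscErrorCode AccumulateSparseRow(PetscInt n,const PetscInt *cols,const PetscScalar *vals,
                                          PetscInt *nz,PetscInt *outcols,PetscScalar *outvals)
{
  PetscHMapIV    hmap;
  PetscInt       i;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
  for (i=0; i<n; i++) {ierr = PetscHMapIVAddValue(hmap,cols[i],vals[i]);CHKERRQ(ierr);} /* sums repeated columns */
  *nz = 0;
  ierr = PetscHMapIVGetPairs(hmap,nz,outcols,outvals);CHKERRQ(ierr); /* caller provides large-enough arrays */
  ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}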
834: PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat,Mat,PetscInt dof,MatReuse,Mat*);
836: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,Mat C)
837: {
838: PetscErrorCode ierr;
839: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
840: Mat_SeqAIJ *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
841: Mat_APMPI *ptap = c->ap;
842: PetscHMapIV hmap;
843: PetscInt i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
844: PetscScalar *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
845: PetscInt offset,ii,pocol;
846: const PetscInt *mappingindices;
847: IS map;
848: MPI_Comm comm;
851: PetscObjectGetComm((PetscObject)A,&comm);
852: if (!ptap->P_oth) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
854: MatZeroEntries(C);
856: /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
857: /*-----------------------------------------------------*/
858: if (ptap->reuse == MAT_REUSE_MATRIX) {
859: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
860: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);
861: }
862: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
864: MatGetLocalSize(p->B,NULL,&pon);
865: pon *= dof;
866: PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);
867: MatGetLocalSize(A,&am,NULL);
868: cmaxr = 0;
869: for (i=0; i<pon; i++) {
870: cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
871: }
872: PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);
873: PetscHMapIVCreate(&hmap);
874: PetscHMapIVResize(hmap,cmaxr);
875: ISGetIndices(map,&mappingindices);
876: for (i=0; i<am && pon; i++) {
877: PetscHMapIVClear(hmap);
878: offset = i%dof;
879: ii = i/dof;
880: nzi = po->i[ii+1] - po->i[ii];
881: if (!nzi) continue;
882: MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
883: voff = 0;
884: PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
885: if (!voff) continue;
887: /* Form C(ii, :) */
888: poj = po->j + po->i[ii];
889: poa = po->a + po->i[ii];
890: for (j=0; j<nzi; j++) {
891: pocol = poj[j]*dof+offset;
892: c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
893: c_rmtaa = c_rmta + ptap->c_rmti[pocol];
894: for (jj=0; jj<voff; jj++) {
895: apvaluestmp[jj] = apvalues[jj]*poa[j];
896: /* If the row is empty */
897: if (!c_rmtc[pocol]) {
898: c_rmtjj[jj] = apindices[jj];
899: c_rmtaa[jj] = apvaluestmp[jj];
900: c_rmtc[pocol]++;
901: } else {
902: PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);
903: if (loc>=0){ /* hit */
904: c_rmtaa[loc] += apvaluestmp[jj];
905: PetscLogFlops(1.0);
906: } else { /* new element */
907: loc = -(loc+1);
908: /* Move data backward */
909: for (kk=c_rmtc[pocol]; kk>loc; kk--) {
910: c_rmtjj[kk] = c_rmtjj[kk-1];
911: c_rmtaa[kk] = c_rmtaa[kk-1];
912: }/* End kk */
913: c_rmtjj[loc] = apindices[jj];
914: c_rmtaa[loc] = apvaluestmp[jj];
915: c_rmtc[pocol]++;
916: }
917: }
918: PetscLogFlops(voff);
919: } /* End jj */
920: } /* End j */
921: } /* End i */
923: PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);
924: PetscHMapIVDestroy(&hmap);
926: MatGetLocalSize(P,NULL,&pn);
927: pn *= dof;
928: PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);
930: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
931: PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
932: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
933: pcstart = pcstart*dof;
934: pcend = pcend*dof;
935: cd = (Mat_SeqAIJ*)(c->A)->data;
936: co = (Mat_SeqAIJ*)(c->B)->data;
938: cmaxr = 0;
939: for (i=0; i<pn; i++) {
940: cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
941: }
942: PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ);
943: PetscHMapIVCreate(&hmap);
944: PetscHMapIVResize(hmap,cmaxr);
945: for (i=0; i<am && pn; i++) {
946: PetscHMapIVClear(hmap);
947: offset = i%dof;
948: ii = i/dof;
949: nzi = pd->i[ii+1] - pd->i[ii];
950: if (!nzi) continue;
951: MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
952: voff = 0;
953: PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
954: if (!voff) continue;
955: /* Form C(ii, :) */
956: pdj = pd->j + pd->i[ii];
957: pda = pd->a + pd->i[ii];
958: for (j=0; j<nzi; j++) {
959: row = pcstart + pdj[j] * dof + offset;
960: for (jj=0; jj<voff; jj++) {
961: apvaluestmp[jj] = apvalues[jj]*pda[j];
962: }
963: PetscLogFlops(voff);
964: MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);
965: }
966: }
967: ISRestoreIndices(map,&mappingindices);
968: MatGetOwnershipRangeColumn(C,&ccstart,&ccend);
969: PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ);
970: PetscHMapIVDestroy(&hmap);
971: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
972: PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
973: PetscFree2(c_rmtj,c_rmta);
975: /* Add contributions from remote */
976: for (i = 0; i < pn; i++) {
977: row = i + pcstart;
978: MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);
979: }
980: PetscFree2(c_othj,c_otha);
982: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
983: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
985: ptap->reuse = MAT_REUSE_MATRIX;
987: /* supporting struct ptap consumes almost the same amount of memory as C=PtAP, release it if C will not be updated by A and P */
988: if (ptap->freestruct) {
989: MatFreeIntermediateDataStructures(C);
990: }
991: return(0);
992: }
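/* Sketch of the sorted-insert update used for the remote rows above (an illustrative helper,
   not code from this file): PetscFindInt() does a binary search in the current row pattern;
   a negative result encodes the insertion point, so new columns are spliced in while existing
   ones simply accumulate. */
#include <petscsys.h>

static PetscErrorCode SortedInsertAdd(PetscInt col,PetscScalar v,PetscInt *ncols,PetscInt *cols,PetscScalar *vals)
{
  PetscInt       loc,k;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscFindInt(col,*ncols,cols,&loc);CHKERRQ(ierr);
  if (loc >= 0) {                      /* hit: accumulate */
    vals[loc] += v;
  } else {                             /* miss: loc = -(insertion point + 1) */
    loc = -(loc+1);
    for (k=*ncols; k>loc; k--) {cols[k] = cols[k-1]; vals[k] = vals[k-1];} /* move data backward */
    cols[loc] = col; vals[loc] = v;
    (*ncols)++;
  }
  PetscFunctionReturn(0);
}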
994: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
995: {
996: PetscErrorCode ierr;
1000: MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A,P,1,C);
1001: return(0);
1002: }
1004: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,Mat C)
1005: {
1006: PetscErrorCode ierr;
1007: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1008: Mat_SeqAIJ *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
1009: Mat_APMPI *ptap = c->ap;
1010: PetscHMapIV hmap;
1011: PetscInt i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
1012: PetscScalar *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
1013: PetscInt offset,ii,pocol;
1014: const PetscInt *mappingindices;
1015: IS map;
1016: MPI_Comm comm;
1019: PetscObjectGetComm((PetscObject)A,&comm);
1020: if (!ptap->P_oth) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
1022: MatZeroEntries(C);
1024: /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
1025: /*-----------------------------------------------------*/
1026: if (ptap->reuse == MAT_REUSE_MATRIX) {
1027: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
1028: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);
1029: }
1030: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
1031: MatGetLocalSize(p->B,NULL,&pon);
1032: pon *= dof;
1033: MatGetLocalSize(P,NULL,&pn);
1034: pn *= dof;
1036: PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);
1037: MatGetLocalSize(A,&am,NULL);
1038: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
1039: pcstart *= dof;
1040: pcend *= dof;
1041: cmaxr = 0;
1042: for (i=0; i<pon; i++) {
1043: cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
1044: }
1045: cd = (Mat_SeqAIJ*)(c->A)->data;
1046: co = (Mat_SeqAIJ*)(c->B)->data;
1047: for (i=0; i<pn; i++) {
1048: cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
1049: }
1050: PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);
1051: PetscHMapIVCreate(&hmap);
1052: PetscHMapIVResize(hmap,cmaxr);
1053: ISGetIndices(map,&mappingindices);
1054: for (i=0; i<am && (pon || pn); i++) {
1055: PetscHMapIVClear(hmap);
1056: offset = i%dof;
1057: ii = i/dof;
1058: nzi = po->i[ii+1] - po->i[ii];
1059: dnzi = pd->i[ii+1] - pd->i[ii];
1060: if (!nzi && !dnzi) continue;
1061: MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
1062: voff = 0;
1063: PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
1064: if (!voff) continue;
1066: /* Form remote C(ii, :) */
1067: poj = po->j + po->i[ii];
1068: poa = po->a + po->i[ii];
1069: for (j=0; j<nzi; j++) {
1070: pocol = poj[j]*dof+offset;
1071: c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
1072: c_rmtaa = c_rmta + ptap->c_rmti[pocol];
1073: for (jj=0; jj<voff; jj++) {
1074: apvaluestmp[jj] = apvalues[jj]*poa[j];
1075: /* If the row is empty */
1076: if (!c_rmtc[pocol]) {
1077: c_rmtjj[jj] = apindices[jj];
1078: c_rmtaa[jj] = apvaluestmp[jj];
1079: c_rmtc[pocol]++;
1080: } else {
1081: PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);
1082: if (loc>=0){ /* hit */
1083: c_rmtaa[loc] += apvaluestmp[jj];
1084: PetscLogFlops(1.0);
1085: } else { /* new element */
1086: loc = -(loc+1);
1087: /* Move data backward */
1088: for (kk=c_rmtc[pocol]; kk>loc; kk--) {
1089: c_rmtjj[kk] = c_rmtjj[kk-1];
1090: c_rmtaa[kk] = c_rmtaa[kk-1];
1091: }/* End kk */
1092: c_rmtjj[loc] = apindices[jj];
1093: c_rmtaa[loc] = apvaluestmp[jj];
1094: c_rmtc[pocol]++;
1095: }
1096: }
1097: } /* End jj */
1098: PetscLogFlops(voff);
1099: } /* End j */
1101: /* Form local C(ii, :) */
1102: pdj = pd->j + pd->i[ii];
1103: pda = pd->a + pd->i[ii];
1104: for (j=0; j<dnzi; j++) {
1105: row = pcstart + pdj[j] * dof + offset;
1106: for (jj=0; jj<voff; jj++) {
1107: apvaluestmp[jj] = apvalues[jj]*pda[j];
1108: }/* End kk */
1109: PetscLogFlops(voff);
1110: MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);
1111: }/* End j */
1112: } /* End i */
1114: ISRestoreIndices(map,&mappingindices);
1115: PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);
1116: PetscHMapIVDestroy(&hmap);
1117: PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);
1119: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1120: PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
1121: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1122: PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
1123: PetscFree2(c_rmtj,c_rmta);
1125: /* Add contributions from remote */
1126: for (i = 0; i < pn; i++) {
1127: row = i + pcstart;
1128: MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);
1129: }
1130: PetscFree2(c_othj,c_otha);
1132: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1133: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1135: ptap->reuse = MAT_REUSE_MATRIX;
1137: /* supporting struct ptap consumes almost the same amount of memory as C=PtAP, release it if C will not be updated by A and P */
1138: if (ptap->freestruct) {
1139: MatFreeIntermediateDataStructures(C);
1140: }
1141: return(0);
1142: }
1144: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
1145: {
1146: PetscErrorCode ierr;
1150: MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,C);
1151: return(0);
1152: }
1154: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat *C)
1155: {
1156: Mat_APMPI *ptap;
1157: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c;
1158: MPI_Comm comm;
1159: Mat Cmpi;
1160: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
1161: MatType mtype;
1162: PetscSF sf;
1163: PetscSFNode *iremote;
1164: PetscInt rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1165: const PetscInt *rootdegrees;
1166: PetscHSetI ht,oht,*hta,*hto;
1167: PetscInt pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1168: PetscInt owner,lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1169: PetscInt nalg=2,alg=0,offset,ii;
1170: const PetscInt *mappingindices;
1171: PetscBool flg;
1172: const char *algTypes[2] = {"overlapping","merged"};
1173: IS map;
1174: PetscErrorCode ierr;
1177: PetscObjectGetComm((PetscObject)A,&comm);
1179: /* Create symbolic parallel matrix Cmpi */
1180: MatGetLocalSize(P,NULL,&pn);
1181: pn *= dof;
1182: MatCreate(comm,&Cmpi);
1183: MatGetType(A,&mtype);
1184: MatSetType(Cmpi,mtype);
1185: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1187: PetscNew(&ptap);
1188: ptap->reuse = MAT_INITIAL_MATRIX;
1189: ptap->algType = 2;
1191: /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1192: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);
1193: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
1194: /* This equals the number of off-diagonal columns in P */
1195: MatGetLocalSize(p->B,NULL,&pon);
1196: pon *= dof;
1197: /* offsets */
1198: PetscMalloc1(pon+1,&ptap->c_rmti);
1199: /* The number of columns we will send to remote ranks */
1200: PetscMalloc1(pon,&c_rmtc);
1201: PetscMalloc1(pon,&hta);
1202: for (i=0; i<pon; i++) {
1203: PetscHSetICreate(&hta[i]);
1204: }
1205: MatGetLocalSize(A,&am,NULL);
1206: MatGetOwnershipRange(A,&arstart,&arend);
1207: /* Create hash table to merge all columns for C(i, :) */
1208: PetscHSetICreate(&ht);
1210: ISGetIndices(map,&mappingindices);
1211: ptap->c_rmti[0] = 0;
1212: /* 2) Pass 1: calculate the size for C_rmt (a matrix that needs to be sent to other processors) */
1213: for (i=0; i<am && pon; i++) {
1214: /* Form one row of AP */
1215: PetscHSetIClear(ht);
1216: offset = i%dof;
1217: ii = i/dof;
1218: /* If the off diag is empty, we should not do any calculation */
1219: nzi = po->i[ii+1] - po->i[ii];
1220: if (!nzi) continue;
1222: MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,ht);
1223: PetscHSetIGetSize(ht,&htsize);
1224: /* If AP is empty, just continue */
1225: if (!htsize) continue;
1226: /* Form C(ii, :) */
1227: poj = po->j + po->i[ii];
1228: for (j=0; j<nzi; j++) {
1229: PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);
1230: }
1231: }
1233: for (i=0; i<pon; i++) {
1234: PetscHSetIGetSize(hta[i],&htsize);
1235: ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1236: c_rmtc[i] = htsize;
1237: }
1239: PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);
1241: for (i=0; i<pon; i++) {
1242: off = 0;
1243: PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);
1244: PetscHSetIDestroy(&hta[i]);
1245: }
1246: PetscFree(hta);
1248: PetscMalloc1(pon,&iremote);
1249: for (i=0; i<pon; i++) {
1250: owner = 0; lidx = 0;
1251: offset = i%dof;
1252: ii = i/dof;
1253: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);
1254: iremote[i].index = lidx*dof + offset;
1255: iremote[i].rank = owner;
1256: }
1258: PetscSFCreate(comm,&sf);
1259: PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1260: /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1261: PetscSFSetRankOrder(sf,PETSC_TRUE);
1262: PetscSFSetFromOptions(sf);
1263: PetscSFSetUp(sf);
1264: /* How many neighbors have contributions to my rows? */
1265: PetscSFComputeDegreeBegin(sf,&rootdegrees);
1266: PetscSFComputeDegreeEnd(sf,&rootdegrees);
1267: rootspacesize = 0;
1268: for (i = 0; i < pn; i++) {
1269: rootspacesize += rootdegrees[i];
1270: }
1271: PetscMalloc1(rootspacesize,&rootspace);
1272: PetscMalloc1(rootspacesize+1,&rootspaceoffsets);
1273: /* Get information from leaves:
1274: * the number of columns other processes contribute to my rows
1275: */
1276: PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);
1277: PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);
1278: PetscFree(c_rmtc);
1279: PetscCalloc1(pn+1,&ptap->c_othi);
1280: /* The number of columns received for each row */
1281: ptap->c_othi[0] = 0;
1282: rootspacesize = 0;
1283: rootspaceoffsets[0] = 0;
1284: for (i = 0; i < pn; i++) {
1285: rcvncols = 0;
1286: for (j = 0; j<rootdegrees[i]; j++) {
1287: rcvncols += rootspace[rootspacesize];
1288: rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1289: rootspacesize++;
1290: }
1291: ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1292: }
1293: PetscFree(rootspace);
1295: PetscMalloc1(pon,&c_rmtoffsets);
1296: PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1297: PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1298: PetscSFDestroy(&sf);
1299: PetscFree(rootspaceoffsets);
1301: PetscCalloc1(ptap->c_rmti[pon],&iremote);
1302: nleaves = 0;
1303: for (i = 0; i<pon; i++) {
1304: owner = 0;
1305: ii = i/dof;
1306: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);
1307: sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1308: for (j=0; j<sendncols; j++) {
1309: iremote[nleaves].rank = owner;
1310: iremote[nleaves++].index = c_rmtoffsets[i] + j;
1311: }
1312: }
1313: PetscFree(c_rmtoffsets);
1314: PetscCalloc1(ptap->c_othi[pn],&c_othj);
1316: PetscSFCreate(comm,&ptap->sf);
1317: PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1318: PetscSFSetFromOptions(ptap->sf);
1319: /* One to one map */
1320: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1322: PetscMalloc2(pn,&dnz,pn,&onz);
1323: PetscHSetICreate(&oht);
1324: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
1325: pcstart *= dof;
1326: pcend *= dof;
1327: PetscMalloc2(pn,&hta,pn,&hto);
1328: for (i=0; i<pn; i++) {
1329: PetscHSetICreate(&hta[i]);
1330: PetscHSetICreate(&hto[i]);
1331: }
1332: /* Work on local part */
1333: /* 4) Pass 1: Estimate memory for C_loc */
1334: for (i=0; i<am && pn; i++) {
1335: PetscHSetIClear(ht);
1336: PetscHSetIClear(oht);
1337: offset = i%dof;
1338: ii = i/dof;
1339: nzi = pd->i[ii+1] - pd->i[ii];
1340: if (!nzi) continue;
1342: MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);
1343: PetscHSetIGetSize(ht,&htsize);
1344: PetscHSetIGetSize(oht,&htosize);
1345: if (!(htsize+htosize)) continue;
1346: /* Form C(ii, :) */
1347: pdj = pd->j + pd->i[ii];
1348: for (j=0; j<nzi; j++) {
1349: PetscHSetIUpdate(hta[pdj[j]*dof+offset],ht);
1350: PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);
1351: }
1352: }
1354: ISRestoreIndices(map,&mappingindices);
1356: PetscHSetIDestroy(&ht);
1357: PetscHSetIDestroy(&oht);
1359: /* Get remote data */
1360: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1361: PetscFree(c_rmtj);
1363: for (i = 0; i < pn; i++) {
1364: nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1365: rdj = c_othj + ptap->c_othi[i];
1366: for (j = 0; j < nzi; j++) {
1367: col = rdj[j];
1368: /* diag part */
1369: if (col>=pcstart && col<pcend) {
1370: PetscHSetIAdd(hta[i],col);
1371: } else { /* off diag */
1372: PetscHSetIAdd(hto[i],col);
1373: }
1374: }
1375: PetscHSetIGetSize(hta[i],&htsize);
1376: dnz[i] = htsize;
1377: PetscHSetIDestroy(&hta[i]);
1378: PetscHSetIGetSize(hto[i],&htsize);
1379: onz[i] = htsize;
1380: PetscHSetIDestroy(&hto[i]);
1381: }
1383: PetscFree2(hta,hto);
1384: PetscFree(c_othj);
1386: /* local sizes and preallocation */
1387: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1388: MatSetBlockSizes(Cmpi,dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);
1389: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1390: MatSetUp(Cmpi);
1391: PetscFree2(dnz,onz);
1393: /* attach the supporting struct to Cmpi for reuse */
1394: c = (Mat_MPIAIJ*)Cmpi->data;
1395: c->ap = ptap;
1396: ptap->duplicate = Cmpi->ops->duplicate;
1397: ptap->destroy = Cmpi->ops->destroy;
1398: ptap->view = Cmpi->ops->view;
1400: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1401: Cmpi->assembled = PETSC_FALSE;
1402: /* pick an algorithm */
1403: PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
1404: alg = 0;
1405: PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
1406: PetscOptionsEnd();
1407: switch (alg) {
1408: case 0:
1409: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1410: break;
1411: case 1:
1412: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1413: break;
1414: default:
1415: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unsupported allatonce numerical algorithm");
1416: }
1417: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1418: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
1419: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1420: *C = Cmpi;
1421: return(0);
1422: }
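/* Hedged sketch of the star-forest exchange set up above (a standalone illustration, not code
   from this file): leaves hold column data destined for remote rows, roots are the locally
   owned slots, and PetscSFReduce with MPIU_REPLACE moves the data over in one shot, exactly
   as done with c_rmtj/c_othj. */
#include <petscsf.h>

static PetscErrorCode MoveLeafDataToRoots(MPI_Comm comm,PetscInt nroots,PetscInt nleaves,
                                          PetscSFNode *iremote,const PetscInt *leafdata,PetscInt *rootdata)
{
  PetscSF        sf;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nroots,nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  ierr = PetscSFReduceBegin(sf,MPIU_INT,leafdata,rootdata,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sf,MPIU_INT,leafdata,rootdata,MPIU_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}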
1424: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat *C)
1425: {
1426: PetscErrorCode ierr;
1430: MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A,P,1,fill,C);
1431: return(0);
1432: }
1434: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat *C)
1435: {
1436: Mat_APMPI *ptap;
1437: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c;
1438: MPI_Comm comm;
1439: Mat Cmpi;
1440: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
1441: MatType mtype;
1442: PetscSF sf;
1443: PetscSFNode *iremote;
1444: PetscInt rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1445: const PetscInt *rootdegrees;
1446: PetscHSetI ht,oht,*hta,*hto,*htd;
1447: PetscInt pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1448: PetscInt owner,lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1449: PetscInt nalg=2,alg=0,offset,ii;
1450: PetscBool flg;
1451: const char *algTypes[2] = {"merged","overlapping"};
1452: const PetscInt *mappingindices;
1453: IS map;
1454: PetscErrorCode ierr;
1457: PetscObjectGetComm((PetscObject)A,&comm);
1459: /* Create symbolic parallel matrix Cmpi */
1460: MatGetLocalSize(P,NULL,&pn);
1461: pn *= dof;
1462: MatCreate(comm,&Cmpi);
1463: MatGetType(A,&mtype);
1464: MatSetType(Cmpi,mtype);
1465: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1467: PetscNew(&ptap);
1468: ptap->reuse = MAT_INITIAL_MATRIX;
1469: ptap->algType = 3;
1471: /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1472: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);
1473: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
1475: /* This equals the number of off-diagonal columns in P */
1476: MatGetLocalSize(p->B,NULL,&pon);
1477: pon *= dof;
1478: /* offsets */
1479: PetscMalloc1(pon+1,&ptap->c_rmti);
1480: /* The number of columns we will send to remote ranks */
1481: PetscMalloc1(pon,&c_rmtc);
1482: PetscMalloc1(pon,&hta);
1483: for (i=0; i<pon; i++) {
1484: PetscHSetICreate(&hta[i]);
1485: }
1486: MatGetLocalSize(A,&am,NULL);
1487: MatGetOwnershipRange(A,&arstart,&arend);
1488: /* Create hash table to merge all columns for C(i, :) */
1489: PetscHSetICreate(&ht);
1490: PetscHSetICreate(&oht);
1491: PetscMalloc2(pn,&htd,pn,&hto);
1492: for (i=0; i<pn; i++) {
1493: PetscHSetICreate(&htd[i]);
1494: PetscHSetICreate(&hto[i]);
1495: }
1497: ISGetIndices(map,&mappingindices);
1498: ptap->c_rmti[0] = 0;
1499: /* 2) Pass 1: calculate the size for C_rmt (a matrix that needs to be sent to other processors) */
1500: for (i=0; i<am && (pon || pn); i++) {
1501: /* Form one row of AP */
1502: PetscHSetIClear(ht);
1503: PetscHSetIClear(oht);
1504: offset = i%dof;
1505: ii = i/dof;
1506: /* If the off diag is empty, we should not do any calculation */
1507: nzi = po->i[ii+1] - po->i[ii];
1508: dnzi = pd->i[ii+1] - pd->i[ii];
1509: if (!nzi && !dnzi) continue;
1511: MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);
1512: PetscHSetIGetSize(ht,&htsize);
1513: PetscHSetIGetSize(oht,&htosize);
1514: /* If AP is empty, just continue */
1515: if (!(htsize+htosize)) continue;
1517: /* Form remote C(ii, :) */
1518: poj = po->j + po->i[ii];
1519: for (j=0; j<nzi; j++) {
1520: PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);
1521: PetscHSetIUpdate(hta[poj[j]*dof+offset],oht);
1522: }
1524: /* Form local C(ii, :) */
1525: pdj = pd->j + pd->i[ii];
1526: for (j=0; j<dnzi; j++) {
1527: PetscHSetIUpdate(htd[pdj[j]*dof+offset],ht);
1528: PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);
1529: }
1530: }
1532: ISRestoreIndices(map,&mappingindices);
1534: PetscHSetIDestroy(&ht);
1535: PetscHSetIDestroy(&oht);
1537: for (i=0; i<pon; i++) {
1538: PetscHSetIGetSize(hta[i],&htsize);
1539: ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1540: c_rmtc[i] = htsize;
1541: }
1543: PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);
1545: for (i=0; i<pon; i++) {
1546: off = 0;
1547: PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);
1548: PetscHSetIDestroy(&hta[i]);
1549: }
1550: PetscFree(hta);
1552: PetscMalloc1(pon,&iremote);
1553: for (i=0; i<pon; i++) {
1554: owner = 0; lidx = 0;
1555: offset = i%dof;
1556: ii = i/dof;
1557: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);
1558: iremote[i].index = lidx*dof+offset;
1559: iremote[i].rank = owner;
1560: }
1562: PetscSFCreate(comm,&sf);
1563: PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1564: /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1565: PetscSFSetRankOrder(sf,PETSC_TRUE);
1566: PetscSFSetFromOptions(sf);
1567: PetscSFSetUp(sf);
1568: /* How many neighbors have contributions to my rows? */
1569: PetscSFComputeDegreeBegin(sf,&rootdegrees);
1570: PetscSFComputeDegreeEnd(sf,&rootdegrees);
1571: rootspacesize = 0;
1572: for (i = 0; i < pn; i++) {
1573: rootspacesize += rootdegrees[i];
1574: }
1575: PetscMalloc1(rootspacesize,&rootspace);
1576: PetscMalloc1(rootspacesize+1,&rootspaceoffsets);
1577: /* Get information from leaves:
1578: * the number of columns other processes contribute to my rows
1579: */
1580: PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);
1581: PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);
1582: PetscFree(c_rmtc);
1583: PetscMalloc1(pn+1,&ptap->c_othi);
1584: /* The number of columns received for each row */
1585: ptap->c_othi[0] = 0;
1586: rootspacesize = 0;
1587: rootspaceoffsets[0] = 0;
1588: for (i = 0; i < pn; i++) {
1589: rcvncols = 0;
1590: for (j = 0; j<rootdegrees[i]; j++) {
1591: rcvncols += rootspace[rootspacesize];
1592: rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1593: rootspacesize++;
1594: }
1595: ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1596: }
1597: PetscFree(rootspace);
1599: PetscMalloc1(pon,&c_rmtoffsets);
1600: PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1601: PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1602: PetscSFDestroy(&sf);
1603: PetscFree(rootspaceoffsets);
1605: PetscCalloc1(ptap->c_rmti[pon],&iremote);
1606: nleaves = 0;
1607: for (i = 0; i<pon; i++) {
1608: owner = 0;
1609: ii = i/dof;
1610: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);
1611: sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1612: for (j=0; j<sendncols; j++) {
1613: iremote[nleaves].rank = owner;
1614: iremote[nleaves++].index = c_rmtoffsets[i] + j;
1615: }
1616: }
1617: PetscFree(c_rmtoffsets);
1618: PetscCalloc1(ptap->c_othi[pn],&c_othj);
1620: PetscSFCreate(comm,&ptap->sf);
1621: PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1622: PetscSFSetFromOptions(ptap->sf);
1623: /* One to one map */
1624: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1625: /* Get remote data */
1626: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1627: PetscFree(c_rmtj);
1628: PetscMalloc2(pn,&dnz,pn,&onz);
1629: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
1630: pcstart *= dof;
1631: pcend *= dof;
1632: for (i = 0; i < pn; i++) {
1633: nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1634: rdj = c_othj + ptap->c_othi[i];
1635: for (j = 0; j < nzi; j++) {
1636: col = rdj[j];
1637: /* diag part */
1638: if (col>=pcstart && col<pcend) {
1639: PetscHSetIAdd(htd[i],col);
1640: } else { /* off diag */
1641: PetscHSetIAdd(hto[i],col);
1642: }
1643: }
1644: PetscHSetIGetSize(htd[i],&htsize);
1645: dnz[i] = htsize;
1646: PetscHSetIDestroy(&htd[i]);
1647: PetscHSetIGetSize(hto[i],&htsize);
1648: onz[i] = htsize;
1649: PetscHSetIDestroy(&hto[i]);
1650: }
1652: PetscFree2(htd,hto);
1653: PetscFree(c_othj);
1655: /* local sizes and preallocation */
1656: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1657: MatSetBlockSizes(Cmpi, dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);
1658: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1659: PetscFree2(dnz,onz);
1661: /* attach the supporting struct to Cmpi for reuse */
1662: c = (Mat_MPIAIJ*)Cmpi->data;
1663: c->ap = ptap;
1664: ptap->duplicate = Cmpi->ops->duplicate;
1665: ptap->destroy = Cmpi->ops->destroy;
1666: ptap->view = Cmpi->ops->view;
1668: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1669: Cmpi->assembled = PETSC_FALSE;
1670: /* pick an algorithm */
1671: PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
1672: alg = 0;
1673: PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
1674: PetscOptionsEnd();
1675: switch (alg) {
1676: case 0:
1677: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1678: break;
1679: case 1:
1680: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1681: break;
1682: default:
1683: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm \n");
1684: }
1685: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1686: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
1687: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1688: *C = Cmpi;
1689: return(0);
1690: }
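The preallocation in the routine above leans on PetscSF: each locally computed but remotely owned row of C is a leaf pointing at the root (row) that owns it, PetscSFComputeDegree counts how many remote contributors each owned row has, a gather moves the per-contribution column counts (c_rmtc) to the owners, and a scatter returns offsets into the owners' receive buffer. The stand-alone sketch below is not part of mpiptap.c; the star-shaped graph and the leaf payload are made up for illustration, and error checking is omitted for brevity. It only shows the degree/gather pattern in isolation.

#include <petscsf.h>

int main(int argc,char **argv)
{
  PetscSF        sf;
  PetscSFNode    remote;
  PetscMPIInt    rank;
  const PetscInt *degree;
  PetscInt       nroots = 1,leafdata,*rootspace,rootspacesize = 0,i;

  PetscInitialize(&argc,&argv,NULL,NULL);
  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);

  remote.rank  = 0;           /* every rank has one leaf that points at root 0 on rank 0 */
  remote.index = 0;
  leafdata     = 10*(rank+1); /* payload, e.g. "number of columns I will send"            */

  PetscSFCreate(PETSC_COMM_WORLD,&sf);
  PetscSFSetGraph(sf,nroots,1,NULL,PETSC_COPY_VALUES,&remote,PETSC_COPY_VALUES);
  PetscSFSetFromOptions(sf);

  /* How many leaves (contributors) reference each of my roots? */
  PetscSFComputeDegreeBegin(sf,&degree);
  PetscSFComputeDegreeEnd(sf,&degree);
  for (i=0; i<nroots; i++) rootspacesize += degree[i];

  /* Gather one entry per contributing leaf into the multi-root space */
  PetscMalloc1(rootspacesize,&rootspace);
  PetscSFGatherBegin(sf,MPIU_INT,&leafdata,rootspace);
  PetscSFGatherEnd(sf,MPIU_INT,&leafdata,rootspace);
  if (!rank) {
    for (i=0; i<rootspacesize; i++) {
      PetscPrintf(PETSC_COMM_SELF,"contribution %D: %D\n",i,rootspace[i]);
    }
  }
  PetscFree(rootspace);
  PetscSFDestroy(&sf);
  PetscFinalize();
  return 0;
}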
1692: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat *C)
1693: {
1694: PetscErrorCode ierr;
1698: MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,fill,C);
1699: return(0);
1700: }
1702: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
1703: {
1704: PetscErrorCode ierr;
1705: Mat_APMPI *ptap;
1706: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
1707: MPI_Comm comm;
1708: PetscMPIInt size,rank;
1709: Mat Cmpi;
1710: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1711: PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
1712: PetscInt *lnk,i,k,pnz,row,nsend;
1713: PetscBT lnkbt;
1714: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
1715: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1716: PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
1717: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1718: MPI_Request *swaits,*rwaits;
1719: MPI_Status *sstatus,rstatus;
1720: PetscLayout rowmap;
1721: PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */
1722: PetscMPIInt *len_r,*id_r; /* lengths and sender ids of the messages to be received */
1723: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
1724: Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
1725: PetscScalar *apv;
1726: PetscTable ta;
1727: MatType mtype;
1728: const char *prefix;
1729: #if defined(PETSC_USE_INFO)
1730: PetscReal apfill;
1731: #endif
1734: PetscObjectGetComm((PetscObject)A,&comm);
1735: MPI_Comm_size(comm,&size);
1736: MPI_Comm_rank(comm,&rank);
1738: if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
1740: /* create symbolic parallel matrix Cmpi */
1741: MatCreate(comm,&Cmpi);
1742: MatGetType(A,&mtype);
1743: MatSetType(Cmpi,mtype);
1745: /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1746: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1748: /* create struct Mat_APMPI and attach it to C later */
1749: PetscNew(&ptap);
1750: ptap->reuse = MAT_INITIAL_MATRIX;
1751: ptap->algType = 1;
1753: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1754: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1755: /* get P_loc by taking all local rows of P */
1756: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
1758: /* (0) compute Rd = Pd^T, Ro = Po^T */
1759: /* --------------------------------- */
1760: MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
1761: MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
1763: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
1764: /* ----------------------------------------------------------------- */
1765: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1766: if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1768: /* create and initialize a linked list */
1769: PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
1770: MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
1771: MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
1772: PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
1773: /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
1775: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
1777: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1778: if (ao) {
1779: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
1780: } else {
1781: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
1782: }
1783: current_space = free_space;
1784: nspacedouble = 0;
1786: PetscMalloc1(am+1,&api);
1787: api[0] = 0;
1788: for (i=0; i<am; i++) {
1789: /* diagonal portion: Ad[i,:]*P */
1790: ai = ad->i; pi = p_loc->i;
1791: nzi = ai[i+1] - ai[i];
1792: aj = ad->j + ai[i];
1793: for (j=0; j<nzi; j++) {
1794: row = aj[j];
1795: pnz = pi[row+1] - pi[row];
1796: Jptr = p_loc->j + pi[row];
1797: /* add non-zero cols of P into the sorted linked list lnk */
1798: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1799: }
1800: /* off-diagonal portion: Ao[i,:]*P */
1801: if (ao) {
1802: ai = ao->i; pi = p_oth->i;
1803: nzi = ai[i+1] - ai[i];
1804: aj = ao->j + ai[i];
1805: for (j=0; j<nzi; j++) {
1806: row = aj[j];
1807: pnz = pi[row+1] - pi[row];
1808: Jptr = p_oth->j + pi[row];
1809: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1810: }
1811: }
1812: apnz = lnk[0];
1813: api[i+1] = api[i] + apnz;
1814: if (ap_rmax < apnz) ap_rmax = apnz;
1816: /* if free space is not available, double the total space in the list */
1817: if (current_space->local_remaining<apnz) {
1818: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
1819: nspacedouble++;
1820: }
1822: /* Copy data into free space, then initialize lnk */
1823: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
1825: current_space->array += apnz;
1826: current_space->local_used += apnz;
1827: current_space->local_remaining -= apnz;
1828: }
1829: /* Allocate space for apj and apv, initialize apj, and */
1830: /* destroy list of free space and other temporary array(s) */
1831: PetscMalloc2(api[am],&apj,api[am],&apv);
1832: PetscFreeSpaceContiguous(&free_space,apj);
1833: PetscLLDestroy(lnk,lnkbt);
1835: /* Create AP_loc for reuse */
1836: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
1838: #if defined(PETSC_USE_INFO)
1839: if (ao) {
1840: apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
1841: } else {
1842: apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
1843: }
1844: ptap->AP_loc->info.mallocs = nspacedouble;
1845: ptap->AP_loc->info.fill_ratio_given = fill;
1846: ptap->AP_loc->info.fill_ratio_needed = apfill;
1848: if (api[am]) {
1849: PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
1850: PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
1851: } else {
1852: PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");
1853: }
1854: #endif
1856: /* (2-1) compute symbolic Co = Ro*AP_loc */
1857: /* ------------------------------------ */
1858: MatGetOptionsPrefix(A,&prefix);
1859: MatSetOptionsPrefix(ptap->Ro,prefix);
1860: MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");
1861: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);
1863: /* (3) send coj of C_oth to other processors */
1864: /* ------------------------------------------ */
1865: /* determine row ownership */
1866: PetscLayoutCreate(comm,&rowmap);
1867: rowmap->n = pn;
1868: rowmap->bs = 1;
1869: PetscLayoutSetUp(rowmap);
1870: owners = rowmap->range;
1872: /* determine the number of messages to send, their lengths */
1873: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
1874: PetscArrayzero(len_s,size);
1875: PetscArrayzero(len_si,size);
1877: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
1878: coi = c_oth->i; coj = c_oth->j;
1879: con = ptap->C_oth->rmap->n;
1880: proc = 0;
1881: for (i=0; i<con; i++) {
1882: while (prmap[i] >= owners[proc+1]) proc++;
1883: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1884: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1885: }
1887: len = 0; /* max length of buf_si[], see (4) */
1888: owners_co[0] = 0;
1889: nsend = 0;
1890: for (proc=0; proc<size; proc++) {
1891: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1892: if (len_s[proc]) {
1893: nsend++;
1894: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1895: len += len_si[proc];
1896: }
1897: }
1899: /* determine the number and length of messages to receive for coi and coj */
1900: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
1901: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
1903: /* post the Irecv and Isend of coj */
1904: PetscCommGetNewTag(comm,&tagj);
1905: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
1906: PetscMalloc1(nsend+1,&swaits);
1907: for (proc=0, k=0; proc<size; proc++) {
1908: if (!len_s[proc]) continue;
1909: i = owners_co[proc];
1910: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1911: k++;
1912: }
1914: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1915: /* ---------------------------------------- */
1916: MatSetOptionsPrefix(ptap->Rd,prefix);
1917: MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");
1918: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
1919: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
1921: /* receives of coj are complete */
1922: for (i=0; i<nrecv; i++) {
1923: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
1924: }
1925: PetscFree(rwaits);
1926: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
1928: /* add received column indices into ta to update Crmax */
1929: for (k=0; k<nrecv; k++) {/* k-th received message */
1930: Jptr = buf_rj[k];
1931: for (j=0; j<len_r[k]; j++) {
1932: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
1933: }
1934: }
1935: PetscTableGetCount(ta,&Crmax);
1936: PetscTableDestroy(&ta);
1938: /* (4) send and recv coi */
1939: /*-----------------------*/
1940: PetscCommGetNewTag(comm,&tagi);
1941: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
1942: PetscMalloc1(len+1,&buf_s);
1943: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1944: for (proc=0,k=0; proc<size; proc++) {
1945: if (!len_s[proc]) continue;
1946: /* form outgoing message for i-structure:
1947: buf_si[0]: nrows to be sent
1948: [1:nrows]: row index (global)
1949: [nrows+1:2*nrows+1]: i-structure index
1950: */
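      /* For example, two rows with 4 and 2 nonzeros destined for [proc] produce
         buf_si = {2, r0, r1, 0, 4, 6} (len_si[proc] = 2*(2+1) = 6), where r0 and r1
         are the local row indices of those rows on [proc]. */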
1951: /*-------------------------------------------*/
1952: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
1953: buf_si_i = buf_si + nrows+1;
1954: buf_si[0] = nrows;
1955: buf_si_i[0] = 0;
1956: nrows = 0;
1957: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1958: nzi = coi[i+1] - coi[i];
1959: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1960: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1961: nrows++;
1962: }
1963: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1964: k++;
1965: buf_si += len_si[proc];
1966: }
1967: for (i=0; i<nrecv; i++) {
1968: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
1969: }
1970: PetscFree(rwaits);
1971: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
1973: PetscFree4(len_s,len_si,sstatus,owners_co);
1974: PetscFree(len_ri);
1975: PetscFree(swaits);
1976: PetscFree(buf_s);
1978: /* (5) compute the local portion of Cmpi */
1979: /* ------------------------------------------ */
1980: /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
1981: PetscFreeSpaceGet(Crmax,&free_space);
1982: current_space = free_space;
1984: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
1985: for (k=0; k<nrecv; k++) {
1986: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1987: nrows = *buf_ri_k[k];
1988: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th received i-structure */
1989: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1990: }
1992: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
1993: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
1994: for (i=0; i<pn; i++) {
1995: /* add C_loc into Cmpi */
1996: nzi = c_loc->i[i+1] - c_loc->i[i];
1997: Jptr = c_loc->j + c_loc->i[i];
1998: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
2000: /* add received col data into lnk */
2001: for (k=0; k<nrecv; k++) { /* k-th received message */
2002: if (i == *nextrow[k]) { /* i-th row */
2003: nzi = *(nextci[k]+1) - *nextci[k];
2004: Jptr = buf_rj[k] + *nextci[k];
2005: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
2006: nextrow[k]++; nextci[k]++;
2007: }
2008: }
2009: nzi = lnk[0];
2011: /* copy data into free space, then initialize lnk */
2012: PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);
2013: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
2014: }
2015: PetscFree3(buf_ri_k,nextrow,nextci);
2016: PetscLLDestroy(lnk,lnkbt);
2017: PetscFreeSpaceDestroy(free_space);
2019: /* local sizes and preallocation */
2020: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
2021: if (P->cmap->bs > 0) {
2022: PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);
2023: PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);
2024: }
2025: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
2026: MatPreallocateFinalize(dnz,onz);
2028: /* members in merge */
2029: PetscFree(id_r);
2030: PetscFree(len_r);
2031: PetscFree(buf_ri[0]);
2032: PetscFree(buf_ri);
2033: PetscFree(buf_rj[0]);
2034: PetscFree(buf_rj);
2035: PetscLayoutDestroy(&rowmap);
2037: /* attach the supporting struct to Cmpi for reuse */
2038: c = (Mat_MPIAIJ*)Cmpi->data;
2039: c->ap = ptap;
2040: ptap->duplicate = Cmpi->ops->duplicate;
2041: ptap->destroy = Cmpi->ops->destroy;
2042: ptap->view = Cmpi->ops->view;
2043: PetscCalloc1(pN,&ptap->apa);
2045: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
2046: Cmpi->assembled = PETSC_FALSE;
2047: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
2048: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
2049: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
2050: *C = Cmpi;
2051: return(0);
2052: }
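This symbolic routine is normally reached through MatPtAP() with MAT_INITIAL_MATRIX; a subsequent call with MAT_REUSE_MATRIX skips straight to the numeric phase and reuses the Rd, Ro, AP_loc, C_loc and C_oth kept in the attached Mat_APMPI struct. A minimal sketch of that calling pattern follows; the wrapper name GalerkinCoarsen is hypothetical, not a PETSc routine.

#include <petscmat.h>

/* Hypothetical user-side helper: build C = P^T*A*P once, then refresh its values
   after A changes numerically but keeps the same nonzero pattern. */
static PetscErrorCode GalerkinCoarsen(Mat A,Mat P,Mat *C,PetscBool firsttime)
{
  PetscErrorCode ierr;

  PetscFunctionBeginUser;
  if (firsttime) {
    /* symbolic + first numeric phase; fill is the guessed ratio nnz(C)/(nnz(A)+nnz(P)) */
    ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,C);CHKERRQ(ierr);
  } else {
    /* numeric phase only; reuses the intermediate products stored with C */
    ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,C);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}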
2054: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
2055: {
2056: PetscErrorCode ierr;
2057: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
2058: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
2059: Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq;
2060: Mat_APMPI *ptap = c->ap;
2061: Mat AP_loc,C_loc,C_oth;
2062: PetscInt i,rstart,rend,cm,ncols,row;
2063: PetscInt *api,*apj,am = A->rmap->n,j,col,apnz;
2064: PetscScalar *apa;
2065: const PetscInt *cols;
2066: const PetscScalar *vals;
2069: if (!ptap->AP_loc) {
2070: MPI_Comm comm;
2071: PetscObjectGetComm((PetscObject)C,&comm);
2072: SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
2073: }
2075: MatZeroEntries(C);
2076: /* 1) get R = Pd^T,Ro = Po^T */
2077: if (ptap->reuse == MAT_REUSE_MATRIX) {
2078: MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
2079: MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
2080: }
2082: /* 2) get AP_loc */
2083: AP_loc = ptap->AP_loc;
2084: ap = (Mat_SeqAIJ*)AP_loc->data;
2086: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
2087: /*-----------------------------------------------------*/
2088: if (ptap->reuse == MAT_REUSE_MATRIX) {
2089: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
2090: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
2091: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
2092: }
2094: /* 2-2) compute numeric A_loc*P - dominating part */
2095: /* ---------------------------------------------- */
2096: /* get data from symbolic products */
2097: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
2098: if (ptap->P_oth) {
2099: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
2100: }
2101: apa = ptap->apa;
2102: api = ap->i;
2103: apj = ap->j;
2104: for (i=0; i<am; i++) {
2105: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
2106: AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
2107: apnz = api[i+1] - api[i];
2108: for (j=0; j<apnz; j++) {
2109: col = apj[j+api[i]];
2110: ap->a[j+ap->i[i]] = apa[col];
2111: apa[col] = 0.0;
2112: }
2113: }
2115: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
2116: ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);
2117: ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);
2118: C_loc = ptap->C_loc;
2119: C_oth = ptap->C_oth;
2121: /* add C_loc and C_oth to C */
2122: MatGetOwnershipRange(C,&rstart,&rend);
2124: /* C_loc -> C */
2125: cm = C_loc->rmap->N;
2126: c_seq = (Mat_SeqAIJ*)C_loc->data;
2127: cols = c_seq->j;
2128: vals = c_seq->a;
2131: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
2132: /* when there are no off-processor parts. */
2133: /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
2134: /* is no longer valid. Then the more general MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
2135: /* a table and additional bookkeeping is required. */
2136: if (C->assembled) {
2137: C->was_assembled = PETSC_TRUE;
2138: C->assembled = PETSC_FALSE;
2139: }
2140: if (C->was_assembled) {
2141: for (i=0; i<cm; i++) {
2142: ncols = c_seq->i[i+1] - c_seq->i[i];
2143: row = rstart + i;
2144: MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
2145: cols += ncols; vals += ncols;
2146: }
2147: } else {
2148: MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
2149: }
2151: /* Co -> C, off-processor part */
2152: cm = C_oth->rmap->N;
2153: c_seq = (Mat_SeqAIJ*)C_oth->data;
2154: cols = c_seq->j;
2155: vals = c_seq->a;
2156: for (i=0; i<cm; i++) {
2157: ncols = c_seq->i[i+1] - c_seq->i[i];
2158: row = p->garray[i];
2159: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
2160: cols += ncols; vals += ncols;
2161: }
2163: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
2164: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
2166: ptap->reuse = MAT_REUSE_MATRIX;
2168: /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated with new values of A and P */
2169: if (ptap->freestruct) {
2170: MatFreeIntermediateDataStructures(C);
2171: }
2172: return(0);
2173: }
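The freestruct branch above is what the runtime option named in the earlier error check (-mat_freeintermediatedatastructures) toggles; the same cleanup can be requested explicitly once no further MAT_REUSE_MATRIX updates of C are planned. The short sketch below assumes MatFreeIntermediateDataStructures() as called in the numeric routine above; FinalizePtAP is a made-up name, not a PETSc routine.

#include <petscmat.h>

/* Hypothetical helper: compute C = P^T*A*P a single time and immediately drop the
   Mat_APMPI scratch data (AP_loc, C_loc, C_oth, ...). After this, a MAT_REUSE_MATRIX
   MatPtAP() on C fails, as enforced at the top of MatPtAPNumeric_MPIAIJ_MPIAIJ(). */
static PetscErrorCode FinalizePtAP(Mat A,Mat P,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBeginUser;
  ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,PETSC_DEFAULT,C);CHKERRQ(ierr);
  ierr = MatFreeIntermediateDataStructures(*C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}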