Actual source code: mpiptap.c

petsc-3.13.6 2020-09-29

  2: /*
  3:   Defines projective product routines where A is a MPIAIJ matrix
  4:           C = P^T * A * P
  5: */

  7:  #include <../src/mat/impls/aij/seq/aij.h>
  8:  #include <../src/mat/utils/freespace.h>
  9:  #include <../src/mat/impls/aij/mpi/mpiaij.h>
 10:  #include <petscbt.h>
 11:  #include <petsctime.h>
 12:  #include <petsc/private/hashmapiv.h>
 13:  #include <petsc/private/hashseti.h>
 14:  #include <petscsf.h>


 17: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
 18: {
 19:   PetscErrorCode    ierr;
 20:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data;
 21:   Mat_APMPI         *ptap=a->ap;
 22:   PetscBool         iascii;
 23:   PetscViewerFormat format;

 26:   if (!ptap) {
 27:     /* hack: MatDuplicate() copies oldmat->ops->view to newmat, which is a base mat class with null ptap! */
 28:     A->ops->view = MatView_MPIAIJ;
 29:     (A->ops->view)(A,viewer);
 30:     return(0);
 31:   }

 33:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
 34:   if (iascii) {
 35:     PetscViewerGetFormat(viewer,&format);
 36:     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
 37:       if (ptap->algType == 0) {
 38:         PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");
 39:       } else if (ptap->algType == 1) {
 40:         PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");
 41:       } else if (ptap->algType == 2) {
 42:         PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n");
 43:       } else if (ptap->algType == 3) {
 44:         PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n");
 45:       }
 46:     }
 47:   }
 48:   (ptap->view)(A,viewer);
 49:   return(0);
 50: }
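
A minimal usage sketch (hypothetical parallel matrices A and P; error checking omitted, as in the listing): the algorithm report printed by this viewer routine appears when C = P^T*A*P is viewed with the ASCII info format.

    Mat C;
    MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);                            /* C = P^T*A*P */
    PetscViewerPushFormat(PETSC_VIEWER_STDOUT_WORLD,PETSC_VIEWER_ASCII_INFO);
    MatView(C,PETSC_VIEWER_STDOUT_WORLD);                              /* prints "using ... MatPtAP() implementation" */
    PetscViewerPopFormat(PETSC_VIEWER_STDOUT_WORLD);
    MatDestroy(&C);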

 52: PetscErrorCode MatFreeIntermediateDataStructures_MPIAIJ_AP(Mat A)
 53: {
 54:   PetscErrorCode      ierr;
 55:   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data;
 56:   Mat_APMPI           *ptap=a->ap;
 57:   Mat_Merge_SeqsToMPI *merge;

 60:   if (!ptap) return(0);

 62:   PetscFree2(ptap->startsj_s,ptap->startsj_r);
 63:   PetscFree(ptap->bufa);
 64:   MatDestroy(&ptap->P_loc);
 65:   MatDestroy(&ptap->P_oth);
 66:   MatDestroy(&ptap->A_loc); /* used by MatTransposeMatMult() */
 67:   MatDestroy(&ptap->Rd);
 68:   MatDestroy(&ptap->Ro);
 69:   if (ptap->AP_loc) { /* used by alg_rap */
 70:     Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
 71:     PetscFree(ap->i);
 72:     PetscFree2(ap->j,ap->a);
 73:     MatDestroy(&ptap->AP_loc);
 74:   } else { /* used by alg_ptap */
 75:     PetscFree(ptap->api);
 76:     PetscFree(ptap->apj);
 77:   }
 78:   MatDestroy(&ptap->C_loc);
 79:   MatDestroy(&ptap->C_oth);
 80:   if (ptap->apa) {PetscFree(ptap->apa);}

 82:   MatDestroy(&ptap->Pt);

 84:   merge=ptap->merge;
 85:   if (merge) { /* used by alg_ptap */
 86:     PetscFree(merge->id_r);
 87:     PetscFree(merge->len_s);
 88:     PetscFree(merge->len_r);
 89:     PetscFree(merge->bi);
 90:     PetscFree(merge->bj);
 91:     PetscFree(merge->buf_ri[0]);
 92:     PetscFree(merge->buf_ri);
 93:     PetscFree(merge->buf_rj[0]);
 94:     PetscFree(merge->buf_rj);
 95:     PetscFree(merge->coi);
 96:     PetscFree(merge->coj);
 97:     PetscFree(merge->owners_co);
 98:     PetscLayoutDestroy(&merge->rowmap);
 99:     PetscFree(ptap->merge);
100:   }
101:   ISLocalToGlobalMappingDestroy(&ptap->ltog);

103:   PetscSFDestroy(&ptap->sf);
104:   PetscFree(ptap->c_othi);
105:   PetscFree(ptap->c_rmti);
106:   return(0);
107: }
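
A hedged usage note (hypothetical A, P, C): when C will not be recomputed from updated A and P, the cached intermediates can be released explicitly; the freestruct branches at the end of the numeric routines below do the same automatically when -mat_freeintermediatedatastructures is set.

    MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);
    MatFreeIntermediateDataStructures(C);   /* drops P_loc, P_oth, AP_loc, ...; C stays valid, but a later   */
                                            /* MatPtAP(A,P,MAT_REUSE_MATRIX,...) call will error out         */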

109: PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
110: {
112:   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
113:   Mat_APMPI      *ptap=a->ap;

116:   (*A->ops->freeintermediatedatastructures)(A);
117:   (*ptap->destroy)(A); /* MatDestroy_MPIAIJ(A) */
118:   PetscFree(ptap);
119:   return(0);
120: }

122: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
123: {
124:   PetscErrorCode    ierr;
125:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
126:   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
127:   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
128:   Mat_APMPI         *ptap = c->ap;
129:   Mat               AP_loc,C_loc,C_oth;
130:   PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
131:   PetscScalar       *apa;
132:   const PetscInt    *cols;
133:   const PetscScalar *vals;

136:   if (!ptap->AP_loc) {
137:     MPI_Comm comm;
138:     PetscObjectGetComm((PetscObject)C,&comm);
139:     SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
140:   }

142:   MatZeroEntries(C);

144:   /* 1) get R = Pd^T,Ro = Po^T */
145:   if (ptap->reuse == MAT_REUSE_MATRIX) {
146:     MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
147:     MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
148:   }

150:   /* 2) get AP_loc */
151:   AP_loc = ptap->AP_loc;
152:   ap = (Mat_SeqAIJ*)AP_loc->data;

154:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
155:   /*-----------------------------------------------------*/
156:   if (ptap->reuse == MAT_REUSE_MATRIX) {
157:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
158:     MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
159:     MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
160:   }

162:   /* 2-2) compute numeric A_loc*P - dominating part */
163:   /* ---------------------------------------------- */
164:   /* get data from symbolic products */
165:   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
166:   if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;

168:   api   = ap->i;
169:   apj   = ap->j;
170:   ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj);
171:   for (i=0; i<am; i++) {
172:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
173:     apnz = api[i+1] - api[i];
174:     apa = ap->a + api[i];
175:     PetscArrayzero(apa,apnz);
176:     AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
177:   }
178:   ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj);
179:   if (api[AP_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",api[AP_loc->rmap->n],nout);

181:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
182:   /* Always use scalable version since we are in the MPI scalable version */
183:   MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);
184:   MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);

186:   C_loc = ptap->C_loc;
187:   C_oth = ptap->C_oth;

189:   /* add C_loc and C_oth to C */
190:   MatGetOwnershipRange(C,&rstart,&rend);

192:   /* C_loc -> C */
193:   cm    = C_loc->rmap->N;
194:   c_seq = (Mat_SeqAIJ*)C_loc->data;
195:   cols = c_seq->j;
196:   vals = c_seq->a;
197:   ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j);

199:   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
200:   /* when there are no off-processor parts. */
201:   /* If was_assembled is true, the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
202:   /* is no longer valid. The more general MatSetValues_MPIAIJ() must be used instead, which looks up the column index in */
203:   /* a table and does additional bookkeeping. */
204:   if (C->assembled) {
205:     C->was_assembled = PETSC_TRUE;
206:     C->assembled     = PETSC_FALSE;
207:   }
208:   if (C->was_assembled) {
209:     for (i=0; i<cm; i++) {
210:       ncols = c_seq->i[i+1] - c_seq->i[i];
211:       row = rstart + i;
212:       MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
213:       cols += ncols; vals += ncols;
214:     }
215:   } else {
216:     MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
217:   }
218:   ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j);
219:   if (c_seq->i[C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_loc->rmap->n],nout);

221:   /* Co -> C, off-processor part */
222:   cm = C_oth->rmap->N;
223:   c_seq = (Mat_SeqAIJ*)C_oth->data;
224:   cols = c_seq->j;
225:   vals = c_seq->a;
226:   ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j);
227:   for (i=0; i<cm; i++) {
228:     ncols = c_seq->i[i+1] - c_seq->i[i];
229:     row = p->garray[i];
230:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
231:     cols += ncols; vals += ncols;
232:   }
233:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
234:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

236:   ptap->reuse = MAT_REUSE_MATRIX;

238:   ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j);
239:   if (c_seq->i[C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_oth->rmap->n],nout);

241:   /* supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated by A and P */
242:   if (ptap->freestruct) {
243:     MatFreeIntermediateDataStructures(C);
244:   }
245:   return(0);
246: }
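
A minimal reuse sketch (hypothetical A and P with a fixed nonzero pattern): the first MatPtAP() call runs the symbolic phase below plus this numeric routine and caches Rd, Ro, AP_loc, C_loc and C_oth in ptap; the second call with MAT_REUSE_MATRIX re-runs only this numeric routine against the cached structures.

    Mat C;
    MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);   /* symbolic + numeric */
    /* ... update the values of A (same sparsity pattern) ... */
    MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);     /* numeric only, reuses ptap */
    MatDestroy(&C);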

248: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat Cmpi)
249: {
250:   PetscErrorCode      ierr;
251:   Mat_APMPI           *ptap;
252:   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
253:   MPI_Comm            comm;
254:   PetscMPIInt         size,rank;
255:   Mat                 P_loc,P_oth;
256:   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
257:   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
258:   PetscInt            *lnk,i,k,pnz,row,nsend;
259:   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
260:   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
261:   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
262:   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
263:   MPI_Request         *swaits,*rwaits;
264:   MPI_Status          *sstatus,rstatus;
265:   PetscLayout         rowmap;
266:   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
267:   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
268:   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
269:   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
270:   PetscScalar         *apv;
271:   PetscTable          ta;
272:   MatType             mtype;
273:   const char          *prefix;
274: #if defined(PETSC_USE_INFO)
275:   PetscReal           apfill;
276: #endif

279:   PetscObjectGetComm((PetscObject)A,&comm);
280:   MPI_Comm_size(comm,&size);
281:   MPI_Comm_rank(comm,&rank);

283:   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

285:   /* create symbolic parallel matrix Cmpi */
286:   MatGetType(A,&mtype);
287:   MatSetType(Cmpi,mtype);

289:   /* create struct Mat_APMPI and attach it to C later */
290:   PetscNew(&ptap);
291:   ptap->reuse = MAT_INITIAL_MATRIX;
292:   ptap->algType = 0;

294:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
295:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);
296:   /* get P_loc by taking all local rows of P */
297:   MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);

299:   ptap->P_loc = P_loc;
300:   ptap->P_oth = P_oth;

302:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
303:   /* --------------------------------- */
304:   MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
305:   MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);

307:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
308:   /* ----------------------------------------------------------------- */
309:   p_loc  = (Mat_SeqAIJ*)P_loc->data;
310:   if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;

312:   /* create and initialize a linked list */
313:   PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
314:   MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
315:   MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
316:   PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */

318:   PetscLLCondensedCreate_Scalable(Crmax,&lnk);

320:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
321:   if (ao) {
322:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
323:   } else {
324:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
325:   }
326:   current_space = free_space;
327:   nspacedouble  = 0;

329:   PetscMalloc1(am+1,&api);
330:   api[0] = 0;
331:   for (i=0; i<am; i++) {
332:     /* diagonal portion: Ad[i,:]*P */
333:     ai = ad->i; pi = p_loc->i;
334:     nzi = ai[i+1] - ai[i];
335:     aj  = ad->j + ai[i];
336:     for (j=0; j<nzi; j++) {
337:       row  = aj[j];
338:       pnz  = pi[row+1] - pi[row];
339:       Jptr = p_loc->j + pi[row];
340:       /* add non-zero cols of P into the sorted linked list lnk */
341:       PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
342:     }
343:     /* off-diagonal portion: Ao[i,:]*P */
344:     if (ao) {
345:       ai = ao->i; pi = p_oth->i;
346:       nzi = ai[i+1] - ai[i];
347:       aj  = ao->j + ai[i];
348:       for (j=0; j<nzi; j++) {
349:         row  = aj[j];
350:         pnz  = pi[row+1] - pi[row];
351:         Jptr = p_oth->j + pi[row];
352:         PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
353:       }
354:     }
355:     apnz     = lnk[0];
356:     api[i+1] = api[i] + apnz;

358:     /* if free space is not available, double the total space in the list */
359:     if (current_space->local_remaining<apnz) {
360:       PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
361:       nspacedouble++;
362:     }

364:     /* Copy data into free space, then initialize lnk */
365:     PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);

367:     current_space->array           += apnz;
368:     current_space->local_used      += apnz;
369:     current_space->local_remaining -= apnz;
370:   }
371:   /* Allocate space for apj and apv, initialize apj, and */
372:   /* destroy list of free space and other temporary array(s) */
373:   PetscCalloc2(api[am],&apj,api[am],&apv);
374:   PetscFreeSpaceContiguous(&free_space,apj);
375:   PetscLLCondensedDestroy_Scalable(lnk);

377:   /* Create AP_loc for reuse */
378:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
379:   MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog);

381: #if defined(PETSC_USE_INFO)
382:   if (ao) {
383:     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
384:   } else {
385:     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
386:   }
387:   ptap->AP_loc->info.mallocs           = nspacedouble;
388:   ptap->AP_loc->info.fill_ratio_given  = fill;
389:   ptap->AP_loc->info.fill_ratio_needed = apfill;

391:   if (api[am]) {
392:     PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
393:     PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
394:   } else {
395:     PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty\n");
396:   }
397: #endif

399:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
400:   /* -------------------------------------- */
401:   MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth);
402:   MatGetOptionsPrefix(A,&prefix);
403:   MatSetOptionsPrefix(ptap->C_oth,prefix);
404:   MatAppendOptionsPrefix(ptap->C_oth,"inner_offdiag_");

406:   MatProductSetType(ptap->C_oth,MATPRODUCT_AB);
407:   MatProductSetAlgorithm(ptap->C_oth,"sorted");
408:   MatProductSetFill(ptap->C_oth,fill);
409:   MatProductSetFromOptions(ptap->C_oth);
410:   MatProductSymbolic(ptap->C_oth);

412:   /* (3) send coj of C_oth to other processors  */
413:   /* ------------------------------------------ */
414:   /* determine row ownership */
415:   PetscLayoutCreate(comm,&rowmap);
416:   rowmap->n  = pn;
417:   rowmap->bs = 1;
418:   PetscLayoutSetUp(rowmap);
419:   owners = rowmap->range;

421:   /* determine the number of messages to send, their lengths */
422:   PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
423:   PetscArrayzero(len_s,size);
424:   PetscArrayzero(len_si,size);

426:   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
427:   coi   = c_oth->i; coj = c_oth->j;
428:   con   = ptap->C_oth->rmap->n;
429:   proc  = 0;
430:   ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj);
431:   for (i=0; i<con; i++) {
432:     while (prmap[i] >= owners[proc+1]) proc++;
433:     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
434:     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
435:   }

437:   len          = 0; /* max length of buf_si[], see (4) */
438:   owners_co[0] = 0;
439:   nsend        = 0;
440:   for (proc=0; proc<size; proc++) {
441:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
442:     if (len_s[proc]) {
443:       nsend++;
444:       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
445:       len         += len_si[proc];
446:     }
447:   }

449:   /* determine the number and length of messages to receive for coi and coj  */
450:   PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
451:   PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);

453:   /* post the Irecv and Isend of coj */
454:   PetscCommGetNewTag(comm,&tagj);
455:   PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
456:   PetscMalloc1(nsend+1,&swaits);
457:   for (proc=0, k=0; proc<size; proc++) {
458:     if (!len_s[proc]) continue;
459:     i    = owners_co[proc];
460:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
461:     k++;
462:   }

464:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
465:   /* ---------------------------------------- */
466:   MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc);
467:   MatProductSetType(ptap->C_loc,MATPRODUCT_AB);
468:   MatProductSetAlgorithm(ptap->C_loc,"default");
469:   MatProductSetFill(ptap->C_loc,fill);

471:   MatSetOptionsPrefix(ptap->C_loc,prefix);
472:   MatAppendOptionsPrefix(ptap->C_loc,"inner_diag_");

474:   MatProductSetFromOptions(ptap->C_loc);
475:   MatProductSymbolic(ptap->C_loc);

477:   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
478:   ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j);

480:   /* receives coj are complete */
481:   for (i=0; i<nrecv; i++) {
482:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
483:   }
484:   PetscFree(rwaits);
485:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

487:   /* add received column indices into ta to update Crmax */
488:   for (k=0; k<nrecv; k++) {/* k-th received message */
489:     Jptr = buf_rj[k];
490:     for (j=0; j<len_r[k]; j++) {
491:       PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
492:     }
493:   }
494:   PetscTableGetCount(ta,&Crmax);
495:   PetscTableDestroy(&ta);

497:   /* (4) send and recv coi */
498:   /*-----------------------*/
499:   PetscCommGetNewTag(comm,&tagi);
500:   PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
501:   PetscMalloc1(len+1,&buf_s);
502:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
503:   for (proc=0,k=0; proc<size; proc++) {
504:     if (!len_s[proc]) continue;
505:     /* form outgoing message for i-structure:
506:          buf_si[0]:                 nrows to be sent
507:                [1:nrows]:           row index (global)
508:                [nrows+1:2*nrows+1]: i-structure index
509:     */
510:     /*-------------------------------------------*/
511:     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
512:     buf_si_i    = buf_si + nrows+1;
513:     buf_si[0]   = nrows;
514:     buf_si_i[0] = 0;
515:     nrows       = 0;
516:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
517:       nzi = coi[i+1] - coi[i];
518:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
519:       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
520:       nrows++;
521:     }
522:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
523:     k++;
524:     buf_si += len_si[proc];
525:   }
526:   for (i=0; i<nrecv; i++) {
527:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
528:   }
529:   PetscFree(rwaits);
530:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

532:   PetscFree4(len_s,len_si,sstatus,owners_co);
533:   PetscFree(len_ri);
534:   PetscFree(swaits);
535:   PetscFree(buf_s);

537:   /* (5) compute the local portion of Cmpi      */
538:   /* ------------------------------------------ */
539:   /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
540:   PetscFreeSpaceGet(Crmax,&free_space);
541:   current_space = free_space;

543:   PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
544:   for (k=0; k<nrecv; k++) {
545:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
546:     nrows       = *buf_ri_k[k];
547:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
548:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
549:   }

551:   MatPreallocateInitialize(comm,pn,pn,dnz,onz);
552:   PetscLLCondensedCreate_Scalable(Crmax,&lnk);
553:   for (i=0; i<pn; i++) {
554:     /* add C_loc into Cmpi */
555:     nzi  = c_loc->i[i+1] - c_loc->i[i];
556:     Jptr = c_loc->j + c_loc->i[i];
557:     PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);

559:     /* add received col data into lnk */
560:     for (k=0; k<nrecv; k++) { /* k-th received message */
561:       if (i == *nextrow[k]) { /* i-th row */
562:         nzi  = *(nextci[k]+1) - *nextci[k];
563:         Jptr = buf_rj[k] + *nextci[k];
564:         PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
565:         nextrow[k]++; nextci[k]++;
566:       }
567:     }
568:     nzi = lnk[0];

570:     /* copy data into free space, then initialize lnk */
571:     PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);
572:     MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
573:   }
574:   PetscFree3(buf_ri_k,nextrow,nextci);
575:   PetscLLCondensedDestroy_Scalable(lnk);
576:   PetscFreeSpaceDestroy(free_space);

578:   /* local sizes and preallocation */
579:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
580:   if (P->cmap->bs > 0) {
581:     PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);
582:     PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);
583:   }
584:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
585:   MatPreallocateFinalize(dnz,onz);

587:   /* members in merge */
588:   PetscFree(id_r);
589:   PetscFree(len_r);
590:   PetscFree(buf_ri[0]);
591:   PetscFree(buf_ri);
592:   PetscFree(buf_rj[0]);
593:   PetscFree(buf_rj);
594:   PetscLayoutDestroy(&rowmap);

596:   /* attach the supporting struct to Cmpi for reuse */
597:   c = (Mat_MPIAIJ*)Cmpi->data;
598:   c->ap           = ptap;
599:   ptap->duplicate = Cmpi->ops->duplicate;
600:   ptap->destroy   = Cmpi->ops->destroy;
601:   ptap->view      = Cmpi->ops->view;

603:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
604:   Cmpi->assembled        = PETSC_FALSE;
605:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
606:   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
607:   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
608:   Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;

610:   nout = 0;
611:   ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j);
612:   if (c_oth->i[ptap->C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_oth->i[ptap->C_oth->rmap->n],nout);
613:   ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j);
614:   if (c_loc->i[ptap->C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_loc->i[ptap->C_loc->rmap->n],nout);

616:   return(0);
617: }
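
The two inner products above (C_loc = Rd*AP_loc and C_oth = Ro*AP_loc) follow the generic MatProduct flow. A condensed sketch of that flow, with hypothetical sequential matrices R and AP (and a hypothetical fill value) standing in for Rd/Ro and AP_loc:

    Mat RAP;
    MatProductCreate(R,AP,NULL,&RAP);        /* RAP = R*AP */
    MatProductSetType(RAP,MATPRODUCT_AB);
    MatProductSetAlgorithm(RAP,"sorted");
    MatProductSetFill(RAP,fill);
    MatProductSetFromOptions(RAP);
    MatProductSymbolic(RAP);                 /* nonzero structure only */
    MatProductNumeric(RAP);                  /* fill in the values (done by the numeric phase above) */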

619: PETSC_STATIC_INLINE PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHSetI dht,PetscHSetI oht)
620: {
621:   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
622:   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
623:   PetscInt             *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
624:   PetscInt             pcstart,pcend,column,offset;
625:   PetscErrorCode       ierr;

628:   pcstart = P->cmap->rstart;
629:   pcstart *= dof;
630:   pcend   = P->cmap->rend;
631:   pcend   *= dof;
632:   /* diagonal portion: Ad[i,:]*P */
633:   ai = ad->i;
634:   nzi = ai[i+1] - ai[i];
635:   aj  = ad->j + ai[i];
636:   for (j=0; j<nzi; j++) {
637:     row  = aj[j];
638:     offset = row%dof;
639:     row   /= dof;
640:     nzpi = pd->i[row+1] - pd->i[row];
641:     pj  = pd->j + pd->i[row];
642:     for (k=0; k<nzpi; k++) {
643:       PetscHSetIAdd(dht,pj[k]*dof+offset+pcstart);
644:     }
645:   }
646:   /* off diag P */
647:   for (j=0; j<nzi; j++) {
648:     row  = aj[j];
649:     offset = row%dof;
650:     row   /= dof;
651:     nzpi = po->i[row+1] - po->i[row];
652:     pj  = po->j + po->i[row];
653:     for (k=0; k<nzpi; k++) {
654:       PetscHSetIAdd(oht,p->garray[pj[k]]*dof+offset);
655:     }
656:   }

658:   /* off diagonal part: Ao[i, :]*P_oth */
659:   if (ao) {
660:     ai = ao->i;
661:     pi = p_oth->i;
662:     nzi = ai[i+1] - ai[i];
663:     aj  = ao->j + ai[i];
664:     for (j=0; j<nzi; j++) {
665:       row  = aj[j];
666:       offset = a->garray[row]%dof;
667:       row  = map[row];
668:       pnz  = pi[row+1] - pi[row];
669:       p_othcols = p_oth->j + pi[row];
670:       for (col=0; col<pnz; col++) {
671:         column = p_othcols[col] * dof + offset;
672:         if (column>=pcstart && column<pcend) {
673:           PetscHSetIAdd(dht,column);
674:         } else {
675:           PetscHSetIAdd(oht,column);
676:         }
677:       }
678:     }
679:   } /* end if (ao) */
680:   return(0);
681: }
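
Both one-row helpers use the same interlaced-dof index convention: a row index i of the dof-interlaced problem splits into the scalar row ii = i/dof of P and the component offset = i%dof, and a scalar column pj of P maps back to the interlaced global column pj*dof + offset (plus the local column start for the diagonal block). A tiny worked fragment, with all values hypothetical and dof = 3:

    PetscInt dof = 3, i = 7;                       /* 7 = 2*3 + 1: scalar row 2, component 1   */
    PetscInt ii      = i/dof;                      /* ii = 2, row of the scalar prolongator Pd */
    PetscInt offset  = i%dof;                      /* offset = 1                               */
    PetscInt pj = 5, pcstart = 30;                 /* scalar column 5, local column start 30   */
    PetscInt column  = pj*dof + offset + pcstart;  /* column = 46, interlaced global column    */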

683: PETSC_STATIC_INLINE PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHMapIV hmap)
684: {
685:   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
686:   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
687:   PetscInt             *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi,offset;
688:   PetscScalar          ra,*aa,*pa;
689:   PetscErrorCode       ierr;

692:   pcstart = P->cmap->rstart;
693:   pcstart *= dof;

695:   /* diagonal portion: Ad[i,:]*P */
696:   ai  = ad->i;
697:   nzi = ai[i+1] - ai[i];
698:   aj  = ad->j + ai[i];
699:   aa  = ad->a + ai[i];
700:   for (j=0; j<nzi; j++) {
701:     ra   = aa[j];
702:     row  = aj[j];
703:     offset = row%dof;
704:     row    /= dof;
705:     nzpi = pd->i[row+1] - pd->i[row];
706:     pj = pd->j + pd->i[row];
707:     pa = pd->a + pd->i[row];
708:     for (k=0; k<nzpi; k++) {
709:       PetscHMapIVAddValue(hmap,pj[k]*dof+offset+pcstart,ra*pa[k]);
710:     }
711:     PetscLogFlops(2.0*nzpi);
712:   }
713:   for (j=0; j<nzi; j++) {
714:     ra   = aa[j];
715:     row  = aj[j];
716:     offset = row%dof;
717:     row   /= dof;
718:     nzpi = po->i[row+1] - po->i[row];
719:     pj = po->j + po->i[row];
720:     pa = po->a + po->i[row];
721:     for (k=0; k<nzpi; k++) {
722:       PetscHMapIVAddValue(hmap,p->garray[pj[k]]*dof+offset,ra*pa[k]);
723:     }
724:     PetscLogFlops(2.0*nzpi);
725:   }

727:   /* off diagonal part: Ao[i, :]*P_oth */
728:   if (ao) {
729:     ai = ao->i;
730:     pi = p_oth->i;
731:     nzi = ai[i+1] - ai[i];
732:     aj  = ao->j + ai[i];
733:     aa  = ao->a + ai[i];
734:     for (j=0; j<nzi; j++) {
735:       row  = aj[j];
736:       offset = a->garray[row]%dof;
737:       row    = map[row];
738:       ra   = aa[j];
739:       pnz  = pi[row+1] - pi[row];
740:       p_othcols = p_oth->j + pi[row];
741:       pa   = p_oth->a + pi[row];
742:       for (col=0; col<pnz; col++) {
743:         PetscHMapIVAddValue(hmap,p_othcols[col]*dof+offset,ra*pa[col]);
744:       }
745:       PetscLogFlops(2.0*pnz);
746:     }
747:   } /* end if (ao) */

749:   return(0);
750: }
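
The helper above relies on PetscHMapIV to merge duplicate column contributions within one row of AP. A self-contained sketch of that accumulate-then-extract pattern with hypothetical indices and values (error checking omitted, as in the listing):

    PetscHMapIV hmap;
    PetscInt    cols[8],n = 0;
    PetscScalar vals[8];

    PetscHMapIVCreate(&hmap);
    PetscHMapIVAddValue(hmap,3,1.0);         /* AP(i,3) += 1.0 */
    PetscHMapIVAddValue(hmap,7,2.0);         /* AP(i,7) += 2.0 */
    PetscHMapIVAddValue(hmap,3,0.5);         /* repeated column is summed: AP(i,3) = 1.5 */
    PetscHMapIVGetPairs(hmap,&n,cols,vals);  /* n = 2; pairs come out in unspecified order */
    PetscHMapIVDestroy(&hmap);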

752: PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat,Mat,PetscInt dof,MatReuse,Mat*);

754: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,Mat C)
755: {
756:   PetscErrorCode    ierr;
757:   Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
758:   Mat_SeqAIJ        *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
759:   Mat_APMPI         *ptap = c->ap;
760:   PetscHMapIV       hmap;
761:   PetscInt          i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
762:   PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
763:   PetscInt          offset,ii,pocol;
764:   const PetscInt    *mappingindices;
765:   IS                map;
766:   MPI_Comm          comm;

769:   PetscObjectGetComm((PetscObject)A,&comm);
770:   if (!ptap->P_oth) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");

772:   MatZeroEntries(C);

774:   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
775:   /*-----------------------------------------------------*/
776:   if (ptap->reuse == MAT_REUSE_MATRIX) {
777:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
778:     MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);
779:   }
780:   PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);

782:   MatGetLocalSize(p->B,NULL,&pon);
783:   pon *= dof;
784:   PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);
785:   MatGetLocalSize(A,&am,NULL);
786:   cmaxr = 0;
787:   for (i=0; i<pon; i++) {
788:     cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
789:   }
790:   PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);
791:   PetscHMapIVCreate(&hmap);
792:   PetscHMapIVResize(hmap,cmaxr);
793:   ISGetIndices(map,&mappingindices);
794:   for (i=0; i<am && pon; i++) {
795:     PetscHMapIVClear(hmap);
796:     offset = i%dof;
797:     ii     = i/dof;
798:     nzi = po->i[ii+1] - po->i[ii];
799:     if (!nzi) continue;
800:     MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
801:     voff = 0;
802:     PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
803:     if (!voff) continue;

805:     /* Form C(ii, :) */
806:     poj = po->j + po->i[ii];
807:     poa = po->a + po->i[ii];
808:     for (j=0; j<nzi; j++) {
809:       pocol = poj[j]*dof+offset;
810:       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
811:       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
812:       for (jj=0; jj<voff; jj++) {
813:         apvaluestmp[jj] = apvalues[jj]*poa[j];
814:         /* If the row is empty */
815:         if (!c_rmtc[pocol]) {
816:           c_rmtjj[jj] = apindices[jj];
817:           c_rmtaa[jj] = apvaluestmp[jj];
818:           c_rmtc[pocol]++;
819:         } else {
820:           PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);
821:           if (loc>=0){ /* hit */
822:             c_rmtaa[loc] += apvaluestmp[jj];
823:             PetscLogFlops(1.0);
824:           } else { /* new element */
825:             loc = -(loc+1);
826:             /* Move data backward */
827:             for (kk=c_rmtc[pocol]; kk>loc; kk--) {
828:               c_rmtjj[kk] = c_rmtjj[kk-1];
829:               c_rmtaa[kk] = c_rmtaa[kk-1];
830:             }/* End kk */
831:             c_rmtjj[loc] = apindices[jj];
832:             c_rmtaa[loc] = apvaluestmp[jj];
833:             c_rmtc[pocol]++;
834:           }
835:         }
836:         PetscLogFlops(voff);
837:       } /* End jj */
838:     } /* End j */
839:   } /* End i */

841:   PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);
842:   PetscHMapIVDestroy(&hmap);

844:   MatGetLocalSize(P,NULL,&pn);
845:   pn *= dof;
846:   PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);

848:   PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
849:   PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
850:   MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
851:   pcstart = pcstart*dof;
852:   pcend   = pcend*dof;
853:   cd = (Mat_SeqAIJ*)(c->A)->data;
854:   co = (Mat_SeqAIJ*)(c->B)->data;

856:   cmaxr = 0;
857:   for (i=0; i<pn; i++) {
858:     cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
859:   }
860:   PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ);
861:   PetscHMapIVCreate(&hmap);
862:   PetscHMapIVResize(hmap,cmaxr);
863:   for (i=0; i<am && pn; i++) {
864:     PetscHMapIVClear(hmap);
865:     offset = i%dof;
866:     ii     = i/dof;
867:     nzi = pd->i[ii+1] - pd->i[ii];
868:     if (!nzi) continue;
869:     MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
870:     voff = 0;
871:     PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
872:     if (!voff) continue;
873:     /* Form C(ii, :) */
874:     pdj = pd->j + pd->i[ii];
875:     pda = pd->a + pd->i[ii];
876:     for (j=0; j<nzi; j++) {
877:       row = pcstart + pdj[j] * dof + offset;
878:       for (jj=0; jj<voff; jj++) {
879:         apvaluestmp[jj] = apvalues[jj]*pda[j];
880:       }
881:       PetscLogFlops(voff);
882:       MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);
883:     }
884:   }
885:   ISRestoreIndices(map,&mappingindices);
886:   MatGetOwnershipRangeColumn(C,&ccstart,&ccend);
887:   PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ);
888:   PetscHMapIVDestroy(&hmap);
889:   PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
890:   PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
891:   PetscFree2(c_rmtj,c_rmta);

893:   /* Add contributions from remote */
894:   for (i = 0; i < pn; i++) {
895:     row = i + pcstart;
896:     MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);
897:   }
898:   PetscFree2(c_othj,c_otha);

900:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
901:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

903:   ptap->reuse = MAT_REUSE_MATRIX;

905:   /* supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated by A and P */
906:   if (ptap->freestruct) {
907:     MatFreeIntermediateDataStructures(C);
908:   }
909:   return(0);
910: }
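
The exchange of remote contributions above is a plain PetscSF reduce: the locally computed entries for rows owned elsewhere (c_rmtj/c_rmta, the leaves) are deposited into the owners' receive buffers (c_othj/c_otha, the roots). A minimal sketch of that begin/end pairing, with hypothetical arrays and an already set-up sf:

    PetscSFReduceBegin(sf,MPIU_INT,leaf_cols,root_cols,MPIU_REPLACE);
    PetscSFReduceBegin(sf,MPIU_SCALAR,leaf_vals,root_vals,MPIU_REPLACE);
    /* ... overlap local work here, as the loop above does ... */
    PetscSFReduceEnd(sf,MPIU_INT,leaf_cols,root_cols,MPIU_REPLACE);
    PetscSFReduceEnd(sf,MPIU_SCALAR,leaf_vals,root_vals,MPIU_REPLACE);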

912: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
913: {
914:   PetscErrorCode      ierr;


918:   MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A,P,1,C);
919:   return(0);
920: }

922: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,Mat C)
923: {
924:   PetscErrorCode    ierr;
925:   Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
926:   Mat_SeqAIJ        *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
927:   Mat_APMPI         *ptap = c->ap;
928:   PetscHMapIV       hmap;
929:   PetscInt          i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
930:   PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
931:   PetscInt          offset,ii,pocol;
932:   const PetscInt    *mappingindices;
933:   IS                map;
934:   MPI_Comm          comm;

937:   PetscObjectGetComm((PetscObject)A,&comm);
938:   if (!ptap->P_oth) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");

940:   MatZeroEntries(C);

942:   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
943:   /*-----------------------------------------------------*/
944:   if (ptap->reuse == MAT_REUSE_MATRIX) {
945:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
946:     MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);
947:   }
948:   PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
949:   MatGetLocalSize(p->B,NULL,&pon);
950:   pon *= dof;
951:   MatGetLocalSize(P,NULL,&pn);
952:   pn  *= dof;

954:   PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);
955:   MatGetLocalSize(A,&am,NULL);
956:   MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
957:   pcstart *= dof;
958:   pcend   *= dof;
959:   cmaxr = 0;
960:   for (i=0; i<pon; i++) {
961:     cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
962:   }
963:   cd = (Mat_SeqAIJ*)(c->A)->data;
964:   co = (Mat_SeqAIJ*)(c->B)->data;
965:   for (i=0; i<pn; i++) {
966:     cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
967:   }
968:   PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);
969:   PetscHMapIVCreate(&hmap);
970:   PetscHMapIVResize(hmap,cmaxr);
971:   ISGetIndices(map,&mappingindices);
972:   for (i=0; i<am && (pon || pn); i++) {
973:     PetscHMapIVClear(hmap);
974:     offset = i%dof;
975:     ii     = i/dof;
976:     nzi  = po->i[ii+1] - po->i[ii];
977:     dnzi = pd->i[ii+1] - pd->i[ii];
978:     if (!nzi && !dnzi) continue;
979:     MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
980:     voff = 0;
981:     PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
982:     if (!voff) continue;

984:     /* Form remote C(ii, :) */
985:     poj = po->j + po->i[ii];
986:     poa = po->a + po->i[ii];
987:     for (j=0; j<nzi; j++) {
988:       pocol = poj[j]*dof+offset;
989:       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
990:       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
991:       for (jj=0; jj<voff; jj++) {
992:         apvaluestmp[jj] = apvalues[jj]*poa[j];
993:         /* If the row is empty */
994:         if (!c_rmtc[pocol]) {
995:           c_rmtjj[jj] = apindices[jj];
996:           c_rmtaa[jj] = apvaluestmp[jj];
997:           c_rmtc[pocol]++;
998:         } else {
999:           PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);
1000:           if (loc>=0){ /* hit */
1001:             c_rmtaa[loc] += apvaluestmp[jj];
1002:             PetscLogFlops(1.0);
1003:           } else { /* new element */
1004:             loc = -(loc+1);
1005:             /* Move data backward */
1006:             for (kk=c_rmtc[pocol]; kk>loc; kk--) {
1007:               c_rmtjj[kk] = c_rmtjj[kk-1];
1008:               c_rmtaa[kk] = c_rmtaa[kk-1];
1009:             }/* End kk */
1010:             c_rmtjj[loc] = apindices[jj];
1011:             c_rmtaa[loc] = apvaluestmp[jj];
1012:             c_rmtc[pocol]++;
1013:           }
1014:         }
1015:       } /* End jj */
1016:       PetscLogFlops(voff);
1017:     } /* End j */

1019:     /* Form local C(ii, :) */
1020:     pdj = pd->j + pd->i[ii];
1021:     pda = pd->a + pd->i[ii];
1022:     for (j=0; j<dnzi; j++) {
1023:       row = pcstart + pdj[j] * dof + offset;
1024:       for (jj=0; jj<voff; jj++) {
1025:         apvaluestmp[jj] = apvalues[jj]*pda[j];
1026:       }/* End jj */
1027:       PetscLogFlops(voff);
1028:       MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);
1029:     }/* End j */
1030:   } /* End i */

1032:   ISRestoreIndices(map,&mappingindices);
1033:   PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);
1034:   PetscHMapIVDestroy(&hmap);
1035:   PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);

1037:   PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1038:   PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
1039:   PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1040:   PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
1041:   PetscFree2(c_rmtj,c_rmta);

1043:   /* Add contributions from remote */
1044:   for (i = 0; i < pn; i++) {
1045:     row = i + pcstart;
1046:     MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);
1047:   }
1048:   PetscFree2(c_othj,c_otha);

1050:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1051:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

1053:   ptap->reuse = MAT_REUSE_MATRIX;

1055:   /* supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated by A and P */
1056:   if (ptap->freestruct) {
1057:     MatFreeIntermediateDataStructures(C);
1058:   }
1059:   return(0);
1060: }

1062: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
1063: {
1064:   PetscErrorCode      ierr;


1068:   MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,C);
1069:   return(0);
1070: }

1072: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
1073: {
1074:   Mat_APMPI           *ptap;
1075:   Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data,*c;
1076:   MPI_Comm            comm;
1077:   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
1078:   MatType             mtype;
1079:   PetscSF             sf;
1080:   PetscSFNode         *iremote;
1081:   PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1082:   const PetscInt      *rootdegrees;
1083:   PetscHSetI          ht,oht,*hta,*hto;
1084:   PetscInt            pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1085:   PetscInt            lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1086:   PetscInt            nalg=2,alg=0,offset,ii;
1087:   PetscMPIInt         owner;
1088:   const PetscInt      *mappingindices;
1089:   PetscBool           flg;
1090:   const char          *algTypes[2] = {"overlapping","merged"};
1091:   IS                  map;
1092:   PetscErrorCode      ierr;

1095:   PetscObjectGetComm((PetscObject)A,&comm);

1097:   /* Create symbolic parallel matrix Cmpi */
1098:   MatGetLocalSize(P,NULL,&pn);
1099:   pn *= dof;
1100:   MatGetType(A,&mtype);
1101:   MatSetType(Cmpi,mtype);
1102:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);

1104:   PetscNew(&ptap);
1105:   ptap->reuse = MAT_INITIAL_MATRIX;
1106:   ptap->algType = 2;

1108:   /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1109:   MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);
1110:   PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
1111:   /* This equals the number of offdiag columns in P */
1112:   MatGetLocalSize(p->B,NULL,&pon);
1113:   pon *= dof;
1114:   /* offsets */
1115:   PetscMalloc1(pon+1,&ptap->c_rmti);
1116:   /* The number of columns we will send to remote ranks */
1117:   PetscMalloc1(pon,&c_rmtc);
1118:   PetscMalloc1(pon,&hta);
1119:   for (i=0; i<pon; i++) {
1120:     PetscHSetICreate(&hta[i]);
1121:   }
1122:   MatGetLocalSize(A,&am,NULL);
1123:   MatGetOwnershipRange(A,&arstart,&arend);
1124:   /* Create hash table to merge all columns for C(i, :) */
1125:   PetscHSetICreate(&ht);

1127:   ISGetIndices(map,&mappingindices);
1128:   ptap->c_rmti[0] = 0;
1129:   /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors)  */
1130:   for (i=0; i<am && pon; i++) {
1131:     /* Form one row of AP */
1132:     PetscHSetIClear(ht);
1133:     offset = i%dof;
1134:     ii     = i/dof;
1135:     /* If the off diag is empty, we should not do any calculation */
1136:     nzi = po->i[ii+1] - po->i[ii];
1137:     if (!nzi) continue;

1139:     MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,ht);
1140:     PetscHSetIGetSize(ht,&htsize);
1141:     /* If AP is empty, just continue */
1142:     if (!htsize) continue;
1143:     /* Form C(ii, :) */
1144:     poj = po->j + po->i[ii];
1145:     for (j=0; j<nzi; j++) {
1146:       PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);
1147:     }
1148:   }

1150:   for (i=0; i<pon; i++) {
1151:     PetscHSetIGetSize(hta[i],&htsize);
1152:     ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1153:     c_rmtc[i] = htsize;
1154:   }

1156:   PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);

1158:   for (i=0; i<pon; i++) {
1159:     off = 0;
1160:     PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);
1161:     PetscHSetIDestroy(&hta[i]);
1162:   }
1163:   PetscFree(hta);

1165:   PetscMalloc1(pon,&iremote);
1166:   for (i=0; i<pon; i++) {
1167:     owner = 0; lidx = 0;
1168:     offset = i%dof;
1169:     ii     = i/dof;
1170:     PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);
1171:     iremote[i].index = lidx*dof + offset;
1172:     iremote[i].rank  = owner;
1173:   }

1175:   PetscSFCreate(comm,&sf);
1176:   PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1177:   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1178:   PetscSFSetRankOrder(sf,PETSC_TRUE);
1179:   PetscSFSetFromOptions(sf);
1180:   PetscSFSetUp(sf);
1181:   /* How many neighbors have contributions to my rows? */
1182:   PetscSFComputeDegreeBegin(sf,&rootdegrees);
1183:   PetscSFComputeDegreeEnd(sf,&rootdegrees);
1184:   rootspacesize = 0;
1185:   for (i = 0; i < pn; i++) {
1186:     rootspacesize += rootdegrees[i];
1187:   }
1188:   PetscMalloc1(rootspacesize,&rootspace);
1189:   PetscMalloc1(rootspacesize+1,&rootspaceoffsets);
1190:   /* Get information from leaves
1191:    * Number of columns other people contribute to my rows
1192:    * */
1193:   PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);
1194:   PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);
1195:   PetscFree(c_rmtc);
1196:   PetscCalloc1(pn+1,&ptap->c_othi);
1197:   /* The number of columns received for each row */
1198:   ptap->c_othi[0] = 0;
1199:   rootspacesize = 0;
1200:   rootspaceoffsets[0] = 0;
1201:   for (i = 0; i < pn; i++) {
1202:     rcvncols = 0;
1203:     for (j = 0; j<rootdegrees[i]; j++) {
1204:       rcvncols += rootspace[rootspacesize];
1205:       rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1206:       rootspacesize++;
1207:     }
1208:     ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1209:   }
1210:   PetscFree(rootspace);

1212:   PetscMalloc1(pon,&c_rmtoffsets);
1213:   PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1214:   PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1215:   PetscSFDestroy(&sf);
1216:   PetscFree(rootspaceoffsets);

1218:   PetscCalloc1(ptap->c_rmti[pon],&iremote);
1219:   nleaves = 0;
1220:   for (i = 0; i<pon; i++) {
1221:     owner = 0;
1222:     ii = i/dof;
1223:     PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);
1224:     sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1225:     for (j=0; j<sendncols; j++) {
1226:       iremote[nleaves].rank = owner;
1227:       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1228:     }
1229:   }
1230:   PetscFree(c_rmtoffsets);
1231:   PetscCalloc1(ptap->c_othi[pn],&c_othj);

1233:   PetscSFCreate(comm,&ptap->sf);
1234:   PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1235:   PetscSFSetFromOptions(ptap->sf);
1236:   /* One to one map */
1237:   PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);

1239:   PetscMalloc2(pn,&dnz,pn,&onz);
1240:   PetscHSetICreate(&oht);
1241:   MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
1242:   pcstart *= dof;
1243:   pcend   *= dof;
1244:   PetscMalloc2(pn,&hta,pn,&hto);
1245:   for (i=0; i<pn; i++) {
1246:     PetscHSetICreate(&hta[i]);
1247:     PetscHSetICreate(&hto[i]);
1248:   }
1249:   /* Work on local part */
1250:   /* 4) Pass 1: Estimate memory for C_loc */
1251:   for (i=0; i<am && pn; i++) {
1252:     PetscHSetIClear(ht);
1253:     PetscHSetIClear(oht);
1254:     offset = i%dof;
1255:     ii     = i/dof;
1256:     nzi = pd->i[ii+1] - pd->i[ii];
1257:     if (!nzi) continue;

1259:     MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);
1260:     PetscHSetIGetSize(ht,&htsize);
1261:     PetscHSetIGetSize(oht,&htosize);
1262:     if (!(htsize+htosize)) continue;
1263:     /* Form C(ii, :) */
1264:     pdj = pd->j + pd->i[ii];
1265:     for (j=0; j<nzi; j++) {
1266:       PetscHSetIUpdate(hta[pdj[j]*dof+offset],ht);
1267:       PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);
1268:     }
1269:   }

1271:   ISRestoreIndices(map,&mappingindices);

1273:   PetscHSetIDestroy(&ht);
1274:   PetscHSetIDestroy(&oht);

1276:   /* Get remote data */
1277:   PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1278:   PetscFree(c_rmtj);

1280:   for (i = 0; i < pn; i++) {
1281:     nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1282:     rdj = c_othj + ptap->c_othi[i];
1283:     for (j = 0; j < nzi; j++) {
1284:       col =  rdj[j];
1285:       /* diag part */
1286:       if (col>=pcstart && col<pcend) {
1287:         PetscHSetIAdd(hta[i],col);
1288:       } else { /* off diag */
1289:         PetscHSetIAdd(hto[i],col);
1290:       }
1291:     }
1292:     PetscHSetIGetSize(hta[i],&htsize);
1293:     dnz[i] = htsize;
1294:     PetscHSetIDestroy(&hta[i]);
1295:     PetscHSetIGetSize(hto[i],&htsize);
1296:     onz[i] = htsize;
1297:     PetscHSetIDestroy(&hto[i]);
1298:   }

1300:   PetscFree2(hta,hto);
1301:   PetscFree(c_othj);

1303:   /* local sizes and preallocation */
1304:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1305:   MatSetBlockSizes(Cmpi,dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);
1306:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1307:   MatSetUp(Cmpi);
1308:   PetscFree2(dnz,onz);

1310:   /* attach the supporting struct to Cmpi for reuse */
1311:   c = (Mat_MPIAIJ*)Cmpi->data;
1312:   c->ap           = ptap;
1313:   ptap->duplicate = Cmpi->ops->duplicate;
1314:   ptap->destroy   = Cmpi->ops->destroy;
1315:   ptap->view      = Cmpi->ops->view;

1317:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1318:   Cmpi->assembled = PETSC_FALSE;
1319:   /* pick an algorithm */
1320:   PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
1321:   alg = 0;
1322:   PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
1323:   PetscOptionsEnd();
1324:   switch (alg) {
1325:     case 0:
1326:       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1327:       break;
1328:     case 1:
1329:       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1330:       break;
1331:     default:
1332:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unsupported allatonce numerical algorithm\n");
1333:   }
1334:   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
1335:   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
1336:   Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1337:   return(0);
1338: }
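
The numeric variant is chosen by the -matptap_allatonce_via option registered above, so it must be set before this symbolic routine runs. A hedged example (hypothetical A, P, C) of selecting the merged variant programmatically; passing "-matptap_allatonce_via merged" on the command line has the same effect:

    PetscOptionsSetValue(NULL,"-matptap_allatonce_via","merged");  /* picks MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged */
    MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);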

1340: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat C)
1341: {
1342:   PetscErrorCode      ierr;


1346:   MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A,P,1,fill,C);
1347:   return(0);
1348: }

1350: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
1351: {
1352:   Mat_APMPI           *ptap;
1353:   Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data,*c;
1354:   MPI_Comm            comm;
1355:   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
1356:   MatType             mtype;
1357:   PetscSF             sf;
1358:   PetscSFNode         *iremote;
1359:   PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1360:   const PetscInt      *rootdegrees;
1361:   PetscHSetI          ht,oht,*hta,*hto,*htd;
1362:   PetscInt            pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1363:   PetscInt            lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1364:   PetscInt            nalg=2,alg=0,offset,ii;
1365:   PetscMPIInt         owner;
1366:   PetscBool           flg;
1367:   const char          *algTypes[2] = {"merged","overlapping"};
1368:   const PetscInt      *mappingindices;
1369:   IS                  map;
1370:   PetscErrorCode      ierr;

1373:   PetscObjectGetComm((PetscObject)A,&comm);

1375:   /* Create symbolic parallel matrix Cmpi */
1376:   MatGetLocalSize(P,NULL,&pn);
1377:   pn *= dof;
1378:   MatGetType(A,&mtype);
1379:   MatSetType(Cmpi,mtype);
1380:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);

1382:   PetscNew(&ptap);
1383:   ptap->reuse = MAT_INITIAL_MATRIX;
1384:   ptap->algType = 3;

1386:   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1387:   MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);
1388:   PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);

1390:   /* This equals the number of offdiag columns in P */
1391:   MatGetLocalSize(p->B,NULL,&pon);
1392:   pon *= dof;
1393:   /* offsets */
1394:   PetscMalloc1(pon+1,&ptap->c_rmti);
1395:   /* The number of columns we will send to remote ranks */
1396:   PetscMalloc1(pon,&c_rmtc);
1397:   PetscMalloc1(pon,&hta);
1398:   for (i=0; i<pon; i++) {
1399:     PetscHSetICreate(&hta[i]);
1400:   }
1401:   MatGetLocalSize(A,&am,NULL);
1402:   MatGetOwnershipRange(A,&arstart,&arend);
1403:   /* Create hash table to merge all columns for C(i, :) */
1404:   PetscHSetICreate(&ht);
1405:   PetscHSetICreate(&oht);
1406:   PetscMalloc2(pn,&htd,pn,&hto);
1407:   for (i=0; i<pn; i++) {
1408:     PetscHSetICreate(&htd[i]);
1409:     PetscHSetICreate(&hto[i]);
1410:   }

1412:   ISGetIndices(map,&mappingindices);
1413:   ptap->c_rmti[0] = 0;
1414:   /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors)  */
1415:   for (i=0; i<am && (pon || pn); i++) {
1416:     /* Form one row of AP */
1417:     PetscHSetIClear(ht);
1418:     PetscHSetIClear(oht);
1419:     offset = i%dof;
1420:     ii     = i/dof;
1421:     /* If the off diag is empty, we should not do any calculation */
1422:     nzi = po->i[ii+1] - po->i[ii];
1423:     dnzi = pd->i[ii+1] - pd->i[ii];
1424:     if (!nzi && !dnzi) continue;

1426:     MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);
1427:     PetscHSetIGetSize(ht,&htsize);
1428:     PetscHSetIGetSize(oht,&htosize);
1429:     /* If AP is empty, just continue */
1430:     if (!(htsize+htosize)) continue;

1432:     /* Form remote C(ii, :) */
1433:     poj = po->j + po->i[ii];
1434:     for (j=0; j<nzi; j++) {
1435:       PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);
1436:       PetscHSetIUpdate(hta[poj[j]*dof+offset],oht);
1437:     }

1439:     /* Form local C(ii, :) */
1440:     pdj = pd->j + pd->i[ii];
1441:     for (j=0; j<dnzi; j++) {
1442:       PetscHSetIUpdate(htd[pdj[j]*dof+offset],ht);
1443:       PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);
1444:     }
1445:   }

1447:   ISRestoreIndices(map,&mappingindices);

1449:   PetscHSetIDestroy(&ht);
1450:   PetscHSetIDestroy(&oht);

1452:   for (i=0; i<pon; i++) {
1453:     PetscHSetIGetSize(hta[i],&htsize);
1454:     ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1455:     c_rmtc[i] = htsize;
1456:   }

1458:   PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);

1460:   for (i=0; i<pon; i++) {
1461:     off = 0;
1462:     PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);
1463:     PetscHSetIDestroy(&hta[i]);
1464:   }
1465:   PetscFree(hta);

1467:   PetscMalloc1(pon,&iremote);
1468:   for (i=0; i<pon; i++) {
1469:     owner = 0; lidx = 0;
1470:     offset = i%dof;
1471:     ii     = i/dof;
1472:     PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);
1473:     iremote[i].index = lidx*dof+offset;
1474:     iremote[i].rank  = owner;
1475:   }
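  /*
     Descriptive note: iremote[] now holds one SF leaf per off-diagonal column of P (times dof);
     leaf i points at the root (owner rank, owner-local row index) of the remote row of C that
     this rank contributes to, as computed by PetscLayoutFindOwnerIndex() above.
  */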

1477:   PetscSFCreate(comm,&sf);
1478:   PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1479:   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1480:   PetscSFSetRankOrder(sf,PETSC_TRUE);
1481:   PetscSFSetFromOptions(sf);
1482:   PetscSFSetUp(sf);
1483:   /* How many neighbors have contributions to my rows? */
1484:   PetscSFComputeDegreeBegin(sf,&rootdegrees);
1485:   PetscSFComputeDegreeEnd(sf,&rootdegrees);
1486:   rootspacesize = 0;
1487:   for (i = 0; i < pn; i++) {
1488:     rootspacesize += rootdegrees[i];
1489:   }
1490:   PetscMalloc1(rootspacesize,&rootspace);
1491:   PetscMalloc1(rootspacesize+1,&rootspaceoffsets);
1492:   /* Gather information from the leaves:
1493:    * the number of columns other ranks contribute to each of my rows
1494:    */
1495:   PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);
1496:   PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);
1497:   PetscFree(c_rmtc);
1498:   PetscMalloc1(pn+1,&ptap->c_othi);
1499:   /* Accumulate, for each of my rows, the total number of columns received */
1500:   ptap->c_othi[0]     = 0;
1501:   rootspacesize       = 0;
1502:   rootspaceoffsets[0] = 0;
1503:   for (i = 0; i < pn; i++) {
1504:     rcvncols = 0;
1505:     for (j = 0; j<rootdegrees[i]; j++) {
1506:       rcvncols += rootspace[rootspacesize];
1507:       rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1508:       rootspacesize++;
1509:     }
1510:     ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1511:   }
1512:   PetscFree(rootspace);
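  /*
     Worked example (illustrative): if two neighbor ranks contribute 3 and 5 columns to my
     row i, then rootdegrees[i] = 2, the two gathered counts are 3 and 5, and
     c_othi[i+1] - c_othi[i] = 8 slots are reserved for the incoming column indices.
  */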

1514:   PetscMalloc1(pon,&c_rmtoffsets);
1515:   PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1516:   PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1517:   PetscSFDestroy(&sf);
1518:   PetscFree(rootspaceoffsets);

1520:   PetscCalloc1(ptap->c_rmti[pon],&iremote);
1521:   nleaves = 0;
1522:   for (i = 0; i<pon; i++) {
1523:     owner = 0;
1524:     ii    = i/dof;
1525:     PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);
1526:     sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1527:     for (j=0; j<sendncols; j++) {
1528:       iremote[nleaves].rank    = owner;
1529:       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1530:     }
1531:   }
1532:   PetscFree(c_rmtoffsets);
1533:   PetscCalloc1(ptap->c_othi[pn],&c_othj);

1535:   PetscSFCreate(comm,&ptap->sf);
1536:   PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1537:   PetscSFSetFromOptions(ptap->sf);
1538:   /* One to one map */
1539:   PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1540:   /* Get remote data */
1541:   PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1542:   PetscFree(c_rmtj);
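  /*
     Descriptive note: c_othj now holds, for each of my rows i, the concatenated column lists
     contributed by remote ranks in positions c_othi[i] .. c_othi[i+1]-1; duplicates are still
     possible and are merged through the hash sets below.
  */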
1543:   PetscMalloc2(pn,&dnz,pn,&onz);
1544:   MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
1545:   pcstart *= dof;
1546:   pcend   *= dof;
1547:   for (i = 0; i < pn; i++) {
1548:     nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1549:     rdj = c_othj + ptap->c_othi[i];
1550:     for (j = 0; j < nzi; j++) {
1551:       col =  rdj[j];
1552:       /* diag part */
1553:       if (col>=pcstart && col<pcend) {
1554:         PetscHSetIAdd(htd[i],col);
1555:       } else { /* off diag */
1556:         PetscHSetIAdd(hto[i],col);
1557:       }
1558:     }
1559:     PetscHSetIGetSize(htd[i],&htsize);
1560:     dnz[i] = htsize;
1561:     PetscHSetIDestroy(&htd[i]);
1562:     PetscHSetIGetSize(hto[i],&htsize);
1563:     onz[i] = htsize;
1564:     PetscHSetIDestroy(&hto[i]);
1565:   }

1567:   PetscFree2(htd,hto);
1568:   PetscFree(c_othj);

1570:   /* local sizes and preallocation */
1571:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1572:   MatSetBlockSizes(Cmpi, dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);
1573:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1574:   PetscFree2(dnz,onz);

1576:   /* attach the supporting struct to Cmpi for reuse */
1577:   c = (Mat_MPIAIJ*)Cmpi->data;
1578:   c->ap           = ptap;
1579:   ptap->duplicate = Cmpi->ops->duplicate;
1580:   ptap->destroy   = Cmpi->ops->destroy;
1581:   ptap->view      = Cmpi->ops->view;

1583:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1584:   Cmpi->assembled        = PETSC_FALSE;
1585:   /* pick an algorithm */
1586:   PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
1587:   alg = 0;
1588:   PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
1589:   PetscOptionsEnd();
1590:   switch (alg) {
1591:     case 0:
1592:       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1593:       break;
1594:     case 1:
1595:       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1596:       break;
1597:     default:
1598:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unsupported allatonce numerical algorithm");
1599:   }
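  /*
     Usage note (illustrative): the numeric kernel chosen above can be selected at runtime, e.g.
       -matptap_allatonce_via merged        (default; MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged)
       -matptap_allatonce_via overlapping   (MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce)
  */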
1600:   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
1601:   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
1602:   Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1603:   return(0);
1604: }

1606: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat C)
1607: {
1608:   PetscErrorCode      ierr;


1612:   MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,fill,C);
1613:   return(0);
1614: }

1616: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat Cmpi)
1617: {
1618:   PetscErrorCode      ierr;
1619:   Mat_APMPI           *ptap;
1620:   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
1621:   MPI_Comm            comm;
1622:   PetscMPIInt         size,rank;
1623:   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1624:   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
1625:   PetscInt            *lnk,i,k,pnz,row,nsend;
1626:   PetscBT             lnkbt;
1627:   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
1628:   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1629:   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
1630:   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1631:   MPI_Request         *swaits,*rwaits;
1632:   MPI_Status          *sstatus,rstatus;
1633:   PetscLayout         rowmap;
1634:   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
1635:   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
1636:   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
1637:   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
1638:   PetscScalar         *apv;
1639:   PetscTable          ta;
1640:   MatType             mtype;
1641:   const char          *prefix;
1642: #if defined(PETSC_USE_INFO)
1643:   PetscReal           apfill;
1644: #endif

1647:   PetscObjectGetComm((PetscObject)A,&comm);
1648:   MPI_Comm_size(comm,&size);
1649:   MPI_Comm_rank(comm,&rank);

1651:   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

1653:   /* create symbolic parallel matrix Cmpi */
1654:   MatGetType(A,&mtype);
1655:   MatSetType(Cmpi,mtype);

1657:   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1658:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

1660:   /* create struct Mat_APMPI and attach it to C later */
1661:   PetscNew(&ptap);
1662:   ptap->reuse = MAT_INITIAL_MATRIX;
1663:   ptap->algType = 1;

1665:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1666:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1667:   /* get P_loc by taking all local rows of P */
1668:   MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);

1670:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
1671:   /* --------------------------------- */
1672:   MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
1673:   MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);

1675:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
1676:   /* ----------------------------------------------------------------- */
1677:   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1678:   if (ptap->P_oth) p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;

1680:   /* create and initialize a linked list */
1681:   PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
1682:   MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
1683:   MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
1684:   PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
1685:   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

1687:   PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
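  /* Descriptive note: the condensed linked list lnk accumulates, per row of AP, a sorted union
     of at most Crmax column indices drawn from [0,pN); it is cleaned and reused for every row
     in the loop below. */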

1689:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1690:   if (ao) {
1691:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
1692:   } else {
1693:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
1694:   }
1695:   current_space = free_space;
1696:   nspacedouble  = 0;

1698:   PetscMalloc1(am+1,&api);
1699:   api[0] = 0;
1700:   for (i=0; i<am; i++) {
1701:     /* diagonal portion: Ad[i,:]*P */
1702:     ai = ad->i; pi = p_loc->i;
1703:     nzi = ai[i+1] - ai[i];
1704:     aj  = ad->j + ai[i];
1705:     for (j=0; j<nzi; j++) {
1706:       row  = aj[j];
1707:       pnz  = pi[row+1] - pi[row];
1708:       Jptr = p_loc->j + pi[row];
1709:       /* add non-zero cols of P into the sorted linked list lnk */
1710:       PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1711:     }
1712:     /* off-diagonal portion: Ao[i,:]*P */
1713:     if (ao) {
1714:       ai = ao->i; pi = p_oth->i;
1715:       nzi = ai[i+1] - ai[i];
1716:       aj  = ao->j + ai[i];
1717:       for (j=0; j<nzi; j++) {
1718:         row  = aj[j];
1719:         pnz  = pi[row+1] - pi[row];
1720:         Jptr = p_oth->j + pi[row];
1721:         PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1722:       }
1723:     }
1724:     apnz     = lnk[0];
1725:     api[i+1] = api[i] + apnz;
1726:     if (ap_rmax < apnz) ap_rmax = apnz;

1728:     /* if free space is not available, double the total space in the list */
1729:     if (current_space->local_remaining<apnz) {
1730:       PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
1731:       nspacedouble++;
1732:     }

1734:     /* Copy data into free space, then initialize lnk */
1735:     PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);

1737:     current_space->array           += apnz;
1738:     current_space->local_used      += apnz;
1739:     current_space->local_remaining -= apnz;
1740:   }
1741:   /* Allocate space for apj and apv, initialize apj, and */
1742:   /* destroy list of free space and other temporary array(s) */
1743:   PetscMalloc2(api[am],&apj,api[am],&apv);
1744:   PetscFreeSpaceContiguous(&free_space,apj);
1745:   PetscLLDestroy(lnk,lnkbt);

1747:   /* Create AP_loc for reuse */
1748:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
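  /*
     Illustrative note: MatCreateSeqAIJWithArrays() wraps the (api,apj,apv) triplet as a CSR
     matrix without copying, so row i of AP_loc is apj[api[i] .. api[i+1]-1]; the arrays remain
     owned by this code and are freed in MatFreeIntermediateDataStructures_MPIAIJ_AP().
  */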

1750: #if defined(PETSC_USE_INFO)
1751:   if (ao) {
1752:     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
1753:   } else {
1754:     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
1755:   }
1756:   ptap->AP_loc->info.mallocs           = nspacedouble;
1757:   ptap->AP_loc->info.fill_ratio_given  = fill;
1758:   ptap->AP_loc->info.fill_ratio_needed = apfill;

1760:   if (api[am]) {
1761:     PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
1762:     PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
1763:   } else {
1764:     PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty\n");
1765:   }
1766: #endif

1768:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
1769:   /* ------------------------------------ */
1770:   MatGetOptionsPrefix(A,&prefix);
1771:   MatSetOptionsPrefix(ptap->Ro,prefix);
1772:   MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");

1774:   MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth);
1775:   MatGetOptionsPrefix(Cmpi,&prefix);
1776:   MatSetOptionsPrefix(ptap->C_oth,prefix);
1777:   MatAppendOptionsPrefix(ptap->C_oth,"inner_C_oth_");

1779:   MatProductSetType(ptap->C_oth,MATPRODUCT_AB);
1780:   MatProductSetAlgorithm(ptap->C_oth,"default");
1781:   MatProductSetFill(ptap->C_oth,fill);
1782:   MatProductSetFromOptions(ptap->C_oth);
1783:   MatProductSymbolic(ptap->C_oth);

1785:   /* (3) send coj of C_oth to other processors  */
1786:   /* ------------------------------------------ */
1787:   /* determine row ownership */
1788:   PetscLayoutCreate(comm,&rowmap);
1789:   rowmap->n  = pn;
1790:   rowmap->bs = 1;
1791:   PetscLayoutSetUp(rowmap);
1792:   owners = rowmap->range;

1794:   /* determine the number of messages to send, their lengths */
1795:   PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
1796:   PetscArrayzero(len_s,size);
1797:   PetscArrayzero(len_si,size);

1799:   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
1800:   coi   = c_oth->i; coj = c_oth->j;
1801:   con   = ptap->C_oth->rmap->n;
1802:   proc  = 0;
1803:   for (i=0; i<con; i++) {
1804:     while (prmap[i] >= owners[proc+1]) proc++;
1805:     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1806:     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1807:   }

1809:   len          = 0; /* total length of buf_s[], see (4) */
1810:   owners_co[0] = 0;
1811:   nsend        = 0;
1812:   for (proc=0; proc<size; proc++) {
1813:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1814:     if (len_s[proc]) {
1815:       nsend++;
1816:       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1817:       len         += len_si[proc];
1818:     }
1819:   }
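  /*
     Descriptive note: len_si[proc] = 2*(nrows+1) because each i-structure message carries a
     header (nrows), nrows local row indices, and nrows+1 cumulative row lengths; len accumulates
     the total size needed for buf_s[].
  */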

1821:   /* determine the number and length of messages to receive for coi and coj  */
1822:   PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
1823:   PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);

1825:   /* post the Irecv and Isend of coj */
1826:   PetscCommGetNewTag(comm,&tagj);
1827:   PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
1828:   PetscMalloc1(nsend+1,&swaits);
1829:   for (proc=0, k=0; proc<size; proc++) {
1830:     if (!len_s[proc]) continue;
1831:     i    = owners_co[proc];
1832:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1833:     k++;
1834:   }

1836:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1837:   /* ---------------------------------------- */
1838:   MatSetOptionsPrefix(ptap->Rd,prefix);
1839:   MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");

1841:   MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc);
1842:   MatGetOptionsPrefix(Cmpi,&prefix);
1843:   MatSetOptionsPrefix(ptap->C_loc,prefix);
1844:   MatAppendOptionsPrefix(ptap->C_loc,"inner_C_loc_");

1846:   MatProductSetType(ptap->C_loc,MATPRODUCT_AB);
1847:   MatProductSetAlgorithm(ptap->C_loc,"default");
1848:   MatProductSetFill(ptap->C_loc,fill);
1849:   MatProductSetFromOptions(ptap->C_loc);
1850:   MatProductSymbolic(ptap->C_loc);

1852:   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

1854:   /* wait until the receives of coj are complete */
1855:   for (i=0; i<nrecv; i++) {
1856:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
1857:   }
1858:   PetscFree(rwaits);
1859:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

1861:   /* add received column indices into ta to update Crmax */
1862:   for (k=0; k<nrecv; k++) {/* k-th received message */
1863:     Jptr = buf_rj[k];
1864:     for (j=0; j<len_r[k]; j++) {
1865:       PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
1866:     }
1867:   }
1868:   PetscTableGetCount(ta,&Crmax);
1869:   PetscTableDestroy(&ta);

1871:   /* (4) send and recv coi */
1872:   /*-----------------------*/
1873:   PetscCommGetNewTag(comm,&tagi);
1874:   PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
1875:   PetscMalloc1(len+1,&buf_s);
1876:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1877:   for (proc=0,k=0; proc<size; proc++) {
1878:     if (!len_s[proc]) continue;
1879:     /* form outgoing message for i-structure:
1880:          buf_si[0]:                 nrows to be sent
1881:                [1:nrows]:           row index (global)
1882:                [nrows+1:2*nrows+1]: i-structure index
1883:     */
1884:     /*-------------------------------------------*/
1885:     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
1886:     buf_si_i    = buf_si + nrows+1;
1887:     buf_si[0]   = nrows;
1888:     buf_si_i[0] = 0;
1889:     nrows       = 0;
1890:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1891:       nzi = coi[i+1] - coi[i];
1892:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1893:       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1894:       nrows++;
1895:     }
1896:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1897:     k++;
1898:     buf_si += len_si[proc];
1899:   }
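  /*
     Worked example (illustrative): sending two rows whose destination-local row indices are 4 and 7,
     with 3 and 2 nonzeros respectively, gives buf_si = [2, 4, 7, 0, 3, 5]
     (row count, local row indices, then the cumulative i-structure), matching len_si = 2*(2+1) = 6.
  */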
1900:   for (i=0; i<nrecv; i++) {
1901:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
1902:   }
1903:   PetscFree(rwaits);
1904:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

1906:   PetscFree4(len_s,len_si,sstatus,owners_co);
1907:   PetscFree(len_ri);
1908:   PetscFree(swaits);
1909:   PetscFree(buf_s);

1911:   /* (5) compute the local portion of Cmpi      */
1912:   /* ------------------------------------------ */
1913:   /* set initial free space to Crmax, sufficient to hold the nonzeros in each row of Cmpi */
1914:   PetscFreeSpaceGet(Crmax,&free_space);
1915:   current_space = free_space;

1917:   PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
1918:   for (k=0; k<nrecv; k++) {
1919:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1920:     nrows       = *buf_ri_k[k];
1921:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1922:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1923:   }

1925:   MatPreallocateInitialize(comm,pn,pn,dnz,onz);
1926:   PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
1927:   for (i=0; i<pn; i++) {
1928:     /* add C_loc into Cmpi */
1929:     nzi  = c_loc->i[i+1] - c_loc->i[i];
1930:     Jptr = c_loc->j + c_loc->i[i];
1931:     PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);

1933:     /* add received col data into lnk */
1934:     for (k=0; k<nrecv; k++) { /* k-th received message */
1935:       if (i == *nextrow[k]) { /* i-th row */
1936:         nzi  = *(nextci[k]+1) - *nextci[k];
1937:         Jptr = buf_rj[k] + *nextci[k];
1938:         PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1939:         nextrow[k]++; nextci[k]++;
1940:       }
1941:     }
1942:     nzi = lnk[0];

1944:     /* copy data into free space, then initialize lnk */
1945:     PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);
1946:     MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
1947:   }
1948:   PetscFree3(buf_ri_k,nextrow,nextci);
1949:   PetscLLDestroy(lnk,lnkbt);
1950:   PetscFreeSpaceDestroy(free_space);

1952:   /* local sizes and preallocation */
1953:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1954:   if (P->cmap->bs > 0) {
1955:     PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);
1956:     PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);
1957:   }
1958:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1959:   MatPreallocateFinalize(dnz,onz);

1961:   /* free temporary communication buffers */
1962:   PetscFree(id_r);
1963:   PetscFree(len_r);
1964:   PetscFree(buf_ri[0]);
1965:   PetscFree(buf_ri);
1966:   PetscFree(buf_rj[0]);
1967:   PetscFree(buf_rj);
1968:   PetscLayoutDestroy(&rowmap);

1970:   /* attach the supporting struct to Cmpi for reuse */
1971:   c = (Mat_MPIAIJ*)Cmpi->data;
1972:   c->ap           = ptap;
1973:   ptap->duplicate = Cmpi->ops->duplicate;
1974:   ptap->destroy   = Cmpi->ops->destroy;
1975:   ptap->view      = Cmpi->ops->view;
1976:   PetscCalloc1(pN,&ptap->apa);

1978:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1979:   Cmpi->assembled        = PETSC_FALSE;
1980:   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
1981:   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
1982:   Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1983:   return(0);
1984: }

1986: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
1987: {
1988:   PetscErrorCode    ierr;
1989:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1990:   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1991:   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
1992:   Mat_APMPI         *ptap = c->ap;
1993:   Mat               AP_loc,C_loc,C_oth;
1994:   PetscInt          i,rstart,rend,cm,ncols,row;
1995:   PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
1996:   PetscScalar       *apa;
1997:   const PetscInt    *cols;
1998:   const PetscScalar *vals;

2001:   if (!ptap->AP_loc) {
2002:     MPI_Comm comm;
2003:     PetscObjectGetComm((PetscObject)C,&comm);
2004:     SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
2005:   }

2007:   MatZeroEntries(C);
2008:   /* 1) get R = Pd^T,Ro = Po^T */
2009:   if (ptap->reuse == MAT_REUSE_MATRIX) {
2010:     MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
2011:     MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
2012:   }

2014:   /* 2) get AP_loc */
2015:   AP_loc = ptap->AP_loc;
2016:   ap = (Mat_SeqAIJ*)AP_loc->data;

2018:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
2019:   /*-----------------------------------------------------*/
2020:   if (ptap->reuse == MAT_REUSE_MATRIX) {
2021:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
2022:     MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
2023:     MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
2024:   }

2026:   /* 2-2) compute numeric A_loc*P - dominating part */
2027:   /* ---------------------------------------------- */
2028:   /* get data from symbolic products */
2029:   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
2030:   if (ptap->P_oth) {
2031:     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
2032:   }
2033:   apa   = ptap->apa;
2034:   api   = ap->i;
2035:   apj   = ap->j;
2036:   for (i=0; i<am; i++) {
2037:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
2038:     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
2039:     apnz = api[i+1] - api[i];
2040:     for (j=0; j<apnz; j++) {
2041:       col = apj[j+api[i]];
2042:       ap->a[j+ap->i[i]] = apa[col];
2043:       apa[col] = 0.0;
2044:     }
2045:   }
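  /*
     Descriptive note: AProw_nonscalable() scatters row i of A*P into the dense work array apa
     (length pN, allocated in the symbolic phase); the loop above then compresses the touched
     entries into the CSR values of AP_loc using the column pattern apj from the symbolic phase,
     and zeroes apa again for the next row.
  */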
2046:   /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
2047:   PetscObjectStateIncrease((PetscObject)AP_loc);

2049:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
2050:   ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);
2051:   ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);
2052:   C_loc = ptap->C_loc;
2053:   C_oth = ptap->C_oth;

2055:   /* add C_loc and C_oth to C */
2056:   MatGetOwnershipRange(C,&rstart,&rend);

2058:   /* C_loc -> C */
2059:   cm    = C_loc->rmap->N;
2060:   c_seq = (Mat_SeqAIJ*)C_loc->data;
2061:   cols = c_seq->j;
2062:   vals = c_seq->a;


2065:   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
2066:   /* when there are no off-processor parts. */
2067:   /* If was_assembled is true, the assignment aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
2068:   /* is no longer valid. The more general MatSetValues_MPIAIJ() must be used instead; it looks up the column index in */
2069:   /* a table and performs the extra bookkeeping required for an already assembled matrix. */
2070:   if (C->assembled) {
2071:     C->was_assembled = PETSC_TRUE;
2072:     C->assembled     = PETSC_FALSE;
2073:   }
2074:   if (C->was_assembled) {
2075:     for (i=0; i<cm; i++) {
2076:       ncols = c_seq->i[i+1] - c_seq->i[i];
2077:       row = rstart + i;
2078:       MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
2079:       cols += ncols; vals += ncols;
2080:     }
2081:   } else {
2082:     MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
2083:   }

2085:   /* Co -> C, off-processor part */
2086:   cm = C_oth->rmap->N;
2087:   c_seq = (Mat_SeqAIJ*)C_oth->data;
2088:   cols = c_seq->j;
2089:   vals = c_seq->a;
2090:   for (i=0; i<cm; i++) {
2091:     ncols = c_seq->i[i+1] - c_seq->i[i];
2092:     row = p->garray[i];
2093:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
2094:     cols += ncols; vals += ncols;
2095:   }

2097:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
2098:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

2100:   ptap->reuse = MAT_REUSE_MATRIX;

2102:   /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated from A and P */
2103:   if (ptap->freestruct) {
2104:     MatFreeIntermediateDataStructures(C);
2105:   }
2106:   return(0);
2107: }
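/*
   Typical user-level call sequence (illustrative sketch, not part of this implementation):
     MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);     symbolic + first numeric product, fill estimate 2.0
     MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);       numeric-only update, reusing the cached ptap data
*/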

2109: PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
2110: {
2111:   PetscErrorCode      ierr;
2112:   Mat_Product         *product = C->product;
2113:   Mat                 A=product->A,P=product->B;
2114:   MatProductAlgorithm alg=product->alg;
2115:   PetscReal           fill=product->fill;
2116:   PetscBool           flg;
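  /* Descriptive note: the algorithm names checked below are "scalable", "nonscalable",
     "allatonce", "allatonce_merged" and, when PETSc is built with hypre, "hypre";
     product->alg is normally set through MatProductSetAlgorithm() or the corresponding
     options database key. */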

2119:   /* scalable: do R=P^T locally, then C=R*A*P */
2120:   PetscStrcmp(alg,"scalable",&flg);
2121:   if (flg) {
2122:     MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,product->fill,C);
2123:     C->ops->productnumeric = MatProductNumeric_PtAP;
2124:     goto next;
2125:   }

2127:   /* nonscalable: do R=P^T locally, then C=R*A*P */
2128:   PetscStrcmp(alg,"nonscalable",&flg);
2129:   if (flg) {
2130:     MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);
2131:     goto next;
2132:   }

2134:   /* allatonce */
2135:   PetscStrcmp(alg,"allatonce",&flg);
2136:   if (flg) {
2137:     MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C);
2138:     goto next;
2139:   }

2141:   /* allatonce_merged */
2142:   PetscStrcmp(alg,"allatonce_merged",&flg);
2143:   if (flg) {
2144:     MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C);
2145:     goto next;
2146:   }

2148:   /* hypre */
2149: #if defined(PETSC_HAVE_HYPRE)
2150:   PetscStrcmp(alg,"hypre",&flg);
2151:   if (flg) {
2152:     MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);
2153:     C->ops->productnumeric = MatProductNumeric_PtAP;
2154:     return(0);
2155:   }
2156: #endif
2157:   SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"Mat Product Algorithm is not supported");

2159: next:
2160:   C->ops->productnumeric = MatProductNumeric_PtAP;
2161:   {
2162:     Mat_MPIAIJ *c  = (Mat_MPIAIJ*)C->data;
2163:     Mat_APMPI  *ap = c->ap;
2164:     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatFreeIntermediateDataStructures","Mat");
2165:     ap->freestruct = PETSC_FALSE;
2166:     PetscOptionsBool("-mat_freeintermediatedatastructures","Free intermediate data structures", "MatFreeIntermediateDataStructures",ap->freestruct,&ap->freestruct, NULL);
2167:     PetscOptionsEnd();
2168:   }
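  /*
     Usage note (illustrative): running with -mat_freeintermediatedatastructures frees the ptap
     support data right after the first numeric product, saving memory at the cost of not being
     able to recompute C when A or P changes (see the guard in MatPtAPNumeric_MPIAIJ_MPIAIJ()).
  */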
2169:   return(0);
2170: }