Actual source code: mpiptap.c

petsc-3.9.4 2018-09-11

  2: /*
  3:   Defines projective product routines where A is an MPIAIJ matrix
  4:           C = P^T * A * P
  5: */

  7:  #include <../src/mat/impls/aij/seq/aij.h>
  8:  #include <../src/mat/utils/freespace.h>
  9:  #include <../src/mat/impls/aij/mpi/mpiaij.h>
 10:  #include <petscbt.h>
 11:  #include <petsctime.h>

 13: /* #define PTAP_PROFILE */

 15: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
 16: {
 17:   PetscErrorCode    ierr;
 18:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data;
 19:   Mat_PtAPMPI       *ptap=a->ptap;
 20:   PetscBool         iascii;
 21:   PetscViewerFormat format;

 24:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
 25:   if (iascii) {
 26:     PetscViewerGetFormat(viewer,&format);
 27:     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
 28:       if (ptap->algType == 0) {
 29:         PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");
 30:       } else if (ptap->algType == 1) {
 31:         PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");
 32:       }
 33:     }
 34:   }
 35:   (ptap->view)(A,viewer);
 36:   return(0);
 37: }

 39: PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
 40: {
 42:   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
 43:   Mat_PtAPMPI    *ptap=a->ptap;

 46:   if (ptap) {
 47:     Mat_Merge_SeqsToMPI *merge=ptap->merge;
 48:     PetscFree2(ptap->startsj_s,ptap->startsj_r);
 49:     PetscFree(ptap->bufa);
 50:     MatDestroy(&ptap->P_loc);
 51:     MatDestroy(&ptap->P_oth);
 52:     MatDestroy(&ptap->A_loc); /* used by MatTransposeMatMult() */
 53:     MatDestroy(&ptap->Rd);
 54:     MatDestroy(&ptap->Ro);
 55:     if (ptap->AP_loc) { /* used by alg_rap */
 56:       Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
 57:       PetscFree(ap->i);
 58:       PetscFree2(ap->j,ap->a);
 59:       MatDestroy(&ptap->AP_loc);
 60:     } else { /* used by alg_ptap */
 61:       PetscFree(ptap->api);
 62:       PetscFree(ptap->apj);
 63:     }
 64:     MatDestroy(&ptap->C_loc);
 65:     MatDestroy(&ptap->C_oth);
 66:     if (ptap->apa) {PetscFree(ptap->apa);}

 68:     if (merge) { /* used by alg_ptap */
 69:       PetscFree(merge->id_r);
 70:       PetscFree(merge->len_s);
 71:       PetscFree(merge->len_r);
 72:       PetscFree(merge->bi);
 73:       PetscFree(merge->bj);
 74:       PetscFree(merge->buf_ri[0]);
 75:       PetscFree(merge->buf_ri);
 76:       PetscFree(merge->buf_rj[0]);
 77:       PetscFree(merge->buf_rj);
 78:       PetscFree(merge->coi);
 79:       PetscFree(merge->coj);
 80:       PetscFree(merge->owners_co);
 81:       PetscLayoutDestroy(&merge->rowmap);
 82:       PetscFree(ptap->merge);
 83:     }

 85:     ptap->destroy(A);
 86:     PetscFree(ptap);
 87:   }
 88:   return(0);
 89: }

 91: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat A, MatDuplicateOption op, Mat *M)
 92: {
 94:   Mat_MPIAIJ     *a     = (Mat_MPIAIJ*)A->data;
 95:   Mat_PtAPMPI    *ptap  = a->ptap;

 98:   (*ptap->duplicate)(A,op,M);
 99:   (*M)->ops->destroy   = ptap->destroy;
100:   (*M)->ops->duplicate = ptap->duplicate;
101:   (*M)->ops->view      = ptap->view;
102:   return(0);
103: }

105: PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
106: {
108:   PetscBool      flg;
109:   MPI_Comm       comm;
110: #if !defined(PETSC_HAVE_HYPRE)
111:   const char          *algTypes[2] = {"scalable","nonscalable"};
112:   PetscInt            nalg=2;
113: #else
114:   const char          *algTypes[3] = {"scalable","nonscalable","hypre"};
115:   PetscInt            nalg=3;
116: #endif
117:   PetscInt            pN=P->cmap->N,alg=1; /* set default algorithm */

120:   /* check if matrix local sizes are compatible */
121:   PetscObjectGetComm((PetscObject)A,&comm);
122:   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
123:   if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);

125:   if (scall == MAT_INITIAL_MATRIX) {
126:     /* pick an algorithm */
127:     PetscObjectOptionsBegin((PetscObject)A);
128:     PetscOptionsObject->alreadyprinted = PETSC_FALSE; /* a hack to ensure the option shows in '-help' */
129:     PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
130:     PetscOptionsEnd();

132:     if (!flg && pN > 100000) { /* may switch to scalable algorithm as default */
133:       MatInfo     Ainfo,Pinfo;
134:       PetscInt    nz_local;
135:       PetscBool   alg_scalable_loc=PETSC_FALSE,alg_scalable;

137:       MatGetInfo(A,MAT_LOCAL,&Ainfo);
138:       MatGetInfo(P,MAT_LOCAL,&Pinfo);
139:       nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);

141:       if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
142:       MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);

144:       if (alg_scalable) {
145:         alg = 0; /* the scalable algorithm can be ~50% slower than the nonscalable algorithm */
146:       }
147:     }

149:     switch (alg) {
150:     case 1:
151:       /* do R=P^T locally, then C=R*A*P */
152:       PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
153:       MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);
154:       PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
155:       break;
156: #if defined(PETSC_HAVE_HYPRE)
157:     case 2:
158:       /* Use boomerAMGBuildCoarseOperator */
159:       MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);
160:       return(0);
161:       break;
162: #endif
163:     default:
164:       /* do R=P^T locally, then C=R*A*P */
165:       PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
166:       MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);
167:       PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
168:       break;
169:     }
170:   }
171:   PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);
172:   (*(*C)->ops->ptapnumeric)(A,P,*C);
173:   PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);
174:   return(0);
175: }
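
      The dispatcher above is reached through the public MatPtAP() interface; the symbolic
      routine runs only for MAT_INITIAL_MATRIX, and the cached Mat_PtAPMPI struct lets later
      calls go straight to the numeric kernel. A minimal caller-side sketch, assuming A and P
      are already assembled MPIAIJ matrices (nothing here beyond MatPtAP() and the -matptap_via
      option registered above is specific to this file):

        Mat            C;
        PetscErrorCode ierr;
        /* first call: symbolic + numeric product, attaches the reusable data to C */
        ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);
        /* later calls with unchanged nonzero patterns: numeric phase only */
        ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);
        ierr = MatDestroy(&C);CHKERRQ(ierr);

      The branch taken in the switch above can be forced at run time with
      -matptap_via scalable|nonscalable (or hypre when PETSc is configured with hypre).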

177: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
178: {
179:   PetscErrorCode    ierr;
180:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
181:   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
182:   Mat_SeqAIJ        *ap,*p_loc,*p_oth,*c_seq;
183:   Mat_PtAPMPI       *ptap = c->ptap;
184:   Mat               AP_loc,C_loc,C_oth;
185:   PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz;
186:   PetscScalar       *apa;
187:   const PetscInt    *cols;
188:   const PetscScalar *vals;

191:   MatZeroEntries(C);

193:   /* 1) get R = Pd^T,Ro = Po^T */
194:   if (ptap->reuse == MAT_REUSE_MATRIX) {
195:     MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
196:     MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
197:   }

199:   /* 2) get AP_loc */
200:   AP_loc = ptap->AP_loc;
201:   ap = (Mat_SeqAIJ*)AP_loc->data;

203:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
204:   /*-----------------------------------------------------*/
205:   if (ptap->reuse == MAT_REUSE_MATRIX) {
206:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
207:     MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
208:     MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
209:   }

211:   /* 2-2) compute numeric A_loc*P - dominating part */
212:   /* ---------------------------------------------- */
213:   /* get data from symbolic products */
214:   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
215:   if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
216:   else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ptap->P_oth is NULL. Cannot proceed!");

218:   api   = ap->i;
219:   apj   = ap->j;
220:   for (i=0; i<am; i++) {
221:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
222:     apnz = api[i+1] - api[i];
223:     apa = ap->a + api[i];
224:     PetscMemzero(apa,sizeof(PetscScalar)*apnz);
225:     AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
226:     PetscLogFlops(2.0*apnz);
227:   }

229:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
230:   MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);
231:   MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);

233:   C_loc = ptap->C_loc;
234:   C_oth = ptap->C_oth;

236:   /* add C_loc and C_oth into C */
237:   MatGetOwnershipRange(C,&rstart,&rend);

239:   /* C_loc -> C */
240:   cm    = C_loc->rmap->N;
241:   c_seq = (Mat_SeqAIJ*)C_loc->data;
242:   cols = c_seq->j;
243:   vals = c_seq->a;
244:   for (i=0; i<cm; i++) {
245:     ncols = c_seq->i[i+1] - c_seq->i[i];
246:     row = rstart + i;
247:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
248:     cols += ncols; vals += ncols;
249:   }

251:   /* Co -> C, off-processor part */
252:   cm = C_oth->rmap->N;
253:   c_seq = (Mat_SeqAIJ*)C_oth->data;
254:   cols = c_seq->j;
255:   vals = c_seq->a;
256:   for (i=0; i<cm; i++) {
257:     ncols = c_seq->i[i+1] - c_seq->i[i];
258:     row = p->garray[i];
259:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
260:     cols += ncols; vals += ncols;
261:   }
262:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
263:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

265:   ptap->reuse = MAT_REUSE_MATRIX;
266:   return(0);
267: }

269: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
270: {
271:   PetscErrorCode      ierr;
272:   Mat_PtAPMPI         *ptap;
273:   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
274:   MPI_Comm            comm;
275:   PetscMPIInt         size,rank;
276:   Mat                 Cmpi,P_loc,P_oth;
277:   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
278:   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
279:   PetscInt            *lnk,i,k,pnz,row,nsend;
280:   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
281:   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
282:   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
283:   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
284:   MPI_Request         *swaits,*rwaits;
285:   MPI_Status          *sstatus,rstatus;
286:   PetscLayout         rowmap;
287:   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
288:   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
289:   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi;
290:   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
291:   PetscScalar         *apv;
292:   PetscTable          ta;
293: #if defined(PETSC_USE_INFO)
294:   PetscReal           apfill;
295: #endif

298:   PetscObjectGetComm((PetscObject)A,&comm);
299:   MPI_Comm_size(comm,&size);
300:   MPI_Comm_rank(comm,&rank);

302:   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

304:   /* create symbolic parallel matrix Cmpi */
305:   MatCreate(comm,&Cmpi);
306:   MatSetType(Cmpi,MATMPIAIJ);

308:   /* create struct Mat_PtAPMPI and attach it to C later */
309:   PetscNew(&ptap);
310:   ptap->reuse = MAT_INITIAL_MATRIX;
311:   ptap->algType = 0;

313:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
314:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);
315:   /* get P_loc by taking all local rows of P */
316:   MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);

318:   ptap->P_loc = P_loc;
319:   ptap->P_oth = P_oth;

321:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
322:   /* --------------------------------- */
323:   MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
324:   MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);

326:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
327:   /* ----------------------------------------------------------------- */
328:   p_loc  = (Mat_SeqAIJ*)P_loc->data;
329:   if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;

331:   /* create and initialize a linked list */
332:   PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
333:   MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
334:   MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
335:   PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */

337:   PetscLLCondensedCreate_Scalable(Crmax,&lnk);

339:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
340:   if (ao) {
341:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
342:   } else {
343:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
344:   }
345:   current_space = free_space;
346:   nspacedouble  = 0;

348:   PetscMalloc1(am+1,&api);
349:   api[0] = 0;
350:   for (i=0; i<am; i++) {
351:     /* diagonal portion: Ad[i,:]*P */
352:     ai = ad->i; pi = p_loc->i;
353:     nzi = ai[i+1] - ai[i];
354:     aj  = ad->j + ai[i];
355:     for (j=0; j<nzi; j++) {
356:       row  = aj[j];
357:       pnz  = pi[row+1] - pi[row];
358:       Jptr = p_loc->j + pi[row];
359:       /* add non-zero cols of P into the sorted linked list lnk */
360:       PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
361:     }
362:     /* off-diagonal portion: Ao[i,:]*P */
363:     if (ao) {
364:       ai = ao->i; pi = p_oth->i;
365:       nzi = ai[i+1] - ai[i];
366:       aj  = ao->j + ai[i];
367:       for (j=0; j<nzi; j++) {
368:         row  = aj[j];
369:         pnz  = pi[row+1] - pi[row];
370:         Jptr = p_oth->j + pi[row];
371:         PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
372:       }
373:     }
374:     apnz     = lnk[0];
375:     api[i+1] = api[i] + apnz;

377:     /* if free space is not available, double the total space in the list */
378:     if (current_space->local_remaining<apnz) {
379:       PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
380:       nspacedouble++;
381:     }

383:     /* Copy data into free space, then initialize lnk */
384:     PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);

386:     current_space->array           += apnz;
387:     current_space->local_used      += apnz;
388:     current_space->local_remaining -= apnz;
389:   }
390:   /* Allocate space for apj and apv, initialize apj, and */
391:   /* destroy list of free space and other temporary array(s) */
392:   PetscMalloc2(api[am],&apj,api[am],&apv);
393:   PetscFreeSpaceContiguous(&free_space,apj);
394:   PetscLLCondensedDestroy_Scalable(lnk);

396:   /* Create AP_loc for reuse */
397:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);

399: #if defined(PETSC_USE_INFO)
400:   if (ao) {
401:     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
402:   } else {
403:     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
404:   }
405:   ptap->AP_loc->info.mallocs           = nspacedouble;
406:   ptap->AP_loc->info.fill_ratio_given  = fill;
407:   ptap->AP_loc->info.fill_ratio_needed = apfill;

409:   if (api[am]) {
410:     PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
411:     PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
412:   } else {
413:     PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");
414:   }
415: #endif

417:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
418:   /* ------------------------------------ */
419:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);

421:   /* (3) send coj of C_oth to other processors  */
422:   /* ------------------------------------------ */
423:   /* determine row ownership */
424:   PetscLayoutCreate(comm,&rowmap);
425:   rowmap->n  = pn;
426:   rowmap->bs = 1;
427:   PetscLayoutSetUp(rowmap);
428:   owners = rowmap->range;

430:   /* determine the number of messages to send, their lengths */
431:   PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
432:   PetscMemzero(len_s,size*sizeof(PetscMPIInt));
433:   PetscMemzero(len_si,size*sizeof(PetscMPIInt));

435:   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
436:   coi   = c_oth->i; coj = c_oth->j;
437:   con   = ptap->C_oth->rmap->n;
438:   proc  = 0;
439:   for (i=0; i<con; i++) {
440:     while (prmap[i] >= owners[proc+1]) proc++;
441:     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
442:     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
443:   }

445:   len          = 0; /* max length of buf_si[], see (4) */
446:   owners_co[0] = 0;
447:   nsend        = 0;
448:   for (proc=0; proc<size; proc++) {
449:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
450:     if (len_s[proc]) {
451:       nsend++;
452:       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
453:       len         += len_si[proc];
454:     }
455:   }

457:   /* determine the number and length of messages to receive for coi and coj  */
458:   PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
459:   PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);

461:   /* post the Irecv and Isend of coj */
462:   PetscCommGetNewTag(comm,&tagj);
463:   PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
464:   PetscMalloc1(nsend+1,&swaits);
465:   for (proc=0, k=0; proc<size; proc++) {
466:     if (!len_s[proc]) continue;
467:     i    = owners_co[proc];
468:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
469:     k++;
470:   }

472:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
473:   /* ---------------------------------------- */
474:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
475:   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

477:   /* receives of coj are complete */
478:   for (i=0; i<nrecv; i++) {
479:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
480:   }
481:   PetscFree(rwaits);
482:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

484:   /* add received column indices into ta to update Crmax */
485:   for (k=0; k<nrecv; k++) {/* k-th received message */
486:     Jptr = buf_rj[k];
487:     for (j=0; j<len_r[k]; j++) {
488:       PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
489:     }
490:   }
491:   PetscTableGetCount(ta,&Crmax);
492:   PetscTableDestroy(&ta);

494:   /* (4) send and recv coi */
495:   /*-----------------------*/
496:   PetscCommGetNewTag(comm,&tagi);
497:   PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
498:   PetscMalloc1(len+1,&buf_s);
499:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
500:   for (proc=0,k=0; proc<size; proc++) {
501:     if (!len_s[proc]) continue;
502:     /* form outgoing message for i-structure:
503:          buf_si[0]:                 nrows to be sent
504:                [1:nrows]:           row index (global)
505:                [nrows+1:2*nrows+1]: i-structure index
506:     */
507:     /*-------------------------------------------*/
508:     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
509:     buf_si_i    = buf_si + nrows+1;
510:     buf_si[0]   = nrows;
511:     buf_si_i[0] = 0;
512:     nrows       = 0;
513:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
514:       nzi = coi[i+1] - coi[i];
515:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
516:       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
517:       nrows++;
518:     }
519:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
520:     k++;
521:     buf_si += len_si[proc];
522:   }
523:   for (i=0; i<nrecv; i++) {
524:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
525:   }
526:   PetscFree(rwaits);
527:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

529:   PetscFree4(len_s,len_si,sstatus,owners_co);
530:   PetscFree(len_ri);
531:   PetscFree(swaits);
532:   PetscFree(buf_s);

534:   /* (5) compute the local portion of Cmpi      */
535:   /* ------------------------------------------ */
536:   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
537:   PetscFreeSpaceGet(Crmax,&free_space);
538:   current_space = free_space;

540:   PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
541:   for (k=0; k<nrecv; k++) {
542:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
543:     nrows       = *buf_ri_k[k];
544:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
545:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
546:   }

548:   MatPreallocateInitialize(comm,pn,pn,dnz,onz);
549:   PetscLLCondensedCreate_Scalable(Crmax,&lnk);
550:   for (i=0; i<pn; i++) {
551:     /* add C_loc into Cmpi */
552:     nzi  = c_loc->i[i+1] - c_loc->i[i];
553:     Jptr = c_loc->j + c_loc->i[i];
554:     PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);

556:     /* add received col data into lnk */
557:     for (k=0; k<nrecv; k++) { /* k-th received message */
558:       if (i == *nextrow[k]) { /* i-th row */
559:         nzi  = *(nextci[k]+1) - *nextci[k];
560:         Jptr = buf_rj[k] + *nextci[k];
561:         PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
562:         nextrow[k]++; nextci[k]++;
563:       }
564:     }
565:     nzi = lnk[0];

567:     /* copy data into free space, then initialize lnk */
568:     PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);
569:     MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
570:   }
571:   PetscFree3(buf_ri_k,nextrow,nextci);
572:   PetscLLCondensedDestroy_Scalable(lnk);
573:   PetscFreeSpaceDestroy(free_space);

575:   /* local sizes and preallocation */
576:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
577:   MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
578:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
579:   MatPreallocateFinalize(dnz,onz);

581:   /* members in merge */
582:   PetscFree(id_r);
583:   PetscFree(len_r);
584:   PetscFree(buf_ri[0]);
585:   PetscFree(buf_ri);
586:   PetscFree(buf_rj[0]);
587:   PetscFree(buf_rj);
588:   PetscLayoutDestroy(&rowmap);

590:   /* attach the supporting struct to Cmpi for reuse */
591:   c = (Mat_MPIAIJ*)Cmpi->data;
592:   c->ptap         = ptap;
593:   ptap->duplicate = Cmpi->ops->duplicate;
594:   ptap->destroy   = Cmpi->ops->destroy;
595:   ptap->view      = Cmpi->ops->view;

597:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
598:   Cmpi->assembled        = PETSC_FALSE;
599:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
600:   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
601:   Cmpi->ops->duplicate   = MatDuplicate_MPIAIJ_MatPtAP;
602:   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
603:   *C                     = Cmpi;
604:   return(0);
605: }
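
      For reference, a sketch of how one received i-structure message, laid out as documented in
      step (4) above, is decoded. Here buf_si stands for one received i-structure message and
      buf_rj_k for its matching coj message; both names are illustrative only, and the loop in
      step (5) performs the same walk through the nextrow[]/nextci[] cursors instead:

        PetscInt       nrows    = buf_si[0];          /* number of Co rows packed in this message */
        const PetscInt *rowidx  = buf_si + 1;         /* local row indices on the receiving rank */
        const PetscInt *istruct = buf_si + nrows + 1; /* CSR-style offsets into the coj message */
        PetscInt       r,nz;
        for (r=0; r<nrows; r++) {
          nz = istruct[r+1] - istruct[r];             /* nonzeros contributed to local row rowidx[r] */
          /* their column indices are buf_rj_k[istruct[r]] .. buf_rj_k[istruct[r]+nz-1] */
        }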

607: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
608: {
609:   PetscErrorCode      ierr;
610:   Mat_PtAPMPI         *ptap;
611:   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
612:   MPI_Comm            comm;
613:   PetscMPIInt         size,rank;
614:   Mat                 Cmpi;
615:   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
616:   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
617:   PetscInt            *lnk,i,k,pnz,row,nsend;
618:   PetscBT             lnkbt;
619:   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
620:   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
621:   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
622:   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
623:   MPI_Request         *swaits,*rwaits;
624:   MPI_Status          *sstatus,rstatus;
625:   PetscLayout         rowmap;
626:   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
627:   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
628:   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
629:   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
630:   PetscScalar         *apv;
631:   PetscTable          ta;
632: #if defined(PETSC_USE_INFO)
633:   PetscReal           apfill;
634: #endif

637:   PetscObjectGetComm((PetscObject)A,&comm);
638:   MPI_Comm_size(comm,&size);
639:   MPI_Comm_rank(comm,&rank);

641:   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

643:   /* create symbolic parallel matrix Cmpi */
644:   MatCreate(comm,&Cmpi);
645:   MatSetType(Cmpi,MATMPIAIJ);

647:   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
648:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

650:   /* create struct Mat_PtAPMPI and attach it to C later */
651:   PetscNew(&ptap);
652:   ptap->reuse = MAT_INITIAL_MATRIX;
653:   ptap->algType = 1;

655:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
656:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
657:   /* get P_loc by taking all local rows of P */
658:   MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);

660:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
661:   /* --------------------------------- */
662:   MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
663:   MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);

665:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
666:   /* ----------------------------------------------------------------- */
667:   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
668:   if (ptap->P_oth) p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;

670:   /* create and initialize a linked list */
671:   PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
672:   MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
673:   MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
674:   PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
675:   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

677:   PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);

679:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
680:   if (ao) {
681:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
682:   } else {
683:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
684:   }
685:   current_space = free_space;
686:   nspacedouble  = 0;

688:   PetscMalloc1(am+1,&api);
689:   api[0] = 0;
690:   for (i=0; i<am; i++) {
691:     /* diagonal portion: Ad[i,:]*P */
692:     ai = ad->i; pi = p_loc->i;
693:     nzi = ai[i+1] - ai[i];
694:     aj  = ad->j + ai[i];
695:     for (j=0; j<nzi; j++) {
696:       row  = aj[j];
697:       pnz  = pi[row+1] - pi[row];
698:       Jptr = p_loc->j + pi[row];
699:       /* add non-zero cols of P into the sorted linked list lnk */
700:       PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
701:     }
702:     /* off-diagonal portion: Ao[i,:]*P */
703:     if (ao) {
704:       ai = ao->i; pi = p_oth->i;
705:       nzi = ai[i+1] - ai[i];
706:       aj  = ao->j + ai[i];
707:       for (j=0; j<nzi; j++) {
708:         row  = aj[j];
709:         pnz  = pi[row+1] - pi[row];
710:         Jptr = p_oth->j + pi[row];
711:         PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
712:       }
713:     }
714:     apnz     = lnk[0];
715:     api[i+1] = api[i] + apnz;
716:     if (ap_rmax < apnz) ap_rmax = apnz;

718:     /* if free space is not available, double the total space in the list */
719:     if (current_space->local_remaining<apnz) {
720:       PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
721:       nspacedouble++;
722:     }

724:     /* Copy data into free space, then initialize lnk */
725:     PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);

727:     current_space->array           += apnz;
728:     current_space->local_used      += apnz;
729:     current_space->local_remaining -= apnz;
730:   }
731:   /* Allocate space for apj and apv, initialize apj, and */
732:   /* destroy list of free space and other temporary array(s) */
733:   PetscMalloc2(api[am],&apj,api[am],&apv);
734:   PetscFreeSpaceContiguous(&free_space,apj);
735:   PetscLLDestroy(lnk,lnkbt);

737:   /* Create AP_loc for reuse */
738:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);

740: #if defined(PETSC_USE_INFO)
741:   if (ao) {
742:     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
743:   } else {
744:     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
745:   }
746:   ptap->AP_loc->info.mallocs           = nspacedouble;
747:   ptap->AP_loc->info.fill_ratio_given  = fill;
748:   ptap->AP_loc->info.fill_ratio_needed = apfill;

750:   if (api[am]) {
751:     PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
752:     PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
753:   } else {
754:     PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");
755:   }
756: #endif

758:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
759:   /* ------------------------------------ */
760:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);

762:   /* (3) send coj of C_oth to other processors  */
763:   /* ------------------------------------------ */
764:   /* determine row ownership */
765:   PetscLayoutCreate(comm,&rowmap);
766:   rowmap->n  = pn;
767:   rowmap->bs = 1;
768:   PetscLayoutSetUp(rowmap);
769:   owners = rowmap->range;

771:   /* determine the number of messages to send, their lengths */
772:   PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
773:   PetscMemzero(len_s,size*sizeof(PetscMPIInt));
774:   PetscMemzero(len_si,size*sizeof(PetscMPIInt));

776:   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
777:   coi   = c_oth->i; coj = c_oth->j;
778:   con   = ptap->C_oth->rmap->n;
779:   proc  = 0;
780:   for (i=0; i<con; i++) {
781:     while (prmap[i] >= owners[proc+1]) proc++;
782:     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
783:     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
784:   }

786:   len          = 0; /* max length of buf_si[], see (4) */
787:   owners_co[0] = 0;
788:   nsend        = 0;
789:   for (proc=0; proc<size; proc++) {
790:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
791:     if (len_s[proc]) {
792:       nsend++;
793:       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
794:       len         += len_si[proc];
795:     }
796:   }

798:   /* determine the number and length of messages to receive for coi and coj  */
799:   PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
800:   PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);

802:   /* post the Irecv and Isend of coj */
803:   PetscCommGetNewTag(comm,&tagj);
804:   PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
805:   PetscMalloc1(nsend+1,&swaits);
806:   for (proc=0, k=0; proc<size; proc++) {
807:     if (!len_s[proc]) continue;
808:     i    = owners_co[proc];
809:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
810:     k++;
811:   }

813:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
814:   /* ---------------------------------------- */
815:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
816:   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

818:   /* receives of coj are complete */
819:   for (i=0; i<nrecv; i++) {
820:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
821:   }
822:   PetscFree(rwaits);
823:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

825:   /* add received column indices into ta to update Crmax */
826:   for (k=0; k<nrecv; k++) {/* k-th received message */
827:     Jptr = buf_rj[k];
828:     for (j=0; j<len_r[k]; j++) {
829:       PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
830:     }
831:   }
832:   PetscTableGetCount(ta,&Crmax);
833:   PetscTableDestroy(&ta);

835:   /* (4) send and recv coi */
836:   /*-----------------------*/
837:   PetscCommGetNewTag(comm,&tagi);
838:   PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
839:   PetscMalloc1(len+1,&buf_s);
840:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
841:   for (proc=0,k=0; proc<size; proc++) {
842:     if (!len_s[proc]) continue;
843:     /* form outgoing message for i-structure:
844:          buf_si[0]:                 nrows to be sent
845:                [1:nrows]:           row index (global)
846:                [nrows+1:2*nrows+1]: i-structure index
847:     */
848:     /*-------------------------------------------*/
849:     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
850:     buf_si_i    = buf_si + nrows+1;
851:     buf_si[0]   = nrows;
852:     buf_si_i[0] = 0;
853:     nrows       = 0;
854:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
855:       nzi = coi[i+1] - coi[i];
856:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
857:       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
858:       nrows++;
859:     }
860:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
861:     k++;
862:     buf_si += len_si[proc];
863:   }
864:   for (i=0; i<nrecv; i++) {
865:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
866:   }
867:   PetscFree(rwaits);
868:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

870:   PetscFree4(len_s,len_si,sstatus,owners_co);
871:   PetscFree(len_ri);
872:   PetscFree(swaits);
873:   PetscFree(buf_s);

875:   /* (5) compute the local portion of Cmpi      */
876:   /* ------------------------------------------ */
877:   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
878:   PetscFreeSpaceGet(Crmax,&free_space);
879:   current_space = free_space;

881:   PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
882:   for (k=0; k<nrecv; k++) {
883:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
884:     nrows       = *buf_ri_k[k];
885:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
886:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
887:   }

889:   MatPreallocateInitialize(comm,pn,pn,dnz,onz);
890:   PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
891:   for (i=0; i<pn; i++) {
892:     /* add C_loc into Cmpi */
893:     nzi  = c_loc->i[i+1] - c_loc->i[i];
894:     Jptr = c_loc->j + c_loc->i[i];
895:     PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);

897:     /* add received col data into lnk */
898:     for (k=0; k<nrecv; k++) { /* k-th received message */
899:       if (i == *nextrow[k]) { /* i-th row */
900:         nzi  = *(nextci[k]+1) - *nextci[k];
901:         Jptr = buf_rj[k] + *nextci[k];
902:         PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
903:         nextrow[k]++; nextci[k]++;
904:       }
905:     }
906:     nzi = lnk[0];

908:     /* copy data into free space, then initialize lnk */
909:     PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);
910:     MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
911:   }
912:   PetscFree3(buf_ri_k,nextrow,nextci);
913:   PetscLLDestroy(lnk,lnkbt);
914:   PetscFreeSpaceDestroy(free_space);

916:   /* local sizes and preallocation */
917:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
918:   MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
919:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
920:   MatPreallocateFinalize(dnz,onz);

922:   /* members in merge */
923:   PetscFree(id_r);
924:   PetscFree(len_r);
925:   PetscFree(buf_ri[0]);
926:   PetscFree(buf_ri);
927:   PetscFree(buf_rj[0]);
928:   PetscFree(buf_rj);
929:   PetscLayoutDestroy(&rowmap);

931:   /* attach the supporting struct to Cmpi for reuse */
932:   c = (Mat_MPIAIJ*)Cmpi->data;
933:   c->ptap         = ptap;
934:   ptap->duplicate = Cmpi->ops->duplicate;
935:   ptap->destroy   = Cmpi->ops->destroy;
936:   ptap->view      = Cmpi->ops->view;
937:   PetscCalloc1(pN,&ptap->apa);

939:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
940:   Cmpi->assembled        = PETSC_FALSE;
941:   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
942:   Cmpi->ops->duplicate   = MatDuplicate_MPIAIJ_MatPtAP;
943:   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
944:   *C                     = Cmpi;
945:   return(0);
946: }
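
      A rough illustration of why this variant is labeled nonscalable: PetscLLCondensedCreate()
      above carries a bit array of pN bits, and PetscCalloc1(pN,&ptap->apa) gives the numeric
      kernel a dense work array of pN PetscScalars. With real double-precision scalars and
      pN = 10^6 global columns of P, that is roughly 8 MB + 0.125 MB of workspace per process
      regardless of sparsity, whereas the scalable variant's workspace is bounded by Crmax,
      the largest merged row length.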

948: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
949: {
950:   PetscErrorCode    ierr;
951:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
952:   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
953:   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
954:   Mat_PtAPMPI       *ptap = c->ptap;
955:   Mat               AP_loc,C_loc,C_oth;
956:   PetscInt          i,rstart,rend,cm,ncols,row;
957:   PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
958:   PetscScalar       *apa;
959:   const PetscInt    *cols;
960:   const PetscScalar *vals;

963:   MatZeroEntries(C);

965:   /* 1) get R = Pd^T,Ro = Po^T */
966:   if (ptap->reuse == MAT_REUSE_MATRIX) {
967:     MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
968:     MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
969:   }

971:   /* 2) get AP_loc */
972:   AP_loc = ptap->AP_loc;
973:   ap = (Mat_SeqAIJ*)AP_loc->data;

975:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
976:   /*-----------------------------------------------------*/
977:   if (ptap->reuse == MAT_REUSE_MATRIX) {
978:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
979:     MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
980:     MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
981:   }

983:   /* 2-2) compute numeric A_loc*P - dominating part */
984:   /* ---------------------------------------------- */
985:   /* get data from symbolic products */
986:   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
987:   if (ptap->P_oth) {
988:     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
989:   }
990:   apa   = ptap->apa;
991:   api   = ap->i;
992:   apj   = ap->j;
993:   for (i=0; i<am; i++) {
994:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
995:     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
996:     apnz = api[i+1] - api[i];
997:     for (j=0; j<apnz; j++) {
998:       col = apj[j+api[i]];
999:       ap->a[j+ap->i[i]] = apa[col];
1000:       apa[col] = 0.0;
1001:     }
1002:     PetscLogFlops(2.0*apnz);
1003:   }

1005:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1006:   ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);
1007:   ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);
1008:   C_loc = ptap->C_loc;
1009:   C_oth = ptap->C_oth;

1011:   /* add C_loc and C_oth into C */
1012:   MatGetOwnershipRange(C,&rstart,&rend);

1014:   /* C_loc -> C */
1015:   cm    = C_loc->rmap->N;
1016:   c_seq = (Mat_SeqAIJ*)C_loc->data;
1017:   cols = c_seq->j;
1018:   vals = c_seq->a;
1019:   for (i=0; i<cm; i++) {
1020:     ncols = c_seq->i[i+1] - c_seq->i[i];
1021:     row = rstart + i;
1022:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
1023:     cols += ncols; vals += ncols;
1024:   }

1026:   /* Co -> C, off-processor part */
1027:   cm = C_oth->rmap->N;
1028:   c_seq = (Mat_SeqAIJ*)C_oth->data;
1029:   cols = c_seq->j;
1030:   vals = c_seq->a;
1031:   for (i=0; i<cm; i++) {
1032:     ncols = c_seq->i[i+1] - c_seq->i[i];
1033:     row = p->garray[i];
1034:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
1035:     cols += ncols; vals += ncols;
1036:   }
1037:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1038:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

1040:   ptap->reuse = MAT_REUSE_MATRIX;
1041:   return(0);
1042: }
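
      The gather loop above relies on the dense accumulator ptap->apa of length pN allocated in
      the symbolic phase: AProw_nonscalable() scatters A[i,:]*P into apa by global column index,
      and the loop over apnz then compresses the row into ap->a and zeroes only the touched
      entries. A schematic of that pattern (a sketch of the idea, not the actual
      AProw_nonscalable helper defined in the included headers):

        /* scatter: one row of Ad times P_loc (the Ao*P_oth contribution is analogous) */
        for (j=ad->i[i]; j<ad->i[i+1]; j++) {
          row = ad->j[j];                                    /* column of A = local row of P */
          for (k=p_loc->i[row]; k<p_loc->i[row+1]; k++)
            apa[p_loc->j[k]] += ad->a[j]*p_loc->a[k];        /* indexed by global column of P */
        }
        /* gather and reset, using the symbolic pattern api/apj computed earlier */
        for (j=api[i]; j<api[i+1]; j++) {
          ap->a[j]    = apa[apj[j]];
          apa[apj[j]] = 0.0;
        }

      The scalable numeric kernel earlier in this file avoids the length-pN buffer by working
      directly in the apnz-long slice apa = ap->a + api[i].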