Actual source code: mpiptap.c
2: /*
3:    Defines projective product routines where A is an MPIAIJ matrix
4: C = P^T * A * P
5: */
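/*
   Rough outline of the algorithm implemented below (a summary, not additional code):
     - MatPtAP_MPIAIJ_MPIAIJ gathers P_loc (all local rows of P) and P_oth (the rows
       of P matching the nonzero off-diagonal columns of the local part of A) and
       caches them in a 'Mat_MatMatMultMPI' container attached to P, so that a
       MAT_REUSE_MATRIX call can skip the setup.
     - MatPtAPSymbolic_MPIAIJ_MPIAIJ builds the nonzero pattern of AP = A_loc*P,
       then the pattern of the locally owned rows of C = P^T*AP; the portion
       Co = (p->B)^T*AP that belongs to rows owned by other processes is sent to
       those owners and merged into their row patterns.
     - MatPtAPNumeric_MPIAIJ_MPIAIJ repeats the traversal with values, exchanges
       the Co values, and assembles C with MatSetValues.
*/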
7: #include "src/mat/impls/aij/seq/aij.h"
8: #include "src/mat/utils/freespace.h"
9: #include "src/mat/impls/aij/mpi/mpiaij.h"
10: #include "petscbt.h"
14: PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
15: {
16: PetscErrorCode ierr;
17: Mat_MatMatMultMPI *mult;
18: PetscObjectContainer container;
21: if (scall == MAT_INITIAL_MATRIX){
22: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
23: /* destroy the container 'Mat_MatMatMultMPI' in case one is already attached to P */
24: PetscObjectContainerDestroy_Mat_MatMatMultMPI((PetscObject)P);
25: /* create the container 'Mat_MatMatMultMPI' and attach it to P */
26: PetscNew(Mat_MatMatMultMPI,&mult);
27: PetscMemzero(mult,sizeof(Mat_MatMatMultMPI));
28: mult->B_loc=PETSC_NULL; mult->B_oth=PETSC_NULL;
29: mult->abi=PETSC_NULL; mult->abj=PETSC_NULL;
30: mult->abnz_max = 0; /* symbolic A*P is not done yet */
32: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
33: MatGetBrowsOfAoCols(A,P,scall,&mult->startsj,&mult->bufa,&mult->B_oth);
34:
35: /* get P_loc by taking all local rows of P */
36: MatGetLocalMat(P,scall,&mult->B_loc);
38: /* attach the container 'Mat_MatMatMultMPI' to P */
39: P->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
40: PetscObjectContainerCreate(PETSC_COMM_SELF,&container);
41: PetscObjectContainerSetPointer(container,mult);
42: PetscObjectCompose((PetscObject)P,"Mat_MatMatMultMPI",(PetscObject)container);
43:
44: /* now, compute symbolic local P^T*A*P */
45: MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);/* numeric product is computed as well */
46: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
47: } else if (scall == MAT_REUSE_MATRIX){
48: PetscObjectQuery((PetscObject)P,"Mat_MatMatMultMPI",(PetscObject *)&container);
49: if (container) {
50: PetscObjectContainerGetPointer(container,(void **)&mult);
51: } else {
52: SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
53: }
54: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
55: MatGetBrowsOfAoCols(A,P,scall,&mult->startsj,&mult->bufa,&mult->B_oth);
56: /* get P_loc by taking all local rows of P */
57: MatGetLocalMat(P,scall,&mult->B_loc);
58: } else {
59: SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
60: }
61: /* now, compute numeric local P^T*A*P */
62: PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);
63: MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);
64: PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);
65:
66: return(0);
67: }
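/*
   Typical calling sequence from application code (a sketch; the fill value 1.0 is
   only an example estimate):

       Mat C;
       MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);    ... first call: symbolic + numeric
       ... change the numerical values of A and/or P, keeping their nonzero patterns ...
       MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);      ... reuses the data cached on P
       MatDestroy(C);
*/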
71: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
72: {
73: PetscErrorCode ierr;
74: Mat P_loc,P_oth,B_mpi;
75: Mat_MatMatMultMPI *ap;
76: PetscObjectContainer container;
77: FreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
78: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
79: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
80: Mat_SeqAIJ *p_loc,*p_oth;
81: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ;
82: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz;
83: PetscInt nlnk,*lnk,*owners_co,*coi,*coj,i,k,pnz,row;
84: PetscInt am=A->m,pN=P->N,pn=P->n;
85: PetscBT lnkbt;
86: MPI_Comm comm=A->comm;
87: PetscMPIInt size,rank,tag,*len_si,*len_s,*len_ri;
88: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
89: PetscInt len,proc,*dnz,*onz,*owners;
90: PetscInt nzi,*bi,*bj;
91: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
92: MPI_Request *swaits,*rwaits;
93: MPI_Status *sstatus,rstatus;
94: Mat_Merge_SeqsToMPI *merge;
95: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,pon;
96: PetscMPIInt j;
99: PetscObjectQuery((PetscObject)P,"Mat_MatMatMultMPI",(PetscObject *)&container);
100: if (container) {
101: PetscObjectContainerGetPointer(container,(void **)&ap);
102: } else {
103: SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
104: }
105: MPI_Comm_size(comm,&size);
106: MPI_Comm_rank(comm,&rank);
108: /* get data from P_loc and P_oth */
109: P_loc=ap->B_loc; P_oth=ap->B_oth;
110: p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
111: pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
113: /* first, compute symbolic AP = A_loc*P */
114: /*--------------------------------------*/
115: PetscMalloc((am+2)*sizeof(PetscInt),&api);
116: ap->abi = api;
117: api[0] = 0;
119: /* create and initialize a linked list */
120: nlnk = pN+1;
121: PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);
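  /* The PetscLL* macros maintain a sorted linked list of column indices in lnk,
     with membership tracked by the bit array lnkbt: PetscLLAdd() merges a row's
     columns into the list and returns in nlnk how many of them were new, and
     PetscLLClean() copies the sorted list out and resets it for the next row. */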
123: /* Initial FreeSpace size is fill*nnz(A) */
124: GetMoreSpace((PetscInt)(fill*(adi[am]+aoi[am])),&free_space);
125: current_space = free_space;
127: for (i=0;i<am;i++) {
128: apnz = 0;
129: /* diagonal portion of A */
130: nzi = adi[i+1] - adi[i];
131: for (j=0; j<nzi; j++){
132: row = *adj++;
133: pnz = pi_loc[row+1] - pi_loc[row];
134: Jptr = pj_loc + pi_loc[row];
135: /* add non-zero cols of P into the sorted linked list lnk */
136: PetscLLAdd(pnz,Jptr,pN,nlnk,lnk,lnkbt);
137: apnz += nlnk;
138: }
139: /* off-diagonal portion of A */
140: nzi = aoi[i+1] - aoi[i];
141: for (j=0; j<nzi; j++){
142: row = *aoj++;
143: pnz = pi_oth[row+1] - pi_oth[row];
144: Jptr = pj_oth + pi_oth[row];
145: PetscLLAdd(pnz,Jptr,pN,nlnk,lnk,lnkbt);
146: apnz += nlnk;
147: }
149: api[i+1] = api[i] + apnz;
150: if (ap->abnz_max < apnz) ap->abnz_max = apnz;
152: /* if free space is not available, double the total space in the list */
153: if (current_space->local_remaining<apnz) {
154: GetMoreSpace(current_space->total_array_size,&current_space);
155: }
157: /* Copy data into free space, then initialize lnk */
158: PetscLLClean(pN,pN,apnz,lnk,current_space->array,lnkbt);
159: current_space->array += apnz;
160: current_space->local_used += apnz;
161: current_space->local_remaining -= apnz;
162: }
163: /* Allocate space for apj, initialize apj, and */
164: /* destroy list of free space and other temporary array(s) */
165: PetscMalloc((api[am]+1)*sizeof(PetscInt),&ap->abj);
166: apj = ap->abj;
167: MakeSpaceContiguous(&free_space,ap->abj);
169: /* determine symbolic Co=(p->B)^T*AP - send to others */
170: /*----------------------------------------------------*/
171: MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);
173: /* then, compute symbolic Co = (p->B)^T*AP */
174: pon = (p->B)->n; /* total num of rows to be sent to other processors
175: >= (num of nonzero rows of C_seq) - pn */
176: PetscMalloc((pon+1)*sizeof(PetscInt),&coi);
177: coi[0] = 0;
179: /* set initial free space to be 3*pon*max( nnz(AP) per row) */
180: nnz = 3*pon*ap->abnz_max + 1;
181: GetMoreSpace(nnz,&free_space);
182: current_space = free_space;
184: for (i=0; i<pon; i++) {
185: nnz = 0;
186: pnz = poti[i+1] - poti[i];
187: j = pnz;
188: ptJ = potj + poti[i+1];
189: while (j){/* assume cols are almost in increasing order; starting from the end saves computation */
190: j--; ptJ--;
191: row = *ptJ; /* row of AP == col of Pot */
192: apnz = api[row+1] - api[row];
193: Jptr = apj + api[row];
194: /* add non-zero cols of AP into the sorted linked list lnk */
195: PetscLLAdd(apnz,Jptr,pN,nlnk,lnk,lnkbt);
196: nnz += nlnk;
197: }
199: /* If free space is not available, double the total space in the list */
200: if (current_space->local_remaining<nnz) {
201: GetMoreSpace(current_space->total_array_size,&current_space);
202: }
204: /* Copy data into free space, and zero out dense rows */
205: PetscLLClean(pN,pN,nnz,lnk,current_space->array,lnkbt);
206: current_space->array += nnz;
207: current_space->local_used += nnz;
208: current_space->local_remaining -= nnz;
209: coi[i+1] = coi[i] + nnz;
210: }
211: PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);
212: MakeSpaceContiguous(&free_space,coj);
213: MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);
215: /* send j-array (coj) of Co to other processors */
216: /*----------------------------------------------*/
217: /* determine row ownership */
218: PetscNew(Mat_Merge_SeqsToMPI,&merge);
219: PetscMemzero(merge,sizeof(Mat_Merge_SeqsToMPI));
220: PetscMapCreate(comm,&merge->rowmap);
221: PetscMapSetLocalSize(merge->rowmap,pn);
222: PetscMapSetType(merge->rowmap,MAP_MPI);
223: PetscMapGetGlobalRange(merge->rowmap,&owners);
225: /* determine the number of messages to send, their lengths */
226: PetscMalloc(size*sizeof(PetscMPIInt),&len_si);
227: PetscMemzero(len_si,size*sizeof(PetscMPIInt));
228: PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);
229: len_s = merge->len_s;
230: merge->nsend = 0;
231:
232: PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);
233: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
235: proc = 0;
236: for (i=0; i<pon; i++){
237: while (prmap[i] >= owners[proc+1]) proc++;
238: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
239: len_s[proc] += coi[i+1] - coi[i];
240: }
242: len = 0; /* max length of buf_si[] */
243: owners_co[0] = 0;
244: for (proc=0; proc<size; proc++){
245: owners_co[proc+1] = owners_co[proc] + len_si[proc];
246: if (len_si[proc]){
247: merge->nsend++;
248: len_si[proc] = 2*(len_si[proc] + 1);
249: len += len_si[proc];
250: }
251: }
253: /* determine the number and length of messages to receive for coi and coj */
254: PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);
255: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
257: /* post the Irecv and Isend of coj */
258: PetscObjectGetNewTag((PetscObject)merge->rowmap,&tag);
259: PetscPostIrecvInt(comm,tag,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
261: PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);
263: for (proc=0, k=0; proc<size; proc++){
264: if (!len_s[proc]) continue;
265: i = owners_co[proc];
266: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tag,comm,swaits+k);
267: k++;
268: }
270: /* receives and sends of coj are complete */
271: PetscMalloc(size*sizeof(MPI_Status),&sstatus);
272: i = merge->nrecv;
273: while (i--) {
274: MPI_Waitany(merge->nrecv,rwaits,&j,&rstatus);
275: }
276: PetscFree(rwaits);
277: if (merge->nsend){
278: MPI_Waitall(merge->nsend,swaits,sstatus);
279: }
280:
281: /* send and recv coi */
282: /*-------------------*/
283: PetscPostIrecvInt(comm,tag,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
284:
285: PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);
286: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
287: for (proc=0,k=0; proc<size; proc++){
288: if (!len_s[proc]) continue;
289: /* form outgoing message for i-structure:
290: buf_si[0]: nrows to be sent
291: [1:nrows]: row index (local to the receiving process)
292: [nrows+1:2*nrows+1]: i-structure index
293: */
294: /*-------------------------------------------*/
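    /* For example (illustrative only): a message carrying two rows with 3 and 5
       nonzeros, destined for receiver-local rows r0 and r1, is laid out as
           buf_si = [ 2, r0, r1, 0, 3, 8 ]
       i.e. the row count, the row indices, then CSR-style offsets into the
       corresponding coj data already sent above. */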
295: nrows = len_si[proc]/2 - 1;
296: buf_si_i = buf_si + nrows+1;
297: buf_si[0] = nrows;
298: buf_si_i[0] = 0;
299: nrows = 0;
300: for (i=owners_co[proc]; i<owners_co[proc+1]; i++){
301: nzi = coi[i+1] - coi[i];
302: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
303: buf_si[nrows+1] = prmap[i] - owners[proc]; /* local row index */
304: nrows++;
305: }
306: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tag,comm,swaits+k);
307: k++;
308: buf_si += len_si[proc];
309: }
310: i = merge->nrecv;
311: while (i--) {
312: MPI_Waitany(merge->nrecv,rwaits,&j,&rstatus);
313: }
314: PetscFree(rwaits);
315: if (merge->nsend){
316: MPI_Waitall(merge->nsend,swaits,sstatus);
317: }
319: PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);
320: for (i=0; i<merge->nrecv; i++){
321: PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);
322: }
324: PetscFree(len_si);
325: PetscFree(len_ri);
326: PetscFree(swaits);
327: PetscFree(sstatus);
328: PetscFree(buf_s);
330: /* compute the local portion of C (mpi mat) */
331: /*------------------------------------------*/
332: MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);
334: /* allocate bi array and free space for accumulating nonzero column info */
335: PetscMalloc((pn+1)*sizeof(PetscInt),&bi);
336: bi[0] = 0;
337:
338: /* set initial free space to be 3*pn*max( nnz(AP) per row) */
339: nnz = 3*pn*ap->abnz_max + 1;
340: GetMoreSpace(nnz,&free_space);
341: current_space = free_space;
343: PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);
344: nextrow = buf_ri_k + merge->nrecv;
345: nextci = nextrow + merge->nrecv;
346: for (k=0; k<merge->nrecv; k++){
347: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
348: nrows = *buf_ri_k[k];
349: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th received i-structure */
350: nextci[k] = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of k-th received i-structure */
351: }
352: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
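  /* dnz/onz count, for each locally owned row of C, the nonzeros that fall in the
     diagonal and off-diagonal blocks; MatPreallocateSet() below records one row's
     column pattern, and MatPreallocateFinalize() frees the two arrays after they
     have been passed to MatMPIAIJSetPreallocation(). */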
353: for (i=0; i<pn; i++) {
354: /* add pdt[i,:]*AP into lnk */
355: nnz = 0;
356: pnz = pdti[i+1] - pdti[i];
357: j = pnz;
358: ptJ = pdtj + pdti[i+1];
359: while (j){/* assume cols are almost in increasing order; starting from the end saves computation */
360: j--; ptJ--;
361: row = *ptJ; /* row of AP == col of Pt */
362: apnz = api[row+1] - api[row];
363: Jptr = apj + api[row];
364: /* add non-zero cols of AP into the sorted linked list lnk */
365: PetscLLAdd(apnz,Jptr,pN,nlnk,lnk,lnkbt);
366: nnz += nlnk;
367: }
368: /* add received col data into lnk */
369: for (k=0; k<merge->nrecv; k++){ /* k-th received message */
370: if (i == *nextrow[k]) { /* i-th row */
371: nzi = *(nextci[k]+1) - *nextci[k];
372: Jptr = buf_rj[k] + *nextci[k];
373: PetscLLAdd(nzi,Jptr,pN,nlnk,lnk,lnkbt);
374: nnz += nlnk;
375: nextrow[k]++; nextci[k]++;
376: }
377: }
379: /* if free space is not available, make more free space */
380: if (current_space->local_remaining<nnz) {
381: GetMoreSpace(current_space->total_array_size,&current_space);
382: }
383: /* copy data into free space, then initialize lnk */
384: PetscLLClean(pN,pN,nnz,lnk,current_space->array,lnkbt);
385: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
386: current_space->array += nnz;
387: current_space->local_used += nnz;
388: current_space->local_remaining -= nnz;
389: bi[i+1] = bi[i] + nnz;
390: }
391: MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);
392: PetscFree(buf_ri_k);
394: PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);
395: MakeSpaceContiguous(&free_space,bj);
396: PetscLLDestroy(lnk,lnkbt);
398: /* create symbolic parallel matrix B_mpi */
399: /*---------------------------------------*/
400: MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);
401: MatSetType(B_mpi,MATMPIAIJ);
402: MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);
403: MatPreallocateFinalize(dnz,onz);
405: /* B_mpi is not ready for use - assembly will be done by MatPtAPNumeric() */
406: B_mpi->assembled = PETSC_FALSE;
407: B_mpi->ops->destroy = MatDestroy_MPIAIJ_SeqsToMPI;
408: merge->bi = bi;
409: merge->bj = bj;
410: merge->coi = coi;
411: merge->coj = coj;
412: merge->buf_ri = buf_ri;
413: merge->buf_rj = buf_rj;
414: merge->owners_co = owners_co;
416: /* attach the supporting struct to B_mpi for reuse */
417: PetscObjectContainerCreate(PETSC_COMM_SELF,&container);
418: PetscObjectContainerSetPointer(container,merge);
419: PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);
420: *C = B_mpi;
421:
422: return(0);
423: }
427: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
428: {
429: PetscErrorCode ierr;
430: Mat_Merge_SeqsToMPI *merge;
431: Mat_MatMatMultMPI *ap;
432: PetscObjectContainer cont_merge,cont_ptap;
433: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
434: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
435: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
436: Mat_SeqAIJ *p_loc,*p_oth;
437: PetscInt *adi=ad->i,*aoi=ao->i,*adj=ad->j,*aoj=ao->j,*apJ,nextp,flops=0;
438: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pj;
439: PetscInt i,j,k,anz,pnz,apnz,nextap,row,*cj;
440: MatScalar *ada=ad->a,*aoa=ao->a,*apa,*pa,*ca,*pa_loc,*pa_oth;
441: PetscInt am=A->m,cm=C->m,pon=(p->B)->n;
442: MPI_Comm comm=C->comm;
443: PetscMPIInt size,rank,taga,*len_s;
444: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
445: PetscInt **buf_ri,**buf_rj;
446: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
447: MPI_Request *s_waits,*r_waits;
448: MPI_Status *status;
449: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
450: PetscInt *api,*apj,*coi,*coj;
451: PetscInt *poJ=po->j,*pdJ=pd->j,pcstart=p->cstart,pcend=p->cend;
454: MPI_Comm_size(comm,&size);
455: MPI_Comm_rank(comm,&rank);
457: PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);
458: if (cont_merge) {
459: PetscObjectContainerGetPointer(cont_merge,(void **)&merge);
460: } else {
461: SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not possess an object container");
462: }
463: PetscObjectQuery((PetscObject)P,"Mat_MatMatMultMPI",(PetscObject *)&cont_ptap);
464: if (cont_ptap) {
465: PetscObjectContainerGetPointer(cont_ptap,(void **)&ap);
466: } else {
467: SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
468: }
469:
470: /* get data from symbolic products */
471: p_loc = (Mat_SeqAIJ*)(ap->B_loc)->data;
472: p_oth = (Mat_SeqAIJ*)(ap->B_oth)->data;
473: pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a,pA=pa_loc;
474: pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
475:
476: coi = merge->coi; coj = merge->coj;
477: PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);
478: PetscMemzero(coa,coi[pon]*sizeof(MatScalar));
480: bi = merge->bi; bj = merge->bj;
481: PetscMapGetGlobalRange(merge->rowmap,&owners);
482: PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);
483: PetscMemzero(ba,bi[cm]*sizeof(MatScalar));
485: /* get data from symbolic A*P */
486: PetscMalloc((ap->abnz_max+1)*sizeof(MatScalar),&apa);
487: PetscMemzero(apa,ap->abnz_max*sizeof(MatScalar));
489: /* compute numeric C_seq=P_loc^T*A_loc*P */
490: api = ap->abi; apj = ap->abj;
491: for (i=0;i<am;i++) {
492: /* form i-th sparse row of A*P */
493: apnz = api[i+1] - api[i];
494: apJ = apj + api[i];
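    /* apa is a work array of length ap->abnz_max holding the values of the current
       row of AP, stored in the column order given by apJ; it is zeroed again at the
       bottom of this loop before the next row is formed. */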
495: /* diagonal portion of A */
496: anz = adi[i+1] - adi[i];
497: for (j=0;j<anz;j++) {
498: row = *adj++;
499: pnz = pi_loc[row+1] - pi_loc[row];
500: pj = pj_loc + pi_loc[row];
501: pa = pa_loc + pi_loc[row];
502: nextp = 0;
503: for (k=0; nextp<pnz; k++) {
504: if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
505: apa[k] += (*ada)*pa[nextp++];
506: }
507: }
508: flops += 2*pnz;
509: ada++;
510: }
511: /* off-diagonal portion of A */
512: anz = aoi[i+1] - aoi[i];
513: for (j=0; j<anz; j++) {
514: row = *aoj++;
515: pnz = pi_oth[row+1] - pi_oth[row];
516: pj = pj_oth + pi_oth[row];
517: pa = pa_oth + pi_oth[row];
518: nextp = 0;
519: for (k=0; nextp<pnz; k++) {
520: if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
521: apa[k] += (*aoa)*pa[nextp++];
522: }
523: }
524: flops += 2*pnz;
525: aoa++;
526: }
528: /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
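    /* Each nonzero P_loc(i,row) scales the sparse row AP[i,:] and adds it into row
       'row' of the product: values destined for rows owned by other processes go
       into Co (coa/coj), values for locally owned rows go into ba/bj.  poJ and pdJ
       walk the off-diagonal and diagonal column indices of P in step with pJ. */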
529: pnz = pi_loc[i+1] - pi_loc[i];
530: for (j=0; j<pnz; j++) {
531: nextap = 0;
532: row = *pJ++; /* global index */
533: if (row < pcstart || row >=pcend) { /* put the value into Co */
534: cj = coj + coi[*poJ];
535: ca = coa + coi[*poJ++];
536: } else { /* put the value into Cd */
537: cj = bj + bi[*pdJ];
538: ca = ba + bi[*pdJ++];
539: }
540: for (k=0; nextap<apnz; k++) {
541: if (cj[k]==apJ[nextap]) ca[k] += (*pA)*apa[nextap++];
542: }
543: flops += 2*apnz;
544: pA++;
545: }
547: /* zero the current row info for A*P */
548: PetscMemzero(apa,apnz*sizeof(MatScalar));
549: }
550: PetscFree(apa);
551:
552: /* send and recv matrix values */
553: /*-----------------------------*/
554: buf_ri = merge->buf_ri;
555: buf_rj = merge->buf_rj;
556: len_s = merge->len_s;
557: PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);
558: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
560: PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);
561: for (proc=0,k=0; proc<size; proc++){
562: if (!len_s[proc]) continue;
563: i = merge->owners_co[proc];
564: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
565: k++;
566: }
567: PetscMalloc(size*sizeof(MPI_Status),&status);
568: MPI_Waitall(merge->nrecv,r_waits,status);
569: MPI_Waitall(merge->nsend,s_waits,status);
570: PetscFree(status);
572: PetscFree(s_waits);
573: PetscFree(r_waits);
574: PetscFree(coa);
576: /* insert local and received values into C */
577: /*-----------------------------------------*/
578: PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);
579: nextrow = buf_ri_k + merge->nrecv;
580: nextci = nextrow + merge->nrecv;
582: for (k=0; k<merge->nrecv; k++){
583: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
584: nrows = *(buf_ri_k[k]);
585: nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th received i-structure */
586: nextci[k] = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of k-th received i-structure */
587: }
589: for (i=0; i<cm; i++) {
590: row = owners[rank] + i; /* global row index of C_seq */
591: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
592: ba_i = ba + bi[i];
593: bnz = bi[i+1] - bi[i];
594: /* add received vals into ba */
595: for (k=0; k<merge->nrecv; k++){ /* k-th received message */
596: /* i-th row */
597: if (i == *nextrow[k]) {
598: cnz = *(nextci[k]+1) - *nextci[k];
599: cj = buf_rj[k] + *(nextci[k]);
600: ca = abuf_r[k] + *(nextci[k]);
601: nextcj = 0;
602: for (j=0; nextcj<cnz; j++){
603: if (bj_i[j] == cj[nextcj]){ /* bcol == ccol */
604: ba_i[j] += ca[nextcj++];
605: }
606: }
607: nextrow[k]++; nextci[k]++;
608: }
609: }
610: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
611: flops += 2*cnz;
612: }
613: MatSetOption(C,MAT_COLUMNS_SORTED); /* -cause delay? */
614: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
615: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
617: PetscFree(ba);
618: PetscFree(abuf_r);
619: PetscFree(buf_ri_k);
620: PetscLogFlops(flops);
621:
622: return(0);
623: }