Actual source code: mpiov.c

petsc-3.7.3 2016-08-01
  2: /*
  3:    Routines to compute overlapping regions of a parallel MPI matrix
  4:   and to find submatrices that were shared across processors.
  5: */
  6: #include <../src/mat/impls/aij/seq/aij.h>
  7: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  8: #include <petscbt.h>
  9: #include <petscsf.h>

 11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
 12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
 13: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
 14: extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
 15: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

 17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
 18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
 19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
 20: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);


 25: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
 26: {
 28:   PetscInt       i;

 31:   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
 32:   for (i=0; i<ov; ++i) {
 33:     MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
 34:   }
 35:   return(0);
 36: }
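/* Editorial usage sketch (not part of the PETSc source): the routines in this file are
   normally reached through the public MatIncreaseOverlap() interface, e.g. from PCASM.
   Assuming C is an assembled MPIAIJ matrix and the row indices below are made up, a
   caller might look like this (error checking omitted): */
#if 0
  IS       is;
  PetscInt rows[3] = {0,1,2};                      /* hypothetical row indices */
  ISCreateGeneral(PETSC_COMM_SELF,3,rows,PETSC_COPY_VALUES,&is);
  MatIncreaseOverlap(C,1,&is,2);                   /* expand is by two levels of overlap */
  ISDestroy(&is);
#endif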

 40: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
 41: {
 43:   PetscInt       i;

 46:   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
 47:   for (i=0; i<ov; ++i) {
 48:     MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);
 49:   }
 50:   return(0);
 51: }


 56: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
 57: {
 58:   PetscErrorCode   ierr;
 59:   MPI_Comm         comm;
 60:   PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
 61:   PetscInt         *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
 62:   PetscInt         nrecvrows,*sbsizes = 0,*sbdata = 0;
 63:   const PetscInt  *indices_i,**indices;
 64:   PetscLayout      rmap;
 65:   PetscMPIInt      rank,size,*toranks,*fromranks,nto,nfrom;
 66:   PetscSF          sf;
 67:   PetscSFNode     *remote;

 70:   PetscObjectGetComm((PetscObject)mat,&comm);
 71:   MPI_Comm_rank(comm,&rank);
 72:   MPI_Comm_size(comm,&size);
 73:   /* get row map to determine where rows should be going */
 74:   MatGetLayouts(mat,&rmap,PETSC_NULL);
 75:   /* retrieve IS data and put it all together so that we
 76:    * can optimize communication
 77:    */
 78:   PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);
 79:   for (i=0,tlength=0; i<nidx; i++){
 80:     ISGetLocalSize(is[i],&length[i]);
 81:     tlength += length[i];
 82:     ISGetIndices(is[i],&indices[i]);
 83:   }
 84:   /* find these rows on remote processors */
 85:   PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);
 86:   PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);
 87:   nrrows = 0;
 88:   for (i=0; i<nidx; i++){
 89:     length_i     = length[i];
 90:     indices_i    = indices[i];
 91:     for (j=0; j<length_i; j++){
 92:       owner = -1;
 93:       PetscLayoutFindOwner(rmap,indices_i[j],&owner);
 94:       /* remote processors */
 95:       if (owner != rank){
 96:         tosizes_temp[owner]++; /* number of rows to owner */
 97:         rrow_ranks[nrrows]  = owner; /* processor */
 98:         rrow_isids[nrrows]   = i; /* is id */
 99:         remoterows[nrrows++] = indices_i[j]; /* row */
100:       }
101:     }
102:     ISRestoreIndices(is[i],&indices[i]);
103:   }
104:   PetscFree2(indices,length);
105:   /* test if we need to exchange messages
106:    * generally speaking, we do not need to exchange
107:    * data when overlap is 1
108:    */
109:   MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);
110:   /* we do not have any messages;
111:    * this usually corresponds to overlap 1
112:    */
113:   if (!reducednrrows){
114:     PetscFree3(toranks,tosizes,tosizes_temp);
115:     PetscFree3(remoterows,rrow_ranks,rrow_isids);
116:     MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
117:     return(0);
118:   }
119:   nto = 0;
120:   /* send sizes and ranks for building a two-sided communication */
121:   for (i=0; i<size; i++){
122:    if (tosizes_temp[i]){
123:      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
124:      tosizes_temp[i] = nto; /* a map from processor to index */
125:      toranks[nto++]  = i; /* processor */
126:    }
127:   }
128:   PetscCalloc1(nto+1,&toffsets);
129:   for (i=0; i<nto; i++){
130:     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
131:     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
132:   }
133:   /* send information to other processors */
134:   PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);
135:   nrecvrows = 0;
136:   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
137:   PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);
138:   nrecvrows = 0;
139:   for (i=0; i<nfrom; i++){
140:     for (j=0; j<fromsizes[2*i]; j++){
141:       remote[nrecvrows].rank    = fromranks[i];
142:       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
143:     }
144:   }
145:   PetscSFCreate(comm,&sf);
146:   PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
147:   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
148:   PetscSFSetType(sf,PETSCSFBASIC);
149:   PetscSFSetFromOptions(sf);
150:   /* message pair <no of is, row>  */
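  /* illustrative example (editorial note): requesting row 17 for is[0] and row 42 for is[3],
     both owned by the same remote rank, produces the segment {0,17, 3,42} in todata */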
151:   PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);
152:   for (i=0; i<nrrows; i++){
153:     owner = rrow_ranks[i]; /* processor */
154:     j     = tosizes_temp[owner]; /* index */
155:     todata[toffsets[j]++] = rrow_isids[i];
156:     todata[toffsets[j]++] = remoterows[i];
157:   }
158:   PetscFree3(toranks,tosizes,tosizes_temp);
159:   PetscFree3(remoterows,rrow_ranks,rrow_isids);
160:   PetscFree(toffsets);
161:   PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);
162:   PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);
163:   PetscSFDestroy(&sf);
164:   /* send the rows belonging to remote processors so that we can get the overlapping data back */
165:   MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);
166:   PetscFree2(todata,fromdata);
167:   PetscFree(fromsizes);
168:   PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);
169:   PetscFree(fromranks);
170:   nrecvrows = 0;
171:   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
172:   PetscCalloc1(nrecvrows,&todata);
173:   PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);
174:   nrecvrows = 0;
175:   for (i=0; i<nto; i++){
176:     for (j=0; j<tosizes[2*i]; j++){
177:       remote[nrecvrows].rank    = toranks[i];
178:       remote[nrecvrows++].index = tosizes[2*i+1]+j;
179:     }
180:   }
181:   PetscSFCreate(comm,&sf);
182:   PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
183:   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
184:   PetscSFSetType(sf,PETSCSFBASIC);
185:   PetscSFSetFromOptions(sf);
186:   /* overlap communication and computation */
187:   PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);
188:   MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
189:   PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);
190:   PetscSFDestroy(&sf);
191:   PetscFree2(sbdata,sbsizes);
192:   MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);
193:   PetscFree(toranks);
194:   PetscFree(tosizes);
195:   PetscFree(todata);
196:   return(0);
197: }

201: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
202: {
203:   PetscInt         *isz,isz_i,i,j,is_id, data_size;
204:   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
205:   const PetscInt   *indices_i_temp;
206:   PetscErrorCode    ierr;

209:   max_lsize = 0;
210:   PetscMalloc(nidx*sizeof(PetscInt),&isz);
211:   for (i=0; i<nidx; i++){
212:     ISGetLocalSize(is[i],&lsize);
213:     max_lsize = lsize>max_lsize ? lsize:max_lsize;
214:     isz[i]    = lsize;
215:   }
216:   PetscMalloc((max_lsize+nrecvs)*nidx*sizeof(PetscInt),&indices_temp);
217:   for (i=0; i<nidx; i++){
218:     ISGetIndices(is[i],&indices_i_temp);
219:     PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);
220:     ISRestoreIndices(is[i],&indices_i_temp);
221:     ISDestroy(&is[i]);
222:   }
223:   /* retrieve information to get row id and its overlap */
224:   for (i=0; i<nrecvs; ){
225:     is_id      = recvdata[i++];
226:     data_size  = recvdata[i++];
227:     indices_i  = indices_temp+(max_lsize+nrecvs)*is_id;
228:     isz_i      = isz[is_id];
229:     for (j=0; j< data_size; j++){
230:       col = recvdata[i++];
231:       indices_i[isz_i++] = col;
232:     }
233:     isz[is_id] = isz_i;
234:   }
235:   /* remove duplicate entries */
236:   for (i=0; i<nidx; i++){
237:     indices_i  = indices_temp+(max_lsize+nrecvs)*i;
238:     isz_i      = isz[i];
239:     PetscSortRemoveDupsInt(&isz_i,indices_i);
240:     ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);
241:   }
242:   PetscFree(isz);
243:   PetscFree(indices_temp);
244:   return(0);
245: }

249: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
250: {
251:   PetscLayout       rmap,cmap;
252:   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
253:   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
254:   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
255:   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
256:   Mat               amat,bmat;
257:   PetscMPIInt       rank;
258:   PetscBool         done;
259:   MPI_Comm          comm;
260:   PetscErrorCode    ierr;

263:   PetscObjectGetComm((PetscObject)mat,&comm);
264:   MPI_Comm_rank(comm,&rank);
265:   MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
266:   /* Even if the matrix is symmetric, we still treat it as nonsymmetric here */
267:   MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
268:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
269:   MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
270:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
271:   /* total number of nonzero values is used to estimate the memory usage in the next step */
272:   tnz  = ai[an]+bi[bn];
273:   MatGetLayouts(mat,&rmap,&cmap);
274:   PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);
275:   PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);
276:   /* to find the longest message */
277:   max_fszs = 0;
278:   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
279:   /* is there a better way to estimate the number of nonzeros in the matrix? */
280:   PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);
281:   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
282:   rows_pos  = 0;
283:   totalrows = 0;
284:   for (i=0; i<nfrom; i++){
285:     PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);
286:     /* group data together */
287:     for (j=0; j<fromsizes[2*i]; j+=2){
288:       is_id                       = fromrows[rows_pos++];/* no of is */
289:       rows_i                      = rows_data[is_id];
290:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
291:     }
292:     /* estimate space to avoid repeated allocations */
293:     for (j=0; j<nidx; j++){
294:       indvc_ij = 0;
295:       rows_i   = rows_data[j];
296:       for (l=0; l<rows_pos_i[j]; l++){
297:         row    = rows_i[l]-rstart;
298:         start  = ai[row];
299:         end    = ai[row+1];
300:         for (k=start; k<end; k++){ /* Amat */
301:           col = aj[k] + cstart;
302:           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
303:         }
304:         start = bi[row];
305:         end   = bi[row+1];
306:         for (k=start; k<end; k++) { /* Bmat */
307:           col = gcols[bj[k]];
308:           indices_tmp[indvc_ij++] = col;
309:         }
310:       }
311:       PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
312:       indv_counts[i*nidx+j] = indvc_ij;
313:       totalrows            += indvc_ij;
314:     }
315:   }
316:   /* message triple <no of is, number of rows, rows> */
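  /* illustrative example (editorial note): a reply for is[2] whose requested rows touch the
     sorted columns {5,9,11} contributes the segment {2,3, 5,9,11} to sbdata */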
317:   PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);
318:   totalrows = 0;
319:   rows_pos  = 0;
320:   /* reuse the grouping code from above */
321:   for (i=0;i<nfrom;i++){
322:     PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);
323:     for (j=0; j<fromsizes[2*i]; j+=2){
324:       is_id                       = fromrows[rows_pos++];
325:       rows_i                      = rows_data[is_id];
326:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
327:     }
328:     /* add data  */
329:     for (j=0; j<nidx; j++){
330:       if (!indv_counts[i*nidx+j]) continue;
331:       indvc_ij = 0;
332:       sbdata[totalrows++] = j;
333:       sbdata[totalrows++] = indv_counts[i*nidx+j];
334:       sbsizes[2*i]       += 2;
335:       rows_i              = rows_data[j];
336:       for (l=0; l<rows_pos_i[j]; l++){
337:         row   = rows_i[l]-rstart;
338:         start = ai[row];
339:         end   = ai[row+1];
340:         for (k=start; k<end; k++){ /* Amat */
341:           col = aj[k] + cstart;
342:           indices_tmp[indvc_ij++] = col;
343:         }
344:         start = bi[row];
345:         end   = bi[row+1];
346:         for (k=start; k<end; k++) { /* Bmat */
347:           col = gcols[bj[k]];
348:           indices_tmp[indvc_ij++] = col;
349:         }
350:       }
351:       PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
352:       sbsizes[2*i]  += indvc_ij;
353:       PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);
354:       totalrows += indvc_ij;
355:     }
356:   }
357:   PetscCalloc1(nfrom+1,&offsets);
358:   for (i=0; i<nfrom; i++){
359:     offsets[i+1]   = offsets[i] + sbsizes[2*i];
360:     sbsizes[2*i+1] = offsets[i];
361:   }
362:   PetscFree(offsets);
363:   if (sbrowsizes) *sbrowsizes = sbsizes;
364:   if (sbrows) *sbrows = sbdata;
365:   PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);
366:   MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
367:   MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
368:   return(0);
369: }

373: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
374: {
375:   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
376:   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
377:   PetscInt          lsize,lsize_tmp,owner;
378:   PetscMPIInt       rank;
379:   Mat                   amat,bmat;
380:   PetscBool         done;
381:   PetscLayout       cmap,rmap;
382:   MPI_Comm          comm;
383:   PetscErrorCode    ierr;

386:   PetscObjectGetComm((PetscObject)mat,&comm);
387:   MPI_Comm_rank(comm,&rank);
388:   MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
389:   MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
390:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
391:   MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
392:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
393:   /* is this a safe way to compute the number of nonzero values? */
394:   tnz  = ai[an]+bi[bn];
395:   MatGetLayouts(mat,&rmap,&cmap);
396:   PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);
397:   PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);
398:   /* this is a better way to estimate memory than the old implementation,
399:    * which used the global size of the matrix
400:    */
401:   PetscMalloc(sizeof(PetscInt)*tnz,&indices_temp);
402:   for (i=0; i<nidx; i++) {
403:     ISGetLocalSize(is[i],&lsize);
404:     ISGetIndices(is[i],&indices);
405:     lsize_tmp = 0;
406:     for (j=0; j<lsize; j++) {
407:       owner = -1;
408:       row   = indices[j];
409:       PetscLayoutFindOwner(rmap,row,&owner);
410:       if (owner != rank) continue;
411:       /* local number */
412:       row  -= rstart;
413:       start = ai[row];
414:       end   = ai[row+1];
415:       for (k=start; k<end; k++) { /* Amat */
416:         col = aj[k] + cstart;
417:         indices_temp[lsize_tmp++] = col;
418:       }
419:       start = bi[row];
420:       end   = bi[row+1];
421:       for (k=start; k<end; k++) { /* Bmat */
422:         col = gcols[bj[k]];
423:         indices_temp[lsize_tmp++] = col;
424:       }
425:     }
426:    ISRestoreIndices(is[i],&indices);
427:    ISDestroy(&is[i]);
428:    PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);
429:    ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);
430:   }
431:   PetscFree(indices_temp);
432:   MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
433:   MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
434:   return(0);
435: }


438: /*
439:   Sample message format:
440:   If a processor A wants processor B to process some elements corresponding
441:   to index sets is[1],is[5]
442:   mesg [0] = 2   (no of index sets in the mesg)
443:   -----------
444:   mesg [1] = 1 => is[1]
445:   mesg [2] = sizeof(is[1]);
446:   -----------
447:   mesg [3] = 5  => is[5]
448:   mesg [4] = sizeof(is[5]);
449:   -----------
450:   mesg [5]
451:   mesg [n]  data(is[1])
452:   -----------
453:   mesg[n+1]
454:   mesg[m]  data(is[5])
455:   -----------

457:   Notes:
458:   nrqs - no of requests sent (or to be sent out)
459:   nrqr - no of requests received (which have to be, or which have already been, processed)
460: */
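/*
  Worked example (editorial illustration of the format above): if A asks B to process
  3 rows for is[1] (rows 10,11,12) and 2 rows for is[5] (rows 40,41), then
      mesg = {2,  1,3,  5,2,  10,11,12,  40,41}
  i.e. a header of 1 + 2*(no of index sets) entries followed by the row data.
*/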
463: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
464: {
465:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
466:   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
467:   const PetscInt **idx,*idx_i;
468:   PetscInt       *n,**data,len;
470:   PetscMPIInt    size,rank,tag1,tag2;
471:   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
472:   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
473:   PetscBT        *table;
474:   MPI_Comm       comm;
475:   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
476:   MPI_Status     *s_status,*recv_status;
477:   char           *t_p;

480:   PetscObjectGetComm((PetscObject)C,&comm);
481:   size = c->size;
482:   rank = c->rank;
483:   M    = C->rmap->N;

485:   PetscObjectGetNewTag((PetscObject)C,&tag1);
486:   PetscObjectGetNewTag((PetscObject)C,&tag2);

488:   PetscMalloc2(imax,&idx,imax,&n);

490:   for (i=0; i<imax; i++) {
491:     ISGetIndices(is[i],&idx[i]);
492:     ISGetLocalSize(is[i],&n[i]);
493:   }

495:   /* evaluate communication - mesg to whom, length of mesg, and buffer space
496:      required. Based on this, buffers are allocated, and data copied into them  */
497:   PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
498:   PetscMemzero(w1,size*sizeof(PetscMPIInt)); /* initialise work vector*/
499:   PetscMemzero(w2,size*sizeof(PetscMPIInt)); /* initialise work vector*/
500:   PetscMemzero(w3,size*sizeof(PetscMPIInt)); /* initialise work vector*/
501:   for (i=0; i<imax; i++) {
502:     PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialise work vector*/
503:     idx_i = idx[i];
504:     len   = n[i];
505:     for (j=0; j<len; j++) {
506:       row = idx_i[j];
507:       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
508:       PetscLayoutFindOwner(C->rmap,row,&proc);
509:       w4[proc]++;
510:     }
511:     for (j=0; j<size; j++) {
512:       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
513:     }
514:   }

516:   nrqs     = 0;              /* no of outgoing messages */
517:   msz      = 0;              /* total mesg length (for all procs) */
518:   w1[rank] = 0;              /* no mesg sent to self */
519:   w3[rank] = 0;
520:   for (i=0; i<size; i++) {
521:     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
522:   }
523:   /* pa - is list of processors to communicate with */
524:   PetscMalloc1(nrqs+1,&pa);
525:   for (i=0,j=0; i<size; i++) {
526:     if (w1[i]) {pa[j] = i; j++;}
527:   }

529:   /* Each message would have a header = 1 + 2*(no of IS) + data */
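  /* illustrative example (editorial note): a message carrying 4 requested rows split over
     2 index sets occupies 1 + 2*2 + 4 = 9 PetscInt entries */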
530:   for (i=0; i<nrqs; i++) {
531:     j      = pa[i];
532:     w1[j] += w2[j] + 2*w3[j];
533:     msz   += w1[j];
534:   }

536:   /* Determine the number of messages to expect, their lengths, and their from-ids */
537:   PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
538:   PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

540:   /* Now post the Irecvs corresponding to these messages */
541:   PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);

543:   /* Allocate Memory for outgoing messages */
544:   PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
545:   PetscMemzero(outdat,size*sizeof(PetscInt*));
546:   PetscMemzero(ptr,size*sizeof(PetscInt*));

548:   {
549:     PetscInt *iptr = tmp,ict  = 0;
550:     for (i=0; i<nrqs; i++) {
551:       j         = pa[i];
552:       iptr     +=  ict;
553:       outdat[j] = iptr;
554:       ict       = w1[j];
555:     }
556:   }

558:   /* Form the outgoing messages */
559:   /* plug in the headers */
560:   for (i=0; i<nrqs; i++) {
561:     j            = pa[i];
562:     outdat[j][0] = 0;
563:     PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
564:     ptr[j]       = outdat[j] + 2*w3[j] + 1;
565:   }

567:   /* Memory for doing local proc's work */
568:   {
569:     PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);

571:     for (i=0; i<imax; i++) {
572:       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
573:       data[i]  = d_p + M*i;
574:     }
575:   }

577:   /* Parse the IS and update local tables and the outgoing buf with the data */
578:   {
579:     PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
580:     PetscBT  table_i;

582:     for (i=0; i<imax; i++) {
583:       PetscMemzero(ctr,size*sizeof(PetscInt));
584:       n_i     = n[i];
585:       table_i = table[i];
586:       idx_i   = idx[i];
587:       data_i  = data[i];
588:       isz_i   = isz[i];
589:       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
590:         row  = idx_i[j];
591:         PetscLayoutFindOwner(C->rmap,row,&proc);
592:         if (proc != rank) { /* copy to the outgoing buffer */
593:           ctr[proc]++;
594:           *ptr[proc] = row;
595:           ptr[proc]++;
596:         } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
597:       }
598:       /* Update the headers for the current IS */
599:       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
600:         if ((ctr_j = ctr[j])) {
601:           outdat_j        = outdat[j];
602:           k               = ++outdat_j[0];
603:           outdat_j[2*k]   = ctr_j;
604:           outdat_j[2*k-1] = i;
605:         }
606:       }
607:       isz[i] = isz_i;
608:     }
609:   }

611:   /*  Now  post the sends */
612:   PetscMalloc1(nrqs+1,&s_waits1);
613:   for (i=0; i<nrqs; ++i) {
614:     j    = pa[i];
615:     MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
616:   }

618:   /* No longer need the original indices */
619:   for (i=0; i<imax; ++i) {
620:     ISRestoreIndices(is[i],idx+i);
621:   }
622:   PetscFree2(idx,n);

624:   for (i=0; i<imax; ++i) {
625:     ISDestroy(&is[i]);
626:   }

628:   /* Do Local work */
629:   MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);

631:   /* Receive messages */
632:   PetscMalloc1(nrqr+1,&recv_status);
633:   if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}

635:   PetscMalloc1(nrqs+1,&s_status);
636:   if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}

638:   /* Phase 1 sends are complete - deallocate buffers */
639:   PetscFree4(outdat,ptr,tmp,ctr);
640:   PetscFree4(w1,w2,w3,w4);

642:   PetscMalloc1(nrqr+1,&xdata);
643:   PetscMalloc1(nrqr+1,&isz1);
644:   MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
645:   PetscFree(rbuf[0]);
646:   PetscFree(rbuf);


649:   /* Send the data back */
650:   /* Do a global reduction to know the buffer space req for incoming messages */
651:   {
652:     PetscMPIInt *rw1;

654:     PetscCalloc1(size,&rw1);

656:     for (i=0; i<nrqr; ++i) {
657:       proc = recv_status[i].MPI_SOURCE;

659:       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
660:       rw1[proc] = isz1[i];
661:     }
662:     PetscFree(onodes1);
663:     PetscFree(olengths1);

665:     /* Determine the number of messages to expect, their lengths, and their from-ids */
666:     PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
667:     PetscFree(rw1);
668:   }
669:   /* Now post the Irecvs corresponding to these messages */
670:   PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);

672:   /* Now  post the sends */
673:   PetscMalloc1(nrqr+1,&s_waits2);
674:   for (i=0; i<nrqr; ++i) {
675:     j    = recv_status[i].MPI_SOURCE;
676:     MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
677:   }

679:   /* receive work done on other processors */
680:   {
681:     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
682:     PetscMPIInt idex;
683:     PetscBT     table_i;
684:     MPI_Status  *status2;

686:     PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);
687:     for (i=0; i<nrqs; ++i) {
688:       MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
689:       /* Process the message */
690:       rbuf2_i = rbuf2[idex];
691:       ct1     = 2*rbuf2_i[0]+1;
692:       jmax    = rbuf2[idex][0];
693:       for (j=1; j<=jmax; j++) {
694:         max     = rbuf2_i[2*j];
695:         is_no   = rbuf2_i[2*j-1];
696:         isz_i   = isz[is_no];
697:         data_i  = data[is_no];
698:         table_i = table[is_no];
699:         for (k=0; k<max; k++,ct1++) {
700:           row = rbuf2_i[ct1];
701:           if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
702:         }
703:         isz[is_no] = isz_i;
704:       }
705:     }

707:     if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
708:     PetscFree(status2);
709:   }

711:   for (i=0; i<imax; ++i) {
712:     ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);
713:   }

715:   PetscFree(onodes2);
716:   PetscFree(olengths2);

718:   PetscFree(pa);
719:   PetscFree(rbuf2[0]);
720:   PetscFree(rbuf2);
721:   PetscFree(s_waits1);
722:   PetscFree(r_waits1);
723:   PetscFree(s_waits2);
724:   PetscFree(r_waits2);
725:   PetscFree5(table,data,isz,d_p,t_p);
726:   PetscFree(s_status);
727:   PetscFree(recv_status);
728:   PetscFree(xdata[0]);
729:   PetscFree(xdata);
730:   PetscFree(isz1);
731:   return(0);
732: }

736: /*
737:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
738:        the work on the local processor.

740:      Inputs:
741:       C      - MAT_MPIAIJ;
742:       imax - total no of index sets processed at a time;
743:       table  - an array of char - size = m bits.

745:      Output:
746:       isz    - array containing the count of the solution elements corresponding
747:                to each index set;
748:       data   - pointer to the solutions
749: */
750: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
751: {
752:   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
753:   Mat        A  = c->A,B = c->B;
754:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
755:   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
756:   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
757:   PetscBT    table_i;

760:   rstart = C->rmap->rstart;
761:   cstart = C->cmap->rstart;
762:   ai     = a->i;
763:   aj     = a->j;
764:   bi     = b->i;
765:   bj     = b->j;
766:   garray = c->garray;


769:   for (i=0; i<imax; i++) {
770:     data_i  = data[i];
771:     table_i = table[i];
772:     isz_i   = isz[i];
773:     for (j=0,max=isz[i]; j<max; j++) {
774:       row   = data_i[j] - rstart;
775:       start = ai[row];
776:       end   = ai[row+1];
777:       for (k=start; k<end; k++) { /* Amat */
778:         val = aj[k] + cstart;
779:         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
780:       }
781:       start = bi[row];
782:       end   = bi[row+1];
783:       for (k=start; k<end; k++) { /* Bmat */
784:         val = garray[bj[k]];
785:         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
786:       }
787:     }
788:     isz[i] = isz_i;
789:   }
790:   return(0);
791: }

795: /*
796:       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
797:          and return the output

799:          Input:
800:            C    - the matrix
801:            nrqr - no of messages being processed.
802:            rbuf - an array of pointers to the received requests

804:          Output:
805:            xdata - array of messages to be sent back
806:            isz1  - size of each message

808:   For better efficiency perhaps we should malloc each xdata[i] separately;
809: then if a re-allocation is required we would only need to copy the data for that one row
810: rather than all previous rows, as happens now where a single large chunk of
811: memory is used.

813: */
814: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
815: {
816:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
817:   Mat            A  = c->A,B = c->B;
818:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
820:   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
821:   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
822:   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
823:   PetscInt       *rbuf_i,kmax,rbuf_0;
824:   PetscBT        xtable;

827:   m      = C->rmap->N;
828:   rstart = C->rmap->rstart;
829:   cstart = C->cmap->rstart;
830:   ai     = a->i;
831:   aj     = a->j;
832:   bi     = b->i;
833:   bj     = b->j;
834:   garray = c->garray;


837:   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
838:     rbuf_i =  rbuf[i];
839:     rbuf_0 =  rbuf_i[0];
840:     ct    += rbuf_0;
841:     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
842:   }

844:   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
845:   else max1 = 1;
846:   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
847:   PetscMalloc1(mem_estimate,&xdata[0]);
848:   ++no_malloc;
849:   PetscBTCreate(m,&xtable);
850:   PetscMemzero(isz1,nrqr*sizeof(PetscInt));

852:   ct3 = 0;
853:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
854:     rbuf_i =  rbuf[i];
855:     rbuf_0 =  rbuf_i[0];
856:     ct1    =  2*rbuf_0+1;
857:     ct2    =  ct1;
858:     ct3   += ct1;
859:     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
860:       PetscBTMemzero(m,xtable);
861:       oct2 = ct2;
862:       kmax = rbuf_i[2*j];
863:       for (k=0; k<kmax; k++,ct1++) {
864:         row = rbuf_i[ct1];
865:         if (!PetscBTLookupSet(xtable,row)) {
866:           if (!(ct3 < mem_estimate)) {
867:             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
868:             PetscMalloc1(new_estimate,&tmp);
869:             PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
870:             PetscFree(xdata[0]);
871:             xdata[0]     = tmp;
872:             mem_estimate = new_estimate; ++no_malloc;
873:             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
874:           }
875:           xdata[i][ct2++] = row;
876:           ct3++;
877:         }
878:       }
879:       for (k=oct2,max2=ct2; k<max2; k++) {
880:         row   = xdata[i][k] - rstart;
881:         start = ai[row];
882:         end   = ai[row+1];
883:         for (l=start; l<end; l++) {
884:           val = aj[l] + cstart;
885:           if (!PetscBTLookupSet(xtable,val)) {
886:             if (!(ct3 < mem_estimate)) {
887:               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
888:               PetscMalloc1(new_estimate,&tmp);
889:               PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
890:               PetscFree(xdata[0]);
891:               xdata[0]     = tmp;
892:               mem_estimate = new_estimate; ++no_malloc;
893:               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
894:             }
895:             xdata[i][ct2++] = val;
896:             ct3++;
897:           }
898:         }
899:         start = bi[row];
900:         end   = bi[row+1];
901:         for (l=start; l<end; l++) {
902:           val = garray[bj[l]];
903:           if (!PetscBTLookupSet(xtable,val)) {
904:             if (!(ct3 < mem_estimate)) {
905:               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
906:               PetscMalloc1(new_estimate,&tmp);
907:               PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
908:               PetscFree(xdata[0]);
909:               xdata[0]     = tmp;
910:               mem_estimate = new_estimate; ++no_malloc;
911:               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
912:             }
913:             xdata[i][ct2++] = val;
914:             ct3++;
915:           }
916:         }
917:       }
918:       /* Update the header*/
919:       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
920:       xdata[i][2*j-1] = rbuf_i[2*j-1];
921:     }
922:     xdata[i][0] = rbuf_0;
923:     xdata[i+1]  = xdata[i] + ct2;
924:     isz1[i]     = ct2; /* size of each message */
925:   }
926:   PetscBTDestroy(&xtable);
927:   PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
928:   return(0);
929: }
930: /* -------------------------------------------------------------------------*/
931: extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
932: extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
933: /*
934:     Every processor gets the entire matrix
935: */
938: PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
939: {
940:   Mat            B;
941:   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
942:   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
944:   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
945:   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
946:   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
947:   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

950:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
951:   MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);

953:   if (scall == MAT_INITIAL_MATRIX) {
954:     /* ----------------------------------------------------------------
955:          Tell every processor the number of nonzeros per row
956:     */
957:     PetscMalloc1(A->rmap->N,&lens);
958:     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
959:       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
960:     }
961:     PetscMalloc2(size,&recvcounts,size,&displs);
962:     for (i=0; i<size; i++) {
963:       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
964:       displs[i]     = A->rmap->range[i];
965:     }
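    /* editorial note: with MPI_IN_PLACE each rank's contribution is already resident in
       lens at offset displs[rank], so the gather below needs no separate send buffer */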
966: #if defined(PETSC_HAVE_MPI_IN_PLACE)
967:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
968: #else
969:     sendcount = A->rmap->rend - A->rmap->rstart;
970:     MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
971: #endif
972:     /* ---------------------------------------------------------------
973:          Create the sequential matrix of the same type as the local block diagonal
974:     */
975:     MatCreate(PETSC_COMM_SELF,&B);
976:     MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);
977:     MatSetBlockSizesFromMats(B,A,A);
978:     MatSetType(B,((PetscObject)a->A)->type_name);
979:     MatSeqAIJSetPreallocation(B,0,lens);
980:     PetscMalloc1(1,Bin);
981:     **Bin = B;
982:     b     = (Mat_SeqAIJ*)B->data;

984:     /*--------------------------------------------------------------------
985:        Copy my part of matrix column indices over
986:     */
987:     sendcount  = ad->nz + bd->nz;
988:     jsendbuf   = b->j + b->i[rstarts[rank]];
989:     a_jsendbuf = ad->j;
990:     b_jsendbuf = bd->j;
991:     n          = A->rmap->rend - A->rmap->rstart;
992:     cnt        = 0;
993:     for (i=0; i<n; i++) {

995:       /* put in lower diagonal portion */
996:       m = bd->i[i+1] - bd->i[i];
997:       while (m > 0) {
998:         /* is it above diagonal (in bd (compressed) numbering) */
999:         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1000:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1001:         m--;
1002:       }

1004:       /* put in diagonal portion */
1005:       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1006:         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1007:       }

1009:       /* put in upper diagonal portion */
1010:       while (m-- > 0) {
1011:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1012:       }
1013:     }
1014:     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

1016:     /*--------------------------------------------------------------------
1017:        Gather all column indices to all processors
1018:     */
1019:     for (i=0; i<size; i++) {
1020:       recvcounts[i] = 0;
1021:       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1022:         recvcounts[i] += lens[j];
1023:       }
1024:     }
1025:     displs[0] = 0;
1026:     for (i=1; i<size; i++) {
1027:       displs[i] = displs[i-1] + recvcounts[i-1];
1028:     }
1029: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1030:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1031: #else
1032:     MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1033: #endif
1034:     /*--------------------------------------------------------------------
1035:         Assemble the matrix into usable form (note numerical values not yet set)
1036:     */
1037:     /* set the b->ilen (length of each row) values */
1038:     PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));
1039:     /* set the b->i indices */
1040:     b->i[0] = 0;
1041:     for (i=1; i<=A->rmap->N; i++) {
1042:       b->i[i] = b->i[i-1] + lens[i-1];
1043:     }
1044:     PetscFree(lens);
1045:     MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
1046:     MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);

1048:   } else {
1049:     B = **Bin;
1050:     b = (Mat_SeqAIJ*)B->data;
1051:   }

1053:   /*--------------------------------------------------------------------
1054:        Copy my part of matrix numerical values into the values location
1055:   */
1056:   if (flag == MAT_GET_VALUES) {
1057:     sendcount = ad->nz + bd->nz;
1058:     sendbuf   = b->a + b->i[rstarts[rank]];
1059:     a_sendbuf = ad->a;
1060:     b_sendbuf = bd->a;
1061:     b_sendj   = bd->j;
1062:     n         = A->rmap->rend - A->rmap->rstart;
1063:     cnt       = 0;
1064:     for (i=0; i<n; i++) {

1066:       /* put in lower diagonal portion */
1067:       m = bd->i[i+1] - bd->i[i];
1068:       while (m > 0) {
1069:         /* is it above diagonal (in bd (compressed) numbering) */
1070:         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1071:         sendbuf[cnt++] = *b_sendbuf++;
1072:         m--;
1073:         b_sendj++;
1074:       }

1076:       /* put in diagonal portion */
1077:       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1078:         sendbuf[cnt++] = *a_sendbuf++;
1079:       }

1081:       /* put in upper diagonal portion */
1082:       while (m-- > 0) {
1083:         sendbuf[cnt++] = *b_sendbuf++;
1084:         b_sendj++;
1085:       }
1086:     }
1087:     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

1089:     /* -----------------------------------------------------------------
1090:        Gather all numerical values to all processors
1091:     */
1092:     if (!recvcounts) {
1093:       PetscMalloc2(size,&recvcounts,size,&displs);
1094:     }
1095:     for (i=0; i<size; i++) {
1096:       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1097:     }
1098:     displs[0] = 0;
1099:     for (i=1; i<size; i++) {
1100:       displs[i] = displs[i-1] + recvcounts[i-1];
1101:     }
1102:     recvbuf = b->a;
1103: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1104:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1105: #else
1106:     MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1107: #endif
1108:   }  /* end of (flag == MAT_GET_VALUES) */
1109:   PetscFree2(recvcounts,displs);

1111:   if (A->symmetric) {
1112:     MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);
1113:   } else if (A->hermitian) {
1114:     MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);
1115:   } else if (A->structurally_symmetric) {
1116:     MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);
1117:   }
1118:   return(0);
1119: }



1125: PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1126: {
1128:   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
1129:   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;


1133:   /*
1134:        Check for special case: each processor gets entire matrix
1135:   */
1136:   if (ismax == 1 && C->rmap->N == C->cmap->N) {
1137:     ISIdentity(*isrow,&rowflag);
1138:     ISIdentity(*iscol,&colflag);
1139:     ISGetLocalSize(*isrow,&nrow);
1140:     ISGetLocalSize(*iscol,&ncol);
1141:     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1142:       wantallmatrix = PETSC_TRUE;

1144:       PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);
1145:     }
1146:   }
1147:   MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));
1148:   if (twantallmatrix) {
1149:     MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);
1150:     return(0);
1151:   }

1153:   /* Allocate memory to hold all the submatrices */
1154:   if (scall != MAT_REUSE_MATRIX) {
1155:     PetscMalloc1(ismax+1,submat);
1156:   }

1158:   /* Check for special case: each processor gets entire matrix columns */
1159:   PetscMalloc1(ismax+1,&allcolumns);
1160:   for (i=0; i<ismax; i++) {
1161:     ISIdentity(iscol[i],&colflag);
1162:     ISGetLocalSize(iscol[i],&ncol);
1163:     if (colflag && ncol == C->cmap->N) {
1164:       allcolumns[i] = PETSC_TRUE;
1165:     } else {
1166:       allcolumns[i] = PETSC_FALSE;
1167:     }
1168:   }

1170:   /* Determine the number of stages through which the submatrices are extracted */
1171:   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

1173:   /*
1174:      Each stage will extract nmax submatrices.
1175:      nmax is determined by the matrix column dimension.
1176:      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1177:   */
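  /* illustrative example (editorial note): with 4-byte PetscInt (the default) and
     C->cmap->N = 1,000,000 columns, nmax = 20000000/(1000000*4) = 5 submatrices per stage */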
1178:   if (!nmax) nmax = 1;
1179:   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);

1181:   /* Make sure every processor loops through the nstages */
1182:   MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));

1184:   for (i=0,pos=0; i<nstages; i++) {
1185:     if (pos+nmax <= ismax) max_no = nmax;
1186:     else if (pos == ismax) max_no = 0;
1187:     else                   max_no = ismax-pos;
1188:     MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);
1189:     pos += max_no;
1190:   }

1192:   PetscFree(allcolumns);
1193:   return(0);
1194: }
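/* Editorial usage sketch (not part of the PETSc source): the driver above is what the
   public MatGetSubMatrices() interface dispatches to for MPIAIJ matrices.  Assuming C is
   an assembled MPIAIJ matrix with at least 10 rows and columns, extracting one 10x10
   principal submatrix on each rank might look like this (error checking omitted): */
#if 0
  IS  isrow,iscol;
  Mat *submats;
  ISCreateStride(PETSC_COMM_SELF,10,0,1,&isrow);   /* rows 0..9 (hypothetical sizes) */
  ISCreateStride(PETSC_COMM_SELF,10,0,1,&iscol);   /* cols 0..9 */
  MatGetSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submats);
  /* ... use submats[0], a sequential matrix ... */
  MatDestroyMatrices(1,&submats);
  ISDestroy(&isrow);
  ISDestroy(&iscol);
#endif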

1196: /* -------------------------------------------------------------------------*/
1199: PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
1200: {
1201:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1202:   Mat            A  = c->A;
1203:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
1204:   const PetscInt **icol,**irow;
1205:   PetscInt       *nrow,*ncol,start;
1207:   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
1208:   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
1209:   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
1210:   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
1211:   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
1212: #if defined(PETSC_USE_CTABLE)
1213:   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
1214: #else
1215:   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
1216: #endif
1217:   const PetscInt *irow_i;
1218:   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
1219:   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1220:   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
1221:   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
1222:   MPI_Status     *r_status3,*r_status4,*s_status4;
1223:   MPI_Comm       comm;
1224:   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
1225:   PetscMPIInt    *onodes1,*olengths1;
1226:   PetscMPIInt    idex,idex2,end;

1229:   PetscObjectGetComm((PetscObject)C,&comm);
1230:   tag0 = ((PetscObject)C)->tag;
1231:   size = c->size;
1232:   rank = c->rank;

1234:   /* Get some new tags to keep the communication clean */
1235:   PetscObjectGetNewTag((PetscObject)C,&tag1);
1236:   PetscObjectGetNewTag((PetscObject)C,&tag2);
1237:   PetscObjectGetNewTag((PetscObject)C,&tag3);

1239:   PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);

1241:   for (i=0; i<ismax; i++) {
1242:     ISGetIndices(isrow[i],&irow[i]);
1243:     ISGetLocalSize(isrow[i],&nrow[i]);
1244:     if (allcolumns[i]) {
1245:       icol[i] = NULL;
1246:       ncol[i] = C->cmap->N;
1247:     } else {
1248:       ISGetIndices(iscol[i],&icol[i]);
1249:       ISGetLocalSize(iscol[i],&ncol[i]);
1250:     }
1251:   }

1253:   /* evaluate communication - mesg to whom, length of mesg, and buffer space
1254:      required. Based on this, buffers are allocated, and data copied into them */
1255:   PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);   /* mesg size */
1256:   PetscMemzero(w1,size*sizeof(PetscMPIInt));   /* initialize work vector*/
1257:   PetscMemzero(w2,size*sizeof(PetscMPIInt));   /* initialize work vector*/
1258:   PetscMemzero(w3,size*sizeof(PetscMPIInt));   /* initialize work vector*/
1259:   for (i=0; i<ismax; i++) {
1260:     PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialize work vector*/
1261:     jmax   = nrow[i];
1262:     irow_i = irow[i];
1263:     for (j=0; j<jmax; j++) {
1264:       l   = 0;
1265:       row = irow_i[j];
1266:       while (row >= C->rmap->range[l+1]) l++;
1267:       proc = l;
1268:       w4[proc]++;
1269:     }
1270:     for (j=0; j<size; j++) {
1271:       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
1272:     }
1273:   }

1275:   nrqs     = 0;              /* no of outgoing messages */
1276:   msz      = 0;              /* total mesg length (for all procs) */
1277:   w1[rank] = 0;              /* no mesg sent to self */
1278:   w3[rank] = 0;
1279:   for (i=0; i<size; i++) {
1280:     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
1281:   }
1282:   PetscMalloc1(nrqs+1,&pa); /*(proc -array)*/
1283:   for (i=0,j=0; i<size; i++) {
1284:     if (w1[i]) { pa[j] = i; j++; }
1285:   }

1287:   /* Each message would have a header = 1 + 2*(no of IS) + data */
1288:   for (i=0; i<nrqs; i++) {
1289:     j      = pa[i];
1290:     w1[j] += w2[j] + 2* w3[j];
1291:     msz   += w1[j];
1292:   }
1293:   PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);

1295:   /* Determine the number of messages to expect, their lengths, and their from-ids */
1296:   PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
1297:   PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

1299:   /* Now post the Irecvs corresponding to these messages */
1300:   PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);

1302:   PetscFree(onodes1);
1303:   PetscFree(olengths1);

1305:   /* Allocate Memory for outgoing messages */
1306:   PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
1307:   PetscMemzero(sbuf1,size*sizeof(PetscInt*));
1308:   PetscMemzero(ptr,size*sizeof(PetscInt*));

1310:   {
1311:     PetscInt *iptr = tmp,ict = 0;
1312:     for (i=0; i<nrqs; i++) {
1313:       j        = pa[i];
1314:       iptr    += ict;
1315:       sbuf1[j] = iptr;
1316:       ict      = w1[j];
1317:     }
1318:   }

1320:   /* Form the outgoing messages */
1321:   /* Initialize the header space */
1322:   for (i=0; i<nrqs; i++) {
1323:     j           = pa[i];
1324:     sbuf1[j][0] = 0;
1325:     PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
1326:     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
1327:   }

1329:   /* Parse the isrow and copy data into outbuf */
1330:   for (i=0; i<ismax; i++) {
1331:     PetscMemzero(ctr,size*sizeof(PetscInt));
1332:     irow_i = irow[i];
1333:     jmax   = nrow[i];
1334:     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
1335:       l   = 0;
1336:       row = irow_i[j];
1337:       while (row >= C->rmap->range[l+1]) l++;
1338:       proc = l;
1339:       if (proc != rank) { /* copy to the outgoing buf*/
1340:         ctr[proc]++;
1341:         *ptr[proc] = row;
1342:         ptr[proc]++;
1343:       }
1344:     }
1345:     /* Update the headers for the current IS */
1346:     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1347:       if ((ctr_j = ctr[j])) {
1348:         sbuf1_j        = sbuf1[j];
1349:         k              = ++sbuf1_j[0];
1350:         sbuf1_j[2*k]   = ctr_j;
1351:         sbuf1_j[2*k-1] = i;
1352:       }
1353:     }
1354:   }

1356:   /*  Now  post the sends */
1357:   PetscMalloc1(nrqs+1,&s_waits1);
1358:   for (i=0; i<nrqs; ++i) {
1359:     j    = pa[i];
1360:     MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
1361:   }

1363:   /* Post Receives to capture the buffer size */
1364:   PetscMalloc1(nrqs+1,&r_waits2);
1365:   PetscMalloc1(nrqs+1,&rbuf2);
1366:   rbuf2[0] = tmp + msz;
1367:   for (i=1; i<nrqs; ++i) {
1368:     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
1369:   }
1370:   for (i=0; i<nrqs; ++i) {
1371:     j    = pa[i];
1372:     MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
1373:   }

1375:   /* Send to other procs the buf size they should allocate */


1378:   /* Receive messages*/
1379:   PetscMalloc1(nrqr+1,&s_waits2);
1380:   PetscMalloc1(nrqr+1,&r_status1);
1381:   PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);
1382:   {
1383:     Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
1384:     PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
1385:     PetscInt   *sbuf2_i;

1387:     for (i=0; i<nrqr; ++i) {
1388:       MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);

1390:       req_size[idex] = 0;
1391:       rbuf1_i        = rbuf1[idex];
1392:       start          = 2*rbuf1_i[0] + 1;
1393:       MPI_Get_count(r_status1+i,MPIU_INT,&end);
1394:       PetscMalloc1(end+1,&sbuf2[idex]);
1395:       sbuf2_i        = sbuf2[idex];
1396:       for (j=start; j<end; j++) {
1397:         id              = rbuf1_i[j] - rstart;
1398:         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
1399:         sbuf2_i[j]      = ncols;
1400:         req_size[idex] += ncols;
1401:       }
1402:       req_source[idex] = r_status1[i].MPI_SOURCE;
1403:       /* form the header */
1404:       sbuf2_i[0] = req_size[idex];
1405:       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

1407:       MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
1408:     }
1409:   }
1410:   PetscFree(r_status1);
1411:   PetscFree(r_waits1);

1413:   /*  recv buffer sizes */
1414:   /* Receive messages*/

1416:   PetscMalloc1(nrqs+1,&rbuf3);
1417:   PetscMalloc1(nrqs+1,&rbuf4);
1418:   PetscMalloc1(nrqs+1,&r_waits3);
1419:   PetscMalloc1(nrqs+1,&r_waits4);
1420:   PetscMalloc1(nrqs+1,&r_status2);

1422:   for (i=0; i<nrqs; ++i) {
1423:     MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
1424:     PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);
1425:     PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);
1426:     MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
1427:     MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
1428:   }
1429:   PetscFree(r_status2);
1430:   PetscFree(r_waits2);

1432:   /* Wait on sends1 and sends2 */
1433:   PetscMalloc1(nrqs+1,&s_status1);
1434:   PetscMalloc1(nrqr+1,&s_status2);

1436:   if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
1437:   if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
1438:   PetscFree(s_status1);
1439:   PetscFree(s_status2);
1440:   PetscFree(s_waits1);
1441:   PetscFree(s_waits2);

1443:   /* Now allocate buffers for a->j, and send them off */
1444:   PetscMalloc1(nrqr+1,&sbuf_aj);
1445:   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1446:   PetscMalloc1(j+1,&sbuf_aj[0]);
1447:   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

1449:   PetscMalloc1(nrqr+1,&s_waits3);
1450:   {
1451:     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1452:     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1453:     PetscInt cend = C->cmap->rend;
1454:     PetscInt *a_j = a->j,*b_j = b->j,ctmp;

1456:     for (i=0; i<nrqr; i++) {
1457:       rbuf1_i   = rbuf1[i];
1458:       sbuf_aj_i = sbuf_aj[i];
1459:       ct1       = 2*rbuf1_i[0] + 1;
1460:       ct2       = 0;
1461:       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1462:         kmax = rbuf1[i][2*j];
1463:         for (k=0; k<kmax; k++,ct1++) {
1464:           row    = rbuf1_i[ct1] - rstart;
1465:           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1466:           ncols  = nzA + nzB;
1467:           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

1469:           /* load the column indices for this row into cols*/
1470:           cols = sbuf_aj_i + ct2;

1472:           lwrite = 0;
1473:           for (l=0; l<nzB; l++) {
1474:             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1475:           }
1476:           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1477:           for (l=0; l<nzB; l++) {
1478:             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1479:           }

1481:           ct2 += ncols;
1482:         }
1483:       }
1484:       MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
1485:     }
1486:   }
1487:   PetscMalloc1(nrqs+1,&r_status3);
1488:   PetscMalloc1(nrqr+1,&s_status3);

1490:   /* Allocate buffers for a->a, and send them off */
1491:   PetscMalloc1(nrqr+1,&sbuf_aa);
1492:   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1493:   PetscMalloc1(j+1,&sbuf_aa[0]);
1494:   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

1496:   PetscMalloc1(nrqr+1,&s_waits4);
1497:   {
1498:     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1499:     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1500:     PetscInt    cend   = C->cmap->rend;
1501:     PetscInt    *b_j   = b->j;
1502:     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

1504:     for (i=0; i<nrqr; i++) {
1505:       rbuf1_i   = rbuf1[i];
1506:       sbuf_aa_i = sbuf_aa[i];
1507:       ct1       = 2*rbuf1_i[0]+1;
1508:       ct2       = 0;
1509:       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1510:         kmax = rbuf1_i[2*j];
1511:         for (k=0; k<kmax; k++,ct1++) {
1512:           row    = rbuf1_i[ct1] - rstart;
1513:           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1514:           ncols  = nzA + nzB;
1515:           cworkB = b_j + b_i[row];
1516:           vworkA = a_a + a_i[row];
1517:           vworkB = b_a + b_i[row];

1519:           /* load the column values for this row into vals */
1520:           vals = sbuf_aa_i+ct2;

1522:           lwrite = 0;
1523:           for (l=0; l<nzB; l++) {
1524:             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1525:           }
1526:           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1527:           for (l=0; l<nzB; l++) {
1528:             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1529:           }

1531:           ct2 += ncols;
1532:         }
1533:       }
1534:       MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);
1535:     }
1536:   }
1537:   PetscFree(rbuf1[0]);
1538:   PetscFree(rbuf1);
1539:   PetscMalloc1(nrqs+1,&r_status4);
1540:   PetscMalloc1(nrqr+1,&s_status4);

1542:   /* Form the matrix */
1543:   /* create col map: global col of C -> local col of submatrices */
1544:   {
1545:     const PetscInt *icol_i;
1546: #if defined(PETSC_USE_CTABLE)
1547:     PetscMalloc1(1+ismax,&cmap);
1548:     for (i=0; i<ismax; i++) {
1549:       if (!allcolumns[i]) {
1550:         PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);

1552:         jmax   = ncol[i];
1553:         icol_i = icol[i];
1554:         cmap_i = cmap[i];
1555:         for (j=0; j<jmax; j++) {
1556:           PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);
1557:         }
1558:       } else {
1559:         cmap[i] = NULL;
1560:       }
1561:     }
1562: #else
1563:     PetscMalloc1(ismax,&cmap);
1564:     for (i=0; i<ismax; i++) {
1565:       if (!allcolumns[i]) {
1566:         PetscMalloc1(C->cmap->N,&cmap[i]);
1567:         PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));
1568:         jmax   = ncol[i];
1569:         icol_i = icol[i];
1570:         cmap_i = cmap[i];
1571:         for (j=0; j<jmax; j++) {
1572:           cmap_i[icol_i[j]] = j+1;
1573:         }
1574:       } else {
1575:         cmap[i] = NULL;
1576:       }
1577:     }
1578: #endif
1579:   }
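
  /* Note on the convention used by cmap: both the PetscTable and the flat-array variant store
     "local column + 1" (the key is likewise shifted by +1 for the table, which requires positive
     keys), so a lookup result of 0 means "column not selected".  This is why the loops below test
     "if (tcol)" and store "tcol - 1" as the submatrix column index. */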

1581:   /* Create lens, which holds the nonzero count of each row of each submatrix (needed for preallocation) */
1582:   for (i=0,j=0; i<ismax; i++) j += nrow[i];
1583:   PetscMalloc1(ismax,&lens);
1584:   if (ismax) {
1585:     PetscMalloc1(j,&lens[0]);
1586:     PetscMemzero(lens[0],j*sizeof(PetscInt));
1587:   }
1588:   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

1590:   /* Update lens from local data */
1591:   for (i=0; i<ismax; i++) {
1592:     jmax = nrow[i];
1593:     if (!allcolumns[i]) cmap_i = cmap[i];
1594:     irow_i = irow[i];
1595:     lens_i = lens[i];
1596:     for (j=0; j<jmax; j++) {
1597:       l   = 0;
1598:       row = irow_i[j];
1599:       while (row >= C->rmap->range[l+1]) l++;
1600:       proc = l;
1601:       if (proc == rank) {
1602:         MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);
1603:         if (!allcolumns[i]) {
1604:           for (k=0; k<ncols; k++) {
1605: #if defined(PETSC_USE_CTABLE)
1606:             PetscTableFind(cmap_i,cols[k]+1,&tcol);
1607: #else
1608:             tcol = cmap_i[cols[k]];
1609: #endif
1610:             if (tcol) lens_i[j]++;
1611:           }
1612:         } else { /* allcolumns */
1613:           lens_i[j] = ncols;
1614:         }
1615:         MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);
1616:       }
1617:     }
1618:   }

1620:   /* Create row map: global row of C -> local row of submatrices */
1621: #if defined(PETSC_USE_CTABLE)
1622:   PetscMalloc1(1+ismax,&rmap);
1623:   for (i=0; i<ismax; i++) {
1624:     PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);
1625:     irow_i = irow[i];
1626:     jmax   = nrow[i];
1627:     for (j=0; j<jmax; j++) {
1628:       PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);
1629:     }
1630:   }
1631: #else
1632:   PetscMalloc1(ismax,&rmap);
1633:   if (ismax) {
1634:     PetscMalloc1(ismax*C->rmap->N,&rmap[0]);
1635:     PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));
1636:   }
1637:   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
1638:   for (i=0; i<ismax; i++) {
1639:     rmap_i = rmap[i];
1640:     irow_i = irow[i];
1641:     jmax   = nrow[i];
1642:     for (j=0; j<jmax; j++) {
1643:       rmap_i[irow_i[j]] = j;
1644:     }
1645:   }
1646: #endif
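
  /* Note: with PETSC_USE_CTABLE the row map also stores "local row + 1" (hence the "row--" after
     each PetscTableFind() below), whereas the flat-array variant stores the local row directly,
     since every global row that is subsequently looked up is known to be present. */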

1648:   /* Update lens from off-process data */
1649:   {
1650:     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

1652:     for (tmp2=0; tmp2<nrqs; tmp2++) {
1653:       MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);
1654:       idex    = pa[idex2];
1655:       sbuf1_i = sbuf1[idex];
1656:       jmax    = sbuf1_i[0];
1657:       ct1     = 2*jmax+1;
1658:       ct2     = 0;
1659:       rbuf2_i = rbuf2[idex2];
1660:       rbuf3_i = rbuf3[idex2];
1661:       for (j=1; j<=jmax; j++) {
1662:         is_no  = sbuf1_i[2*j-1];
1663:         max1   = sbuf1_i[2*j];
1664:         lens_i = lens[is_no];
1665:         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1666:         rmap_i = rmap[is_no];
1667:         for (k=0; k<max1; k++,ct1++) {
1668: #if defined(PETSC_USE_CTABLE)
1669:           PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
1670:           row--;
1671:           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1672: #else
1673:           row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1674: #endif
1675:           max2 = rbuf2_i[ct1];
1676:           for (l=0; l<max2; l++,ct2++) {
1677:             if (!allcolumns[is_no]) {
1678: #if defined(PETSC_USE_CTABLE)
1679:               PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1680: #else
1681:               tcol = cmap_i[rbuf3_i[ct2]];
1682: #endif
1683:               if (tcol) lens_i[row]++;
1684:             } else { /* allcolumns */
1685:               lens_i[row]++; /* could be replaced by lens_i[row] += max2 (and ct2 += max2) outside this loop */
1686:             }
1687:           }
1688:         }
1689:       }
1690:     }
1691:   }
1692:   PetscFree(r_status3);
1693:   PetscFree(r_waits3);
1694:   if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1695:   PetscFree(s_status3);
1696:   PetscFree(s_waits3);

1698:   /* Create the submatrices */
1699:   if (scall == MAT_REUSE_MATRIX) {
1700:     PetscBool flag;

1702:     /*
1703:         Assumes the new rows are the same length as the old rows, hence a bug!
1704:     */
1705:     for (i=0; i<ismax; i++) {
1706:       mat = (Mat_SeqAIJ*)(submats[i]->data);
1707:       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1708:       PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);
1709:       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1710:       /* Initialize the matrix as if empty */
1711:       PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));

1713:       submats[i]->factortype = C->factortype;
1714:     }
1715:   } else {
1716:     for (i=0; i<ismax; i++) {
1717:       PetscInt rbs,cbs;
1718:       ISGetBlockSize(isrow[i],&rbs);
1719:       ISGetBlockSize(iscol[i],&cbs);

1721:       MatCreate(PETSC_COMM_SELF,submats+i);
1722:       MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);

1724:       MatSetBlockSizes(submats[i],rbs,cbs);
1725:       MatSetType(submats[i],((PetscObject)A)->type_name);
1726:       MatSeqAIJSetPreallocation(submats[i],0,lens[i]);
1727:     }
1728:   }
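
  /* Since lens[i] holds the exact nonzero count of every submatrix row, the preallocation above is
     tight.  The assembly loops below exploit this by writing straight into each submatrix's CSR
     arrays (mat->i/mat->j/mat->a) and bumping mat->ilen, instead of going through MatSetValues(). */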

1730:   /* Assemble the matrices */
1731:   /* First assemble the local rows */
1732:   {
1733:     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1734:     PetscScalar *imat_a;

1736:     for (i=0; i<ismax; i++) {
1737:       mat       = (Mat_SeqAIJ*)submats[i]->data;
1738:       imat_ilen = mat->ilen;
1739:       imat_j    = mat->j;
1740:       imat_i    = mat->i;
1741:       imat_a    = mat->a;

1743:       if (!allcolumns[i]) cmap_i = cmap[i];
1744:       rmap_i = rmap[i];
1745:       irow_i = irow[i];
1746:       jmax   = nrow[i];
1747:       for (j=0; j<jmax; j++) {
1748:         l   = 0;
1749:         row = irow_i[j];
1750:         while (row >= C->rmap->range[l+1]) l++;
1751:         proc = l;
1752:         if (proc == rank) {
1753:           old_row = row;
1754: #if defined(PETSC_USE_CTABLE)
1755:           PetscTableFind(rmap_i,row+1,&row);
1756:           row--;
1757: #else
1758:           row = rmap_i[row];
1759: #endif
1760:           ilen_row = imat_ilen[row];
1761:           MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1762:           mat_i    = imat_i[row];
1763:           mat_a    = imat_a + mat_i;
1764:           mat_j    = imat_j + mat_i;
1765:           if (!allcolumns[i]) {
1766:             for (k=0; k<ncols; k++) {
1767: #if defined(PETSC_USE_CTABLE)
1768:               PetscTableFind(cmap_i,cols[k]+1,&tcol);
1769: #else
1770:               tcol = cmap_i[cols[k]];
1771: #endif
1772:               if (tcol) {
1773:                 *mat_j++ = tcol - 1;
1774:                 *mat_a++ = vals[k];
1775:                 ilen_row++;
1776:               }
1777:             }
1778:           } else { /* allcolumns */
1779:             for (k=0; k<ncols; k++) {
1780:               *mat_j++ = cols[k];  /* global col index! */
1781:               *mat_a++ = vals[k];
1782:               ilen_row++;
1783:             }
1784:           }
1785:           MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);

1787:           imat_ilen[row] = ilen_row;
1788:         }
1789:       }
1790:     }
1791:   }

1793:   /* Now assemble the off-process rows */
1794:   {
1795:     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1796:     PetscInt    *imat_j,*imat_i;
1797:     PetscScalar *imat_a,*rbuf4_i;

1799:     for (tmp2=0; tmp2<nrqs; tmp2++) {
1800:       MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);
1801:       idex    = pa[idex2];
1802:       sbuf1_i = sbuf1[idex];
1803:       jmax    = sbuf1_i[0];
1804:       ct1     = 2*jmax + 1;
1805:       ct2     = 0;
1806:       rbuf2_i = rbuf2[idex2];
1807:       rbuf3_i = rbuf3[idex2];
1808:       rbuf4_i = rbuf4[idex2];
1809:       for (j=1; j<=jmax; j++) {
1810:         is_no     = sbuf1_i[2*j-1];
1811:         rmap_i    = rmap[is_no];
1812:         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1813:         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1814:         imat_ilen = mat->ilen;
1815:         imat_j    = mat->j;
1816:         imat_i    = mat->i;
1817:         imat_a    = mat->a;
1818:         max1      = sbuf1_i[2*j];
1819:         for (k=0; k<max1; k++,ct1++) {
1820:           row = sbuf1_i[ct1];
1821: #if defined(PETSC_USE_CTABLE)
1822:           PetscTableFind(rmap_i,row+1,&row);
1823:           row--;
1824: #else
1825:           row = rmap_i[row];
1826: #endif
1827:           ilen  = imat_ilen[row];
1828:           mat_i = imat_i[row];
1829:           mat_a = imat_a + mat_i;
1830:           mat_j = imat_j + mat_i;
1831:           max2  = rbuf2_i[ct1];
1832:           if (!allcolumns[is_no]) {
1833:             for (l=0; l<max2; l++,ct2++) {

1835: #if defined(PETSC_USE_CTABLE)
1836:               PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1837: #else
1838:               tcol = cmap_i[rbuf3_i[ct2]];
1839: #endif
1840:               if (tcol) {
1841:                 *mat_j++ = tcol - 1;
1842:                 *mat_a++ = rbuf4_i[ct2];
1843:                 ilen++;
1844:               }
1845:             }
1846:           } else { /* allcolumns */
1847:             for (l=0; l<max2; l++,ct2++) {
1848:               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1849:               *mat_a++ = rbuf4_i[ct2];
1850:               ilen++;
1851:             }
1852:           }
1853:           imat_ilen[row] = ilen;
1854:         }
1855:       }
1856:     }
1857:   }
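
  /* At this point every submatrix row holds exactly the number of entries counted in lens[]
     (imat_ilen matches the preallocation); what remains is sorting the column indices within
     each row where needed, and the final MatAssemblyBegin/End calls. */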

1859:   /* Sort the column indices (and matching values) within each row of each submatrix */
1860:   {
1861:     PetscInt    *imat_ilen,*imat_j,*imat_i;
1862:     PetscScalar *imat_a;

1864:     for (i=0; i<ismax; i++) {
1865:       mat       = (Mat_SeqAIJ*)submats[i]->data;
1866:       imat_j    = mat->j;
1867:       imat_i    = mat->i;
1868:       imat_a    = mat->a;
1869:       imat_ilen = mat->ilen;

1871:       if (allcolumns[i]) continue;
1872:       jmax = nrow[i];
1873:       for (j=0; j<jmax; j++) {
1874:         PetscInt ilen;

1876:         mat_i = imat_i[j];
1877:         mat_a = imat_a + mat_i;
1878:         mat_j = imat_j + mat_i;
1879:         ilen  = imat_ilen[j];
1880:         PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);
1881:       }
1882:     }
1883:   }

1885:   PetscFree(r_status4);
1886:   PetscFree(r_waits4);
1887:   if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1888:   PetscFree(s_waits4);
1889:   PetscFree(s_status4);

1891:   /* Restore the indices */
1892:   for (i=0; i<ismax; i++) {
1893:     ISRestoreIndices(isrow[i],irow+i);
1894:     if (!allcolumns[i]) {
1895:       ISRestoreIndices(iscol[i],icol+i);
1896:     }
1897:   }

1899:   /* Destroy allocated memory */
1900:   PetscFree4(irow,icol,nrow,ncol);
1901:   PetscFree4(w1,w2,w3,w4);
1902:   PetscFree(pa);

1904:   PetscFree4(sbuf1,ptr,tmp,ctr);
1905:   PetscFree(rbuf2);
1906:   for (i=0; i<nrqr; ++i) {
1907:     PetscFree(sbuf2[i]);
1908:   }
1909:   for (i=0; i<nrqs; ++i) {
1910:     PetscFree(rbuf3[i]);
1911:     PetscFree(rbuf4[i]);
1912:   }

1914:   PetscFree3(sbuf2,req_size,req_source);
1915:   PetscFree(rbuf3);
1916:   PetscFree(rbuf4);
1917:   PetscFree(sbuf_aj[0]);
1918:   PetscFree(sbuf_aj);
1919:   PetscFree(sbuf_aa[0]);
1920:   PetscFree(sbuf_aa);

1922: #if defined(PETSC_USE_CTABLE)
1923:   for (i=0; i<ismax; i++) {PetscTableDestroy((PetscTable*)&rmap[i]);}
1924: #else
1925:   if (ismax) {PetscFree(rmap[0]);}
1926: #endif
1927:   PetscFree(rmap);

1929:   for (i=0; i<ismax; i++) {
1930:     if (!allcolumns[i]) {
1931: #if defined(PETSC_USE_CTABLE)
1932:       PetscTableDestroy((PetscTable*)&cmap[i]);
1933: #else
1934:       PetscFree(cmap[i]);
1935: #endif
1936:     }
1937:   }
1938:   PetscFree(cmap);
1939:   if (ismax) {PetscFree(lens[0]);}
1940:   PetscFree(lens);

1942:   for (i=0; i<ismax; i++) {
1943:     MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1944:     MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1945:   }
1946:   return(0);
1947: }
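
/*
   Illustrative sketch (not part of mpiov.c): the routine above implements the guts of sequential
   submatrix extraction for MPIAIJ matrices and is normally reached through the public interface
   MatGetSubMatrices().  A minimal usage sketch, assuming C is an assembled MPIAIJ matrix; the
   helper name below is made up for the example.
*/
static PetscErrorCode ExampleGetSubMatrices(Mat C)
{
  PetscInt rstart,rend;
  IS       rows,cols;
  Mat      *submats;

  MatGetOwnershipRange(C,&rstart,&rend);
  /* select (at most) the first two locally owned rows, and the same global indices as columns */
  ISCreateStride(PETSC_COMM_SELF,PetscMin(2,rend-rstart),rstart,1,&rows);
  ISDuplicate(rows,&cols);
  /* every process receives its own sequential (SeqAIJ) submatrix in submats[0] */
  MatGetSubMatrices(C,1,&rows,&cols,MAT_INITIAL_MATRIX,&submats);
  /* ... use submats[0] ... */
  MatDestroyMatrices(1,&submats);
  ISDestroy(&rows);
  ISDestroy(&cols);
  return(0);
}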

1949: /*
1950:  Permute A & B into C's *local* index space using rowemb, dcolemb for A and rowemb, ocolemb for B.
1951:  The embeddings are assumed to be injections, which implies that the range of rowemb is a subset
1952:  of [0,m), that of dcolemb a subset of [0,n), and that of ocolemb a subset of [0,N-n).
1953:  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
1954:  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
1955:  state, and needs to be "assembled" later by compressing B's column space.

1957:  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
1958:  Following this call, C's diagonal (aij->A) and off-diagonal (aij->B) blocks have been created, even if empty.
1959:  */
1962: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
1963: {
1964:   /* If making this function public, change the error returned in this function away from _PLIB. */
1966:   Mat_MPIAIJ     *aij;
1967:   Mat_SeqAIJ     *Baij;
1968:   PetscBool      seqaij,Bdisassembled;
1969:   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
1970:   PetscScalar    v;
1971:   const PetscInt *rowindices,*colindices;

1974:   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
1975:   if (A) {
1976:     PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);
1977:     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
1978:     if (rowemb) {
1979:       ISGetLocalSize(rowemb,&m);
1980:       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
1981:     } else {
1982:       if (C->rmap->n != A->rmap->n) {
1983:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
1984:       }
1985:     }
1986:     if (dcolemb) {
1987:       ISGetLocalSize(dcolemb,&n);
1988:       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
1989:     } else {
1990:       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
1991:     }
1992:   }
1993:   if (B) {
1994:     PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);
1995:     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
1996:     if (rowemb) {
1997:       ISGetLocalSize(rowemb,&m);
1998:       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
1999:     } else {
2000:       if (C->rmap->n != B->rmap->n) {
2001:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2002:       }
2003:     }
2004:     if (ocolemb) {
2005:       ISGetLocalSize(ocolemb,&n);
2006:       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2007:     } else {
2008:       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2009:     }
2010:   }

2012:   aij    = (Mat_MPIAIJ*)(C->data);
2013:   if (!aij->A) {
2014:     /* Mimic parts of MatMPIAIJSetPreallocation() */
2015:     MatCreate(PETSC_COMM_SELF,&aij->A);
2016:     MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);
2017:     MatSetBlockSizesFromMats(aij->A,C,C);
2018:     MatSetType(aij->A,MATSEQAIJ);
2019:     PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);
2020:   }
2021:   if (A) {
2022:     MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);
2023:   } else {
2024:     MatSetUp(aij->A);
2025:   }
2026:   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2027:     /*
2028:       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2029:       need to "disassemble" B -- convert it to using C's global indices.
2030:       To insert the values we take the safer, albeit more expensive, route of MatSetValues().

2032:       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2033:       we call MatZeroEntries(aij->B) first, so there may be a bunch of stored zeros that, perhaps, could be compacted out.

2035:       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2036:       At least avoid calling MatSetValues() and the implied searches?
2037:     */

2039:     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2040: #if defined(PETSC_USE_CTABLE)
2041:       PetscTableDestroy(&aij->colmap);
2042: #else
2043:       PetscFree(aij->colmap);
2044:       /* A bit of a HACK: ideally we should deal with the aij->B case all in one code block below. */
2045:       if (aij->B) {
2046:         PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));
2047:       }
2048: #endif
2049:       ngcol = 0;
2050:       if (aij->lvec) {
2051:         VecGetSize(aij->lvec,&ngcol);
2052:       }
2053:       if (aij->garray) {
2054:         PetscFree(aij->garray);
2055:         PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));
2056:       }
2057:       VecDestroy(&aij->lvec);
2058:       VecScatterDestroy(&aij->Mvctx);
2059:     }
2060:     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2061:       MatDestroy(&aij->B);
2062:     }
2063:     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2064:       MatZeroEntries(aij->B);
2065:     }
2066:   }
2067:   Bdisassembled = PETSC_FALSE;
2068:   if (!aij->B) {
2069:     MatCreate(PETSC_COMM_SELF,&aij->B);
2070:     PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);
2071:     MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);
2072:     MatSetBlockSizesFromMats(aij->B,B,B);
2073:     MatSetType(aij->B,MATSEQAIJ);
2074:     Bdisassembled = PETSC_TRUE;
2075:   }
2076:   if (B) {
2077:     Baij = (Mat_SeqAIJ*)(B->data);
2078:     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2079:       PetscMalloc1(B->rmap->n,&nz);
2080:       for (i=0; i<B->rmap->n; i++) {
2081:         nz[i] = Baij->i[i+1] - Baij->i[i];
2082:       }
2083:       MatSeqAIJSetPreallocation(aij->B,0,nz);
2084:       PetscFree(nz);
2085:     }

2087:     PetscLayoutGetRange(C->rmap,&rstart,&rend);
2088:     shift = rend-rstart;
2089:     count = 0;
2090:     rowindices = NULL;
2091:     colindices = NULL;
2092:     if (rowemb) {
2093:       ISGetIndices(rowemb,&rowindices);
2094:     }
2095:     if (ocolemb) {
2096:       ISGetIndices(ocolemb,&colindices);
2097:     }
2098:     for (i=0; i<B->rmap->n; i++) {
2099:       PetscInt row;
2100:       row = i;
2101:       if (rowindices) row = rowindices[i];
2102:       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
2103:         col  = Baij->j[count];
2104:         if (colindices) col = colindices[col];
2105:         if (Bdisassembled && col>=rstart) col += shift;
2106:         v    = Baij->a[count];
2107:         MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);
2108:         ++count;
2109:       }
2110:     }
2111:     /* No assembly for aij->B is necessary. */
2112:     /* FIXME: set aij->B's nonzerostate correctly. */
2113:   } else {
2114:     MatSetUp(aij->B);
2115:   }
2116:   C->preallocated  = PETSC_TRUE;
2117:   C->was_assembled = PETSC_FALSE;
2118:   C->assembled     = PETSC_FALSE;
2119:   /*
2120:      C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2121:      Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
2122:   */
2123:   return(0);
2124: }

2128: /*
2129:   B uses local column indices, ranging over [0, N-n); they must be interpreted using garray.
2130:  */
2131: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
2132: {
2133:   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);

2138:   /* FIXME: make sure C is assembled */
2139:   *A = aij->A;
2140:   *B = aij->B;
2141:   /* Note that we don't incref *A and *B, so be careful! */
2142:   return(0);
2143: }
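
/*
   Illustrative sketch (not part of mpiov.c): the garray convention mentioned above, shown through
   the public counterpart MatMPIAIJGetSeqAIJ().  For an assembled MPIAIJ matrix the off-diagonal
   block Ao stores *local* column indices, and garray[loc] gives the corresponding global column
   of C.  The helper name below is made up for the example.
*/
static PetscErrorCode ExamplePrintOffDiagColumns(Mat C)
{
  Mat            Ad,Ao;                /* Ad is the diagonal block; not used here */
  const PetscInt *garray,*cols;
  PetscInt       row,rstart,rend,ncols,k;

  MatMPIAIJGetSeqAIJ(C,&Ad,&Ao,&garray);
  MatGetOwnershipRange(C,&rstart,&rend);
  for (row=0; row<rend-rstart; row++) {
    MatGetRow(Ao,row,&ncols,&cols,NULL);
    for (k=0; k<ncols; k++) {
      PetscPrintf(PETSC_COMM_SELF,"global row %D, off-diagonal global column %D\n",rstart+row,garray[cols[k]]);
    }
    MatRestoreRow(Ao,row,&ncols,&cols,NULL);
  }
  return(0);
}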

2145: /*
2146:   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
2147:   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
2148: */
2151: PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
2152:                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
2153:                                                  PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
2154:                                                  PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
2155:                                                  PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
2156: {
2158:   PetscMPIInt    isize,flag;
2159:   PetscInt       i,ii,cismax,ispar,ciscol_localsize;
2160:   Mat            *A,*B;
2161:   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

2164:   if (!ismax) return(0);

2166:   for (i = 0, cismax = 0; i < ismax; ++i) {
2167:     PetscMPIInt isize;
2168:     MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);
2169:     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
2170:     MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);
2171:     if (isize > 1) ++cismax;
2172:   }
2173:   /*
2174:      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
2175:      ispar is the maximum of cismax over C's comm, so it is nonzero iff some rank has a parallel IS.
2176:   */
2177:   MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
2178:   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
2179:     (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);
2180:     return(0);
2181:   }

2183:   /* ispar != 0: at least one rank contributed a parallel IS */
2184:   /*
2185:     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
2186:     These are used to extract the off-diag portion of the resulting parallel matrix.
2187:     The row IS for the off-diag portion is the same as for the diag portion,
2188:     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
2189:   */
2190:   PetscMalloc2(cismax,&cisrow,cismax,&ciscol);
2191:   PetscMalloc1(cismax,&ciscol_p);
2192:   for (i = 0, ii = 0; i < ismax; ++i) {
2193:     MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
2194:     if (isize > 1) {
2195:       /*
2196:          TODO: This is the part that's ***NOT SCALABLE***.
2197:          To fix this we need to extract just the indices of C's nonzero columns
2198:          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
2199:          part of iscol[i] -- without actually computing ciscol[ii]. This also has
2200:          to be done without serializing on the IS list, so, most likely, it is best
2201:          done by rewriting MatGetSubMatrices_MPIAIJ() directly.
2202:       */
2203:       ISGetNonlocalIS(iscol[i],&(ciscol[ii]));
2204:       /* Now we have to
2205:          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
2206:              were sorted on each rank, concatenated they might no longer be sorted;
2207:          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
2208:              indices in the nondecreasing order to the original index positions.
2209:          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
2210:       */
2211:       ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);
2212:       ISSort(ciscol[ii]);
2213:       ++ii;
2214:     }
2215:   }
2216:   PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);
2217:   for (i = 0, ii = 0; i < ismax; ++i) {
2218:     PetscInt       j,issize;
2219:     const PetscInt *indices;

2221:     /*
2222:        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
2223:      */
2224:     ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);
2225:     ISSort(isrow[i]);
2226:     ISGetLocalSize(isrow[i],&issize);
2227:     ISGetIndices(isrow[i],&indices);
2228:     for (j = 1; j < issize; ++j) {
2229:       if (indices[j] == indices[j-1]) {
2230:         SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
2231:       }
2232:     }
2233:     ISRestoreIndices(isrow[i],&indices);


2236:     ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);
2237:     ISSort(iscol[i]);
2238:     ISGetLocalSize(iscol[i],&issize);
2239:     ISGetIndices(iscol[i],&indices);
2240:     for (j = 1; j < issize; ++j) {
2241:       if (indices[j-1] == indices[j]) {
2242:         SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
2243:       }
2244:     }
2245:     ISRestoreIndices(iscol[i],&indices);
2246:     MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
2247:     if (isize > 1) {
2248:       cisrow[ii] = isrow[i];
2249:       ++ii;
2250:     }
2251:   }
2252:   /*
2253:     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
2254:     array of sequential matrices underlying the resulting parallel matrices.
2255:     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
2256:     contain duplicates.

2258:     There are as many diag matrices as there are original index sets. There are only as many parallel
2259:     and off-diag matrices as there are parallel (comm size > 1) index sets.

2261:     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
2262:     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
2263:       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
2264:       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
2265:       with A[i] and B[ii] extracted from the corresponding MPI submat.
2266:     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
2267:       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
2268:       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
2269:       (or retrieve previously composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
2270:       values into A[i] and B[ii] sitting inside the corresponding submat.
2271:     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
2272:       A[i], B[ii], AA[i] or BB[ii] matrices.
2273:   */
2274:   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
2275:   if (scall == MAT_INITIAL_MATRIX) {
2276:     PetscMalloc1(ismax,submat);
2277:     /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
2278:   } else {
2279:     PetscMalloc1(ismax,&A);
2280:     PetscMalloc1(cismax,&B);
2281:     /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */
2282:     for (i = 0, ii = 0; i < ismax; ++i) {
2283:       MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
2284:       if (isize > 1) {
2285:         Mat AA,BB;
2286:         (*getlocalmats)((*submat)[i],&AA,&BB);
2287:         if (!isrow_p[i] && !iscol_p[i]) {
2288:           A[i] = AA;
2289:         } else {
2290:           /* TODO: extract A[i] composed on AA. */
2291:           MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);
2292:         }
2293:         if (!isrow_p[i] && !ciscol_p[ii]) {
2294:           B[ii] = BB;
2295:         } else {
2296:           /* TODO: extract B[ii] composed on BB. */
2297:           MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);
2298:         }
2299:         ++ii;
2300:       } else {
2301:         if (!isrow_p[i] && !iscol_p[i]) {
2302:           A[i] = (*submat)[i];
2303:         } else {
2304:           /* TODO: extract A[i] composed on (*submat)[i]. */
2305:           MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);
2306:         }
2307:       }
2308:     }
2309:   }
2310:   /* Now obtain the sequential A and B submatrices separately. */
2311:   (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);
2312:   /* FIXME: no good way to fix this has been found yet.
2313:    * The local column size of B[i] differs from the size of ciscol[i]:
2314:    * B[i]'s column size is only finally determined during MatAssembly, while
2315:    * ciscol[i] is computed as the complement of iscol[i].
2316:    * It would be better to keep only the nonzero columns when computing
2317:    * the complement ciscol[i].
2318:    */
2319:   if (scall == MAT_REUSE_MATRIX) {
2320:     for (i=0; i<cismax; i++) {
2321:       ISGetLocalSize(ciscol[i],&ciscol_localsize);
2322:       B[i]->cmap->n = ciscol_localsize;
2323:     }
2324:   }
2325:   (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);
2326:   /*
2327:     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
2328:     matrices A & B have been extracted directly into the parallel matrices containing them, or
2329:     simply into the sequential matrix identical with the corresponding A (if isize == 1).
2330:     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
2331:     to have the same sparsity pattern.
2332:     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
2333:     must be constructed for C. This is done by setseqmat(s).
2334:   */
2335:   for (i = 0, ii = 0; i < ismax; ++i) {
2336:     /*
2337:        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
2338:        That way we can avoid sorting and computing permutations when reusing.
2339:        To this end:
2340:         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
2341:         - if caching arrays to hold the ISs, make and compose a container for them so that it can
2342:           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
2343:     */
2344:     MatStructure pattern;
2345:     if (scall == MAT_INITIAL_MATRIX) {
2346:       pattern = DIFFERENT_NONZERO_PATTERN;
2347:     } else {
2348:       pattern = SAME_NONZERO_PATTERN;
2349:     }
2350:     MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
2351:     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
2352:     if (isize > 1) {
2353:       if (scall == MAT_INITIAL_MATRIX) {
2354:         MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);
2355:         MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
2356:         MatSetType((*submat)[i],MATMPIAIJ);
2357:         PetscLayoutSetUp((*submat)[i]->rmap);
2358:         PetscLayoutSetUp((*submat)[i]->cmap);
2359:       }
2360:       /*
2361:         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
2362:       */
2363:       {
2364:         Mat AA,BB;
2365:         AA = NULL;
2366:         if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
2367:         BB = NULL;
2368:         if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
2369:         if (AA || BB) {
2370:           setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);
2371:           MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);
2372:           MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);
2373:         }
2374:         if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
2375:           /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
2376:           MatDestroy(&AA);
2377:         }
2378:         if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
2379:           /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[ii]) && MAT_INITIAL_MATRIX */
2380:           MatDestroy(&BB);
2381:         }
2382:       }
2383:       ISDestroy(ciscol+ii);
2384:       ISDestroy(ciscol_p+ii);
2385:       ++ii;
2386:     } else { /* if (isize == 1) */
2387:       if (scall == MAT_INITIAL_MATRIX) {
2388:         if (isrow_p[i] || iscol_p[i]) {
2389:           MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);
2390:         } else (*submat)[i] = A[i];
2391:       }
2392:       if (isrow_p[i] || iscol_p[i]) {
2393:         setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);
2394:         /* Otherwise A is extracted straight into (*submat)[i]. */
2395:         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
2396:         MatDestroy(A+i);
2397:       }
2398:     }
2399:     ISDestroy(&isrow_p[i]);
2400:     ISDestroy(&iscol_p[i]);
2401:   }
2402:   PetscFree2(cisrow,ciscol);
2403:   PetscFree2(isrow_p,iscol_p);
2404:   PetscFree(ciscol_p);
2405:   PetscFree(A);
2406:   PetscFree(B);
2407:   return(0);
2408: }



2414: PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2415: {

2419:   MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);
2420:   return(0);
2421: }
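
/*
   Illustrative sketch (not part of mpiov.c): extracting a *parallel* submatrix.  This assumes the
   public entry point MatGetSubMatricesMPI() is available in this PETSc version and dispatches to
   the routine above for MPIAIJ matrices, and that C is square.  A single index set living on
   PETSC_COMM_WORLD selects the rows/columns in the first half of C; submat[0] comes back as an
   MPIAIJ matrix on the same communicator.  The helper name below is made up for the example.
*/
static PetscErrorCode ExampleGetSubMatricesMPI(Mat C)
{
  PetscInt rstart,rend,N,hi,n;
  IS       rowcol;
  Mat      *submat;

  MatGetSize(C,NULL,&N);
  MatGetOwnershipRange(C,&rstart,&rend);
  /* locally owned rows whose global index falls in the first half of C */
  hi = PetscMin(rend,N/2);
  n  = (hi > rstart) ? hi - rstart : 0;
  ISCreateStride(PETSC_COMM_WORLD,n,rstart,1,&rowcol);
  /* one parallel index-set pair -> one parallel (MPIAIJ) submatrix */
  MatGetSubMatricesMPI(C,1,&rowcol,&rowcol,MAT_INITIAL_MATRIX,&submat);
  /* ... use submat[0], distributed over PETSC_COMM_WORLD ... */
  MatDestroy(&submat[0]);
  PetscFree(submat);
  ISDestroy(&rowcol);
  return(0);
}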