Actual source code: mpiov.c

  1: /*
  2:    Routines to compute overlapping regions of a parallel MPI matrix
  3:    and to find submatrices that are shared across processors.
  4: */
  5: #include <../src/mat/impls/aij/seq/aij.h>
  6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  7: #include <petscbt.h>
  8: #include <petscsf.h>

 10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
 11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
 12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
 13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
 14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

 16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
 17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
 18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
 19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);

 21: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
 22: {
 23:   PetscInt       i;

 26:   for (i=0; i<ov; ++i) {
 27:     MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
 28:   }
 29:   return 0;
 30: }
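
/*
   A minimal usage sketch (not part of this file, and using the hypothetical names
   nlocal, rstart and submats): the drivers above are reached through the public
   MatIncreaseOverlap() interface, typically to grow subdomain index sets before
   extracting overlapping submatrices, e.g.

     IS  is[1];
     Mat *submats;
     ISCreateStride(PETSC_COMM_SELF,nlocal,rstart,1,&is[0]);       // local rows as the initial subdomain
     MatIncreaseOverlap(C,1,is,2);                                 // grow the subdomain by 2 levels of overlap
     MatCreateSubMatrices(C,1,is,is,MAT_INITIAL_MATRIX,&submats);  // extract the overlapping submatrix
*/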

 32: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
 33: {
 34:   PetscInt       i;

 37:   for (i=0; i<ov; ++i) {
 38:     MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);
 39:   }
 40:   return 0;
 41: }

 43: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
 44: {
 45:   MPI_Comm       comm;
 46:   PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
 47:   PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
 48:   PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
 49:   const PetscInt *indices_i,**indices;
 50:   PetscLayout    rmap;
 51:   PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
 52:   PetscSF        sf;
 53:   PetscSFNode    *remote;

 55:   PetscObjectGetComm((PetscObject)mat,&comm);
 56:   MPI_Comm_rank(comm,&rank);
 57:   MPI_Comm_size(comm,&size);
 58:   /* get row map to determine where rows should be going */
 59:   MatGetLayouts(mat,&rmap,NULL);
 60:   /* retrieve the IS data and put it all together so that we
 61:    * can optimize communication
 62:    */
 63:   PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);
 64:   for (i=0,tlength=0; i<nidx; i++) {
 65:     ISGetLocalSize(is[i],&length[i]);
 66:     tlength += length[i];
 67:     ISGetIndices(is[i],&indices[i]);
 68:   }
 69:   /* find these rows on remote processors */
 70:   PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);
 71:   PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);
 72:   nrrows = 0;
 73:   for (i=0; i<nidx; i++) {
 74:     length_i  = length[i];
 75:     indices_i = indices[i];
 76:     for (j=0; j<length_i; j++) {
 77:       owner = -1;
 78:       PetscLayoutFindOwner(rmap,indices_i[j],&owner);
 79:       /* remote processors */
 80:       if (owner != rank) {
 81:         tosizes_temp[owner]++; /* number of rows to owner */
 82:         rrow_ranks[nrrows]  = owner; /* processor */
 83:         rrow_isids[nrrows]   = i; /* is id */
 84:         remoterows[nrrows++] = indices_i[j]; /* row */
 85:       }
 86:     }
 87:     ISRestoreIndices(is[i],&indices[i]);
 88:   }
 89:   PetscFree2(*(PetscInt***)&indices,length);
 90:   /* test if we need to exchange messages;
 91:    * generally speaking, we do not need to exchange
 92:    * data when the overlap is 1
 93:    */
 94:   MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);
 95:   /* we do not have any messages to exchange;
 96:    * this usually corresponds to overlap 1
 97:    */
 98:   if (!reducednrrows) {
 99:     PetscFree3(toranks,tosizes,tosizes_temp);
100:     PetscFree3(remoterows,rrow_ranks,rrow_isids);
101:     MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
102:     return 0;
103:   }
104:   nto = 0;
105:   /* send sizes and ranks for building a two-sided communication */
106:   for (i=0; i<size; i++) {
107:     if (tosizes_temp[i]) {
108:       tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
109:       tosizes_temp[i] = nto; /* a map from processor to index */
110:       toranks[nto++]  = i; /* processor */
111:     }
112:   }
113:   PetscMalloc1(nto+1,&toffsets);
114:   toffsets[0] = 0;
115:   for (i=0; i<nto; i++) {
116:     toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
117:     tosizes[2*i+1] = toffsets[i]; /* offsets to send */
118:   }
119:   /* send information to other processors */
120:   PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);
121:   nrecvrows = 0;
122:   for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
123:   PetscMalloc1(nrecvrows,&remote);
124:   nrecvrows = 0;
125:   for (i=0; i<nfrom; i++) {
126:     for (j=0; j<fromsizes[2*i]; j++) {
127:       remote[nrecvrows].rank    = fromranks[i];
128:       remote[nrecvrows++].index = fromsizes[2*i+1]+j;
129:     }
130:   }
131:   PetscSFCreate(comm,&sf);
132:   PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
133:   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
134:   PetscSFSetType(sf,PETSCSFBASIC);
135:   PetscSFSetFromOptions(sf);
136:   /* message pair <no of is, row>  */
137:   PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);
138:   for (i=0; i<nrrows; i++) {
139:     owner = rrow_ranks[i]; /* processor */
140:     j     = tosizes_temp[owner]; /* index */
141:     todata[toffsets[j]++] = rrow_isids[i];
142:     todata[toffsets[j]++] = remoterows[i];
143:   }
144:   PetscFree3(toranks,tosizes,tosizes_temp);
145:   PetscFree3(remoterows,rrow_ranks,rrow_isids);
146:   PetscFree(toffsets);
147:   PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);
148:   PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);
149:   PetscSFDestroy(&sf);
150:   /* send the rows owned by remote processors so that we can get the overlapping data back */
151:   MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);
152:   PetscFree2(todata,fromdata);
153:   PetscFree(fromsizes);
154:   PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);
155:   PetscFree(fromranks);
156:   nrecvrows = 0;
157:   for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
158:   PetscCalloc1(nrecvrows,&todata);
159:   PetscMalloc1(nrecvrows,&remote);
160:   nrecvrows = 0;
161:   for (i=0; i<nto; i++) {
162:     for (j=0; j<tosizes[2*i]; j++) {
163:       remote[nrecvrows].rank    = toranks[i];
164:       remote[nrecvrows++].index = tosizes[2*i+1]+j;
165:     }
166:   }
167:   PetscSFCreate(comm,&sf);
168:   PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
169:   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
170:   PetscSFSetType(sf,PETSCSFBASIC);
171:   PetscSFSetFromOptions(sf);
172:   /* overlap communication and computation */
173:   PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);
174:   MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
175:   PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);
176:   PetscSFDestroy(&sf);
177:   PetscFree2(sbdata,sbsizes);
178:   MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);
179:   PetscFree(toranks);
180:   PetscFree(tosizes);
181:   PetscFree(todata);
182:   return 0;
183: }
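
/*
   The star-forest exchanges above follow the standard PetscSF broadcast pattern:
   leaves on this rank point at root entries owned by other ranks, and
   PetscSFBcastBegin/End moves the root data onto the leaves. A reduced sketch,
   with assumed counts and buffer names (nroots, nleaves, rootdata, leafdata):

     PetscSF     sf;
     PetscSFNode remote[nleaves];   // remote[k].rank/.index filled from the two-sided handshake
     PetscSFCreate(comm,&sf);
     PetscSFSetGraph(sf,nroots,nleaves,NULL,PETSC_COPY_VALUES,remote,PETSC_COPY_VALUES);
     PetscSFBcastBegin(sf,MPIU_INT,rootdata,leafdata,MPI_REPLACE);
     PetscSFBcastEnd(sf,MPIU_INT,rootdata,leafdata,MPI_REPLACE);
     PetscSFDestroy(&sf);
*/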

185: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
186: {
187:   PetscInt         *isz,isz_i,i,j,is_id, data_size;
188:   PetscInt          col,lsize,max_lsize,*indices_temp, *indices_i;
189:   const PetscInt   *indices_i_temp;
190:   MPI_Comm         *iscomms;

192:   max_lsize = 0;
193:   PetscMalloc1(nidx,&isz);
194:   for (i=0; i<nidx; i++) {
195:     ISGetLocalSize(is[i],&lsize);
196:     max_lsize = lsize>max_lsize ? lsize:max_lsize;
197:     isz[i]    = lsize;
198:   }
199:   PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);
200:   for (i=0; i<nidx; i++) {
201:     PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);
202:     ISGetIndices(is[i],&indices_i_temp);
203:     PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);
204:     ISRestoreIndices(is[i],&indices_i_temp);
205:     ISDestroy(&is[i]);
206:   }
207:   /* retrieve information to get row id and its overlap */
208:   for (i=0; i<nrecvs;) {
209:     is_id     = recvdata[i++];
210:     data_size = recvdata[i++];
211:     indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
212:     isz_i     = isz[is_id];
213:     for (j=0; j< data_size; j++) {
214:       col = recvdata[i++];
215:       indices_i[isz_i++] = col;
216:     }
217:     isz[is_id] = isz_i;
218:   }
219:   /* remove duplicate entries */
220:   for (i=0; i<nidx; i++) {
221:     indices_i = indices_temp+(max_lsize+nrecvs)*i;
222:     isz_i     = isz[i];
223:     PetscSortRemoveDupsInt(&isz_i,indices_i);
224:     ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);
225:     PetscCommDestroy(&iscomms[i]);
226:   }
227:   PetscFree(isz);
228:   PetscFree2(indices_temp,iscomms);
229:   return 0;
230: }

232: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
233: {
234:   PetscLayout       rmap,cmap;
235:   PetscInt          i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
236:   PetscInt          is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
237:   PetscInt         *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
238:   const PetscInt   *gcols,*ai,*aj,*bi,*bj;
239:   Mat               amat,bmat;
240:   PetscMPIInt       rank;
241:   PetscBool         done;
242:   MPI_Comm          comm;

244:   PetscObjectGetComm((PetscObject)mat,&comm);
245:   MPI_Comm_rank(comm,&rank);
246:   MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
247:   /* Even if the matrix is symmetric, we still treat it as nonsymmetric here */
248:   MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
250:   MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
252:   /* total number of nonzero values is used to estimate the memory usage in the next step */
253:   tnz  = ai[an]+bi[bn];
254:   MatGetLayouts(mat,&rmap,&cmap);
255:   PetscLayoutGetRange(rmap,&rstart,NULL);
256:   PetscLayoutGetRange(cmap,&cstart,NULL);
257:   /* to find the longest message */
258:   max_fszs = 0;
259:   for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
260:   /* is there a better way to estimate the number of nonzeros in the mat??? */
261:   PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);
262:   for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
263:   rows_pos  = 0;
264:   totalrows = 0;
265:   for (i=0; i<nfrom; i++) {
266:     PetscArrayzero(rows_pos_i,nidx);
267:     /* group data together */
268:     for (j=0; j<fromsizes[2*i]; j+=2) {
269:       is_id                       = fromrows[rows_pos++];/* no of is */
270:       rows_i                      = rows_data[is_id];
271:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
272:     }
273:     /* estimate the space needed to avoid multiple allocations */
274:     for (j=0; j<nidx; j++) {
275:       indvc_ij = 0;
276:       rows_i   = rows_data[j];
277:       for (l=0; l<rows_pos_i[j]; l++) {
278:         row    = rows_i[l]-rstart;
279:         start  = ai[row];
280:         end    = ai[row+1];
281:         for (k=start; k<end; k++) { /* Amat */
282:           col = aj[k] + cstart;
283:           indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
284:         }
285:         start = bi[row];
286:         end   = bi[row+1];
287:         for (k=start; k<end; k++) { /* Bmat */
288:           col = gcols[bj[k]];
289:           indices_tmp[indvc_ij++] = col;
290:         }
291:       }
292:       PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
293:       indv_counts[i*nidx+j] = indvc_ij;
294:       totalrows            += indvc_ij;
295:     }
296:   }
297:   /* message triple <no of is, number of rows, rows> */
298:   PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);
299:   totalrows = 0;
300:   rows_pos  = 0;
301:   /* repeat the grouping pass above, this time filling the send buffer */
302:   for (i=0;i<nfrom;i++) {
303:     PetscArrayzero(rows_pos_i,nidx);
304:     for (j=0; j<fromsizes[2*i]; j+=2) {
305:       is_id                       = fromrows[rows_pos++];
306:       rows_i                      = rows_data[is_id];
307:       rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
308:     }
309:     /* add data  */
310:     for (j=0; j<nidx; j++) {
311:       if (!indv_counts[i*nidx+j]) continue;
312:       indvc_ij = 0;
313:       sbdata[totalrows++] = j;
314:       sbdata[totalrows++] = indv_counts[i*nidx+j];
315:       sbsizes[2*i]       += 2;
316:       rows_i              = rows_data[j];
317:       for (l=0; l<rows_pos_i[j]; l++) {
318:         row   = rows_i[l]-rstart;
319:         start = ai[row];
320:         end   = ai[row+1];
321:         for (k=start; k<end; k++) { /* Amat */
322:           col = aj[k] + cstart;
323:           indices_tmp[indvc_ij++] = col;
324:         }
325:         start = bi[row];
326:         end   = bi[row+1];
327:         for (k=start; k<end; k++) { /* Bmat */
328:           col = gcols[bj[k]];
329:           indices_tmp[indvc_ij++] = col;
330:         }
331:       }
332:       PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
333:       sbsizes[2*i]  += indvc_ij;
334:       PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);
335:       totalrows += indvc_ij;
336:     }
337:   }
338:   PetscMalloc1(nfrom+1,&offsets);
339:   offsets[0] = 0;
340:   for (i=0; i<nfrom; i++) {
341:     offsets[i+1]   = offsets[i] + sbsizes[2*i];
342:     sbsizes[2*i+1] = offsets[i];
343:   }
344:   PetscFree(offsets);
345:   if (sbrowsizes) *sbrowsizes = sbsizes;
346:   if (sbrows) *sbrows = sbdata;
347:   PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);
348:   MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
349:   MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
350:   return 0;
351: }
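
/*
   A concrete instance of the reply format built above (assumed values, for
   illustration only): if, for one requesting rank, index set 0 gains the columns
   {4,7,9} and index set 2 gains {1}, the corresponding slice of sbdata is
   {0,3, 4,7,9, 2,1, 1} and sbsizes[2*i] = 8 (two header entries plus the column
   data for each index set).
*/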

353: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
354: {
355:   const PetscInt   *gcols,*ai,*aj,*bi,*bj, *indices;
356:   PetscInt          tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
357:   PetscInt          lsize,lsize_tmp;
358:   PetscMPIInt       rank,owner;
359:   Mat               amat,bmat;
360:   PetscBool         done;
361:   PetscLayout       cmap,rmap;
362:   MPI_Comm          comm;

364:   PetscObjectGetComm((PetscObject)mat,&comm);
365:   MPI_Comm_rank(comm,&rank);
366:   MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
367:   MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
369:   MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
371:   /* is this a safe way to compute the number of nonzero values? */
372:   tnz  = ai[an]+bi[bn];
373:   MatGetLayouts(mat,&rmap,&cmap);
374:   PetscLayoutGetRange(rmap,&rstart,NULL);
375:   PetscLayoutGetRange(cmap,&cstart,NULL);
376:   /* this is a better way to estimate memory than the old implementation,
377:    * which used the global size of the matrix
378:    */
379:   PetscMalloc1(tnz,&indices_temp);
380:   for (i=0; i<nidx; i++) {
381:     MPI_Comm iscomm;

383:     ISGetLocalSize(is[i],&lsize);
384:     ISGetIndices(is[i],&indices);
385:     lsize_tmp = 0;
386:     for (j=0; j<lsize; j++) {
387:       owner = -1;
388:       row   = indices[j];
389:       PetscLayoutFindOwner(rmap,row,&owner);
390:       if (owner != rank) continue;
391:       /* local number */
392:       row  -= rstart;
393:       start = ai[row];
394:       end   = ai[row+1];
395:       for (k=start; k<end; k++) { /* Amat */
396:         col = aj[k] + cstart;
397:         indices_temp[lsize_tmp++] = col;
398:       }
399:       start = bi[row];
400:       end   = bi[row+1];
401:       for (k=start; k<end; k++) { /* Bmat */
402:         col = gcols[bj[k]];
403:         indices_temp[lsize_tmp++] = col;
404:       }
405:     }
406:     ISRestoreIndices(is[i],&indices);
407:     PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);
408:     ISDestroy(&is[i]);
409:     PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);
410:     ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);
411:     PetscCommDestroy(&iscomm);
412:   }
413:   PetscFree(indices_temp);
414:   MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
415:   MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
416:   return 0;
417: }

419: /*
420:   Sample message format:
421:   If a processor A wants processor B to process some elements corresponding
422:   to index sets is[1],is[5]
423:   mesg [0] = 2   (no of index sets in the mesg)
424:   -----------
425:   mesg [1] = 1 => is[1]
426:   mesg [2] = sizeof(is[1]);
427:   -----------
428:   mesg [3] = 5  => is[5]
429:   mesg [4] = sizeof(is[5]);
430:   -----------
431:   mesg [5]
432:   mesg [n]  data(is[1])
433:   -----------
434:   mesg [n+1]
435:   mesg [m]  data(is[5])
436:   -----------
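
  A concrete instance (assumed values, for illustration only): if is[1] = {10,12}
  and is[5] = {3}, the message is {2, 1,2, 5,1, 10,12, 3}, i.e. the count of index
  sets, the (index set, size) header pairs, and then the row data blocks.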

438:   Notes:
439:   nrqs - no of requests sent (or to be sent out)
440:   nrqr - no of requests received (which have to be or which have been processed)
441: */
442: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
443: {
444:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
445:   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
446:   const PetscInt **idx,*idx_i;
447:   PetscInt       *n,**data,len;
448: #if defined(PETSC_USE_CTABLE)
449:   PetscTable     *table_data,table_data_i;
450:   PetscInt       *tdata,tcount,tcount_max;
451: #else
452:   PetscInt       *data_i,*d_p;
453: #endif
454:   PetscMPIInt    size,rank,tag1,tag2,proc = 0;
455:   PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
456:   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
457:   PetscBT        *table;
458:   MPI_Comm       comm;
459:   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
460:   MPI_Status     *recv_status;
461:   MPI_Comm       *iscomms;
462:   char           *t_p;

464:   PetscObjectGetComm((PetscObject)C,&comm);
465:   size = c->size;
466:   rank = c->rank;
467:   M    = C->rmap->N;

469:   PetscObjectGetNewTag((PetscObject)C,&tag1);
470:   PetscObjectGetNewTag((PetscObject)C,&tag2);

472:   PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);

474:   for (i=0; i<imax; i++) {
475:     ISGetIndices(is[i],&idx[i]);
476:     ISGetLocalSize(is[i],&n[i]);
477:   }

479:   /* evaluate communication - mesg to who, length of mesg, and buffer space
480:      required. Based on this, buffers are allocated, and data copied into them */
481:   PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
482:   for (i=0; i<imax; i++) {
483:     PetscArrayzero(w4,size); /* initialise work vector*/
484:     idx_i = idx[i];
485:     len   = n[i];
486:     for (j=0; j<len; j++) {
487:       row = idx_i[j];
489:       PetscLayoutFindOwner(C->rmap,row,&proc);
490:       w4[proc]++;
491:     }
492:     for (j=0; j<size; j++) {
493:       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
494:     }
495:   }

497:   nrqs     = 0;              /* no of outgoing messages */
498:   msz      = 0;              /* total mesg length (for all procs) */
499:   w1[rank] = 0;              /* no mesg sent to itself */
500:   w3[rank] = 0;
501:   for (i=0; i<size; i++) {
502:     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
503:   }
504:   /* pa[] is the list of processors to communicate with */
505:   PetscMalloc1(nrqs,&pa);
506:   for (i=0,j=0; i<size; i++) {
507:     if (w1[i]) {pa[j] = i; j++;}
508:   }

510:   /* Each message would have a header = 1 + 2*(no of IS) + data */
511:   for (i=0; i<nrqs; i++) {
512:     j      = pa[i];
513:     w1[j] += w2[j] + 2*w3[j];
514:     msz   += w1[j];
515:   }

517:   /* Determine the number of messages to expect, their lengths, and their from-ids */
518:   PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
519:   PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

521:   /* Now post the Irecvs corresponding to these messages */
522:   PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);

524:   /* Allocate Memory for outgoing messages */
525:   PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
526:   PetscArrayzero(outdat,size);
527:   PetscArrayzero(ptr,size);

529:   {
530:     PetscInt *iptr = tmp,ict  = 0;
531:     for (i=0; i<nrqs; i++) {
532:       j         = pa[i];
533:       iptr     +=  ict;
534:       outdat[j] = iptr;
535:       ict       = w1[j];
536:     }
537:   }

539:   /* Form the outgoing messages */
540:   /* plug in the headers */
541:   for (i=0; i<nrqs; i++) {
542:     j            = pa[i];
543:     outdat[j][0] = 0;
544:     PetscArrayzero(outdat[j]+1,2*w3[j]);
545:     ptr[j]       = outdat[j] + 2*w3[j] + 1;
546:   }

548:   /* Memory for doing local proc's work */
549:   {
550:     PetscInt M_BPB_imax = 0;
551: #if defined(PETSC_USE_CTABLE)
552:     PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);
553:     PetscMalloc1(imax,&table_data);
554:     for (i=0; i<imax; i++) {
555:       PetscTableCreate(n[i],M,&table_data[i]);
556:     }
557:     PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);
558:     for (i=0; i<imax; i++) {
559:       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
560:     }
561: #else
562:     PetscInt Mimax = 0;
563:     PetscIntMultError(M,imax, &Mimax);
564:     PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);
565:     PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);
566:     for (i=0; i<imax; i++) {
567:       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
568:       data[i]  = d_p + M*i;
569:     }
570: #endif
571:   }

573:   /* Parse the IS and update local tables and the outgoing buf with the data */
574:   {
575:     PetscInt n_i,isz_i,*outdat_j,ctr_j;
576:     PetscBT  table_i;

578:     for (i=0; i<imax; i++) {
579:       PetscArrayzero(ctr,size);
580:       n_i     = n[i];
581:       table_i = table[i];
582:       idx_i   = idx[i];
583: #if defined(PETSC_USE_CTABLE)
584:       table_data_i = table_data[i];
585: #else
586:       data_i  = data[i];
587: #endif
588:       isz_i   = isz[i];
589:       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
590:         row  = idx_i[j];
591:         PetscLayoutFindOwner(C->rmap,row,&proc);
592:         if (proc != rank) { /* copy to the outgoing buffer */
593:           ctr[proc]++;
594:           *ptr[proc] = row;
595:           ptr[proc]++;
596:         } else if (!PetscBTLookupSet(table_i,row)) {
597: #if defined(PETSC_USE_CTABLE)
598:           PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);
599: #else
600:           data_i[isz_i] = row; /* Update the local table */
601: #endif
602:           isz_i++;
603:         }
604:       }
605:       /* Update the headers for the current IS */
606:       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
607:         if ((ctr_j = ctr[j])) {
608:           outdat_j        = outdat[j];
609:           k               = ++outdat_j[0];
610:           outdat_j[2*k]   = ctr_j;
611:           outdat_j[2*k-1] = i;
612:         }
613:       }
614:       isz[i] = isz_i;
615:     }
616:   }

618:   /*  Now  post the sends */
619:   PetscMalloc1(nrqs,&s_waits1);
620:   for (i=0; i<nrqs; ++i) {
621:     j    = pa[i];
622:     MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
623:   }

625:   /* No longer need the original indices */
626:   PetscMalloc1(imax,&iscomms);
627:   for (i=0; i<imax; ++i) {
628:     ISRestoreIndices(is[i],idx+i);
629:     PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);
630:   }
631:   PetscFree2(*(PetscInt***)&idx,n);

633:   for (i=0; i<imax; ++i) {
634:     ISDestroy(&is[i]);
635:   }

637:   /* Do Local work */
638: #if defined(PETSC_USE_CTABLE)
639:   MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);
640: #else
641:   MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);
642: #endif

644:   /* Receive messages */
645:   PetscMalloc1(nrqr,&recv_status);
646:   MPI_Waitall(nrqr,r_waits1,recv_status);
647:   MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);

649:   /* Phase 1 sends are complete - deallocate buffers */
650:   PetscFree4(outdat,ptr,tmp,ctr);
651:   PetscFree4(w1,w2,w3,w4);

653:   PetscMalloc1(nrqr,&xdata);
654:   PetscMalloc1(nrqr,&isz1);
655:   MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
656:   PetscFree(rbuf[0]);
657:   PetscFree(rbuf);

659:   /* Send the data back */
660:   /* Do a global reduction to know the buffer space req for incoming messages */
661:   {
662:     PetscMPIInt *rw1;

664:     PetscCalloc1(size,&rw1);

666:     for (i=0; i<nrqr; ++i) {
667:       proc = recv_status[i].MPI_SOURCE;

670:       rw1[proc] = isz1[i];
671:     }
672:     PetscFree(onodes1);
673:     PetscFree(olengths1);

675:     /* Determine the number of messages to expect, their lengths, and their from-ids */
676:     PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
677:     PetscFree(rw1);
678:   }
679:   /* Now post the Irecvs corresponding to these messages */
680:   PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);

682:   /* Now  post the sends */
683:   PetscMalloc1(nrqr,&s_waits2);
684:   for (i=0; i<nrqr; ++i) {
685:     j    = recv_status[i].MPI_SOURCE;
686:     MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
687:   }

689:   /* receive work done on other processors */
690:   {
691:     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
692:     PetscMPIInt idex;
693:     PetscBT     table_i;

695:     for (i=0; i<nrqs; ++i) {
696:       MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE);
697:       /* Process the message */
698:       rbuf2_i = rbuf2[idex];
699:       ct1     = 2*rbuf2_i[0]+1;
700:       jmax    = rbuf2[idex][0];
701:       for (j=1; j<=jmax; j++) {
702:         max     = rbuf2_i[2*j];
703:         is_no   = rbuf2_i[2*j-1];
704:         isz_i   = isz[is_no];
705:         table_i = table[is_no];
706: #if defined(PETSC_USE_CTABLE)
707:         table_data_i = table_data[is_no];
708: #else
709:         data_i  = data[is_no];
710: #endif
711:         for (k=0; k<max; k++,ct1++) {
712:           row = rbuf2_i[ct1];
713:           if (!PetscBTLookupSet(table_i,row)) {
714: #if defined(PETSC_USE_CTABLE)
715:             PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);
716: #else
717:             data_i[isz_i] = row;
718: #endif
719:             isz_i++;
720:           }
721:         }
722:         isz[is_no] = isz_i;
723:       }
724:     }

726:     MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);
727:   }

729: #if defined(PETSC_USE_CTABLE)
730:   tcount_max = 0;
731:   for (i=0; i<imax; ++i) {
732:     table_data_i = table_data[i];
733:     PetscTableGetCount(table_data_i,&tcount);
734:     if (tcount_max < tcount) tcount_max = tcount;
735:   }
736:   PetscMalloc1(tcount_max+1,&tdata);
737: #endif

739:   for (i=0; i<imax; ++i) {
740: #if defined(PETSC_USE_CTABLE)
741:     PetscTablePosition tpos;
742:     table_data_i = table_data[i];

744:     PetscTableGetHeadPosition(table_data_i,&tpos);
745:     while (tpos) {
746:       PetscTableGetNext(table_data_i,&tpos,&k,&j);
747:       tdata[--j] = --k;
748:     }
749:     ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);
750: #else
751:     ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);
752: #endif
753:     PetscCommDestroy(&iscomms[i]);
754:   }

756:   PetscFree(iscomms);
757:   PetscFree(onodes2);
758:   PetscFree(olengths2);

760:   PetscFree(pa);
761:   PetscFree(rbuf2[0]);
762:   PetscFree(rbuf2);
763:   PetscFree(s_waits1);
764:   PetscFree(r_waits1);
765:   PetscFree(s_waits2);
766:   PetscFree(r_waits2);
767:   PetscFree(recv_status);
768:   if (xdata) {
769:     PetscFree(xdata[0]);
770:     PetscFree(xdata);
771:   }
772:   PetscFree(isz1);
773: #if defined(PETSC_USE_CTABLE)
774:   for (i=0; i<imax; i++) {
775:     PetscTableDestroy((PetscTable*)&table_data[i]);
776:   }
777:   PetscFree(table_data);
778:   PetscFree(tdata);
779:   PetscFree4(table,data,isz,t_p);
780: #else
781:   PetscFree5(table,data,isz,d_p,t_p);
782: #endif
783:   return 0;
784: }

786: /*
787:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
788:        the work on the local processor.

790:      Inputs:
791:       C     - the matrix (MATMPIAIJ);
792:       imax  - total no of index sets processed at a time;
793:       table - an array of bit tables (PetscBT), one per index set, each of size M bits.

795:      Output:
796:       isz    - array containing the count of the solution elements corresponding
797:                to each index set;
798:       data or table_data  - pointer to the solutions
799: */
800: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
801: {
802:   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
803:   Mat        A  = c->A,B = c->B;
804:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
805:   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
806:   PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
807:   PetscBT    table_i;
808: #if defined(PETSC_USE_CTABLE)
809:   PetscTable         table_data_i;
810:   PetscTablePosition tpos;
811:   PetscInt           tcount,*tdata;
812: #else
813:   PetscInt           *data_i;
814: #endif

816:   rstart = C->rmap->rstart;
817:   cstart = C->cmap->rstart;
818:   ai     = a->i;
819:   aj     = a->j;
820:   bi     = b->i;
821:   bj     = b->j;
822:   garray = c->garray;

824:   for (i=0; i<imax; i++) {
825: #if defined(PETSC_USE_CTABLE)
826:     /* copy existing entries of table_data_i into tdata[] */
827:     table_data_i = table_data[i];
828:     PetscTableGetCount(table_data_i,&tcount);

831:     PetscMalloc1(tcount,&tdata);
832:     PetscTableGetHeadPosition(table_data_i,&tpos);
833:     while (tpos) {
834:       PetscTableGetNext(table_data_i,&tpos,&row,&j);
835:       tdata[--j] = --row;
837:     }
838: #else
839:     data_i  = data[i];
840: #endif
841:     table_i = table[i];
842:     isz_i   = isz[i];
843:     max     = isz[i];

845:     for (j=0; j<max; j++) {
846: #if defined(PETSC_USE_CTABLE)
847:       row   = tdata[j] - rstart;
848: #else
849:       row   = data_i[j] - rstart;
850: #endif
851:       start = ai[row];
852:       end   = ai[row+1];
853:       for (k=start; k<end; k++) { /* Amat */
854:         val = aj[k] + cstart;
855:         if (!PetscBTLookupSet(table_i,val)) {
856: #if defined(PETSC_USE_CTABLE)
857:           PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);
858: #else
859:           data_i[isz_i] = val;
860: #endif
861:           isz_i++;
862:         }
863:       }
864:       start = bi[row];
865:       end   = bi[row+1];
866:       for (k=start; k<end; k++) { /* Bmat */
867:         val = garray[bj[k]];
868:         if (!PetscBTLookupSet(table_i,val)) {
869: #if defined(PETSC_USE_CTABLE)
870:           PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);
871: #else
872:           data_i[isz_i] = val;
873: #endif
874:           isz_i++;
875:         }
876:       }
877:     }
878:     isz[i] = isz_i;

880: #if defined(PETSC_USE_CTABLE)
881:     PetscFree(tdata);
882: #endif
883:   }
884:   return 0;
885: }

887: /*
888:       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
889:          and return the output

891:          Input:
892:            C    - the matrix
893:            nrqr - no of messages being processed.
894:            rbuf - an array of pointers to the received requests

896:          Output:
897:            xdata - array of messages to be sent back
898:            isz1  - size of each message

900:   For better efficiency perhaps we should malloc each xdata[i] separately;
901: then if a realloc is required we need only copy the data for that one row
902: rather than for all previous rows, as happens now where a single large chunk of
903: memory is used.

905: */
906: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
907: {
908:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
909:   Mat            A  = c->A,B = c->B;
910:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
911:   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
912:   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
913:   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
914:   PetscInt       *rbuf_i,kmax,rbuf_0;
915:   PetscBT        xtable;

917:   m      = C->rmap->N;
918:   rstart = C->rmap->rstart;
919:   cstart = C->cmap->rstart;
920:   ai     = a->i;
921:   aj     = a->j;
922:   bi     = b->i;
923:   bj     = b->j;
924:   garray = c->garray;

926:   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
927:     rbuf_i =  rbuf[i];
928:     rbuf_0 =  rbuf_i[0];
929:     ct    += rbuf_0;
930:     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
931:   }

933:   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
934:   else max1 = 1;
935:   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
936:   if (nrqr) {
937:     PetscMalloc1(mem_estimate,&xdata[0]);
938:     ++no_malloc;
939:   }
940:   PetscBTCreate(m,&xtable);
941:   PetscArrayzero(isz1,nrqr);

943:   ct3 = 0;
944:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
945:     rbuf_i =  rbuf[i];
946:     rbuf_0 =  rbuf_i[0];
947:     ct1    =  2*rbuf_0+1;
948:     ct2    =  ct1;
949:     ct3   += ct1;
950:     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
951:       PetscBTMemzero(m,xtable);
952:       oct2 = ct2;
953:       kmax = rbuf_i[2*j];
954:       for (k=0; k<kmax; k++,ct1++) {
955:         row = rbuf_i[ct1];
956:         if (!PetscBTLookupSet(xtable,row)) {
957:           if (!(ct3 < mem_estimate)) {
958:             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
959:             PetscMalloc1(new_estimate,&tmp);
960:             PetscArraycpy(tmp,xdata[0],mem_estimate);
961:             PetscFree(xdata[0]);
962:             xdata[0]     = tmp;
963:             mem_estimate = new_estimate; ++no_malloc;
964:             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
965:           }
966:           xdata[i][ct2++] = row;
967:           ct3++;
968:         }
969:       }
970:       for (k=oct2,max2=ct2; k<max2; k++) {
971:         row   = xdata[i][k] - rstart;
972:         start = ai[row];
973:         end   = ai[row+1];
974:         for (l=start; l<end; l++) {
975:           val = aj[l] + cstart;
976:           if (!PetscBTLookupSet(xtable,val)) {
977:             if (!(ct3 < mem_estimate)) {
978:               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
979:               PetscMalloc1(new_estimate,&tmp);
980:               PetscArraycpy(tmp,xdata[0],mem_estimate);
981:               PetscFree(xdata[0]);
982:               xdata[0]     = tmp;
983:               mem_estimate = new_estimate; ++no_malloc;
984:               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
985:             }
986:             xdata[i][ct2++] = val;
987:             ct3++;
988:           }
989:         }
990:         start = bi[row];
991:         end   = bi[row+1];
992:         for (l=start; l<end; l++) {
993:           val = garray[bj[l]];
994:           if (!PetscBTLookupSet(xtable,val)) {
995:             if (!(ct3 < mem_estimate)) {
996:               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
997:               PetscMalloc1(new_estimate,&tmp);
998:               PetscArraycpy(tmp,xdata[0],mem_estimate);
999:               PetscFree(xdata[0]);
1000:               xdata[0]     = tmp;
1001:               mem_estimate = new_estimate; ++no_malloc;
1002:               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1003:             }
1004:             xdata[i][ct2++] = val;
1005:             ct3++;
1006:           }
1007:         }
1008:       }
1009:       /* Update the header*/
1010:       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1011:       xdata[i][2*j-1] = rbuf_i[2*j-1];
1012:     }
1013:     xdata[i][0] = rbuf_0;
1014:     if (i+1<nrqr) xdata[i+1]  = xdata[i] + ct2;
1015:     isz1[i]     = ct2; /* size of each message */
1016:   }
1017:   PetscBTDestroy(&xtable);
1018:   PetscInfo(C,"Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n",mem_estimate,ct3,no_malloc);
1019:   return 0;
1020: }
1021: /* -------------------------------------------------------------------------*/
1022: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1023: /*
1024:     Every processor gets the entire matrix
1025: */
1026: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1027: {
1028:   Mat            B;
1029:   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1030:   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1031:   PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
1032:   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1033:   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;

1035:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1036:   MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);
1037:   if (scall == MAT_INITIAL_MATRIX) {
1038:     /* ----------------------------------------------------------------
1039:          Tell every processor the number of nonzeros per row
1040:     */
1041:     PetscMalloc1(A->rmap->N,&lens);
1042:     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1043:       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1044:     }
1045:     PetscMalloc2(size,&recvcounts,size,&displs);
1046:     for (i=0; i<size; i++) {
1047:       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1048:       displs[i]     = A->rmap->range[i];
1049:     }
1050:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1051:     /* ---------------------------------------------------------------
1052:          Create the sequential matrix of the same type as the local block diagonal
1053:     */
1054:     MatCreate(PETSC_COMM_SELF,&B);
1055:     MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);
1056:     MatSetBlockSizesFromMats(B,A,A);
1057:     MatSetType(B,((PetscObject)a->A)->type_name);
1058:     MatSeqAIJSetPreallocation(B,0,lens);
1059:     PetscCalloc1(2,Bin);
1060:     **Bin = B;
1061:     b     = (Mat_SeqAIJ*)B->data;

1063:     /*--------------------------------------------------------------------
1064:        Copy my part of matrix column indices over
1065:     */
1066:     sendcount  = ad->nz + bd->nz;
1067:     jsendbuf   = b->j + b->i[rstarts[rank]];
1068:     a_jsendbuf = ad->j;
1069:     b_jsendbuf = bd->j;
1070:     n          = A->rmap->rend - A->rmap->rstart;
1071:     cnt        = 0;
1072:     for (i=0; i<n; i++) {
1073:       /* put in lower diagonal portion */
1074:       m = bd->i[i+1] - bd->i[i];
1075:       while (m > 0) {
1076:         /* is it above diagonal (in bd (compressed) numbering) */
1077:         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1078:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1079:         m--;
1080:       }

1082:       /* put in diagonal portion */
1083:       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1084:         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1085:       }

1087:       /* put in upper diagonal portion */
1088:       while (m-- > 0) {
1089:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
1090:       }
1091:     }

1094:     /*--------------------------------------------------------------------
1095:        Gather all column indices to all processors
1096:     */
1097:     for (i=0; i<size; i++) {
1098:       recvcounts[i] = 0;
1099:       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1100:         recvcounts[i] += lens[j];
1101:       }
1102:     }
1103:     displs[0] = 0;
1104:     for (i=1; i<size; i++) {
1105:       displs[i] = displs[i-1] + recvcounts[i-1];
1106:     }
1107:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1108:     /*--------------------------------------------------------------------
1109:         Assemble the matrix into usable form (note numerical values not yet set)
1110:     */
1111:     /* set the b->ilen (length of each row) values */
1112:     PetscArraycpy(b->ilen,lens,A->rmap->N);
1113:     /* set the b->i indices */
1114:     b->i[0] = 0;
1115:     for (i=1; i<=A->rmap->N; i++) {
1116:       b->i[i] = b->i[i-1] + lens[i-1];
1117:     }
1118:     PetscFree(lens);
1119:     MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
1120:     MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);

1122:   } else {
1123:     B = **Bin;
1124:     b = (Mat_SeqAIJ*)B->data;
1125:   }

1127:   /*--------------------------------------------------------------------
1128:        Copy my part of matrix numerical values into the values location
1129:   */
1130:   if (flag == MAT_GET_VALUES) {
1131:     const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
1132:     MatScalar         *sendbuf,*recvbuf;

1134:     MatSeqAIJGetArrayRead(a->A,&ada);
1135:     MatSeqAIJGetArrayRead(a->B,&bda);
1136:     sendcount = ad->nz + bd->nz;
1137:     sendbuf   = b->a + b->i[rstarts[rank]];
1138:     a_sendbuf = ada;
1139:     b_sendbuf = bda;
1140:     b_sendj   = bd->j;
1141:     n         = A->rmap->rend - A->rmap->rstart;
1142:     cnt       = 0;
1143:     for (i=0; i<n; i++) {
1144:       /* put in lower diagonal portion */
1145:       m = bd->i[i+1] - bd->i[i];
1146:       while (m > 0) {
1147:         /* is it above diagonal (in bd (compressed) numbering) */
1148:         if (garray[*b_sendj] > A->rmap->rstart + i) break;
1149:         sendbuf[cnt++] = *b_sendbuf++;
1150:         m--;
1151:         b_sendj++;
1152:       }

1154:       /* put in diagonal portion */
1155:       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1156:         sendbuf[cnt++] = *a_sendbuf++;
1157:       }

1159:       /* put in upper diagonal portion */
1160:       while (m-- > 0) {
1161:         sendbuf[cnt++] = *b_sendbuf++;
1162:         b_sendj++;
1163:       }
1164:     }

1167:     /* -----------------------------------------------------------------
1168:        Gather all numerical values to all processors
1169:     */
1170:     if (!recvcounts) {
1171:       PetscMalloc2(size,&recvcounts,size,&displs);
1172:     }
1173:     for (i=0; i<size; i++) {
1174:       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1175:     }
1176:     displs[0] = 0;
1177:     for (i=1; i<size; i++) {
1178:       displs[i] = displs[i-1] + recvcounts[i-1];
1179:     }
1180:     recvbuf = b->a;
1181:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1182:     MatSeqAIJRestoreArrayRead(a->A,&ada);
1183:     MatSeqAIJRestoreArrayRead(a->B,&bda);
1184:   }  /* end of (flag == MAT_GET_VALUES) */
1185:   PetscFree2(recvcounts,displs);

1187:   if (A->symmetric) {
1188:     MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);
1189:   } else if (A->hermitian) {
1190:     MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);
1191:   } else if (A->structurally_symmetric) {
1192:     MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);
1193:   }
1194:   return 0;
1195: }
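
/*
   The gathers above use the in-place form of MPI_Allgatherv: each rank's
   contribution is read from its own slice of the receive buffer, so no separate
   send buffer is needed. A reduced sketch (assumed buffer and count names):

     // recvcounts[r] = number of entries owned by rank r, displs[r] = its offset
     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,buf,recvcounts,displs,MPIU_INT,comm);
*/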

1197: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1198: {
1199:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
1200:   Mat            submat,A = c->A,B = c->B;
1201:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1202:   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1203:   PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1204:   const PetscInt *icol,*irow;
1205:   PetscInt       nrow,ncol,start;
1206:   PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1207:   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1208:   PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1209:   PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1210:   PetscInt       *lens,rmax,ncols,*cols,Crow;
1211: #if defined(PETSC_USE_CTABLE)
1212:   PetscTable     cmap,rmap;
1213:   PetscInt       *cmap_loc,*rmap_loc;
1214: #else
1215:   PetscInt       *cmap,*rmap;
1216: #endif
1217:   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1218:   PetscInt       *cworkB,lwrite,*subcols,*row2proc;
1219:   PetscScalar    *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
1220:   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1221:   MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
1222:   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1223:   MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
1224:   MPI_Comm       comm;
1225:   PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1226:   PetscMPIInt    *onodes1,*olengths1,idex,end;
1227:   Mat_SubSppt    *smatis1;
1228:   PetscBool      isrowsorted,iscolsorted;

1233:   MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);
1234:   MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);
1235:   PetscObjectGetComm((PetscObject)C,&comm);
1236:   size = c->size;
1237:   rank = c->rank;

1239:   ISSorted(iscol[0],&iscolsorted);
1240:   ISSorted(isrow[0],&isrowsorted);
1241:   ISGetIndices(isrow[0],&irow);
1242:   ISGetLocalSize(isrow[0],&nrow);
1243:   if (allcolumns) {
1244:     icol = NULL;
1245:     ncol = C->cmap->N;
1246:   } else {
1247:     ISGetIndices(iscol[0],&icol);
1248:     ISGetLocalSize(iscol[0],&ncol);
1249:   }

1251:   if (scall == MAT_INITIAL_MATRIX) {
1252:     PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;

1254:     /* Get some new tags to keep the communication clean */
1255:     tag1 = ((PetscObject)C)->tag;
1256:     PetscObjectGetNewTag((PetscObject)C,&tag2);
1257:     PetscObjectGetNewTag((PetscObject)C,&tag3);

1259:     /* evaluate communication - mesg to who, length of mesg, and buffer space
1260:      required. Based on this, buffers are allocated, and data copied into them */
1261:     PetscCalloc2(size,&w1,size,&w2);
1262:     PetscMalloc1(nrow,&row2proc);

1264:     /* w1[proc] = num of rows owned by proc -- to be requested */
1265:     proc = 0;
1266:     nrqs = 0; /* num of outgoing messages */
1267:     for (j=0; j<nrow; j++) {
1268:       row  = irow[j];
1269:       if (!isrowsorted) proc = 0;
1270:       while (row >= C->rmap->range[proc+1]) proc++;
1271:       w1[proc]++;
1272:       row2proc[j] = proc; /* map row index to proc */

1274:       if (proc != rank && !w2[proc]) {
1275:         w2[proc] = 1; nrqs++;
1276:       }
1277:     }
1278:     w1[rank] = 0;  /* rows owned by self will not be requested */

1280:     PetscMalloc1(nrqs,&pa); /*(proc -array)*/
1281:     for (proc=0,j=0; proc<size; proc++) {
1282:       if (w1[proc]) { pa[j++] = proc;}
1283:     }

1285:     /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1286:     msz = 0;              /* total mesg length (for all procs) */
1287:     for (i=0; i<nrqs; i++) {
1288:       proc      = pa[i];
1289:       w1[proc] += 3;
1290:       msz      += w1[proc];
1291:     }
1292:     PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz);

1294:     /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
1295:     /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1296:     PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);

1298:     /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1299:        Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1300:     PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

1302:     /* Now post the Irecvs corresponding to these messages */
1303:     PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);

1305:     PetscFree(onodes1);
1306:     PetscFree(olengths1);

1308:     /* Allocate Memory for outgoing messages */
1309:     PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
1310:     PetscArrayzero(sbuf1,size);
1311:     PetscArrayzero(ptr,size);

1313:     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1314:     iptr = tmp;
1315:     for (i=0; i<nrqs; i++) {
1316:       proc        = pa[i];
1317:       sbuf1[proc] = iptr;
1318:       iptr       += w1[proc];
1319:     }

1321:     /* Form the outgoing messages */
1322:     /* Initialize the header space */
1323:     for (i=0; i<nrqs; i++) {
1324:       proc      = pa[i];
1325:       PetscArrayzero(sbuf1[proc],3);
1326:       ptr[proc] = sbuf1[proc] + 3;
1327:     }

1329:     /* Parse the isrow and copy data into outbuf */
1330:     PetscArrayzero(ctr,size);
1331:     for (j=0; j<nrow; j++) {  /* parse the indices of each IS */
1332:       proc = row2proc[j];
1333:       if (proc != rank) { /* copy to the outgoing buf*/
1334:         *ptr[proc] = irow[j];
1335:         ctr[proc]++; ptr[proc]++;
1336:       }
1337:     }

1339:     /* Update the headers for the current IS */
1340:     for (j=0; j<size; j++) { /* Can Optimise this loop too */
1341:       if ((ctr_j = ctr[j])) {
1342:         sbuf1_j        = sbuf1[j];
1343:         k              = ++sbuf1_j[0];
1344:         sbuf1_j[2*k]   = ctr_j;
1345:         sbuf1_j[2*k-1] = 0;
1346:       }
1347:     }

1349:     /* Now post the sends */
1350:     PetscMalloc1(nrqs,&s_waits1);
1351:     for (i=0; i<nrqs; ++i) {
1352:       proc = pa[i];
1353:       MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);
1354:     }

1356:     /* Post Receives to capture the buffer size */
1357:     PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2);
1358:     PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);

1360:     if (nrqs) rbuf2[0] = tmp + msz;
1361:     for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];

1363:     for (i=0; i<nrqs; ++i) {
1364:       proc = pa[i];
1365:       MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);
1366:     }

1368:     PetscFree2(w1,w2);

1370:     /* Send to other procs the buf size they should allocate */
1371:     /* Receive messages*/
1372:     PetscMalloc1(nrqr,&r_status1);
1373:     PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);

1375:     MPI_Waitall(nrqr,r_waits1,r_status1);
1376:     for (i=0; i<nrqr; ++i) {
1377:       req_size[i] = 0;
1378:       rbuf1_i        = rbuf1[i];
1379:       start          = 2*rbuf1_i[0] + 1;
1380:       MPI_Get_count(r_status1+i,MPIU_INT,&end);
1381:       PetscMalloc1(end,&sbuf2[i]);
1382:       sbuf2_i        = sbuf2[i];
1383:       for (j=start; j<end; j++) {
1384:         k            = rbuf1_i[j] - rstart;
1385:         ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1386:         sbuf2_i[j]   = ncols;
1387:         req_size[i] += ncols;
1388:       }
1389:       req_source1[i] = r_status1[i].MPI_SOURCE;

1391:       /* form the header */
1392:       sbuf2_i[0] = req_size[i];
1393:       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

1395:       MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
1396:     }

1398:     PetscFree(r_status1);
1399:     PetscFree(r_waits1);

1401:     /* rbuf2 is received, Post recv column indices a->j */
1402:     MPI_Waitall(nrqs,r_waits2,r_status2);

1404:     PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3);
1405:     for (i=0; i<nrqs; ++i) {
1406:       PetscMalloc1(rbuf2[i][0],&rbuf3[i]);
1407:       req_source2[i] = r_status2[i].MPI_SOURCE;
1408:       MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
1409:     }

1411:     /* Wait on sends1 and sends2 */
1412:     PetscMalloc1(nrqs,&s_status1);
1413:     MPI_Waitall(nrqs,s_waits1,s_status1);
1414:     PetscFree(s_waits1);
1415:     PetscFree(s_status1);

1417:     MPI_Waitall(nrqr,s_waits2,s_status2);
1418:     PetscFree4(r_status2,s_waits2,r_waits2,s_status2);

1420:     /* Now allocate sending buffers for a->j, and send them off */
1421:     PetscMalloc1(nrqr,&sbuf_aj);
1422:     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1423:     if (nrqr) PetscMalloc1(j,&sbuf_aj[0]);
1424:     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

1426:     for (i=0; i<nrqr; i++) { /* for each requested message */
1427:       rbuf1_i   = rbuf1[i];
1428:       sbuf_aj_i = sbuf_aj[i];
1429:       ct1       = 2*rbuf1_i[0] + 1;
1430:       ct2       = 0;

1433:       kmax = rbuf1[i][2];
1434:       for (k=0; k<kmax; k++,ct1++) { /* for each row */
1435:         row    = rbuf1_i[ct1] - rstart;
1436:         nzA    = ai[row+1] - ai[row];
1437:         nzB    = bi[row+1] - bi[row];
1438:         ncols  = nzA + nzB;
1439:         cworkA = aj + ai[row]; cworkB = bj + bi[row];

1441:         /* load the column indices for this row into cols*/
1442:         cols = sbuf_aj_i + ct2;

1444:         lwrite = 0;
1445:         for (l=0; l<nzB; l++) {
1446:           if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1447:         }
1448:         for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1449:         for (l=0; l<nzB; l++) {
1450:           if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1451:         }

1453:         ct2 += ncols;
1454:       }
1455:       MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
1456:     }

1458:     /* create column map (cmap): global col of C -> local col of submat */
1459: #if defined(PETSC_USE_CTABLE)
1460:     if (!allcolumns) {
1461:       PetscTableCreate(ncol,C->cmap->N,&cmap);
1462:       PetscCalloc1(C->cmap->n,&cmap_loc);
1463:       for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1464:         if (icol[j] >= cstart && icol[j] <cend) {
1465:           cmap_loc[icol[j] - cstart] = j+1;
1466:         } else { /* use PetscTable for non-local col indices */
1467:           PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);
1468:         }
1469:       }
1470:     } else {
1471:       cmap     = NULL;
1472:       cmap_loc = NULL;
1473:     }
1474:     PetscCalloc1(C->rmap->n,&rmap_loc);
1475: #else
1476:     if (!allcolumns) {
1477:       PetscCalloc1(C->cmap->N,&cmap);
1478:       for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1479:     } else {
1480:       cmap = NULL;
1481:     }
1482: #endif

1484:     /* Create lens for MatSeqAIJSetPreallocation() */
1485:     PetscCalloc1(nrow,&lens);

1487:     /* Compute lens from local part of C */
1488:     for (j=0; j<nrow; j++) {
1489:       row  = irow[j];
1490:       proc = row2proc[j];
1491:       if (proc == rank) {
1492:         /* diagonal part A = c->A */
1493:         ncols = ai[row-rstart+1] - ai[row-rstart];
1494:         cols  = aj + ai[row-rstart];
1495:         if (!allcolumns) {
1496:           for (k=0; k<ncols; k++) {
1497: #if defined(PETSC_USE_CTABLE)
1498:             tcol = cmap_loc[cols[k]];
1499: #else
1500:             tcol = cmap[cols[k]+cstart];
1501: #endif
1502:             if (tcol) lens[j]++;
1503:           }
1504:         } else { /* allcolumns */
1505:           lens[j] = ncols;
1506:         }

1508:         /* off-diagonal part B = c->B */
1509:         ncols = bi[row-rstart+1] - bi[row-rstart];
1510:         cols  = bj + bi[row-rstart];
1511:         if (!allcolumns) {
1512:           for (k=0; k<ncols; k++) {
1513: #if defined(PETSC_USE_CTABLE)
1514:             PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);
1515: #else
1516:             tcol = cmap[bmap[cols[k]]];
1517: #endif
1518:             if (tcol) lens[j]++;
1519:           }
1520:         } else { /* allcolumns */
1521:           lens[j] += ncols;
1522:         }
1523:       }
1524:     }

1526:     /* Create row map (rmap): global row of C -> local row of submat */
1527: #if defined(PETSC_USE_CTABLE)
1528:     PetscTableCreate(nrow,C->rmap->N,&rmap);
1529:     for (j=0; j<nrow; j++) {
1530:       row  = irow[j];
1531:       proc = row2proc[j];
1532:       if (proc == rank) { /* a local row */
1533:         rmap_loc[row - rstart] = j;
1534:       } else {
1535:         PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);
1536:       }
1537:     }
1538: #else
1539:     PetscCalloc1(C->rmap->N,&rmap);
1540:     for (j=0; j<nrow; j++) {
1541:       rmap[irow[j]] = j;
1542:     }
1543: #endif

1545:     /* Update lens from offproc data */
1546:     /* receives of a->j are done */
1547:     MPI_Waitall(nrqs,r_waits3,r_status3);
1548:     for (i=0; i<nrqs; i++) {
1549:       proc    = pa[i];
1550:       sbuf1_i = sbuf1[proc];
1552:       ct1     = 2 + 1;
1553:       ct2     = 0;
1554:       rbuf2_i = rbuf2[i]; /* received length of C->j */
1555:       rbuf3_i = rbuf3[i]; /* received C->j */

1558:       max1   = sbuf1_i[2];
1559:       for (k=0; k<max1; k++,ct1++) {
1560: #if defined(PETSC_USE_CTABLE)
1561:         PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);
1562:         row--;
1564: #else
1565:         row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1566: #endif
1567:         /* Now, store row index of submat in sbuf1_i[ct1] */
1568:         sbuf1_i[ct1] = row;

1570:         nnz = rbuf2_i[ct1];
1571:         if (!allcolumns) {
1572:           for (l=0; l<nnz; l++,ct2++) {
1573: #if defined(PETSC_USE_CTABLE)
1574:             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1575:               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1576:             } else {
1577:               PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);
1578:             }
1579: #else
1580:             tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1581: #endif
1582:             if (tcol) lens[row]++;
1583:           }
1584:         } else { /* allcolumns */
1585:           lens[row] += nnz;
1586:         }
1587:       }
1588:     }
1589:     MPI_Waitall(nrqr,s_waits3,s_status3);
1590:     PetscFree4(r_waits3,s_waits3,r_status3,s_status3);

1592:     /* Create the submatrix */
1593:     MatCreate(PETSC_COMM_SELF,&submat);
1594:     MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);

1596:     ISGetBlockSize(isrow[0],&i);
1597:     ISGetBlockSize(iscol[0],&j);
1598:     MatSetBlockSizes(submat,i,j);
1599:     MatSetType(submat,((PetscObject)A)->type_name);
1600:     MatSeqAIJSetPreallocation(submat,0,lens);

1602:     /* create struct Mat_SubSppt and attach it to submat */
1603:     PetscNew(&smatis1);
1604:     subc = (Mat_SeqAIJ*)submat->data;
1605:     subc->submatis1 = smatis1;

1607:     smatis1->id          = 0;
1608:     smatis1->nrqs        = nrqs;
1609:     smatis1->nrqr        = nrqr;
1610:     smatis1->rbuf1       = rbuf1;
1611:     smatis1->rbuf2       = rbuf2;
1612:     smatis1->rbuf3       = rbuf3;
1613:     smatis1->sbuf2       = sbuf2;
1614:     smatis1->req_source2 = req_source2;

1616:     smatis1->sbuf1       = sbuf1;
1617:     smatis1->ptr         = ptr;
1618:     smatis1->tmp         = tmp;
1619:     smatis1->ctr         = ctr;

1621:     smatis1->pa           = pa;
1622:     smatis1->req_size     = req_size;
1623:     smatis1->req_source1  = req_source1;

1625:     smatis1->allcolumns  = allcolumns;
1626:     smatis1->singleis    = PETSC_TRUE;
1627:     smatis1->row2proc    = row2proc;
1628:     smatis1->rmap        = rmap;
1629:     smatis1->cmap        = cmap;
1630: #if defined(PETSC_USE_CTABLE)
1631:     smatis1->rmap_loc    = rmap_loc;
1632:     smatis1->cmap_loc    = cmap_loc;
1633: #endif

1635:     smatis1->destroy     = submat->ops->destroy;
1636:     submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1637:     submat->factortype   = C->factortype;

1639:     /* compute rmax */
1640:     rmax = 0;
1641:     for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);

1643:   } else { /* scall == MAT_REUSE_MATRIX */
1644:     submat = submats[0];

1647:     subc    = (Mat_SeqAIJ*)submat->data;
1648:     rmax    = subc->rmax;
1649:     smatis1 = subc->submatis1;
1650:     nrqs        = smatis1->nrqs;
1651:     nrqr        = smatis1->nrqr;
1652:     rbuf1       = smatis1->rbuf1;
1653:     rbuf2       = smatis1->rbuf2;
1654:     rbuf3       = smatis1->rbuf3;
1655:     req_source2 = smatis1->req_source2;

1657:     sbuf1     = smatis1->sbuf1;
1658:     sbuf2     = smatis1->sbuf2;
1659:     ptr       = smatis1->ptr;
1660:     tmp       = smatis1->tmp;
1661:     ctr       = smatis1->ctr;

1663:     pa         = smatis1->pa;
1664:     req_size   = smatis1->req_size;
1665:     req_source1 = smatis1->req_source1;

1667:     allcolumns = smatis1->allcolumns;
1668:     row2proc   = smatis1->row2proc;
1669:     rmap       = smatis1->rmap;
1670:     cmap       = smatis1->cmap;
1671: #if defined(PETSC_USE_CTABLE)
1672:     rmap_loc   = smatis1->rmap_loc;
1673:     cmap_loc   = smatis1->cmap_loc;
1674: #endif
1675:   }

1677:   /* Post recv matrix values */
1678:   PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals);
1679:   PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4);
1680:   PetscObjectGetNewTag((PetscObject)C,&tag4);
1681:   for (i=0; i<nrqs; ++i) {
1682:     PetscMalloc1(rbuf2[i][0],&rbuf4[i]);
1683:     MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
1684:   }

1686:   /* Allocate sending buffers for a->a, and send them off */
1687:   PetscMalloc1(nrqr,&sbuf_aa);
1688:   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1689:   if (nrqr) PetscMalloc1(j,&sbuf_aa[0]);
1690:   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

1692:   for (i=0; i<nrqr; i++) {
1693:     rbuf1_i   = rbuf1[i];
1694:     sbuf_aa_i = sbuf_aa[i];
1695:     ct1       = 2*rbuf1_i[0]+1;
1696:     ct2       = 0;

1699:     kmax = rbuf1_i[2];
1700:     for (k=0; k<kmax; k++,ct1++) {
1701:       row = rbuf1_i[ct1] - rstart;
1702:       nzA = ai[row+1] - ai[row];
1703:       nzB = bi[row+1] - bi[row];
1704:       ncols  = nzA + nzB;
1705:       cworkB = bj + bi[row];
1706:       vworkA = a_a + ai[row];
1707:       vworkB = b_a + bi[row];

1709:       /* load the column values for this row into vals */
1710:       vals = sbuf_aa_i + ct2;

1712:       lwrite = 0;
1713:       for (l=0; l<nzB; l++) {
1714:         if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1715:       }
1716:       for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1717:       for (l=0; l<nzB; l++) {
1718:         if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1719:       }

1721:       ct2 += ncols;
1722:     }
1723:     MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
1724:   }

1726:   /* Assemble submat */
1727:   /* First assemble the local rows */
1728:   for (j=0; j<nrow; j++) {
1729:     row  = irow[j];
1730:     proc = row2proc[j];
1731:     if (proc == rank) {
1732:       Crow = row - rstart;  /* local row index of C */
1733: #if defined(PETSC_USE_CTABLE)
1734:       row = rmap_loc[Crow]; /* row index of submat */
1735: #else
1736:       row = rmap[row];
1737: #endif

1739:       if (allcolumns) {
1740:         /* diagonal part A = c->A */
1741:         ncols = ai[Crow+1] - ai[Crow];
1742:         cols  = aj + ai[Crow];
1743:         vals  = a_a + ai[Crow];
1744:         i     = 0;
1745:         for (k=0; k<ncols; k++) {
1746:           subcols[i]   = cols[k] + cstart;
1747:           subvals[i++] = vals[k];
1748:         }

1750:         /* off-diagonal part B = c->B */
1751:         ncols = bi[Crow+1] - bi[Crow];
1752:         cols  = bj + bi[Crow];
1753:         vals  = b_a + bi[Crow];
1754:         for (k=0; k<ncols; k++) {
1755:           subcols[i]   = bmap[cols[k]];
1756:           subvals[i++] = vals[k];
1757:         }

1759:         MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);

1761:       } else { /* !allcolumns */
1762: #if defined(PETSC_USE_CTABLE)
1763:         /* diagonal part A = c->A */
1764:         ncols = ai[Crow+1] - ai[Crow];
1765:         cols  = aj + ai[Crow];
1766:         vals  = a_a + ai[Crow];
1767:         i     = 0;
1768:         for (k=0; k<ncols; k++) {
1769:           tcol = cmap_loc[cols[k]];
1770:           if (tcol) {
1771:             subcols[i]   = --tcol;
1772:             subvals[i++] = vals[k];
1773:           }
1774:         }

1776:         /* off-diagonal part B = c->B */
1777:         ncols = bi[Crow+1] - bi[Crow];
1778:         cols  = bj + bi[Crow];
1779:         vals  = b_a + bi[Crow];
1780:         for (k=0; k<ncols; k++) {
1781:           PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);
1782:           if (tcol) {
1783:             subcols[i]   = --tcol;
1784:             subvals[i++] = vals[k];
1785:           }
1786:         }
1787: #else
1788:         /* diagonal part A = c->A */
1789:         ncols = ai[Crow+1] - ai[Crow];
1790:         cols  = aj + ai[Crow];
1791:         vals  = a_a + ai[Crow];
1792:         i     = 0;
1793:         for (k=0; k<ncols; k++) {
1794:           tcol = cmap[cols[k]+cstart];
1795:           if (tcol) {
1796:             subcols[i]   = --tcol;
1797:             subvals[i++] = vals[k];
1798:           }
1799:         }

1801:         /* off-diagonal part B = c->B */
1802:         ncols = bi[Crow+1] - bi[Crow];
1803:         cols  = bj + bi[Crow];
1804:         vals  = b_a + bi[Crow];
1805:         for (k=0; k<ncols; k++) {
1806:           tcol = cmap[bmap[cols[k]]];
1807:           if (tcol) {
1808:             subcols[i]   = --tcol;
1809:             subvals[i++] = vals[k];
1810:           }
1811:         }
1812: #endif
1813:         MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);
1814:       }
1815:     }
1816:   }

1818:   /* Now assemble the off-proc rows */
1819:   for (i=0; i<nrqs; i++) { /* for each requested message */
1820:     /* recv values from other processes */
1821:     MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);
1822:     proc    = pa[idex];
1823:     sbuf1_i = sbuf1[proc];
1825:     ct1     = 2 + 1;
1826:     ct2     = 0; /* count of received C->j */
1827:     ct3     = 0; /* count of received C->j that will be inserted into submat */
1828:     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
1829:     rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1830:     rbuf4_i = rbuf4[idex]; /* received C->a from other processes */

1833:     max1 = sbuf1_i[2];             /* num of rows */
1834:     for (k=0; k<max1; k++,ct1++) { /* for each received row */
1835:       row = sbuf1_i[ct1]; /* row index of submat */
1836:       if (!allcolumns) {
1837:         idex = 0;
1838:         if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1839:           nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1840:           for (l=0; l<nnz; l++,ct2++) { /* for each received column */
1841: #if defined(PETSC_USE_CTABLE)
1842:             if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1843:               tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1844:             } else {
1845:               PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);
1846:             }
1847: #else
1848:             tcol = cmap[rbuf3_i[ct2]];
1849: #endif
1850:             if (tcol) {
1851:               subcols[idex]   = --tcol; /* may not be sorted */
1852:               subvals[idex++] = rbuf4_i[ct2];

1854:               /* We receive an entire row of C, but only a subset of its entries needs to be inserted into submat.
1855:                For reuse, we overwrite the received C->j with the positions (within the receive buffer) of the entries that go into submat */
1856:               if (iscolsorted) rbuf3_i[ct3++] = ct2;
1857:             }
1858:           }
1859:           MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);
1860:         } else { /* scall == MAT_REUSE_MATRIX */
1861:           submat = submats[0];
1862:           subc   = (Mat_SeqAIJ*)submat->data;

1864:           nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1865:           for (l=0; l<nnz; l++) {
1866:             ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1867:             subvals[idex++] = rbuf4_i[ct2];
1868:           }

1870:           bj = subc->j + subc->i[row]; /* sorted column indices */
1871:           MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);
1872:         }
1873:       } else { /* allcolumns */
1874:         nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
1875:         MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);
1876:         ct2 += nnz;
1877:       }
1878:     }
1879:   }

1881:   /* sends of a->a are done */
1882:   MPI_Waitall(nrqr,s_waits4,s_status4);
1883:   PetscFree4(r_waits4,s_waits4,r_status4,s_status4);

1885:   MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);
1886:   MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);
1887:   submats[0] = submat;

1889:   /* Restore the indices */
1890:   ISRestoreIndices(isrow[0],&irow);
1891:   if (!allcolumns) {
1892:     ISRestoreIndices(iscol[0],&icol);
1893:   }

1895:   /* Destroy allocated memory */
1896:   for (i=0; i<nrqs; ++i) {
1897:     PetscFree(rbuf4[i]);
1898:   }
1899:   PetscFree3(rbuf4,subcols,subvals);
1900:   if (sbuf_aa) {
1901:     PetscFree(sbuf_aa[0]);
1902:     PetscFree(sbuf_aa);
1903:   }

1905:   if (scall == MAT_INITIAL_MATRIX) {
1906:     PetscFree(lens);
1907:     if (sbuf_aj) {
1908:       PetscFree(sbuf_aj[0]);
1909:       PetscFree(sbuf_aj);
1910:     }
1911:   }
1912:   MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);
1913:   MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);
1914:   return 0;
1915: }

1917: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1918: {
1919:   PetscInt       ncol;
1920:   PetscBool      colflag,allcolumns=PETSC_FALSE;

1922:   /* Allocate memory to hold all the submatrices */
1923:   if (scall == MAT_INITIAL_MATRIX) {
1924:     PetscCalloc1(2,submat);
1925:   }

1927:   /* Check for special case: each processor gets entire matrix columns */
1928:   ISIdentity(iscol[0],&colflag);
1929:   ISGetLocalSize(iscol[0],&ncol);
1930:   if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

1932:   MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);
1933:   return 0;
1934: }
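/* Illustrative usage sketch: the allcolumns fast path above is taken when iscol[0] is an identity
   IS covering all N columns of C.  Assuming the caller goes through the public
   MatCreateSubMatrices() interface, something like the following should trigger it
   (error checking omitted; irow and N are hypothetical):

     IS  iscol;
     Mat *submats;
     ISCreateStride(PETSC_COMM_SELF,N,0,1,&iscol);                        // identity over all columns
     MatCreateSubMatrices(C,1,&irow,&iscol,MAT_INITIAL_MATRIX,&submats);  // submats[0] keeps every column
     ...
     MatDestroySubMatrices(1,&submats);
     ISDestroy(&iscol);
*/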

1936: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1937: {
1938:   PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1939:   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
1940:   Mat_SeqAIJ     *subc;
1941:   Mat_SubSppt    *smat;

1943:   /* Check for special case: each processor has a single IS */
1944:   if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1945:     MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);
1946:     C->submat_singleis = PETSC_FALSE; /* restore its default value in case C is later used with non-single IS */
1947:     return 0;
1948:   }

1950:   /* Determine nmax: the max number of submatrices that can be extracted per stage (bounded by a ~20MB integer workspace) */
1951:   if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1952:   else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1953:   if (!nmax) nmax = 1;

1955:   if (scall == MAT_INITIAL_MATRIX) {
1956:     /* Collect global wantallmatrix and nstages */
1957:     if (ismax == 1 && C->rmap->N == C->cmap->N) {
1958:       ISIdentity(*isrow,&rowflag);
1959:       ISIdentity(*iscol,&colflag);
1960:       ISGetLocalSize(*isrow,&nrow);
1961:       ISGetLocalSize(*iscol,&ncol);
1962:       if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1963:         wantallmatrix = PETSC_TRUE;

1965:         PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);
1966:       }
1967:     }

1969:     /* Determine the number of stages in which the submatrices are extracted.
1970:        Each stage extracts at most nmax submatrices.
1971:        nmax is determined by the matrix column dimension:
1972:        if the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1973:     */
1974:     nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
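    /* Example with hypothetical numbers: for C->cmap->N = 1,000,000 and a 4-byte PetscInt,
       nmax = 20*1000000/(1000000*4) = 5, so a request for ismax = 12 submatrices is processed in
       nstages = 12/5 + 1 = 3 stages of at most 5 submatrices each. */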

1976:     in[0] = -1*(PetscInt)wantallmatrix;
1977:     in[1] = nstages;
1978:     MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
1979:     wantallmatrix = (PetscBool)(-out[0]);
1980:     nstages       = out[1]; /* Make sure every processor loops through the global nstages */

1982:   } else { /* MAT_REUSE_MATRIX */
1983:     if (ismax) {
1984:       subc = (Mat_SeqAIJ*)(*submat)[0]->data;
1985:       smat = subc->submatis1;
1986:     } else { /* (*submat)[0] is a dummy matrix */
1987:       smat = (Mat_SubSppt*)(*submat)[0]->data;
1988:     }
1989:     if (!smat) {
1990:       /* smat was not attached: the submatrix was created by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...), so take that path again */
1991:       wantallmatrix = PETSC_TRUE;
1992:     } else if (smat->singleis) {
1993:       MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);
1994:       return 0;
1995:     } else {
1996:       nstages = smat->nstages;
1997:     }
1998:   }

2000:   if (wantallmatrix) {
2001:     MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);
2002:     return 0;
2003:   }

2005:   /* Allocate memory to hold all the submatrices and dummy submatrices */
2006:   if (scall == MAT_INITIAL_MATRIX) {
2007:     PetscCalloc1(ismax+nstages,submat);
2008:   }

2010:   for (i=0,pos=0; i<nstages; i++) {
2011:     if (pos+nmax <= ismax) max_no = nmax;
2012:     else if (pos >= ismax) max_no = 0;
2013:     else                   max_no = ismax-pos;

2015:     MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);
2016:     if (!max_no) {
2017:       if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2018:         smat = (Mat_SubSppt*)(*submat)[pos]->data;
2019:         smat->nstages = nstages;
2020:       }
2021:       pos++; /* advance to next dummy matrix if any */
2022:     } else pos += max_no;
2023:   }

2025:   if (ismax && scall == MAT_INITIAL_MATRIX) {
2026:     /* save nstages for reuse */
2027:     subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2028:     smat = subc->submatis1;
2029:     smat->nstages = nstages;
2030:   }
2031:   return 0;
2032: }
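/* Illustrative usage sketch: a typical caller of the public MatCreateSubMatrices() interface
   extracts once with MAT_INITIAL_MATRIX and later refreshes only the numerical values with
   MAT_REUSE_MATRIX, which is what the Mat_SubSppt data cached above makes cheap.  Assuming n
   pairs of sequential ISs irow[]/icol[] (hypothetical names):

     Mat *submats;
     MatCreateSubMatrices(C,n,irow,icol,MAT_INITIAL_MATRIX,&submats);  // builds structure and values
     // ... the entries of C change, same nonzero pattern ...
     MatCreateSubMatrices(C,n,irow,icol,MAT_REUSE_MATRIX,&submats);    // refills values only
     MatDestroySubMatrices(n,&submats);
*/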

2034: /* -------------------------------------------------------------------------*/
2035: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2036: {
2037:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
2038:   Mat            A  = c->A;
2039:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2040:   const PetscInt **icol,**irow;
2041:   PetscInt       *nrow,*ncol,start;
2042:   PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2043:   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2044:   PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2045:   PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2046:   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2047: #if defined(PETSC_USE_CTABLE)
2048:   PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
2049: #else
2050:   PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2051: #endif
2052:   const PetscInt *irow_i;
2053:   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2054:   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2055:   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
2056:   MPI_Comm       comm;
2057:   PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2058:   PetscMPIInt    *onodes1,*olengths1,end;
2059:   PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2060:   Mat_SubSppt    *smat_i;
2061:   PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2062:   PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;

2064:   PetscObjectGetComm((PetscObject)C,&comm);
2065:   size = c->size;
2066:   rank = c->rank;

2068:   PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);
2069:   PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);

2071:   for (i=0; i<ismax; i++) {
2072:     ISSorted(iscol[i],&issorted[i]);
2073:     if (!issorted[i]) iscsorted = issorted[i];

2075:     ISSorted(isrow[i],&issorted[i]);

2077:     ISGetIndices(isrow[i],&irow[i]);
2078:     ISGetLocalSize(isrow[i],&nrow[i]);

2080:     /* Check for special case: allcolumn */
2081:     ISIdentity(iscol[i],&colflag);
2082:     ISGetLocalSize(iscol[i],&ncol[i]);
2083:     if (colflag && ncol[i] == C->cmap->N) {
2084:       allcolumns[i] = PETSC_TRUE;
2085:       icol[i] = NULL;
2086:     } else {
2087:       allcolumns[i] = PETSC_FALSE;
2088:       ISGetIndices(iscol[i],&icol[i]);
2089:     }
2090:   }

2092:   if (scall == MAT_REUSE_MATRIX) {
2093:     /* Assumes the new rows are the same length as the old rows */
2094:     for (i=0; i<ismax; i++) {
2096:       subc = (Mat_SeqAIJ*)submats[i]->data;

2099:       /* Initialize matrix as if empty */
2100:       PetscArrayzero(subc->ilen,submats[i]->rmap->n);

2102:       smat_i   = subc->submatis1;

2104:       nrqs        = smat_i->nrqs;
2105:       nrqr        = smat_i->nrqr;
2106:       rbuf1       = smat_i->rbuf1;
2107:       rbuf2       = smat_i->rbuf2;
2108:       rbuf3       = smat_i->rbuf3;
2109:       req_source2 = smat_i->req_source2;

2111:       sbuf1     = smat_i->sbuf1;
2112:       sbuf2     = smat_i->sbuf2;
2113:       ptr       = smat_i->ptr;
2114:       tmp       = smat_i->tmp;
2115:       ctr       = smat_i->ctr;

2117:       pa          = smat_i->pa;
2118:       req_size    = smat_i->req_size;
2119:       req_source1 = smat_i->req_source1;

2121:       allcolumns[i] = smat_i->allcolumns;
2122:       row2proc[i]   = smat_i->row2proc;
2123:       rmap[i]       = smat_i->rmap;
2124:       cmap[i]       = smat_i->cmap;
2125:     }

2127:     if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2129:       smat_i = (Mat_SubSppt*)submats[0]->data;

2131:       nrqs        = smat_i->nrqs;
2132:       nrqr        = smat_i->nrqr;
2133:       rbuf1       = smat_i->rbuf1;
2134:       rbuf2       = smat_i->rbuf2;
2135:       rbuf3       = smat_i->rbuf3;
2136:       req_source2 = smat_i->req_source2;

2138:       sbuf1       = smat_i->sbuf1;
2139:       sbuf2       = smat_i->sbuf2;
2140:       ptr         = smat_i->ptr;
2141:       tmp         = smat_i->tmp;
2142:       ctr         = smat_i->ctr;

2144:       pa          = smat_i->pa;
2145:       req_size    = smat_i->req_size;
2146:       req_source1 = smat_i->req_source1;

2148:       allcolumns[0] = PETSC_FALSE;
2149:     }
2150:   } else { /* scall == MAT_INITIAL_MATRIX */
2151:     /* Get some new tags to keep the communication clean */
2152:     PetscObjectGetNewTag((PetscObject)C,&tag2);
2153:     PetscObjectGetNewTag((PetscObject)C,&tag3);

2155:     /* evaluate communication - message to whom, length of message, and buffer space
2156:      required. Based on this, buffers are allocated, and data copied into them */
2157:     PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);   /* message sizes; initialize work vectors */

2159:     for (i=0; i<ismax; i++) {
2160:       jmax   = nrow[i];
2161:       irow_i = irow[i];

2163:       PetscMalloc1(jmax,&row2proc_i);
2164:       row2proc[i] = row2proc_i;

2166:       if (issorted[i]) proc = 0;
2167:       for (j=0; j<jmax; j++) {
2168:         if (!issorted[i]) proc = 0;
2169:         row = irow_i[j];
2170:         while (row >= C->rmap->range[proc+1]) proc++;
2171:         w4[proc]++;
2172:         row2proc_i[j] = proc; /* map row index to proc */
2173:       }
2174:       for (j=0; j<size; j++) {
2175:         if (w4[j]) { w1[j] += w4[j];  w3[j]++; w4[j] = 0;}
2176:       }
2177:     }

2179:     nrqs     = 0;              /* no of outgoing messages */
2180:     msz      = 0;              /* total mesg length (for all procs) */
2181:     w1[rank] = 0;              /* no message sent to self */
2182:     w3[rank] = 0;
2183:     for (i=0; i<size; i++) {
2184:       if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2185:     }
2186:     PetscMalloc1(nrqs,&pa); /* (proc array) */
2187:     for (i=0,j=0; i<size; i++) {
2188:       if (w1[i]) { pa[j] = i; j++; }
2189:     }

2191:     /* Each message would have a header = 1 + 2*(no of IS) + data */
2192:     for (i=0; i<nrqs; i++) {
2193:       j      = pa[i];
2194:       w1[j] += w2[j] + 2* w3[j];
2195:       msz   += w1[j];
2196:     }
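    /* Sketch of the layout of one outgoing message sbuf1[j], as assembled below
       (nis = number of ISs that request rows owned by proc j):

         sbuf1[j] = [ nis | is_1 cnt_1 | ... | is_nis cnt_nis | rows for is_1 | rows for is_2 | ... ]

       i.e. the header of length 1 + 2*nis mentioned above, followed by the requested global row
       indices grouped by IS. */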
2197:     PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz);

2199:     /* Determine the number of messages to expect, their lengths, and from-ids */
2200:     PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
2201:     PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

2203:     /* Now post the Irecvs corresponding to these messages */
2204:     PetscObjectGetNewTag((PetscObject)C,&tag0);
2205:     PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);

2207:     /* Allocate Memory for outgoing messages */
2208:     PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
2209:     PetscArrayzero(sbuf1,size);
2210:     PetscArrayzero(ptr,size);

2212:     {
2213:       PetscInt *iptr = tmp;
2214:       k    = 0;
2215:       for (i=0; i<nrqs; i++) {
2216:         j        = pa[i];
2217:         iptr    += k;
2218:         sbuf1[j] = iptr;
2219:         k        = w1[j];
2220:       }
2221:     }

2223:     /* Form the outgoing messages. Initialize the header space */
2224:     for (i=0; i<nrqs; i++) {
2225:       j           = pa[i];
2226:       sbuf1[j][0] = 0;
2227:       PetscArrayzero(sbuf1[j]+1,2*w3[j]);
2228:       ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
2229:     }

2231:     /* Parse the isrow and copy data into outbuf */
2232:     for (i=0; i<ismax; i++) {
2233:       row2proc_i = row2proc[i];
2234:       PetscArrayzero(ctr,size);
2235:       irow_i = irow[i];
2236:       jmax   = nrow[i];
2237:       for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
2238:         proc = row2proc_i[j];
2239:         if (proc != rank) { /* copy to the outgoing buf */
2240:           ctr[proc]++;
2241:           *ptr[proc] = irow_i[j];
2242:           ptr[proc]++;
2243:         }
2244:       }
2245:       /* Update the headers for the current IS */
2246:       for (j=0; j<size; j++) { /* Could optimize this loop too */
2247:         if ((ctr_j = ctr[j])) {
2248:           sbuf1_j        = sbuf1[j];
2249:           k              = ++sbuf1_j[0];
2250:           sbuf1_j[2*k]   = ctr_j;
2251:           sbuf1_j[2*k-1] = i;
2252:         }
2253:       }
2254:     }

2256:     /* Now post the sends */
2257:     PetscMalloc1(nrqs,&s_waits1);
2258:     for (i=0; i<nrqs; ++i) {
2259:       j    = pa[i];
2260:       MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
2261:     }

2263:     /* Post Receives to capture the buffer size */
2264:     PetscMalloc1(nrqs,&r_waits2);
2265:     PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);
2266:     if (nrqs) rbuf2[0] = tmp + msz;
2267:     for (i=1; i<nrqs; ++i) {
2268:       rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2269:     }
2270:     for (i=0; i<nrqs; ++i) {
2271:       j    = pa[i];
2272:       MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);
2273:     }

2275:     /* Send to other procs the buf size they should allocate */
2276:     /* Receive messages */
2277:     PetscMalloc1(nrqr,&s_waits2);
2278:     PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
2279:     {
2280:       PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2281:       PetscInt   *sbuf2_i;

2283:       MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE);
2284:       for (i=0; i<nrqr; ++i) {
2285:         req_size[i] = 0;
2286:         rbuf1_i        = rbuf1[i];
2287:         start          = 2*rbuf1_i[0] + 1;
2288:         end            = olengths1[i];
2289:         PetscMalloc1(end,&sbuf2[i]);
2290:         sbuf2_i        = sbuf2[i];
2291:         for (j=start; j<end; j++) {
2292:           id              = rbuf1_i[j] - rstart;
2293:           ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2294:           sbuf2_i[j]      = ncols;
2295:           req_size[i] += ncols;
2296:         }
2297:         req_source1[i] = onodes1[i];
2298:         /* form the header */
2299:         sbuf2_i[0] = req_size[i];
2300:         for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

2302:         MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
2303:       }
2304:     }

2306:     PetscFree(onodes1);
2307:     PetscFree(olengths1);
2308:     PetscFree(r_waits1);
2309:     PetscFree4(w1,w2,w3,w4);

2311:     /* Receive messages */
2312:     PetscMalloc1(nrqs,&r_waits3);
2313:     MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE);
2314:     for (i=0; i<nrqs; ++i) {
2315:       PetscMalloc1(rbuf2[i][0],&rbuf3[i]);
2316:       req_source2[i] = pa[i];
2317:       MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
2318:     }
2319:     PetscFree(r_waits2);

2321:     /* Wait on sends1 and sends2 */
2322:     MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);
2323:     MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);
2324:     PetscFree(s_waits1);
2325:     PetscFree(s_waits2);

2327:     /* Now allocate sending buffers for a->j, and send them off */
2328:     PetscMalloc1(nrqr,&sbuf_aj);
2329:     for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2330:     if (nrqr) PetscMalloc1(j,&sbuf_aj[0]);
2331:     for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

2333:     PetscMalloc1(nrqr,&s_waits3);
2334:     {
2335:       PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2336:       PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2337:       PetscInt cend = C->cmap->rend;
2338:       PetscInt *a_j = a->j,*b_j = b->j,ctmp;

2340:       for (i=0; i<nrqr; i++) {
2341:         rbuf1_i   = rbuf1[i];
2342:         sbuf_aj_i = sbuf_aj[i];
2343:         ct1       = 2*rbuf1_i[0] + 1;
2344:         ct2       = 0;
2345:         for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2346:           kmax = rbuf1[i][2*j];
2347:           for (k=0; k<kmax; k++,ct1++) {
2348:             row    = rbuf1_i[ct1] - rstart;
2349:             nzA    = a_i[row+1] - a_i[row];
2350:             nzB    = b_i[row+1] - b_i[row];
2351:             ncols  = nzA + nzB;
2352:             cworkA = a_j + a_i[row];
2353:             cworkB = b_j + b_i[row];

2355:             /* load the column indices for this row into cols */
2356:             cols = sbuf_aj_i + ct2;

2358:             lwrite = 0;
2359:             for (l=0; l<nzB; l++) {
2360:               if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2361:             }
2362:             for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2363:             for (l=0; l<nzB; l++) {
2364:               if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2365:             }

2367:             ct2 += ncols;
2368:           }
2369:         }
2370:         MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
2371:       }
2372:     }

2374:     /* create col map: global col of C -> local col of submatrices */
2375:     {
2376:       const PetscInt *icol_i;
2377: #if defined(PETSC_USE_CTABLE)
2378:       for (i=0; i<ismax; i++) {
2379:         if (!allcolumns[i]) {
2380:           PetscTableCreate(ncol[i],C->cmap->N,&cmap[i]);

2382:           jmax   = ncol[i];
2383:           icol_i = icol[i];
2384:           cmap_i = cmap[i];
2385:           for (j=0; j<jmax; j++) {
2386:             PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);
2387:           }
2388:         } else cmap[i] = NULL;
2389:       }
2390: #else
2391:       for (i=0; i<ismax; i++) {
2392:         if (!allcolumns[i]) {
2393:           PetscCalloc1(C->cmap->N,&cmap[i]);
2394:           jmax   = ncol[i];
2395:           icol_i = icol[i];
2396:           cmap_i = cmap[i];
2397:           for (j=0; j<jmax; j++) {
2398:             cmap_i[icol_i[j]] = j+1;
2399:           }
2400:         } else cmap[i] = NULL;
2401:       }
2402: #endif
2403:     }

2405:     /* Create lens, which is required for MatSeqAIJSetPreallocation() */
2406:     for (i=0,j=0; i<ismax; i++) j += nrow[i];
2407:     PetscMalloc1(ismax,&lens);

2409:     if (ismax) {
2410:       PetscCalloc1(j,&lens[0]);
2411:     }
2412:     for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

2414:     /* Update lens from local data */
2415:     for (i=0; i<ismax; i++) {
2416:       row2proc_i = row2proc[i];
2417:       jmax = nrow[i];
2418:       if (!allcolumns[i]) cmap_i = cmap[i];
2419:       irow_i = irow[i];
2420:       lens_i = lens[i];
2421:       for (j=0; j<jmax; j++) {
2422:         row = irow_i[j];
2423:         proc = row2proc_i[j];
2424:         if (proc == rank) {
2425:           MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);
2426:           if (!allcolumns[i]) {
2427:             for (k=0; k<ncols; k++) {
2428: #if defined(PETSC_USE_CTABLE)
2429:               PetscTableFind(cmap_i,cols[k]+1,&tcol);
2430: #else
2431:               tcol = cmap_i[cols[k]];
2432: #endif
2433:               if (tcol) lens_i[j]++;
2434:             }
2435:           } else { /* allcolumns */
2436:             lens_i[j] = ncols;
2437:           }
2438:           MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);
2439:         }
2440:       }
2441:     }
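    /* Note: MatGetRow_MPIAIJ() returns global column indices, so the lookups above can feed
       cols[k] straight into cmap_i (global col of C -> local col of submatrix, stored as local
       index + 1); a result of 0 means the column is not part of submats[i]. */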

2443:     /* Create row map: global row of C -> local row of submatrices */
2444: #if defined(PETSC_USE_CTABLE)
2445:     for (i=0; i<ismax; i++) {
2446:       PetscTableCreate(nrow[i],C->rmap->N,&rmap[i]);
2447:       irow_i = irow[i];
2448:       jmax   = nrow[i];
2449:       for (j=0; j<jmax; j++) {
2450:         PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);
2451:       }
2452:     }
2453: #else
2454:     for (i=0; i<ismax; i++) {
2455:       PetscCalloc1(C->rmap->N,&rmap[i]);
2456:       rmap_i = rmap[i];
2457:       irow_i = irow[i];
2458:       jmax   = nrow[i];
2459:       for (j=0; j<jmax; j++) {
2460:         rmap_i[irow_i[j]] = j;
2461:       }
2462:     }
2463: #endif

2465:     /* Update lens from offproc data */
2466:     {
2467:       PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

2469:       MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE);
2470:       for (tmp2=0; tmp2<nrqs; tmp2++) {
2471:         sbuf1_i = sbuf1[pa[tmp2]];
2472:         jmax    = sbuf1_i[0];
2473:         ct1     = 2*jmax+1;
2474:         ct2     = 0;
2475:         rbuf2_i = rbuf2[tmp2];
2476:         rbuf3_i = rbuf3[tmp2];
2477:         for (j=1; j<=jmax; j++) {
2478:           is_no  = sbuf1_i[2*j-1];
2479:           max1   = sbuf1_i[2*j];
2480:           lens_i = lens[is_no];
2481:           if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2482:           rmap_i = rmap[is_no];
2483:           for (k=0; k<max1; k++,ct1++) {
2484: #if defined(PETSC_USE_CTABLE)
2485:             PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
2486:             row--;
2488: #else
2489:             row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new matrix */
2490: #endif
2491:             max2 = rbuf2_i[ct1];
2492:             for (l=0; l<max2; l++,ct2++) {
2493:               if (!allcolumns[is_no]) {
2494: #if defined(PETSC_USE_CTABLE)
2495:                 PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
2496: #else
2497:                 tcol = cmap_i[rbuf3_i[ct2]];
2498: #endif
2499:                 if (tcol) lens_i[row]++;
2500:               } else { /* allcolumns */
2501:                 lens_i[row]++; /* lens_i[row] += max2 ? */
2502:               }
2503:             }
2504:           }
2505:         }
2506:       }
2507:     }
2508:     PetscFree(r_waits3);
2509:     MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE);
2510:     PetscFree(s_waits3);

2512:     /* Create the submatrices */
2513:     for (i=0; i<ismax; i++) {
2514:       PetscInt    rbs,cbs;

2516:       ISGetBlockSize(isrow[i],&rbs);
2517:       ISGetBlockSize(iscol[i],&cbs);

2519:       MatCreate(PETSC_COMM_SELF,submats+i);
2520:       MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);

2522:       MatSetBlockSizes(submats[i],rbs,cbs);
2523:       MatSetType(submats[i],((PetscObject)A)->type_name);
2524:       MatSeqAIJSetPreallocation(submats[i],0,lens[i]);

2526:       /* create struct Mat_SubSppt and attach it to submat */
2527:       PetscNew(&smat_i);
2528:       subc = (Mat_SeqAIJ*)submats[i]->data;
2529:       subc->submatis1 = smat_i;

2531:       smat_i->destroy          = submats[i]->ops->destroy;
2532:       submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2533:       submats[i]->factortype   = C->factortype;

2535:       smat_i->id          = i;
2536:       smat_i->nrqs        = nrqs;
2537:       smat_i->nrqr        = nrqr;
2538:       smat_i->rbuf1       = rbuf1;
2539:       smat_i->rbuf2       = rbuf2;
2540:       smat_i->rbuf3       = rbuf3;
2541:       smat_i->sbuf2       = sbuf2;
2542:       smat_i->req_source2 = req_source2;

2544:       smat_i->sbuf1       = sbuf1;
2545:       smat_i->ptr         = ptr;
2546:       smat_i->tmp         = tmp;
2547:       smat_i->ctr         = ctr;

2549:       smat_i->pa           = pa;
2550:       smat_i->req_size     = req_size;
2551:       smat_i->req_source1  = req_source1;

2553:       smat_i->allcolumns  = allcolumns[i];
2554:       smat_i->singleis    = PETSC_FALSE;
2555:       smat_i->row2proc    = row2proc[i];
2556:       smat_i->rmap        = rmap[i];
2557:       smat_i->cmap        = cmap[i];
2558:     }
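    /* Note: all submats[i] created above share the same communication metadata (rbuf*, sbuf*, pa,
       req_*, ...) through their Mat_SubSppt; the MAT_REUSE_MATRIX branch near the top of this
       routine reads those fields back, so a later call with the same ISs can skip the symbolic
       communication setup entirely. */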

2560:     if (!ismax) { /* Create dummy submats[0] to carry struct Mat_SubSppt for reuse */
2561:       MatCreate(PETSC_COMM_SELF,&submats[0]);
2562:       MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);
2563:       MatSetType(submats[0],MATDUMMY);

2565:       /* create struct Mat_SubSppt and attach it to submat */
2566:       PetscNewLog(submats[0],&smat_i);
2567:       submats[0]->data = (void*)smat_i;

2569:       smat_i->destroy          = submats[0]->ops->destroy;
2570:       submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2571:       submats[0]->factortype   = C->factortype;

2573:       smat_i->id          = 0;
2574:       smat_i->nrqs        = nrqs;
2575:       smat_i->nrqr        = nrqr;
2576:       smat_i->rbuf1       = rbuf1;
2577:       smat_i->rbuf2       = rbuf2;
2578:       smat_i->rbuf3       = rbuf3;
2579:       smat_i->sbuf2       = sbuf2;
2580:       smat_i->req_source2 = req_source2;

2582:       smat_i->sbuf1       = sbuf1;
2583:       smat_i->ptr         = ptr;
2584:       smat_i->tmp         = tmp;
2585:       smat_i->ctr         = ctr;

2587:       smat_i->pa           = pa;
2588:       smat_i->req_size     = req_size;
2589:       smat_i->req_source1  = req_source1;

2591:       smat_i->allcolumns  = PETSC_FALSE;
2592:       smat_i->singleis    = PETSC_FALSE;
2593:       smat_i->row2proc    = NULL;
2594:       smat_i->rmap        = NULL;
2595:       smat_i->cmap        = NULL;
2596:     }

2598:     if (ismax) PetscFree(lens[0]);
2599:     PetscFree(lens);
2600:     if (sbuf_aj) {
2601:       PetscFree(sbuf_aj[0]);
2602:       PetscFree(sbuf_aj);
2603:     }

2605:   } /* end of scall == MAT_INITIAL_MATRIX */

2607:   /* Post recv matrix values */
2608:   PetscObjectGetNewTag((PetscObject)C,&tag4);
2609:   PetscMalloc1(nrqs,&rbuf4);
2610:   PetscMalloc1(nrqs,&r_waits4);
2611:   for (i=0; i<nrqs; ++i) {
2612:     PetscMalloc1(rbuf2[i][0],&rbuf4[i]);
2613:     MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
2614:   }

2616:   /* Allocate sending buffers for a->a, and send them off */
2617:   PetscMalloc1(nrqr,&sbuf_aa);
2618:   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2619:   if (nrqr) PetscMalloc1(j,&sbuf_aa[0]);
2620:   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

2622:   PetscMalloc1(nrqr,&s_waits4);
2623:   {
2624:     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2625:     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2626:     PetscInt    cend   = C->cmap->rend;
2627:     PetscInt    *b_j   = b->j;
2628:     PetscScalar *vworkA,*vworkB,*a_a,*b_a;

2630:     MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);
2631:     MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);
2632:     for (i=0; i<nrqr; i++) {
2633:       rbuf1_i   = rbuf1[i];
2634:       sbuf_aa_i = sbuf_aa[i];
2635:       ct1       = 2*rbuf1_i[0]+1;
2636:       ct2       = 0;
2637:       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2638:         kmax = rbuf1_i[2*j];
2639:         for (k=0; k<kmax; k++,ct1++) {
2640:           row    = rbuf1_i[ct1] - rstart;
2641:           nzA    = a_i[row+1] - a_i[row];
2642:           nzB    = b_i[row+1] - b_i[row];
2643:           ncols  = nzA + nzB;
2644:           cworkB = b_j + b_i[row];
2645:           vworkA = a_a + a_i[row];
2646:           vworkB = b_a + b_i[row];

2648:           /* load the column values for this row into vals */
2649:           vals = sbuf_aa_i+ct2;

2651:           lwrite = 0;
2652:           for (l=0; l<nzB; l++) {
2653:             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2654:           }
2655:           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2656:           for (l=0; l<nzB; l++) {
2657:             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2658:           }

2660:           ct2 += ncols;
2661:         }
2662:       }
2663:       MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
2664:     }
2665:     MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);
2666:     MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);
2667:   }

2669:   /* Assemble the matrices */
2670:   /* First assemble the local rows */
2671:   for (i=0; i<ismax; i++) {
2672:     row2proc_i = row2proc[i];
2673:     subc      = (Mat_SeqAIJ*)submats[i]->data;
2674:     imat_ilen = subc->ilen;
2675:     imat_j    = subc->j;
2676:     imat_i    = subc->i;
2677:     imat_a    = subc->a;

2679:     if (!allcolumns[i]) cmap_i = cmap[i];
2680:     rmap_i = rmap[i];
2681:     irow_i = irow[i];
2682:     jmax   = nrow[i];
2683:     for (j=0; j<jmax; j++) {
2684:       row  = irow_i[j];
2685:       proc = row2proc_i[j];
2686:       if (proc == rank) {
2687:         old_row = row;
2688: #if defined(PETSC_USE_CTABLE)
2689:         PetscTableFind(rmap_i,row+1,&row);
2690:         row--;
2691: #else
2692:         row = rmap_i[row];
2693: #endif
2694:         ilen_row = imat_ilen[row];
2695:         MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
2696:         mat_i    = imat_i[row];
2697:         mat_a    = imat_a + mat_i;
2698:         mat_j    = imat_j + mat_i;
2699:         if (!allcolumns[i]) {
2700:           for (k=0; k<ncols; k++) {
2701: #if defined(PETSC_USE_CTABLE)
2702:             PetscTableFind(cmap_i,cols[k]+1,&tcol);
2703: #else
2704:             tcol = cmap_i[cols[k]];
2705: #endif
2706:             if (tcol) {
2707:               *mat_j++ = tcol - 1;
2708:               *mat_a++ = vals[k];
2709:               ilen_row++;
2710:             }
2711:           }
2712:         } else { /* allcolumns */
2713:           for (k=0; k<ncols; k++) {
2714:             *mat_j++ = cols[k];  /* global col index! */
2715:             *mat_a++ = vals[k];
2716:             ilen_row++;
2717:           }
2718:         }
2719:         MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);

2721:         imat_ilen[row] = ilen_row;
2722:       }
2723:     }
2724:   }

2726:   /* Now assemble the off-proc rows */
2727:   MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE);
2728:   for (tmp2=0; tmp2<nrqs; tmp2++) {
2729:     sbuf1_i = sbuf1[pa[tmp2]];
2730:     jmax    = sbuf1_i[0];
2731:     ct1     = 2*jmax + 1;
2732:     ct2     = 0;
2733:     rbuf2_i = rbuf2[tmp2];
2734:     rbuf3_i = rbuf3[tmp2];
2735:     rbuf4_i = rbuf4[tmp2];
2736:     for (j=1; j<=jmax; j++) {
2737:       is_no     = sbuf1_i[2*j-1];
2738:       rmap_i    = rmap[is_no];
2739:       if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2740:       subc      = (Mat_SeqAIJ*)submats[is_no]->data;
2741:       imat_ilen = subc->ilen;
2742:       imat_j    = subc->j;
2743:       imat_i    = subc->i;
2744:       imat_a    = subc->a;
2745:       max1      = sbuf1_i[2*j];
2746:       for (k=0; k<max1; k++,ct1++) {
2747:         row = sbuf1_i[ct1];
2748: #if defined(PETSC_USE_CTABLE)
2749:         PetscTableFind(rmap_i,row+1,&row);
2750:         row--;
2751: #else
2752:         row = rmap_i[row];
2753: #endif
2754:         ilen  = imat_ilen[row];
2755:         mat_i = imat_i[row];
2756:         mat_a = imat_a + mat_i;
2757:         mat_j = imat_j + mat_i;
2758:         max2  = rbuf2_i[ct1];
2759:         if (!allcolumns[is_no]) {
2760:           for (l=0; l<max2; l++,ct2++) {
2761: #if defined(PETSC_USE_CTABLE)
2762:             PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
2763: #else
2764:             tcol = cmap_i[rbuf3_i[ct2]];
2765: #endif
2766:             if (tcol) {
2767:               *mat_j++ = tcol - 1;
2768:               *mat_a++ = rbuf4_i[ct2];
2769:               ilen++;
2770:             }
2771:           }
2772:         } else { /* allcolumns */
2773:           for (l=0; l<max2; l++,ct2++) {
2774:             *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2775:             *mat_a++ = rbuf4_i[ct2];
2776:             ilen++;
2777:           }
2778:         }
2779:         imat_ilen[row] = ilen;
2780:       }
2781:     }
2782:   }

2784:   if (!iscsorted) { /* sort column indices of the rows */
2785:     for (i=0; i<ismax; i++) {
2786:       subc      = (Mat_SeqAIJ*)submats[i]->data;
2787:       imat_j    = subc->j;
2788:       imat_i    = subc->i;
2789:       imat_a    = subc->a;
2790:       imat_ilen = subc->ilen;

2792:       if (allcolumns[i]) continue;
2793:       jmax = nrow[i];
2794:       for (j=0; j<jmax; j++) {
2795:         mat_i = imat_i[j];
2796:         mat_a = imat_a + mat_i;
2797:         mat_j = imat_j + mat_i;
2798:         PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);
2799:       }
2800:     }
2801:   }

2803:   PetscFree(r_waits4);
2804:   MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE);
2805:   PetscFree(s_waits4);

2807:   /* Restore the indices */
2808:   for (i=0; i<ismax; i++) {
2809:     ISRestoreIndices(isrow[i],irow+i);
2810:     if (!allcolumns[i]) {
2811:       ISRestoreIndices(iscol[i],icol+i);
2812:     }
2813:   }

2815:   for (i=0; i<ismax; i++) {
2816:     MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
2817:     MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
2818:   }

2820:   /* Destroy allocated memory */
2821:   if (sbuf_aa) {
2822:     PetscFree(sbuf_aa[0]);
2823:     PetscFree(sbuf_aa);
2824:   }
2825:   PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);

2827:   for (i=0; i<nrqs; ++i) {
2828:     PetscFree(rbuf4[i]);
2829:   }
2830:   PetscFree(rbuf4);

2832:   PetscFree4(row2proc,cmap,rmap,allcolumns);
2833:   return 0;
2834: }

2836: /*
2837:  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2838:  Embeddings are supposed to be injections, and the above implies that the range of rowemb is a subset
2839:  of [0,m), the range of dcolemb is a subset of [0,n), and the range of ocolemb is a subset of [0,N-n).
2840:  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2841:  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2842:  state, and needs to be "assembled" later by compressing B's column space.

2844:  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2845:  Following this call, C->A & C->B have been created, even if empty.
2846:  */
2847: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2848: {
2849:   /* If making this function public, change the error returned in this function away from _PLIB. */
2850:   Mat_MPIAIJ     *aij;
2851:   Mat_SeqAIJ     *Baij;
2852:   PetscBool      seqaij,Bdisassembled;
2853:   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2854:   PetscScalar    v;
2855:   const PetscInt *rowindices,*colindices;

2857:   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2858:   if (A) {
2859:     PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);
2861:     if (rowemb) {
2862:       ISGetLocalSize(rowemb,&m);
2864:     } else {
2865:       if (C->rmap->n != A->rmap->n) {
2866:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2867:       }
2868:     }
2869:     if (dcolemb) {
2870:       ISGetLocalSize(dcolemb,&n);
2872:     } else {
2874:     }
2875:   }
2876:   if (B) {
2877:     PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);
2879:     if (rowemb) {
2880:       ISGetLocalSize(rowemb,&m);
2882:     } else {
2883:       if (C->rmap->n != B->rmap->n) {
2884:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2885:       }
2886:     }
2887:     if (ocolemb) {
2888:       ISGetLocalSize(ocolemb,&n);
2890:     } else {
2892:     }
2893:   }

2895:   aij = (Mat_MPIAIJ*)C->data;
2896:   if (!aij->A) {
2897:     /* Mimic parts of MatMPIAIJSetPreallocation() */
2898:     MatCreate(PETSC_COMM_SELF,&aij->A);
2899:     MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);
2900:     MatSetBlockSizesFromMats(aij->A,C,C);
2901:     MatSetType(aij->A,MATSEQAIJ);
2902:     PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);
2903:   }
2904:   if (A) {
2905:     MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);
2906:   } else {
2907:     MatSetUp(aij->A);
2908:   }
2909:   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2910:     /*
2911:       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2912:       need to "disassemble" B -- convert it to using C's global indices.
2913:       To insert the values we take the safer, albeit more expensive, route of MatSetValues().

2915:       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2916:       we call MatZeroEntries() on aij->B first, so there may be a bunch of zeros that, perhaps, could be compacted out.

2918:       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2919:       At least avoid calling MatSetValues() and the implied searches?
2920:     */

2922:     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2923: #if defined(PETSC_USE_CTABLE)
2924:       PetscTableDestroy(&aij->colmap);
2925: #else
2926:       PetscFree(aij->colmap);
2927:       /* A bit of a HACK: ideally we should deal with the case of aij->B all in one code block below. */
2928:       if (aij->B) {
2929:         PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));
2930:       }
2931: #endif
2932:       ngcol = 0;
2933:       if (aij->lvec) {
2934:         VecGetSize(aij->lvec,&ngcol);
2935:       }
2936:       if (aij->garray) {
2937:         PetscFree(aij->garray);
2938:         PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));
2939:       }
2940:       VecDestroy(&aij->lvec);
2941:       VecScatterDestroy(&aij->Mvctx);
2942:     }
2943:     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2944:       MatDestroy(&aij->B);
2945:     }
2946:     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2947:       MatZeroEntries(aij->B);
2948:     }
2949:   }
2950:   Bdisassembled = PETSC_FALSE;
2951:   if (!aij->B) {
2952:     MatCreate(PETSC_COMM_SELF,&aij->B);
2953:     PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);
2954:     MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);
2955:     MatSetBlockSizesFromMats(aij->B,B,B);
2956:     MatSetType(aij->B,MATSEQAIJ);
2957:     Bdisassembled = PETSC_TRUE;
2958:   }
2959:   if (B) {
2960:     Baij = (Mat_SeqAIJ*)B->data;
2961:     if (pattern == DIFFERENT_NONZERO_PATTERN) {
2962:       PetscMalloc1(B->rmap->n,&nz);
2963:       for (i=0; i<B->rmap->n; i++) {
2964:         nz[i] = Baij->i[i+1] - Baij->i[i];
2965:       }
2966:       MatSeqAIJSetPreallocation(aij->B,0,nz);
2967:       PetscFree(nz);
2968:     }

2970:     PetscLayoutGetRange(C->rmap,&rstart,&rend);
2971:     shift = rend-rstart;
2972:     count = 0;
2973:     rowindices = NULL;
2974:     colindices = NULL;
2975:     if (rowemb) {
2976:       ISGetIndices(rowemb,&rowindices);
2977:     }
2978:     if (ocolemb) {
2979:       ISGetIndices(ocolemb,&colindices);
2980:     }
2981:     for (i=0; i<B->rmap->n; i++) {
2982:       PetscInt row;
2983:       row = i;
2984:       if (rowindices) row = rowindices[i];
2985:       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
2986:         col  = Baij->j[count];
2987:         if (colindices) col = colindices[col];
2988:         if (Bdisassembled && col>=rstart) col += shift;
2989:         v    = Baij->a[count];
2990:         MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);
2991:         ++count;
2992:       }
2993:     }
2994:     /* No assembly for aij->B is necessary. */
2995:     /* FIXME: set aij->B's nonzerostate correctly. */
2996:   } else {
2997:     MatSetUp(aij->B);
2998:   }
2999:   C->preallocated  = PETSC_TRUE;
3000:   C->was_assembled = PETSC_FALSE;
3001:   C->assembled     = PETSC_FALSE;
3002:   /*
3003:       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3004:       Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
3005:    */
3006:   return 0;
3007: }
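/* Illustrative calling sketch (error checking omitted), assuming Adiag and Boff are sequential AIJ
   matrices already expressed in C's local row space (so the NULL embeddings are acceptable) and
   that C has its layouts and type set:

     MatSetSeqMats_MPIAIJ(C,NULL,NULL,NULL,DIFFERENT_NONZERO_PATTERN,Adiag,Boff);
     MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);  // required, per the comment above: compresses aij->B
     MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);    // back to local form via MatSetUpMultiply_MPIAIJ()
*/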

3009: /*
3010:   B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
3011:  */
3012: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3013: {
3014:   Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;

3018:   /* FIXME: make sure C is assembled */
3019:   *A = aij->A;
3020:   *B = aij->B;
3021:   /* Note that we don't incref *A and *B, so be careful! */
3022:   return 0;
3023: }
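/* Sketch: interpreting B's local column indices.  The public MatMPIAIJGetSeqAIJ() exposes the same
   A/B pair together with garray, so a local column lcol of B corresponds to global column
   garray[lcol] of C (lcol is hypothetical):

     Mat            Ad,Ao;
     const PetscInt *garray;
     PetscInt       gcol;
     MatMPIAIJGetSeqAIJ(C,&Ad,&Ao,&garray);
     gcol = garray[lcol];  // global column of C for local column lcol of Ao (the B returned here)
*/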

3025: /*
3026:   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3027:   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3028: */
3029: PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3030:                                                PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3031:                                                PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3032:                                                PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3033:                                                PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3034: {
3035:   PetscMPIInt    size,flag;
3036:   PetscInt       i,ii,cismax,ispar;
3037:   Mat            *A,*B;
3038:   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

3040:   if (!ismax) return 0;

3042:   for (i = 0, cismax = 0; i < ismax; ++i) {
3043:     MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);
3045:     MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3046:     if (size > 1) ++cismax;
3047:   }

3049:   /*
3050:      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3051:      ispar is the maximum over C's comm of the per-rank count of parallel ISs; it is nonzero iff any rank has a parallel IS.
3052:   */
3053:   MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
3054:   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3055:     (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);
3056:     return 0;
3057:   }

3059:   /* if (ispar) */
3060:   /*
3061:     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3062:     These are used to extract the off-diag portion of the resulting parallel matrix.
3063:     The row IS for the off-diag portion is the same as for the diag portion,
3064:     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3065:   */
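  /* Editor's note (hypothetical two-rank example, not from the source): if iscol[i] stores
     {1,3} on this rank and {7,9} on the other rank, ISGetNonlocalIS() below returns {7,9}
     here -- every entry of iscol[i] held by the other ranks -- which is what gets gathered
     to extract the off-diagonal block, and why this step does not scale. */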
3066:   PetscMalloc2(cismax,&cisrow,cismax,&ciscol);
3067:   PetscMalloc1(cismax,&ciscol_p);
3068:   for (i = 0, ii = 0; i < ismax; ++i) {
3069:     MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);
3070:     if (size > 1) {
3071:       /*
3072:          TODO: This is the part that's ***NOT SCALABLE***.
3073:          To fix this we need to extract just the indices of C's nonzero columns
3074:          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3075:          part of iscol[i] -- without actually computing ciscol[ii]. This also has
3076:          to be done without serializing on the IS list, so, most likely, it is best
3077:          done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3078:       */
3079:       ISGetNonlocalIS(iscol[i],&(ciscol[ii]));
3080:       /* Now we have to
3081:          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3082:              were sorted on each rank, concatenated they might no longer be sorted;
3083:          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3084:              indices in the nondecreasing order to the original index positions.
3085:          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3086:       */
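      /* Editor's note (illustrative values, not from the source): if ciscol[ii] holds {5,2,9},
         ISSortPermutation() with PETSC_FALSE produces the permutation {1,0,2} -- the original
         positions of the entries taken in nondecreasing order -- and ISSort() then leaves
         ciscol[ii] = {2,5,9}; an already strictly increasing IS yields a NULL permutation. */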
3087:       ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);
3088:       ISSort(ciscol[ii]);
3089:       ++ii;
3090:     }
3091:   }
3092:   PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);
3093:   for (i = 0, ii = 0; i < ismax; ++i) {
3094:     PetscInt       j,issize;
3095:     const PetscInt *indices;

3097:     /*
3098:        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3099:      */
3100:     ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);
3101:     ISSort(isrow[i]);
3102:     ISGetLocalSize(isrow[i],&issize);
3103:     ISGetIndices(isrow[i],&indices);
3104:     for (j = 1; j < issize; ++j) {
3105:       if (indices[j] == indices[j-1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS: duplicate entries are not allowed");
3106:     }
3107:     ISRestoreIndices(isrow[i],&indices);
3108:     ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);
3109:     ISSort(iscol[i]);
3110:     ISGetLocalSize(iscol[i],&issize);
3111:     ISGetIndices(iscol[i],&indices);
3112:     for (j = 1; j < issize; ++j) {
3113:       if (indices[j] == indices[j-1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS: duplicate entries are not allowed");
3114:     }
3115:     ISRestoreIndices(iscol[i],&indices);
3116:     MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);
3117:     if (size > 1) {
3118:       cisrow[ii] = isrow[i];
3119:       ++ii;
3120:     }
3121:   }
3122:   /*
3123:     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3124:     array of sequential matrices underlying the resulting parallel matrices.
3125:     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3126:     contain duplicates.

3128:     There are as many diag matrices as there are original index sets. There are only as many parallel
3129:     and off-diag matrices as there are parallel (comm size > 1) index sets.

3131:     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3132:     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3133:       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3134:       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3135:       with A[i] and B[ii] extracted from the corresponding MPI submat.
3136:     - However, if the rows, A's column indices, or B's column indices are not sorted, the extracted A[i] & B[ii]
3137:       will have a different order from what getsubmats_seq expects. To handle this case -- indicated
3138:       by a non-NULL isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i] and B[ii] --> BB[ii]
3139:       (or retrieve the previously composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used
3140:       to permute their values into A[i] and B[ii] sitting inside the corresponding submat.
3141:     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3142:       A[i], B[ii], AA[i] or BB[ii] matrices.
3143:   */
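  /* Editor's note (hypothetical counts): with ismax = 3 index sets of which cismax = 2 are
     parallel, A below holds 3 diag blocks A[0..2], B holds 2 off-diag blocks B[0..1], and
     *submat ends up with 2 MATMPIAIJ matrices and 1 MATSEQAIJ matrix. */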
3144:   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3145:   if (scall == MAT_INITIAL_MATRIX) {
3146:     PetscMalloc1(ismax,submat);
3147:   }

3149:   /* Now obtain the sequential A and B submatrices separately. */
3150:   /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3151:   (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);
3152:   (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);

3154:   /*
3155:     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3156:     matrices A & B have been extracted directly into the parallel matrices containing them, or
3157:     simply into the sequential matrix identical with the corresponding A (if size == 1).
3158:     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3159:     to have the same sparsity pattern.
3160:     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3161:     must be constructed for C. This is done by setseqmat(s).
3162:   */
3163:   for (i = 0, ii = 0; i < ismax; ++i) {
3164:     /*
3165:        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3166:        That way we can avoid sorting and computing permutations when reusing.
3167:        To this end:
3168:         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3169:         - if caching arrays to hold the ISs, make and compose a container for them so that it can
3170:           be destroyed upon destruction of C (use PetscContainerSetUserDestroy() to clear out the contents).
3171:     */
3172:     MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

3174:     MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);
3175:     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3176:     if (size > 1) {
3177:       if (scall == MAT_INITIAL_MATRIX) {
3178:         MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);
3179:         MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
3180:         MatSetType((*submat)[i],MATMPIAIJ);
3181:         PetscLayoutSetUp((*submat)[i]->rmap);
3182:         PetscLayoutSetUp((*submat)[i]->cmap);
3183:       }
3184:       /*
3185:         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3186:       */
3187:       {
3188:         Mat AA = A[i],BB = B[ii];

3190:         if (AA || BB) {
3191:           setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);
3192:           MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);
3193:           MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);
3194:         }
3195:         MatDestroy(&AA);
3196:       }
3197:       ISDestroy(ciscol+ii);
3198:       ISDestroy(ciscol_p+ii);
3199:       ++ii;
3200:     } else { /* if (size == 1) */
3201:       if (scall == MAT_REUSE_MATRIX) {
3202:         MatDestroy(&(*submat)[i]);
3203:       }
3204:       if (isrow_p[i] || iscol_p[i]) {
3205:         MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);
3206:         setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);
3207:         /* Otherwise A is extracted straight into (*submat)[i]. */
3208:         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && scall == MAT_INITIAL_MATRIX). */
3209:         MatDestroy(A+i);
3210:       } else (*submat)[i] = A[i];
3211:     }
3212:     ISDestroy(&isrow_p[i]);
3213:     ISDestroy(&iscol_p[i]);
3214:   }
3215:   PetscFree2(cisrow,ciscol);
3216:   PetscFree2(isrow_p,iscol_p);
3217:   PetscFree(ciscol_p);
3218:   PetscFree(A);
3219:   MatDestroySubMatrices(cismax,&B);
3220:   return 0;
3221: }

3223: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3224: {
3225:   MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);
3226:   return 0;
3227: }
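
For context, a minimal usage sketch (an editor's addition, not part of mpiov.c, with an assumed driver name): the public entry point MatCreateSubMatricesMPI() is what ultimately lands in MatCreateSubMatricesMPI_MPIAIJ() above when C is an assembled MATMPIAIJ matrix.

/* Hypothetical driver: extract one parallel submatrix and clean up. isrow/iscol are
   assumed to be parallel ISs living on (a subcomm of) C's communicator, with no
   repeated indices, as required above. */
#include <petscmat.h>

static PetscErrorCode ExtractParallelSubmatrix(Mat C,IS isrow,IS iscol)
{
  Mat *submats;

  MatCreateSubMatricesMPI(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submats);
  /* ... use submats[0], an MPIAIJ matrix on isrow's communicator ... */
  MatDestroySubMatrices(1,&submats);
  return 0;
}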