Actual source code: mpiov.c

petsc-3.6.1 2015-08-06
  2: /*
  3:    Routines to compute overlapping regions of a parallel MPI matrix
  4:   and to find submatrices that are shared across processors.
  5: */
  6: #include <../src/mat/impls/aij/seq/aij.h>
  7: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  8: #include <petscbt.h>
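
For context, the routines in this file provide the MPIAIJ implementations behind the public MatIncreaseOverlap() and MatGetSubMatrices() interfaces. A minimal usage sketch (not part of mpiov.c; the index set and sizes are illustrative, error handling abbreviated):

    /* Sketch: grow a set of locally selected rows by one level of overlap, then
       extract the corresponding sequential submatrix on each process. */
    PetscErrorCode ExampleOverlapAndExtract(Mat A,IS is)
    {
      PetscErrorCode ierr;
      Mat            *submat;

      ierr = MatIncreaseOverlap(A,1,&is,1);CHKERRQ(ierr);                           /* ov = 1 level of overlap */
      ierr = MatGetSubMatrices(A,1,&is,&is,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
      /* ... use submat[0], a SeqAIJ matrix on PETSC_COMM_SELF ... */
      ierr = MatDestroyMatrices(1,&submat);CHKERRQ(ierr);
      return 0;
    }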

 10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
 11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
 12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
 13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
 14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

 18: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
 19: {
 21:   PetscInt       i;

 24:   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
 25:   for (i=0; i<ov; ++i) {
 26:     MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
 27:   }
 28:   return(0);
 29: }

 31: /*
 32:   Sample message format:
 33:   If a processor A wants processor B to process some elements corresponding
 34:   to index sets is[1],is[5]
 35:   mesg [0] = 2   (no of index sets in the mesg)
 36:   -----------
 37:   mesg [1] = 1 => is[1]
 38:   mesg [2] = sizeof(is[1]);
 39:   -----------
 40:   mesg [3] = 5  => is[5]
 41:   mesg [4] = sizeof(is[5]);
 42:   -----------
 43:   mesg [5]
 44:   mesg [n]  data(is[1])
 45:   -----------
 46:   mesg[n+1]
 47:   mesg[m]  data(is[5])
 48:   -----------

 50:   Notes:
 51:   nrqs - no of requests sent (or to be sent out)
 52:   nrqr - no of requests received (which have to be or which have been processed)
 53: */
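
To make the header layout above concrete: a hypothetical request asking the receiver to handle 3 rows for is[1] and 2 rows for is[5] would be packed as in this sketch (not code from this file; the row values are invented):

    /* Pack one outgoing request following the message format sketched above. */
    static void PackRequestSketch(PetscInt *mesg)
    {
      PetscInt rows1[3] = {10,11,12};   /* rows destined for is[1] (example values) */
      PetscInt rows5[2] = {40,41};      /* rows destined for is[5] (example values) */
      PetscInt i,k = 0;

      mesg[k++] = 2;                    /* number of index sets in the message     */
      mesg[k++] = 1; mesg[k++] = 3;     /* is[1]: 3 row indices in the data part   */
      mesg[k++] = 5; mesg[k++] = 2;     /* is[5]: 2 row indices in the data part   */
      for (i=0; i<3; i++) mesg[k++] = rows1[i];
      for (i=0; i<2; i++) mesg[k++] = rows5[i];
      /* total length = 1 + 2*(no of IS) + data = 1 + 4 + 5 = 10 entries */
    }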
 56: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
 57: {
 58:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
 59:   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
 60:   const PetscInt **idx,*idx_i;
 61:   PetscInt       *n,**data,len;
 63:   PetscMPIInt    size,rank,tag1,tag2;
 64:   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
 65:   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
 66:   PetscBT        *table;
 67:   MPI_Comm       comm;
 68:   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
 69:   MPI_Status     *s_status,*recv_status;
 70:   char           *t_p;

 73:   PetscObjectGetComm((PetscObject)C,&comm);
 74:   size = c->size;
 75:   rank = c->rank;
 76:   M    = C->rmap->N;

 78:   PetscObjectGetNewTag((PetscObject)C,&tag1);
 79:   PetscObjectGetNewTag((PetscObject)C,&tag2);

 81:   PetscMalloc2(imax,&idx,imax,&n);

 83:   for (i=0; i<imax; i++) {
 84:     ISGetIndices(is[i],&idx[i]);
 85:     ISGetLocalSize(is[i],&n[i]);
 86:   }

 88:   /* evaluate communication - mesg to who, length of mesg, and buffer space
 89:      required. Based on this, buffers are allocated, and data copied into them*/
 90:   PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
 91:   PetscMemzero(w1,size*sizeof(PetscMPIInt)); /* initialise work vector*/
 92:   PetscMemzero(w2,size*sizeof(PetscMPIInt)); /* initialise work vector*/
 93:   PetscMemzero(w3,size*sizeof(PetscMPIInt)); /* initialise work vector*/
 94:   for (i=0; i<imax; i++) {
 95:     PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialise work vector*/
 96:     idx_i = idx[i];
 97:     len   = n[i];
 98:     for (j=0; j<len; j++) {
 99:       row = idx_i[j];
100:       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
101:       PetscLayoutFindOwner(C->rmap,row,&proc);
102:       w4[proc]++;
103:     }
104:     for (j=0; j<size; j++) {
105:       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
106:     }
107:   }

109:   nrqs     = 0;              /* no of outgoing messages */
110:   msz      = 0;              /* total mesg length (for all procs) */
111:   w1[rank] = 0;              /* no mesg sent to itself */
112:   w3[rank] = 0;
113:   for (i=0; i<size; i++) {
114:     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
115:   }
116:   /* pa - is list of processors to communicate with */
117:   PetscMalloc1(nrqs+1,&pa);
118:   for (i=0,j=0; i<size; i++) {
119:     if (w1[i]) {pa[j] = i; j++;}
120:   }

122:   /* Each message would have a header = 1 + 2*(no of IS) + data */
123:   for (i=0; i<nrqs; i++) {
124:     j      = pa[i];
125:     w1[j] += w2[j] + 2*w3[j];
126:     msz   += w1[j];
127:   }
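
For example (illustrative numbers): if entries from w3[j] = 2 index sets, totalling 9 rows, are destined for proc j, then w1[j] becomes 9 + 1 + 2*2 = 14 integers: one index-set count, two (index-set id, size) header pairs, and the 9 row indices.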

129:   /* Determine the number of messages to expect, their lengths, and their from-ids */
130:   PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
131:   PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

133:   /* Now post the Irecvs corresponding to these messages */
134:   PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);

136:   /* Allocate Memory for outgoing messages */
137:   PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
138:   PetscMemzero(outdat,size*sizeof(PetscInt*));
139:   PetscMemzero(ptr,size*sizeof(PetscInt*));

141:   {
142:     PetscInt *iptr = tmp,ict  = 0;
143:     for (i=0; i<nrqs; i++) {
144:       j         = pa[i];
145:       iptr     +=  ict;
146:       outdat[j] = iptr;
147:       ict       = w1[j];
148:     }
149:   }

151:   /* Form the outgoing messages */
152:   /*plug in the headers*/
153:   for (i=0; i<nrqs; i++) {
154:     j            = pa[i];
155:     outdat[j][0] = 0;
156:     PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
157:     ptr[j]       = outdat[j] + 2*w3[j] + 1;
158:   }

160:   /* Memory for doing local proc's work*/
161:   {
162:     PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);

164:     for (i=0; i<imax; i++) {
165:       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
166:       data[i]  = d_p + M*i;
167:     }
168:   }

170:   /* Parse the IS and update local tables and the outgoing buf with the data*/
171:   {
172:     PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
173:     PetscBT  table_i;

175:     for (i=0; i<imax; i++) {
176:       PetscMemzero(ctr,size*sizeof(PetscInt));
177:       n_i     = n[i];
178:       table_i = table[i];
179:       idx_i   = idx[i];
180:       data_i  = data[i];
181:       isz_i   = isz[i];
182:       for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
183:         row  = idx_i[j];
184:         PetscLayoutFindOwner(C->rmap,row,&proc);
185:         if (proc != rank) { /* copy to the outgoing buffer */
186:           ctr[proc]++;
187:           *ptr[proc] = row;
188:           ptr[proc]++;
189:         } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
190:       }
191:       /* Update the headers for the current IS */
192:       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
193:         if ((ctr_j = ctr[j])) {
194:           outdat_j        = outdat[j];
195:           k               = ++outdat_j[0];
196:           outdat_j[2*k]   = ctr_j;
197:           outdat_j[2*k-1] = i;
198:         }
199:       }
200:       isz[i] = isz_i;
201:     }
202:   }

204:   /*  Now  post the sends */
205:   PetscMalloc1(nrqs+1,&s_waits1);
206:   for (i=0; i<nrqs; ++i) {
207:     j    = pa[i];
208:     MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
209:   }

211:   /* No longer need the original indices*/
212:   for (i=0; i<imax; ++i) {
213:     ISRestoreIndices(is[i],idx+i);
214:   }
215:   PetscFree2(idx,n);

217:   for (i=0; i<imax; ++i) {
218:     ISDestroy(&is[i]);
219:   }

221:   /* Do Local work*/
222:   MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);

224:   /* Receive messages*/
225:   PetscMalloc1(nrqr+1,&recv_status);
226:   if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}

228:   PetscMalloc1(nrqs+1,&s_status);
229:   if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}

231:   /* Phase 1 sends are complete - deallocate buffers */
232:   PetscFree4(outdat,ptr,tmp,ctr);
233:   PetscFree4(w1,w2,w3,w4);

235:   PetscMalloc1(nrqr+1,&xdata);
236:   PetscMalloc1(nrqr+1,&isz1);
237:   MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
238:   PetscFree(rbuf[0]);
239:   PetscFree(rbuf);


242:   /* Send the data back*/
243:   /* Do a global reduction to know the buffer space req for incoming messages*/
244:   {
245:     PetscMPIInt *rw1;

247:     PetscCalloc1(size,&rw1);

249:     for (i=0; i<nrqr; ++i) {
250:       proc = recv_status[i].MPI_SOURCE;

252:       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
253:       rw1[proc] = isz1[i];
254:     }
255:     PetscFree(onodes1);
256:     PetscFree(olengths1);

 258:     /* Determine the number of messages to expect, their lengths, and their from-ids */
259:     PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
260:     PetscFree(rw1);
261:   }
262:   /* Now post the Irecvs corresponding to these messages */
263:   PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);

265:   /*  Now  post the sends */
266:   PetscMalloc1(nrqr+1,&s_waits2);
267:   for (i=0; i<nrqr; ++i) {
268:     j    = recv_status[i].MPI_SOURCE;
269:     MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
270:   }

272:   /* receive work done on other processors*/
273:   {
274:     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
275:     PetscMPIInt idex;
276:     PetscBT     table_i;
277:     MPI_Status  *status2;

279:     PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);
280:     for (i=0; i<nrqs; ++i) {
281:       MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
282:       /* Process the message*/
283:       rbuf2_i = rbuf2[idex];
284:       ct1     = 2*rbuf2_i[0]+1;
285:       jmax    = rbuf2[idex][0];
286:       for (j=1; j<=jmax; j++) {
287:         max     = rbuf2_i[2*j];
288:         is_no   = rbuf2_i[2*j-1];
289:         isz_i   = isz[is_no];
290:         data_i  = data[is_no];
291:         table_i = table[is_no];
292:         for (k=0; k<max; k++,ct1++) {
293:           row = rbuf2_i[ct1];
294:           if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
295:         }
296:         isz[is_no] = isz_i;
297:       }
298:     }

300:     if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
301:     PetscFree(status2);
302:   }

304:   for (i=0; i<imax; ++i) {
305:     ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);
306:   }

308:   PetscFree(onodes2);
309:   PetscFree(olengths2);

311:   PetscFree(pa);
312:   PetscFree(rbuf2[0]);
313:   PetscFree(rbuf2);
314:   PetscFree(s_waits1);
315:   PetscFree(r_waits1);
316:   PetscFree(s_waits2);
317:   PetscFree(r_waits2);
318:   PetscFree5(table,data,isz,d_p,t_p);
319:   PetscFree(s_status);
320:   PetscFree(recv_status);
321:   PetscFree(xdata[0]);
322:   PetscFree(xdata);
323:   PetscFree(isz1);
324:   return(0);
325: }

329: /*
 330:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
331:        the work on the local processor.

333:      Inputs:
334:       C      - MAT_MPIAIJ;
335:       imax - total no of index sets processed at a time;
336:       table  - an array of char - size = m bits.

338:      Output:
339:       isz    - array containing the count of the solution elements corresponding
340:                to each index set;
341:       data   - pointer to the solutions
342: */
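
Both this routine and the message processing below rely on the same deduplication idiom: a PetscBT bit table records which global indices have already been emitted, so each index enters the output list at most once. A standalone sketch of the idiom (hypothetical names; assumes petscbt.h and an out[] array large enough to hold the result):

    /* Append only previously unseen values to out[]; M is the global index range. */
    static PetscErrorCode AppendUniqueSketch(PetscInt M,PetscInt n,const PetscInt *vals,PetscInt *out,PetscInt *nout)
    {
      PetscBT        seen;
      PetscInt       i,cnt = *nout;
      PetscErrorCode ierr;

      ierr = PetscBTCreate(M,&seen);CHKERRQ(ierr);
      for (i=0; i<n; i++) {
        if (!PetscBTLookupSet(seen,vals[i])) out[cnt++] = vals[i]; /* sets the bit and returns its previous value */
      }
      *nout = cnt;
      ierr  = PetscBTDestroy(&seen);CHKERRQ(ierr);
      return 0;
    }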
343: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
344: {
345:   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
346:   Mat        A  = c->A,B = c->B;
347:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
348:   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
349:   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
350:   PetscBT    table_i;

353:   rstart = C->rmap->rstart;
354:   cstart = C->cmap->rstart;
355:   ai     = a->i;
356:   aj     = a->j;
357:   bi     = b->i;
358:   bj     = b->j;
359:   garray = c->garray;


362:   for (i=0; i<imax; i++) {
363:     data_i  = data[i];
364:     table_i = table[i];
365:     isz_i   = isz[i];
366:     for (j=0,max=isz[i]; j<max; j++) {
367:       row   = data_i[j] - rstart;
368:       start = ai[row];
369:       end   = ai[row+1];
370:       for (k=start; k<end; k++) { /* Amat */
371:         val = aj[k] + cstart;
372:         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
373:       }
374:       start = bi[row];
375:       end   = bi[row+1];
376:       for (k=start; k<end; k++) { /* Bmat */
377:         val = garray[bj[k]];
378:         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
379:       }
380:     }
381:     isz[i] = isz_i;
382:   }
383:   return(0);
384: }

388: /*
 389:       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
390:          and return the output

392:          Input:
393:            C    - the matrix
394:            nrqr - no of messages being processed.
 395:            rbuf - an array of pointers to the received requests

397:          Output:
398:            xdata - array of messages to be sent back
399:            isz1  - size of each message

 401:   For better efficiency perhaps we should malloc each xdata[i] separately;
 402: then if a re-malloc is required we need only copy the data for that one row
 403: rather than all previous rows, as happens now where a single large chunk of
 404: memory is used.

406: */
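
When the initial mem_estimate proves too small, the routine below grows the single backing allocation by roughly 1.5x, copies the old contents, and rebases the pointers of the messages already formed. That step, which appears three times in the routine, looks in isolation like this sketch (assuming, as in the routine, that all xdata[i] share one contiguous allocation rooted at xdata[0]):

    /* Sketch of the buffer-growth step used when the estimate is exceeded. */
    static PetscErrorCode GrowBackingBufferSketch(PetscInt **xdata,PetscInt *isz1,PetscInt upto,PetscInt *mem_estimate)
    {
      PetscInt       *tmp,new_estimate = (PetscInt)(1.5*(*mem_estimate)) + 1,ctr;
      PetscErrorCode ierr;

      ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
      ierr = PetscMemcpy(tmp,xdata[0],(*mem_estimate)*sizeof(PetscInt));CHKERRQ(ierr);
      ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
      xdata[0]      = tmp;
      *mem_estimate = new_estimate;
      for (ctr=1; ctr<=upto; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; /* rebase already finished messages */
      return 0;
    }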
407: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
408: {
409:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
410:   Mat            A  = c->A,B = c->B;
411:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
413:   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
414:   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
415:   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
416:   PetscInt       *rbuf_i,kmax,rbuf_0;
417:   PetscBT        xtable;

420:   m      = C->rmap->N;
421:   rstart = C->rmap->rstart;
422:   cstart = C->cmap->rstart;
423:   ai     = a->i;
424:   aj     = a->j;
425:   bi     = b->i;
426:   bj     = b->j;
427:   garray = c->garray;


430:   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
431:     rbuf_i =  rbuf[i];
432:     rbuf_0 =  rbuf_i[0];
433:     ct    += rbuf_0;
434:     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
435:   }

437:   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
438:   else max1 = 1;
439:   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
440:   PetscMalloc1(mem_estimate,&xdata[0]);
441:   ++no_malloc;
442:   PetscBTCreate(m,&xtable);
443:   PetscMemzero(isz1,nrqr*sizeof(PetscInt));

445:   ct3 = 0;
446:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
447:     rbuf_i =  rbuf[i];
448:     rbuf_0 =  rbuf_i[0];
449:     ct1    =  2*rbuf_0+1;
450:     ct2    =  ct1;
451:     ct3   += ct1;
452:     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
453:       PetscBTMemzero(m,xtable);
454:       oct2 = ct2;
455:       kmax = rbuf_i[2*j];
456:       for (k=0; k<kmax; k++,ct1++) {
457:         row = rbuf_i[ct1];
458:         if (!PetscBTLookupSet(xtable,row)) {
459:           if (!(ct3 < mem_estimate)) {
460:             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
461:             PetscMalloc1(new_estimate,&tmp);
462:             PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
463:             PetscFree(xdata[0]);
464:             xdata[0]     = tmp;
465:             mem_estimate = new_estimate; ++no_malloc;
466:             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
467:           }
468:           xdata[i][ct2++] = row;
469:           ct3++;
470:         }
471:       }
472:       for (k=oct2,max2=ct2; k<max2; k++) {
473:         row   = xdata[i][k] - rstart;
474:         start = ai[row];
475:         end   = ai[row+1];
476:         for (l=start; l<end; l++) {
477:           val = aj[l] + cstart;
478:           if (!PetscBTLookupSet(xtable,val)) {
479:             if (!(ct3 < mem_estimate)) {
480:               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
481:               PetscMalloc1(new_estimate,&tmp);
482:               PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
483:               PetscFree(xdata[0]);
484:               xdata[0]     = tmp;
485:               mem_estimate = new_estimate; ++no_malloc;
486:               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
487:             }
488:             xdata[i][ct2++] = val;
489:             ct3++;
490:           }
491:         }
492:         start = bi[row];
493:         end   = bi[row+1];
494:         for (l=start; l<end; l++) {
495:           val = garray[bj[l]];
496:           if (!PetscBTLookupSet(xtable,val)) {
497:             if (!(ct3 < mem_estimate)) {
498:               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
499:               PetscMalloc1(new_estimate,&tmp);
500:               PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
501:               PetscFree(xdata[0]);
502:               xdata[0]     = tmp;
503:               mem_estimate = new_estimate; ++no_malloc;
504:               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
505:             }
506:             xdata[i][ct2++] = val;
507:             ct3++;
508:           }
509:         }
510:       }
511:       /* Update the header*/
512:       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
513:       xdata[i][2*j-1] = rbuf_i[2*j-1];
514:     }
515:     xdata[i][0] = rbuf_0;
516:     xdata[i+1]  = xdata[i] + ct2;
517:     isz1[i]     = ct2; /* size of each message */
518:   }
519:   PetscBTDestroy(&xtable);
520:   PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
521:   return(0);
522: }
523: /* -------------------------------------------------------------------------*/
524: extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
525: extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
526: /*
527:     Every processor gets the entire matrix
528: */
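
The routine below builds the full matrix on every process with MPI_Allgatherv, using recvcounts/displs derived from the matrix's row ownership ranges. A minimal standalone sketch of that gather pattern (hypothetical names; the real code also uses MPI_IN_PLACE when available):

    /* Sketch: gather one PetscInt per global row onto every process, where
       process p owns rows range[p] .. range[p+1]-1 (as in A->rmap->range). */
    static PetscErrorCode AllgatherRowDataSketch(MPI_Comm comm,const PetscInt *range,PetscMPIInt size,PetscInt *lens)
    {
      PetscMPIInt    *recvcounts,*displs,rank;
      PetscInt       i;
      PetscErrorCode ierr;

      ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
      for (i=0; i<size; i++) {
        recvcounts[i] = (PetscMPIInt)(range[i+1] - range[i]); /* rows owned by process i           */
        displs[i]     = (PetscMPIInt)range[i];                /* where they land in the full array */
      }
      ierr = MPI_Allgatherv(lens+range[rank],recvcounts[rank],MPIU_INT,lens,recvcounts,displs,MPIU_INT,comm);CHKERRQ(ierr);
      ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
      return 0;
    }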
531: PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
532: {
533:   Mat            B;
534:   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
535:   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
537:   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
538:   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
539:   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
540:   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

543:   MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
544:   MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);

546:   if (scall == MAT_INITIAL_MATRIX) {
547:     /* ----------------------------------------------------------------
548:          Tell every processor the number of nonzeros per row
549:     */
550:     PetscMalloc1(A->rmap->N,&lens);
551:     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
552:       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
553:     }
554:     sendcount = A->rmap->rend - A->rmap->rstart;
555:     PetscMalloc2(size,&recvcounts,size,&displs);
556:     for (i=0; i<size; i++) {
557:       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
558:       displs[i]     = A->rmap->range[i];
559:     }
560: #if defined(PETSC_HAVE_MPI_IN_PLACE)
561:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
562: #else
563:     MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
564: #endif
565:     /* ---------------------------------------------------------------
566:          Create the sequential matrix of the same type as the local block diagonal
567:     */
568:     MatCreate(PETSC_COMM_SELF,&B);
569:     MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);
570:     MatSetBlockSizesFromMats(B,A,A);
571:     MatSetType(B,((PetscObject)a->A)->type_name);
572:     MatSeqAIJSetPreallocation(B,0,lens);
573:     PetscMalloc1(1,Bin);
574:     **Bin = B;
575:     b     = (Mat_SeqAIJ*)B->data;

577:     /*--------------------------------------------------------------------
578:        Copy my part of matrix column indices over
579:     */
580:     sendcount  = ad->nz + bd->nz;
581:     jsendbuf   = b->j + b->i[rstarts[rank]];
582:     a_jsendbuf = ad->j;
583:     b_jsendbuf = bd->j;
584:     n          = A->rmap->rend - A->rmap->rstart;
585:     cnt        = 0;
586:     for (i=0; i<n; i++) {

588:       /* put in lower diagonal portion */
589:       m = bd->i[i+1] - bd->i[i];
590:       while (m > 0) {
591:         /* is it above diagonal (in bd (compressed) numbering) */
592:         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
593:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
594:         m--;
595:       }

597:       /* put in diagonal portion */
598:       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
599:         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
600:       }

602:       /* put in upper diagonal portion */
603:       while (m-- > 0) {
604:         jsendbuf[cnt++] = garray[*b_jsendbuf++];
605:       }
606:     }
607:     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
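
The loop above merges, row by row, the off-diagonal (B) columns that lie to the left of the diagonal block, then the diagonal-block (A) columns shifted to global numbering, then the remaining B columns, so the gathered column indices come out in ascending global order. The same per-row merge in isolation (a sketch with hypothetical names; diagstart is the first global column of the diagonal block, which equals A->rmap->rstart in the loop above since the row and column layouts match):

    /* Sketch: emit globally ordered column indices for one local row.
       nza/adcols: diagonal-block columns (local numbering);
       nzb/bdcols: off-diagonal columns (compressed numbering);
       garray maps compressed off-diagonal columns to global columns. */
    static PetscInt MergeRowColumnsSketch(PetscInt nza,const PetscInt *adcols,PetscInt nzb,const PetscInt *bdcols,
                                          const PetscInt *garray,PetscInt diagstart,PetscInt *cols)
    {
      PetscInt k,cnt = 0,m = 0;

      while (m < nzb && garray[bdcols[m]] < diagstart) cols[cnt++] = garray[bdcols[m++]]; /* left of diagonal block  */
      for (k=0; k<nza; k++) cols[cnt++] = diagstart + adcols[k];                          /* diagonal block          */
      while (m < nzb) cols[cnt++] = garray[bdcols[m++]];                                  /* right of diagonal block */
      return cnt;
    }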

609:     /*--------------------------------------------------------------------
610:        Gather all column indices to all processors
611:     */
612:     for (i=0; i<size; i++) {
613:       recvcounts[i] = 0;
614:       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
615:         recvcounts[i] += lens[j];
616:       }
617:     }
618:     displs[0] = 0;
619:     for (i=1; i<size; i++) {
620:       displs[i] = displs[i-1] + recvcounts[i-1];
621:     }
622: #if defined(PETSC_HAVE_MPI_IN_PLACE)
623:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
624: #else
625:     MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
626: #endif
627:     /*--------------------------------------------------------------------
628:         Assemble the matrix into useable form (note numerical values not yet set)
629:     */
630:     /* set the b->ilen (length of each row) values */
631:     PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));
632:     /* set the b->i indices */
633:     b->i[0] = 0;
634:     for (i=1; i<=A->rmap->N; i++) {
635:       b->i[i] = b->i[i-1] + lens[i-1];
636:     }
637:     PetscFree(lens);
638:     MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
639:     MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);

641:   } else {
642:     B = **Bin;
643:     b = (Mat_SeqAIJ*)B->data;
644:   }

646:   /*--------------------------------------------------------------------
647:        Copy my part of matrix numerical values into the values location
648:   */
649:   if (flag == MAT_GET_VALUES) {
650:     sendcount = ad->nz + bd->nz;
651:     sendbuf   = b->a + b->i[rstarts[rank]];
652:     a_sendbuf = ad->a;
653:     b_sendbuf = bd->a;
654:     b_sendj   = bd->j;
655:     n         = A->rmap->rend - A->rmap->rstart;
656:     cnt       = 0;
657:     for (i=0; i<n; i++) {

659:       /* put in lower diagonal portion */
660:       m = bd->i[i+1] - bd->i[i];
661:       while (m > 0) {
662:         /* is it above diagonal (in bd (compressed) numbering) */
663:         if (garray[*b_sendj] > A->rmap->rstart + i) break;
664:         sendbuf[cnt++] = *b_sendbuf++;
665:         m--;
666:         b_sendj++;
667:       }

669:       /* put in diagonal portion */
670:       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
671:         sendbuf[cnt++] = *a_sendbuf++;
672:       }

674:       /* put in upper diagonal portion */
675:       while (m-- > 0) {
676:         sendbuf[cnt++] = *b_sendbuf++;
677:         b_sendj++;
678:       }
679:     }
680:     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

682:     /* -----------------------------------------------------------------
683:        Gather all numerical values to all processors
684:     */
685:     if (!recvcounts) {
686:       PetscMalloc2(size,&recvcounts,size,&displs);
687:     }
688:     for (i=0; i<size; i++) {
689:       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
690:     }
691:     displs[0] = 0;
692:     for (i=1; i<size; i++) {
693:       displs[i] = displs[i-1] + recvcounts[i-1];
694:     }
695:     recvbuf = b->a;
696: #if defined(PETSC_HAVE_MPI_IN_PLACE)
697:     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
698: #else
699:     MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
700: #endif
701:   }  /* end of (flag == MAT_GET_VALUES) */
702:   PetscFree2(recvcounts,displs);

704:   if (A->symmetric) {
705:     MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);
706:   } else if (A->hermitian) {
707:     MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);
708:   } else if (A->structurally_symmetric) {
709:     MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);
710:   }
711:   return(0);
712: }



718: PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
719: {
721:   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
722:   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;


726:   /*
727:        Check for special case: each processor gets entire matrix
728:   */
729:   if (ismax == 1 && C->rmap->N == C->cmap->N) {
730:     ISIdentity(*isrow,&rowflag);
731:     ISIdentity(*iscol,&colflag);
732:     ISGetLocalSize(*isrow,&nrow);
733:     ISGetLocalSize(*iscol,&ncol);
734:     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
735:       wantallmatrix = PETSC_TRUE;

737:       PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);
738:     }
739:   }
740:   MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));
741:   if (twantallmatrix) {
742:     MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);
743:     return(0);
744:   }

746:   /* Allocate memory to hold all the submatrices */
747:   if (scall != MAT_REUSE_MATRIX) {
748:     PetscMalloc1(ismax+1,submat);
749:   }

751:   /* Check for special case: each processor gets entire matrix columns */
752:   PetscMalloc1(ismax+1,&allcolumns);
753:   for (i=0; i<ismax; i++) {
754:     ISIdentity(iscol[i],&colflag);
755:     ISGetLocalSize(iscol[i],&ncol);
756:     if (colflag && ncol == C->cmap->N) {
757:       allcolumns[i] = PETSC_TRUE;
758:     } else {
759:       allcolumns[i] = PETSC_FALSE;
760:     }
761:   }

763:   /* Determine the number of stages through which submatrices are done */
764:   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

766:   /*
767:      Each stage will extract nmax submatrices.
768:      nmax is determined by the matrix column dimension.
769:      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
770:   */
771:   if (!nmax) nmax = 1;
772:   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
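
For example (illustrative numbers, 8-byte PetscInt): with C->cmap->N = 1,000,000 columns, nmax = 20*1000000/(1000000*8) = 2, so a request for ismax = 5 submatrices is processed in nstages_local = 3 stages that extract 2, 2 and 1 submatrices respectively.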

774:   /* Make sure every processor loops through the nstages */
775:   MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));

777:   for (i=0,pos=0; i<nstages; i++) {
778:     if (pos+nmax <= ismax) max_no = nmax;
779:     else if (pos == ismax) max_no = 0;
780:     else                   max_no = ismax-pos;
781:     MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);
782:     pos += max_no;
783:   }

785:   PetscFree(allcolumns);
786:   return(0);
787: }

789: /* -------------------------------------------------------------------------*/
792: PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
793: {
794:   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
795:   Mat            A  = c->A;
796:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
797:   const PetscInt **icol,**irow;
798:   PetscInt       *nrow,*ncol,start;
800:   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
801:   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
802:   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
803:   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
804:   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
805: #if defined(PETSC_USE_CTABLE)
806:   PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
807: #else
808:   PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
809: #endif
810:   const PetscInt *irow_i;
811:   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
812:   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
813:   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
814:   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
815:   MPI_Status     *r_status3,*r_status4,*s_status4;
816:   MPI_Comm       comm;
817:   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
818:   PetscMPIInt    *onodes1,*olengths1;
819:   PetscMPIInt    idex,idex2,end;

822:   PetscObjectGetComm((PetscObject)C,&comm);
823:   tag0 = ((PetscObject)C)->tag;
824:   size = c->size;
825:   rank = c->rank;

827:   /* Get some new tags to keep the communication clean */
828:   PetscObjectGetNewTag((PetscObject)C,&tag1);
829:   PetscObjectGetNewTag((PetscObject)C,&tag2);
830:   PetscObjectGetNewTag((PetscObject)C,&tag3);

832:   PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);

834:   for (i=0; i<ismax; i++) {
835:     ISGetIndices(isrow[i],&irow[i]);
836:     ISGetLocalSize(isrow[i],&nrow[i]);
837:     if (allcolumns[i]) {
838:       icol[i] = NULL;
839:       ncol[i] = C->cmap->N;
840:     } else {
841:       ISGetIndices(iscol[i],&icol[i]);
842:       ISGetLocalSize(iscol[i],&ncol[i]);
843:     }
844:   }

846:   /* evaluate communication - mesg to who, length of mesg, and buffer space
847:      required. Based on this, buffers are allocated, and data copied into them*/
848:   PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);   /* mesg size */
849:   PetscMemzero(w1,size*sizeof(PetscMPIInt));   /* initialize work vector*/
850:   PetscMemzero(w2,size*sizeof(PetscMPIInt));   /* initialize work vector*/
851:   PetscMemzero(w3,size*sizeof(PetscMPIInt));   /* initialize work vector*/
852:   for (i=0; i<ismax; i++) {
853:     PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialize work vector*/
854:     jmax   = nrow[i];
855:     irow_i = irow[i];
856:     for (j=0; j<jmax; j++) {
857:       l   = 0;
858:       row = irow_i[j];
859:       while (row >= C->rmap->range[l+1]) l++;
860:       proc = l;
861:       w4[proc]++;
862:     }
863:     for (j=0; j<size; j++) {
864:       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
865:     }
866:   }

868:   nrqs     = 0;              /* no of outgoing messages */
869:   msz      = 0;              /* total mesg length (for all procs) */
870:   w1[rank] = 0;              /* no mesg sent to self */
871:   w3[rank] = 0;
872:   for (i=0; i<size; i++) {
873:     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
874:   }
875:   PetscMalloc1(nrqs+1,&pa); /*(proc -array)*/
876:   for (i=0,j=0; i<size; i++) {
877:     if (w1[i]) { pa[j] = i; j++; }
878:   }

880:   /* Each message would have a header = 1 + 2*(no of IS) + data */
881:   for (i=0; i<nrqs; i++) {
882:     j      = pa[i];
883:     w1[j] += w2[j] + 2* w3[j];
884:     msz   += w1[j];
885:   }
886:   PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);

 888:   /* Determine the number of messages to expect, their lengths, and their from-ids */
889:   PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
890:   PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);

892:   /* Now post the Irecvs corresponding to these messages */
893:   PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);

895:   PetscFree(onodes1);
896:   PetscFree(olengths1);

898:   /* Allocate Memory for outgoing messages */
899:   PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
900:   PetscMemzero(sbuf1,size*sizeof(PetscInt*));
901:   PetscMemzero(ptr,size*sizeof(PetscInt*));

903:   {
904:     PetscInt *iptr = tmp,ict = 0;
905:     for (i=0; i<nrqs; i++) {
906:       j        = pa[i];
907:       iptr    += ict;
908:       sbuf1[j] = iptr;
909:       ict      = w1[j];
910:     }
911:   }

913:   /* Form the outgoing messages */
914:   /* Initialize the header space */
915:   for (i=0; i<nrqs; i++) {
916:     j           = pa[i];
917:     sbuf1[j][0] = 0;
918:     PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
919:     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
920:   }

922:   /* Parse the isrow and copy data into outbuf */
923:   for (i=0; i<ismax; i++) {
924:     PetscMemzero(ctr,size*sizeof(PetscInt));
925:     irow_i = irow[i];
926:     jmax   = nrow[i];
927:     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
928:       l   = 0;
929:       row = irow_i[j];
930:       while (row >= C->rmap->range[l+1]) l++;
931:       proc = l;
932:       if (proc != rank) { /* copy to the outgoing buf*/
933:         ctr[proc]++;
934:         *ptr[proc] = row;
935:         ptr[proc]++;
936:       }
937:     }
938:     /* Update the headers for the current IS */
939:     for (j=0; j<size; j++) { /* Can Optimise this loop too */
940:       if ((ctr_j = ctr[j])) {
941:         sbuf1_j        = sbuf1[j];
942:         k              = ++sbuf1_j[0];
943:         sbuf1_j[2*k]   = ctr_j;
944:         sbuf1_j[2*k-1] = i;
945:       }
946:     }
947:   }

949:   /*  Now  post the sends */
950:   PetscMalloc1(nrqs+1,&s_waits1);
951:   for (i=0; i<nrqs; ++i) {
952:     j    = pa[i];
953:     MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
954:   }

956:   /* Post Receives to capture the buffer size */
957:   PetscMalloc1(nrqs+1,&r_waits2);
958:   PetscMalloc1(nrqs+1,&rbuf2);
959:   rbuf2[0] = tmp + msz;
960:   for (i=1; i<nrqs; ++i) {
961:     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
962:   }
963:   for (i=0; i<nrqs; ++i) {
964:     j    = pa[i];
965:     MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
966:   }

968:   /* Send to other procs the buf size they should allocate */


971:   /* Receive messages*/
972:   PetscMalloc1(nrqr+1,&s_waits2);
973:   PetscMalloc1(nrqr+1,&r_status1);
974:   PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);
975:   {
976:     Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
977:     PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
978:     PetscInt   *sbuf2_i;

980:     for (i=0; i<nrqr; ++i) {
981:       MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);

983:       req_size[idex] = 0;
984:       rbuf1_i        = rbuf1[idex];
985:       start          = 2*rbuf1_i[0] + 1;
986:       MPI_Get_count(r_status1+i,MPIU_INT,&end);
987:       PetscMalloc1(end+1,&sbuf2[idex]);
988:       sbuf2_i        = sbuf2[idex];
989:       for (j=start; j<end; j++) {
990:         id              = rbuf1_i[j] - rstart;
991:         ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
992:         sbuf2_i[j]      = ncols;
993:         req_size[idex] += ncols;
994:       }
995:       req_source[idex] = r_status1[i].MPI_SOURCE;
996:       /* form the header */
997:       sbuf2_i[0] = req_size[idex];
998:       for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

1000:       MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
1001:     }
1002:   }
1003:   PetscFree(r_status1);
1004:   PetscFree(r_waits1);

1006:   /*  recv buffer sizes */
1007:   /* Receive messages*/

1009:   PetscMalloc1(nrqs+1,&rbuf3);
1010:   PetscMalloc1(nrqs+1,&rbuf4);
1011:   PetscMalloc1(nrqs+1,&r_waits3);
1012:   PetscMalloc1(nrqs+1,&r_waits4);
1013:   PetscMalloc1(nrqs+1,&r_status2);

1015:   for (i=0; i<nrqs; ++i) {
1016:     MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
1017:     PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);
1018:     PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);
1019:     MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
1020:     MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
1021:   }
1022:   PetscFree(r_status2);
1023:   PetscFree(r_waits2);

1025:   /* Wait on sends1 and sends2 */
1026:   PetscMalloc1(nrqs+1,&s_status1);
1027:   PetscMalloc1(nrqr+1,&s_status2);

1029:   if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
1030:   if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
1031:   PetscFree(s_status1);
1032:   PetscFree(s_status2);
1033:   PetscFree(s_waits1);
1034:   PetscFree(s_waits2);

1036:   /* Now allocate buffers for a->j, and send them off */
1037:   PetscMalloc1(nrqr+1,&sbuf_aj);
1038:   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1039:   PetscMalloc1(j+1,&sbuf_aj[0]);
1040:   for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

1042:   PetscMalloc1(nrqr+1,&s_waits3);
1043:   {
1044:     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1045:     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1046:     PetscInt cend = C->cmap->rend;
1047:     PetscInt *a_j = a->j,*b_j = b->j,ctmp;

1049:     for (i=0; i<nrqr; i++) {
1050:       rbuf1_i   = rbuf1[i];
1051:       sbuf_aj_i = sbuf_aj[i];
1052:       ct1       = 2*rbuf1_i[0] + 1;
1053:       ct2       = 0;
1054:       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1055:         kmax = rbuf1[i][2*j];
1056:         for (k=0; k<kmax; k++,ct1++) {
1057:           row    = rbuf1_i[ct1] - rstart;
1058:           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1059:           ncols  = nzA + nzB;
1060:           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

1062:           /* load the column indices for this row into cols*/
1063:           cols = sbuf_aj_i + ct2;

1065:           lwrite = 0;
1066:           for (l=0; l<nzB; l++) {
1067:             if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1068:           }
1069:           for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1070:           for (l=0; l<nzB; l++) {
1071:             if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1072:           }

1074:           ct2 += ncols;
1075:         }
1076:       }
1077:       MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
1078:     }
1079:   }
1080:   PetscMalloc1(nrqs+1,&r_status3);
1081:   PetscMalloc1(nrqr+1,&s_status3);

1083:   /* Allocate buffers for a->a, and send them off */
1084:   PetscMalloc1(nrqr+1,&sbuf_aa);
1085:   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1086:   PetscMalloc1(j+1,&sbuf_aa[0]);
1087:   for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

1089:   PetscMalloc1(nrqr+1,&s_waits4);
1090:   {
1091:     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1092:     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1093:     PetscInt    cend   = C->cmap->rend;
1094:     PetscInt    *b_j   = b->j;
1095:     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

1097:     for (i=0; i<nrqr; i++) {
1098:       rbuf1_i   = rbuf1[i];
1099:       sbuf_aa_i = sbuf_aa[i];
1100:       ct1       = 2*rbuf1_i[0]+1;
1101:       ct2       = 0;
1102:       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1103:         kmax = rbuf1_i[2*j];
1104:         for (k=0; k<kmax; k++,ct1++) {
1105:           row    = rbuf1_i[ct1] - rstart;
1106:           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1107:           ncols  = nzA + nzB;
1108:           cworkB = b_j + b_i[row];
1109:           vworkA = a_a + a_i[row];
1110:           vworkB = b_a + b_i[row];

1112:           /* load the column values for this row into vals*/
1113:           vals = sbuf_aa_i+ct2;

1115:           lwrite = 0;
1116:           for (l=0; l<nzB; l++) {
1117:             if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1118:           }
1119:           for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1120:           for (l=0; l<nzB; l++) {
1121:             if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1122:           }

1124:           ct2 += ncols;
1125:         }
1126:       }
1127:       MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);
1128:     }
1129:   }
1130:   PetscFree(rbuf1[0]);
1131:   PetscFree(rbuf1);
1132:   PetscMalloc1(nrqs+1,&r_status4);
1133:   PetscMalloc1(nrqr+1,&s_status4);

1135:   /* Form the matrix */
1136:   /* create col map: global col of C -> local col of submatrices */
1137:   {
1138:     const PetscInt *icol_i;
1139: #if defined(PETSC_USE_CTABLE)
1140:     PetscMalloc1(1+ismax,&cmap);
1141:     for (i=0; i<ismax; i++) {
1142:       if (!allcolumns[i]) {
1143:         PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);

1145:         jmax   = ncol[i];
1146:         icol_i = icol[i];
1147:         cmap_i = cmap[i];
1148:         for (j=0; j<jmax; j++) {
1149:           PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);
1150:         }
1151:       } else {
1152:         cmap[i] = NULL;
1153:       }
1154:     }
1155: #else
1156:     PetscMalloc1(ismax,&cmap);
1157:     for (i=0; i<ismax; i++) {
1158:       if (!allcolumns[i]) {
1159:         PetscMalloc1(C->cmap->N,&cmap[i]);
1160:         PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));
1161:         jmax   = ncol[i];
1162:         icol_i = icol[i];
1163:         cmap_i = cmap[i];
1164:         for (j=0; j<jmax; j++) {
1165:           cmap_i[icol_i[j]] = j+1;
1166:         }
1167:       } else {
1168:         cmap[i] = NULL;
1169:       }
1170:     }
1171: #endif
1172:   }
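
Note the 1-based convention used by both branches: a requested global column c of C is recorded as j+1 (array entry or table value), so that a lookup result of 0 means "column not requested" and tcol-1 recovers the local column. The lookup side, for the array branch, amounts to this sketch (hypothetical names):

    /* Sketch: translate a global column of C into a local column of submatrix i,
       or return PETSC_FALSE if that column was not requested. */
    static PetscBool GlobalToLocalColSketch(const PetscInt *cmap_i,PetscInt gcol,PetscInt *lcol)
    {
      PetscInt tcol = cmap_i[gcol];   /* j+1 if requested, 0 otherwise */
      if (!tcol) return PETSC_FALSE;
      *lcol = tcol - 1;
      return PETSC_TRUE;
    }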

1174:   /* Create lens which is required for MatCreate... */
1175:   for (i=0,j=0; i<ismax; i++) j += nrow[i];
1176:   PetscMalloc1(ismax,&lens);
1177:   if (ismax) {
1178:     PetscMalloc1(j,&lens[0]);
1179:     PetscMemzero(lens[0],j*sizeof(PetscInt));
1180:   }
1181:   for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

1183:   /* Update lens from local data */
1184:   for (i=0; i<ismax; i++) {
1185:     jmax = nrow[i];
1186:     if (!allcolumns[i]) cmap_i = cmap[i];
1187:     irow_i = irow[i];
1188:     lens_i = lens[i];
1189:     for (j=0; j<jmax; j++) {
1190:       l   = 0;
1191:       row = irow_i[j];
1192:       while (row >= C->rmap->range[l+1]) l++;
1193:       proc = l;
1194:       if (proc == rank) {
1195:         MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);
1196:         if (!allcolumns[i]) {
1197:           for (k=0; k<ncols; k++) {
1198: #if defined(PETSC_USE_CTABLE)
1199:             PetscTableFind(cmap_i,cols[k]+1,&tcol);
1200: #else
1201:             tcol = cmap_i[cols[k]];
1202: #endif
1203:             if (tcol) lens_i[j]++;
1204:           }
1205:         } else { /* allcolumns */
1206:           lens_i[j] = ncols;
1207:         }
1208:         MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);
1209:       }
1210:     }
1211:   }

1213:   /* Create row map: global row of C -> local row of submatrices */
1214: #if defined(PETSC_USE_CTABLE)
1215:   PetscMalloc1(1+ismax,&rmap);
1216:   for (i=0; i<ismax; i++) {
1217:     PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);
1218:     rmap_i = rmap[i];
1219:     irow_i = irow[i];
1220:     jmax   = nrow[i];
1221:     for (j=0; j<jmax; j++) {
1222:       PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);
1223:     }
1224:   }
1225: #else
1226:   PetscMalloc1(ismax,&rmap);
1227:   if (ismax) {
1228:     PetscMalloc1(ismax*C->rmap->N,&rmap[0]);
1229:     PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));
1230:   }
1231:   for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
1232:   for (i=0; i<ismax; i++) {
1233:     rmap_i = rmap[i];
1234:     irow_i = irow[i];
1235:     jmax   = nrow[i];
1236:     for (j=0; j<jmax; j++) {
1237:       rmap_i[irow_i[j]] = j;
1238:     }
1239:   }
1240: #endif

1242:   /* Update lens from offproc data */
1243:   {
1244:     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

1246:     for (tmp2=0; tmp2<nrqs; tmp2++) {
1247:       MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);
1248:       idex    = pa[idex2];
1249:       sbuf1_i = sbuf1[idex];
1250:       jmax    = sbuf1_i[0];
1251:       ct1     = 2*jmax+1;
1252:       ct2     = 0;
1253:       rbuf2_i = rbuf2[idex2];
1254:       rbuf3_i = rbuf3[idex2];
1255:       for (j=1; j<=jmax; j++) {
1256:         is_no  = sbuf1_i[2*j-1];
1257:         max1   = sbuf1_i[2*j];
1258:         lens_i = lens[is_no];
1259:         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1260:         rmap_i = rmap[is_no];
1261:         for (k=0; k<max1; k++,ct1++) {
1262: #if defined(PETSC_USE_CTABLE)
1263:           PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
1264:           row--;
1265:           if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1266: #else
1267:           row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1268: #endif
1269:           max2 = rbuf2_i[ct1];
1270:           for (l=0; l<max2; l++,ct2++) {
1271:             if (!allcolumns[is_no]) {
1272: #if defined(PETSC_USE_CTABLE)
1273:               PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1274: #else
1275:               tcol = cmap_i[rbuf3_i[ct2]];
1276: #endif
1277:               if (tcol) lens_i[row]++;
1278:             } else { /* allcolumns */
1279:               lens_i[row]++; /* lens_i[row] += max2 ? */
1280:             }
1281:           }
1282:         }
1283:       }
1284:     }
1285:   }
1286:   PetscFree(r_status3);
1287:   PetscFree(r_waits3);
1288:   if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1289:   PetscFree(s_status3);
1290:   PetscFree(s_waits3);

1292:   /* Create the submatrices */
1293:   if (scall == MAT_REUSE_MATRIX) {
1294:     PetscBool flag;

1296:     /*
1297:         Assumes new rows are the same length as the old rows, hence bug!
1298:     */
1299:     for (i=0; i<ismax; i++) {
1300:       mat = (Mat_SeqAIJ*)(submats[i]->data);
1301:       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1302:       PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);
1303:       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1304:       /* Initial matrix as if empty */
1305:       PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));

1307:       submats[i]->factortype = C->factortype;
1308:     }
1309:   } else {
1310:     for (i=0; i<ismax; i++) {
1311:       PetscInt rbs,cbs;
1312:       ISGetBlockSize(isrow[i],&rbs);
1313:       ISGetBlockSize(iscol[i],&cbs);

1315:       MatCreate(PETSC_COMM_SELF,submats+i);
1316:       MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);

1318:       MatSetBlockSizes(submats[i],rbs,cbs);
1319:       MatSetType(submats[i],((PetscObject)A)->type_name);
1320:       MatSeqAIJSetPreallocation(submats[i],0,lens[i]);
1321:     }
1322:   }

1324:   /* Assemble the matrices */
1325:   /* First assemble the local rows */
1326:   {
1327:     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1328:     PetscScalar *imat_a;

1330:     for (i=0; i<ismax; i++) {
1331:       mat       = (Mat_SeqAIJ*)submats[i]->data;
1332:       imat_ilen = mat->ilen;
1333:       imat_j    = mat->j;
1334:       imat_i    = mat->i;
1335:       imat_a    = mat->a;

1337:       if (!allcolumns[i]) cmap_i = cmap[i];
1338:       rmap_i = rmap[i];
1339:       irow_i = irow[i];
1340:       jmax   = nrow[i];
1341:       for (j=0; j<jmax; j++) {
1342:         l   = 0;
1343:         row = irow_i[j];
1344:         while (row >= C->rmap->range[l+1]) l++;
1345:         proc = l;
1346:         if (proc == rank) {
1347:           old_row = row;
1348: #if defined(PETSC_USE_CTABLE)
1349:           PetscTableFind(rmap_i,row+1,&row);
1350:           row--;
1351: #else
1352:           row = rmap_i[row];
1353: #endif
1354:           ilen_row = imat_ilen[row];
1355:           MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1356:           mat_i    = imat_i[row];
1357:           mat_a    = imat_a + mat_i;
1358:           mat_j    = imat_j + mat_i;
1359:           if (!allcolumns[i]) {
1360:             for (k=0; k<ncols; k++) {
1361: #if defined(PETSC_USE_CTABLE)
1362:               PetscTableFind(cmap_i,cols[k]+1,&tcol);
1363: #else
1364:               tcol = cmap_i[cols[k]];
1365: #endif
1366:               if (tcol) {
1367:                 *mat_j++ = tcol - 1;
1368:                 *mat_a++ = vals[k];
1369:                 ilen_row++;
1370:               }
1371:             }
1372:           } else { /* allcolumns */
1373:             for (k=0; k<ncols; k++) {
1374:               *mat_j++ = cols[k];  /* global col index! */
1375:               *mat_a++ = vals[k];
1376:               ilen_row++;
1377:             }
1378:           }
1379:           MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);

1381:           imat_ilen[row] = ilen_row;
1382:         }
1383:       }
1384:     }
1385:   }

1387:   /*   Now assemble the off proc rows*/
1388:   {
1389:     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1390:     PetscInt    *imat_j,*imat_i;
1391:     PetscScalar *imat_a,*rbuf4_i;

1393:     for (tmp2=0; tmp2<nrqs; tmp2++) {
1394:       MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);
1395:       idex    = pa[idex2];
1396:       sbuf1_i = sbuf1[idex];
1397:       jmax    = sbuf1_i[0];
1398:       ct1     = 2*jmax + 1;
1399:       ct2     = 0;
1400:       rbuf2_i = rbuf2[idex2];
1401:       rbuf3_i = rbuf3[idex2];
1402:       rbuf4_i = rbuf4[idex2];
1403:       for (j=1; j<=jmax; j++) {
1404:         is_no     = sbuf1_i[2*j-1];
1405:         rmap_i    = rmap[is_no];
1406:         if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1407:         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1408:         imat_ilen = mat->ilen;
1409:         imat_j    = mat->j;
1410:         imat_i    = mat->i;
1411:         imat_a    = mat->a;
1412:         max1      = sbuf1_i[2*j];
1413:         for (k=0; k<max1; k++,ct1++) {
1414:           row = sbuf1_i[ct1];
1415: #if defined(PETSC_USE_CTABLE)
1416:           PetscTableFind(rmap_i,row+1,&row);
1417:           row--;
1418: #else
1419:           row = rmap_i[row];
1420: #endif
1421:           ilen  = imat_ilen[row];
1422:           mat_i = imat_i[row];
1423:           mat_a = imat_a + mat_i;
1424:           mat_j = imat_j + mat_i;
1425:           max2  = rbuf2_i[ct1];
1426:           if (!allcolumns[is_no]) {
1427:             for (l=0; l<max2; l++,ct2++) {

1429: #if defined(PETSC_USE_CTABLE)
1430:               PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1431: #else
1432:               tcol = cmap_i[rbuf3_i[ct2]];
1433: #endif
1434:               if (tcol) {
1435:                 *mat_j++ = tcol - 1;
1436:                 *mat_a++ = rbuf4_i[ct2];
1437:                 ilen++;
1438:               }
1439:             }
1440:           } else { /* allcolumns */
1441:             for (l=0; l<max2; l++,ct2++) {
1442:               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1443:               *mat_a++ = rbuf4_i[ct2];
1444:               ilen++;
1445:             }
1446:           }
1447:           imat_ilen[row] = ilen;
1448:         }
1449:       }
1450:     }
1451:   }

1453:   /* sort the rows */
1454:   {
1455:     PetscInt    *imat_ilen,*imat_j,*imat_i;
1456:     PetscScalar *imat_a;

1458:     for (i=0; i<ismax; i++) {
1459:       mat       = (Mat_SeqAIJ*)submats[i]->data;
1460:       imat_j    = mat->j;
1461:       imat_i    = mat->i;
1462:       imat_a    = mat->a;
1463:       imat_ilen = mat->ilen;

1465:       if (allcolumns[i]) continue;
1466:       jmax = nrow[i];
1467:       for (j=0; j<jmax; j++) {
1468:         PetscInt ilen;

1470:         mat_i = imat_i[j];
1471:         mat_a = imat_a + mat_i;
1472:         mat_j = imat_j + mat_i;
1473:         ilen  = imat_ilen[j];
1474:         PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);
1475:       }
1476:     }
1477:   }

1479:   PetscFree(r_status4);
1480:   PetscFree(r_waits4);
1481:   if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1482:   PetscFree(s_waits4);
1483:   PetscFree(s_status4);

1485:   /* Restore the indices */
1486:   for (i=0; i<ismax; i++) {
1487:     ISRestoreIndices(isrow[i],irow+i);
1488:     if (!allcolumns[i]) {
1489:       ISRestoreIndices(iscol[i],icol+i);
1490:     }
1491:   }

1493:   /* Destroy allocated memory */
1494:   PetscFree4(irow,icol,nrow,ncol);
1495:   PetscFree4(w1,w2,w3,w4);
1496:   PetscFree(pa);

1498:   PetscFree4(sbuf1,ptr,tmp,ctr);
1499:   PetscFree(rbuf2);
1500:   for (i=0; i<nrqr; ++i) {
1501:     PetscFree(sbuf2[i]);
1502:   }
1503:   for (i=0; i<nrqs; ++i) {
1504:     PetscFree(rbuf3[i]);
1505:     PetscFree(rbuf4[i]);
1506:   }

1508:   PetscFree3(sbuf2,req_size,req_source);
1509:   PetscFree(rbuf3);
1510:   PetscFree(rbuf4);
1511:   PetscFree(sbuf_aj[0]);
1512:   PetscFree(sbuf_aj);
1513:   PetscFree(sbuf_aa[0]);
1514:   PetscFree(sbuf_aa);

1516: #if defined(PETSC_USE_CTABLE)
1517:   for (i=0; i<ismax; i++) {PetscTableDestroy((PetscTable*)&rmap[i]);}
1518: #else
1519:   if (ismax) {PetscFree(rmap[0]);}
1520: #endif
1521:   PetscFree(rmap);

1523:   for (i=0; i<ismax; i++) {
1524:     if (!allcolumns[i]) {
1525: #if defined(PETSC_USE_CTABLE)
1526:       PetscTableDestroy((PetscTable*)&cmap[i]);
1527: #else
1528:       PetscFree(cmap[i]);
1529: #endif
1530:     }
1531:   }
1532:   PetscFree(cmap);
1533:   if (ismax) {PetscFree(lens[0]);}
1534:   PetscFree(lens);

1536:   for (i=0; i<ismax; i++) {
1537:     MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1538:     MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1539:   }
1540:   return(0);
1541: }

1543: /*
1544:  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
1545:  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 1546:  of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
1547:  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
1548:  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
1549:  state, and needs to be "assembled" later by compressing B's column space.

1551:  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
1552:  Following this call, C->A & C->B have been created, even if empty.
1553:  */
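
As a concrete (hypothetical) illustration of the embeddings described above: if C's local block has m = 4 rows and A is a 2x3 SeqAIJ matrix, then a rowemb of {0,2} places A's rows at C-local rows 0 and 2, and a dcolemb of {0,1,3} places A's columns at C-local columns 0, 1 and 3. Building such an injective embedding is just an ISCreateGeneral call, as in this sketch:

    /* Sketch: build an injective row embedding of a 2-row sequential matrix
       into a 4-row local block (the indices are examples only). */
    static PetscErrorCode BuildRowEmbeddingSketch(IS *rowemb)
    {
      const PetscInt rows[2] = {0,2};   /* C-local rows receiving A's rows 0 and 1 */
      PetscErrorCode ierr;

      ierr = ISCreateGeneral(PETSC_COMM_SELF,2,rows,PETSC_COPY_VALUES,rowemb);CHKERRQ(ierr);
      return 0;
    }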
1556: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
1557: {
1558:   /* If making this function public, change the error returned in this function away from _PLIB. */
1560:   Mat_MPIAIJ     *aij;
1561:   Mat_SeqAIJ     *Baij;
1562:   PetscBool      seqaij,Bdisassembled;
1563:   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
1564:   PetscScalar    v;
1565:   const PetscInt *rowindices,*colindices;

1568:   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
1569:   if (A) {
1570:     PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);
1571:     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
1572:     if (rowemb) {
1573:       ISGetLocalSize(rowemb,&m);
1574:       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
1575:     } else {
1576:       if (C->rmap->n != A->rmap->n) {
1577:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
1578:       }
1579:     }
1580:     if (dcolemb) {
1581:       ISGetLocalSize(dcolemb,&n);
1582:       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
1583:     } else {
1584:       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
1585:     }
1586:   }
1587:   if (B) {
1588:     PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);
1589:     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
1590:     if (rowemb) {
1591:       ISGetLocalSize(rowemb,&m);
1592:       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
1593:     } else {
1594:       if (C->rmap->n != B->rmap->n) {
1595:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
1596:       }
1597:     }
1598:     if (ocolemb) {
1599:       ISGetLocalSize(ocolemb,&n);
1600:       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
1601:     } else {
1602:       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
1603:     }
1604:   }

1606:   aij    = (Mat_MPIAIJ*)(C->data);
1607:   if (!aij->A) {
1608:     /* Mimic parts of MatMPIAIJSetPreallocation() */
1609:     MatCreate(PETSC_COMM_SELF,&aij->A);
1610:     MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);
1611:     MatSetBlockSizesFromMats(aij->A,C,C);
1612:     MatSetType(aij->A,MATSEQAIJ);
1613:     PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);
1614:   }
1615:   if (A) {
1616:     MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);
1617:   } else {
1618:     MatSetUp(aij->A);
1619:   }
1620:   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
1621:     /*
1622:       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
1623:       need to "disassemble" B -- convert it to using C's global indices.
1624:       To insert the values we take the safer, albeit more expensive, route of MatSetValues().

1626:       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
1627:       we MatZeroEntries(B) first, so the pattern may retain explicit zeros that could, in principle, be compacted out.

1629:       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
1630:       At least avoid calling MatSetValues() and the implied searches?
1631:     */

1633:     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
1634: #if defined(PETSC_USE_CTABLE)
1635:       PetscTableDestroy(&aij->colmap);
1636: #else
1637:       PetscFree(aij->colmap);
1638:       /* A bit of a HACK: ideally we should handle the aij->B case all in one code block below. */
1639:       if (aij->B) {
1640:         PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));
1641:       }
1642: #endif
1643:       ngcol = 0;
1644:       if (aij->lvec) {
1645:         VecGetSize(aij->lvec,&ngcol);
1646:       }
1647:       if (aij->garray) {
1648:         PetscFree(aij->garray);
1649:         PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));
1650:       }
1651:       VecDestroy(&aij->lvec);
1652:       VecScatterDestroy(&aij->Mvctx);
1653:     }
1654:     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
1655:       MatDestroy(&aij->B);
1656:     }
1657:     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
1658:       MatZeroEntries(aij->B);
1659:     }
1660:   }
1661:   Bdisassembled = PETSC_FALSE;
1662:   if (!aij->B) {
1663:     MatCreate(PETSC_COMM_SELF,&aij->B);
1664:     PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);
1665:     MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);
1666:     MatSetBlockSizesFromMats(aij->B,B,B);
1667:     MatSetType(aij->B,MATSEQAIJ);
1668:     Bdisassembled = PETSC_TRUE;
1669:   }
1670:   if (B) {
1671:     Baij = (Mat_SeqAIJ*)(B->data);
1672:     if (pattern == DIFFERENT_NONZERO_PATTERN) {
1673:       PetscMalloc1(B->rmap->n,&nz);
1674:       for (i=0; i<B->rmap->n; i++) {
1675:         nz[i] = Baij->i[i+1] - Baij->i[i];
1676:       }
1677:       MatSeqAIJSetPreallocation(aij->B,0,nz);
1678:       PetscFree(nz);
1679:     }

1681:     PetscLayoutGetRange(C->rmap,&rstart,&rend);
1682:     shift = rend-rstart;
1683:     count = 0;
1684:     rowindices = NULL;
1685:     colindices = NULL;
1686:     if (rowemb) {
1687:       ISGetIndices(rowemb,&rowindices);
1688:     }
1689:     if (ocolemb) {
1690:       ISGetIndices(ocolemb,&colindices);
1691:     }
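    /*
       Copy B's entries into aij->B with MatSetValues(): rows are mapped through rowemb and columns
       through ocolemb, when those embeddings are given.  If aij->B was created just above
       (Bdisassembled), its column space is C's full global column space, so a mapped column index at
       or beyond rstart is shifted up by shift = rend-rstart, placing it past the diagonal block in
       the "disassembled" numbering.  Illustrative numbers only: with rstart = 3 and rend = 5 the
       shift is 2, so a mapped off-diagonal column index of 3 lands at global column 5, while an
       index of 2 stays at 2.
    */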
1692:     for (i=0; i<B->rmap->n; i++) {
1693:       PetscInt row;
1694:       row = i;
1695:       if (rowindices) row = rowindices[i];
1696:       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
1697:         col  = Baij->j[count];
1698:         if (colindices) col = colindices[col];
1699:         if (Bdisassembled && col>=rstart) col += shift;
1700:         v    = Baij->a[count];
1701:         MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);
1702:         ++count;
1703:       }
1704:     }
1705:     /* No assembly for aij->B is necessary. */
1706:     /* FIXME: set aij->B's nonzerostate correctly. */
1707:   } else {
1708:     MatSetUp(aij->B);
1709:   }
1710:   C->preallocated  = PETSC_TRUE;
1711:   C->was_assembled = PETSC_FALSE;
1712:   C->assembled     = PETSC_FALSE;
1713:   /*
1714:     C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
1715:     Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
1716:   */
1717:   return(0);
1718: }
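/*
  A minimal usage sketch (illustrative only, not part of the library; error checking via CHKERRQ
  omitted).  It assumes a caller that already holds two assembled SEQAIJ pieces -- a diagonal block A
  of size m x n and an off-diagonal block B of size m x (N-n) -- matching an MPIAIJ matrix C, and
  passes NULL embeddings so the pieces are taken to cover C's local rows and columns directly:

     Mat A,B;      // assembled SEQAIJ pieces: A is m x n, B is m x (N-n)
     Mat C;        // MPIAIJ matrix with local row/column sizes m and n
     MatSetSeqMats_MPIAIJ(C,NULL,NULL,NULL,DIFFERENT_NONZERO_PATTERN,A,B);
     MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);   // compresses aij->B's column space (see the comment above)
     MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
*/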

1722: /*
1723:   B uses local indices, with column indices ranging between 0 and N-n; they must be interpreted using garray.
1724:  */
1725: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
1726: {
1727:   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);

1732:   /* FIXME: make sure C is assembled */
1733:   *A = aij->A;
1734:   *B = aij->B;
1735:   /* Note that we don't incref *A and *B, so be careful! */
1736:   return(0);
1737: }
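/*
  Sketch (assumptions flagged inline; not library code): translating the local column indices of the
  B returned here into C's global column numbering via the garray member of Mat_MPIAIJ.

     Mat            A,B;
     Mat_MPIAIJ     *aij = (Mat_MPIAIJ*)C->data;   // private header mpiaij.h is already included here
     PetscInt       row = 0,ncols,j,gcol;          // row 0 chosen only for illustration
     const PetscInt *cols;

     MatGetSeqMats_MPIAIJ(C,&A,&B);
     MatGetRow(B,row,&ncols,&cols,NULL);
     for (j=0; j<ncols; j++) gcol = aij->garray[cols[j]];   // global column of B's local column cols[j]
     MatRestoreRow(B,row,&ncols,&cols,NULL);
*/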

1739: /*
1740:   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
1741:   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
1742: */
1745: PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1746:                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
1747:                                                  PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
1748:                                                  PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
1749:                                                  PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
1750: {
1752:   PetscMPIInt    isize,flag;
1753:   PetscInt       i,ii,cismax,ispar;
1754:   Mat            *A,*B;
1755:   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

1758:   if (!ismax) return(0);

1760:   for (i = 0, cismax = 0; i < ismax; ++i) {
1761:     PetscMPIInt isize;
1762:     MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);
1763:     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1764:     MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);
1765:     if (isize > 1) ++cismax;
1766:   }
1767:   /*
1768:      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
1769:      ispar is the maximum of cismax over C's comm, so it is nonzero iff at least one rank holds a parallel IS.
1770:   */
1771:   MPI_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
1772:   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
1773:     (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);
1774:     return(0);
1775:   }

1777:   /* if (ispar) */
1778:   /*
1779:     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
1780:     These are used to extract the off-diag portion of the resulting parallel matrix.
1781:     The row IS for the off-diag portion is the same as for the diag portion,
1782:     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
1783:   */
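  /*
    Illustrative example (hypothetical sizes and values): if the parallel IS iscol[i] lives on a
    2-rank comm with local parts {0,2} on rank 0 and {5,7} on rank 1, then ISGetNonlocalIS() returns
    {5,7} on rank 0 and {0,2} on rank 1 -- the entries held by the other ranks, which are the columns
    needed for the off-diag portion of the resulting parallel submatrix.
  */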
1784:   PetscMalloc2(cismax,&cisrow,cismax,&ciscol);
1785:   PetscMalloc1(cismax,&ciscol_p);
1786:   for (i = 0, ii = 0; i < ismax; ++i) {
1787:     MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
1788:     if (isize > 1) {
1789:       /*
1790:          TODO: This is the part that's ***NOT SCALABLE***.
1791:          To fix this we need to extract just the indices of C's nonzero columns
1792:          that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
1793:          part of iscol[i] -- without actually computing ciscol[ii]. This also has
1794:          to be done without serializing on the IS list, so, most likely, it is best
1795:          done by rewriting MatGetSubMatrices_MPIAIJ() directly.
1796:       */
1797:       ISGetNonlocalIS(iscol[i],&(ciscol[ii]));
1798:       /* Now we have to
1799:          (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
1800:              were sorted on each rank, concatenated they might no longer be sorted;
1801:          (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
1802:              indices in the nondecreasing order to the original index positions.
1803:          If ciscol[ii] is strictly increasing, the permutation IS is NULL.
1804:       */
1805:       ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);
1806:       ISSort(ciscol[ii]);
1807:       ++ii;
1808:     }
1809:   }
1810:   PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);
1811:   for (i = 0, ii = 0; i < ismax; ++i) {
1812:     PetscInt       j,issize;
1813:     const PetscInt *indices;

1815:     /*
1816:        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
1817:      */
1818:     ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);
1819:     ISSort(isrow[i]);
1820:     ISGetLocalSize(isrow[i],&issize);
1821:     ISGetIndices(isrow[i],&indices);
1822:     for (j = 1; j < issize; ++j) {
1823:       if (indices[j] == indices[j-1]) {
1824:         SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
1825:       }
1826:     }
1827:     ISRestoreIndices(isrow[i],&indices);


1830:     ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);
1831:     ISSort(iscol[i]);
1832:     ISGetLocalSize(iscol[i],&issize);
1833:     ISGetIndices(iscol[i],&indices);
1834:     for (j = 1; j < issize; ++j) {
1835:       if (indices[j-1] == indices[j]) {
1836:         SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
1837:       }
1838:     }
1839:     ISRestoreIndices(iscol[i],&indices);
1840:     MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
1841:     if (isize > 1) {
1842:       cisrow[ii] = isrow[i];
1843:       ++ii;
1844:     }
1845:   }
1846:   /*
1847:     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1848:     array of sequential matrices underlying the resulting parallel matrices.
1849:     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
1850:     contain duplicates.

1852:     There are as many diag matrices as there are original index sets. There are only as many parallel
1853:     and off-diag matrices as there are parallel (comm size > 1) index sets.

1855:     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
1856:     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
1857:       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
1858:       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
1859:       with A[i] and B[ii] extracted from the corresponding MPI submat.
1860:     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
1861:       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
1862:       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
1863:       (or retrieve previously composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute
1864:       their values into A[i] and B[ii] sitting inside the corresponding submat.
1865:     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
1866:       A[i], B[ii], AA[i] or BB[ii] matrices.
1867:   */
1868:   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
1869:   if (scall == MAT_INITIAL_MATRIX) {
1870:     PetscMalloc1(ismax,submat);
1871:     /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
1872:   } else {
1873:     PetscMalloc1(ismax,&A);
1874:     PetscMalloc1(cismax,&B);
1875:     /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */
1876:     for (i = 0, ii = 0; i < ismax; ++i) {
1877:       MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
1878:       if (isize > 1) {
1879:         Mat AA,BB;
1880:         (*getlocalmats)((*submat)[i],&AA,&BB);
1881:         if (!isrow_p[i] && !iscol_p[i]) {
1882:           A[i] = AA;
1883:         } else {
1884:           /* TODO: extract A[i] composed on AA. */
1885:           MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);
1886:         }
1887:         if (!isrow_p[i] && !ciscol_p[ii]) {
1888:           B[ii] = BB;
1889:         } else {
1890:           /* TODO: extract B[ii] composed on BB. */
1891:           MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);
1892:         }
1893:         ++ii;
1894:       } else {
1895:         if (!isrow_p[i] && !iscol_p[i]) {
1896:           A[i] = (*submat)[i];
1897:         } else {
1898:           /* TODO: extract A[i] composed on (*submat)[i]. */
1899:           MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);
1900:         }
1901:       }
1902:     }
1903:   }
1904:   /* Now obtain the sequential A and B submatrices separately. */
1905:   (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);
1906:   (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);
1907:   /*
1908:     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
1909:     matrices A & B have been extracted directly into the parallel matrices containing them, or
1910:     simply into the sequential matrix identical with the corresponding A (if isize == 1).
1911:     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
1912:     to have the same sparsity pattern.
1913:     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
1914:     must be constructed for C. This is done by setseqmat(s).
1915:   */
1916:   for (i = 0, ii = 0; i < ismax; ++i) {
1917:     /*
1918:        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
1919:        That way we can avoid sorting and computing permutations when reusing.
1920:        To this end:
1921:         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
1922:         - if caching arrays to hold the ISs, make and compose a container for them so that it can
1923:           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
1924:     */
1925:     MatStructure pattern;
1926:     if (scall == MAT_INITIAL_MATRIX) {
1927:       pattern = DIFFERENT_NONZERO_PATTERN;
1928:     } else {
1929:       pattern = SAME_NONZERO_PATTERN;
1930:     }
1931:     MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
1932:     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
1933:     if (isize > 1) {
1934:       if (scall == MAT_INITIAL_MATRIX) {
1935:         MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);
1936:         MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1937:         MatSetType((*submat)[i],MATMPIAIJ);
1938:         PetscLayoutSetUp((*submat)[i]->rmap);
1939:         PetscLayoutSetUp((*submat)[i]->cmap);
1940:       }
1941:       /*
1942:         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
1943:       */
1944:       {
1945:         Mat AA,BB;
1946:         AA = NULL;
1947:         if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
1948:         BB = NULL;
1949:         if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
1950:         if (AA || BB) {
1951:           setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);
1952:           MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);
1953:           MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);
1954:         }
1955:         if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
1956:           /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
1957:           MatDestroy(&AA);
1958:         }
1959:         if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
1960:           /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[ii]) && MAT_INITIAL_MATRIX. */
1961:           MatDestroy(&BB);
1962:         }
1963:       }
1964:       ISDestroy(ciscol+ii);
1965:       ISDestroy(ciscol_p+ii);
1966:       ++ii;
1967:     } else { /* if (isize == 1) */
1968:       if (scall == MAT_INITIAL_MATRIX) {
1969:         if (isrow_p[i] || iscol_p[i]) {
1970:           MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);
1971:         } else (*submat)[i] = A[i];
1972:       }
1973:       if (isrow_p[i] || iscol_p[i]) {
1974:         setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);
1975:         /* Otherwise A is extracted straight into (*submat)[i]. */
1976:         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
1977:         MatDestroy(A+i);
1978:       }
1979:     }
1980:     ISDestroy(&isrow_p[i]);
1981:     ISDestroy(&iscol_p[i]);
1982:   }
1983:   PetscFree2(cisrow,ciscol);
1984:   PetscFree2(isrow_p,iscol_p);
1985:   PetscFree(ciscol_p);
1986:   PetscFree(A);
1987:   PetscFree(B);
1988:   return(0);
1989: }
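/*
  The wrapper below instantiates the generic driver above with the AIJ-specific callbacks:
  MatGetSubMatrices_MPIAIJ extracts the sequential pieces, MatGetSeqMats_MPIAIJ exposes the
  diag/off-diag blocks of an existing MPIAIJ submatrix, and MatSetSeqMat_SeqAIJ /
  MatSetSeqMats_MPIAIJ embed the extracted pieces back into the sequential or parallel result.
*/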



1995: PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1996: {

2000:   MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);
2001:   return(0);
2002: }
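/*
  Usage sketch (hedged: assumes the public entry point MatGetSubMatricesMPI() dispatches to the
  routine above for MPIAIJ matrices; error checking omitted).  A caller holding one row/column IS
  pair that lives on a subcommunicator of C's communicator could extract, and later refresh, a
  parallel submatrix as follows:

     IS  isrow[1],iscol[1];   // parallel ISs created on a subcomm of C's communicator
     Mat *submat;
     MatGetSubMatricesMPI(C,1,isrow,iscol,MAT_INITIAL_MATRIX,&submat);
     // ... use submat[0], an MPIAIJ matrix distributed over the ISs' communicator ...
     MatGetSubMatricesMPI(C,1,isrow,iscol,MAT_REUSE_MATRIX,&submat);
*/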