Actual source code: mpiov.c
petsc-3.6.1 2015-08-06
2: /*
3: Routines to compute overlapping regions of a parallel MPI matrix
4: and to extract submatrices that are shared across processors.
5: */
6: #include <../src/mat/impls/aij/seq/aij.h>
7: #include <../src/mat/impls/aij/mpi/mpiaij.h>
8: #include <petscbt.h>
10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
18: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
19: {
21: PetscInt i;
24: if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25: for (i=0; i<ov; ++i) {
26: MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
27: }
28: return(0);
29: }
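/*
   A minimal usage sketch (not part of the original file): the public MatIncreaseOverlap()
   entry point dispatches to MatIncreaseOverlap_MPIAIJ() above for MPIAIJ matrices.  The
   helper name and the choice of starting from the locally owned rows are assumptions made
   only for illustration.

     static PetscErrorCode GrowLocalRowSet(Mat A,PetscInt ov,IS *is)
     {
       PetscErrorCode ierr;
       PetscInt       rstart,rend;

       PetscFunctionBegin;
       ierr = MatGetOwnershipRange(A,&rstart,&rend);CHKERRQ(ierr);
       // start from the locally owned rows of A
       ierr = ISCreateStride(PETSC_COMM_SELF,rend-rstart,rstart,1,is);CHKERRQ(ierr);
       // each overlap level applies MatIncreaseOverlap_MPIAIJ_Once() (below) once
       ierr = MatIncreaseOverlap(A,1,is,ov);CHKERRQ(ierr);
       PetscFunctionReturn(0);
     }

   The resulting index set can then be passed to MatGetSubMatrices() (later in this file)
   to pull out the overlapping submatrix, e.g. for additive Schwarz subdomain solves.
*/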
31: /*
32:   Sample message format:
33:   If processor A wants processor B to process some elements corresponding
34:   to index sets is[1] and is[5]:
35:   mesg [0] = 2   (number of index sets in the message)
36:   -----------
37:   mesg [1] = 1 => is[1]
38:   mesg [2] = sizeof(is[1]);
39:   -----------
40:   mesg [3] = 5 => is[5]
41:   mesg [4] = sizeof(is[5]);
42:   -----------
43:   mesg [5]
44:   mesg [n]   data of is[1]
45:   -----------
46:   mesg [n+1]
47:   mesg [m]   data of is[5]
48:   -----------
50:   Notes:
51:   nrqs - number of requests sent (or to be sent out)
52:   nrqr - number of requests received (which have to be, or have been, processed)
53: */
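/*
   A concrete instance of the format above (hypothetical numbers, for illustration only):
   processor A asks processor B to handle 3 rows belonging to is[1] and 2 rows belonging
   to is[5]; the rows listed are owned by B.

     mesg = { 2,              header: two index sets in this message
              1, 3,           is[1] contributes 3 rows
              5, 2,           is[5] contributes 2 rows
              10, 12, 17,     the 3 row indices for is[1]
              40, 41 }        the 2 row indices for is[5]
*/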
56: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
57: {
58: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
59: PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
60: const PetscInt **idx,*idx_i;
61: PetscInt *n,**data,len;
63: PetscMPIInt size,rank,tag1,tag2;
64: PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
65: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
66: PetscBT *table;
67: MPI_Comm comm;
68: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
69: MPI_Status *s_status,*recv_status;
70: char *t_p;
73: PetscObjectGetComm((PetscObject)C,&comm);
74: size = c->size;
75: rank = c->rank;
76: M = C->rmap->N;
78: PetscObjectGetNewTag((PetscObject)C,&tag1);
79: PetscObjectGetNewTag((PetscObject)C,&tag2);
81: PetscMalloc2(imax,&idx,imax,&n);
83: for (i=0; i<imax; i++) {
84: ISGetIndices(is[i],&idx[i]);
85: ISGetLocalSize(is[i],&n[i]);
86: }
88: /* evaluate communication - message to whom, length of message, and buffer space
89:    required. Based on this, buffers are allocated and data copied into them */
90: PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
91: PetscMemzero(w1,size*sizeof(PetscMPIInt)); /* initialise work vector*/
92: PetscMemzero(w2,size*sizeof(PetscMPIInt)); /* initialise work vector*/
93: PetscMemzero(w3,size*sizeof(PetscMPIInt)); /* initialise work vector*/
94: for (i=0; i<imax; i++) {
95: PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialise work vector*/
96: idx_i = idx[i];
97: len = n[i];
98: for (j=0; j<len; j++) {
99: row = idx_i[j];
100: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
101: PetscLayoutFindOwner(C->rmap,row,&proc);
102: w4[proc]++;
103: }
104: for (j=0; j<size; j++) {
105: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
106: }
107: }
109: nrqs = 0; /* no of outgoing messages */
110: msz = 0; /* total message length (for all procs) */
111: w1[rank] = 0; /* no message sent to itself */
112: w3[rank] = 0;
113: for (i=0; i<size; i++) {
114: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
115: }
116: /* pa - is list of processors to communicate with */
117: PetscMalloc1(nrqs+1,&pa);
118: for (i=0,j=0; i<size; i++) {
119: if (w1[i]) {pa[j] = i; j++;}
120: }
122: /* Each message would have a header = 1 + 2*(no of IS) + data */
123: for (i=0; i<nrqs; i++) {
124: j = pa[i];
125: w1[j] += w2[j] + 2*w3[j];
126: msz += w1[j];
127: }
129: /* Determine the number of messages to expect, their lengths, and the from-ids */
130: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
131: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
133: /* Now post the Irecvs corresponding to these messages */
134: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
136: /* Allocate Memory for outgoing messages */
137: PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
138: PetscMemzero(outdat,size*sizeof(PetscInt*));
139: PetscMemzero(ptr,size*sizeof(PetscInt*));
141: {
142: PetscInt *iptr = tmp,ict = 0;
143: for (i=0; i<nrqs; i++) {
144: j = pa[i];
145: iptr += ict;
146: outdat[j] = iptr;
147: ict = w1[j];
148: }
149: }
151: /* Form the outgoing messages */
152: /*plug in the headers*/
153: for (i=0; i<nrqs; i++) {
154: j = pa[i];
155: outdat[j][0] = 0;
156: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
157: ptr[j] = outdat[j] + 2*w3[j] + 1;
158: }
160: /* Memory for doing local proc's work*/
161: {
162: PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);
164: for (i=0; i<imax; i++) {
165: table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
166: data[i] = d_p + M*i;
167: }
168: }
170: /* Parse the IS and update local tables and the outgoing buf with the data*/
171: {
172: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
173: PetscBT table_i;
175: for (i=0; i<imax; i++) {
176: PetscMemzero(ctr,size*sizeof(PetscInt));
177: n_i = n[i];
178: table_i = table[i];
179: idx_i = idx[i];
180: data_i = data[i];
181: isz_i = isz[i];
182: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
183: row = idx_i[j];
184: PetscLayoutFindOwner(C->rmap,row,&proc);
185: if (proc != rank) { /* copy to the outgoing buffer */
186: ctr[proc]++;
187: *ptr[proc] = row;
188: ptr[proc]++;
189: } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
190: }
191: /* Update the headers for the current IS */
192: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
193: if ((ctr_j = ctr[j])) {
194: outdat_j = outdat[j];
195: k = ++outdat_j[0];
196: outdat_j[2*k] = ctr_j;
197: outdat_j[2*k-1] = i;
198: }
199: }
200: isz[i] = isz_i;
201: }
202: }
204: /* Now post the sends */
205: PetscMalloc1(nrqs+1,&s_waits1);
206: for (i=0; i<nrqs; ++i) {
207: j = pa[i];
208: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
209: }
211: /* No longer need the original indices*/
212: for (i=0; i<imax; ++i) {
213: ISRestoreIndices(is[i],idx+i);
214: }
215: PetscFree2(idx,n);
217: for (i=0; i<imax; ++i) {
218: ISDestroy(&is[i]);
219: }
221: /* Do Local work*/
222: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);
224: /* Receive messages*/
225: PetscMalloc1(nrqr+1,&recv_status);
226: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
228: PetscMalloc1(nrqs+1,&s_status);
229: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
231: /* Phase 1 sends are complete - deallocate buffers */
232: PetscFree4(outdat,ptr,tmp,ctr);
233: PetscFree4(w1,w2,w3,w4);
235: PetscMalloc1(nrqr+1,&xdata);
236: PetscMalloc1(nrqr+1,&isz1);
237: MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
238: PetscFree(rbuf[0]);
239: PetscFree(rbuf);
242: /* Send the data back*/
243: /* Communicate to determine the buffer space required for the incoming messages */
244: {
245: PetscMPIInt *rw1;
247: PetscCalloc1(size,&rw1);
249: for (i=0; i<nrqr; ++i) {
250: proc = recv_status[i].MPI_SOURCE;
252: if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
253: rw1[proc] = isz1[i];
254: }
255: PetscFree(onodes1);
256: PetscFree(olengths1);
258: /* Determine the number of messages to expect, their lengths, and the from-ids */
259: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
260: PetscFree(rw1);
261: }
262: /* Now post the Irecvs corresponding to these messages */
263: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
265: /* Now post the sends */
266: PetscMalloc1(nrqr+1,&s_waits2);
267: for (i=0; i<nrqr; ++i) {
268: j = recv_status[i].MPI_SOURCE;
269: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
270: }
272: /* receive work done on other processors*/
273: {
274: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
275: PetscMPIInt idex;
276: PetscBT table_i;
277: MPI_Status *status2;
279: PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);
280: for (i=0; i<nrqs; ++i) {
281: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
282: /* Process the message*/
283: rbuf2_i = rbuf2[idex];
284: ct1 = 2*rbuf2_i[0]+1;
285: jmax = rbuf2[idex][0];
286: for (j=1; j<=jmax; j++) {
287: max = rbuf2_i[2*j];
288: is_no = rbuf2_i[2*j-1];
289: isz_i = isz[is_no];
290: data_i = data[is_no];
291: table_i = table[is_no];
292: for (k=0; k<max; k++,ct1++) {
293: row = rbuf2_i[ct1];
294: if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
295: }
296: isz[is_no] = isz_i;
297: }
298: }
300: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
301: PetscFree(status2);
302: }
304: for (i=0; i<imax; ++i) {
305: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);
306: }
308: PetscFree(onodes2);
309: PetscFree(olengths2);
311: PetscFree(pa);
312: PetscFree(rbuf2[0]);
313: PetscFree(rbuf2);
314: PetscFree(s_waits1);
315: PetscFree(r_waits1);
316: PetscFree(s_waits2);
317: PetscFree(r_waits2);
318: PetscFree5(table,data,isz,d_p,t_p);
319: PetscFree(s_status);
320: PetscFree(recv_status);
321: PetscFree(xdata[0]);
322: PetscFree(xdata);
323: PetscFree(isz1);
324: return(0);
325: }
329: /*
330:     MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap_MPIAIJ_Once() to do
331:        the work on the local processor.
333:      Inputs:
334:       C      - the MPIAIJ matrix;
335:       imax   - total number of index sets processed at a time;
336:       table  - an array of PetscBT bit tables, one per index set, each M (= global number of rows) bits long.
338: Output:
339: isz - array containing the count of the solution elements corresponding
340: to each index set;
341: data - pointer to the solutions
342: */
343: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
344: {
345: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
346: Mat A = c->A,B = c->B;
347: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
348: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
349: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
350: PetscBT table_i;
353: rstart = C->rmap->rstart;
354: cstart = C->cmap->rstart;
355: ai = a->i;
356: aj = a->j;
357: bi = b->i;
358: bj = b->j;
359: garray = c->garray;
362: for (i=0; i<imax; i++) {
363: data_i = data[i];
364: table_i = table[i];
365: isz_i = isz[i];
366: for (j=0,max=isz[i]; j<max; j++) {
367: row = data_i[j] - rstart;
368: start = ai[row];
369: end = ai[row+1];
370: for (k=start; k<end; k++) { /* Amat */
371: val = aj[k] + cstart;
372: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
373: }
374: start = bi[row];
375: end = bi[row+1];
376: for (k=start; k<end; k++) { /* Bmat */
377: val = garray[bj[k]];
378: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
379: }
380: }
381: isz[i] = isz_i;
382: }
383: return(0);
384: }
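/*
   The duplicate-filtering idiom used above (and throughout this file), shown in isolation.
   This is a minimal sketch, not part of the original file; the function name and the sample
   values are illustrative only.  PetscBTLookupSet() returns the previous value of the bit
   and sets it, so a value is appended to the output list only the first time it is seen.

     static PetscErrorCode FilterDuplicates(void)
     {
       PetscErrorCode ierr;
       PetscBT        seen;
       PetscInt       vals[5] = {3,7,3,12,7},out[5],nout = 0,i;

       PetscFunctionBegin;
       ierr = PetscBTCreate(64,&seen);CHKERRQ(ierr);   // one bit per admissible value in [0,64)
       for (i=0; i<5; i++) {
         if (!PetscBTLookupSet(seen,vals[i])) out[nout++] = vals[i];   // out ends up as {3,7,12}
       }
       ierr = PetscBTDestroy(&seen);CHKERRQ(ierr);
       PetscFunctionReturn(0);
     }
*/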
388: /*
389: MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
390: and return the output
392: Input:
393: C - the matrix
394: nrqr - no of messages being processed.
395: rbuf - an array of pointers to the recieved requests
397: Output:
398: xdata - array of messages to be sent back
399: isz1 - size of each message
401: For better efficiency perhaps we should malloc separately each xdata[i],
402: then if a remalloc is required we need only copy the data for that one row
403: rather then all previous rows as it is now where a single large chunck of
404: memory is used.
406: */
407: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
408: {
409: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
410: Mat A = c->A,B = c->B;
411: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
413: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
414: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
415: PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
416: PetscInt *rbuf_i,kmax,rbuf_0;
417: PetscBT xtable;
420: m = C->rmap->N;
421: rstart = C->rmap->rstart;
422: cstart = C->cmap->rstart;
423: ai = a->i;
424: aj = a->j;
425: bi = b->i;
426: bj = b->j;
427: garray = c->garray;
430: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
431: rbuf_i = rbuf[i];
432: rbuf_0 = rbuf_i[0];
433: ct += rbuf_0;
434: for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
435: }
437: if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
438: else max1 = 1;
439: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
440: PetscMalloc1(mem_estimate,&xdata[0]);
441: ++no_malloc;
442: PetscBTCreate(m,&xtable);
443: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
445: ct3 = 0;
446: for (i=0; i<nrqr; i++) { /* for each message from proc i */
447: rbuf_i = rbuf[i];
448: rbuf_0 = rbuf_i[0];
449: ct1 = 2*rbuf_0+1;
450: ct2 = ct1;
451: ct3 += ct1;
452: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
453: PetscBTMemzero(m,xtable);
454: oct2 = ct2;
455: kmax = rbuf_i[2*j];
456: for (k=0; k<kmax; k++,ct1++) {
457: row = rbuf_i[ct1];
458: if (!PetscBTLookupSet(xtable,row)) {
459: if (!(ct3 < mem_estimate)) {
460: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
461: PetscMalloc1(new_estimate,&tmp);
462: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
463: PetscFree(xdata[0]);
464: xdata[0] = tmp;
465: mem_estimate = new_estimate; ++no_malloc;
466: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
467: }
468: xdata[i][ct2++] = row;
469: ct3++;
470: }
471: }
472: for (k=oct2,max2=ct2; k<max2; k++) {
473: row = xdata[i][k] - rstart;
474: start = ai[row];
475: end = ai[row+1];
476: for (l=start; l<end; l++) {
477: val = aj[l] + cstart;
478: if (!PetscBTLookupSet(xtable,val)) {
479: if (!(ct3 < mem_estimate)) {
480: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
481: PetscMalloc1(new_estimate,&tmp);
482: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
483: PetscFree(xdata[0]);
484: xdata[0] = tmp;
485: mem_estimate = new_estimate; ++no_malloc;
486: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
487: }
488: xdata[i][ct2++] = val;
489: ct3++;
490: }
491: }
492: start = bi[row];
493: end = bi[row+1];
494: for (l=start; l<end; l++) {
495: val = garray[bj[l]];
496: if (!PetscBTLookupSet(xtable,val)) {
497: if (!(ct3 < mem_estimate)) {
498: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
499: PetscMalloc1(new_estimate,&tmp);
500: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
501: PetscFree(xdata[0]);
502: xdata[0] = tmp;
503: mem_estimate = new_estimate; ++no_malloc;
504: for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
505: }
506: xdata[i][ct2++] = val;
507: ct3++;
508: }
509: }
510: }
511: /* Update the header*/
512: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
513: xdata[i][2*j-1] = rbuf_i[2*j-1];
514: }
515: xdata[i][0] = rbuf_0;
516: xdata[i+1] = xdata[i] + ct2;
517: isz1[i] = ct2; /* size of each message */
518: }
519: PetscBTDestroy(&xtable);
520: PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
521: return(0);
522: }
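/*
   A hypothetical helper (not part of the original file) that consolidates the buffer-growth
   step repeated three times in MatIncreaseOverlap_MPIAIJ_Receive() above: grow the single
   packed allocation by roughly 1.5x, copy the old contents over, and re-base the per-message
   pointers xdata[1..i] that alias it.

     static PetscErrorCode GrowPackedBuffer(PetscInt **xdata,PetscInt *isz1,PetscInt i,PetscInt *mem_estimate)
     {
       PetscErrorCode ierr;
       PetscInt       new_estimate = (PetscInt)(1.5*(*mem_estimate))+1,*tmp,ctr;

       PetscFunctionBegin;
       ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
       ierr = PetscMemcpy(tmp,xdata[0],(*mem_estimate)*sizeof(PetscInt));CHKERRQ(ierr);
       ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
       xdata[0]      = tmp;
       *mem_estimate = new_estimate;
       // the per-message pointers alias xdata[0]; re-base them after the move
       for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
       PetscFunctionReturn(0);
     }
*/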
523: /* -------------------------------------------------------------------------*/
524: extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
525: extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
526: /*
527: Every processor gets the entire matrix
528: */
531: PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
532: {
533: Mat B;
534: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
535: Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
537: PetscMPIInt size,rank,*recvcounts = 0,*displs = 0;
538: PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
539: PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
540: MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
543: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
544: MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);
546: if (scall == MAT_INITIAL_MATRIX) {
547: /* ----------------------------------------------------------------
548: Tell every processor the number of nonzeros per row
549: */
550: PetscMalloc1(A->rmap->N,&lens);
551: for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
552: lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
553: }
554: sendcount = A->rmap->rend - A->rmap->rstart;
555: PetscMalloc2(size,&recvcounts,size,&displs);
556: for (i=0; i<size; i++) {
557: recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
558: displs[i] = A->rmap->range[i];
559: }
560: #if defined(PETSC_HAVE_MPI_IN_PLACE)
561: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
562: #else
563: MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
564: #endif
565: /* ---------------------------------------------------------------
566: Create the sequential matrix of the same type as the local block diagonal
567: */
568: MatCreate(PETSC_COMM_SELF,&B);
569: MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);
570: MatSetBlockSizesFromMats(B,A,A);
571: MatSetType(B,((PetscObject)a->A)->type_name);
572: MatSeqAIJSetPreallocation(B,0,lens);
573: PetscMalloc1(1,Bin);
574: **Bin = B;
575: b = (Mat_SeqAIJ*)B->data;
577: /*--------------------------------------------------------------------
578: Copy my part of matrix column indices over
579: */
580: sendcount = ad->nz + bd->nz;
581: jsendbuf = b->j + b->i[rstarts[rank]];
582: a_jsendbuf = ad->j;
583: b_jsendbuf = bd->j;
584: n = A->rmap->rend - A->rmap->rstart;
585: cnt = 0;
586: for (i=0; i<n; i++) {
588: /* put in lower diagonal portion */
589: m = bd->i[i+1] - bd->i[i];
590: while (m > 0) {
591: /* is it above diagonal (in bd (compressed) numbering) */
592: if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
593: jsendbuf[cnt++] = garray[*b_jsendbuf++];
594: m--;
595: }
597: /* put in diagonal portion */
598: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
599: jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
600: }
602: /* put in upper diagonal portion */
603: while (m-- > 0) {
604: jsendbuf[cnt++] = garray[*b_jsendbuf++];
605: }
606: }
607: if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
609: /*--------------------------------------------------------------------
610: Gather all column indices to all processors
611: */
612: for (i=0; i<size; i++) {
613: recvcounts[i] = 0;
614: for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
615: recvcounts[i] += lens[j];
616: }
617: }
618: displs[0] = 0;
619: for (i=1; i<size; i++) {
620: displs[i] = displs[i-1] + recvcounts[i-1];
621: }
622: #if defined(PETSC_HAVE_MPI_IN_PLACE)
623: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
624: #else
625: MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
626: #endif
627: /*--------------------------------------------------------------------
628: Assemble the matrix into usable form (note: numerical values not yet set)
629: */
630: /* set the b->ilen (length of each row) values */
631: PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));
632: /* set the b->i indices */
633: b->i[0] = 0;
634: for (i=1; i<=A->rmap->N; i++) {
635: b->i[i] = b->i[i-1] + lens[i-1];
636: }
637: PetscFree(lens);
638: MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
639: MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
641: } else {
642: B = **Bin;
643: b = (Mat_SeqAIJ*)B->data;
644: }
646: /*--------------------------------------------------------------------
647: Copy my part of matrix numerical values into the values location
648: */
649: if (flag == MAT_GET_VALUES) {
650: sendcount = ad->nz + bd->nz;
651: sendbuf = b->a + b->i[rstarts[rank]];
652: a_sendbuf = ad->a;
653: b_sendbuf = bd->a;
654: b_sendj = bd->j;
655: n = A->rmap->rend - A->rmap->rstart;
656: cnt = 0;
657: for (i=0; i<n; i++) {
659: /* put in lower diagonal portion */
660: m = bd->i[i+1] - bd->i[i];
661: while (m > 0) {
662: /* is it above diagonal (in bd (compressed) numbering) */
663: if (garray[*b_sendj] > A->rmap->rstart + i) break;
664: sendbuf[cnt++] = *b_sendbuf++;
665: m--;
666: b_sendj++;
667: }
669: /* put in diagonal portion */
670: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
671: sendbuf[cnt++] = *a_sendbuf++;
672: }
674: /* put in upper diagonal portion */
675: while (m-- > 0) {
676: sendbuf[cnt++] = *b_sendbuf++;
677: b_sendj++;
678: }
679: }
680: if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
682: /* -----------------------------------------------------------------
683: Gather all numerical values to all processors
684: */
685: if (!recvcounts) {
686: PetscMalloc2(size,&recvcounts,size,&displs);
687: }
688: for (i=0; i<size; i++) {
689: recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
690: }
691: displs[0] = 0;
692: for (i=1; i<size; i++) {
693: displs[i] = displs[i-1] + recvcounts[i-1];
694: }
695: recvbuf = b->a;
696: #if defined(PETSC_HAVE_MPI_IN_PLACE)
697: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
698: #else
699: MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
700: #endif
701: } /* end of (flag == MAT_GET_VALUES) */
702: PetscFree2(recvcounts,displs);
704: if (A->symmetric) {
705: MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);
706: } else if (A->hermitian) {
707: MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);
708: } else if (A->structurally_symmetric) {
709: MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);
710: }
711: return(0);
712: }
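/*
   A minimal sketch of the CSR row-pointer construction used above (the prefix sum over the
   gathered row lengths); the lens values are hypothetical and used only for illustration.

     PetscInt lens[4] = {2,0,3,1},bi[5],i;

     bi[0] = 0;
     for (i=1; i<=4; i++) bi[i] = bi[i-1] + lens[i-1];   // bi = {0,2,2,5,6}
*/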
718: PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
719: {
721: PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
722: PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
726: /*
727: Check for special case: each processor gets entire matrix
728: */
729: if (ismax == 1 && C->rmap->N == C->cmap->N) {
730: ISIdentity(*isrow,&rowflag);
731: ISIdentity(*iscol,&colflag);
732: ISGetLocalSize(*isrow,&nrow);
733: ISGetLocalSize(*iscol,&ncol);
734: if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
735: wantallmatrix = PETSC_TRUE;
737: PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);
738: }
739: }
740: MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));
741: if (twantallmatrix) {
742: MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);
743: return(0);
744: }
746: /* Allocate memory to hold all the submatrices */
747: if (scall != MAT_REUSE_MATRIX) {
748: PetscMalloc1(ismax+1,submat);
749: }
751: /* Check for special case: each processor gets entire matrix columns */
752: PetscMalloc1(ismax+1,&allcolumns);
753: for (i=0; i<ismax; i++) {
754: ISIdentity(iscol[i],&colflag);
755: ISGetLocalSize(iscol[i],&ncol);
756: if (colflag && ncol == C->cmap->N) {
757: allcolumns[i] = PETSC_TRUE;
758: } else {
759: allcolumns[i] = PETSC_FALSE;
760: }
761: }
763: /* Determine the number of stages through which submatrices are done */
764: nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
766: /*
767: Each stage will extract nmax submatrices.
768: nmax is determined by the matrix column dimension.
769: If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
770: */
771: if (!nmax) nmax = 1;
772: nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
774: /* Make sure every processor loops through the nstages */
775: MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
777: for (i=0,pos=0; i<nstages; i++) {
778: if (pos+nmax <= ismax) max_no = nmax;
779: else if (pos == ismax) max_no = 0;
780: else max_no = ismax-pos;
781: MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);
782: pos += max_no;
783: }
785: PetscFree(allcolumns);
786: return(0);
787: }
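/*
   A minimal caller-level sketch (not part of the original file) of MatGetSubMatrices(), which
   dispatches to MatGetSubMatrices_MPIAIJ() above for MPIAIJ matrices.  The helper name, the
   block size bs, and the assumption that bs does not exceed the local row count are made only
   for illustration; the array returned in *submat is allocated by PETSc and should eventually
   be freed with MatDestroyMatrices().

     static PetscErrorCode ExtractLeadingDiagonalBlock(Mat C,PetscInt bs,Mat **submat)
     {
       PetscErrorCode ierr;
       PetscInt       rstart;
       IS             isrow,iscol;

       PetscFunctionBegin;
       ierr = MatGetOwnershipRange(C,&rstart,NULL);CHKERRQ(ierr);
       ierr = ISCreateStride(PETSC_COMM_SELF,bs,rstart,1,&isrow);CHKERRQ(ierr);
       ierr = ISCreateStride(PETSC_COMM_SELF,bs,rstart,1,&iscol);CHKERRQ(ierr);
       // each process extracts its own bs x bs block as a sequential matrix (*submat)[0]
       ierr = MatGetSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,submat);CHKERRQ(ierr);
       ierr = ISDestroy(&isrow);CHKERRQ(ierr);
       ierr = ISDestroy(&iscol);CHKERRQ(ierr);
       PetscFunctionReturn(0);
     }
*/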
789: /* -------------------------------------------------------------------------*/
792: PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
793: {
794: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
795: Mat A = c->A;
796: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
797: const PetscInt **icol,**irow;
798: PetscInt *nrow,*ncol,start;
800: PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
801: PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
802: PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
803: PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
804: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
805: #if defined(PETSC_USE_CTABLE)
806: PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
807: #else
808: PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
809: #endif
810: const PetscInt *irow_i;
811: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
812: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
813: MPI_Request *r_waits4,*s_waits3,*s_waits4;
814: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
815: MPI_Status *r_status3,*r_status4,*s_status4;
816: MPI_Comm comm;
817: PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
818: PetscMPIInt *onodes1,*olengths1;
819: PetscMPIInt idex,idex2,end;
822: PetscObjectGetComm((PetscObject)C,&comm);
823: tag0 = ((PetscObject)C)->tag;
824: size = c->size;
825: rank = c->rank;
827: /* Get some new tags to keep the communication clean */
828: PetscObjectGetNewTag((PetscObject)C,&tag1);
829: PetscObjectGetNewTag((PetscObject)C,&tag2);
830: PetscObjectGetNewTag((PetscObject)C,&tag3);
832: PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);
834: for (i=0; i<ismax; i++) {
835: ISGetIndices(isrow[i],&irow[i]);
836: ISGetLocalSize(isrow[i],&nrow[i]);
837: if (allcolumns[i]) {
838: icol[i] = NULL;
839: ncol[i] = C->cmap->N;
840: } else {
841: ISGetIndices(iscol[i],&icol[i]);
842: ISGetLocalSize(iscol[i],&ncol[i]);
843: }
844: }
846: /* evaluate communication - message to whom, length of message, and buffer space
847:    required. Based on this, buffers are allocated and data copied into them */
848: PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4); /* mesg size */
849: PetscMemzero(w1,size*sizeof(PetscMPIInt)); /* initialize work vector*/
850: PetscMemzero(w2,size*sizeof(PetscMPIInt)); /* initialize work vector*/
851: PetscMemzero(w3,size*sizeof(PetscMPIInt)); /* initialize work vector*/
852: for (i=0; i<ismax; i++) {
853: PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialize work vector*/
854: jmax = nrow[i];
855: irow_i = irow[i];
856: for (j=0; j<jmax; j++) {
857: l = 0;
858: row = irow_i[j];
859: while (row >= C->rmap->range[l+1]) l++;
860: proc = l;
861: w4[proc]++;
862: }
863: for (j=0; j<size; j++) {
864: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
865: }
866: }
868: nrqs = 0; /* no of outgoing messages */
869: msz = 0; /* total mesg length (for all procs) */
870: w1[rank] = 0; /* no mesg sent to self */
871: w3[rank] = 0;
872: for (i=0; i<size; i++) {
873: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
874: }
875: PetscMalloc1(nrqs+1,&pa); /*(proc -array)*/
876: for (i=0,j=0; i<size; i++) {
877: if (w1[i]) { pa[j] = i; j++; }
878: }
880: /* Each message would have a header = 1 + 2*(no of IS) + data */
881: for (i=0; i<nrqs; i++) {
882: j = pa[i];
883: w1[j] += w2[j] + 2* w3[j];
884: msz += w1[j];
885: }
886: PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);
888: /* Determine the number of messages to expect, their lengths, and the from-ids */
889: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
890: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
892: /* Now post the Irecvs corresponding to these messages */
893: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
895: PetscFree(onodes1);
896: PetscFree(olengths1);
898: /* Allocate Memory for outgoing messages */
899: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
900: PetscMemzero(sbuf1,size*sizeof(PetscInt*));
901: PetscMemzero(ptr,size*sizeof(PetscInt*));
903: {
904: PetscInt *iptr = tmp,ict = 0;
905: for (i=0; i<nrqs; i++) {
906: j = pa[i];
907: iptr += ict;
908: sbuf1[j] = iptr;
909: ict = w1[j];
910: }
911: }
913: /* Form the outgoing messages */
914: /* Initialize the header space */
915: for (i=0; i<nrqs; i++) {
916: j = pa[i];
917: sbuf1[j][0] = 0;
918: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
919: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
920: }
922: /* Parse the isrow and copy data into outbuf */
923: for (i=0; i<ismax; i++) {
924: PetscMemzero(ctr,size*sizeof(PetscInt));
925: irow_i = irow[i];
926: jmax = nrow[i];
927: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
928: l = 0;
929: row = irow_i[j];
930: while (row >= C->rmap->range[l+1]) l++;
931: proc = l;
932: if (proc != rank) { /* copy to the outgoing buf*/
933: ctr[proc]++;
934: *ptr[proc] = row;
935: ptr[proc]++;
936: }
937: }
938: /* Update the headers for the current IS */
939: for (j=0; j<size; j++) { /* Can Optimise this loop too */
940: if ((ctr_j = ctr[j])) {
941: sbuf1_j = sbuf1[j];
942: k = ++sbuf1_j[0];
943: sbuf1_j[2*k] = ctr_j;
944: sbuf1_j[2*k-1] = i;
945: }
946: }
947: }
949: /* Now post the sends */
950: PetscMalloc1(nrqs+1,&s_waits1);
951: for (i=0; i<nrqs; ++i) {
952: j = pa[i];
953: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
954: }
956: /* Post Receives to capture the buffer size */
957: PetscMalloc1(nrqs+1,&r_waits2);
958: PetscMalloc1(nrqs+1,&rbuf2);
959: rbuf2[0] = tmp + msz;
960: for (i=1; i<nrqs; ++i) {
961: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
962: }
963: for (i=0; i<nrqs; ++i) {
964: j = pa[i];
965: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
966: }
968: /* Send to other procs the buf size they should allocate */
971: /* Receive messages*/
972: PetscMalloc1(nrqr+1,&s_waits2);
973: PetscMalloc1(nrqr+1,&r_status1);
974: PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);
975: {
976: Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
977: PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
978: PetscInt *sbuf2_i;
980: for (i=0; i<nrqr; ++i) {
981: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
983: req_size[idex] = 0;
984: rbuf1_i = rbuf1[idex];
985: start = 2*rbuf1_i[0] + 1;
986: MPI_Get_count(r_status1+i,MPIU_INT,&end);
987: PetscMalloc1(end+1,&sbuf2[idex]);
988: sbuf2_i = sbuf2[idex];
989: for (j=start; j<end; j++) {
990: id = rbuf1_i[j] - rstart;
991: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
992: sbuf2_i[j] = ncols;
993: req_size[idex] += ncols;
994: }
995: req_source[idex] = r_status1[i].MPI_SOURCE;
996: /* form the header */
997: sbuf2_i[0] = req_size[idex];
998: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1000: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
1001: }
1002: }
1003: PetscFree(r_status1);
1004: PetscFree(r_waits1);
1006: /* recv buffer sizes */
1007: /* Receive messages*/
1009: PetscMalloc1(nrqs+1,&rbuf3);
1010: PetscMalloc1(nrqs+1,&rbuf4);
1011: PetscMalloc1(nrqs+1,&r_waits3);
1012: PetscMalloc1(nrqs+1,&r_waits4);
1013: PetscMalloc1(nrqs+1,&r_status2);
1015: for (i=0; i<nrqs; ++i) {
1016: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
1017: PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);
1018: PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);
1019: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
1020: MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
1021: }
1022: PetscFree(r_status2);
1023: PetscFree(r_waits2);
1025: /* Wait on sends1 and sends2 */
1026: PetscMalloc1(nrqs+1,&s_status1);
1027: PetscMalloc1(nrqr+1,&s_status2);
1029: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
1030: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
1031: PetscFree(s_status1);
1032: PetscFree(s_status2);
1033: PetscFree(s_waits1);
1034: PetscFree(s_waits2);
1036: /* Now allocate buffers for a->j, and send them off */
1037: PetscMalloc1(nrqr+1,&sbuf_aj);
1038: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1039: PetscMalloc1(j+1,&sbuf_aj[0]);
1040: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1042: PetscMalloc1(nrqr+1,&s_waits3);
1043: {
1044: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1045: PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1046: PetscInt cend = C->cmap->rend;
1047: PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1049: for (i=0; i<nrqr; i++) {
1050: rbuf1_i = rbuf1[i];
1051: sbuf_aj_i = sbuf_aj[i];
1052: ct1 = 2*rbuf1_i[0] + 1;
1053: ct2 = 0;
1054: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1055: kmax = rbuf1[i][2*j];
1056: for (k=0; k<kmax; k++,ct1++) {
1057: row = rbuf1_i[ct1] - rstart;
1058: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1059: ncols = nzA + nzB;
1060: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1062: /* load the column indices for this row into cols*/
1063: cols = sbuf_aj_i + ct2;
1065: lwrite = 0;
1066: for (l=0; l<nzB; l++) {
1067: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1068: }
1069: for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1070: for (l=0; l<nzB; l++) {
1071: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1072: }
1074: ct2 += ncols;
1075: }
1076: }
1077: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
1078: }
1079: }
1080: PetscMalloc1(nrqs+1,&r_status3);
1081: PetscMalloc1(nrqr+1,&s_status3);
1083: /* Allocate buffers for a->a, and send them off */
1084: PetscMalloc1(nrqr+1,&sbuf_aa);
1085: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1086: PetscMalloc1(j+1,&sbuf_aa[0]);
1087: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1089: PetscMalloc1(nrqr+1,&s_waits4);
1090: {
1091: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1092: PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1093: PetscInt cend = C->cmap->rend;
1094: PetscInt *b_j = b->j;
1095: PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1097: for (i=0; i<nrqr; i++) {
1098: rbuf1_i = rbuf1[i];
1099: sbuf_aa_i = sbuf_aa[i];
1100: ct1 = 2*rbuf1_i[0]+1;
1101: ct2 = 0;
1102: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1103: kmax = rbuf1_i[2*j];
1104: for (k=0; k<kmax; k++,ct1++) {
1105: row = rbuf1_i[ct1] - rstart;
1106: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1107: ncols = nzA + nzB;
1108: cworkB = b_j + b_i[row];
1109: vworkA = a_a + a_i[row];
1110: vworkB = b_a + b_i[row];
1112: /* load the column values for this row into vals*/
1113: vals = sbuf_aa_i+ct2;
1115: lwrite = 0;
1116: for (l=0; l<nzB; l++) {
1117: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1118: }
1119: for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1120: for (l=0; l<nzB; l++) {
1121: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1122: }
1124: ct2 += ncols;
1125: }
1126: }
1127: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);
1128: }
1129: }
1130: PetscFree(rbuf1[0]);
1131: PetscFree(rbuf1);
1132: PetscMalloc1(nrqs+1,&r_status4);
1133: PetscMalloc1(nrqr+1,&s_status4);
1135: /* Form the matrix */
1136: /* create col map: global col of C -> local col of submatrices */
1137: {
1138: const PetscInt *icol_i;
1139: #if defined(PETSC_USE_CTABLE)
1140: PetscMalloc1(1+ismax,&cmap);
1141: for (i=0; i<ismax; i++) {
1142: if (!allcolumns[i]) {
1143: PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);
1145: jmax = ncol[i];
1146: icol_i = icol[i];
1147: cmap_i = cmap[i];
1148: for (j=0; j<jmax; j++) {
1149: PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);
1150: }
1151: } else {
1152: cmap[i] = NULL;
1153: }
1154: }
1155: #else
1156: PetscMalloc1(ismax,&cmap);
1157: for (i=0; i<ismax; i++) {
1158: if (!allcolumns[i]) {
1159: PetscMalloc1(C->cmap->N,&cmap[i]);
1160: PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));
1161: jmax = ncol[i];
1162: icol_i = icol[i];
1163: cmap_i = cmap[i];
1164: for (j=0; j<jmax; j++) {
1165: cmap_i[icol_i[j]] = j+1;
1166: }
1167: } else {
1168: cmap[i] = NULL;
1169: }
1170: }
1171: #endif
1172: }
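/*
   Note on the convention above (in both the PetscTable and the flat-array variant): column j of
   iscol[i] is stored as j+1 so that a lookup result of 0 can mean "this column of C is not in the
   submatrix".  For example (hypothetical), with iscol[i] = {4,9} the flat-array variant gives
   cmap_i[4] = 1, cmap_i[9] = 2, and cmap_i[k] = 0 for every other k; the code below therefore
   tests "if (tcol)" and uses tcol-1 as the local column index.
*/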
1174: /* Create lens which is required for MatCreate... */
1175: for (i=0,j=0; i<ismax; i++) j += nrow[i];
1176: PetscMalloc1(ismax,&lens);
1177: if (ismax) {
1178: PetscMalloc1(j,&lens[0]);
1179: PetscMemzero(lens[0],j*sizeof(PetscInt));
1180: }
1181: for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1183: /* Update lens from local data */
1184: for (i=0; i<ismax; i++) {
1185: jmax = nrow[i];
1186: if (!allcolumns[i]) cmap_i = cmap[i];
1187: irow_i = irow[i];
1188: lens_i = lens[i];
1189: for (j=0; j<jmax; j++) {
1190: l = 0;
1191: row = irow_i[j];
1192: while (row >= C->rmap->range[l+1]) l++;
1193: proc = l;
1194: if (proc == rank) {
1195: MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);
1196: if (!allcolumns[i]) {
1197: for (k=0; k<ncols; k++) {
1198: #if defined(PETSC_USE_CTABLE)
1199: PetscTableFind(cmap_i,cols[k]+1,&tcol);
1200: #else
1201: tcol = cmap_i[cols[k]];
1202: #endif
1203: if (tcol) lens_i[j]++;
1204: }
1205: } else { /* allcolumns */
1206: lens_i[j] = ncols;
1207: }
1208: MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);
1209: }
1210: }
1211: }
1213: /* Create row map: global row of C -> local row of submatrices */
1214: #if defined(PETSC_USE_CTABLE)
1215: PetscMalloc1(1+ismax,&rmap);
1216: for (i=0; i<ismax; i++) {
1217: PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);
1218: rmap_i = rmap[i];
1219: irow_i = irow[i];
1220: jmax = nrow[i];
1221: for (j=0; j<jmax; j++) {
1222: PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);
1223: }
1224: }
1225: #else
1226: PetscMalloc1(ismax,&rmap);
1227: if (ismax) {
1228: PetscMalloc1(ismax*C->rmap->N,&rmap[0]);
1229: PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));
1230: }
1231: for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
1232: for (i=0; i<ismax; i++) {
1233: rmap_i = rmap[i];
1234: irow_i = irow[i];
1235: jmax = nrow[i];
1236: for (j=0; j<jmax; j++) {
1237: rmap_i[irow_i[j]] = j;
1238: }
1239: }
1240: #endif
1242: /* Update lens from offproc data */
1243: {
1244: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1246: for (tmp2=0; tmp2<nrqs; tmp2++) {
1247: MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);
1248: idex = pa[idex2];
1249: sbuf1_i = sbuf1[idex];
1250: jmax = sbuf1_i[0];
1251: ct1 = 2*jmax+1;
1252: ct2 = 0;
1253: rbuf2_i = rbuf2[idex2];
1254: rbuf3_i = rbuf3[idex2];
1255: for (j=1; j<=jmax; j++) {
1256: is_no = sbuf1_i[2*j-1];
1257: max1 = sbuf1_i[2*j];
1258: lens_i = lens[is_no];
1259: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1260: rmap_i = rmap[is_no];
1261: for (k=0; k<max1; k++,ct1++) {
1262: #if defined(PETSC_USE_CTABLE)
1263: PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
1264: row--;
1265: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1266: #else
1267: row = rmap_i[sbuf1_i[ct1]]; /* the local row in the new submatrix */
1268: #endif
1269: max2 = rbuf2_i[ct1];
1270: for (l=0; l<max2; l++,ct2++) {
1271: if (!allcolumns[is_no]) {
1272: #if defined(PETSC_USE_CTABLE)
1273: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1274: #else
1275: tcol = cmap_i[rbuf3_i[ct2]];
1276: #endif
1277: if (tcol) lens_i[row]++;
1278: } else { /* allcolumns */
1279: lens_i[row]++; /* lens_i[row] += max2 ? */
1280: }
1281: }
1282: }
1283: }
1284: }
1285: }
1286: PetscFree(r_status3);
1287: PetscFree(r_waits3);
1288: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1289: PetscFree(s_status3);
1290: PetscFree(s_waits3);
1292: /* Create the submatrices */
1293: if (scall == MAT_REUSE_MATRIX) {
1294: PetscBool flag;
1296: /*
1297: Assumes new rows are the same length as the old rows, hence a bug!
1298: */
1299: for (i=0; i<ismax; i++) {
1300: mat = (Mat_SeqAIJ*)(submats[i]->data);
1301: if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1302: PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);
1303: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1304: /* Initialize matrix as if empty */
1305: PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));
1307: submats[i]->factortype = C->factortype;
1308: }
1309: } else {
1310: for (i=0; i<ismax; i++) {
1311: PetscInt rbs,cbs;
1312: ISGetBlockSize(isrow[i],&rbs);
1313: ISGetBlockSize(iscol[i],&cbs);
1315: MatCreate(PETSC_COMM_SELF,submats+i);
1316: MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);
1318: MatSetBlockSizes(submats[i],rbs,cbs);
1319: MatSetType(submats[i],((PetscObject)A)->type_name);
1320: MatSeqAIJSetPreallocation(submats[i],0,lens[i]);
1321: }
1322: }
1324: /* Assemble the matrices */
1325: /* First assemble the local rows */
1326: {
1327: PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1328: PetscScalar *imat_a;
1330: for (i=0; i<ismax; i++) {
1331: mat = (Mat_SeqAIJ*)submats[i]->data;
1332: imat_ilen = mat->ilen;
1333: imat_j = mat->j;
1334: imat_i = mat->i;
1335: imat_a = mat->a;
1337: if (!allcolumns[i]) cmap_i = cmap[i];
1338: rmap_i = rmap[i];
1339: irow_i = irow[i];
1340: jmax = nrow[i];
1341: for (j=0; j<jmax; j++) {
1342: l = 0;
1343: row = irow_i[j];
1344: while (row >= C->rmap->range[l+1]) l++;
1345: proc = l;
1346: if (proc == rank) {
1347: old_row = row;
1348: #if defined(PETSC_USE_CTABLE)
1349: PetscTableFind(rmap_i,row+1,&row);
1350: row--;
1351: #else
1352: row = rmap_i[row];
1353: #endif
1354: ilen_row = imat_ilen[row];
1355: MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1356: mat_i = imat_i[row];
1357: mat_a = imat_a + mat_i;
1358: mat_j = imat_j + mat_i;
1359: if (!allcolumns[i]) {
1360: for (k=0; k<ncols; k++) {
1361: #if defined(PETSC_USE_CTABLE)
1362: PetscTableFind(cmap_i,cols[k]+1,&tcol);
1363: #else
1364: tcol = cmap_i[cols[k]];
1365: #endif
1366: if (tcol) {
1367: *mat_j++ = tcol - 1;
1368: *mat_a++ = vals[k];
1369: ilen_row++;
1370: }
1371: }
1372: } else { /* allcolumns */
1373: for (k=0; k<ncols; k++) {
1374: *mat_j++ = cols[k]; /* global col index! */
1375: *mat_a++ = vals[k];
1376: ilen_row++;
1377: }
1378: }
1379: MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1381: imat_ilen[row] = ilen_row;
1382: }
1383: }
1384: }
1385: }
1387: /* Now assemble the off proc rows*/
1388: {
1389: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1390: PetscInt *imat_j,*imat_i;
1391: PetscScalar *imat_a,*rbuf4_i;
1393: for (tmp2=0; tmp2<nrqs; tmp2++) {
1394: MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);
1395: idex = pa[idex2];
1396: sbuf1_i = sbuf1[idex];
1397: jmax = sbuf1_i[0];
1398: ct1 = 2*jmax + 1;
1399: ct2 = 0;
1400: rbuf2_i = rbuf2[idex2];
1401: rbuf3_i = rbuf3[idex2];
1402: rbuf4_i = rbuf4[idex2];
1403: for (j=1; j<=jmax; j++) {
1404: is_no = sbuf1_i[2*j-1];
1405: rmap_i = rmap[is_no];
1406: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1407: mat = (Mat_SeqAIJ*)submats[is_no]->data;
1408: imat_ilen = mat->ilen;
1409: imat_j = mat->j;
1410: imat_i = mat->i;
1411: imat_a = mat->a;
1412: max1 = sbuf1_i[2*j];
1413: for (k=0; k<max1; k++,ct1++) {
1414: row = sbuf1_i[ct1];
1415: #if defined(PETSC_USE_CTABLE)
1416: PetscTableFind(rmap_i,row+1,&row);
1417: row--;
1418: #else
1419: row = rmap_i[row];
1420: #endif
1421: ilen = imat_ilen[row];
1422: mat_i = imat_i[row];
1423: mat_a = imat_a + mat_i;
1424: mat_j = imat_j + mat_i;
1425: max2 = rbuf2_i[ct1];
1426: if (!allcolumns[is_no]) {
1427: for (l=0; l<max2; l++,ct2++) {
1429: #if defined(PETSC_USE_CTABLE)
1430: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1431: #else
1432: tcol = cmap_i[rbuf3_i[ct2]];
1433: #endif
1434: if (tcol) {
1435: *mat_j++ = tcol - 1;
1436: *mat_a++ = rbuf4_i[ct2];
1437: ilen++;
1438: }
1439: }
1440: } else { /* allcolumns */
1441: for (l=0; l<max2; l++,ct2++) {
1442: *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1443: *mat_a++ = rbuf4_i[ct2];
1444: ilen++;
1445: }
1446: }
1447: imat_ilen[row] = ilen;
1448: }
1449: }
1450: }
1451: }
1453: /* sort the rows */
1454: {
1455: PetscInt *imat_ilen,*imat_j,*imat_i;
1456: PetscScalar *imat_a;
1458: for (i=0; i<ismax; i++) {
1459: mat = (Mat_SeqAIJ*)submats[i]->data;
1460: imat_j = mat->j;
1461: imat_i = mat->i;
1462: imat_a = mat->a;
1463: imat_ilen = mat->ilen;
1465: if (allcolumns[i]) continue;
1466: jmax = nrow[i];
1467: for (j=0; j<jmax; j++) {
1468: PetscInt ilen;
1470: mat_i = imat_i[j];
1471: mat_a = imat_a + mat_i;
1472: mat_j = imat_j + mat_i;
1473: ilen = imat_ilen[j];
1474: PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);
1475: }
1476: }
1477: }
1479: PetscFree(r_status4);
1480: PetscFree(r_waits4);
1481: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1482: PetscFree(s_waits4);
1483: PetscFree(s_status4);
1485: /* Restore the indices */
1486: for (i=0; i<ismax; i++) {
1487: ISRestoreIndices(isrow[i],irow+i);
1488: if (!allcolumns[i]) {
1489: ISRestoreIndices(iscol[i],icol+i);
1490: }
1491: }
1493: /* Destroy allocated memory */
1494: PetscFree4(irow,icol,nrow,ncol);
1495: PetscFree4(w1,w2,w3,w4);
1496: PetscFree(pa);
1498: PetscFree4(sbuf1,ptr,tmp,ctr);
1499: PetscFree(rbuf2);
1500: for (i=0; i<nrqr; ++i) {
1501: PetscFree(sbuf2[i]);
1502: }
1503: for (i=0; i<nrqs; ++i) {
1504: PetscFree(rbuf3[i]);
1505: PetscFree(rbuf4[i]);
1506: }
1508: PetscFree3(sbuf2,req_size,req_source);
1509: PetscFree(rbuf3);
1510: PetscFree(rbuf4);
1511: PetscFree(sbuf_aj[0]);
1512: PetscFree(sbuf_aj);
1513: PetscFree(sbuf_aa[0]);
1514: PetscFree(sbuf_aa);
1516: #if defined(PETSC_USE_CTABLE)
1517: for (i=0; i<ismax; i++) {PetscTableDestroy((PetscTable*)&rmap[i]);}
1518: #else
1519: if (ismax) {PetscFree(rmap[0]);}
1520: #endif
1521: PetscFree(rmap);
1523: for (i=0; i<ismax; i++) {
1524: if (!allcolumns[i]) {
1525: #if defined(PETSC_USE_CTABLE)
1526: PetscTableDestroy((PetscTable*)&cmap[i]);
1527: #else
1528: PetscFree(cmap[i]);
1529: #endif
1530: }
1531: }
1532: PetscFree(cmap);
1533: if (ismax) {PetscFree(lens[0]);}
1534: PetscFree(lens);
1536: for (i=0; i<ismax; i++) {
1537: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1538: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1539: }
1540: return(0);
1541: }
1543: /*
1544: Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
1545: Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
1546: of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
1547: If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
1548: After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
1549: state, and needs to be "assembled" later by compressing B's column space.
1551: This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
1552: Following this call, C->A & C->B have been created, even if empty.
1553: */
1556: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
1557: {
1558: /* If making this function public, change the error returned in this function away from _PLIB. */
1560: Mat_MPIAIJ *aij;
1561: Mat_SeqAIJ *Baij;
1562: PetscBool seqaij,Bdisassembled;
1563: PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
1564: PetscScalar v;
1565: const PetscInt *rowindices,*colindices;
1568: /* Check to make sure the component matrices (and embeddings) are compatible with C. */
1569: if (A) {
1570: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);
1571: if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
1572: if (rowemb) {
1573: ISGetLocalSize(rowemb,&m);
1574: if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
1575: } else {
1576: if (C->rmap->n != A->rmap->n) {
1577: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
1578: }
1579: }
1580: if (dcolemb) {
1581: ISGetLocalSize(dcolemb,&n);
1582: if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
1583: } else {
1584: if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
1585: }
1586: }
1587: if (B) {
1588: PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);
1589: if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
1590: if (rowemb) {
1591: ISGetLocalSize(rowemb,&m);
1592: if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
1593: } else {
1594: if (C->rmap->n != B->rmap->n) {
1595: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
1596: }
1597: }
1598: if (ocolemb) {
1599: ISGetLocalSize(ocolemb,&n);
1600: if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
1601: } else {
1602: if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
1603: }
1604: }
1606: aij = (Mat_MPIAIJ*)(C->data);
1607: if (!aij->A) {
1608: /* Mimic parts of MatMPIAIJSetPreallocation() */
1609: MatCreate(PETSC_COMM_SELF,&aij->A);
1610: MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);
1611: MatSetBlockSizesFromMats(aij->A,C,C);
1612: MatSetType(aij->A,MATSEQAIJ);
1613: PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);
1614: }
1615: if (A) {
1616: MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);
1617: } else {
1618: MatSetUp(aij->A);
1619: }
1620: if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
1621: /*
1622: If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
1623: need to "disassemble" B -- convert it to using C's global indices.
1624: To insert the values we take the safer, albeit more expensive, route of MatSetValues().
1626: If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
1627: we call MatZeroEntries() on aij->B first, so there may be a bunch of zeros that, perhaps, could be compacted out.
1629: TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
1630: At least avoid calling MatSetValues() and the implied searches?
1631: */
1633: if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
1634: #if defined(PETSC_USE_CTABLE)
1635: PetscTableDestroy(&aij->colmap);
1636: #else
1637: PetscFree(aij->colmap);
1638: /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
1639: if (aij->B) {
1640: PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));
1641: }
1642: #endif
1643: ngcol = 0;
1644: if (aij->lvec) {
1645: VecGetSize(aij->lvec,&ngcol);
1646: }
1647: if (aij->garray) {
1648: PetscFree(aij->garray);
1649: PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));
1650: }
1651: VecDestroy(&aij->lvec);
1652: VecScatterDestroy(&aij->Mvctx);
1653: }
1654: if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
1655: MatDestroy(&aij->B);
1656: }
1657: if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
1658: MatZeroEntries(aij->B);
1659: }
1660: }
1661: Bdisassembled = PETSC_FALSE;
1662: if (!aij->B) {
1663: MatCreate(PETSC_COMM_SELF,&aij->B);
1664: PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);
1665: MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);
1666: MatSetBlockSizesFromMats(aij->B,B,B);
1667: MatSetType(aij->B,MATSEQAIJ);
1668: Bdisassembled = PETSC_TRUE;
1669: }
1670: if (B) {
1671: Baij = (Mat_SeqAIJ*)(B->data);
1672: if (pattern == DIFFERENT_NONZERO_PATTERN) {
1673: PetscMalloc1(B->rmap->n,&nz);
1674: for (i=0; i<B->rmap->n; i++) {
1675: nz[i] = Baij->i[i+1] - Baij->i[i];
1676: }
1677: MatSeqAIJSetPreallocation(aij->B,0,nz);
1678: PetscFree(nz);
1679: }
1681: PetscLayoutGetRange(C->rmap,&rstart,&rend);
1682: shift = rend-rstart;
1683: count = 0;
1684: rowindices = NULL;
1685: colindices = NULL;
1686: if (rowemb) {
1687: ISGetIndices(rowemb,&rowindices);
1688: }
1689: if (ocolemb) {
1690: ISGetIndices(ocolemb,&colindices);
1691: }
1692: for (i=0; i<B->rmap->n; i++) {
1693: PetscInt row;
1694: row = i;
1695: if (rowindices) row = rowindices[i];
1696: for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
1697: col = Baij->j[count];
1698: if (colindices) col = colindices[col];
1699: if (Bdisassembled && col>=rstart) col += shift;
1700: v = Baij->a[count];
1701: MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);
1702: ++count;
1703: }
1704: }
1705: /* No assembly for aij->B is necessary. */
1706: /* FIXME: set aij->B's nonzerostate correctly. */
1707: } else {
1708: MatSetUp(aij->B);
1709: }
1710: C->preallocated = PETSC_TRUE;
1711: C->was_assembled = PETSC_FALSE;
1712: C->assembled = PETSC_FALSE;
1713: /*
1714: C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
1715: Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
1716: */
1717: return(0);
1718: }
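/*
   Worked example of the column shift applied above when B is freshly created and therefore in the
   "disassembled" state (illustrative numbers; assumes the row and column ownership ranges coincide,
   as for a square MPIAIJ matrix): with rstart = 10 and rend = 15, shift = 5, so an off-diagonal
   column index col = 3 stays at global column 3, while col = 10 becomes global column 15; i.e. the
   columns of the local diagonal block, [10,15), are skipped over.
*/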
1722: /*
1723: B uses local column indices, ranging between 0 and N-n; they must be interpreted using garray.
1724: */
1725: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
1726: {
1727: Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1732: /* FIXME: make sure C is assembled */
1733: *A = aij->A;
1734: *B = aij->B;
1735: /* Note that we don't incref *A and *B, so be careful! */
1736: return(0);
1737: }
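/*
   Illustrative sketch (not part of the original file) of interpreting the off-diagonal block
   returned by MatGetSeqMats_MPIAIJ() above.  It assumes C is assembled, so aij->garray exists,
   and that B has at least one stored entry; error checking is omitted for brevity.

     Mat        A,B;
     Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
     Mat_SeqAIJ *bseq;
     PetscInt   localcol,globalcol;

     MatGetSeqMats_MPIAIJ(C,&A,&B);
     bseq      = (Mat_SeqAIJ*)B->data;
     localcol  = bseq->j[0];                // first stored (compressed, local) column index of B
     globalcol = aij->garray[localcol];     // the corresponding global column of C
*/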
1739: /*
1740: Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
1741: NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
1742: */
1745: PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1746: PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
1747: PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
1748: PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
1749: PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
1750: {
1752: PetscMPIInt isize,flag;
1753: PetscInt i,ii,cismax,ispar;
1754: Mat *A,*B;
1755: IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
1758: if (!ismax) return(0);
1760: for (i = 0, cismax = 0; i < ismax; ++i) {
1761: PetscMPIInt isize;
1762: MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);
1763: if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1764: MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);
1765: if (isize > 1) ++cismax;
1766: }
1767: /*
1768: If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
1769: ispar counts the number of parallel ISs across C's comm.
1770: */
1771: MPI_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
1772: if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
1773: (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);
1774: return(0);
1775: }
1777: /* if (ispar) */
1778: /*
1779: Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
1780: These are used to extract the off-diag portion of the resulting parallel matrix.
1781: The row IS for the off-diag portion is the same as for the diag portion,
1782: so we merely alias (without increfing) the row IS, while skipping those that are sequential.
1783: */
1784: PetscMalloc2(cismax,&cisrow,cismax,&ciscol);
1785: PetscMalloc1(cismax,&ciscol_p);
1786: for (i = 0, ii = 0; i < ismax; ++i) {
1787: MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
1788: if (isize > 1) {
1789: /*
1790: TODO: This is the part that's ***NOT SCALABLE***.
1791: To fix this we need to extract just the indices of C's nonzero columns
1792: that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
1793: part of iscol[i] -- without actually computing ciscol[ii]. This also has
1794: to be done without serializing on the IS list, so, most likely, it is best
1795: done by rewriting MatGetSubMatrices_MPIAIJ() directly.
1796: */
1797: ISGetNonlocalIS(iscol[i],&(ciscol[ii]));
1798: /* Now we have to
1799: (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
1800: were sorted on each rank, concatenated they might no longer be sorted;
1801: (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
1802: indices in the nondecreasing order to the original index positions.
1803: If ciscol[ii] is strictly increasing, the permutation IS is NULL.
1804: */
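/*
   Worked example (not part of the original source) of the permutation described in (b):
   if ciscol[ii] = {7,2,5}, then ISSort() turns it into {2,5,7}, and ciscol_p[ii] maps
   sorted positions to the original positions, i.e. {1,2,0} (2 came from position 1,
   5 from position 2, 7 from position 0). For an already strictly increasing IS the
   permutation returned is NULL.
*/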
1805: ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);
1806: ISSort(ciscol[ii]);
1807: ++ii;
1808: }
1809: }
1810: PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);
1811: for (i = 0, ii = 0; i < ismax; ++i) {
1812: PetscInt j,issize;
1813: const PetscInt *indices;
1815: /*
1816: Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
1817: */
1818: ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);
1819: ISSort(isrow[i]);
1820: ISGetLocalSize(isrow[i],&issize);
1821: ISGetIndices(isrow[i],&indices);
1822: for (j = 1; j < issize; ++j) {
1823: if (indices[j] == indices[j-1]) {
1824: SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
1825: }
1826: }
1827: ISRestoreIndices(isrow[i],&indices);
1830: ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);
1831: ISSort(iscol[i]);
1832: ISGetLocalSize(iscol[i],&issize);
1833: ISGetIndices(iscol[i],&indices);
1834: for (j = 1; j < issize; ++j) {
1835: if (indices[j-1] == indices[j]) {
1836: SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
1837: }
1838: }
1839: ISRestoreIndices(iscol[i],&indices);
1840: MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
1841: if (isize > 1) {
1842: cisrow[ii] = isrow[i];
1843: ++ii;
1844: }
1845: }
1846: /*
1847: Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1848: array of sequential matrices underlying the resulting parallel matrices.
1849: Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
1850: contain duplicates.
1852:      There are as many diag matrices as there are original index sets. There are only as many parallel
1853:      and off-diag matrices as there are parallel (comm size > 1) index sets.
1855: ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
1856: - If the array of MPI matrices already exists and is being reused, we need to allocate the array
1857: and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
1858:      will deposit the extracted diag and off-diag parts. Thus, we allocate the A & B arrays and fill them
1859: with A[i] and B[ii] extracted from the corresponding MPI submat.
1860: - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
1861: will have a different order from what getsubmats_seq expects. To handle this case -- indicated
1862: by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
1863:      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
1864:      values into A[i] and B[ii] sitting inside the corresponding submat.
1865:      - If no reuse is taking place then getsubmats_seq will allocate the A & B arrays and create the corresponding
1866: A[i], B[ii], AA[i] or BB[ii] matrices.
1867: */
1868: /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
1869: if (scall == MAT_INITIAL_MATRIX) {
1870: PetscMalloc1(ismax,submat);
1871: /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
1872: } else {
1873: PetscMalloc1(ismax,&A);
1874: PetscMalloc1(cismax,&B);
1875:     /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless the permutation ISs (isrow_p[i], iscol_p[i], ciscol_p[ii]) are non-NULL. */
1876: for (i = 0, ii = 0; i < ismax; ++i) {
1877: MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
1878: if (isize > 1) {
1879: Mat AA,BB;
1880: (*getlocalmats)((*submat)[i],&AA,&BB);
1881: if (!isrow_p[i] && !iscol_p[i]) {
1882: A[i] = AA;
1883: } else {
1884: /* TODO: extract A[i] composed on AA. */
1885: MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);
1886: }
1887: if (!isrow_p[i] && !ciscol_p[ii]) {
1888: B[ii] = BB;
1889: } else {
1890: /* TODO: extract B[ii] composed on BB. */
1891: MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);
1892: }
1893: ++ii;
1894: } else {
1895: if (!isrow_p[i] && !iscol_p[i]) {
1896: A[i] = (*submat)[i];
1897: } else {
1898: /* TODO: extract A[i] composed on (*submat)[i]. */
1899: MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);
1900: }
1901: }
1902: }
1903: }
1904: /* Now obtain the sequential A and B submatrices separately. */
1905: (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);
1906: (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);
1907: /*
1908: If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
1909: matrices A & B have been extracted directly into the parallel matrices containing them, or
1910:      simply into the sequential matrix identical to the corresponding A (if isize == 1).
1911: Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
1912: to have the same sparsity pattern.
1913: Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
1914: must be constructed for C. This is done by setseqmat(s).
1915: */
1916: for (i = 0, ii = 0; i < ismax; ++i) {
1917: /*
1918: TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
1919: That way we can avoid sorting and computing permutations when reusing.
1920: To this end:
1921: - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
1922: - if caching arrays to hold the ISs, make and compose a container for them so that it can
1923: be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
1924: */
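/*
   Illustrative sketch (not part of the original source) of the caching idea in the
   TODO above: stash the ISs in a small struct, wrap it in a PetscContainer, and
   compose it on C so it is freed when C is destroyed. The struct name, key string,
   and destroy routine are invented here, and PetscContainerSetUserDestroy() is
   assumed to be the hook meant by the TODO for clearing out the contents.

      typedef struct {
        PetscInt n;
        IS       *ciscol,*ciscol_p;
      } SubMatISCache;

      static PetscErrorCode SubMatISCacheDestroy(void *ptr)
      {
        SubMatISCache *cache = (SubMatISCache*)ptr;
        PetscInt      k;
        for (k = 0; k < cache->n; ++k) {
          ISDestroy(&cache->ciscol[k]);
          ISDestroy(&cache->ciscol_p[k]);
        }
        PetscFree(cache->ciscol);
        PetscFree(cache->ciscol_p);
        PetscFree(cache);
        return 0;
      }

   After allocating a SubMatISCache *cache and filling it with the ISs to be kept:

      PetscContainer container;
      PetscContainerCreate(PetscObjectComm((PetscObject)C),&container);
      PetscContainerSetPointer(container,(void*)cache);
      PetscContainerSetUserDestroy(container,SubMatISCacheDestroy);
      PetscObjectCompose((PetscObject)C,"MatGetSubMatricesMPI_ISCache",(PetscObject)container);
      PetscContainerDestroy(&container);
*/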
1925: MatStructure pattern;
1926: if (scall == MAT_INITIAL_MATRIX) {
1927: pattern = DIFFERENT_NONZERO_PATTERN;
1928: } else {
1929: pattern = SAME_NONZERO_PATTERN;
1930: }
1931: MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
1932: /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
1933: if (isize > 1) {
1934: if (scall == MAT_INITIAL_MATRIX) {
1935: MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);
1936: MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1937: MatSetType((*submat)[i],MATMPIAIJ);
1938: PetscLayoutSetUp((*submat)[i]->rmap);
1939: PetscLayoutSetUp((*submat)[i]->cmap);
1940: }
1941: /*
1942: For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
1943: */
1944: {
1945: Mat AA,BB;
1946: AA = NULL;
1947: if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
1948: BB = NULL;
1949: if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
1950: if (AA || BB) {
1951: setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);
1952: MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);
1953: MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);
1954: }
1955: if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
1956: /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
1957: MatDestroy(&AA);
1958: }
1959: if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
1960:           /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[ii]) && MAT_INITIAL_MATRIX. */
1961: MatDestroy(&BB);
1962: }
1963: }
1964: ISDestroy(ciscol+ii);
1965: ISDestroy(ciscol_p+ii);
1966: ++ii;
1967: } else { /* if (isize == 1) */
1968: if (scall == MAT_INITIAL_MATRIX) {
1969: if (isrow_p[i] || iscol_p[i]) {
1970: MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);
1971: } else (*submat)[i] = A[i];
1972: }
1973: if (isrow_p[i] || iscol_p[i]) {
1974: setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);
1975:         /* Otherwise A is extracted straight into (*submat)[i]. */
1976:         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
1977: MatDestroy(A+i);
1978: }
1979: }
1980: ISDestroy(&isrow_p[i]);
1981: ISDestroy(&iscol_p[i]);
1982: }
1983: PetscFree2(cisrow,ciscol);
1984: PetscFree2(isrow_p,iscol_p);
1985: PetscFree(ciscol_p);
1986: PetscFree(A);
1987: PetscFree(B);
1988: return(0);
1989: }
1995: PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1996: {
2000: MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);
2001: return(0);
2002: }
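/*
   Illustrative usage sketch (not part of the original source), assuming the public
   entry point MatGetSubMatricesMPI() dispatches here for MATMPIAIJ matrices. Given an
   assembled MPIAIJ matrix C and (possibly parallel) index sets isrow and iscol:

      Mat *submat;

      MatGetSubMatricesMPI(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);
      ...
      MatGetSubMatricesMPI(C,1,&isrow,&iscol,MAT_REUSE_MATRIX,&submat);

   With MAT_REUSE_MATRIX the previously created submat[0] (and its underlying seq
   pieces) is refilled rather than reallocated, as described in the comments above.
*/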