Actual source code: baijov.c
petsc-3.13.6 2020-09-29
2: /*
3: Routines to compute overlapping regions of a parallel MPI matrix
4: and to find submatrices that were shared across processors.
5: */
6: #include <../src/mat/impls/baij/mpi/mpibaij.h>
7: #include <petscbt.h>
9: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
10: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
11: extern PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
12: extern PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14: PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
15: {
17: PetscInt i,N=C->cmap->N, bs=C->rmap->bs;
18: IS *is_new;
21: PetscMalloc1(imax,&is_new);
22: /* Convert the indices into block format */
23: ISCompressIndicesGeneral(N,C->rmap->n,bs,imax,is,is_new);
24: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");
25: for (i=0; i<ov; ++i) {
26: MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);
27: }
28: for (i=0; i<imax; i++) {ISDestroy(&is[i]);}
29: ISExpandIndicesGeneral(N,N,bs,imax,is_new,is);
30: for (i=0; i<imax; i++) {ISDestroy(&is_new[i]);}
31: PetscFree(is_new);
32: return(0);
33: }
35: /*
36: Sample message format:
37: If a processor A wants processor B to process some elements corresponding
38: to index sets is[1], is[5]
39: mesg [0] = 2 (no of index sets in the mesg)
40: -----------
41: mesg [1] = 1 => is[1]
42: mesg [2] = sizeof(is[1]);
43: -----------
44: mesg [5] = 5 => is[5]
45: mesg [6] = sizeof(is[5]);
46: -----------
47: mesg [7]
48: mesg [n] data(is[1])
49: -----------
50: mesg[n+1]
51: mesg[m] data(is[5])
52: -----------
54: Notes:
55: nrqs - no of requests sent (or to be sent out)
56: nrqr - no of requests received (which have to be or which have been processed)
57: */
58: PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
59: {
60: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
61: const PetscInt **idx,*idx_i;
62: PetscInt *n,*w3,*w4,**data,len;
64: PetscMPIInt size,rank,tag1,tag2,*w2,*w1,nrqr;
65: PetscInt Mbs,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
66: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
67: PetscMPIInt *onodes1,*olengths1,*onodes2,*olengths2,proc=-1;
68: PetscBT *table;
69: MPI_Comm comm,*iscomms;
70: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
71: MPI_Status *s_status,*recv_status;
72: char *t_p;
75: PetscObjectGetComm((PetscObject)C,&comm);
76: size = c->size;
77: rank = c->rank;
78: Mbs = c->Mbs;
80: PetscObjectGetNewTag((PetscObject)C,&tag1);
81: PetscObjectGetNewTag((PetscObject)C,&tag2);
83: PetscMalloc2(imax+1,(PetscInt***)&idx,imax,&n);
85: for (i=0; i<imax; i++) {
86: ISGetIndices(is[i],&idx[i]);
87: ISGetLocalSize(is[i],&n[i]);
88: }
90: /* evaluate communication - mesg to who,length of mesg, and buffer space
91: required. Based on this, buffers are allocated, and data copied into them*/
92: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
93: for (i=0; i<imax; i++) {
94: PetscArrayzero(w4,size); /* initialise work vector*/
95: idx_i = idx[i];
96: len = n[i];
97: for (j=0; j<len; j++) {
98: row = idx_i[j];
99: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
100: PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);
101: w4[proc]++;
102: }
103: for (j=0; j<size; j++) {
104: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
105: }
106: }
108: nrqs = 0; /* no of outgoing messages */
109: msz = 0; /* total mesg length (for all proc */
110: w1[rank] = 0; /* no mesg sent to itself */
111: w3[rank] = 0;
112: for (i=0; i<size; i++) {
113: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
114: }
115: /* pa - is list of processors to communicate with */
116: PetscMalloc1(nrqs+1,&pa);
117: for (i=0,j=0; i<size; i++) {
118: if (w1[i]) {pa[j] = i; j++;}
119: }
121: /* Each message would have a header = 1 + 2*(no of IS) + data */
122: for (i=0; i<nrqs; i++) {
123: j = pa[i];
124: w1[j] += w2[j] + 2*w3[j];
125: msz += w1[j];
126: }
128: /* Determine the number of messages to expect, their lengths, from from-ids */
129: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
130: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
132: /* Now post the Irecvs corresponding to these messages */
133: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
135: /* Allocate Memory for outgoing messages */
136: PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
137: PetscArrayzero(outdat,size);
138: PetscArrayzero(ptr,size);
139: {
140: PetscInt *iptr = tmp,ict = 0;
141: for (i=0; i<nrqs; i++) {
142: j = pa[i];
143: iptr += ict;
144: outdat[j] = iptr;
145: ict = w1[j];
146: }
147: }
149: /* Form the outgoing messages */
150: /*plug in the headers*/
151: for (i=0; i<nrqs; i++) {
152: j = pa[i];
153: outdat[j][0] = 0;
154: PetscArrayzero(outdat[j]+1,2*w3[j]);
155: ptr[j] = outdat[j] + 2*w3[j] + 1;
156: }
158: /* Memory for doing local proc's work*/
159: {
160: PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mbs*imax,&d_p, (Mbs/PETSC_BITS_PER_BYTE+1)*imax,&t_p);
162: for (i=0; i<imax; i++) {
163: table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
164: data[i] = d_p + (Mbs)*i;
165: }
166: }
168: /* Parse the IS and update local tables and the outgoing buf with the data*/
169: {
170: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
171: PetscBT table_i;
173: for (i=0; i<imax; i++) {
174: PetscArrayzero(ctr,size);
175: n_i = n[i];
176: table_i = table[i];
177: idx_i = idx[i];
178: data_i = data[i];
179: isz_i = isz[i];
180: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
181: row = idx_i[j];
182: PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);
183: if (proc != rank) { /* copy to the outgoing buffer */
184: ctr[proc]++;
185: *ptr[proc] = row;
186: ptr[proc]++;
187: } else { /* Update the local table */
188: if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
189: }
190: }
191: /* Update the headers for the current IS */
192: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
193: if ((ctr_j = ctr[j])) {
194: outdat_j = outdat[j];
195: k = ++outdat_j[0];
196: outdat_j[2*k] = ctr_j;
197: outdat_j[2*k-1] = i;
198: }
199: }
200: isz[i] = isz_i;
201: }
202: }
204: /* Now post the sends */
205: PetscMalloc1(nrqs+1,&s_waits1);
206: for (i=0; i<nrqs; ++i) {
207: j = pa[i];
208: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
209: }
211: /* No longer need the original indices*/
212: for (i=0; i<imax; ++i) {
213: ISRestoreIndices(is[i],idx+i);
214: }
215: PetscFree2(*(PetscInt***)&idx,n);
217: PetscMalloc1(imax,&iscomms);
218: for (i=0; i<imax; ++i) {
219: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);
220: ISDestroy(&is[i]);
221: }
223: /* Do Local work*/
224: MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);
226: /* Receive messages*/
227: PetscMalloc1(nrqr+1,&recv_status);
228: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
230: PetscMalloc1(nrqs+1,&s_status);
231: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
233: /* Phase 1 sends are complete - deallocate buffers */
234: PetscFree4(outdat,ptr,tmp,ctr);
235: PetscFree4(w1,w2,w3,w4);
237: PetscMalloc1(nrqr+1,&xdata);
238: PetscMalloc1(nrqr+1,&isz1);
239: MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
240: PetscFree(rbuf[0]);
241: PetscFree(rbuf);
243: /* Send the data back*/
244: /* Do a global reduction to know the buffer space req for incoming messages*/
245: {
246: PetscMPIInt *rw1;
248: PetscCalloc1(size,&rw1);
250: for (i=0; i<nrqr; ++i) {
251: proc = recv_status[i].MPI_SOURCE;
252: if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
253: rw1[proc] = isz1[i];
254: }
256: PetscFree(onodes1);
257: PetscFree(olengths1);
259: /* Determine the number of messages to expect, their lengths, from from-ids */
260: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
261: PetscFree(rw1);
262: }
263: /* Now post the Irecvs corresponding to these messages */
264: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
266: /* Now post the sends */
267: PetscMalloc1(nrqr+1,&s_waits2);
268: for (i=0; i<nrqr; ++i) {
269: j = recv_status[i].MPI_SOURCE;
270: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
271: }
273: /* receive work done on other processors*/
274: {
275: PetscMPIInt idex;
276: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
277: PetscBT table_i;
278: MPI_Status *status2;
280: PetscMalloc1(PetscMax(nrqr,nrqs)+1,&status2);
281: for (i=0; i<nrqs; ++i) {
282: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
283: /* Process the message*/
284: rbuf2_i = rbuf2[idex];
285: ct1 = 2*rbuf2_i[0]+1;
286: jmax = rbuf2[idex][0];
287: for (j=1; j<=jmax; j++) {
288: max = rbuf2_i[2*j];
289: is_no = rbuf2_i[2*j-1];
290: isz_i = isz[is_no];
291: data_i = data[is_no];
292: table_i = table[is_no];
293: for (k=0; k<max; k++,ct1++) {
294: row = rbuf2_i[ct1];
295: if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
296: }
297: isz[is_no] = isz_i;
298: }
299: }
300: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
301: PetscFree(status2);
302: }
304: for (i=0; i<imax; ++i) {
305: ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);
306: PetscCommDestroy(&iscomms[i]);
307: }
309: PetscFree(iscomms);
310: PetscFree(onodes2);
311: PetscFree(olengths2);
313: PetscFree(pa);
314: PetscFree(rbuf2[0]);
315: PetscFree(rbuf2);
316: PetscFree(s_waits1);
317: PetscFree(r_waits1);
318: PetscFree(s_waits2);
319: PetscFree(r_waits2);
320: PetscFree5(table,data,isz,d_p,t_p);
321: PetscFree(s_status);
322: PetscFree(recv_status);
323: PetscFree(xdata[0]);
324: PetscFree(xdata);
325: PetscFree(isz1);
326: return(0);
327: }
329: /*
330: MatIncreaseOverlap_MPIBAIJ_Local - Called by MatincreaseOverlap, to do
331: the work on the local processor.
333: Inputs:
334: C - MAT_MPIBAIJ;
335: imax - total no of index sets processed at a time;
336: table - an array of char - size = Mbs bits.
338: Output:
339: isz - array containing the count of the solution elements corresponding
340: to each index set;
341: data - pointer to the solutions
342: */
343: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
344: {
345: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
346: Mat A = c->A,B = c->B;
347: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
348: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
349: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
350: PetscBT table_i;
353: rstart = c->rstartbs;
354: cstart = c->cstartbs;
355: ai = a->i;
356: aj = a->j;
357: bi = b->i;
358: bj = b->j;
359: garray = c->garray;
362: for (i=0; i<imax; i++) {
363: data_i = data[i];
364: table_i = table[i];
365: isz_i = isz[i];
366: for (j=0,max=isz[i]; j<max; j++) {
367: row = data_i[j] - rstart;
368: start = ai[row];
369: end = ai[row+1];
370: for (k=start; k<end; k++) { /* Amat */
371: val = aj[k] + cstart;
372: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
373: }
374: start = bi[row];
375: end = bi[row+1];
376: for (k=start; k<end; k++) { /* Bmat */
377: val = garray[bj[k]];
378: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
379: }
380: }
381: isz[i] = isz_i;
382: }
383: return(0);
384: }
385: /*
386: MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
387: and return the output
389: Input:
390: C - the matrix
391: nrqr - no of messages being processed.
392: rbuf - an array of pointers to the received requests
394: Output:
395: xdata - array of messages to be sent back
396: isz1 - size of each message
398: For better efficiency perhaps we should malloc separately each xdata[i],
399: then if a remalloc is required we need only copy the data for that one row
400: rather than all previous rows as it is now where a single large chunck of
401: memory is used.
403: */
404: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
405: {
406: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
407: Mat A = c->A,B = c->B;
408: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
410: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
411: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
412: PetscInt val,max1,max2,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
413: PetscInt *rbuf_i,kmax,rbuf_0;
414: PetscBT xtable;
417: Mbs = c->Mbs;
418: rstart = c->rstartbs;
419: cstart = c->cstartbs;
420: ai = a->i;
421: aj = a->j;
422: bi = b->i;
423: bj = b->j;
424: garray = c->garray;
427: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
428: rbuf_i = rbuf[i];
429: rbuf_0 = rbuf_i[0];
430: ct += rbuf_0;
431: for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
432: }
434: if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
435: else max1 = 1;
436: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
437: PetscMalloc1(mem_estimate,&xdata[0]);
438: ++no_malloc;
439: PetscBTCreate(Mbs,&xtable);
440: PetscArrayzero(isz1,nrqr);
442: ct3 = 0;
443: for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
444: rbuf_i = rbuf[i];
445: rbuf_0 = rbuf_i[0];
446: ct1 = 2*rbuf_0+1;
447: ct2 = ct1;
448: ct3 += ct1;
449: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
450: PetscBTMemzero(Mbs,xtable);
451: oct2 = ct2;
452: kmax = rbuf_i[2*j];
453: for (k=0; k<kmax; k++,ct1++) {
454: row = rbuf_i[ct1];
455: if (!PetscBTLookupSet(xtable,row)) {
456: if (!(ct3 < mem_estimate)) {
457: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
458: PetscMalloc1(new_estimate,&tmp);
459: PetscArraycpy(tmp,xdata[0],mem_estimate);
460: PetscFree(xdata[0]);
461: xdata[0] = tmp;
462: mem_estimate = new_estimate; ++no_malloc;
463: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
464: }
465: xdata[i][ct2++] = row;
466: ct3++;
467: }
468: }
469: for (k=oct2,max2=ct2; k<max2; k++) {
470: row = xdata[i][k] - rstart;
471: start = ai[row];
472: end = ai[row+1];
473: for (l=start; l<end; l++) {
474: val = aj[l] + cstart;
475: if (!PetscBTLookupSet(xtable,val)) {
476: if (!(ct3 < mem_estimate)) {
477: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
478: PetscMalloc1(new_estimate,&tmp);
479: PetscArraycpy(tmp,xdata[0],mem_estimate);
480: PetscFree(xdata[0]);
481: xdata[0] = tmp;
482: mem_estimate = new_estimate; ++no_malloc;
483: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
484: }
485: xdata[i][ct2++] = val;
486: ct3++;
487: }
488: }
489: start = bi[row];
490: end = bi[row+1];
491: for (l=start; l<end; l++) {
492: val = garray[bj[l]];
493: if (!PetscBTLookupSet(xtable,val)) {
494: if (!(ct3 < mem_estimate)) {
495: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
496: PetscMalloc1(new_estimate,&tmp);
497: PetscArraycpy(tmp,xdata[0],mem_estimate);
498: PetscFree(xdata[0]);
499: xdata[0] = tmp;
500: mem_estimate = new_estimate; ++no_malloc;
501: for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
502: }
503: xdata[i][ct2++] = val;
504: ct3++;
505: }
506: }
507: }
508: /* Update the header*/
509: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
510: xdata[i][2*j-1] = rbuf_i[2*j-1];
511: }
512: xdata[i][0] = rbuf_0;
513: xdata[i+1] = xdata[i] + ct2;
514: isz1[i] = ct2; /* size of each message */
515: }
516: PetscBTDestroy(&xtable);
517: PetscInfo3(C,"Allocated %D bytes, required %D, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
518: return(0);
519: }
521: PetscErrorCode MatCreateSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
522: {
523: IS *isrow_block,*iscol_block;
524: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
526: PetscInt nmax,nstages_local,nstages,i,pos,max_no,N=C->cmap->N,bs=C->rmap->bs;
527: Mat_SeqBAIJ *subc;
528: Mat_SubSppt *smat;
531: /* The compression and expansion should be avoided. Doesn't point
532: out errors, might change the indices, hence buggey */
533: PetscMalloc2(ismax+1,&isrow_block,ismax+1,&iscol_block);
534: ISCompressIndicesGeneral(N,C->rmap->n,bs,ismax,isrow,isrow_block);
535: ISCompressIndicesGeneral(N,C->cmap->n,bs,ismax,iscol,iscol_block);
537: /* Determine the number of stages through which submatrices are done */
538: if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
539: else nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt));
540: if (!nmax) nmax = 1;
542: if (scall == MAT_INITIAL_MATRIX) {
543: nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
545: /* Make sure every processor loops through the nstages */
546: MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
548: /* Allocate memory to hold all the submatrices and dummy submatrices */
549: PetscCalloc1(ismax+nstages,submat);
550: } else { /* MAT_REUSE_MATRIX */
551: if (ismax) {
552: subc = (Mat_SeqBAIJ*)((*submat)[0]->data);
553: smat = subc->submatis1;
554: } else { /* (*submat)[0] is a dummy matrix */
555: smat = (Mat_SubSppt*)(*submat)[0]->data;
556: }
557: if (!smat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"MatCreateSubMatrices(...,MAT_REUSE_MATRIX,...) requires submat");
558: nstages = smat->nstages;
559: }
561: for (i=0,pos=0; i<nstages; i++) {
562: if (pos+nmax <= ismax) max_no = nmax;
563: else if (pos == ismax) max_no = 0;
564: else max_no = ismax-pos;
566: MatCreateSubMatrices_MPIBAIJ_local(C,max_no,isrow_block+pos,iscol_block+pos,scall,*submat+pos);
567: if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
568: smat = (Mat_SubSppt*)(*submat)[pos]->data;
569: smat->nstages = nstages;
570: }
571: pos += max_no;
572: }
574: if (scall == MAT_INITIAL_MATRIX && ismax) {
575: /* save nstages for reuse */
576: subc = (Mat_SeqBAIJ*)((*submat)[0]->data);
577: smat = subc->submatis1;
578: smat->nstages = nstages;
579: }
581: for (i=0; i<ismax; i++) {
582: ISDestroy(&isrow_block[i]);
583: ISDestroy(&iscol_block[i]);
584: }
585: PetscFree2(isrow_block,iscol_block);
586: return(0);
587: }
589: #if defined(PETSC_USE_CTABLE)
590: PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
591: {
592: PetscInt nGlobalNd = proc_gnode[size];
593: PetscMPIInt fproc;
597: PetscMPIIntCast((PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)),&fproc);
598: if (fproc > size) fproc = size;
599: while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
600: if (row < proc_gnode[fproc]) fproc--;
601: else fproc++;
602: }
603: *rank = fproc;
604: return(0);
605: }
606: #endif
608: /* -------------------------------------------------------------------------*/
609: /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
610: PetscErrorCode MatCreateSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
611: {
612: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
613: Mat A = c->A;
614: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*subc;
615: const PetscInt **icol,**irow;
616: PetscInt *nrow,*ncol,start;
618: PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
619: PetscInt **sbuf1,**sbuf2,*sbuf2_i,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
620: PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
621: PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
622: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
623: #if defined(PETSC_USE_CTABLE)
624: PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
625: #else
626: PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
627: #endif
628: const PetscInt *irow_i,*icol_i;
629: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
630: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
631: MPI_Request *r_waits4,*s_waits3,*s_waits4;
632: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
633: MPI_Status *r_status3,*r_status4,*s_status4;
634: MPI_Comm comm;
635: PetscScalar **rbuf4,*rbuf4_i=NULL,**sbuf_aa,*vals,*mat_a=NULL,*imat_a=NULL,*sbuf_aa_i;
636: PetscMPIInt *onodes1,*olengths1,end;
637: PetscInt **row2proc,*row2proc_i,*imat_ilen,*imat_j,*imat_i;
638: Mat_SubSppt *smat_i;
639: PetscBool *issorted,colflag,iscsorted=PETSC_TRUE;
640: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
641: PetscInt bs=C->rmap->bs,bs2=c->bs2,rstart = c->rstartbs;
642: PetscBool ijonly=c->ijonly; /* private flag indicates only matrix data structures are requested */
643: PetscInt nzA,nzB,*a_i=a->i,*b_i=b->i,*a_j = a->j,*b_j = b->j,ctmp,imark,*cworkA,*cworkB;
644: PetscScalar *vworkA=NULL,*vworkB=NULL,*a_a = a->a,*b_a = b->a;
645: PetscInt cstart = c->cstartbs,*bmap = c->garray;
646: PetscBool *allrows,*allcolumns;
649: PetscObjectGetComm((PetscObject)C,&comm);
650: size = c->size;
651: rank = c->rank;
653: PetscMalloc5(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns,ismax,&allrows);
654: PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);
656: for (i=0; i<ismax; i++) {
657: ISSorted(iscol[i],&issorted[i]);
658: if (!issorted[i]) iscsorted = issorted[i]; /* columns are not sorted! */
659: ISSorted(isrow[i],&issorted[i]);
661: /* Check for special case: allcolumns */
662: ISIdentity(iscol[i],&colflag);
663: ISGetLocalSize(iscol[i],&ncol[i]);
665: if (colflag && ncol[i] == c->Nbs) {
666: allcolumns[i] = PETSC_TRUE;
667: icol[i] = NULL;
668: } else {
669: allcolumns[i] = PETSC_FALSE;
670: ISGetIndices(iscol[i],&icol[i]);
671: }
673: /* Check for special case: allrows */
674: ISIdentity(isrow[i],&colflag);
675: ISGetLocalSize(isrow[i],&nrow[i]);
676: if (colflag && nrow[i] == c->Mbs) {
677: allrows[i] = PETSC_TRUE;
678: irow[i] = NULL;
679: } else {
680: allrows[i] = PETSC_FALSE;
681: ISGetIndices(isrow[i],&irow[i]);
682: }
683: }
685: if (scall == MAT_REUSE_MATRIX) {
686: /* Assumes new rows are same length as the old rows */
687: for (i=0; i<ismax; i++) {
688: subc = (Mat_SeqBAIJ*)(submats[i]->data);
689: if (subc->mbs != nrow[i] || subc->nbs != ncol[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
691: /* Initial matrix as if empty */
692: PetscArrayzero(subc->ilen,subc->mbs);
694: /* Initial matrix as if empty */
695: submats[i]->factortype = C->factortype;
697: smat_i = subc->submatis1;
699: nrqs = smat_i->nrqs;
700: nrqr = smat_i->nrqr;
701: rbuf1 = smat_i->rbuf1;
702: rbuf2 = smat_i->rbuf2;
703: rbuf3 = smat_i->rbuf3;
704: req_source2 = smat_i->req_source2;
706: sbuf1 = smat_i->sbuf1;
707: sbuf2 = smat_i->sbuf2;
708: ptr = smat_i->ptr;
709: tmp = smat_i->tmp;
710: ctr = smat_i->ctr;
712: pa = smat_i->pa;
713: req_size = smat_i->req_size;
714: req_source1 = smat_i->req_source1;
716: allcolumns[i] = smat_i->allcolumns;
717: allrows[i] = smat_i->allrows;
718: row2proc[i] = smat_i->row2proc;
719: rmap[i] = smat_i->rmap;
720: cmap[i] = smat_i->cmap;
721: }
723: if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
724: if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
725: smat_i = (Mat_SubSppt*)submats[0]->data;
727: nrqs = smat_i->nrqs;
728: nrqr = smat_i->nrqr;
729: rbuf1 = smat_i->rbuf1;
730: rbuf2 = smat_i->rbuf2;
731: rbuf3 = smat_i->rbuf3;
732: req_source2 = smat_i->req_source2;
734: sbuf1 = smat_i->sbuf1;
735: sbuf2 = smat_i->sbuf2;
736: ptr = smat_i->ptr;
737: tmp = smat_i->tmp;
738: ctr = smat_i->ctr;
740: pa = smat_i->pa;
741: req_size = smat_i->req_size;
742: req_source1 = smat_i->req_source1;
744: allcolumns[0] = PETSC_FALSE;
745: }
746: } else { /* scall == MAT_INITIAL_MATRIX */
747: /* Get some new tags to keep the communication clean */
748: PetscObjectGetNewTag((PetscObject)C,&tag2);
749: PetscObjectGetNewTag((PetscObject)C,&tag3);
751: /* evaluate communication - mesg to who, length of mesg, and buffer space
752: required. Based on this, buffers are allocated, and data copied into them*/
753: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4); /* mesg size, initialize work vectors */
755: for (i=0; i<ismax; i++) {
756: jmax = nrow[i];
757: irow_i = irow[i];
759: PetscMalloc1(jmax,&row2proc_i);
760: row2proc[i] = row2proc_i;
762: if (issorted[i]) proc = 0;
763: for (j=0; j<jmax; j++) {
764: if (!issorted[i]) proc = 0;
765: if (allrows[i]) row = j;
766: else row = irow_i[j];
768: while (row >= c->rangebs[proc+1]) proc++;
769: w4[proc]++;
770: row2proc_i[j] = proc; /* map row index to proc */
771: }
772: for (j=0; j<size; j++) {
773: if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}
774: }
775: }
777: nrqs = 0; /* no of outgoing messages */
778: msz = 0; /* total mesg length (for all procs) */
779: w1[rank] = 0; /* no mesg sent to self */
780: w3[rank] = 0;
781: for (i=0; i<size; i++) {
782: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
783: }
784: PetscMalloc1(nrqs+1,&pa); /*(proc -array)*/
785: for (i=0,j=0; i<size; i++) {
786: if (w1[i]) { pa[j] = i; j++; }
787: }
789: /* Each message would have a header = 1 + 2*(no of IS) + data */
790: for (i=0; i<nrqs; i++) {
791: j = pa[i];
792: w1[j] += w2[j] + 2* w3[j];
793: msz += w1[j];
794: }
795: PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);
797: /* Determine the number of messages to expect, their lengths, from from-ids */
798: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
799: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
801: /* Now post the Irecvs corresponding to these messages */
802: tag0 = ((PetscObject)C)->tag;
803: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
805: PetscFree(onodes1);
806: PetscFree(olengths1);
808: /* Allocate Memory for outgoing messages */
809: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
810: PetscArrayzero(sbuf1,size);
811: PetscArrayzero(ptr,size);
813: {
814: PetscInt *iptr = tmp;
815: k = 0;
816: for (i=0; i<nrqs; i++) {
817: j = pa[i];
818: iptr += k;
819: sbuf1[j] = iptr;
820: k = w1[j];
821: }
822: }
824: /* Form the outgoing messages. Initialize the header space */
825: for (i=0; i<nrqs; i++) {
826: j = pa[i];
827: sbuf1[j][0] = 0;
828: PetscArrayzero(sbuf1[j]+1,2*w3[j]);
829: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
830: }
832: /* Parse the isrow and copy data into outbuf */
833: for (i=0; i<ismax; i++) {
834: row2proc_i = row2proc[i];
835: PetscArrayzero(ctr,size);
836: irow_i = irow[i];
837: jmax = nrow[i];
838: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
839: proc = row2proc_i[j];
840: if (allrows[i]) row = j;
841: else row = irow_i[j];
843: if (proc != rank) { /* copy to the outgoing buf*/
844: ctr[proc]++;
845: *ptr[proc] = row;
846: ptr[proc]++;
847: }
848: }
849: /* Update the headers for the current IS */
850: for (j=0; j<size; j++) { /* Can Optimise this loop too */
851: if ((ctr_j = ctr[j])) {
852: sbuf1_j = sbuf1[j];
853: k = ++sbuf1_j[0];
854: sbuf1_j[2*k] = ctr_j;
855: sbuf1_j[2*k-1] = i;
856: }
857: }
858: }
860: /* Now post the sends */
861: PetscMalloc1(nrqs+1,&s_waits1);
862: for (i=0; i<nrqs; ++i) {
863: j = pa[i];
864: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
865: }
867: /* Post Receives to capture the buffer size */
868: PetscMalloc1(nrqs+1,&r_waits2);
869: PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);
870: rbuf2[0] = tmp + msz;
871: for (i=1; i<nrqs; ++i) {
872: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
873: }
874: for (i=0; i<nrqs; ++i) {
875: j = pa[i];
876: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);
877: }
879: /* Send to other procs the buf size they should allocate */
880: /* Receive messages*/
881: PetscMalloc1(nrqr+1,&s_waits2);
882: PetscMalloc1(nrqr+1,&r_status1);
883: PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
885: MPI_Waitall(nrqr,r_waits1,r_status1);
886: for (i=0; i<nrqr; ++i) {
887: req_size[i] = 0;
888: rbuf1_i = rbuf1[i];
889: start = 2*rbuf1_i[0] + 1;
890: MPI_Get_count(r_status1+i,MPIU_INT,&end);
891: PetscMalloc1(end+1,&sbuf2[i]);
892: sbuf2_i = sbuf2[i];
893: for (j=start; j<end; j++) {
894: row = rbuf1_i[j] - rstart;
895: ncols = a_i[row+1] - a_i[row] + b_i[row+1] - b_i[row];
896: sbuf2_i[j] = ncols;
897: req_size[i] += ncols;
898: }
899: req_source1[i] = r_status1[i].MPI_SOURCE;
900: /* form the header */
901: sbuf2_i[0] = req_size[i];
902: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
904: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
905: }
907: PetscFree(r_status1);
908: PetscFree(r_waits1);
909: PetscFree4(w1,w2,w3,w4);
911: /* Receive messages*/
912: PetscMalloc1(nrqs+1,&r_waits3);
913: PetscMalloc1(nrqs+1,&r_status2);
915: MPI_Waitall(nrqs,r_waits2,r_status2);
916: for (i=0; i<nrqs; ++i) {
917: PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);
918: req_source2[i] = r_status2[i].MPI_SOURCE;
919: MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
920: }
921: PetscFree(r_status2);
922: PetscFree(r_waits2);
924: /* Wait on sends1 and sends2 */
925: PetscMalloc1(nrqs+1,&s_status1);
926: PetscMalloc1(nrqr+1,&s_status2);
928: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
929: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
930: PetscFree(s_status1);
931: PetscFree(s_status2);
932: PetscFree(s_waits1);
933: PetscFree(s_waits2);
935: /* Now allocate sending buffers for a->j, and send them off */
936: PetscMalloc1(nrqr+1,&sbuf_aj);
937: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
938: PetscMalloc1(j+1,&sbuf_aj[0]);
939: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
941: PetscMalloc1(nrqr+1,&s_waits3);
942: {
944: for (i=0; i<nrqr; i++) {
945: rbuf1_i = rbuf1[i];
946: sbuf_aj_i = sbuf_aj[i];
947: ct1 = 2*rbuf1_i[0] + 1;
948: ct2 = 0;
949: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
950: kmax = rbuf1[i][2*j];
951: for (k=0; k<kmax; k++,ct1++) {
952: row = rbuf1_i[ct1] - rstart;
953: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
954: ncols = nzA + nzB;
955: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
957: /* load the column indices for this row into cols */
958: cols = sbuf_aj_i + ct2;
959: for (l=0; l<nzB; l++) {
960: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
961: else break;
962: }
963: imark = l;
964: for (l=0; l<nzA; l++) {cols[imark+l] = cstart + cworkA[l];}
965: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
966: ct2 += ncols;
967: }
968: }
969: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
970: }
971: }
972: PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);
974: /* create col map: global col of C -> local col of submatrices */
975: #if defined(PETSC_USE_CTABLE)
976: for (i=0; i<ismax; i++) {
977: if (!allcolumns[i]) {
978: PetscTableCreate(ncol[i]+1,c->Nbs+1,&cmap[i]);
980: jmax = ncol[i];
981: icol_i = icol[i];
982: cmap_i = cmap[i];
983: for (j=0; j<jmax; j++) {
984: PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);
985: }
986: } else cmap[i] = NULL;
987: }
988: #else
989: for (i=0; i<ismax; i++) {
990: if (!allcolumns[i]) {
991: PetscCalloc1(c->Nbs,&cmap[i]);
992: jmax = ncol[i];
993: icol_i = icol[i];
994: cmap_i = cmap[i];
995: for (j=0; j<jmax; j++) cmap_i[icol_i[j]] = j+1;
996: } else cmap[i] = NULL;
997: }
998: #endif
1000: /* Create lens which is required for MatCreate... */
1001: for (i=0,j=0; i<ismax; i++) j += nrow[i];
1002: PetscMalloc1(ismax,&lens);
1004: if (ismax) {
1005: PetscCalloc1(j,&lens[0]);
1006: }
1007: for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1009: /* Update lens from local data */
1010: for (i=0; i<ismax; i++) {
1011: row2proc_i = row2proc[i];
1012: jmax = nrow[i];
1013: if (!allcolumns[i]) cmap_i = cmap[i];
1014: irow_i = irow[i];
1015: lens_i = lens[i];
1016: for (j=0; j<jmax; j++) {
1017: if (allrows[i]) row = j;
1018: else row = irow_i[j]; /* global blocked row of C */
1020: proc = row2proc_i[j];
1021: if (proc == rank) {
1022: /* Get indices from matA and then from matB */
1023: #if defined(PETSC_USE_CTABLE)
1024: PetscInt tt;
1025: #endif
1026: row = row - rstart;
1027: nzA = a_i[row+1] - a_i[row];
1028: nzB = b_i[row+1] - b_i[row];
1029: cworkA = a_j + a_i[row];
1030: cworkB = b_j + b_i[row];
1032: if (!allcolumns[i]) {
1033: #if defined(PETSC_USE_CTABLE)
1034: for (k=0; k<nzA; k++) {
1035: PetscTableFind(cmap_i,cstart+cworkA[k]+1,&tt);
1036: if (tt) lens_i[j]++;
1037: }
1038: for (k=0; k<nzB; k++) {
1039: PetscTableFind(cmap_i,bmap[cworkB[k]]+1,&tt);
1040: if (tt) lens_i[j]++;
1041: }
1043: #else
1044: for (k=0; k<nzA; k++) {
1045: if (cmap_i[cstart + cworkA[k]]) lens_i[j]++;
1046: }
1047: for (k=0; k<nzB; k++) {
1048: if (cmap_i[bmap[cworkB[k]]]) lens_i[j]++;
1049: }
1050: #endif
1051: } else { /* allcolumns */
1052: lens_i[j] = nzA + nzB;
1053: }
1054: }
1055: }
1056: }
1058: /* Create row map: global row of C -> local row of submatrices */
1059: for (i=0; i<ismax; i++) {
1060: if (!allrows[i]) {
1061: #if defined(PETSC_USE_CTABLE)
1062: PetscTableCreate(nrow[i]+1,c->Mbs+1,&rmap[i]);
1063: irow_i = irow[i];
1064: jmax = nrow[i];
1065: for (j=0; j<jmax; j++) {
1066: if (allrows[i]) {
1067: PetscTableAdd(rmap[i],j+1,j+1,INSERT_VALUES);
1068: } else {
1069: PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);
1070: }
1071: }
1072: #else
1073: PetscCalloc1(c->Mbs,&rmap[i]);
1074: rmap_i = rmap[i];
1075: irow_i = irow[i];
1076: jmax = nrow[i];
1077: for (j=0; j<jmax; j++) {
1078: if (allrows[i]) rmap_i[j] = j;
1079: else rmap_i[irow_i[j]] = j;
1080: }
1081: #endif
1082: } else rmap[i] = NULL;
1083: }
1085: /* Update lens from offproc data */
1086: {
1087: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1089: MPI_Waitall(nrqs,r_waits3,r_status3);
1090: for (tmp2=0; tmp2<nrqs; tmp2++) {
1091: sbuf1_i = sbuf1[pa[tmp2]];
1092: jmax = sbuf1_i[0];
1093: ct1 = 2*jmax+1;
1094: ct2 = 0;
1095: rbuf2_i = rbuf2[tmp2];
1096: rbuf3_i = rbuf3[tmp2];
1097: for (j=1; j<=jmax; j++) {
1098: is_no = sbuf1_i[2*j-1];
1099: max1 = sbuf1_i[2*j];
1100: lens_i = lens[is_no];
1101: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1102: rmap_i = rmap[is_no];
1103: for (k=0; k<max1; k++,ct1++) {
1104: if (allrows[is_no]) {
1105: row = sbuf1_i[ct1];
1106: } else {
1107: #if defined(PETSC_USE_CTABLE)
1108: PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
1109: row--;
1110: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1111: #else
1112: row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1113: #endif
1114: }
1115: max2 = rbuf2_i[ct1];
1116: for (l=0; l<max2; l++,ct2++) {
1117: if (!allcolumns[is_no]) {
1118: #if defined(PETSC_USE_CTABLE)
1119: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1120: #else
1121: tcol = cmap_i[rbuf3_i[ct2]];
1122: #endif
1123: if (tcol) lens_i[row]++;
1124: } else { /* allcolumns */
1125: lens_i[row]++; /* lens_i[row] += max2 ? */
1126: }
1127: }
1128: }
1129: }
1130: }
1131: }
1132: PetscFree(r_waits3);
1133: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1134: PetscFree2(r_status3,s_status3);
1135: PetscFree(s_waits3);
1137: /* Create the submatrices */
1138: for (i=0; i<ismax; i++) {
1139: PetscInt bs_tmp;
1140: if (ijonly) bs_tmp = 1;
1141: else bs_tmp = bs;
1143: MatCreate(PETSC_COMM_SELF,submats+i);
1144: MatSetSizes(submats[i],nrow[i]*bs_tmp,ncol[i]*bs_tmp,PETSC_DETERMINE,PETSC_DETERMINE);
1146: MatSetType(submats[i],((PetscObject)A)->type_name);
1147: MatSeqBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]);
1148: MatSeqSBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]); /* this subroutine is used by SBAIJ routines */
1150: /* create struct Mat_SubSppt and attached it to submat */
1151: PetscNew(&smat_i);
1152: subc = (Mat_SeqBAIJ*)submats[i]->data;
1153: subc->submatis1 = smat_i;
1155: smat_i->destroy = submats[i]->ops->destroy;
1156: submats[i]->ops->destroy = MatDestroySubMatrix_SeqBAIJ;
1157: submats[i]->factortype = C->factortype;
1159: smat_i->id = i;
1160: smat_i->nrqs = nrqs;
1161: smat_i->nrqr = nrqr;
1162: smat_i->rbuf1 = rbuf1;
1163: smat_i->rbuf2 = rbuf2;
1164: smat_i->rbuf3 = rbuf3;
1165: smat_i->sbuf2 = sbuf2;
1166: smat_i->req_source2 = req_source2;
1168: smat_i->sbuf1 = sbuf1;
1169: smat_i->ptr = ptr;
1170: smat_i->tmp = tmp;
1171: smat_i->ctr = ctr;
1173: smat_i->pa = pa;
1174: smat_i->req_size = req_size;
1175: smat_i->req_source1 = req_source1;
1177: smat_i->allcolumns = allcolumns[i];
1178: smat_i->allrows = allrows[i];
1179: smat_i->singleis = PETSC_FALSE;
1180: smat_i->row2proc = row2proc[i];
1181: smat_i->rmap = rmap[i];
1182: smat_i->cmap = cmap[i];
1183: }
1185: if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
1186: MatCreate(PETSC_COMM_SELF,&submats[0]);
1187: MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);
1188: MatSetType(submats[0],MATDUMMY);
1190: /* create struct Mat_SubSppt and attached it to submat */
1191: PetscNewLog(submats[0],&smat_i);
1192: submats[0]->data = (void*)smat_i;
1194: smat_i->destroy = submats[0]->ops->destroy;
1195: submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
1196: submats[0]->factortype = C->factortype;
1198: smat_i->id = 0;
1199: smat_i->nrqs = nrqs;
1200: smat_i->nrqr = nrqr;
1201: smat_i->rbuf1 = rbuf1;
1202: smat_i->rbuf2 = rbuf2;
1203: smat_i->rbuf3 = rbuf3;
1204: smat_i->sbuf2 = sbuf2;
1205: smat_i->req_source2 = req_source2;
1207: smat_i->sbuf1 = sbuf1;
1208: smat_i->ptr = ptr;
1209: smat_i->tmp = tmp;
1210: smat_i->ctr = ctr;
1212: smat_i->pa = pa;
1213: smat_i->req_size = req_size;
1214: smat_i->req_source1 = req_source1;
1216: smat_i->allcolumns = PETSC_FALSE;
1217: smat_i->singleis = PETSC_FALSE;
1218: smat_i->row2proc = NULL;
1219: smat_i->rmap = NULL;
1220: smat_i->cmap = NULL;
1221: }
1224: if (ismax) {PetscFree(lens[0]);}
1225: PetscFree(lens);
1226: PetscFree(sbuf_aj[0]);
1227: PetscFree(sbuf_aj);
1229: } /* endof scall == MAT_INITIAL_MATRIX */
1231: /* Post recv matrix values */
1232: if (!ijonly) {
1233: PetscObjectGetNewTag((PetscObject)C,&tag4);
1234: PetscMalloc1(nrqs+1,&rbuf4);
1235: PetscMalloc1(nrqs+1,&r_waits4);
1236: PetscMalloc1(nrqs+1,&r_status4);
1237: PetscMalloc1(nrqr+1,&s_status4);
1238: for (i=0; i<nrqs; ++i) {
1239: PetscMalloc1(rbuf2[i][0]*bs2,&rbuf4[i]);
1240: MPI_Irecv(rbuf4[i],rbuf2[i][0]*bs2,MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
1241: }
1243: /* Allocate sending buffers for a->a, and send them off */
1244: PetscMalloc1(nrqr+1,&sbuf_aa);
1245: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1247: PetscMalloc1((j+1)*bs2,&sbuf_aa[0]);
1248: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
1250: PetscMalloc1(nrqr+1,&s_waits4);
1252: for (i=0; i<nrqr; i++) {
1253: rbuf1_i = rbuf1[i];
1254: sbuf_aa_i = sbuf_aa[i];
1255: ct1 = 2*rbuf1_i[0]+1;
1256: ct2 = 0;
1257: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1258: kmax = rbuf1_i[2*j];
1259: for (k=0; k<kmax; k++,ct1++) {
1260: row = rbuf1_i[ct1] - rstart;
1261: nzA = a_i[row+1] - a_i[row];
1262: nzB = b_i[row+1] - b_i[row];
1263: ncols = nzA + nzB;
1264: cworkB = b_j + b_i[row];
1265: vworkA = a_a + a_i[row]*bs2;
1266: vworkB = b_a + b_i[row]*bs2;
1268: /* load the column values for this row into vals*/
1269: vals = sbuf_aa_i+ct2*bs2;
1270: for (l=0; l<nzB; l++) {
1271: if ((bmap[cworkB[l]]) < cstart) {
1272: PetscArraycpy(vals+l*bs2,vworkB+l*bs2,bs2);
1273: } else break;
1274: }
1275: imark = l;
1276: for (l=0; l<nzA; l++) {
1277: PetscArraycpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2);
1278: }
1279: for (l=imark; l<nzB; l++) {
1280: PetscArraycpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2);
1281: }
1283: ct2 += ncols;
1284: }
1285: }
1286: MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
1287: }
1288: }
1290: /* Assemble the matrices */
1291: /* First assemble the local rows */
1292: for (i=0; i<ismax; i++) {
1293: row2proc_i = row2proc[i];
1294: subc = (Mat_SeqBAIJ*)submats[i]->data;
1295: imat_ilen = subc->ilen;
1296: imat_j = subc->j;
1297: imat_i = subc->i;
1298: imat_a = subc->a;
1300: if (!allcolumns[i]) cmap_i = cmap[i];
1301: rmap_i = rmap[i];
1302: irow_i = irow[i];
1303: jmax = nrow[i];
1304: for (j=0; j<jmax; j++) {
1305: if (allrows[i]) row = j;
1306: else row = irow_i[j];
1307: proc = row2proc_i[j];
1309: if (proc == rank) {
1311: row = row - rstart;
1312: nzA = a_i[row+1] - a_i[row];
1313: nzB = b_i[row+1] - b_i[row];
1314: cworkA = a_j + a_i[row];
1315: cworkB = b_j + b_i[row];
1316: if (!ijonly) {
1317: vworkA = a_a + a_i[row]*bs2;
1318: vworkB = b_a + b_i[row]*bs2;
1319: }
1321: if (allrows[i]) {
1322: row = row+rstart;
1323: } else {
1324: #if defined(PETSC_USE_CTABLE)
1325: PetscTableFind(rmap_i,row+rstart+1,&row);
1326: row--;
1328: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1329: #else
1330: row = rmap_i[row + rstart];
1331: #endif
1332: }
1333: mat_i = imat_i[row];
1334: if (!ijonly) mat_a = imat_a + mat_i*bs2;
1335: mat_j = imat_j + mat_i;
1336: ilen = imat_ilen[row];
1338: /* load the column indices for this row into cols*/
1339: if (!allcolumns[i]) {
1340: for (l=0; l<nzB; l++) {
1341: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1342: #if defined(PETSC_USE_CTABLE)
1343: PetscTableFind(cmap_i,ctmp+1,&tcol);
1344: if (tcol) {
1345: #else
1346: if ((tcol = cmap_i[ctmp])) {
1347: #endif
1348: *mat_j++ = tcol - 1;
1349: PetscArraycpy(mat_a,vworkB+l*bs2,bs2);
1350: mat_a += bs2;
1351: ilen++;
1352: }
1353: } else break;
1354: }
1355: imark = l;
1356: for (l=0; l<nzA; l++) {
1357: #if defined(PETSC_USE_CTABLE)
1358: PetscTableFind(cmap_i,cstart+cworkA[l]+1,&tcol);
1359: if (tcol) {
1360: #else
1361: if ((tcol = cmap_i[cstart + cworkA[l]])) {
1362: #endif
1363: *mat_j++ = tcol - 1;
1364: if (!ijonly) {
1365: PetscArraycpy(mat_a,vworkA+l*bs2,bs2);
1366: mat_a += bs2;
1367: }
1368: ilen++;
1369: }
1370: }
1371: for (l=imark; l<nzB; l++) {
1372: #if defined(PETSC_USE_CTABLE)
1373: PetscTableFind(cmap_i,bmap[cworkB[l]]+1,&tcol);
1374: if (tcol) {
1375: #else
1376: if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1377: #endif
1378: *mat_j++ = tcol - 1;
1379: if (!ijonly) {
1380: PetscArraycpy(mat_a,vworkB+l*bs2,bs2);
1381: mat_a += bs2;
1382: }
1383: ilen++;
1384: }
1385: }
1386: } else { /* allcolumns */
1387: for (l=0; l<nzB; l++) {
1388: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1389: *mat_j++ = ctmp;
1390: PetscArraycpy(mat_a,vworkB+l*bs2,bs2);
1391: mat_a += bs2;
1392: ilen++;
1393: } else break;
1394: }
1395: imark = l;
1396: for (l=0; l<nzA; l++) {
1397: *mat_j++ = cstart+cworkA[l];
1398: if (!ijonly) {
1399: PetscArraycpy(mat_a,vworkA+l*bs2,bs2);
1400: mat_a += bs2;
1401: }
1402: ilen++;
1403: }
1404: for (l=imark; l<nzB; l++) {
1405: *mat_j++ = bmap[cworkB[l]];
1406: if (!ijonly) {
1407: PetscArraycpy(mat_a,vworkB+l*bs2,bs2);
1408: mat_a += bs2;
1409: }
1410: ilen++;
1411: }
1412: }
1413: imat_ilen[row] = ilen;
1414: }
1415: }
1416: }
1418: /* Now assemble the off proc rows */
1419: if (!ijonly) {
1420: MPI_Waitall(nrqs,r_waits4,r_status4);
1421: }
1422: for (tmp2=0; tmp2<nrqs; tmp2++) {
1423: sbuf1_i = sbuf1[pa[tmp2]];
1424: jmax = sbuf1_i[0];
1425: ct1 = 2*jmax + 1;
1426: ct2 = 0;
1427: rbuf2_i = rbuf2[tmp2];
1428: rbuf3_i = rbuf3[tmp2];
1429: if (!ijonly) rbuf4_i = rbuf4[tmp2];
1430: for (j=1; j<=jmax; j++) {
1431: is_no = sbuf1_i[2*j-1];
1432: rmap_i = rmap[is_no];
1433: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1434: subc = (Mat_SeqBAIJ*)submats[is_no]->data;
1435: imat_ilen = subc->ilen;
1436: imat_j = subc->j;
1437: imat_i = subc->i;
1438: if (!ijonly) imat_a = subc->a;
1439: max1 = sbuf1_i[2*j];
1440: for (k=0; k<max1; k++,ct1++) { /* for each recved block row */
1441: row = sbuf1_i[ct1];
1443: if (allrows[is_no]) {
1444: row = sbuf1_i[ct1];
1445: } else {
1446: #if defined(PETSC_USE_CTABLE)
1447: PetscTableFind(rmap_i,row+1,&row);
1448: row--;
1449: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1450: #else
1451: row = rmap_i[row];
1452: #endif
1453: }
1454: ilen = imat_ilen[row];
1455: mat_i = imat_i[row];
1456: if (!ijonly) mat_a = imat_a + mat_i*bs2;
1457: mat_j = imat_j + mat_i;
1458: max2 = rbuf2_i[ct1];
1459: if (!allcolumns[is_no]) {
1460: for (l=0; l<max2; l++,ct2++) {
1461: #if defined(PETSC_USE_CTABLE)
1462: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1463: #else
1464: tcol = cmap_i[rbuf3_i[ct2]];
1465: #endif
1466: if (tcol) {
1467: *mat_j++ = tcol - 1;
1468: if (!ijonly) {
1469: PetscArraycpy(mat_a,rbuf4_i+ct2*bs2,bs2);
1470: mat_a += bs2;
1471: }
1472: ilen++;
1473: }
1474: }
1475: } else { /* allcolumns */
1476: for (l=0; l<max2; l++,ct2++) {
1477: *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1478: if (!ijonly) {
1479: PetscArraycpy(mat_a,rbuf4_i+ct2*bs2,bs2);
1480: mat_a += bs2;
1481: }
1482: ilen++;
1483: }
1484: }
1485: imat_ilen[row] = ilen;
1486: }
1487: }
1488: }
1490: if (!iscsorted) { /* sort column indices of the rows */
1491: MatScalar *work;
1493: PetscMalloc1(bs2,&work);
1494: for (i=0; i<ismax; i++) {
1495: subc = (Mat_SeqBAIJ*)submats[i]->data;
1496: imat_ilen = subc->ilen;
1497: imat_j = subc->j;
1498: imat_i = subc->i;
1499: if (!ijonly) imat_a = subc->a;
1500: if (allcolumns[i]) continue;
1502: jmax = nrow[i];
1503: for (j=0; j<jmax; j++) {
1504: mat_i = imat_i[j];
1505: mat_j = imat_j + mat_i;
1506: ilen = imat_ilen[j];
1507: if (ijonly) {
1508: PetscSortInt(ilen,mat_j);
1509: } else {
1510: mat_a = imat_a + mat_i*bs2;
1511: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
1512: }
1513: }
1514: }
1515: PetscFree(work);
1516: }
1518: if (!ijonly) {
1519: PetscFree(r_status4);
1520: PetscFree(r_waits4);
1521: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1522: PetscFree(s_waits4);
1523: PetscFree(s_status4);
1524: }
1526: /* Restore the indices */
1527: for (i=0; i<ismax; i++) {
1528: if (!allrows[i]) {
1529: ISRestoreIndices(isrow[i],irow+i);
1530: }
1531: if (!allcolumns[i]) {
1532: ISRestoreIndices(iscol[i],icol+i);
1533: }
1534: }
1536: for (i=0; i<ismax; i++) {
1537: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1538: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1539: }
1541: PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);
1542: PetscFree5(row2proc,cmap,rmap,allcolumns,allrows);
1544: if (!ijonly) {
1545: PetscFree(sbuf_aa[0]);
1546: PetscFree(sbuf_aa);
1548: for (i=0; i<nrqs; ++i) {
1549: PetscFree(rbuf4[i]);
1550: }
1551: PetscFree(rbuf4);
1552: }
1553: c->ijonly = PETSC_FALSE; /* set back to the default */
1554: return(0);
1555: }