Actual source code: baijov.c
petsc-3.9.4 2018-09-11
2: /*
3: Routines to compute overlapping regions of a parallel MPI matrix
4: and to find submatrices that were shared across processors.
5: */
6: #include <../src/mat/impls/baij/mpi/mpibaij.h>
7: #include <petscbt.h>
9: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
10: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
11: extern PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
12: extern PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14: PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
15: {
17: PetscInt i,N=C->cmap->N, bs=C->rmap->bs;
18: IS *is_new;
21: PetscMalloc1(imax,&is_new);
22: /* Convert the indices into block format */
23: ISCompressIndicesGeneral(N,C->rmap->n,bs,imax,is,is_new);
24: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");
25: for (i=0; i<ov; ++i) {
26: MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);
27: }
28: for (i=0; i<imax; i++) {ISDestroy(&is[i]);}
29: ISExpandIndicesGeneral(N,N,bs,imax,is_new,is);
30: for (i=0; i<imax; i++) {ISDestroy(&is_new[i]);}
31: PetscFree(is_new);
32: return(0);
33: }
35: /*
36: Sample message format:
37: If a processor A wants processor B to process some elements corresponding
38: to index sets is[1], is[5]
39: mesg [0] = 2 (no of index sets in the mesg)
40: -----------
41: mesg [1] = 1 => is[1]
42: mesg [2] = sizeof(is[1]);
43: -----------
44: mesg [5] = 5 => is[5]
45: mesg [6] = sizeof(is[5]);
46: -----------
47: mesg [7]
48: mesg [n] data(is[1])
49: -----------
50: mesg[n+1]
51: mesg[m] data(is[5])
52: -----------
54: Notes:
55: nrqs - no of requests sent (or to be sent out)
56: nrqr - no of requests recieved (which have to be or which have been processed
57: */
58: PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
59: {
60: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
61: const PetscInt **idx,*idx_i;
62: PetscInt *n,*w3,*w4,**data,len;
64: PetscMPIInt size,rank,tag1,tag2,*w2,*w1,nrqr;
65: PetscInt Mbs,i,j,k,**rbuf,row,proc=-1,nrqs,msz,**outdat,**ptr;
66: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
67: PetscMPIInt *onodes1,*olengths1,*onodes2,*olengths2;
68: PetscBT *table;
69: MPI_Comm comm,*iscomms;
70: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
71: MPI_Status *s_status,*recv_status;
72: char *t_p;
75: PetscObjectGetComm((PetscObject)C,&comm);
76: size = c->size;
77: rank = c->rank;
78: Mbs = c->Mbs;
80: PetscObjectGetNewTag((PetscObject)C,&tag1);
81: PetscObjectGetNewTag((PetscObject)C,&tag2);
83: PetscMalloc2(imax+1,(PetscInt***)&idx,imax,&n);
85: for (i=0; i<imax; i++) {
86: ISGetIndices(is[i],&idx[i]);
87: ISGetLocalSize(is[i],&n[i]);
88: }
90: /* evaluate communication - mesg to who,length of mesg, and buffer space
91: required. Based on this, buffers are allocated, and data copied into them*/
92: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
93: for (i=0; i<imax; i++) {
94: PetscMemzero(w4,size*sizeof(PetscInt)); /* initialise work vector*/
95: idx_i = idx[i];
96: len = n[i];
97: for (j=0; j<len; j++) {
98: row = idx_i[j];
99: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
100: PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);
101: w4[proc]++;
102: }
103: for (j=0; j<size; j++) {
104: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
105: }
106: }
108: nrqs = 0; /* no of outgoing messages */
109: msz = 0; /* total mesg length (for all proc */
110: w1[rank] = 0; /* no mesg sent to itself */
111: w3[rank] = 0;
112: for (i=0; i<size; i++) {
113: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
114: }
115: /* pa - is list of processors to communicate with */
116: PetscMalloc1(nrqs+1,&pa);
117: for (i=0,j=0; i<size; i++) {
118: if (w1[i]) {pa[j] = i; j++;}
119: }
121: /* Each message would have a header = 1 + 2*(no of IS) + data */
122: for (i=0; i<nrqs; i++) {
123: j = pa[i];
124: w1[j] += w2[j] + 2*w3[j];
125: msz += w1[j];
126: }
128: /* Determine the number of messages to expect, their lengths, from from-ids */
129: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
130: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
132: /* Now post the Irecvs corresponding to these messages */
133: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
135: /* Allocate Memory for outgoing messages */
136: PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
137: PetscMemzero(outdat,size*sizeof(PetscInt*));
138: PetscMemzero(ptr,size*sizeof(PetscInt*));
139: {
140: PetscInt *iptr = tmp,ict = 0;
141: for (i=0; i<nrqs; i++) {
142: j = pa[i];
143: iptr += ict;
144: outdat[j] = iptr;
145: ict = w1[j];
146: }
147: }
149: /* Form the outgoing messages */
150: /*plug in the headers*/
151: for (i=0; i<nrqs; i++) {
152: j = pa[i];
153: outdat[j][0] = 0;
154: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
155: ptr[j] = outdat[j] + 2*w3[j] + 1;
156: }
158: /* Memory for doing local proc's work*/
159: {
160: PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mbs*imax,&d_p, (Mbs/PETSC_BITS_PER_BYTE+1)*imax,&t_p);
162: for (i=0; i<imax; i++) {
163: table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
164: data[i] = d_p + (Mbs)*i;
165: }
166: }
168: /* Parse the IS and update local tables and the outgoing buf with the data*/
169: {
170: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
171: PetscBT table_i;
173: for (i=0; i<imax; i++) {
174: PetscMemzero(ctr,size*sizeof(PetscInt));
175: n_i = n[i];
176: table_i = table[i];
177: idx_i = idx[i];
178: data_i = data[i];
179: isz_i = isz[i];
180: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
181: row = idx_i[j];
182: PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);
183: if (proc != rank) { /* copy to the outgoing buffer */
184: ctr[proc]++;
185: *ptr[proc] = row;
186: ptr[proc]++;
187: } else { /* Update the local table */
188: if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
189: }
190: }
191: /* Update the headers for the current IS */
192: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
193: if ((ctr_j = ctr[j])) {
194: outdat_j = outdat[j];
195: k = ++outdat_j[0];
196: outdat_j[2*k] = ctr_j;
197: outdat_j[2*k-1] = i;
198: }
199: }
200: isz[i] = isz_i;
201: }
202: }
204: /* Now post the sends */
205: PetscMalloc1(nrqs+1,&s_waits1);
206: for (i=0; i<nrqs; ++i) {
207: j = pa[i];
208: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
209: }
211: /* No longer need the original indices*/
212: for (i=0; i<imax; ++i) {
213: ISRestoreIndices(is[i],idx+i);
214: }
215: PetscFree2(*(PetscInt***)&idx,n);
217: PetscMalloc1(imax,&iscomms);
218: for (i=0; i<imax; ++i) {
219: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);
220: ISDestroy(&is[i]);
221: }
223: /* Do Local work*/
224: MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);
226: /* Receive messages*/
227: PetscMalloc1(nrqr+1,&recv_status);
228: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
230: PetscMalloc1(nrqs+1,&s_status);
231: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
233: /* Phase 1 sends are complete - deallocate buffers */
234: PetscFree4(outdat,ptr,tmp,ctr);
235: PetscFree4(w1,w2,w3,w4);
237: PetscMalloc1(nrqr+1,&xdata);
238: PetscMalloc1(nrqr+1,&isz1);
239: MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
240: PetscFree(rbuf[0]);
241: PetscFree(rbuf);
243: /* Send the data back*/
244: /* Do a global reduction to know the buffer space req for incoming messages*/
245: {
246: PetscMPIInt *rw1;
248: PetscCalloc1(size,&rw1);
250: for (i=0; i<nrqr; ++i) {
251: proc = recv_status[i].MPI_SOURCE;
252: if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
253: rw1[proc] = isz1[i];
254: }
256: PetscFree(onodes1);
257: PetscFree(olengths1);
259: /* Determine the number of messages to expect, their lengths, from from-ids */
260: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
261: PetscFree(rw1);
262: }
263: /* Now post the Irecvs corresponding to these messages */
264: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
266: /* Now post the sends */
267: PetscMalloc1(nrqr+1,&s_waits2);
268: for (i=0; i<nrqr; ++i) {
269: j = recv_status[i].MPI_SOURCE;
270: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
271: }
273: /* receive work done on other processors*/
274: {
275: PetscMPIInt idex;
276: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
277: PetscBT table_i;
278: MPI_Status *status2;
280: PetscMalloc1(PetscMax(nrqr,nrqs)+1,&status2);
281: for (i=0; i<nrqs; ++i) {
282: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
283: /* Process the message*/
284: rbuf2_i = rbuf2[idex];
285: ct1 = 2*rbuf2_i[0]+1;
286: jmax = rbuf2[idex][0];
287: for (j=1; j<=jmax; j++) {
288: max = rbuf2_i[2*j];
289: is_no = rbuf2_i[2*j-1];
290: isz_i = isz[is_no];
291: data_i = data[is_no];
292: table_i = table[is_no];
293: for (k=0; k<max; k++,ct1++) {
294: row = rbuf2_i[ct1];
295: if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
296: }
297: isz[is_no] = isz_i;
298: }
299: }
300: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
301: PetscFree(status2);
302: }
304: for (i=0; i<imax; ++i) {
305: ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);
306: PetscCommDestroy(&iscomms[i]);
307: }
309: PetscFree(iscomms);
310: PetscFree(onodes2);
311: PetscFree(olengths2);
313: PetscFree(pa);
314: PetscFree(rbuf2[0]);
315: PetscFree(rbuf2);
316: PetscFree(s_waits1);
317: PetscFree(r_waits1);
318: PetscFree(s_waits2);
319: PetscFree(r_waits2);
320: PetscFree5(table,data,isz,d_p,t_p);
321: PetscFree(s_status);
322: PetscFree(recv_status);
323: PetscFree(xdata[0]);
324: PetscFree(xdata);
325: PetscFree(isz1);
326: return(0);
327: }
329: /*
330: MatIncreaseOverlap_MPIBAIJ_Local - Called by MatincreaseOverlap, to do
331: the work on the local processor.
333: Inputs:
334: C - MAT_MPIBAIJ;
335: imax - total no of index sets processed at a time;
336: table - an array of char - size = Mbs bits.
338: Output:
339: isz - array containing the count of the solution elements corresponding
340: to each index set;
341: data - pointer to the solutions
342: */
343: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
344: {
345: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
346: Mat A = c->A,B = c->B;
347: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
348: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
349: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
350: PetscBT table_i;
353: rstart = c->rstartbs;
354: cstart = c->cstartbs;
355: ai = a->i;
356: aj = a->j;
357: bi = b->i;
358: bj = b->j;
359: garray = c->garray;
362: for (i=0; i<imax; i++) {
363: data_i = data[i];
364: table_i = table[i];
365: isz_i = isz[i];
366: for (j=0,max=isz[i]; j<max; j++) {
367: row = data_i[j] - rstart;
368: start = ai[row];
369: end = ai[row+1];
370: for (k=start; k<end; k++) { /* Amat */
371: val = aj[k] + cstart;
372: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
373: }
374: start = bi[row];
375: end = bi[row+1];
376: for (k=start; k<end; k++) { /* Bmat */
377: val = garray[bj[k]];
378: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
379: }
380: }
381: isz[i] = isz_i;
382: }
383: return(0);
384: }
385: /*
386: MatIncreaseOverlap_MPIBAIJ_Receive - Process the recieved messages,
387: and return the output
389: Input:
390: C - the matrix
391: nrqr - no of messages being processed.
392: rbuf - an array of pointers to the recieved requests
394: Output:
395: xdata - array of messages to be sent back
396: isz1 - size of each message
398: For better efficiency perhaps we should malloc separately each xdata[i],
399: then if a remalloc is required we need only copy the data for that one row
400: rather than all previous rows as it is now where a single large chunck of
401: memory is used.
403: */
404: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
405: {
406: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
407: Mat A = c->A,B = c->B;
408: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
410: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
411: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
412: PetscInt val,max1,max2,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
413: PetscInt *rbuf_i,kmax,rbuf_0;
414: PetscBT xtable;
417: Mbs = c->Mbs;
418: rstart = c->rstartbs;
419: cstart = c->cstartbs;
420: ai = a->i;
421: aj = a->j;
422: bi = b->i;
423: bj = b->j;
424: garray = c->garray;
427: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
428: rbuf_i = rbuf[i];
429: rbuf_0 = rbuf_i[0];
430: ct += rbuf_0;
431: for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
432: }
434: if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
435: else max1 = 1;
436: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
437: PetscMalloc1(mem_estimate,&xdata[0]);
438: ++no_malloc;
439: PetscBTCreate(Mbs,&xtable);
440: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
442: ct3 = 0;
443: for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
444: rbuf_i = rbuf[i];
445: rbuf_0 = rbuf_i[0];
446: ct1 = 2*rbuf_0+1;
447: ct2 = ct1;
448: ct3 += ct1;
449: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
450: PetscBTMemzero(Mbs,xtable);
451: oct2 = ct2;
452: kmax = rbuf_i[2*j];
453: for (k=0; k<kmax; k++,ct1++) {
454: row = rbuf_i[ct1];
455: if (!PetscBTLookupSet(xtable,row)) {
456: if (!(ct3 < mem_estimate)) {
457: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
458: PetscMalloc1(new_estimate,&tmp);
459: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
460: PetscFree(xdata[0]);
461: xdata[0] = tmp;
462: mem_estimate = new_estimate; ++no_malloc;
463: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
464: }
465: xdata[i][ct2++] = row;
466: ct3++;
467: }
468: }
469: for (k=oct2,max2=ct2; k<max2; k++) {
470: row = xdata[i][k] - rstart;
471: start = ai[row];
472: end = ai[row+1];
473: for (l=start; l<end; l++) {
474: val = aj[l] + cstart;
475: if (!PetscBTLookupSet(xtable,val)) {
476: if (!(ct3 < mem_estimate)) {
477: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
478: PetscMalloc1(new_estimate,&tmp);
479: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
480: PetscFree(xdata[0]);
481: xdata[0] = tmp;
482: mem_estimate = new_estimate; ++no_malloc;
483: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
484: }
485: xdata[i][ct2++] = val;
486: ct3++;
487: }
488: }
489: start = bi[row];
490: end = bi[row+1];
491: for (l=start; l<end; l++) {
492: val = garray[bj[l]];
493: if (!PetscBTLookupSet(xtable,val)) {
494: if (!(ct3 < mem_estimate)) {
495: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
496: PetscMalloc1(new_estimate,&tmp);
497: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
498: PetscFree(xdata[0]);
499: xdata[0] = tmp;
500: mem_estimate = new_estimate; ++no_malloc;
501: for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
502: }
503: xdata[i][ct2++] = val;
504: ct3++;
505: }
506: }
507: }
508: /* Update the header*/
509: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
510: xdata[i][2*j-1] = rbuf_i[2*j-1];
511: }
512: xdata[i][0] = rbuf_0;
513: xdata[i+1] = xdata[i] + ct2;
514: isz1[i] = ct2; /* size of each message */
515: }
516: PetscBTDestroy(&xtable);
517: PetscInfo3(C,"Allocated %D bytes, required %D, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
518: return(0);
519: }
521: PetscErrorCode MatCreateSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
522: {
523: IS *isrow_block,*iscol_block;
524: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
526: PetscInt nmax,nstages_local,nstages,i,pos,max_no,N=C->cmap->N,bs=C->rmap->bs;
527: Mat_SeqBAIJ *subc;
528: Mat_SubSppt *smat;
531: /* The compression and expansion should be avoided. Doesn't point
532: out errors, might change the indices, hence buggey */
533: PetscMalloc2(ismax+1,&isrow_block,ismax+1,&iscol_block);
534: ISCompressIndicesGeneral(N,C->rmap->n,bs,ismax,isrow,isrow_block);
535: ISCompressIndicesGeneral(N,C->cmap->n,bs,ismax,iscol,iscol_block);
537: /* Determine the number of stages through which submatrices are done */
538: if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
539: else nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt));
540: if (!nmax) nmax = 1;
542: if (scall == MAT_INITIAL_MATRIX) {
543: nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
545: /* Make sure every processor loops through the nstages */
546: MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
548: /* Allocate memory to hold all the submatrices and dummy submatrices */
549: PetscCalloc1(ismax+nstages,submat);
550: } else { /* MAT_REUSE_MATRIX */
551: if (ismax) {
552: subc = (Mat_SeqBAIJ*)((*submat)[0]->data);
553: smat = subc->submatis1;
554: } else { /* (*submat)[0] is a dummy matrix */
555: smat = (Mat_SubSppt*)(*submat)[0]->data;
556: }
557: if (!smat) {
558: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"MatCreateSubMatrices(...,MAT_REUSE_MATRIX,...) requires submat");
559: }
560: nstages = smat->nstages;
561: }
563: for (i=0,pos=0; i<nstages; i++) {
564: if (pos+nmax <= ismax) max_no = nmax;
565: else if (pos == ismax) max_no = 0;
566: else max_no = ismax-pos;
568: MatCreateSubMatrices_MPIBAIJ_local(C,max_no,isrow_block+pos,iscol_block+pos,scall,*submat+pos);
569: if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
570: smat = (Mat_SubSppt*)(*submat)[pos]->data;
571: smat->nstages = nstages;
572: }
573: pos += max_no;
574: }
576: if (scall == MAT_INITIAL_MATRIX && ismax) {
577: /* save nstages for reuse */
578: subc = (Mat_SeqBAIJ*)((*submat)[0]->data);
579: smat = subc->submatis1;
580: smat->nstages = nstages;
581: }
583: for (i=0; i<ismax; i++) {
584: ISDestroy(&isrow_block[i]);
585: ISDestroy(&iscol_block[i]);
586: }
587: PetscFree2(isrow_block,iscol_block);
588: return(0);
589: }
591: #if defined(PETSC_USE_CTABLE)
592: PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
593: {
594: PetscInt nGlobalNd = proc_gnode[size];
595: PetscMPIInt fproc;
599: PetscMPIIntCast((PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)),&fproc);
600: if (fproc > size) fproc = size;
601: while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
602: if (row < proc_gnode[fproc]) fproc--;
603: else fproc++;
604: }
605: *rank = fproc;
606: return(0);
607: }
608: #endif
610: /* -------------------------------------------------------------------------*/
611: /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
612: PetscErrorCode MatCreateSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
613: {
614: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
615: Mat A = c->A;
616: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*subc;
617: const PetscInt **icol,**irow;
618: PetscInt *nrow,*ncol,start;
620: PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
621: PetscInt **sbuf1,**sbuf2,*sbuf2_i,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
622: PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
623: PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
624: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
625: #if defined(PETSC_USE_CTABLE)
626: PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
627: #else
628: PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
629: #endif
630: const PetscInt *irow_i,*icol_i;
631: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
632: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
633: MPI_Request *r_waits4,*s_waits3,*s_waits4;
634: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
635: MPI_Status *r_status3,*r_status4,*s_status4;
636: MPI_Comm comm;
637: PetscScalar **rbuf4,*rbuf4_i=NULL,**sbuf_aa,*vals,*mat_a=NULL,*imat_a,*sbuf_aa_i;
638: PetscMPIInt *onodes1,*olengths1,end;
639: PetscInt **row2proc,*row2proc_i,*imat_ilen,*imat_j,*imat_i;
640: Mat_SubSppt *smat_i;
641: PetscBool *issorted,colflag,iscsorted=PETSC_TRUE;
642: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
643: PetscInt bs=C->rmap->bs,bs2=c->bs2,rstart = c->rstartbs;
644: PetscBool ijonly=c->ijonly; /* private flag indicates only matrix data structures are requested */
645: PetscInt nzA,nzB,*a_i=a->i,*b_i=b->i,*a_j = a->j,*b_j = b->j,ctmp,imark,*cworkA,*cworkB;
646: PetscScalar *vworkA=NULL,*vworkB=NULL,*a_a = a->a,*b_a = b->a;
647: PetscInt cstart = c->cstartbs,*bmap = c->garray;
648: PetscBool *allrows,*allcolumns;
651: PetscObjectGetComm((PetscObject)C,&comm);
652: size = c->size;
653: rank = c->rank;
655: PetscMalloc5(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns,ismax,&allrows);
656: PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);
658: for (i=0; i<ismax; i++) {
659: ISSorted(iscol[i],&issorted[i]);
660: if (!issorted[i]) iscsorted = issorted[i]; /* columns are not sorted! */
661: ISSorted(isrow[i],&issorted[i]);
663: /* Check for special case: allcolumns */
664: ISIdentity(iscol[i],&colflag);
665: ISGetLocalSize(iscol[i],&ncol[i]);
667: if (colflag && ncol[i] == c->Nbs) {
668: allcolumns[i] = PETSC_TRUE;
669: icol[i] = NULL;
670: } else {
671: allcolumns[i] = PETSC_FALSE;
672: ISGetIndices(iscol[i],&icol[i]);
673: }
675: /* Check for special case: allrows */
676: ISIdentity(isrow[i],&colflag);
677: ISGetLocalSize(isrow[i],&nrow[i]);
678: if (colflag && nrow[i] == c->Mbs) {
679: allrows[i] = PETSC_TRUE;
680: irow[i] = NULL;
681: } else {
682: allrows[i] = PETSC_FALSE;
683: ISGetIndices(isrow[i],&irow[i]);
684: }
685: }
687: if (scall == MAT_REUSE_MATRIX) {
688: /* Assumes new rows are same length as the old rows */
689: for (i=0; i<ismax; i++) {
690: subc = (Mat_SeqBAIJ*)(submats[i]->data);
691: if (subc->mbs != nrow[i] || subc->nbs != ncol[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
693: /* Initial matrix as if empty */
694: PetscMemzero(subc->ilen,subc->mbs*sizeof(PetscInt));
696: /* Initial matrix as if empty */
697: submats[i]->factortype = C->factortype;
699: smat_i = subc->submatis1;
701: nrqs = smat_i->nrqs;
702: nrqr = smat_i->nrqr;
703: rbuf1 = smat_i->rbuf1;
704: rbuf2 = smat_i->rbuf2;
705: rbuf3 = smat_i->rbuf3;
706: req_source2 = smat_i->req_source2;
708: sbuf1 = smat_i->sbuf1;
709: sbuf2 = smat_i->sbuf2;
710: ptr = smat_i->ptr;
711: tmp = smat_i->tmp;
712: ctr = smat_i->ctr;
714: pa = smat_i->pa;
715: req_size = smat_i->req_size;
716: req_source1 = smat_i->req_source1;
718: allcolumns[i] = smat_i->allcolumns;
719: allrows[i] = smat_i->allrows;
720: row2proc[i] = smat_i->row2proc;
721: rmap[i] = smat_i->rmap;
722: cmap[i] = smat_i->cmap;
723: }
725: if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
726: if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
727: smat_i = (Mat_SubSppt*)submats[0]->data;
729: nrqs = smat_i->nrqs;
730: nrqr = smat_i->nrqr;
731: rbuf1 = smat_i->rbuf1;
732: rbuf2 = smat_i->rbuf2;
733: rbuf3 = smat_i->rbuf3;
734: req_source2 = smat_i->req_source2;
736: sbuf1 = smat_i->sbuf1;
737: sbuf2 = smat_i->sbuf2;
738: ptr = smat_i->ptr;
739: tmp = smat_i->tmp;
740: ctr = smat_i->ctr;
742: pa = smat_i->pa;
743: req_size = smat_i->req_size;
744: req_source1 = smat_i->req_source1;
746: allcolumns[0] = PETSC_FALSE;
747: }
748: } else { /* scall == MAT_INITIAL_MATRIX */
749: /* Get some new tags to keep the communication clean */
750: PetscObjectGetNewTag((PetscObject)C,&tag2);
751: PetscObjectGetNewTag((PetscObject)C,&tag3);
753: /* evaluate communication - mesg to who, length of mesg, and buffer space
754: required. Based on this, buffers are allocated, and data copied into them*/
755: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4); /* mesg size, initialize work vectors */
757: for (i=0; i<ismax; i++) {
758: jmax = nrow[i];
759: irow_i = irow[i];
761: PetscMalloc1(jmax,&row2proc_i);
762: row2proc[i] = row2proc_i;
764: if (issorted[i]) proc = 0;
765: for (j=0; j<jmax; j++) {
766: if (!issorted[i]) proc = 0;
767: if (allrows[i]) row = j;
768: else row = irow_i[j];
770: while (row >= c->rangebs[proc+1]) proc++;
771: w4[proc]++;
772: row2proc_i[j] = proc; /* map row index to proc */
773: }
774: for (j=0; j<size; j++) {
775: if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}
776: }
777: }
779: nrqs = 0; /* no of outgoing messages */
780: msz = 0; /* total mesg length (for all procs) */
781: w1[rank] = 0; /* no mesg sent to self */
782: w3[rank] = 0;
783: for (i=0; i<size; i++) {
784: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
785: }
786: PetscMalloc1(nrqs+1,&pa); /*(proc -array)*/
787: for (i=0,j=0; i<size; i++) {
788: if (w1[i]) { pa[j] = i; j++; }
789: }
791: /* Each message would have a header = 1 + 2*(no of IS) + data */
792: for (i=0; i<nrqs; i++) {
793: j = pa[i];
794: w1[j] += w2[j] + 2* w3[j];
795: msz += w1[j];
796: }
797: PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);
799: /* Determine the number of messages to expect, their lengths, from from-ids */
800: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
801: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
803: /* Now post the Irecvs corresponding to these messages */
804: tag0 = ((PetscObject)C)->tag;
805: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
807: PetscFree(onodes1);
808: PetscFree(olengths1);
810: /* Allocate Memory for outgoing messages */
811: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
812: PetscMemzero(sbuf1,size*sizeof(PetscInt*));
813: PetscMemzero(ptr,size*sizeof(PetscInt*));
815: {
816: PetscInt *iptr = tmp;
817: k = 0;
818: for (i=0; i<nrqs; i++) {
819: j = pa[i];
820: iptr += k;
821: sbuf1[j] = iptr;
822: k = w1[j];
823: }
824: }
826: /* Form the outgoing messages. Initialize the header space */
827: for (i=0; i<nrqs; i++) {
828: j = pa[i];
829: sbuf1[j][0] = 0;
830: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
831: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
832: }
834: /* Parse the isrow and copy data into outbuf */
835: for (i=0; i<ismax; i++) {
836: row2proc_i = row2proc[i];
837: PetscMemzero(ctr,size*sizeof(PetscInt));
838: irow_i = irow[i];
839: jmax = nrow[i];
840: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
841: proc = row2proc_i[j];
842: if (allrows[i]) row = j;
843: else row = irow_i[j];
845: if (proc != rank) { /* copy to the outgoing buf*/
846: ctr[proc]++;
847: *ptr[proc] = row;
848: ptr[proc]++;
849: }
850: }
851: /* Update the headers for the current IS */
852: for (j=0; j<size; j++) { /* Can Optimise this loop too */
853: if ((ctr_j = ctr[j])) {
854: sbuf1_j = sbuf1[j];
855: k = ++sbuf1_j[0];
856: sbuf1_j[2*k] = ctr_j;
857: sbuf1_j[2*k-1] = i;
858: }
859: }
860: }
862: /* Now post the sends */
863: PetscMalloc1(nrqs+1,&s_waits1);
864: for (i=0; i<nrqs; ++i) {
865: j = pa[i];
866: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
867: }
869: /* Post Receives to capture the buffer size */
870: PetscMalloc1(nrqs+1,&r_waits2);
871: PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);
872: rbuf2[0] = tmp + msz;
873: for (i=1; i<nrqs; ++i) {
874: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
875: }
876: for (i=0; i<nrqs; ++i) {
877: j = pa[i];
878: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);
879: }
881: /* Send to other procs the buf size they should allocate */
882: /* Receive messages*/
883: PetscMalloc1(nrqr+1,&s_waits2);
884: PetscMalloc1(nrqr+1,&r_status1);
885: PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
887: MPI_Waitall(nrqr,r_waits1,r_status1);
888: for (i=0; i<nrqr; ++i) {
889: req_size[i] = 0;
890: rbuf1_i = rbuf1[i];
891: start = 2*rbuf1_i[0] + 1;
892: MPI_Get_count(r_status1+i,MPIU_INT,&end);
893: PetscMalloc1(end+1,&sbuf2[i]);
894: sbuf2_i = sbuf2[i];
895: for (j=start; j<end; j++) {
896: row = rbuf1_i[j] - rstart;
897: ncols = a_i[row+1] - a_i[row] + b_i[row+1] - b_i[row];
898: sbuf2_i[j] = ncols;
899: req_size[i] += ncols;
900: }
901: req_source1[i] = r_status1[i].MPI_SOURCE;
902: /* form the header */
903: sbuf2_i[0] = req_size[i];
904: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
906: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
907: }
909: PetscFree(r_status1);
910: PetscFree(r_waits1);
911: PetscFree4(w1,w2,w3,w4);
913: /* Receive messages*/
914: PetscMalloc1(nrqs+1,&r_waits3);
915: PetscMalloc1(nrqs+1,&r_status2);
917: MPI_Waitall(nrqs,r_waits2,r_status2);
918: for (i=0; i<nrqs; ++i) {
919: PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);
920: req_source2[i] = r_status2[i].MPI_SOURCE;
921: MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
922: }
923: PetscFree(r_status2);
924: PetscFree(r_waits2);
926: /* Wait on sends1 and sends2 */
927: PetscMalloc1(nrqs+1,&s_status1);
928: PetscMalloc1(nrqr+1,&s_status2);
930: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
931: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
932: PetscFree(s_status1);
933: PetscFree(s_status2);
934: PetscFree(s_waits1);
935: PetscFree(s_waits2);
937: /* Now allocate sending buffers for a->j, and send them off */
938: PetscMalloc1(nrqr+1,&sbuf_aj);
939: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
940: PetscMalloc1(j+1,&sbuf_aj[0]);
941: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
943: PetscMalloc1(nrqr+1,&s_waits3);
944: {
946: for (i=0; i<nrqr; i++) {
947: rbuf1_i = rbuf1[i];
948: sbuf_aj_i = sbuf_aj[i];
949: ct1 = 2*rbuf1_i[0] + 1;
950: ct2 = 0;
951: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
952: kmax = rbuf1[i][2*j];
953: for (k=0; k<kmax; k++,ct1++) {
954: row = rbuf1_i[ct1] - rstart;
955: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
956: ncols = nzA + nzB;
957: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
959: /* load the column indices for this row into cols */
960: cols = sbuf_aj_i + ct2;
961: for (l=0; l<nzB; l++) {
962: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
963: else break;
964: }
965: imark = l;
966: for (l=0; l<nzA; l++) {cols[imark+l] = cstart + cworkA[l];}
967: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
968: ct2 += ncols;
969: }
970: }
971: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
972: }
973: }
974: PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);
976: /* create col map: global col of C -> local col of submatrices */
977: #if defined(PETSC_USE_CTABLE)
978: for (i=0; i<ismax; i++) {
979: if (!allcolumns[i]) {
980: PetscTableCreate(ncol[i]+1,c->Nbs+1,&cmap[i]);
982: jmax = ncol[i];
983: icol_i = icol[i];
984: cmap_i = cmap[i];
985: for (j=0; j<jmax; j++) {
986: PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);
987: }
988: } else cmap[i] = NULL;
989: }
990: #else
991: for (i=0; i<ismax; i++) {
992: if (!allcolumns[i]) {
993: PetscCalloc1(c->Nbs,&cmap[i]);
994: jmax = ncol[i];
995: icol_i = icol[i];
996: cmap_i = cmap[i];
997: for (j=0; j<jmax; j++) cmap_i[icol_i[j]] = j+1;
998: } else cmap[i] = NULL;
999: }
1000: #endif
1002: /* Create lens which is required for MatCreate... */
1003: for (i=0,j=0; i<ismax; i++) j += nrow[i];
1004: PetscMalloc1(ismax,&lens);
1006: if (ismax) {
1007: PetscCalloc1(j,&lens[0]);
1008: }
1009: for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1011: /* Update lens from local data */
1012: for (i=0; i<ismax; i++) {
1013: row2proc_i = row2proc[i];
1014: jmax = nrow[i];
1015: if (!allcolumns[i]) cmap_i = cmap[i];
1016: irow_i = irow[i];
1017: lens_i = lens[i];
1018: for (j=0; j<jmax; j++) {
1019: if (allrows[i]) row = j;
1020: else row = irow_i[j]; /* global blocked row of C */
1022: proc = row2proc_i[j];
1023: if (proc == rank) {
1024: /* Get indices from matA and then from matB */
1025: #if defined(PETSC_USE_CTABLE)
1026: PetscInt tt;
1027: #endif
1028: row = row - rstart;
1029: nzA = a_i[row+1] - a_i[row];
1030: nzB = b_i[row+1] - b_i[row];
1031: cworkA = a_j + a_i[row];
1032: cworkB = b_j + b_i[row];
1034: if (!allcolumns[i]) {
1035: #if defined(PETSC_USE_CTABLE)
1036: for (k=0; k<nzA; k++) {
1037: PetscTableFind(cmap_i,cstart+cworkA[k]+1,&tt);
1038: if (tt) lens_i[j]++;
1039: }
1040: for (k=0; k<nzB; k++) {
1041: PetscTableFind(cmap_i,bmap[cworkB[k]]+1,&tt);
1042: if (tt) lens_i[j]++;
1043: }
1045: #else
1046: for (k=0; k<nzA; k++) {
1047: if (cmap_i[cstart + cworkA[k]]) lens_i[j]++;
1048: }
1049: for (k=0; k<nzB; k++) {
1050: if (cmap_i[bmap[cworkB[k]]]) lens_i[j]++;
1051: }
1052: #endif
1053: } else { /* allcolumns */
1054: lens_i[j] = nzA + nzB;
1055: }
1056: }
1057: }
1058: }
1060: /* Create row map: global row of C -> local row of submatrices */
1061: for (i=0; i<ismax; i++) {
1062: if (!allrows[i]) {
1063: #if defined(PETSC_USE_CTABLE)
1064: PetscTableCreate(nrow[i]+1,c->Mbs+1,&rmap[i]);
1065: irow_i = irow[i];
1066: jmax = nrow[i];
1067: for (j=0; j<jmax; j++) {
1068: if (allrows[i]) {
1069: PetscTableAdd(rmap[i],j+1,j+1,INSERT_VALUES);
1070: } else {
1071: PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);
1072: }
1073: }
1074: #else
1075: PetscCalloc1(c->Mbs,&rmap[i]);
1076: rmap_i = rmap[i];
1077: irow_i = irow[i];
1078: jmax = nrow[i];
1079: for (j=0; j<jmax; j++) {
1080: if (allrows[i]) rmap_i[j] = j;
1081: else rmap_i[irow_i[j]] = j;
1082: }
1083: #endif
1084: } else rmap[i] = NULL;
1085: }
1087: /* Update lens from offproc data */
1088: {
1089: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1091: MPI_Waitall(nrqs,r_waits3,r_status3);
1092: for (tmp2=0; tmp2<nrqs; tmp2++) {
1093: sbuf1_i = sbuf1[pa[tmp2]];
1094: jmax = sbuf1_i[0];
1095: ct1 = 2*jmax+1;
1096: ct2 = 0;
1097: rbuf2_i = rbuf2[tmp2];
1098: rbuf3_i = rbuf3[tmp2];
1099: for (j=1; j<=jmax; j++) {
1100: is_no = sbuf1_i[2*j-1];
1101: max1 = sbuf1_i[2*j];
1102: lens_i = lens[is_no];
1103: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1104: rmap_i = rmap[is_no];
1105: for (k=0; k<max1; k++,ct1++) {
1106: if (allrows[is_no]) {
1107: row = sbuf1_i[ct1];
1108: } else {
1109: #if defined(PETSC_USE_CTABLE)
1110: PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
1111: row--;
1112: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1113: #else
1114: row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1115: #endif
1116: }
1117: max2 = rbuf2_i[ct1];
1118: for (l=0; l<max2; l++,ct2++) {
1119: if (!allcolumns[is_no]) {
1120: #if defined(PETSC_USE_CTABLE)
1121: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1122: #else
1123: tcol = cmap_i[rbuf3_i[ct2]];
1124: #endif
1125: if (tcol) lens_i[row]++;
1126: } else { /* allcolumns */
1127: lens_i[row]++; /* lens_i[row] += max2 ? */
1128: }
1129: }
1130: }
1131: }
1132: }
1133: }
1134: PetscFree(r_waits3);
1135: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1136: PetscFree2(r_status3,s_status3);
1137: PetscFree(s_waits3);
1139: /* Create the submatrices */
1140: for (i=0; i<ismax; i++) {
1141: PetscInt bs_tmp;
1142: if (ijonly) bs_tmp = 1;
1143: else bs_tmp = bs;
1145: MatCreate(PETSC_COMM_SELF,submats+i);
1146: MatSetSizes(submats[i],nrow[i]*bs_tmp,ncol[i]*bs_tmp,PETSC_DETERMINE,PETSC_DETERMINE);
1148: MatSetType(submats[i],((PetscObject)A)->type_name);
1149: MatSeqBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]);
1150: MatSeqSBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]); /* this subroutine is used by SBAIJ routines */
1152: /* create struct Mat_SubSppt and attached it to submat */
1153: PetscNew(&smat_i);
1154: subc = (Mat_SeqBAIJ*)submats[i]->data;
1155: subc->submatis1 = smat_i;
1157: smat_i->destroy = submats[i]->ops->destroy;
1158: submats[i]->ops->destroy = MatDestroySubMatrix_SeqBAIJ;
1159: submats[i]->factortype = C->factortype;
1161: smat_i->id = i;
1162: smat_i->nrqs = nrqs;
1163: smat_i->nrqr = nrqr;
1164: smat_i->rbuf1 = rbuf1;
1165: smat_i->rbuf2 = rbuf2;
1166: smat_i->rbuf3 = rbuf3;
1167: smat_i->sbuf2 = sbuf2;
1168: smat_i->req_source2 = req_source2;
1170: smat_i->sbuf1 = sbuf1;
1171: smat_i->ptr = ptr;
1172: smat_i->tmp = tmp;
1173: smat_i->ctr = ctr;
1175: smat_i->pa = pa;
1176: smat_i->req_size = req_size;
1177: smat_i->req_source1 = req_source1;
1179: smat_i->allcolumns = allcolumns[i];
1180: smat_i->allrows = allrows[i];
1181: smat_i->singleis = PETSC_FALSE;
1182: smat_i->row2proc = row2proc[i];
1183: smat_i->rmap = rmap[i];
1184: smat_i->cmap = cmap[i];
1185: }
1187: if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
1188: MatCreate(PETSC_COMM_SELF,&submats[0]);
1189: MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);
1190: MatSetType(submats[0],MATDUMMY);
1192: /* create struct Mat_SubSppt and attached it to submat */
1193: PetscNewLog(submats[0],&smat_i);
1194: submats[0]->data = (void*)smat_i;
1196: smat_i->destroy = submats[0]->ops->destroy;
1197: submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
1198: submats[0]->factortype = C->factortype;
1200: smat_i->id = 0;
1201: smat_i->nrqs = nrqs;
1202: smat_i->nrqr = nrqr;
1203: smat_i->rbuf1 = rbuf1;
1204: smat_i->rbuf2 = rbuf2;
1205: smat_i->rbuf3 = rbuf3;
1206: smat_i->sbuf2 = sbuf2;
1207: smat_i->req_source2 = req_source2;
1209: smat_i->sbuf1 = sbuf1;
1210: smat_i->ptr = ptr;
1211: smat_i->tmp = tmp;
1212: smat_i->ctr = ctr;
1214: smat_i->pa = pa;
1215: smat_i->req_size = req_size;
1216: smat_i->req_source1 = req_source1;
1218: smat_i->allcolumns = PETSC_FALSE;
1219: smat_i->singleis = PETSC_FALSE;
1220: smat_i->row2proc = NULL;
1221: smat_i->rmap = NULL;
1222: smat_i->cmap = NULL;
1223: }
1226: if (ismax) {PetscFree(lens[0]);}
1227: PetscFree(lens);
1228: PetscFree(sbuf_aj[0]);
1229: PetscFree(sbuf_aj);
1231: } /* endof scall == MAT_INITIAL_MATRIX */
1233: /* Post recv matrix values */
1234: if (!ijonly) {
1235: PetscObjectGetNewTag((PetscObject)C,&tag4);
1236: PetscMalloc1(nrqs+1,&rbuf4);
1237: PetscMalloc1(nrqs+1,&r_waits4);
1238: PetscMalloc1(nrqs+1,&r_status4);
1239: PetscMalloc1(nrqr+1,&s_status4);
1240: for (i=0; i<nrqs; ++i) {
1241: PetscMalloc1(rbuf2[i][0]*bs2,&rbuf4[i]);
1242: MPI_Irecv(rbuf4[i],rbuf2[i][0]*bs2,MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
1243: }
1245: /* Allocate sending buffers for a->a, and send them off */
1246: PetscMalloc1(nrqr+1,&sbuf_aa);
1247: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1249: PetscMalloc1((j+1)*bs2,&sbuf_aa[0]);
1250: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
1252: PetscMalloc1(nrqr+1,&s_waits4);
1254: for (i=0; i<nrqr; i++) {
1255: rbuf1_i = rbuf1[i];
1256: sbuf_aa_i = sbuf_aa[i];
1257: ct1 = 2*rbuf1_i[0]+1;
1258: ct2 = 0;
1259: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1260: kmax = rbuf1_i[2*j];
1261: for (k=0; k<kmax; k++,ct1++) {
1262: row = rbuf1_i[ct1] - rstart;
1263: nzA = a_i[row+1] - a_i[row];
1264: nzB = b_i[row+1] - b_i[row];
1265: ncols = nzA + nzB;
1266: cworkB = b_j + b_i[row];
1267: vworkA = a_a + a_i[row]*bs2;
1268: vworkB = b_a + b_i[row]*bs2;
1270: /* load the column values for this row into vals*/
1271: vals = sbuf_aa_i+ct2*bs2;
1272: for (l=0; l<nzB; l++) {
1273: if ((bmap[cworkB[l]]) < cstart) {
1274: PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
1275: } else break;
1276: }
1277: imark = l;
1278: for (l=0; l<nzA; l++) {
1279: PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));
1280: }
1281: for (l=imark; l<nzB; l++) {
1282: PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
1283: }
1285: ct2 += ncols;
1286: }
1287: }
1288: MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
1289: }
1290: }
1292: /* Assemble the matrices */
1293: /* First assemble the local rows */
1294: for (i=0; i<ismax; i++) {
1295: row2proc_i = row2proc[i];
1296: subc = (Mat_SeqBAIJ*)submats[i]->data;
1297: imat_ilen = subc->ilen;
1298: imat_j = subc->j;
1299: imat_i = subc->i;
1300: imat_a = subc->a;
1302: if (!allcolumns[i]) cmap_i = cmap[i];
1303: rmap_i = rmap[i];
1304: irow_i = irow[i];
1305: jmax = nrow[i];
1306: for (j=0; j<jmax; j++) {
1307: if (allrows[i]) row = j;
1308: else row = irow_i[j];
1309: proc = row2proc_i[j];
1311: if (proc == rank) {
1313: row = row - rstart;
1314: nzA = a_i[row+1] - a_i[row];
1315: nzB = b_i[row+1] - b_i[row];
1316: cworkA = a_j + a_i[row];
1317: cworkB = b_j + b_i[row];
1318: if (!ijonly) {
1319: vworkA = a_a + a_i[row]*bs2;
1320: vworkB = b_a + b_i[row]*bs2;
1321: }
1323: if (allrows[i]) {
1324: row = row+rstart;
1325: } else {
1326: #if defined(PETSC_USE_CTABLE)
1327: PetscTableFind(rmap_i,row+rstart+1,&row);
1328: row--;
1330: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1331: #else
1332: row = rmap_i[row + rstart];
1333: #endif
1334: }
1335: mat_i = imat_i[row];
1336: if (!ijonly) mat_a = imat_a + mat_i*bs2;
1337: mat_j = imat_j + mat_i;
1338: ilen = imat_ilen[row];
1340: /* load the column indices for this row into cols*/
1341: if (!allcolumns[i]) {
1342: for (l=0; l<nzB; l++) {
1343: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1344: #if defined(PETSC_USE_CTABLE)
1345: PetscTableFind(cmap_i,ctmp+1,&tcol);
1346: if (tcol) {
1347: #else
1348: if ((tcol = cmap_i[ctmp])) {
1349: #endif
1350: *mat_j++ = tcol - 1;
1351: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1352: mat_a += bs2;
1353: ilen++;
1354: }
1355: } else break;
1356: }
1357: imark = l;
1358: for (l=0; l<nzA; l++) {
1359: #if defined(PETSC_USE_CTABLE)
1360: PetscTableFind(cmap_i,cstart+cworkA[l]+1,&tcol);
1361: if (tcol) {
1362: #else
1363: if ((tcol = cmap_i[cstart + cworkA[l]])) {
1364: #endif
1365: *mat_j++ = tcol - 1;
1366: if (!ijonly) {
1367: PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1368: mat_a += bs2;
1369: }
1370: ilen++;
1371: }
1372: }
1373: for (l=imark; l<nzB; l++) {
1374: #if defined(PETSC_USE_CTABLE)
1375: PetscTableFind(cmap_i,bmap[cworkB[l]]+1,&tcol);
1376: if (tcol) {
1377: #else
1378: if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1379: #endif
1380: *mat_j++ = tcol - 1;
1381: if (!ijonly) {
1382: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1383: mat_a += bs2;
1384: }
1385: ilen++;
1386: }
1387: }
1388: } else { /* allcolumns */
1389: for (l=0; l<nzB; l++) {
1390: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1391: *mat_j++ = ctmp;
1392: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1393: mat_a += bs2;
1394: ilen++;
1395: } else break;
1396: }
1397: imark = l;
1398: for (l=0; l<nzA; l++) {
1399: *mat_j++ = cstart+cworkA[l];
1400: if (!ijonly) {
1401: PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1402: mat_a += bs2;
1403: }
1404: ilen++;
1405: }
1406: for (l=imark; l<nzB; l++) {
1407: *mat_j++ = bmap[cworkB[l]];
1408: if (!ijonly) {
1409: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1410: mat_a += bs2;
1411: }
1412: ilen++;
1413: }
1414: }
1415: imat_ilen[row] = ilen;
1416: }
1417: }
1418: }
1420: /* Now assemble the off proc rows */
1421: if (!ijonly) {
1422: MPI_Waitall(nrqs,r_waits4,r_status4);
1423: }
1424: for (tmp2=0; tmp2<nrqs; tmp2++) {
1425: sbuf1_i = sbuf1[pa[tmp2]];
1426: jmax = sbuf1_i[0];
1427: ct1 = 2*jmax + 1;
1428: ct2 = 0;
1429: rbuf2_i = rbuf2[tmp2];
1430: rbuf3_i = rbuf3[tmp2];
1431: if (!ijonly) rbuf4_i = rbuf4[tmp2];
1432: for (j=1; j<=jmax; j++) {
1433: is_no = sbuf1_i[2*j-1];
1434: rmap_i = rmap[is_no];
1435: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1436: subc = (Mat_SeqBAIJ*)submats[is_no]->data;
1437: imat_ilen = subc->ilen;
1438: imat_j = subc->j;
1439: imat_i = subc->i;
1440: if (!ijonly) imat_a = subc->a;
1441: max1 = sbuf1_i[2*j];
1442: for (k=0; k<max1; k++,ct1++) { /* for each recved block row */
1443: row = sbuf1_i[ct1];
1445: if (allrows[is_no]) {
1446: row = sbuf1_i[ct1];
1447: } else {
1448: #if defined(PETSC_USE_CTABLE)
1449: PetscTableFind(rmap_i,row+1,&row);
1450: row--;
1451: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1452: #else
1453: row = rmap_i[row];
1454: #endif
1455: }
1456: ilen = imat_ilen[row];
1457: mat_i = imat_i[row];
1458: if (!ijonly) mat_a = imat_a + mat_i*bs2;
1459: mat_j = imat_j + mat_i;
1460: max2 = rbuf2_i[ct1];
1461: if (!allcolumns[is_no]) {
1462: for (l=0; l<max2; l++,ct2++) {
1463: #if defined(PETSC_USE_CTABLE)
1464: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1465: #else
1466: tcol = cmap_i[rbuf3_i[ct2]];
1467: #endif
1468: if (tcol) {
1469: *mat_j++ = tcol - 1;
1470: if (!ijonly) {
1471: PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1472: mat_a += bs2;
1473: }
1474: ilen++;
1475: }
1476: }
1477: } else { /* allcolumns */
1478: for (l=0; l<max2; l++,ct2++) {
1479: *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1480: if (!ijonly) {
1481: PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1482: mat_a += bs2;
1483: }
1484: ilen++;
1485: }
1486: }
1487: imat_ilen[row] = ilen;
1488: }
1489: }
1490: }
1492: if (!iscsorted) { /* sort column indices of the rows */
1493: MatScalar *work;
1495: PetscMalloc1(bs2,&work);
1496: for (i=0; i<ismax; i++) {
1497: subc = (Mat_SeqBAIJ*)submats[i]->data;
1498: imat_ilen = subc->ilen;
1499: imat_j = subc->j;
1500: imat_i = subc->i;
1501: if (!ijonly) imat_a = subc->a;
1502: if (allcolumns[i]) continue;
1504: jmax = nrow[i];
1505: for (j=0; j<jmax; j++) {
1506: mat_i = imat_i[j];
1507: mat_j = imat_j + mat_i;
1508: ilen = imat_ilen[j];
1509: if (ijonly) {
1510: PetscSortInt(ilen,mat_j);
1511: } else {
1512: mat_a = imat_a + mat_i*bs2;
1513: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
1514: }
1515: }
1516: }
1517: PetscFree(work);
1518: }
1520: if (!ijonly) {
1521: PetscFree(r_status4);
1522: PetscFree(r_waits4);
1523: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1524: PetscFree(s_waits4);
1525: PetscFree(s_status4);
1526: }
1528: /* Restore the indices */
1529: for (i=0; i<ismax; i++) {
1530: if (!allrows[i]) {
1531: ISRestoreIndices(isrow[i],irow+i);
1532: }
1533: if (!allcolumns[i]) {
1534: ISRestoreIndices(iscol[i],icol+i);
1535: }
1536: }
1538: for (i=0; i<ismax; i++) {
1539: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1540: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1541: }
1543: PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);
1544: PetscFree5(row2proc,cmap,rmap,allcolumns,allrows);
1546: if (!ijonly) {
1547: PetscFree(sbuf_aa[0]);
1548: PetscFree(sbuf_aa);
1550: for (i=0; i<nrqs; ++i) {
1551: PetscFree(rbuf4[i]);
1552: }
1553: PetscFree(rbuf4);
1554: }
1555: c->ijonly = PETSC_FALSE; /* set back to the default */
1556: return(0);
1557: }