Actual source code: baijov.c
petsc-3.8.4 2018-03-24
2: /*
3: Routines to compute overlapping regions of a parallel MPI matrix
4: and to find submatrices that were shared across processors.
5: */
6: #include <../src/mat/impls/baij/mpi/mpibaij.h>
7: #include <petscbt.h>
9: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
10: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
11: extern PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
12: extern PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14: PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
15: {
17: PetscInt i,N=C->cmap->N, bs=C->rmap->bs;
18: IS *is_new;
21: PetscMalloc1(imax,&is_new);
22: /* Convert the indices into block format */
23: ISCompressIndicesGeneral(N,C->rmap->n,bs,imax,is,is_new);
24: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");
25: for (i=0; i<ov; ++i) {
26: MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);
27: }
28: for (i=0; i<imax; i++) {ISDestroy(&is[i]);}
29: ISExpandIndicesGeneral(N,N,bs,imax,is_new,is);
30: for (i=0; i<imax; i++) {ISDestroy(&is_new[i]);}
31: PetscFree(is_new);
32: return(0);
33: }
35: /*
36: Sample message format:
37: If a processor A wants processor B to process some elements corresponding
38: to index sets is[1], is[5]
39: mesg [0] = 2 (no of index sets in the mesg)
40: -----------
41: mesg [1] = 1 => is[1]
42: mesg [2] = sizeof(is[1]);
43: -----------
44: mesg [5] = 5 => is[5]
45: mesg [6] = sizeof(is[5]);
46: -----------
47: mesg [7]
48: mesg [n] data(is[1])
49: -----------
50: mesg[n+1]
51: mesg[m] data(is[5])
52: -----------
54: Notes:
55: nrqs - no of requests sent (or to be sent out)
56: nrqr - no of requests recieved (which have to be or which have been processed
57: */
58: PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
59: {
60: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
61: const PetscInt **idx,*idx_i;
62: PetscInt *n,*w3,*w4,**data,len;
64: PetscMPIInt size,rank,tag1,tag2,*w2,*w1,nrqr;
65: PetscInt Mbs,i,j,k,**rbuf,row,proc=-1,nrqs,msz,**outdat,**ptr;
66: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
67: PetscMPIInt *onodes1,*olengths1,*onodes2,*olengths2;
68: PetscBT *table;
69: MPI_Comm comm;
70: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
71: MPI_Status *s_status,*recv_status;
72: char *t_p;
75: PetscObjectGetComm((PetscObject)C,&comm);
76: size = c->size;
77: rank = c->rank;
78: Mbs = c->Mbs;
80: PetscObjectGetNewTag((PetscObject)C,&tag1);
81: PetscObjectGetNewTag((PetscObject)C,&tag2);
83: PetscMalloc2(imax+1,&idx,imax,&n);
85: for (i=0; i<imax; i++) {
86: ISGetIndices(is[i],&idx[i]);
87: ISGetLocalSize(is[i],&n[i]);
88: }
90: /* evaluate communication - mesg to who,length of mesg, and buffer space
91: required. Based on this, buffers are allocated, and data copied into them*/
92: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
93: for (i=0; i<imax; i++) {
94: PetscMemzero(w4,size*sizeof(PetscInt)); /* initialise work vector*/
95: idx_i = idx[i];
96: len = n[i];
97: for (j=0; j<len; j++) {
98: row = idx_i[j];
99: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
100: PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);
101: w4[proc]++;
102: }
103: for (j=0; j<size; j++) {
104: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
105: }
106: }
108: nrqs = 0; /* no of outgoing messages */
109: msz = 0; /* total mesg length (for all proc */
110: w1[rank] = 0; /* no mesg sent to itself */
111: w3[rank] = 0;
112: for (i=0; i<size; i++) {
113: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
114: }
115: /* pa - is list of processors to communicate with */
116: PetscMalloc1(nrqs+1,&pa);
117: for (i=0,j=0; i<size; i++) {
118: if (w1[i]) {pa[j] = i; j++;}
119: }
121: /* Each message would have a header = 1 + 2*(no of IS) + data */
122: for (i=0; i<nrqs; i++) {
123: j = pa[i];
124: w1[j] += w2[j] + 2*w3[j];
125: msz += w1[j];
126: }
128: /* Determine the number of messages to expect, their lengths, from from-ids */
129: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
130: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
132: /* Now post the Irecvs corresponding to these messages */
133: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
135: /* Allocate Memory for outgoing messages */
136: PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
137: PetscMemzero(outdat,size*sizeof(PetscInt*));
138: PetscMemzero(ptr,size*sizeof(PetscInt*));
139: {
140: PetscInt *iptr = tmp,ict = 0;
141: for (i=0; i<nrqs; i++) {
142: j = pa[i];
143: iptr += ict;
144: outdat[j] = iptr;
145: ict = w1[j];
146: }
147: }
149: /* Form the outgoing messages */
150: /*plug in the headers*/
151: for (i=0; i<nrqs; i++) {
152: j = pa[i];
153: outdat[j][0] = 0;
154: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
155: ptr[j] = outdat[j] + 2*w3[j] + 1;
156: }
158: /* Memory for doing local proc's work*/
159: {
160: PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mbs*imax,&d_p, (Mbs/PETSC_BITS_PER_BYTE+1)*imax,&t_p);
162: for (i=0; i<imax; i++) {
163: table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
164: data[i] = d_p + (Mbs)*i;
165: }
166: }
168: /* Parse the IS and update local tables and the outgoing buf with the data*/
169: {
170: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
171: PetscBT table_i;
173: for (i=0; i<imax; i++) {
174: PetscMemzero(ctr,size*sizeof(PetscInt));
175: n_i = n[i];
176: table_i = table[i];
177: idx_i = idx[i];
178: data_i = data[i];
179: isz_i = isz[i];
180: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
181: row = idx_i[j];
182: PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);
183: if (proc != rank) { /* copy to the outgoing buffer */
184: ctr[proc]++;
185: *ptr[proc] = row;
186: ptr[proc]++;
187: } else { /* Update the local table */
188: if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
189: }
190: }
191: /* Update the headers for the current IS */
192: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
193: if ((ctr_j = ctr[j])) {
194: outdat_j = outdat[j];
195: k = ++outdat_j[0];
196: outdat_j[2*k] = ctr_j;
197: outdat_j[2*k-1] = i;
198: }
199: }
200: isz[i] = isz_i;
201: }
202: }
204: /* Now post the sends */
205: PetscMalloc1(nrqs+1,&s_waits1);
206: for (i=0; i<nrqs; ++i) {
207: j = pa[i];
208: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
209: }
211: /* No longer need the original indices*/
212: for (i=0; i<imax; ++i) {
213: ISRestoreIndices(is[i],idx+i);
214: }
215: PetscFree2(idx,n);
217: for (i=0; i<imax; ++i) {
218: ISDestroy(&is[i]);
219: }
221: /* Do Local work*/
222: MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);
224: /* Receive messages*/
225: PetscMalloc1(nrqr+1,&recv_status);
226: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
228: PetscMalloc1(nrqs+1,&s_status);
229: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
231: /* Phase 1 sends are complete - deallocate buffers */
232: PetscFree4(outdat,ptr,tmp,ctr);
233: PetscFree4(w1,w2,w3,w4);
235: PetscMalloc1(nrqr+1,&xdata);
236: PetscMalloc1(nrqr+1,&isz1);
237: MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
238: PetscFree(rbuf[0]);
239: PetscFree(rbuf);
241: /* Send the data back*/
242: /* Do a global reduction to know the buffer space req for incoming messages*/
243: {
244: PetscMPIInt *rw1;
246: PetscCalloc1(size,&rw1);
248: for (i=0; i<nrqr; ++i) {
249: proc = recv_status[i].MPI_SOURCE;
250: if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
251: rw1[proc] = isz1[i];
252: }
254: PetscFree(onodes1);
255: PetscFree(olengths1);
257: /* Determine the number of messages to expect, their lengths, from from-ids */
258: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
259: PetscFree(rw1);
260: }
261: /* Now post the Irecvs corresponding to these messages */
262: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
264: /* Now post the sends */
265: PetscMalloc1(nrqr+1,&s_waits2);
266: for (i=0; i<nrqr; ++i) {
267: j = recv_status[i].MPI_SOURCE;
268: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
269: }
271: /* receive work done on other processors*/
272: {
273: PetscMPIInt idex;
274: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
275: PetscBT table_i;
276: MPI_Status *status2;
278: PetscMalloc1(PetscMax(nrqr,nrqs)+1,&status2);
279: for (i=0; i<nrqs; ++i) {
280: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
281: /* Process the message*/
282: rbuf2_i = rbuf2[idex];
283: ct1 = 2*rbuf2_i[0]+1;
284: jmax = rbuf2[idex][0];
285: for (j=1; j<=jmax; j++) {
286: max = rbuf2_i[2*j];
287: is_no = rbuf2_i[2*j-1];
288: isz_i = isz[is_no];
289: data_i = data[is_no];
290: table_i = table[is_no];
291: for (k=0; k<max; k++,ct1++) {
292: row = rbuf2_i[ct1];
293: if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
294: }
295: isz[is_no] = isz_i;
296: }
297: }
298: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
299: PetscFree(status2);
300: }
302: for (i=0; i<imax; ++i) {
303: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);
304: }
307: PetscFree(onodes2);
308: PetscFree(olengths2);
310: PetscFree(pa);
311: PetscFree(rbuf2[0]);
312: PetscFree(rbuf2);
313: PetscFree(s_waits1);
314: PetscFree(r_waits1);
315: PetscFree(s_waits2);
316: PetscFree(r_waits2);
317: PetscFree5(table,data,isz,d_p,t_p);
318: PetscFree(s_status);
319: PetscFree(recv_status);
320: PetscFree(xdata[0]);
321: PetscFree(xdata);
322: PetscFree(isz1);
323: return(0);
324: }
326: /*
327: MatIncreaseOverlap_MPIBAIJ_Local - Called by MatincreaseOverlap, to do
328: the work on the local processor.
330: Inputs:
331: C - MAT_MPIBAIJ;
332: imax - total no of index sets processed at a time;
333: table - an array of char - size = Mbs bits.
335: Output:
336: isz - array containing the count of the solution elements corresponding
337: to each index set;
338: data - pointer to the solutions
339: */
340: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
341: {
342: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
343: Mat A = c->A,B = c->B;
344: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
345: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
346: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
347: PetscBT table_i;
350: rstart = c->rstartbs;
351: cstart = c->cstartbs;
352: ai = a->i;
353: aj = a->j;
354: bi = b->i;
355: bj = b->j;
356: garray = c->garray;
359: for (i=0; i<imax; i++) {
360: data_i = data[i];
361: table_i = table[i];
362: isz_i = isz[i];
363: for (j=0,max=isz[i]; j<max; j++) {
364: row = data_i[j] - rstart;
365: start = ai[row];
366: end = ai[row+1];
367: for (k=start; k<end; k++) { /* Amat */
368: val = aj[k] + cstart;
369: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
370: }
371: start = bi[row];
372: end = bi[row+1];
373: for (k=start; k<end; k++) { /* Bmat */
374: val = garray[bj[k]];
375: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
376: }
377: }
378: isz[i] = isz_i;
379: }
380: return(0);
381: }
382: /*
383: MatIncreaseOverlap_MPIBAIJ_Receive - Process the recieved messages,
384: and return the output
386: Input:
387: C - the matrix
388: nrqr - no of messages being processed.
389: rbuf - an array of pointers to the recieved requests
391: Output:
392: xdata - array of messages to be sent back
393: isz1 - size of each message
395: For better efficiency perhaps we should malloc separately each xdata[i],
396: then if a remalloc is required we need only copy the data for that one row
397: rather than all previous rows as it is now where a single large chunck of
398: memory is used.
400: */
401: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
402: {
403: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
404: Mat A = c->A,B = c->B;
405: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
407: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
408: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
409: PetscInt val,max1,max2,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
410: PetscInt *rbuf_i,kmax,rbuf_0;
411: PetscBT xtable;
414: Mbs = c->Mbs;
415: rstart = c->rstartbs;
416: cstart = c->cstartbs;
417: ai = a->i;
418: aj = a->j;
419: bi = b->i;
420: bj = b->j;
421: garray = c->garray;
424: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
425: rbuf_i = rbuf[i];
426: rbuf_0 = rbuf_i[0];
427: ct += rbuf_0;
428: for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
429: }
431: if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
432: else max1 = 1;
433: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
434: PetscMalloc1(mem_estimate,&xdata[0]);
435: ++no_malloc;
436: PetscBTCreate(Mbs,&xtable);
437: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
439: ct3 = 0;
440: for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
441: rbuf_i = rbuf[i];
442: rbuf_0 = rbuf_i[0];
443: ct1 = 2*rbuf_0+1;
444: ct2 = ct1;
445: ct3 += ct1;
446: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
447: PetscBTMemzero(Mbs,xtable);
448: oct2 = ct2;
449: kmax = rbuf_i[2*j];
450: for (k=0; k<kmax; k++,ct1++) {
451: row = rbuf_i[ct1];
452: if (!PetscBTLookupSet(xtable,row)) {
453: if (!(ct3 < mem_estimate)) {
454: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
455: PetscMalloc1(new_estimate,&tmp);
456: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
457: PetscFree(xdata[0]);
458: xdata[0] = tmp;
459: mem_estimate = new_estimate; ++no_malloc;
460: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
461: }
462: xdata[i][ct2++] = row;
463: ct3++;
464: }
465: }
466: for (k=oct2,max2=ct2; k<max2; k++) {
467: row = xdata[i][k] - rstart;
468: start = ai[row];
469: end = ai[row+1];
470: for (l=start; l<end; l++) {
471: val = aj[l] + cstart;
472: if (!PetscBTLookupSet(xtable,val)) {
473: if (!(ct3 < mem_estimate)) {
474: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
475: PetscMalloc1(new_estimate,&tmp);
476: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
477: PetscFree(xdata[0]);
478: xdata[0] = tmp;
479: mem_estimate = new_estimate; ++no_malloc;
480: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
481: }
482: xdata[i][ct2++] = val;
483: ct3++;
484: }
485: }
486: start = bi[row];
487: end = bi[row+1];
488: for (l=start; l<end; l++) {
489: val = garray[bj[l]];
490: if (!PetscBTLookupSet(xtable,val)) {
491: if (!(ct3 < mem_estimate)) {
492: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
493: PetscMalloc1(new_estimate,&tmp);
494: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
495: PetscFree(xdata[0]);
496: xdata[0] = tmp;
497: mem_estimate = new_estimate; ++no_malloc;
498: for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
499: }
500: xdata[i][ct2++] = val;
501: ct3++;
502: }
503: }
504: }
505: /* Update the header*/
506: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
507: xdata[i][2*j-1] = rbuf_i[2*j-1];
508: }
509: xdata[i][0] = rbuf_0;
510: xdata[i+1] = xdata[i] + ct2;
511: isz1[i] = ct2; /* size of each message */
512: }
513: PetscBTDestroy(&xtable);
514: PetscInfo3(C,"Allocated %D bytes, required %D, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
515: return(0);
516: }
518: PetscErrorCode MatCreateSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
519: {
520: IS *isrow_block,*iscol_block;
521: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
523: PetscInt nmax,nstages_local,nstages,i,pos,max_no,N=C->cmap->N,bs=C->rmap->bs;
524: Mat_SeqBAIJ *subc;
525: Mat_SubSppt *smat;
528: /* The compression and expansion should be avoided. Doesn't point
529: out errors, might change the indices, hence buggey */
530: PetscMalloc2(ismax+1,&isrow_block,ismax+1,&iscol_block);
531: ISCompressIndicesGeneral(N,C->rmap->n,bs,ismax,isrow,isrow_block);
532: ISCompressIndicesGeneral(N,C->cmap->n,bs,ismax,iscol,iscol_block);
534: /* Determine the number of stages through which submatrices are done */
535: if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
536: else nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt));
537: if (!nmax) nmax = 1;
539: if (scall == MAT_INITIAL_MATRIX) {
540: nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
542: /* Make sure every processor loops through the nstages */
543: MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
545: /* Allocate memory to hold all the submatrices and dummy submatrices */
546: PetscCalloc1(ismax+nstages,submat);
547: } else { /* MAT_REUSE_MATRIX */
548: if (ismax) {
549: subc = (Mat_SeqBAIJ*)((*submat)[0]->data);
550: smat = subc->submatis1;
551: } else { /* (*submat)[0] is a dummy matrix */
552: smat = (Mat_SubSppt*)(*submat)[0]->data;
553: }
554: if (!smat) {
555: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"MatCreateSubMatrices(...,MAT_REUSE_MATRIX,...) requires submat");
556: }
557: nstages = smat->nstages;
558: }
560: for (i=0,pos=0; i<nstages; i++) {
561: if (pos+nmax <= ismax) max_no = nmax;
562: else if (pos == ismax) max_no = 0;
563: else max_no = ismax-pos;
565: MatCreateSubMatrices_MPIBAIJ_local(C,max_no,isrow_block+pos,iscol_block+pos,scall,*submat+pos);
566: if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
567: smat = (Mat_SubSppt*)(*submat)[pos]->data;
568: smat->nstages = nstages;
569: }
570: pos += max_no;
571: }
573: if (scall == MAT_INITIAL_MATRIX && ismax) {
574: /* save nstages for reuse */
575: subc = (Mat_SeqBAIJ*)((*submat)[0]->data);
576: smat = subc->submatis1;
577: smat->nstages = nstages;
578: }
580: for (i=0; i<ismax; i++) {
581: ISDestroy(&isrow_block[i]);
582: ISDestroy(&iscol_block[i]);
583: }
584: PetscFree2(isrow_block,iscol_block);
585: return(0);
586: }
588: #if defined(PETSC_USE_CTABLE)
589: PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
590: {
591: PetscInt nGlobalNd = proc_gnode[size];
592: PetscMPIInt fproc;
596: PetscMPIIntCast((PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)),&fproc);
597: if (fproc > size) fproc = size;
598: while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
599: if (row < proc_gnode[fproc]) fproc--;
600: else fproc++;
601: }
602: *rank = fproc;
603: return(0);
604: }
605: #endif
607: /* -------------------------------------------------------------------------*/
608: /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
609: PetscErrorCode MatCreateSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
610: {
611: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
612: Mat A = c->A;
613: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*subc;
614: const PetscInt **icol,**irow;
615: PetscInt *nrow,*ncol,start;
617: PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
618: PetscInt **sbuf1,**sbuf2,*sbuf2_i,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
619: PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
620: PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
621: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
622: #if defined(PETSC_USE_CTABLE)
623: PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
624: #else
625: PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
626: #endif
627: const PetscInt *irow_i,*icol_i;
628: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
629: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
630: MPI_Request *r_waits4,*s_waits3,*s_waits4;
631: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
632: MPI_Status *r_status3,*r_status4,*s_status4;
633: MPI_Comm comm;
634: PetscScalar **rbuf4,*rbuf4_i=NULL,**sbuf_aa,*vals,*mat_a=NULL,*imat_a,*sbuf_aa_i;
635: PetscMPIInt *onodes1,*olengths1,end;
636: PetscInt **row2proc,*row2proc_i,*imat_ilen,*imat_j,*imat_i;
637: Mat_SubSppt *smat_i;
638: PetscBool *issorted,colflag,iscsorted=PETSC_TRUE;
639: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
640: PetscInt bs=C->rmap->bs,bs2=c->bs2,rstart = c->rstartbs;
641: PetscBool ijonly=c->ijonly; /* private flag indicates only matrix data structures are requested */
642: PetscInt nzA,nzB,*a_i=a->i,*b_i=b->i,*a_j = a->j,*b_j = b->j,ctmp,imark,*cworkA,*cworkB;
643: PetscScalar *vworkA=NULL,*vworkB=NULL,*a_a = a->a,*b_a = b->a;
644: PetscInt cstart = c->cstartbs,*bmap = c->garray;
645: PetscBool *allrows,*allcolumns;
648: PetscObjectGetComm((PetscObject)C,&comm);
649: size = c->size;
650: rank = c->rank;
652: PetscMalloc5(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns,ismax,&allrows);
653: PetscMalloc5(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);
655: for (i=0; i<ismax; i++) {
656: ISSorted(iscol[i],&issorted[i]);
657: if (!issorted[i]) iscsorted = issorted[i]; /* columns are not sorted! */
658: ISSorted(isrow[i],&issorted[i]);
660: /* Check for special case: allcolumns */
661: ISIdentity(iscol[i],&colflag);
662: ISGetLocalSize(iscol[i],&ncol[i]);
664: if (colflag && ncol[i] == c->Nbs) {
665: allcolumns[i] = PETSC_TRUE;
666: icol[i] = NULL;
667: } else {
668: allcolumns[i] = PETSC_FALSE;
669: ISGetIndices(iscol[i],&icol[i]);
670: }
672: /* Check for special case: allrows */
673: ISIdentity(isrow[i],&colflag);
674: ISGetLocalSize(isrow[i],&nrow[i]);
675: if (colflag && nrow[i] == c->Mbs) {
676: allrows[i] = PETSC_TRUE;
677: irow[i] = NULL;
678: } else {
679: allrows[i] = PETSC_FALSE;
680: ISGetIndices(isrow[i],&irow[i]);
681: }
682: }
684: if (scall == MAT_REUSE_MATRIX) {
685: /* Assumes new rows are same length as the old rows */
686: for (i=0; i<ismax; i++) {
687: subc = (Mat_SeqBAIJ*)(submats[i]->data);
688: if (subc->mbs != nrow[i] || subc->nbs != ncol[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
690: /* Initial matrix as if empty */
691: PetscMemzero(subc->ilen,subc->mbs*sizeof(PetscInt));
693: /* Initial matrix as if empty */
694: submats[i]->factortype = C->factortype;
696: smat_i = subc->submatis1;
698: nrqs = smat_i->nrqs;
699: nrqr = smat_i->nrqr;
700: rbuf1 = smat_i->rbuf1;
701: rbuf2 = smat_i->rbuf2;
702: rbuf3 = smat_i->rbuf3;
703: req_source2 = smat_i->req_source2;
705: sbuf1 = smat_i->sbuf1;
706: sbuf2 = smat_i->sbuf2;
707: ptr = smat_i->ptr;
708: tmp = smat_i->tmp;
709: ctr = smat_i->ctr;
711: pa = smat_i->pa;
712: req_size = smat_i->req_size;
713: req_source1 = smat_i->req_source1;
715: allcolumns[i] = smat_i->allcolumns;
716: allrows[i] = smat_i->allrows;
717: row2proc[i] = smat_i->row2proc;
718: rmap[i] = smat_i->rmap;
719: cmap[i] = smat_i->cmap;
720: }
722: if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
723: if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
724: smat_i = (Mat_SubSppt*)submats[0]->data;
726: nrqs = smat_i->nrqs;
727: nrqr = smat_i->nrqr;
728: rbuf1 = smat_i->rbuf1;
729: rbuf2 = smat_i->rbuf2;
730: rbuf3 = smat_i->rbuf3;
731: req_source2 = smat_i->req_source2;
733: sbuf1 = smat_i->sbuf1;
734: sbuf2 = smat_i->sbuf2;
735: ptr = smat_i->ptr;
736: tmp = smat_i->tmp;
737: ctr = smat_i->ctr;
739: pa = smat_i->pa;
740: req_size = smat_i->req_size;
741: req_source1 = smat_i->req_source1;
743: allcolumns[0] = PETSC_FALSE;
744: }
745: } else { /* scall == MAT_INITIAL_MATRIX */
746: /* Get some new tags to keep the communication clean */
747: PetscObjectGetNewTag((PetscObject)C,&tag2);
748: PetscObjectGetNewTag((PetscObject)C,&tag3);
750: /* evaluate communication - mesg to who, length of mesg, and buffer space
751: required. Based on this, buffers are allocated, and data copied into them*/
752: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4); /* mesg size, initialize work vectors */
754: for (i=0; i<ismax; i++) {
755: jmax = nrow[i];
756: irow_i = irow[i];
758: PetscMalloc1(jmax,&row2proc_i);
759: row2proc[i] = row2proc_i;
761: if (issorted[i]) proc = 0;
762: for (j=0; j<jmax; j++) {
763: if (!issorted[i]) proc = 0;
764: if (allrows[i]) row = j;
765: else row = irow_i[j];
767: while (row >= c->rangebs[proc+1]) proc++;
768: w4[proc]++;
769: row2proc_i[j] = proc; /* map row index to proc */
770: }
771: for (j=0; j<size; j++) {
772: if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}
773: }
774: }
776: nrqs = 0; /* no of outgoing messages */
777: msz = 0; /* total mesg length (for all procs) */
778: w1[rank] = 0; /* no mesg sent to self */
779: w3[rank] = 0;
780: for (i=0; i<size; i++) {
781: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
782: }
783: PetscMalloc1(nrqs+1,&pa); /*(proc -array)*/
784: for (i=0,j=0; i<size; i++) {
785: if (w1[i]) { pa[j] = i; j++; }
786: }
788: /* Each message would have a header = 1 + 2*(no of IS) + data */
789: for (i=0; i<nrqs; i++) {
790: j = pa[i];
791: w1[j] += w2[j] + 2* w3[j];
792: msz += w1[j];
793: }
794: PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);
796: /* Determine the number of messages to expect, their lengths, from from-ids */
797: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
798: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
800: /* Now post the Irecvs corresponding to these messages */
801: tag0 = ((PetscObject)C)->tag;
802: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
804: PetscFree(onodes1);
805: PetscFree(olengths1);
807: /* Allocate Memory for outgoing messages */
808: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
809: PetscMemzero(sbuf1,size*sizeof(PetscInt*));
810: PetscMemzero(ptr,size*sizeof(PetscInt*));
812: {
813: PetscInt *iptr = tmp;
814: k = 0;
815: for (i=0; i<nrqs; i++) {
816: j = pa[i];
817: iptr += k;
818: sbuf1[j] = iptr;
819: k = w1[j];
820: }
821: }
823: /* Form the outgoing messages. Initialize the header space */
824: for (i=0; i<nrqs; i++) {
825: j = pa[i];
826: sbuf1[j][0] = 0;
827: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
828: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
829: }
831: /* Parse the isrow and copy data into outbuf */
832: for (i=0; i<ismax; i++) {
833: row2proc_i = row2proc[i];
834: PetscMemzero(ctr,size*sizeof(PetscInt));
835: irow_i = irow[i];
836: jmax = nrow[i];
837: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
838: proc = row2proc_i[j];
839: if (allrows[i]) row = j;
840: else row = irow_i[j];
842: if (proc != rank) { /* copy to the outgoing buf*/
843: ctr[proc]++;
844: *ptr[proc] = row;
845: ptr[proc]++;
846: }
847: }
848: /* Update the headers for the current IS */
849: for (j=0; j<size; j++) { /* Can Optimise this loop too */
850: if ((ctr_j = ctr[j])) {
851: sbuf1_j = sbuf1[j];
852: k = ++sbuf1_j[0];
853: sbuf1_j[2*k] = ctr_j;
854: sbuf1_j[2*k-1] = i;
855: }
856: }
857: }
859: /* Now post the sends */
860: PetscMalloc1(nrqs+1,&s_waits1);
861: for (i=0; i<nrqs; ++i) {
862: j = pa[i];
863: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
864: }
866: /* Post Receives to capture the buffer size */
867: PetscMalloc1(nrqs+1,&r_waits2);
868: PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);
869: rbuf2[0] = tmp + msz;
870: for (i=1; i<nrqs; ++i) {
871: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
872: }
873: for (i=0; i<nrqs; ++i) {
874: j = pa[i];
875: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);
876: }
878: /* Send to other procs the buf size they should allocate */
879: /* Receive messages*/
880: PetscMalloc1(nrqr+1,&s_waits2);
881: PetscMalloc1(nrqr+1,&r_status1);
882: PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
884: MPI_Waitall(nrqr,r_waits1,r_status1);
885: for (i=0; i<nrqr; ++i) {
886: req_size[i] = 0;
887: rbuf1_i = rbuf1[i];
888: start = 2*rbuf1_i[0] + 1;
889: MPI_Get_count(r_status1+i,MPIU_INT,&end);
890: PetscMalloc1(end+1,&sbuf2[i]);
891: sbuf2_i = sbuf2[i];
892: for (j=start; j<end; j++) {
893: row = rbuf1_i[j] - rstart;
894: ncols = a_i[row+1] - a_i[row] + b_i[row+1] - b_i[row];
895: sbuf2_i[j] = ncols;
896: req_size[i] += ncols;
897: }
898: req_source1[i] = r_status1[i].MPI_SOURCE;
899: /* form the header */
900: sbuf2_i[0] = req_size[i];
901: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
903: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
904: }
906: PetscFree(r_status1);
907: PetscFree(r_waits1);
908: PetscFree4(w1,w2,w3,w4);
910: /* Receive messages*/
911: PetscMalloc1(nrqs+1,&r_waits3);
912: PetscMalloc1(nrqs+1,&r_status2);
914: MPI_Waitall(nrqs,r_waits2,r_status2);
915: for (i=0; i<nrqs; ++i) {
916: PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);
917: req_source2[i] = r_status2[i].MPI_SOURCE;
918: MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
919: }
920: PetscFree(r_status2);
921: PetscFree(r_waits2);
923: /* Wait on sends1 and sends2 */
924: PetscMalloc1(nrqs+1,&s_status1);
925: PetscMalloc1(nrqr+1,&s_status2);
927: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
928: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
929: PetscFree(s_status1);
930: PetscFree(s_status2);
931: PetscFree(s_waits1);
932: PetscFree(s_waits2);
934: /* Now allocate sending buffers for a->j, and send them off */
935: PetscMalloc1(nrqr+1,&sbuf_aj);
936: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
937: PetscMalloc1(j+1,&sbuf_aj[0]);
938: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
940: PetscMalloc1(nrqr+1,&s_waits3);
941: {
943: for (i=0; i<nrqr; i++) {
944: rbuf1_i = rbuf1[i];
945: sbuf_aj_i = sbuf_aj[i];
946: ct1 = 2*rbuf1_i[0] + 1;
947: ct2 = 0;
948: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
949: kmax = rbuf1[i][2*j];
950: for (k=0; k<kmax; k++,ct1++) {
951: row = rbuf1_i[ct1] - rstart;
952: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
953: ncols = nzA + nzB;
954: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
956: /* load the column indices for this row into cols */
957: cols = sbuf_aj_i + ct2;
958: for (l=0; l<nzB; l++) {
959: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
960: else break;
961: }
962: imark = l;
963: for (l=0; l<nzA; l++) {cols[imark+l] = cstart + cworkA[l];}
964: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
965: ct2 += ncols;
966: }
967: }
968: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
969: }
970: }
971: PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);
973: /* create col map: global col of C -> local col of submatrices */
974: #if defined(PETSC_USE_CTABLE)
975: for (i=0; i<ismax; i++) {
976: if (!allcolumns[i]) {
977: PetscTableCreate(ncol[i]+1,c->Nbs+1,&cmap[i]);
979: jmax = ncol[i];
980: icol_i = icol[i];
981: cmap_i = cmap[i];
982: for (j=0; j<jmax; j++) {
983: PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);
984: }
985: } else cmap[i] = NULL;
986: }
987: #else
988: for (i=0; i<ismax; i++) {
989: if (!allcolumns[i]) {
990: PetscCalloc1(c->Nbs,&cmap[i]);
991: jmax = ncol[i];
992: icol_i = icol[i];
993: cmap_i = cmap[i];
994: for (j=0; j<jmax; j++) cmap_i[icol_i[j]] = j+1;
995: } else cmap[i] = NULL;
996: }
997: #endif
999: /* Create lens which is required for MatCreate... */
1000: for (i=0,j=0; i<ismax; i++) j += nrow[i];
1001: PetscMalloc1(ismax,&lens);
1003: if (ismax) {
1004: PetscCalloc1(j,&lens[0]);
1005: }
1006: for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1008: /* Update lens from local data */
1009: for (i=0; i<ismax; i++) {
1010: row2proc_i = row2proc[i];
1011: jmax = nrow[i];
1012: if (!allcolumns[i]) cmap_i = cmap[i];
1013: irow_i = irow[i];
1014: lens_i = lens[i];
1015: for (j=0; j<jmax; j++) {
1016: if (allrows[i]) row = j;
1017: else row = irow_i[j]; /* global blocked row of C */
1019: proc = row2proc_i[j];
1020: if (proc == rank) {
1021: /* Get indices from matA and then from matB */
1022: #if defined(PETSC_USE_CTABLE)
1023: PetscInt tt;
1024: #endif
1025: row = row - rstart;
1026: nzA = a_i[row+1] - a_i[row];
1027: nzB = b_i[row+1] - b_i[row];
1028: cworkA = a_j + a_i[row];
1029: cworkB = b_j + b_i[row];
1031: if (!allcolumns[i]) {
1032: #if defined(PETSC_USE_CTABLE)
1033: for (k=0; k<nzA; k++) {
1034: PetscTableFind(cmap_i,cstart+cworkA[k]+1,&tt);
1035: if (tt) lens_i[j]++;
1036: }
1037: for (k=0; k<nzB; k++) {
1038: PetscTableFind(cmap_i,bmap[cworkB[k]]+1,&tt);
1039: if (tt) lens_i[j]++;
1040: }
1042: #else
1043: for (k=0; k<nzA; k++) {
1044: if (cmap_i[cstart + cworkA[k]]) lens_i[j]++;
1045: }
1046: for (k=0; k<nzB; k++) {
1047: if (cmap_i[bmap[cworkB[k]]]) lens_i[j]++;
1048: }
1049: #endif
1050: } else { /* allcolumns */
1051: lens_i[j] = nzA + nzB;
1052: }
1053: }
1054: }
1055: }
1057: /* Create row map: global row of C -> local row of submatrices */
1058: for (i=0; i<ismax; i++) {
1059: if (!allrows[i]) {
1060: #if defined(PETSC_USE_CTABLE)
1061: PetscTableCreate(nrow[i]+1,c->Mbs+1,&rmap[i]);
1062: irow_i = irow[i];
1063: jmax = nrow[i];
1064: for (j=0; j<jmax; j++) {
1065: if (allrows[i]) {
1066: PetscTableAdd(rmap[i],j+1,j+1,INSERT_VALUES);
1067: } else {
1068: PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);
1069: }
1070: }
1071: #else
1072: PetscCalloc1(c->Mbs,&rmap[i]);
1073: rmap_i = rmap[i];
1074: irow_i = irow[i];
1075: jmax = nrow[i];
1076: for (j=0; j<jmax; j++) {
1077: if (allrows[i]) rmap_i[j] = j;
1078: else rmap_i[irow_i[j]] = j;
1079: }
1080: #endif
1081: } else rmap[i] = NULL;
1082: }
1084: /* Update lens from offproc data */
1085: {
1086: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1088: MPI_Waitall(nrqs,r_waits3,r_status3);
1089: for (tmp2=0; tmp2<nrqs; tmp2++) {
1090: sbuf1_i = sbuf1[pa[tmp2]];
1091: jmax = sbuf1_i[0];
1092: ct1 = 2*jmax+1;
1093: ct2 = 0;
1094: rbuf2_i = rbuf2[tmp2];
1095: rbuf3_i = rbuf3[tmp2];
1096: for (j=1; j<=jmax; j++) {
1097: is_no = sbuf1_i[2*j-1];
1098: max1 = sbuf1_i[2*j];
1099: lens_i = lens[is_no];
1100: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1101: rmap_i = rmap[is_no];
1102: for (k=0; k<max1; k++,ct1++) {
1103: if (allrows[is_no]) {
1104: row = sbuf1_i[ct1];
1105: } else {
1106: #if defined(PETSC_USE_CTABLE)
1107: PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
1108: row--;
1109: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1110: #else
1111: row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1112: #endif
1113: }
1114: max2 = rbuf2_i[ct1];
1115: for (l=0; l<max2; l++,ct2++) {
1116: if (!allcolumns[is_no]) {
1117: #if defined(PETSC_USE_CTABLE)
1118: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1119: #else
1120: tcol = cmap_i[rbuf3_i[ct2]];
1121: #endif
1122: if (tcol) lens_i[row]++;
1123: } else { /* allcolumns */
1124: lens_i[row]++; /* lens_i[row] += max2 ? */
1125: }
1126: }
1127: }
1128: }
1129: }
1130: }
1131: PetscFree(r_waits3);
1132: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1133: PetscFree2(r_status3,s_status3);
1134: PetscFree(s_waits3);
1136: /* Create the submatrices */
1137: for (i=0; i<ismax; i++) {
1138: PetscInt bs_tmp;
1139: if (ijonly) bs_tmp = 1;
1140: else bs_tmp = bs;
1142: MatCreate(PETSC_COMM_SELF,submats+i);
1143: MatSetSizes(submats[i],nrow[i]*bs_tmp,ncol[i]*bs_tmp,PETSC_DETERMINE,PETSC_DETERMINE);
1145: MatSetType(submats[i],((PetscObject)A)->type_name);
1146: MatSeqBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]);
1147: MatSeqSBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]); /* this subroutine is used by SBAIJ routines */
1149: /* create struct Mat_SubSppt and attached it to submat */
1150: PetscNew(&smat_i);
1151: subc = (Mat_SeqBAIJ*)submats[i]->data;
1152: subc->submatis1 = smat_i;
1154: smat_i->destroy = submats[i]->ops->destroy;
1155: submats[i]->ops->destroy = MatDestroySubMatrix_SeqBAIJ;
1156: submats[i]->factortype = C->factortype;
1158: smat_i->id = i;
1159: smat_i->nrqs = nrqs;
1160: smat_i->nrqr = nrqr;
1161: smat_i->rbuf1 = rbuf1;
1162: smat_i->rbuf2 = rbuf2;
1163: smat_i->rbuf3 = rbuf3;
1164: smat_i->sbuf2 = sbuf2;
1165: smat_i->req_source2 = req_source2;
1167: smat_i->sbuf1 = sbuf1;
1168: smat_i->ptr = ptr;
1169: smat_i->tmp = tmp;
1170: smat_i->ctr = ctr;
1172: smat_i->pa = pa;
1173: smat_i->req_size = req_size;
1174: smat_i->req_source1 = req_source1;
1176: smat_i->allcolumns = allcolumns[i];
1177: smat_i->allrows = allrows[i];
1178: smat_i->singleis = PETSC_FALSE;
1179: smat_i->row2proc = row2proc[i];
1180: smat_i->rmap = rmap[i];
1181: smat_i->cmap = cmap[i];
1182: }
1184: if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
1185: MatCreate(PETSC_COMM_SELF,&submats[0]);
1186: MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);
1187: MatSetType(submats[0],MATDUMMY);
1189: /* create struct Mat_SubSppt and attached it to submat */
1190: PetscNewLog(submats[0],&smat_i);
1191: submats[0]->data = (void*)smat_i;
1193: smat_i->destroy = submats[0]->ops->destroy;
1194: submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
1195: submats[0]->factortype = C->factortype;
1197: smat_i->id = 0;
1198: smat_i->nrqs = nrqs;
1199: smat_i->nrqr = nrqr;
1200: smat_i->rbuf1 = rbuf1;
1201: smat_i->rbuf2 = rbuf2;
1202: smat_i->rbuf3 = rbuf3;
1203: smat_i->sbuf2 = sbuf2;
1204: smat_i->req_source2 = req_source2;
1206: smat_i->sbuf1 = sbuf1;
1207: smat_i->ptr = ptr;
1208: smat_i->tmp = tmp;
1209: smat_i->ctr = ctr;
1211: smat_i->pa = pa;
1212: smat_i->req_size = req_size;
1213: smat_i->req_source1 = req_source1;
1215: smat_i->allcolumns = PETSC_FALSE;
1216: smat_i->singleis = PETSC_FALSE;
1217: smat_i->row2proc = NULL;
1218: smat_i->rmap = NULL;
1219: smat_i->cmap = NULL;
1220: }
1223: if (ismax) {PetscFree(lens[0]);}
1224: PetscFree(lens);
1225: PetscFree(sbuf_aj[0]);
1226: PetscFree(sbuf_aj);
1228: } /* endof scall == MAT_INITIAL_MATRIX */
1230: /* Post recv matrix values */
1231: if (!ijonly) {
1232: PetscObjectGetNewTag((PetscObject)C,&tag4);
1233: PetscMalloc1(nrqs+1,&rbuf4);
1234: PetscMalloc1(nrqs+1,&r_waits4);
1235: PetscMalloc1(nrqs+1,&r_status4);
1236: PetscMalloc1(nrqr+1,&s_status4);
1237: for (i=0; i<nrqs; ++i) {
1238: PetscMalloc1(rbuf2[i][0]*bs2,&rbuf4[i]);
1239: MPI_Irecv(rbuf4[i],rbuf2[i][0]*bs2,MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
1240: }
1242: /* Allocate sending buffers for a->a, and send them off */
1243: PetscMalloc1(nrqr+1,&sbuf_aa);
1244: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1246: PetscMalloc1((j+1)*bs2,&sbuf_aa[0]);
1247: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
1249: PetscMalloc1(nrqr+1,&s_waits4);
1251: for (i=0; i<nrqr; i++) {
1252: rbuf1_i = rbuf1[i];
1253: sbuf_aa_i = sbuf_aa[i];
1254: ct1 = 2*rbuf1_i[0]+1;
1255: ct2 = 0;
1256: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1257: kmax = rbuf1_i[2*j];
1258: for (k=0; k<kmax; k++,ct1++) {
1259: row = rbuf1_i[ct1] - rstart;
1260: nzA = a_i[row+1] - a_i[row];
1261: nzB = b_i[row+1] - b_i[row];
1262: ncols = nzA + nzB;
1263: cworkB = b_j + b_i[row];
1264: vworkA = a_a + a_i[row]*bs2;
1265: vworkB = b_a + b_i[row]*bs2;
1267: /* load the column values for this row into vals*/
1268: vals = sbuf_aa_i+ct2*bs2;
1269: for (l=0; l<nzB; l++) {
1270: if ((bmap[cworkB[l]]) < cstart) {
1271: PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
1272: } else break;
1273: }
1274: imark = l;
1275: for (l=0; l<nzA; l++) {
1276: PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));
1277: }
1278: for (l=imark; l<nzB; l++) {
1279: PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
1280: }
1282: ct2 += ncols;
1283: }
1284: }
1285: MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
1286: }
1287: }
1289: /* Assemble the matrices */
1290: /* First assemble the local rows */
1291: for (i=0; i<ismax; i++) {
1292: row2proc_i = row2proc[i];
1293: subc = (Mat_SeqBAIJ*)submats[i]->data;
1294: imat_ilen = subc->ilen;
1295: imat_j = subc->j;
1296: imat_i = subc->i;
1297: imat_a = subc->a;
1299: if (!allcolumns[i]) cmap_i = cmap[i];
1300: rmap_i = rmap[i];
1301: irow_i = irow[i];
1302: jmax = nrow[i];
1303: for (j=0; j<jmax; j++) {
1304: if (allrows[i]) row = j;
1305: else row = irow_i[j];
1306: proc = row2proc_i[j];
1308: if (proc == rank) {
1310: row = row - rstart;
1311: nzA = a_i[row+1] - a_i[row];
1312: nzB = b_i[row+1] - b_i[row];
1313: cworkA = a_j + a_i[row];
1314: cworkB = b_j + b_i[row];
1315: if (!ijonly) {
1316: vworkA = a_a + a_i[row]*bs2;
1317: vworkB = b_a + b_i[row]*bs2;
1318: }
1320: if (allrows[i]) {
1321: row = row+rstart;
1322: } else {
1323: #if defined(PETSC_USE_CTABLE)
1324: PetscTableFind(rmap_i,row+rstart+1,&row);
1325: row--;
1327: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1328: #else
1329: row = rmap_i[row + rstart];
1330: #endif
1331: }
1332: mat_i = imat_i[row];
1333: if (!ijonly) mat_a = imat_a + mat_i*bs2;
1334: mat_j = imat_j + mat_i;
1335: ilen = imat_ilen[row];
1337: /* load the column indices for this row into cols*/
1338: if (!allcolumns[i]) {
1339: for (l=0; l<nzB; l++) {
1340: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1341: #if defined(PETSC_USE_CTABLE)
1342: PetscTableFind(cmap_i,ctmp+1,&tcol);
1343: if (tcol) {
1344: #else
1345: if ((tcol = cmap_i[ctmp])) {
1346: #endif
1347: *mat_j++ = tcol - 1;
1348: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1349: mat_a += bs2;
1350: ilen++;
1351: }
1352: } else break;
1353: }
1354: imark = l;
1355: for (l=0; l<nzA; l++) {
1356: #if defined(PETSC_USE_CTABLE)
1357: PetscTableFind(cmap_i,cstart+cworkA[l]+1,&tcol);
1358: if (tcol) {
1359: #else
1360: if ((tcol = cmap_i[cstart + cworkA[l]])) {
1361: #endif
1362: *mat_j++ = tcol - 1;
1363: if (!ijonly) {
1364: PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1365: mat_a += bs2;
1366: }
1367: ilen++;
1368: }
1369: }
1370: for (l=imark; l<nzB; l++) {
1371: #if defined(PETSC_USE_CTABLE)
1372: PetscTableFind(cmap_i,bmap[cworkB[l]]+1,&tcol);
1373: if (tcol) {
1374: #else
1375: if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1376: #endif
1377: *mat_j++ = tcol - 1;
1378: if (!ijonly) {
1379: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1380: mat_a += bs2;
1381: }
1382: ilen++;
1383: }
1384: }
1385: } else { /* allcolumns */
1386: for (l=0; l<nzB; l++) {
1387: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1388: *mat_j++ = ctmp;
1389: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1390: mat_a += bs2;
1391: ilen++;
1392: } else break;
1393: }
1394: imark = l;
1395: for (l=0; l<nzA; l++) {
1396: *mat_j++ = cstart+cworkA[l];
1397: if (!ijonly) {
1398: PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1399: mat_a += bs2;
1400: }
1401: ilen++;
1402: }
1403: for (l=imark; l<nzB; l++) {
1404: *mat_j++ = bmap[cworkB[l]];
1405: if (!ijonly) {
1406: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1407: mat_a += bs2;
1408: }
1409: ilen++;
1410: }
1411: }
1412: imat_ilen[row] = ilen;
1413: }
1414: }
1415: }
1417: /* Now assemble the off proc rows */
1418: if (!ijonly) {
1419: MPI_Waitall(nrqs,r_waits4,r_status4);
1420: }
1421: for (tmp2=0; tmp2<nrqs; tmp2++) {
1422: sbuf1_i = sbuf1[pa[tmp2]];
1423: jmax = sbuf1_i[0];
1424: ct1 = 2*jmax + 1;
1425: ct2 = 0;
1426: rbuf2_i = rbuf2[tmp2];
1427: rbuf3_i = rbuf3[tmp2];
1428: if (!ijonly) rbuf4_i = rbuf4[tmp2];
1429: for (j=1; j<=jmax; j++) {
1430: is_no = sbuf1_i[2*j-1];
1431: rmap_i = rmap[is_no];
1432: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1433: subc = (Mat_SeqBAIJ*)submats[is_no]->data;
1434: imat_ilen = subc->ilen;
1435: imat_j = subc->j;
1436: imat_i = subc->i;
1437: if (!ijonly) imat_a = subc->a;
1438: max1 = sbuf1_i[2*j];
1439: for (k=0; k<max1; k++,ct1++) { /* for each recved block row */
1440: row = sbuf1_i[ct1];
1442: if (allrows[is_no]) {
1443: row = sbuf1_i[ct1];
1444: } else {
1445: #if defined(PETSC_USE_CTABLE)
1446: PetscTableFind(rmap_i,row+1,&row);
1447: row--;
1448: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1449: #else
1450: row = rmap_i[row];
1451: #endif
1452: }
1453: ilen = imat_ilen[row];
1454: mat_i = imat_i[row];
1455: if (!ijonly) mat_a = imat_a + mat_i*bs2;
1456: mat_j = imat_j + mat_i;
1457: max2 = rbuf2_i[ct1];
1458: if (!allcolumns[is_no]) {
1459: for (l=0; l<max2; l++,ct2++) {
1460: #if defined(PETSC_USE_CTABLE)
1461: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1462: #else
1463: tcol = cmap_i[rbuf3_i[ct2]];
1464: #endif
1465: if (tcol) {
1466: *mat_j++ = tcol - 1;
1467: if (!ijonly) {
1468: PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1469: mat_a += bs2;
1470: }
1471: ilen++;
1472: }
1473: }
1474: } else { /* allcolumns */
1475: for (l=0; l<max2; l++,ct2++) {
1476: *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1477: if (!ijonly) {
1478: PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1479: mat_a += bs2;
1480: }
1481: ilen++;
1482: }
1483: }
1484: imat_ilen[row] = ilen;
1485: }
1486: }
1487: }
1489: if (!iscsorted) { /* sort column indices of the rows */
1490: MatScalar *work;
1492: PetscMalloc1(bs2,&work);
1493: for (i=0; i<ismax; i++) {
1494: subc = (Mat_SeqBAIJ*)submats[i]->data;
1495: imat_ilen = subc->ilen;
1496: imat_j = subc->j;
1497: imat_i = subc->i;
1498: if (!ijonly) imat_a = subc->a;
1499: if (allcolumns[i]) continue;
1501: jmax = nrow[i];
1502: for (j=0; j<jmax; j++) {
1503: mat_i = imat_i[j];
1504: mat_j = imat_j + mat_i;
1505: ilen = imat_ilen[j];
1506: if (ijonly) {
1507: PetscSortInt(ilen,mat_j);
1508: } else {
1509: mat_a = imat_a + mat_i*bs2;
1510: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
1511: }
1512: }
1513: }
1514: PetscFree(work);
1515: }
1517: if (!ijonly) {
1518: PetscFree(r_status4);
1519: PetscFree(r_waits4);
1520: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1521: PetscFree(s_waits4);
1522: PetscFree(s_status4);
1523: }
1525: /* Restore the indices */
1526: for (i=0; i<ismax; i++) {
1527: if (!allrows[i]) {
1528: ISRestoreIndices(isrow[i],irow+i);
1529: }
1530: if (!allcolumns[i]) {
1531: ISRestoreIndices(iscol[i],icol+i);
1532: }
1533: }
1535: for (i=0; i<ismax; i++) {
1536: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1537: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1538: }
1540: /* Destroy allocated memory */
1541: PetscFree5(irow,icol,nrow,ncol,issorted);
1542: PetscFree5(row2proc,cmap,rmap,allcolumns,allrows);
1544: if (!ijonly) {
1545: PetscFree(sbuf_aa[0]);
1546: PetscFree(sbuf_aa);
1548: for (i=0; i<nrqs; ++i) {
1549: PetscFree(rbuf4[i]);
1550: }
1551: PetscFree(rbuf4);
1552: }
1553: c->ijonly = PETSC_FALSE; /* set back to the default */
1554: return(0);
1555: }