Actual source code: baijov.c
2: /*
3: Routines to compute overlapping regions of a parallel MPI matrix
4: and to find submatrices that are shared across processors.
5: */
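/*
   Illustrative usage sketch (not part of this file): application code reaches the
   kernels below through the generic interfaces. Assuming a previously assembled
   MPIBAIJ matrix A and an index set rows[0] built elsewhere (error checking
   omitted for brevity):

     IS  rows[1];
     Mat *submats;
     MatIncreaseOverlap(A,1,rows,1);                                    // grow rows[0] by one level of overlap
     MatCreateSubMatrices(A,1,rows,rows,MAT_INITIAL_MATRIX,&submats);   // extract the sequential submatrix on each process
     MatDestroySubMatrices(1,&submats);
*/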
6: #include <../src/mat/impls/baij/mpi/mpibaij.h>
7: #include <petscbt.h>
9: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
10: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
11: extern PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
12: extern PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14: PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
15: {
16: PetscInt i,N=C->cmap->N, bs=C->rmap->bs;
17: IS *is_new;
19: PetscMalloc1(imax,&is_new);
20: /* Convert the indices into block format */
21: ISCompressIndicesGeneral(N,C->rmap->n,bs,imax,is,is_new);
23: for (i=0; i<ov; ++i) {
24: MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);
25: }
26: for (i=0; i<imax; i++) ISDestroy(&is[i]);
27: ISExpandIndicesGeneral(N,N,bs,imax,is_new,is);
28: for (i=0; i<imax; i++) ISDestroy(&is_new[i]);
29: PetscFree(is_new);
30: return 0;
31: }
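/*
   Worked example of the block compression/expansion above (illustrative only):
   with block size bs = 2, the point indices {4,5,9} all fall in blocks {2,4}
   (block = point/bs), so ISCompressIndicesGeneral() produces {2,4}; after the
   overlap has been increased in block terms, ISExpandIndicesGeneral() turns each
   block b back into the bs points b*bs,...,b*bs+bs-1, e.g. {2,4} -> {4,5,8,9}.
   A minimal sketch of that expansion arithmetic:

     for (i = 0; i < nblocks; i++)
       for (k = 0; k < bs; k++) points[i*bs + k] = blocks[i]*bs + k;
*/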
33: /*
34: Sample message format:
35: If a processor A wants processor B to process some elements corresponding
36: to index sets is[1], is[5]
37: mesg [0] = 2 (no of index sets in the mesg)
38: -----------
39: mesg [1] = 1 => is[1]
40: mesg [2] = sizeof(is[1]);
41: -----------
42: mesg [3] = 5 => is[5]
43: mesg [4] = sizeof(is[5]);
44: -----------
45: mesg [5]
46: mesg [n] data(is[1])
47: -----------
48: mesg[n+1]
49: mesg[m] data(is[5])
50: -----------
52: Notes:
53: nrqs - no of requests sent (or to be sent out)
54: nrqr - no of requests received (which have to be or which have been processed)
55: */
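/*
   Illustrative sketch (not part of the library) of how a receiver walks a message
   laid out as above: mesg[0] holds the number of index sets, entries 2*j-1 and 2*j
   hold the IS number and its length, and the concatenated data starts right after
   the header at offset 1 + 2*mesg[0]. Here process() is a placeholder:

     PetscInt nis = mesg[0], off = 2*nis + 1;
     for (j = 1; j <= nis; j++) {
       PetscInt is_no = mesg[2*j-1], len = mesg[2*j];
       for (k = 0; k < len; k++) process(is_no, mesg[off++]);
     }
*/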
56: PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
57: {
58: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
59: const PetscInt **idx,*idx_i;
60: PetscInt *n,*w3,*w4,**data,len;
61: PetscMPIInt size,rank,tag1,tag2,*w2,*w1,nrqr;
62: PetscInt Mbs,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
63: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
64: PetscMPIInt *onodes1,*olengths1,*onodes2,*olengths2,proc=-1;
65: PetscBT *table;
66: MPI_Comm comm,*iscomms;
67: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
68: char *t_p;
70: PetscObjectGetComm((PetscObject)C,&comm);
71: size = c->size;
72: rank = c->rank;
73: Mbs = c->Mbs;
75: PetscObjectGetNewTag((PetscObject)C,&tag1);
76: PetscObjectGetNewTag((PetscObject)C,&tag2);
78: PetscMalloc2(imax+1,(PetscInt***)&idx,imax,&n);
80: for (i=0; i<imax; i++) {
81: ISGetIndices(is[i],&idx[i]);
82: ISGetLocalSize(is[i],&n[i]);
83: }
85: /* evaluate communication - mesg to who, length of mesg, and buffer space
86: required. Based on this, buffers are allocated, and data copied into them */
87: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
88: for (i=0; i<imax; i++) {
89: PetscArrayzero(w4,size); /* initialise work vector*/
90: idx_i = idx[i];
91: len = n[i];
92: for (j=0; j<len; j++) {
93: row = idx_i[j];
95: PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);
96: w4[proc]++;
97: }
98: for (j=0; j<size; j++) {
99: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
100: }
101: }
103: nrqs = 0; /* no of outgoing messages */
104: msz = 0; /* total mesg length (for all procs) */
105: w1[rank] = 0; /* no mesg sent to itself */
106: w3[rank] = 0;
107: for (i=0; i<size; i++) {
108: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
109: }
110: /* pa - is list of processors to communicate with */
111: PetscMalloc1(nrqs,&pa);
112: for (i=0,j=0; i<size; i++) {
113: if (w1[i]) {pa[j] = i; j++;}
114: }
116: /* Each message would have a header = 1 + 2*(no of IS) + data */
117: for (i=0; i<nrqs; i++) {
118: j = pa[i];
119: w1[j] += w2[j] + 2*w3[j];
120: msz += w1[j];
121: }
123: /* Determine the number of messages to expect, their lengths, and their from-ids */
124: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
125: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
127: /* Now post the Irecvs corresponding to these messages */
128: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
130: /* Allocate Memory for outgoing messages */
131: PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
132: PetscArrayzero(outdat,size);
133: PetscArrayzero(ptr,size);
134: {
135: PetscInt *iptr = tmp,ict = 0;
136: for (i=0; i<nrqs; i++) {
137: j = pa[i];
138: iptr += ict;
139: outdat[j] = iptr;
140: ict = w1[j];
141: }
142: }
144: /* Form the outgoing messages */
145: /*plug in the headers*/
146: for (i=0; i<nrqs; i++) {
147: j = pa[i];
148: outdat[j][0] = 0;
149: PetscArrayzero(outdat[j]+1,2*w3[j]);
150: ptr[j] = outdat[j] + 2*w3[j] + 1;
151: }
153: /* Memory for doing local proc's work*/
154: {
155: PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mbs*imax,&d_p, (Mbs/PETSC_BITS_PER_BYTE+1)*imax,&t_p);
157: for (i=0; i<imax; i++) {
158: table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
159: data[i] = d_p + (Mbs)*i;
160: }
161: }
163: /* Parse the IS and update local tables and the outgoing buf with the data*/
164: {
165: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
166: PetscBT table_i;
168: for (i=0; i<imax; i++) {
169: PetscArrayzero(ctr,size);
170: n_i = n[i];
171: table_i = table[i];
172: idx_i = idx[i];
173: data_i = data[i];
174: isz_i = isz[i];
175: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
176: row = idx_i[j];
177: PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);
178: if (proc != rank) { /* copy to the outgoing buffer */
179: ctr[proc]++;
180: *ptr[proc] = row;
181: ptr[proc]++;
182: } else { /* Update the local table */
183: if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
184: }
185: }
186: /* Update the headers for the current IS */
187: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
188: if ((ctr_j = ctr[j])) {
189: outdat_j = outdat[j];
190: k = ++outdat_j[0];
191: outdat_j[2*k] = ctr_j;
192: outdat_j[2*k-1] = i;
193: }
194: }
195: isz[i] = isz_i;
196: }
197: }
199: /* Now post the sends */
200: PetscMalloc1(nrqs,&s_waits1);
201: for (i=0; i<nrqs; ++i) {
202: j = pa[i];
203: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
204: }
206: /* No longer need the original indices*/
207: for (i=0; i<imax; ++i) {
208: ISRestoreIndices(is[i],idx+i);
209: }
210: PetscFree2(*(PetscInt***)&idx,n);
212: PetscMalloc1(imax,&iscomms);
213: for (i=0; i<imax; ++i) {
214: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);
215: ISDestroy(&is[i]);
216: }
218: /* Do Local work*/
219: MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);
221: /* Receive messages*/
222: MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE);
223: MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);
225: /* Phase 1 sends are complete - deallocate buffers */
226: PetscFree4(outdat,ptr,tmp,ctr);
227: PetscFree4(w1,w2,w3,w4);
229: PetscMalloc1(nrqr,&xdata);
230: PetscMalloc1(nrqr,&isz1);
231: MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
232: if (rbuf) {
233: PetscFree(rbuf[0]);
234: PetscFree(rbuf);
235: }
237: /* Send the data back*/
238: /* Gather the message lengths to know the buffer space required for the incoming messages */
239: {
240: PetscMPIInt *rw1;
242: PetscCalloc1(size,&rw1);
244: for (i=0; i<nrqr; ++i) {
245: proc = onodes1[i];
246: rw1[proc] = isz1[i];
247: }
249: /* Determine the number of messages to expect, their lengths, and their from-ids */
250: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
251: PetscFree(rw1);
252: }
253: /* Now post the Irecvs corresponding to these messages */
254: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
256: /* Now post the sends */
257: PetscMalloc1(nrqr,&s_waits2);
258: for (i=0; i<nrqr; ++i) {
259: j = onodes1[i];
260: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
261: }
263: PetscFree(onodes1);
264: PetscFree(olengths1);
266: /* receive work done on other processors*/
267: {
268: PetscMPIInt idex;
269: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
270: PetscBT table_i;
272: for (i=0; i<nrqs; ++i) {
273: MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE);
274: /* Process the message*/
275: rbuf2_i = rbuf2[idex];
276: ct1 = 2*rbuf2_i[0]+1;
277: jmax = rbuf2[idex][0];
278: for (j=1; j<=jmax; j++) {
279: max = rbuf2_i[2*j];
280: is_no = rbuf2_i[2*j-1];
281: isz_i = isz[is_no];
282: data_i = data[is_no];
283: table_i = table[is_no];
284: for (k=0; k<max; k++,ct1++) {
285: row = rbuf2_i[ct1];
286: if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
287: }
288: isz[is_no] = isz_i;
289: }
290: }
291: MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);
292: }
294: for (i=0; i<imax; ++i) {
295: ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);
296: PetscCommDestroy(&iscomms[i]);
297: }
299: PetscFree(iscomms);
300: PetscFree(onodes2);
301: PetscFree(olengths2);
303: PetscFree(pa);
304: if (rbuf2) {
305: PetscFree(rbuf2[0]);
306: PetscFree(rbuf2);
307: }
308: PetscFree(s_waits1);
309: PetscFree(r_waits1);
310: PetscFree(s_waits2);
311: PetscFree(r_waits2);
312: PetscFree5(table,data,isz,d_p,t_p);
313: if (xdata) {
314: PetscFree(xdata[0]);
315: PetscFree(xdata);
316: }
317: PetscFree(isz1);
318: return 0;
319: }
321: /*
322: MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
323: the work on the local processor.
325: Inputs:
326: C - MAT_MPIBAIJ;
327: imax - total no of index sets processed at a time;
328: table - an array of char - size = Mbs bits.
330: Output:
331: isz - array containing the count of the solution elements corresponding
332: to each index set;
333: data - pointer to the solutions
334: */
335: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
336: {
337: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
338: Mat A = c->A,B = c->B;
339: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
340: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
341: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
342: PetscBT table_i;
344: rstart = c->rstartbs;
345: cstart = c->cstartbs;
346: ai = a->i;
347: aj = a->j;
348: bi = b->i;
349: bj = b->j;
350: garray = c->garray;
352: for (i=0; i<imax; i++) {
353: data_i = data[i];
354: table_i = table[i];
355: isz_i = isz[i];
356: for (j=0,max=isz[i]; j<max; j++) {
357: row = data_i[j] - rstart;
358: start = ai[row];
359: end = ai[row+1];
360: for (k=start; k<end; k++) { /* Amat */
361: val = aj[k] + cstart;
362: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
363: }
364: start = bi[row];
365: end = bi[row+1];
366: for (k=start; k<end; k++) { /* Bmat */
367: val = garray[bj[k]];
368: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
369: }
370: }
371: isz[i] = isz_i;
372: }
373: return 0;
374: }
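/*
   Worked example for MatIncreaseOverlap_MPIBAIJ_Local() above (illustrative):
   suppose block row 3 is owned locally and already present in data[i], and its
   nonzero block columns are {1,3,7} (combining the diagonal part A and the
   off-diagonal part B, whose columns are mapped to global numbering via garray).
   One pass then appends 1 and 7 to data[i]; each candidate is inserted only once
   because PetscBTLookupSet() reports whether the bit was already set:

     if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
*/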
375: /*
376: MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
377: and return the output
379: Input:
380: C - the matrix
381: nrqr - no of messages being processed.
382: rbuf - an array of pointers to the received requests
384: Output:
385: xdata - array of messages to be sent back
386: isz1 - size of each message
388: For better efficiency perhaps we should malloc separately each xdata[i],
389: then if a realloc is required we need only copy the data for that one row
390: rather than all previous rows as it is now where a single large chunk of
391: memory is used.
393: */
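/*
   Illustrative sketch of the buffer-growth strategy used below: all xdata[i]
   share one allocation, so growing it means allocating a ~1.5x larger block,
   copying everything written so far, and re-anchoring the per-message pointers.
   The names used/capacity/bigger are placeholders for ct3/mem_estimate/tmp:

     if (used == capacity) {
       PetscInt  newcap = (PetscInt)(1.5*capacity) + 1;
       PetscInt *bigger;
       PetscMalloc1(newcap,&bigger);
       PetscArraycpy(bigger,xdata[0],capacity);
       PetscFree(xdata[0]);
       xdata[0] = bigger; capacity = newcap;
       for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
     }
*/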
394: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
395: {
396: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
397: Mat A = c->A,B = c->B;
398: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
399: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
400: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
401: PetscInt val,max1,max2,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
402: PetscInt *rbuf_i,kmax,rbuf_0;
403: PetscBT xtable;
405: Mbs = c->Mbs;
406: rstart = c->rstartbs;
407: cstart = c->cstartbs;
408: ai = a->i;
409: aj = a->j;
410: bi = b->i;
411: bj = b->j;
412: garray = c->garray;
414: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
415: rbuf_i = rbuf[i];
416: rbuf_0 = rbuf_i[0];
417: ct += rbuf_0;
418: for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
419: }
421: if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
422: else max1 = 1;
423: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
424: if (nrqr) {
425: PetscMalloc1(mem_estimate,&xdata[0]);
426: ++no_malloc;
427: }
428: PetscBTCreate(Mbs,&xtable);
429: PetscArrayzero(isz1,nrqr);
431: ct3 = 0;
432: for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
433: rbuf_i = rbuf[i];
434: rbuf_0 = rbuf_i[0];
435: ct1 = 2*rbuf_0+1;
436: ct2 = ct1;
437: ct3 += ct1;
438: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
439: PetscBTMemzero(Mbs,xtable);
440: oct2 = ct2;
441: kmax = rbuf_i[2*j];
442: for (k=0; k<kmax; k++,ct1++) {
443: row = rbuf_i[ct1];
444: if (!PetscBTLookupSet(xtable,row)) {
445: if (!(ct3 < mem_estimate)) {
446: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
447: PetscMalloc1(new_estimate,&tmp);
448: PetscArraycpy(tmp,xdata[0],mem_estimate);
449: PetscFree(xdata[0]);
450: xdata[0] = tmp;
451: mem_estimate = new_estimate; ++no_malloc;
452: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
453: }
454: xdata[i][ct2++] = row;
455: ct3++;
456: }
457: }
458: for (k=oct2,max2=ct2; k<max2; k++) {
459: row = xdata[i][k] - rstart;
460: start = ai[row];
461: end = ai[row+1];
462: for (l=start; l<end; l++) {
463: val = aj[l] + cstart;
464: if (!PetscBTLookupSet(xtable,val)) {
465: if (!(ct3 < mem_estimate)) {
466: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
467: PetscMalloc1(new_estimate,&tmp);
468: PetscArraycpy(tmp,xdata[0],mem_estimate);
469: PetscFree(xdata[0]);
470: xdata[0] = tmp;
471: mem_estimate = new_estimate; ++no_malloc;
472: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
473: }
474: xdata[i][ct2++] = val;
475: ct3++;
476: }
477: }
478: start = bi[row];
479: end = bi[row+1];
480: for (l=start; l<end; l++) {
481: val = garray[bj[l]];
482: if (!PetscBTLookupSet(xtable,val)) {
483: if (!(ct3 < mem_estimate)) {
484: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
485: PetscMalloc1(new_estimate,&tmp);
486: PetscArraycpy(tmp,xdata[0],mem_estimate);
487: PetscFree(xdata[0]);
488: xdata[0] = tmp;
489: mem_estimate = new_estimate; ++no_malloc;
490: for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
491: }
492: xdata[i][ct2++] = val;
493: ct3++;
494: }
495: }
496: }
497: /* Update the header*/
498: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
499: xdata[i][2*j-1] = rbuf_i[2*j-1];
500: }
501: xdata[i][0] = rbuf_0;
502: if (i+1<nrqr) xdata[i+1] = xdata[i] + ct2;
503: isz1[i] = ct2; /* size of each message */
504: }
505: PetscBTDestroy(&xtable);
506: PetscInfo(C,"Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT ", no of mallocs = %" PetscInt_FMT "\n",mem_estimate,ct3,no_malloc);
507: return 0;
508: }
510: PetscErrorCode MatCreateSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
511: {
512: IS *isrow_block,*iscol_block;
513: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
514: PetscInt nmax,nstages_local,nstages,i,pos,max_no,N=C->cmap->N,bs=C->rmap->bs;
515: Mat_SeqBAIJ *subc;
516: Mat_SubSppt *smat;
518: /* The compression and expansion should be avoided. It does not point
519: out errors and might change the indices, hence buggy */
520: PetscMalloc2(ismax+1,&isrow_block,ismax+1,&iscol_block);
521: ISCompressIndicesGeneral(N,C->rmap->n,bs,ismax,isrow,isrow_block);
522: ISCompressIndicesGeneral(N,C->cmap->n,bs,ismax,iscol,iscol_block);
524: /* Determine the number of stages through which submatrices are done */
525: if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
526: else nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt));
527: if (!nmax) nmax = 1;
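/* Illustrative arithmetic for the staging bound above, assuming 64-bit indices
   (8-byte PetscInt): with Nbs = 1,000,000 block columns,
   nmax = 20*1000000/(1000000*8) = 2, i.e. at most two index sets are processed
   per stage so that the O(Nbs)-sized work arrays stay near the ~20 MB target. */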
529: if (scall == MAT_INITIAL_MATRIX) {
530: nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
532: /* Make sure every processor loops through the nstages */
533: MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
535: /* Allocate memory to hold all the submatrices and dummy submatrices */
536: PetscCalloc1(ismax+nstages,submat);
537: } else { /* MAT_REUSE_MATRIX */
538: if (ismax) {
539: subc = (Mat_SeqBAIJ*)((*submat)[0]->data);
540: smat = subc->submatis1;
541: } else { /* (*submat)[0] is a dummy matrix */
542: smat = (Mat_SubSppt*)(*submat)[0]->data;
543: }
545: nstages = smat->nstages;
546: }
548: for (i=0,pos=0; i<nstages; i++) {
549: if (pos+nmax <= ismax) max_no = nmax;
550: else if (pos >= ismax) max_no = 0;
551: else max_no = ismax-pos;
553: MatCreateSubMatrices_MPIBAIJ_local(C,max_no,isrow_block+pos,iscol_block+pos,scall,*submat+pos);
554: if (!max_no) {
555: if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
556: smat = (Mat_SubSppt*)(*submat)[pos]->data;
557: smat->nstages = nstages;
558: }
559: pos++; /* advance to next dummy matrix if any */
560: } else pos += max_no;
561: }
563: if (scall == MAT_INITIAL_MATRIX && ismax) {
564: /* save nstages for reuse */
565: subc = (Mat_SeqBAIJ*)((*submat)[0]->data);
566: smat = subc->submatis1;
567: smat->nstages = nstages;
568: }
570: for (i=0; i<ismax; i++) {
571: ISDestroy(&isrow_block[i]);
572: ISDestroy(&iscol_block[i]);
573: }
574: PetscFree2(isrow_block,iscol_block);
575: return 0;
576: }
578: #if defined(PETSC_USE_CTABLE)
579: PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
580: {
581: PetscInt nGlobalNd = proc_gnode[size];
582: PetscMPIInt fproc;
584: PetscMPIIntCast((PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)),&fproc);
585: if (fproc > size) fproc = size;
586: while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
587: if (row < proc_gnode[fproc]) fproc--;
588: else fproc++;
589: }
590: *rank = fproc;
591: return 0;
592: }
593: #endif
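/*
   Worked example of the ownership search in PetscGetProc() above (illustrative):
   with size = 4 and proc_gnode = {0,10,20,30,40}, row = 27 gives the initial
   guess fproc = (PetscInt)(27*4/40.0 + 0.5) = 3; since 27 < proc_gnode[3] = 30
   the loop decrements to fproc = 2, where 20 <= 27 < 30 holds, so *rank = 2.
*/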
595: /* -------------------------------------------------------------------------*/
596: /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
597: PetscErrorCode MatCreateSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
598: {
599: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
600: Mat A = c->A;
601: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*subc;
602: const PetscInt **icol,**irow;
603: PetscInt *nrow,*ncol,start;
604: PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
605: PetscInt **sbuf1,**sbuf2,*sbuf2_i,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
606: PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
607: PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
608: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
609: #if defined(PETSC_USE_CTABLE)
610: PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
611: #else
612: PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
613: #endif
614: const PetscInt *irow_i,*icol_i;
615: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
616: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
617: MPI_Request *r_waits4,*s_waits3,*s_waits4;
618: MPI_Comm comm;
619: PetscScalar **rbuf4,*rbuf4_i=NULL,**sbuf_aa,*vals,*mat_a=NULL,*imat_a=NULL,*sbuf_aa_i;
620: PetscMPIInt *onodes1,*olengths1,end;
621: PetscInt **row2proc,*row2proc_i,*imat_ilen,*imat_j,*imat_i;
622: Mat_SubSppt *smat_i;
623: PetscBool *issorted,colflag,iscsorted=PETSC_TRUE;
624: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
625: PetscInt bs=C->rmap->bs,bs2=c->bs2,rstart = c->rstartbs;
626: PetscBool ijonly=c->ijonly; /* private flag indicates only matrix data structures are requested */
627: PetscInt nzA,nzB,*a_i=a->i,*b_i=b->i,*a_j = a->j,*b_j = b->j,ctmp,imark,*cworkA,*cworkB;
628: PetscScalar *vworkA=NULL,*vworkB=NULL,*a_a = a->a,*b_a = b->a;
629: PetscInt cstart = c->cstartbs,*bmap = c->garray;
630: PetscBool *allrows,*allcolumns;
632: PetscObjectGetComm((PetscObject)C,&comm);
633: size = c->size;
634: rank = c->rank;
636: PetscMalloc5(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns,ismax,&allrows);
637: PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);
639: for (i=0; i<ismax; i++) {
640: ISSorted(iscol[i],&issorted[i]);
641: if (!issorted[i]) iscsorted = issorted[i]; /* columns are not sorted! */
642: ISSorted(isrow[i],&issorted[i]);
644: /* Check for special case: allcolumns */
645: ISIdentity(iscol[i],&colflag);
646: ISGetLocalSize(iscol[i],&ncol[i]);
648: if (colflag && ncol[i] == c->Nbs) {
649: allcolumns[i] = PETSC_TRUE;
650: icol[i] = NULL;
651: } else {
652: allcolumns[i] = PETSC_FALSE;
653: ISGetIndices(iscol[i],&icol[i]);
654: }
656: /* Check for special case: allrows */
657: ISIdentity(isrow[i],&colflag);
658: ISGetLocalSize(isrow[i],&nrow[i]);
659: if (colflag && nrow[i] == c->Mbs) {
660: allrows[i] = PETSC_TRUE;
661: irow[i] = NULL;
662: } else {
663: allrows[i] = PETSC_FALSE;
664: ISGetIndices(isrow[i],&irow[i]);
665: }
666: }
668: if (scall == MAT_REUSE_MATRIX) {
669: /* Assumes new rows are same length as the old rows */
670: for (i=0; i<ismax; i++) {
671: subc = (Mat_SeqBAIJ*)(submats[i]->data);
674: /* Initial matrix as if empty */
675: PetscArrayzero(subc->ilen,subc->mbs);
677: /* Preserve the factor type of the original matrix */
678: submats[i]->factortype = C->factortype;
680: smat_i = subc->submatis1;
682: nrqs = smat_i->nrqs;
683: nrqr = smat_i->nrqr;
684: rbuf1 = smat_i->rbuf1;
685: rbuf2 = smat_i->rbuf2;
686: rbuf3 = smat_i->rbuf3;
687: req_source2 = smat_i->req_source2;
689: sbuf1 = smat_i->sbuf1;
690: sbuf2 = smat_i->sbuf2;
691: ptr = smat_i->ptr;
692: tmp = smat_i->tmp;
693: ctr = smat_i->ctr;
695: pa = smat_i->pa;
696: req_size = smat_i->req_size;
697: req_source1 = smat_i->req_source1;
699: allcolumns[i] = smat_i->allcolumns;
700: allrows[i] = smat_i->allrows;
701: row2proc[i] = smat_i->row2proc;
702: rmap[i] = smat_i->rmap;
703: cmap[i] = smat_i->cmap;
704: }
706: if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
708: smat_i = (Mat_SubSppt*)submats[0]->data;
710: nrqs = smat_i->nrqs;
711: nrqr = smat_i->nrqr;
712: rbuf1 = smat_i->rbuf1;
713: rbuf2 = smat_i->rbuf2;
714: rbuf3 = smat_i->rbuf3;
715: req_source2 = smat_i->req_source2;
717: sbuf1 = smat_i->sbuf1;
718: sbuf2 = smat_i->sbuf2;
719: ptr = smat_i->ptr;
720: tmp = smat_i->tmp;
721: ctr = smat_i->ctr;
723: pa = smat_i->pa;
724: req_size = smat_i->req_size;
725: req_source1 = smat_i->req_source1;
727: allcolumns[0] = PETSC_FALSE;
728: }
729: } else { /* scall == MAT_INITIAL_MATRIX */
730: /* Get some new tags to keep the communication clean */
731: PetscObjectGetNewTag((PetscObject)C,&tag2);
732: PetscObjectGetNewTag((PetscObject)C,&tag3);
734: /* evaluate communication - mesg to who, length of mesg, and buffer space
735: required. Based on this, buffers are allocated, and data copied into them*/
736: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4); /* mesg size, initialize work vectors */
738: for (i=0; i<ismax; i++) {
739: jmax = nrow[i];
740: irow_i = irow[i];
742: PetscMalloc1(jmax,&row2proc_i);
743: row2proc[i] = row2proc_i;
745: if (issorted[i]) proc = 0;
746: for (j=0; j<jmax; j++) {
747: if (!issorted[i]) proc = 0;
748: if (allrows[i]) row = j;
749: else row = irow_i[j];
751: while (row >= c->rangebs[proc+1]) proc++;
752: w4[proc]++;
753: row2proc_i[j] = proc; /* map row index to proc */
754: }
755: for (j=0; j<size; j++) {
756: if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}
757: }
758: }
760: nrqs = 0; /* no of outgoing messages */
761: msz = 0; /* total mesg length (for all procs) */
762: w1[rank] = 0; /* no mesg sent to self */
763: w3[rank] = 0;
764: for (i=0; i<size; i++) {
765: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
766: }
767: PetscMalloc1(nrqs,&pa); /*(proc -array)*/
768: for (i=0,j=0; i<size; i++) {
769: if (w1[i]) { pa[j] = i; j++; }
770: }
772: /* Each message would have a header = 1 + 2*(no of IS) + data */
773: for (i=0; i<nrqs; i++) {
774: j = pa[i];
775: w1[j] += w2[j] + 2* w3[j];
776: msz += w1[j];
777: }
778: PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz);
780: /* Determine the number of messages to expect, their lengths, and their from-ids */
781: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
782: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
784: /* Now post the Irecvs corresponding to these messages */
785: PetscObjectGetNewTag((PetscObject)C,&tag0);
786: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
788: /* Allocate Memory for outgoing messages */
789: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
790: PetscArrayzero(sbuf1,size);
791: PetscArrayzero(ptr,size);
793: {
794: PetscInt *iptr = tmp;
795: k = 0;
796: for (i=0; i<nrqs; i++) {
797: j = pa[i];
798: iptr += k;
799: sbuf1[j] = iptr;
800: k = w1[j];
801: }
802: }
804: /* Form the outgoing messages. Initialize the header space */
805: for (i=0; i<nrqs; i++) {
806: j = pa[i];
807: sbuf1[j][0] = 0;
808: PetscArrayzero(sbuf1[j]+1,2*w3[j]);
809: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
810: }
812: /* Parse the isrow and copy data into outbuf */
813: for (i=0; i<ismax; i++) {
814: row2proc_i = row2proc[i];
815: PetscArrayzero(ctr,size);
816: irow_i = irow[i];
817: jmax = nrow[i];
818: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
819: proc = row2proc_i[j];
820: if (allrows[i]) row = j;
821: else row = irow_i[j];
823: if (proc != rank) { /* copy to the outgoing buf*/
824: ctr[proc]++;
825: *ptr[proc] = row;
826: ptr[proc]++;
827: }
828: }
829: /* Update the headers for the current IS */
830: for (j=0; j<size; j++) { /* Can Optimise this loop too */
831: if ((ctr_j = ctr[j])) {
832: sbuf1_j = sbuf1[j];
833: k = ++sbuf1_j[0];
834: sbuf1_j[2*k] = ctr_j;
835: sbuf1_j[2*k-1] = i;
836: }
837: }
838: }
840: /* Now post the sends */
841: PetscMalloc1(nrqs,&s_waits1);
842: for (i=0; i<nrqs; ++i) {
843: j = pa[i];
844: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
845: }
847: /* Post Receives to capture the buffer size */
848: PetscMalloc1(nrqs,&r_waits2);
849: PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);
850: if (nrqs) rbuf2[0] = tmp + msz;
851: for (i=1; i<nrqs; ++i) {
852: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
853: }
854: for (i=0; i<nrqs; ++i) {
855: j = pa[i];
856: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);
857: }
859: /* Send to other procs the buf size they should allocate */
860: /* Receive messages*/
861: PetscMalloc1(nrqr,&s_waits2);
862: PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
864: MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE);
865: for (i=0; i<nrqr; ++i) {
866: req_size[i] = 0;
867: rbuf1_i = rbuf1[i];
868: start = 2*rbuf1_i[0] + 1;
869: end = olengths1[i];
870: PetscMalloc1(end,&sbuf2[i]);
871: sbuf2_i = sbuf2[i];
872: for (j=start; j<end; j++) {
873: row = rbuf1_i[j] - rstart;
874: ncols = a_i[row+1] - a_i[row] + b_i[row+1] - b_i[row];
875: sbuf2_i[j] = ncols;
876: req_size[i] += ncols;
877: }
878: req_source1[i] = onodes1[i];
879: /* form the header */
880: sbuf2_i[0] = req_size[i];
881: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
883: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
884: }
886: PetscFree(onodes1);
887: PetscFree(olengths1);
889: PetscFree(r_waits1);
890: PetscFree4(w1,w2,w3,w4);
892: /* Receive messages*/
893: PetscMalloc1(nrqs,&r_waits3);
895: MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE);
896: for (i=0; i<nrqs; ++i) {
897: PetscMalloc1(rbuf2[i][0],&rbuf3[i]);
898: req_source2[i] = pa[i];
899: MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
900: }
901: PetscFree(r_waits2);
903: /* Wait on sends1 and sends2 */
904: MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);
905: MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);
906: PetscFree(s_waits1);
907: PetscFree(s_waits2);
909: /* Now allocate sending buffers for a->j, and send them off */
910: PetscMalloc1(nrqr,&sbuf_aj);
911: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
912: if (nrqr) PetscMalloc1(j,&sbuf_aj[0]);
913: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
915: PetscMalloc1(nrqr,&s_waits3);
916: {
918: for (i=0; i<nrqr; i++) {
919: rbuf1_i = rbuf1[i];
920: sbuf_aj_i = sbuf_aj[i];
921: ct1 = 2*rbuf1_i[0] + 1;
922: ct2 = 0;
923: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
924: kmax = rbuf1[i][2*j];
925: for (k=0; k<kmax; k++,ct1++) {
926: row = rbuf1_i[ct1] - rstart;
927: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
928: ncols = nzA + nzB;
929: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
931: /* load the column indices for this row into cols */
932: cols = sbuf_aj_i + ct2;
933: for (l=0; l<nzB; l++) {
934: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
935: else break;
936: }
937: imark = l;
938: for (l=0; l<nzA; l++) {cols[imark+l] = cstart + cworkA[l];}
939: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
940: ct2 += ncols;
941: }
942: }
943: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
944: }
945: }
947: /* create col map: global col of C -> local col of submatrices */
948: #if defined(PETSC_USE_CTABLE)
949: for (i=0; i<ismax; i++) {
950: if (!allcolumns[i]) {
951: PetscTableCreate(ncol[i],c->Nbs,&cmap[i]);
953: jmax = ncol[i];
954: icol_i = icol[i];
955: cmap_i = cmap[i];
956: for (j=0; j<jmax; j++) {
957: PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);
958: }
959: } else cmap[i] = NULL;
960: }
961: #else
962: for (i=0; i<ismax; i++) {
963: if (!allcolumns[i]) {
964: PetscCalloc1(c->Nbs,&cmap[i]);
965: jmax = ncol[i];
966: icol_i = icol[i];
967: cmap_i = cmap[i];
968: for (j=0; j<jmax; j++) cmap_i[icol_i[j]] = j+1;
969: } else cmap[i] = NULL;
970: }
971: #endif
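/*
   Illustrative example of the column map built above: if iscol[i] selects the
   global block columns {7,3,9}, then cmap_i stores 1-based local positions,
   i.e. cmap_i[7] = 1, cmap_i[3] = 2, cmap_i[9] = 3, and 0 (or a table miss) for
   every other global column g. The assembly code below subtracts 1 to recover
   the local column index:

     if ((tcol = cmap_i[g])) local_col = tcol - 1;
*/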
973: /* Create lens which is required for MatCreate... */
974: for (i=0,j=0; i<ismax; i++) j += nrow[i];
975: PetscMalloc1(ismax,&lens);
977: if (ismax) {
978: PetscCalloc1(j,&lens[0]);
979: }
980: for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
982: /* Update lens from local data */
983: for (i=0; i<ismax; i++) {
984: row2proc_i = row2proc[i];
985: jmax = nrow[i];
986: if (!allcolumns[i]) cmap_i = cmap[i];
987: irow_i = irow[i];
988: lens_i = lens[i];
989: for (j=0; j<jmax; j++) {
990: if (allrows[i]) row = j;
991: else row = irow_i[j]; /* global blocked row of C */
993: proc = row2proc_i[j];
994: if (proc == rank) {
995: /* Get indices from matA and then from matB */
996: #if defined(PETSC_USE_CTABLE)
997: PetscInt tt;
998: #endif
999: row = row - rstart;
1000: nzA = a_i[row+1] - a_i[row];
1001: nzB = b_i[row+1] - b_i[row];
1002: cworkA = a_j + a_i[row];
1003: cworkB = b_j + b_i[row];
1005: if (!allcolumns[i]) {
1006: #if defined(PETSC_USE_CTABLE)
1007: for (k=0; k<nzA; k++) {
1008: PetscTableFind(cmap_i,cstart+cworkA[k]+1,&tt);
1009: if (tt) lens_i[j]++;
1010: }
1011: for (k=0; k<nzB; k++) {
1012: PetscTableFind(cmap_i,bmap[cworkB[k]]+1,&tt);
1013: if (tt) lens_i[j]++;
1014: }
1016: #else
1017: for (k=0; k<nzA; k++) {
1018: if (cmap_i[cstart + cworkA[k]]) lens_i[j]++;
1019: }
1020: for (k=0; k<nzB; k++) {
1021: if (cmap_i[bmap[cworkB[k]]]) lens_i[j]++;
1022: }
1023: #endif
1024: } else { /* allcolumns */
1025: lens_i[j] = nzA + nzB;
1026: }
1027: }
1028: }
1029: }
1031: /* Create row map: global row of C -> local row of submatrices */
1032: for (i=0; i<ismax; i++) {
1033: if (!allrows[i]) {
1034: #if defined(PETSC_USE_CTABLE)
1035: PetscTableCreate(nrow[i],c->Mbs,&rmap[i]);
1036: irow_i = irow[i];
1037: jmax = nrow[i];
1038: for (j=0; j<jmax; j++) {
1039: if (allrows[i]) {
1040: PetscTableAdd(rmap[i],j+1,j+1,INSERT_VALUES);
1041: } else {
1042: PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);
1043: }
1044: }
1045: #else
1046: PetscCalloc1(c->Mbs,&rmap[i]);
1047: rmap_i = rmap[i];
1048: irow_i = irow[i];
1049: jmax = nrow[i];
1050: for (j=0; j<jmax; j++) {
1051: if (allrows[i]) rmap_i[j] = j;
1052: else rmap_i[irow_i[j]] = j;
1053: }
1054: #endif
1055: } else rmap[i] = NULL;
1056: }
1058: /* Update lens from offproc data */
1059: {
1060: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1062: MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE);
1063: for (tmp2=0; tmp2<nrqs; tmp2++) {
1064: sbuf1_i = sbuf1[pa[tmp2]];
1065: jmax = sbuf1_i[0];
1066: ct1 = 2*jmax+1;
1067: ct2 = 0;
1068: rbuf2_i = rbuf2[tmp2];
1069: rbuf3_i = rbuf3[tmp2];
1070: for (j=1; j<=jmax; j++) {
1071: is_no = sbuf1_i[2*j-1];
1072: max1 = sbuf1_i[2*j];
1073: lens_i = lens[is_no];
1074: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1075: rmap_i = rmap[is_no];
1076: for (k=0; k<max1; k++,ct1++) {
1077: if (allrows[is_no]) {
1078: row = sbuf1_i[ct1];
1079: } else {
1080: #if defined(PETSC_USE_CTABLE)
1081: PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
1082: row--;
1084: #else
1085: row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new submatrix */
1086: #endif
1087: }
1088: max2 = rbuf2_i[ct1];
1089: for (l=0; l<max2; l++,ct2++) {
1090: if (!allcolumns[is_no]) {
1091: #if defined(PETSC_USE_CTABLE)
1092: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1093: #else
1094: tcol = cmap_i[rbuf3_i[ct2]];
1095: #endif
1096: if (tcol) lens_i[row]++;
1097: } else { /* allcolumns */
1098: lens_i[row]++; /* lens_i[row] += max2 ? */
1099: }
1100: }
1101: }
1102: }
1103: }
1104: }
1105: PetscFree(r_waits3);
1106: MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE);
1107: PetscFree(s_waits3);
1109: /* Create the submatrices */
1110: for (i=0; i<ismax; i++) {
1111: PetscInt bs_tmp;
1112: if (ijonly) bs_tmp = 1;
1113: else bs_tmp = bs;
1115: MatCreate(PETSC_COMM_SELF,submats+i);
1116: MatSetSizes(submats[i],nrow[i]*bs_tmp,ncol[i]*bs_tmp,PETSC_DETERMINE,PETSC_DETERMINE);
1118: MatSetType(submats[i],((PetscObject)A)->type_name);
1119: MatSeqBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]);
1120: MatSeqSBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]); /* this subroutine is used by SBAIJ routines */
1122: /* create struct Mat_SubSppt and attach it to submat */
1123: PetscNew(&smat_i);
1124: subc = (Mat_SeqBAIJ*)submats[i]->data;
1125: subc->submatis1 = smat_i;
1127: smat_i->destroy = submats[i]->ops->destroy;
1128: submats[i]->ops->destroy = MatDestroySubMatrix_SeqBAIJ;
1129: submats[i]->factortype = C->factortype;
1131: smat_i->id = i;
1132: smat_i->nrqs = nrqs;
1133: smat_i->nrqr = nrqr;
1134: smat_i->rbuf1 = rbuf1;
1135: smat_i->rbuf2 = rbuf2;
1136: smat_i->rbuf3 = rbuf3;
1137: smat_i->sbuf2 = sbuf2;
1138: smat_i->req_source2 = req_source2;
1140: smat_i->sbuf1 = sbuf1;
1141: smat_i->ptr = ptr;
1142: smat_i->tmp = tmp;
1143: smat_i->ctr = ctr;
1145: smat_i->pa = pa;
1146: smat_i->req_size = req_size;
1147: smat_i->req_source1 = req_source1;
1149: smat_i->allcolumns = allcolumns[i];
1150: smat_i->allrows = allrows[i];
1151: smat_i->singleis = PETSC_FALSE;
1152: smat_i->row2proc = row2proc[i];
1153: smat_i->rmap = rmap[i];
1154: smat_i->cmap = cmap[i];
1155: }
1157: if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
1158: MatCreate(PETSC_COMM_SELF,&submats[0]);
1159: MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);
1160: MatSetType(submats[0],MATDUMMY);
1162: /* create struct Mat_SubSppt and attach it to submat */
1163: PetscNewLog(submats[0],&smat_i);
1164: submats[0]->data = (void*)smat_i;
1166: smat_i->destroy = submats[0]->ops->destroy;
1167: submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
1168: submats[0]->factortype = C->factortype;
1170: smat_i->id = 0;
1171: smat_i->nrqs = nrqs;
1172: smat_i->nrqr = nrqr;
1173: smat_i->rbuf1 = rbuf1;
1174: smat_i->rbuf2 = rbuf2;
1175: smat_i->rbuf3 = rbuf3;
1176: smat_i->sbuf2 = sbuf2;
1177: smat_i->req_source2 = req_source2;
1179: smat_i->sbuf1 = sbuf1;
1180: smat_i->ptr = ptr;
1181: smat_i->tmp = tmp;
1182: smat_i->ctr = ctr;
1184: smat_i->pa = pa;
1185: smat_i->req_size = req_size;
1186: smat_i->req_source1 = req_source1;
1188: smat_i->allcolumns = PETSC_FALSE;
1189: smat_i->singleis = PETSC_FALSE;
1190: smat_i->row2proc = NULL;
1191: smat_i->rmap = NULL;
1192: smat_i->cmap = NULL;
1193: }
1195: if (ismax) PetscFree(lens[0]);
1196: PetscFree(lens);
1197: if (sbuf_aj) {
1198: PetscFree(sbuf_aj[0]);
1199: PetscFree(sbuf_aj);
1200: }
1202: } /* end of scall == MAT_INITIAL_MATRIX */
1204: /* Post recv matrix values */
1205: if (!ijonly) {
1206: PetscObjectGetNewTag((PetscObject)C,&tag4);
1207: PetscMalloc1(nrqs,&rbuf4);
1208: PetscMalloc1(nrqs,&r_waits4);
1209: for (i=0; i<nrqs; ++i) {
1210: PetscMalloc1(rbuf2[i][0]*bs2,&rbuf4[i]);
1211: MPI_Irecv(rbuf4[i],rbuf2[i][0]*bs2,MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
1212: }
1214: /* Allocate sending buffers for a->a, and send them off */
1215: PetscMalloc1(nrqr,&sbuf_aa);
1216: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1218: if (nrqr) PetscMalloc1(j*bs2,&sbuf_aa[0]);
1219: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
1221: PetscMalloc1(nrqr,&s_waits4);
1223: for (i=0; i<nrqr; i++) {
1224: rbuf1_i = rbuf1[i];
1225: sbuf_aa_i = sbuf_aa[i];
1226: ct1 = 2*rbuf1_i[0]+1;
1227: ct2 = 0;
1228: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1229: kmax = rbuf1_i[2*j];
1230: for (k=0; k<kmax; k++,ct1++) {
1231: row = rbuf1_i[ct1] - rstart;
1232: nzA = a_i[row+1] - a_i[row];
1233: nzB = b_i[row+1] - b_i[row];
1234: ncols = nzA + nzB;
1235: cworkB = b_j + b_i[row];
1236: vworkA = a_a + a_i[row]*bs2;
1237: vworkB = b_a + b_i[row]*bs2;
1239: /* load the column values for this row into vals*/
1240: vals = sbuf_aa_i+ct2*bs2;
1241: for (l=0; l<nzB; l++) {
1242: if ((bmap[cworkB[l]]) < cstart) {
1243: PetscArraycpy(vals+l*bs2,vworkB+l*bs2,bs2);
1244: } else break;
1245: }
1246: imark = l;
1247: for (l=0; l<nzA; l++) {
1248: PetscArraycpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2);
1249: }
1250: for (l=imark; l<nzB; l++) {
1251: PetscArraycpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2);
1252: }
1254: ct2 += ncols;
1255: }
1256: }
1257: MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
1258: }
1259: }
1261: /* Assemble the matrices */
1262: /* First assemble the local rows */
1263: for (i=0; i<ismax; i++) {
1264: row2proc_i = row2proc[i];
1265: subc = (Mat_SeqBAIJ*)submats[i]->data;
1266: imat_ilen = subc->ilen;
1267: imat_j = subc->j;
1268: imat_i = subc->i;
1269: imat_a = subc->a;
1271: if (!allcolumns[i]) cmap_i = cmap[i];
1272: rmap_i = rmap[i];
1273: irow_i = irow[i];
1274: jmax = nrow[i];
1275: for (j=0; j<jmax; j++) {
1276: if (allrows[i]) row = j;
1277: else row = irow_i[j];
1278: proc = row2proc_i[j];
1280: if (proc == rank) {
1282: row = row - rstart;
1283: nzA = a_i[row+1] - a_i[row];
1284: nzB = b_i[row+1] - b_i[row];
1285: cworkA = a_j + a_i[row];
1286: cworkB = b_j + b_i[row];
1287: if (!ijonly) {
1288: vworkA = a_a + a_i[row]*bs2;
1289: vworkB = b_a + b_i[row]*bs2;
1290: }
1292: if (allrows[i]) {
1293: row = row+rstart;
1294: } else {
1295: #if defined(PETSC_USE_CTABLE)
1296: PetscTableFind(rmap_i,row+rstart+1,&row);
1297: row--;
1300: #else
1301: row = rmap_i[row + rstart];
1302: #endif
1303: }
1304: mat_i = imat_i[row];
1305: if (!ijonly) mat_a = imat_a + mat_i*bs2;
1306: mat_j = imat_j + mat_i;
1307: ilen = imat_ilen[row];
1309: /* load the column indices for this row into cols*/
1310: if (!allcolumns[i]) {
1311: for (l=0; l<nzB; l++) {
1312: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1313: #if defined(PETSC_USE_CTABLE)
1314: PetscTableFind(cmap_i,ctmp+1,&tcol);
1315: if (tcol) {
1316: #else
1317: if ((tcol = cmap_i[ctmp])) {
1318: #endif
1319: *mat_j++ = tcol - 1;
1320: PetscArraycpy(mat_a,vworkB+l*bs2,bs2);
1321: mat_a += bs2;
1322: ilen++;
1323: }
1324: } else break;
1325: }
1326: imark = l;
1327: for (l=0; l<nzA; l++) {
1328: #if defined(PETSC_USE_CTABLE)
1329: PetscTableFind(cmap_i,cstart+cworkA[l]+1,&tcol);
1330: if (tcol) {
1331: #else
1332: if ((tcol = cmap_i[cstart + cworkA[l]])) {
1333: #endif
1334: *mat_j++ = tcol - 1;
1335: if (!ijonly) {
1336: PetscArraycpy(mat_a,vworkA+l*bs2,bs2);
1337: mat_a += bs2;
1338: }
1339: ilen++;
1340: }
1341: }
1342: for (l=imark; l<nzB; l++) {
1343: #if defined(PETSC_USE_CTABLE)
1344: PetscTableFind(cmap_i,bmap[cworkB[l]]+1,&tcol);
1345: if (tcol) {
1346: #else
1347: if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1348: #endif
1349: *mat_j++ = tcol - 1;
1350: if (!ijonly) {
1351: PetscArraycpy(mat_a,vworkB+l*bs2,bs2);
1352: mat_a += bs2;
1353: }
1354: ilen++;
1355: }
1356: }
1357: } else { /* allcolumns */
1358: for (l=0; l<nzB; l++) {
1359: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1360: *mat_j++ = ctmp;
1361: PetscArraycpy(mat_a,vworkB+l*bs2,bs2);
1362: mat_a += bs2;
1363: ilen++;
1364: } else break;
1365: }
1366: imark = l;
1367: for (l=0; l<nzA; l++) {
1368: *mat_j++ = cstart+cworkA[l];
1369: if (!ijonly) {
1370: PetscArraycpy(mat_a,vworkA+l*bs2,bs2);
1371: mat_a += bs2;
1372: }
1373: ilen++;
1374: }
1375: for (l=imark; l<nzB; l++) {
1376: *mat_j++ = bmap[cworkB[l]];
1377: if (!ijonly) {
1378: PetscArraycpy(mat_a,vworkB+l*bs2,bs2);
1379: mat_a += bs2;
1380: }
1381: ilen++;
1382: }
1383: }
1384: imat_ilen[row] = ilen;
1385: }
1386: }
1387: }
1389: /* Now assemble the off proc rows */
1390: if (!ijonly) {
1391: MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE);
1392: }
1393: for (tmp2=0; tmp2<nrqs; tmp2++) {
1394: sbuf1_i = sbuf1[pa[tmp2]];
1395: jmax = sbuf1_i[0];
1396: ct1 = 2*jmax + 1;
1397: ct2 = 0;
1398: rbuf2_i = rbuf2[tmp2];
1399: rbuf3_i = rbuf3[tmp2];
1400: if (!ijonly) rbuf4_i = rbuf4[tmp2];
1401: for (j=1; j<=jmax; j++) {
1402: is_no = sbuf1_i[2*j-1];
1403: rmap_i = rmap[is_no];
1404: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1405: subc = (Mat_SeqBAIJ*)submats[is_no]->data;
1406: imat_ilen = subc->ilen;
1407: imat_j = subc->j;
1408: imat_i = subc->i;
1409: if (!ijonly) imat_a = subc->a;
1410: max1 = sbuf1_i[2*j];
1411: for (k=0; k<max1; k++,ct1++) { /* for each received block row */
1412: row = sbuf1_i[ct1];
1414: if (allrows[is_no]) {
1415: row = sbuf1_i[ct1];
1416: } else {
1417: #if defined(PETSC_USE_CTABLE)
1418: PetscTableFind(rmap_i,row+1,&row);
1419: row--;
1421: #else
1422: row = rmap_i[row];
1423: #endif
1424: }
1425: ilen = imat_ilen[row];
1426: mat_i = imat_i[row];
1427: if (!ijonly) mat_a = imat_a + mat_i*bs2;
1428: mat_j = imat_j + mat_i;
1429: max2 = rbuf2_i[ct1];
1430: if (!allcolumns[is_no]) {
1431: for (l=0; l<max2; l++,ct2++) {
1432: #if defined(PETSC_USE_CTABLE)
1433: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1434: #else
1435: tcol = cmap_i[rbuf3_i[ct2]];
1436: #endif
1437: if (tcol) {
1438: *mat_j++ = tcol - 1;
1439: if (!ijonly) {
1440: PetscArraycpy(mat_a,rbuf4_i+ct2*bs2,bs2);
1441: mat_a += bs2;
1442: }
1443: ilen++;
1444: }
1445: }
1446: } else { /* allcolumns */
1447: for (l=0; l<max2; l++,ct2++) {
1448: *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1449: if (!ijonly) {
1450: PetscArraycpy(mat_a,rbuf4_i+ct2*bs2,bs2);
1451: mat_a += bs2;
1452: }
1453: ilen++;
1454: }
1455: }
1456: imat_ilen[row] = ilen;
1457: }
1458: }
1459: }
1461: if (!iscsorted) { /* sort column indices of the rows */
1462: MatScalar *work;
1464: PetscMalloc1(bs2,&work);
1465: for (i=0; i<ismax; i++) {
1466: subc = (Mat_SeqBAIJ*)submats[i]->data;
1467: imat_ilen = subc->ilen;
1468: imat_j = subc->j;
1469: imat_i = subc->i;
1470: if (!ijonly) imat_a = subc->a;
1471: if (allcolumns[i]) continue;
1473: jmax = nrow[i];
1474: for (j=0; j<jmax; j++) {
1475: mat_i = imat_i[j];
1476: mat_j = imat_j + mat_i;
1477: ilen = imat_ilen[j];
1478: if (ijonly) {
1479: PetscSortInt(ilen,mat_j);
1480: } else {
1481: mat_a = imat_a + mat_i*bs2;
1482: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
1483: }
1484: }
1485: }
1486: PetscFree(work);
1487: }
1489: if (!ijonly) {
1490: PetscFree(r_waits4);
1491: MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE);
1492: PetscFree(s_waits4);
1493: }
1495: /* Restore the indices */
1496: for (i=0; i<ismax; i++) {
1497: if (!allrows[i]) {
1498: ISRestoreIndices(isrow[i],irow+i);
1499: }
1500: if (!allcolumns[i]) {
1501: ISRestoreIndices(iscol[i],icol+i);
1502: }
1503: }
1505: for (i=0; i<ismax; i++) {
1506: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1507: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1508: }
1510: PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);
1511: PetscFree5(row2proc,cmap,rmap,allcolumns,allrows);
1513: if (!ijonly) {
1514: if (sbuf_aa) {
1515: PetscFree(sbuf_aa[0]);
1516: PetscFree(sbuf_aa);
1517: }
1519: for (i=0; i<nrqs; ++i) {
1520: PetscFree(rbuf4[i]);
1521: }
1522: PetscFree(rbuf4);
1523: }
1524: c->ijonly = PETSC_FALSE; /* set back to the default */
1525: return 0;
1526: }