Actual source code: baijov.c
petsc-3.5.4 2015-05-23
2: /*
3: Routines to compute overlapping regions of a parallel MPI matrix
4: and to find submatrices that are shared across processors.
5: */
6: #include <../src/mat/impls/baij/mpi/mpibaij.h>
7: #include <petscbt.h>
9: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
10: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
11: extern PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
12: extern PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16: PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
17: {
19: PetscInt i,N=C->cmap->N, bs=C->rmap->bs;
20: IS *is_new;
23: PetscMalloc1(imax,&is_new);
24: /* Convert the indices into block format */
25: ISCompressIndicesGeneral(N,C->rmap->n,bs,imax,is,is_new);
26: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");
27: for (i=0; i<ov; ++i) {
28: MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);
29: }
30: for (i=0; i<imax; i++) {ISDestroy(&is[i]);}
31: ISExpandIndicesGeneral(N,N,bs,imax,is_new,is);
32: for (i=0; i<imax; i++) {ISDestroy(&is_new[i]);}
33: PetscFree(is_new);
34: return(0);
35: }
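/*
   Editorial note: a minimal usage sketch (not part of the original file) showing how
   the routine above is normally reached through the public MatIncreaseOverlap()
   interface.  The matrix A and the index contents (rows[], nrows) are hypothetical.

      IS       is[1];
      PetscInt nrows   = 3;
      PetscInt rows[3] = {0, 5, 9};                        local block rows of interest

      ISCreateGeneral(PETSC_COMM_SELF,nrows,rows,PETSC_COPY_VALUES,&is[0]);
      MatIncreaseOverlap(A,1,is,2);                        A is MATMPIBAIJ; grow overlap by 2 levels
      ISDestroy(&is[0]);
*/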
37: /*
38: Sample message format:
39: If a processor A wants processor B to process some elements corresponding
40: to index sets is[1], is[5]
41: mesg [0] = 2 (no of index sets in the mesg)
42: -----------
43: mesg [1] = 1 => is[1]
44: mesg [2] = sizeof(is[1]);
45: -----------
46: mesg [3] = 5 => is[5]
47: mesg [4] = sizeof(is[5]);
48: -----------
49: mesg [5]
50: mesg [n] data(is[1])
51: -----------
52: mesg[n+1]
53: mesg[m] data(is[5])
54: -----------
56: Notes:
57: nrqs - no of requests sent (or to be sent out)
58: nrqr - no of requests received (which have to be, or have been, processed)
59: */
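/*
   Editorial sketch (illustrative only, not part of the original file): packing one
   outgoing message in the layout documented above, for two index sets is[1] (n1
   block rows) and is[5] (n5 block rows).  msg, n1, n5, data1, data5 are hypothetical.

      msg[0] = 2;                                          no of index sets in the mesg
      msg[1] = 1;   msg[2] = n1;                           IS number, then its length
      msg[3] = 5;   msg[4] = n5;
      PetscMemcpy(msg+5,   data1,n1*sizeof(PetscInt));     block-row indices of is[1]
      PetscMemcpy(msg+5+n1,data5,n5*sizeof(PetscInt));     block-row indices of is[5]
*/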
62: PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
63: {
64: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
65: const PetscInt **idx,*idx_i;
66: PetscInt *n,*w3,*w4,**data,len;
68: PetscMPIInt size,rank,tag1,tag2,*w2,*w1,nrqr;
69: PetscInt Mbs,i,j,k,**rbuf,row,proc=-1,nrqs,msz,**outdat,**ptr;
70: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
71: PetscMPIInt *onodes1,*olengths1,*onodes2,*olengths2;
72: PetscBT *table;
73: MPI_Comm comm;
74: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
75: MPI_Status *s_status,*recv_status;
76: char *t_p;
79: PetscObjectGetComm((PetscObject)C,&comm);
80: size = c->size;
81: rank = c->rank;
82: Mbs = c->Mbs;
84: PetscObjectGetNewTag((PetscObject)C,&tag1);
85: PetscObjectGetNewTag((PetscObject)C,&tag2);
87: PetscMalloc2(imax+1,&idx,imax,&n);
89: for (i=0; i<imax; i++) {
90: ISGetIndices(is[i],&idx[i]);
91: ISGetLocalSize(is[i],&n[i]);
92: }
94: /* evaluate communication - mesg to whom, length of mesg, and buffer space
95: required. Based on this, buffers are allocated, and data copied into them */
96: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
97: for (i=0; i<imax; i++) {
98: PetscMemzero(w4,size*sizeof(PetscInt)); /* initialise work vector*/
99: idx_i = idx[i];
100: len = n[i];
101: for (j=0; j<len; j++) {
102: row = idx_i[j];
103: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
104: PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);
105: w4[proc]++;
106: }
107: for (j=0; j<size; j++) {
108: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
109: }
110: }
112: nrqs = 0; /* no of outgoing messages */
113: msz = 0; /* total mesg length (for all procs) */
114: w1[rank] = 0; /* no mesg sent to itself */
115: w3[rank] = 0;
116: for (i=0; i<size; i++) {
117: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
118: }
119: /* pa is the list of processors to communicate with */
120: PetscMalloc1((nrqs+1),&pa);
121: for (i=0,j=0; i<size; i++) {
122: if (w1[i]) {pa[j] = i; j++;}
123: }
125: /* Each message would have a header = 1 + 2*(no of IS) + data */
126: for (i=0; i<nrqs; i++) {
127: j = pa[i];
128: w1[j] += w2[j] + 2*w3[j];
129: msz += w1[j];
130: }
132: /* Determine the number of messages to expect, their lengths, and the from-ids */
133: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
134: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
136: /* Now post the Irecvs corresponding to these messages */
137: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
139: /* Allocate Memory for outgoing messages */
140: PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
141: PetscMemzero(outdat,size*sizeof(PetscInt*));
142: PetscMemzero(ptr,size*sizeof(PetscInt*));
143: {
144: PetscInt *iptr = tmp,ict = 0;
145: for (i=0; i<nrqs; i++) {
146: j = pa[i];
147: iptr += ict;
148: outdat[j] = iptr;
149: ict = w1[j];
150: }
151: }
153: /* Form the outgoing messages */
154: /*plug in the headers*/
155: for (i=0; i<nrqs; i++) {
156: j = pa[i];
157: outdat[j][0] = 0;
158: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
159: ptr[j] = outdat[j] + 2*w3[j] + 1;
160: }
162: /* Memory for doing local proc's work*/
163: {
164: PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mbs*imax,&d_p, (Mbs/PETSC_BITS_PER_BYTE+1)*imax,&t_p);
166: for (i=0; i<imax; i++) {
167: table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
168: data[i] = d_p + (Mbs)*i;
169: }
170: }
172: /* Parse the IS and update local tables and the outgoing buf with the data*/
173: {
174: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
175: PetscBT table_i;
177: for (i=0; i<imax; i++) {
178: PetscMemzero(ctr,size*sizeof(PetscInt));
179: n_i = n[i];
180: table_i = table[i];
181: idx_i = idx[i];
182: data_i = data[i];
183: isz_i = isz[i];
184: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
185: row = idx_i[j];
186: PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);
187: if (proc != rank) { /* copy to the outgoing buffer */
188: ctr[proc]++;
189: *ptr[proc] = row;
190: ptr[proc]++;
191: } else { /* Update the local table */
192: if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
193: }
194: }
195: /* Update the headers for the current IS */
196: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
197: if ((ctr_j = ctr[j])) {
198: outdat_j = outdat[j];
199: k = ++outdat_j[0];
200: outdat_j[2*k] = ctr_j;
201: outdat_j[2*k-1] = i;
202: }
203: }
204: isz[i] = isz_i;
205: }
206: }
208: /* Now post the sends */
209: PetscMalloc1((nrqs+1),&s_waits1);
210: for (i=0; i<nrqs; ++i) {
211: j = pa[i];
212: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
213: }
215: /* No longer need the original indices*/
216: for (i=0; i<imax; ++i) {
217: ISRestoreIndices(is[i],idx+i);
218: }
219: PetscFree2(idx,n);
221: for (i=0; i<imax; ++i) {
222: ISDestroy(&is[i]);
223: }
225: /* Do Local work*/
226: MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);
228: /* Receive messages*/
229: PetscMalloc1((nrqr+1),&recv_status);
230: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
232: PetscMalloc1((nrqs+1),&s_status);
233: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
235: /* Phase 1 sends are complete - deallocate buffers */
236: PetscFree4(outdat,ptr,tmp,ctr);
237: PetscFree4(w1,w2,w3,w4);
239: PetscMalloc1((nrqr+1),&xdata);
240: PetscMalloc1((nrqr+1),&isz1);
241: MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
242: PetscFree(rbuf[0]);
243: PetscFree(rbuf);
245: /* Send the data back*/
246: /* Do a global reduction to know the buffer space req for incoming messages*/
247: {
248: PetscMPIInt *rw1;
250: PetscCalloc1(size,&rw1);
252: for (i=0; i<nrqr; ++i) {
253: proc = recv_status[i].MPI_SOURCE;
254: if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
255: rw1[proc] = isz1[i];
256: }
258: PetscFree(onodes1);
259: PetscFree(olengths1);
261: /* Determine the number of messages to expect, their lengths, and the from-ids */
262: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
263: PetscFree(rw1);
264: }
265: /* Now post the Irecvs corresponding to these messages */
266: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
268: /* Now post the sends */
269: PetscMalloc1((nrqr+1),&s_waits2);
270: for (i=0; i<nrqr; ++i) {
271: j = recv_status[i].MPI_SOURCE;
272: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
273: }
275: /* receive work done on other processors*/
276: {
277: PetscMPIInt idex;
278: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
279: PetscBT table_i;
280: MPI_Status *status2;
282: PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);
283: for (i=0; i<nrqs; ++i) {
284: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
285: /* Process the message*/
286: rbuf2_i = rbuf2[idex];
287: ct1 = 2*rbuf2_i[0]+1;
288: jmax = rbuf2[idex][0];
289: for (j=1; j<=jmax; j++) {
290: max = rbuf2_i[2*j];
291: is_no = rbuf2_i[2*j-1];
292: isz_i = isz[is_no];
293: data_i = data[is_no];
294: table_i = table[is_no];
295: for (k=0; k<max; k++,ct1++) {
296: row = rbuf2_i[ct1];
297: if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
298: }
299: isz[is_no] = isz_i;
300: }
301: }
302: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
303: PetscFree(status2);
304: }
306: for (i=0; i<imax; ++i) {
307: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);
308: }
311: PetscFree(onodes2);
312: PetscFree(olengths2);
314: PetscFree(pa);
315: PetscFree(rbuf2[0]);
316: PetscFree(rbuf2);
317: PetscFree(s_waits1);
318: PetscFree(r_waits1);
319: PetscFree(s_waits2);
320: PetscFree(r_waits2);
321: PetscFree5(table,data,isz,d_p,t_p);
322: PetscFree(s_status);
323: PetscFree(recv_status);
324: PetscFree(xdata[0]);
325: PetscFree(xdata);
326: PetscFree(isz1);
327: return(0);
328: }
332: /*
333: MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
334: the work on the local processor.
336: Inputs:
337: C - MAT_MPIBAIJ;
338: imax - total no of index sets processed at a time;
339: table - an array of char - size = Mbs bits.
341: Output:
342: isz - array containing the count of the solution elements corresponding
343: to each index set;
344: data - pointer to the solutions
345: */
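/*
   Editorial sketch (illustrative only): the PetscBT pattern the routine below relies
   on to add each block row at most once.  Mbs, rows, nrows are hypothetical.

      PetscBT  bt;
      PetscInt j,nadded = 0,*found;

      PetscBTCreate(Mbs,&bt);
      PetscMalloc1(Mbs,&found);
      for (j=0; j<nrows; j++) {
        if (!PetscBTLookupSet(bt,rows[j])) found[nadded++] = rows[j];
      }
      PetscBTDestroy(&bt);
*/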
346: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
347: {
348: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
349: Mat A = c->A,B = c->B;
350: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
351: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
352: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
353: PetscBT table_i;
356: rstart = c->rstartbs;
357: cstart = c->cstartbs;
358: ai = a->i;
359: aj = a->j;
360: bi = b->i;
361: bj = b->j;
362: garray = c->garray;
365: for (i=0; i<imax; i++) {
366: data_i = data[i];
367: table_i = table[i];
368: isz_i = isz[i];
369: for (j=0,max=isz[i]; j<max; j++) {
370: row = data_i[j] - rstart;
371: start = ai[row];
372: end = ai[row+1];
373: for (k=start; k<end; k++) { /* Amat */
374: val = aj[k] + cstart;
375: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
376: }
377: start = bi[row];
378: end = bi[row+1];
379: for (k=start; k<end; k++) { /* Bmat */
380: val = garray[bj[k]];
381: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
382: }
383: }
384: isz[i] = isz_i;
385: }
386: return(0);
387: }
390: /*
391: MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
392: and return the output
394: Input:
395: C - the matrix
396: nrqr - no of messages being processed.
397: rbuf - an array of pointers to the received requests
399: Output:
400: xdata - array of messages to be sent back
401: isz1 - size of each message
403: For better efficiency perhaps we should malloc separately each xdata[i],
404: then if a realloc is required we need only copy the data for that one row
405: rather than all previous rows, as is done now where a single large chunk of
406: memory is used.
408: */
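/*
   Editorial sketch (illustrative only): the grow-on-demand pattern the note above
   refers to.  When the single buffer xdata[0] proves too small, the routine below
   allocates a buffer 1.5x larger, copies the data over, and re-fixes the
   per-message pointers xdata[1..i]:

      new_estimate = (PetscInt)(1.5*mem_estimate)+1;
      PetscMalloc1(new_estimate,&tmp);
      PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
      PetscFree(xdata[0]);
      xdata[0]     = tmp;
      mem_estimate = new_estimate;
      for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
*/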
409: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
410: {
411: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
412: Mat A = c->A,B = c->B;
413: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
415: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
416: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
417: PetscInt val,max1,max2,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
418: PetscInt *rbuf_i,kmax,rbuf_0;
419: PetscBT xtable;
422: Mbs = c->Mbs;
423: rstart = c->rstartbs;
424: cstart = c->cstartbs;
425: ai = a->i;
426: aj = a->j;
427: bi = b->i;
428: bj = b->j;
429: garray = c->garray;
432: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
433: rbuf_i = rbuf[i];
434: rbuf_0 = rbuf_i[0];
435: ct += rbuf_0;
436: for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
437: }
439: if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
440: else max1 = 1;
441: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
442: PetscMalloc1(mem_estimate,&xdata[0]);
443: ++no_malloc;
444: PetscBTCreate(Mbs,&xtable);
445: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
447: ct3 = 0;
448: for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
449: rbuf_i = rbuf[i];
450: rbuf_0 = rbuf_i[0];
451: ct1 = 2*rbuf_0+1;
452: ct2 = ct1;
453: ct3 += ct1;
454: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
455: PetscBTMemzero(Mbs,xtable);
456: oct2 = ct2;
457: kmax = rbuf_i[2*j];
458: for (k=0; k<kmax; k++,ct1++) {
459: row = rbuf_i[ct1];
460: if (!PetscBTLookupSet(xtable,row)) {
461: if (!(ct3 < mem_estimate)) {
462: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
463: PetscMalloc1(new_estimate,&tmp);
464: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
465: PetscFree(xdata[0]);
466: xdata[0] = tmp;
467: mem_estimate = new_estimate; ++no_malloc;
468: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
469: }
470: xdata[i][ct2++] = row;
471: ct3++;
472: }
473: }
474: for (k=oct2,max2=ct2; k<max2; k++) {
475: row = xdata[i][k] - rstart;
476: start = ai[row];
477: end = ai[row+1];
478: for (l=start; l<end; l++) {
479: val = aj[l] + cstart;
480: if (!PetscBTLookupSet(xtable,val)) {
481: if (!(ct3 < mem_estimate)) {
482: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
483: PetscMalloc1(new_estimate,&tmp);
484: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
485: PetscFree(xdata[0]);
486: xdata[0] = tmp;
487: mem_estimate = new_estimate; ++no_malloc;
488: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
489: }
490: xdata[i][ct2++] = val;
491: ct3++;
492: }
493: }
494: start = bi[row];
495: end = bi[row+1];
496: for (l=start; l<end; l++) {
497: val = garray[bj[l]];
498: if (!PetscBTLookupSet(xtable,val)) {
499: if (!(ct3 < mem_estimate)) {
500: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
501: PetscMalloc1(new_estimate,&tmp);
502: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
503: PetscFree(xdata[0]);
504: xdata[0] = tmp;
505: mem_estimate = new_estimate; ++no_malloc;
506: for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
507: }
508: xdata[i][ct2++] = val;
509: ct3++;
510: }
511: }
512: }
513: /* Update the header*/
514: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
515: xdata[i][2*j-1] = rbuf_i[2*j-1];
516: }
517: xdata[i][0] = rbuf_0;
518: xdata[i+1] = xdata[i] + ct2;
519: isz1[i] = ct2; /* size of each message */
520: }
521: PetscBTDestroy(&xtable);
522: PetscInfo3(C,"Allocated %D PetscInts, required %D, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
523: return(0);
524: }
528: PetscErrorCode MatGetSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
529: {
530: IS *isrow_new,*iscol_new;
531: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
533: PetscInt nmax,nstages_local,nstages,i,pos,max_no,ncol,nrow,N=C->cmap->N,bs=C->rmap->bs;
534: PetscBool colflag,*allcolumns,*allrows;
537: /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */
538: for (i = 0; i < ismax; ++i) {
539: PetscBool sorted;
540: ISSorted(iscol[i], &sorted);
541: if (!sorted) SETERRQ1(((PetscObject)iscol[i])->comm, PETSC_ERR_SUP, "Column index set %D not sorted", i);
542: }
543: /* The compression and expansion should be avoided. They do not point
544: out errors and might change the indices, hence are buggy */
545: PetscMalloc2(ismax+1,&isrow_new,ismax+1,&iscol_new);
546: ISCompressIndicesGeneral(N,C->rmap->n,bs,ismax,isrow,isrow_new);
547: ISCompressIndicesGeneral(N,C->cmap->n,bs,ismax,iscol,iscol_new);
549: /* Check for special case: each processor gets entire matrix columns */
550: PetscMalloc2(ismax+1,&allcolumns,ismax+1,&allrows);
551: for (i=0; i<ismax; i++) {
552: ISIdentity(iscol[i],&colflag);
553: ISGetLocalSize(iscol[i],&ncol);
554: if (colflag && ncol == C->cmap->N) allcolumns[i] = PETSC_TRUE;
555: else allcolumns[i] = PETSC_FALSE;
557: ISIdentity(isrow[i],&colflag);
558: ISGetLocalSize(isrow[i],&nrow);
559: if (colflag && nrow == C->rmap->N) allrows[i] = PETSC_TRUE;
560: else allrows[i] = PETSC_FALSE;
561: }
563: /* Allocate memory to hold all the submatrices */
564: if (scall != MAT_REUSE_MATRIX) {
565: PetscMalloc1((ismax+1),submat);
566: }
567: /* Determine the number of stages through which submatrices are done */
568: nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt));
569: if (!nmax) nmax = 1;
570: nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
572: /* Make sure every processor loops through the nstages */
573: MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
574: for (i=0,pos=0; i<nstages; i++) {
575: if (pos+nmax <= ismax) max_no = nmax;
576: else if (pos == ismax) max_no = 0;
577: else max_no = ismax-pos;
578: MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,allrows+pos,allcolumns+pos,*submat+pos);
579: pos += max_no;
580: }
582: for (i=0; i<ismax; i++) {
583: ISDestroy(&isrow_new[i]);
584: ISDestroy(&iscol_new[i]);
585: }
586: PetscFree2(isrow_new,iscol_new);
587: PetscFree2(allcolumns,allrows);
588: return(0);
589: }
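/*
   Editorial note: a minimal usage sketch (not part of the original file) of the
   public MatGetSubMatrices() interface that dispatches to the routine above for
   MATMPIBAIJ.  rowis and colis are hypothetical, previously created index sets
   with sorted, block-aligned indices.

      Mat *submat;
      MatGetSubMatrices(C,1,&rowis,&colis,MAT_INITIAL_MATRIX,&submat);
      ... each process now owns submat[0], a sequential (S)BAIJ matrix ...
      MatDestroyMatrices(1,&submat);
*/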
591: #if defined(PETSC_USE_CTABLE)
594: PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
595: {
596: PetscInt nGlobalNd = proc_gnode[size];
597: PetscMPIInt fproc;
601: PetscMPIIntCast((PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)),&fproc);
602: if (fproc > size) fproc = size;
603: while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
604: if (row < proc_gnode[fproc]) fproc--;
605: else fproc++;
606: }
607: *rank = fproc;
608: return(0);
609: }
610: #endif
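/*
   Editorial sketch (illustrative only): for PetscGetProc above (compiled only when
   PETSC_USE_CTABLE is defined), proc_gnode[] holds the cumulative block-row
   ownership ranges, i.e. rank p owns block rows proc_gnode[p] <= row < proc_gnode[p+1],
   and proc_gnode[size] is the global number of block rows.  For example, with
   size = 3 and proc_gnode = {0, 4, 9, 12}:

      PetscMPIInt owner;
      PetscGetProc(10,3,proc_gnode,&owner);                owner == 2, since 9 <= 10 < 12
*/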
612: /* -------------------------------------------------------------------------*/
613: /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
616: PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allrows,PetscBool *allcolumns,Mat *submats)
617: {
618: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
619: Mat A = c->A;
620: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
621: const PetscInt **irow,**icol,*irow_i;
622: PetscInt *nrow,*ncol,*w3,*w4,start;
624: PetscMPIInt size,tag0,tag1,tag2,tag3,*w1,*w2,nrqr,idex,end,proc;
625: PetscInt **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,**rbuf1,row;
626: PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
627: PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
628: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
629: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
630: PetscInt bs =C->rmap->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
631: PetscInt cstart = c->cstartbs,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
632: PetscInt *bmap = c->garray,ctmp,rstart=c->rstartbs;
633: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3,*s_waits3;
634: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2,*r_status3;
635: MPI_Comm comm;
636: PetscBool flag;
637: PetscMPIInt *onodes1,*olengths1;
638: PetscBool ijonly=c->ijonly; /* private flag indicates only matrix data structures are requested */
640: /* variables below are used for the matrix numerical values - case of !ijonly */
641: MPI_Request *r_waits4,*s_waits4;
642: MPI_Status *r_status4,*s_status4;
643: MatScalar **rbuf4,**sbuf_aa,*vals,*mat_a = NULL,*sbuf_aa_i,*vworkA = NULL,*vworkB = NULL;
644: MatScalar *a_a=a->a,*b_a=b->a;
646: #if defined(PETSC_USE_CTABLE)
647: PetscInt tt;
648: PetscTable *rmap,*cmap,rmap_i,cmap_i=NULL;
649: #else
650: PetscInt **cmap,*cmap_i=NULL,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
651: #endif
654: PetscObjectGetComm((PetscObject)C,&comm);
655: tag0 = ((PetscObject)C)->tag;
656: size = c->size;
657: rank = c->rank;
659: /* Get some new tags to keep the communication clean */
660: PetscObjectGetNewTag((PetscObject)C,&tag1);
661: PetscObjectGetNewTag((PetscObject)C,&tag2);
662: PetscObjectGetNewTag((PetscObject)C,&tag3);
664: #if defined(PETSC_USE_CTABLE)
665: PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);
666: #else
667: PetscMalloc5(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol,Mbs+1,&rtable);
668: /* Create hash table for the mapping :row -> proc*/
669: for (i=0,j=0; i<size; i++) {
670: jmax = C->rmap->range[i+1]/bs;
671: for (; j<jmax; j++) rtable[j] = i;
672: }
673: #endif
675: for (i=0; i<ismax; i++) {
676: if (allrows[i]) {
677: irow[i] = NULL;
678: nrow[i] = C->rmap->N/bs;
679: } else {
680: ISGetIndices(isrow[i],&irow[i]);
681: ISGetLocalSize(isrow[i],&nrow[i]);
682: }
684: if (allcolumns[i]) {
685: icol[i] = NULL;
686: ncol[i] = C->cmap->N/bs;
687: } else {
688: ISGetIndices(iscol[i],&icol[i]);
689: ISGetLocalSize(iscol[i],&ncol[i]);
690: }
691: }
693: /* evaluate communication - mesg to whom, length of mesg, and buffer space
694: required. Based on this, buffers are allocated, and data copied into them */
695: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
696: for (i=0; i<ismax; i++) {
697: PetscMemzero(w4,size*sizeof(PetscInt)); /* initialise work vector*/
698: jmax = nrow[i];
699: irow_i = irow[i];
700: for (j=0; j<jmax; j++) {
701: if (allrows[i]) row = j;
702: else row = irow_i[j];
704: #if defined(PETSC_USE_CTABLE)
705: PetscGetProc(row,size,c->rangebs,&proc);
706: #else
707: proc = rtable[row];
708: #endif
709: w4[proc]++;
710: }
711: for (j=0; j<size; j++) {
712: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
713: }
714: }
716: nrqs = 0; /* no of outgoing messages */
717: msz = 0; /* total mesg length for all procs */
718: w1[rank] = 0; /* no mesg sent to itself */
719: w3[rank] = 0;
720: for (i=0; i<size; i++) {
721: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
722: }
723: PetscMalloc1((nrqs+1),&pa); /*(proc -array)*/
724: for (i=0,j=0; i<size; i++) {
725: if (w1[i]) { pa[j] = i; j++; }
726: }
728: /* Each message would have a header = 1 + 2*(no of IS) + data */
729: for (i=0; i<nrqs; i++) {
730: j = pa[i];
731: w1[j] += w2[j] + 2* w3[j];
732: msz += w1[j];
733: }
735: /* Determine the number of messages to expect, their lengths, and the from-ids */
736: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
737: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
739: /* Now post the Irecvs corresponding to these messages */
740: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
742: PetscFree(onodes1);
743: PetscFree(olengths1);
745: /* Allocate Memory for outgoing messages */
746: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
747: PetscMemzero(sbuf1,size*sizeof(PetscInt*));
748: PetscMemzero(ptr,size*sizeof(PetscInt*));
749: {
750: PetscInt *iptr = tmp,ict = 0;
751: for (i=0; i<nrqs; i++) {
752: j = pa[i];
753: iptr += ict;
754: sbuf1[j] = iptr;
755: ict = w1[j];
756: }
757: }
759: /* Form the outgoing messages */
760: /* Initialise the header space */
761: for (i=0; i<nrqs; i++) {
762: j = pa[i];
763: sbuf1[j][0] = 0;
764: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
765: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
766: }
768: /* Parse the isrow and copy data into outbuf */
769: for (i=0; i<ismax; i++) {
770: PetscMemzero(ctr,size*sizeof(PetscInt));
771: irow_i = irow[i];
772: jmax = nrow[i];
773: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
774: if (allrows[i]) row = j;
775: else row = irow_i[j];
777: #if defined(PETSC_USE_CTABLE)
778: PetscGetProc(row,size,c->rangebs,&proc);
779: #else
780: proc = rtable[row];
781: #endif
782: if (proc != rank) { /* copy to the outgoing buf*/
783: ctr[proc]++;
784: *ptr[proc] = row;
785: ptr[proc]++;
786: }
787: }
788: /* Update the headers for the current IS */
789: for (j=0; j<size; j++) { /* Can Optimise this loop too */
790: if ((ctr_j = ctr[j])) {
791: sbuf1_j = sbuf1[j];
792: k = ++sbuf1_j[0];
793: sbuf1_j[2*k] = ctr_j;
794: sbuf1_j[2*k-1] = i;
795: }
796: }
797: }
799: /* Now post the sends */
800: PetscMalloc1((nrqs+1),&s_waits1);
801: for (i=0; i<nrqs; ++i) {
802: j = pa[i];
803: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
804: }
806: /* Post Receives to capture the buffer size */
807: PetscMalloc1((nrqs+1),&r_waits2);
808: PetscMalloc1((nrqs+1),&rbuf2);
809: rbuf2[0] = tmp + msz;
810: for (i=1; i<nrqs; ++i) {
811: j = pa[i];
812: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
813: }
814: for (i=0; i<nrqs; ++i) {
815: j = pa[i];
816: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
817: }
819: /* Send to other procs the buf size they should allocate */
821: /* Receive messages*/
822: PetscMalloc1((nrqr+1),&s_waits2);
823: PetscMalloc1((nrqr+1),&r_status1);
824: PetscMalloc3(nrqr+1,&sbuf2,nrqr,&req_size,nrqr,&req_source);
825: {
826: Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
827: PetscInt *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
829: for (i=0; i<nrqr; ++i) {
830: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
832: req_size[idex] = 0;
833: rbuf1_i = rbuf1[idex];
834: start = 2*rbuf1_i[0] + 1;
835: MPI_Get_count(r_status1+i,MPIU_INT,&end);
836: PetscMalloc1(end,&sbuf2[idex]);
837: sbuf2_i = sbuf2[idex];
838: for (j=start; j<end; j++) {
839: id = rbuf1_i[j] - rstart;
840: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
841: sbuf2_i[j] = ncols;
842: req_size[idex] += ncols;
843: }
844: req_source[idex] = r_status1[i].MPI_SOURCE;
845: /* form the header */
846: sbuf2_i[0] = req_size[idex];
847: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
848: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
849: }
850: }
851: PetscFree(r_status1);
852: PetscFree(r_waits1);
854: /* recv buffer sizes */
855: /* Receive messages*/
856: PetscMalloc1((nrqs+1),&rbuf3);
857: PetscMalloc1((nrqs+1),&r_waits3);
858: PetscMalloc1((nrqs+1),&r_status2);
859: if (!ijonly) {
860: PetscMalloc1((nrqs+1),&rbuf4);
861: PetscMalloc1((nrqs+1),&r_waits4);
862: }
864: for (i=0; i<nrqs; ++i) {
865: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
866: PetscMalloc1(rbuf2[idex][0],&rbuf3[idex]);
867: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
868: if (!ijonly) {
869: PetscMalloc1(rbuf2[idex][0]*bs2,&rbuf4[idex]);
870: MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
871: }
872: }
873: PetscFree(r_status2);
874: PetscFree(r_waits2);
876: /* Wait on sends1 and sends2 */
877: PetscMalloc1((nrqs+1),&s_status1);
878: PetscMalloc1((nrqr+1),&s_status2);
880: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
881: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
882: PetscFree(s_status1);
883: PetscFree(s_status2);
884: PetscFree(s_waits1);
885: PetscFree(s_waits2);
887: /* Now allocate buffers for a->j, and send them off */
888: PetscMalloc1((nrqr+1),&sbuf_aj);
889: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
890: PetscMalloc1((j+1),&sbuf_aj[0]);
891: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
893: PetscMalloc1((nrqr+1),&s_waits3);
894: {
895: for (i=0; i<nrqr; i++) {
896: rbuf1_i = rbuf1[i];
897: sbuf_aj_i = sbuf_aj[i];
898: ct1 = 2*rbuf1_i[0] + 1;
899: ct2 = 0;
900: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
901: kmax = rbuf1[i][2*j];
902: for (k=0; k<kmax; k++,ct1++) {
903: row = rbuf1_i[ct1] - rstart;
904: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
905: ncols = nzA + nzB;
906: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
908: /* load the column indices for this row into cols*/
909: cols = sbuf_aj_i + ct2;
910: for (l=0; l<nzB; l++) {
911: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
912: else break;
913: }
914: imark = l;
915: for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l];
916: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
917: ct2 += ncols;
918: }
919: }
920: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
921: }
922: }
923: PetscMalloc1((nrqs+1),&r_status3);
924: PetscMalloc1((nrqr+1),&s_status3);
926: /* Allocate buffers for a->a, and send them off */
927: if (!ijonly) {
928: PetscMalloc1((nrqr+1),&sbuf_aa);
929: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
930: PetscMalloc1((j+1)*bs2,&sbuf_aa[0]);
931: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
933: PetscMalloc1((nrqr+1),&s_waits4);
934: {
935: for (i=0; i<nrqr; i++) {
936: rbuf1_i = rbuf1[i];
937: sbuf_aa_i = sbuf_aa[i];
938: ct1 = 2*rbuf1_i[0]+1;
939: ct2 = 0;
940: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
941: kmax = rbuf1_i[2*j];
942: for (k=0; k<kmax; k++,ct1++) {
943: row = rbuf1_i[ct1] - rstart;
944: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
945: ncols = nzA + nzB;
946: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
947: vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;
949: /* load the column values for this row into vals*/
950: vals = sbuf_aa_i+ct2*bs2;
951: for (l=0; l<nzB; l++) {
952: if ((bmap[cworkB[l]]) < cstart) {
953: PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
954: } else break;
955: }
956: imark = l;
957: for (l=0; l<nzA; l++) {
958: PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));
959: }
960: for (l=imark; l<nzB; l++) {
961: PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
962: }
963: ct2 += ncols;
964: }
965: }
966: MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);
967: }
968: }
969: PetscMalloc1((nrqs+1),&r_status4);
970: PetscMalloc1((nrqr+1),&s_status4);
971: }
972: PetscFree(rbuf1[0]);
973: PetscFree(rbuf1);
975: /* Form the matrix */
976: /* create col map: global col of C -> local col of submatrices */
977: {
978: const PetscInt *icol_i;
979: #if defined(PETSC_USE_CTABLE)
980: PetscMalloc1((1+ismax),&cmap);
981: for (i=0; i<ismax; i++) {
982: if (!allcolumns[i]) {
983: PetscTableCreate(ncol[i]+1,c->Nbs+1,&cmap[i]);
984: jmax = ncol[i];
985: icol_i = icol[i];
986: cmap_i = cmap[i];
987: for (j=0; j<jmax; j++) {
988: PetscTableAdd(cmap_i,icol_i[j]+1,j+1,INSERT_VALUES);
989: }
990: } else {
991: cmap[i] = NULL;
992: }
993: }
994: #else
995: PetscMalloc1(ismax,&cmap);
996: for (i=0; i<ismax; i++) {
997: if (!allcolumns[i]) {
998: PetscCalloc1(c->Nbs,&cmap[i]);
999: jmax = ncol[i];
1000: icol_i = icol[i];
1001: cmap_i = cmap[i];
1002: for (j=0; j<jmax; j++) {
1003: cmap_i[icol_i[j]] = j+1;
1004: }
1005: } else { /* allcolumns[i] */
1006: cmap[i] = NULL;
1007: }
1008: }
1009: #endif
1010: }
1012: /* Create lens which is required for MatCreate... */
1013: for (i=0,j=0; i<ismax; i++) j += nrow[i];
1014: PetscMalloc((1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt),&lens);
1015: lens[0] = (PetscInt*)(lens + ismax);
1016: PetscMemzero(lens[0],j*sizeof(PetscInt));
1017: for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1019: /* Update lens from local data */
1020: for (i=0; i<ismax; i++) {
1021: jmax = nrow[i];
1022: if (!allcolumns[i]) cmap_i = cmap[i];
1023: irow_i = irow[i];
1024: lens_i = lens[i];
1025: for (j=0; j<jmax; j++) {
1026: if (allrows[i]) row = j;
1027: else row = irow_i[j];
1029: #if defined(PETSC_USE_CTABLE)
1030: PetscGetProc(row,size,c->rangebs,&proc);
1031: #else
1032: proc = rtable[row];
1033: #endif
1034: if (proc == rank) {
1035: /* Get indices from matA and then from matB */
1036: row = row - rstart;
1037: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1038: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1039: if (!allcolumns[i]) {
1040: #if defined(PETSC_USE_CTABLE)
1041: for (k=0; k<nzA; k++) {
1042: PetscTableFind(cmap_i,cstart+cworkA[k]+1,&tt);
1043: if (tt) lens_i[j]++;
1044: }
1045: for (k=0; k<nzB; k++) {
1046: PetscTableFind(cmap_i,bmap[cworkB[k]]+1,&tt);
1047: if (tt) lens_i[j]++;
1048: }
1050: #else
1051: for (k=0; k<nzA; k++) {
1052: if (cmap_i[cstart + cworkA[k]]) lens_i[j]++;
1053: }
1054: for (k=0; k<nzB; k++) {
1055: if (cmap_i[bmap[cworkB[k]]]) lens_i[j]++;
1056: }
1057: #endif
1058: } else { /* allcolumns */
1059: lens_i[j] = nzA + nzB;
1060: }
1061: }
1062: }
1063: }
1064: #if defined(PETSC_USE_CTABLE)
1065: /* Create row map*/
1066: PetscMalloc1((1+ismax),&rmap);
1067: for (i=0; i<ismax; i++) {
1068: PetscTableCreate(nrow[i]+1,c->Mbs+1,&rmap[i]);
1069: }
1070: #else
1071: /* Create row map*/
1072: PetscMalloc((1+ismax)*sizeof(PetscInt*)+ ismax*Mbs*sizeof(PetscInt),&rmap);
1073: rmap[0] = (PetscInt*)(rmap + ismax);
1074: PetscMemzero(rmap[0],ismax*Mbs*sizeof(PetscInt));
1075: for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + Mbs;
1076: #endif
1077: for (i=0; i<ismax; i++) {
1078: irow_i = irow[i];
1079: jmax = nrow[i];
1080: #if defined(PETSC_USE_CTABLE)
1081: rmap_i = rmap[i];
1082: for (j=0; j<jmax; j++) {
1083: if (allrows[i]) {
1084: PetscTableAdd(rmap_i,j+1,j+1,INSERT_VALUES);
1085: } else {
1086: PetscTableAdd(rmap_i,irow_i[j]+1,j+1,INSERT_VALUES);
1087: }
1088: }
1089: #else
1090: rmap_i = rmap[i];
1091: for (j=0; j<jmax; j++) {
1092: if (allrows[i]) rmap_i[j] = j;
1093: else rmap_i[irow_i[j]] = j;
1094: }
1095: #endif
1096: }
1098: /* Update lens from offproc data */
1099: {
1100: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1101: PetscMPIInt ii;
1103: for (tmp2=0; tmp2<nrqs; tmp2++) {
1104: MPI_Waitany(nrqs,r_waits3,&ii,r_status3+tmp2);
1105: idex = pa[ii];
1106: sbuf1_i = sbuf1[idex];
1107: jmax = sbuf1_i[0];
1108: ct1 = 2*jmax+1;
1109: ct2 = 0;
1110: rbuf2_i = rbuf2[ii];
1111: rbuf3_i = rbuf3[ii];
1112: for (j=1; j<=jmax; j++) {
1113: is_no = sbuf1_i[2*j-1];
1114: max1 = sbuf1_i[2*j];
1115: lens_i = lens[is_no];
1116: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1117: rmap_i = rmap[is_no];
1118: for (k=0; k<max1; k++,ct1++) {
1119: #if defined(PETSC_USE_CTABLE)
1120: PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
1121: row--;
1122: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1123: #else
1124: row = rmap_i[sbuf1_i[ct1]]; /* the row number in the new matrix */
1125: #endif
1126: max2 = rbuf2_i[ct1];
1127: for (l=0; l<max2; l++,ct2++) {
1128: if (!allcolumns[is_no]) {
1129: #if defined(PETSC_USE_CTABLE)
1130: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tt);
1131: if (tt) lens_i[row]++;
1132: #else
1133: if (cmap_i[rbuf3_i[ct2]]) lens_i[row]++;
1134: #endif
1135: } else { /* allcolumns */
1136: lens_i[row]++;
1137: }
1138: }
1139: }
1140: }
1141: }
1142: }
1143: PetscFree(r_status3);
1144: PetscFree(r_waits3);
1145: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1146: PetscFree(s_status3);
1147: PetscFree(s_waits3);
1149: /* Create the submatrices */
1150: if (scall == MAT_REUSE_MATRIX) {
1151: if (ijonly) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_REUSE_MATRIX and ijonly are not supported yet");
1152: /*
1153: Assumes new rows are same length as the old rows, hence bug!
1154: */
1155: for (i=0; i<ismax; i++) {
1156: mat = (Mat_SeqBAIJ*)(submats[i]->data);
1157: if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i]) || (C->rmap->bs != bs)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix: wrong size");
1158: PetscMemcmp(mat->ilen,lens[i],mat->mbs*sizeof(PetscInt),&flag);
1159: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix: wrong number of nonzeros");
1160: /* Initialize the matrix as if it were empty */
1161: PetscMemzero(mat->ilen,mat->mbs*sizeof(PetscInt));
1163: submats[i]->factortype = C->factortype;
1164: }
1165: } else {
1166: PetscInt bs_tmp;
1167: if (ijonly) bs_tmp = 1;
1168: else bs_tmp = bs;
1169: for (i=0; i<ismax; i++) {
1170: MatCreate(PETSC_COMM_SELF,submats+i);
1171: MatSetSizes(submats[i],nrow[i]*bs_tmp,ncol[i]*bs_tmp,nrow[i]*bs_tmp,ncol[i]*bs_tmp);
1172: MatSetType(submats[i],((PetscObject)A)->type_name);
1173: MatSeqBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]);
1174: MatSeqSBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]); /* this subroutine is used by SBAIJ routines */
1175: }
1176: }
1178: /* Assemble the matrices */
1179: /* First assemble the local rows */
1180: {
1181: PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i;
1182: MatScalar *imat_a = NULL;
1184: for (i=0; i<ismax; i++) {
1185: mat = (Mat_SeqBAIJ*)submats[i]->data;
1186: imat_ilen = mat->ilen;
1187: imat_j = mat->j;
1188: imat_i = mat->i;
1189: if (!ijonly) imat_a = mat->a;
1190: if (!allcolumns[i]) cmap_i = cmap[i];
1191: rmap_i = rmap[i];
1192: irow_i = irow[i];
1193: jmax = nrow[i];
1194: for (j=0; j<jmax; j++) {
1195: if (allrows[i]) row = j;
1196: else row = irow_i[j];
1198: #if defined(PETSC_USE_CTABLE)
1199: PetscGetProc(row,size,c->rangebs,&proc);
1200: #else
1201: proc = rtable[row];
1202: #endif
1203: if (proc == rank) {
1204: row = row - rstart;
1205: nzA = a_i[row+1] - a_i[row];
1206: nzB = b_i[row+1] - b_i[row];
1207: cworkA = a_j + a_i[row];
1208: cworkB = b_j + b_i[row];
1209: if (!ijonly) {
1210: vworkA = a_a + a_i[row]*bs2;
1211: vworkB = b_a + b_i[row]*bs2;
1212: }
1213: #if defined(PETSC_USE_CTABLE)
1214: PetscTableFind(rmap_i,row+rstart+1,&row);
1215: row--;
1216: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1217: #else
1218: row = rmap_i[row + rstart];
1219: #endif
1220: mat_i = imat_i[row];
1221: if (!ijonly) mat_a = imat_a + mat_i*bs2;
1222: mat_j = imat_j + mat_i;
1223: ilen_row = imat_ilen[row];
1225: /* load the column indices for this row into cols*/
1226: if (!allcolumns[i]) {
1227: for (l=0; l<nzB; l++) {
1228: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1229: #if defined(PETSC_USE_CTABLE)
1230: PetscTableFind(cmap_i,ctmp+1,&tcol);
1231: if (tcol) {
1232: #else
1233: if ((tcol = cmap_i[ctmp])) {
1234: #endif
1235: *mat_j++ = tcol - 1;
1236: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1237: mat_a += bs2;
1238: ilen_row++;
1239: }
1240: } else break;
1241: }
1242: imark = l;
1243: for (l=0; l<nzA; l++) {
1244: #if defined(PETSC_USE_CTABLE)
1245: PetscTableFind(cmap_i,cstart+cworkA[l]+1,&tcol);
1246: if (tcol) {
1247: #else
1248: if ((tcol = cmap_i[cstart + cworkA[l]])) {
1249: #endif
1250: *mat_j++ = tcol - 1;
1251: if (!ijonly) {
1252: PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1253: mat_a += bs2;
1254: }
1255: ilen_row++;
1256: }
1257: }
1258: for (l=imark; l<nzB; l++) {
1259: #if defined(PETSC_USE_CTABLE)
1260: PetscTableFind(cmap_i,bmap[cworkB[l]]+1,&tcol);
1261: if (tcol) {
1262: #else
1263: if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1264: #endif
1265: *mat_j++ = tcol - 1;
1266: if (!ijonly) {
1267: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1268: mat_a += bs2;
1269: }
1270: ilen_row++;
1271: }
1272: }
1273: } else { /* allcolumns */
1274: for (l=0; l<nzB; l++) {
1275: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1276: *mat_j++ = ctmp;
1277: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1278: mat_a += bs2;
1279: ilen_row++;
1280: } else break;
1281: }
1282: imark = l;
1283: for (l=0; l<nzA; l++) {
1284: *mat_j++ = cstart+cworkA[l];
1285: if (!ijonly) {
1286: PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1287: mat_a += bs2;
1288: }
1289: ilen_row++;
1290: }
1291: for (l=imark; l<nzB; l++) {
1292: *mat_j++ = bmap[cworkB[l]];
1293: if (!ijonly) {
1294: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1295: mat_a += bs2;
1296: }
1297: ilen_row++;
1298: }
1299: }
1300: imat_ilen[row] = ilen_row;
1301: }
1302: }
1303: }
1304: }
1306: /* Now assemble the off proc rows*/
1307: {
1308: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1309: PetscInt *imat_j,*imat_i;
1310: MatScalar *imat_a = NULL,*rbuf4_i = NULL;
1311: PetscMPIInt ii;
1313: for (tmp2=0; tmp2<nrqs; tmp2++) {
1314: if (ijonly) ii = tmp2;
1315: else {
1316: MPI_Waitany(nrqs,r_waits4,&ii,r_status4+tmp2);
1317: }
1318: idex = pa[ii];
1319: sbuf1_i = sbuf1[idex];
1320: jmax = sbuf1_i[0];
1321: ct1 = 2*jmax + 1;
1322: ct2 = 0;
1323: rbuf2_i = rbuf2[ii];
1324: rbuf3_i = rbuf3[ii];
1325: if (!ijonly) rbuf4_i = rbuf4[ii];
1326: for (j=1; j<=jmax; j++) {
1327: is_no = sbuf1_i[2*j-1];
1328: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1329: rmap_i = rmap[is_no];
1330: mat = (Mat_SeqBAIJ*)submats[is_no]->data;
1331: imat_ilen = mat->ilen;
1332: imat_j = mat->j;
1333: imat_i = mat->i;
1334: if (!ijonly) imat_a = mat->a;
1335: max1 = sbuf1_i[2*j];
1336: for (k=0; k<max1; k++,ct1++) {
1337: row = sbuf1_i[ct1];
1338: #if defined(PETSC_USE_CTABLE)
1339: PetscTableFind(rmap_i,row+1,&row);
1340: row--;
1341: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1342: #else
1343: row = rmap_i[row];
1344: #endif
1345: ilen = imat_ilen[row];
1346: mat_i = imat_i[row];
1347: if (!ijonly) mat_a = imat_a + mat_i*bs2;
1348: mat_j = imat_j + mat_i;
1349: max2 = rbuf2_i[ct1];
1351: if (!allcolumns[is_no]) {
1352: for (l=0; l<max2; l++,ct2++) {
1353: #if defined(PETSC_USE_CTABLE)
1354: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1355: if (tcol) {
1356: #else
1357: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1358: #endif
1359: *mat_j++ = tcol - 1;
1360: if (!ijonly) {
1361: PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1362: mat_a += bs2;
1363: }
1364: ilen++;
1365: }
1366: }
1367: } else { /* allcolumns */
1368: for (l=0; l<max2; l++,ct2++) {
1369: *mat_j++ = rbuf3_i[ct2];
1370: if (!ijonly) {
1371: PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1372: mat_a += bs2;
1373: }
1374: ilen++;
1375: }
1376: }
1377: imat_ilen[row] = ilen;
1378: }
1379: }
1380: }
1381: }
1382: if (!ijonly) {
1383: PetscFree(r_status4);
1384: PetscFree(r_waits4);
1385: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1386: PetscFree(s_waits4);
1387: PetscFree(s_status4);
1388: }
1390: /* Restore the indices */
1391: for (i=0; i<ismax; i++) {
1392: if (!allrows[i]) {
1393: ISRestoreIndices(isrow[i],irow+i);
1394: }
1395: if (!allcolumns[i]) {
1396: ISRestoreIndices(iscol[i],icol+i);
1397: }
1398: }
1400: /* Destroy allocated memory */
1401: #if defined(PETSC_USE_CTABLE)
1402: PetscFree4(irow,icol,nrow,ncol);
1403: #else
1404: PetscFree5(irow,icol,nrow,ncol,rtable);
1405: #endif
1406: PetscFree4(w1,w2,w3,w4);
1407: PetscFree(pa);
1409: PetscFree4(sbuf1,ptr,tmp,ctr);
1410: PetscFree(sbuf1);
1411: PetscFree(rbuf2);
1412: for (i=0; i<nrqr; ++i) {
1413: PetscFree(sbuf2[i]);
1414: }
1415: for (i=0; i<nrqs; ++i) {
1416: PetscFree(rbuf3[i]);
1417: }
1418: PetscFree3(sbuf2,req_size,req_source);
1419: PetscFree(rbuf3);
1420: PetscFree(sbuf_aj[0]);
1421: PetscFree(sbuf_aj);
1422: if (!ijonly) {
1423: for (i=0; i<nrqs; ++i) {PetscFree(rbuf4[i]);}
1424: PetscFree(rbuf4);
1425: PetscFree(sbuf_aa[0]);
1426: PetscFree(sbuf_aa);
1427: }
1429: #if defined(PETSC_USE_CTABLE)
1430: for (i=0; i<ismax; i++) {
1431: PetscTableDestroy((PetscTable*)&rmap[i]);
1432: }
1433: #endif
1434: PetscFree(rmap);
1436: for (i=0; i<ismax; i++) {
1437: if (!allcolumns[i]) {
1438: #if defined(PETSC_USE_CTABLE)
1439: PetscTableDestroy((PetscTable*)&cmap[i]);
1440: #else
1441: PetscFree(cmap[i]);
1442: #endif
1443: }
1444: }
1445: PetscFree(cmap);
1446: PetscFree(lens);
1448: for (i=0; i<ismax; i++) {
1449: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1450: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1451: }
1453: c->ijonly = PETSC_FALSE; /* set back to the default */
1454: return(0);
1455: }