Actual source code: baijov.c
petsc-3.6.1 2015-08-06
2: /*
3: Routines to compute overlapping regions of a parallel MPI matrix
4:    and to find submatrices that are shared across processors.
5: */
6: #include <../src/mat/impls/baij/mpi/mpibaij.h>
7: #include <petscbt.h>
9: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
10: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
11: extern PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
12: extern PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16: PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
17: {
19: PetscInt i,N=C->cmap->N, bs=C->rmap->bs;
20: IS *is_new;
23: PetscMalloc1(imax,&is_new);
24: /* Convert the indices into block format */
25: ISCompressIndicesGeneral(N,C->rmap->n,bs,imax,is,is_new);
26: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");
27: for (i=0; i<ov; ++i) {
28: MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);
29: }
30: for (i=0; i<imax; i++) {ISDestroy(&is[i]);}
31: ISExpandIndicesGeneral(N,N,bs,imax,is_new,is);
32: for (i=0; i<imax; i++) {ISDestroy(&is_new[i]);}
33: PetscFree(is_new);
34: return(0);
35: }
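/*
    A minimal usage sketch (an illustration, not part of this file): the implementation above
    is normally reached through the public MatIncreaseOverlap() interface, for example to
    enlarge the subdomains of an additive Schwarz preconditioner.  The matrix A, the error
    variable ierr, and the initial index set below are assumptions made for the example.

      IS       is[1];
      PetscInt rows[] = {0,1,2};

      ierr = ISCreateGeneral(PETSC_COMM_SELF,3,rows,PETSC_COPY_VALUES,&is[0]);CHKERRQ(ierr);
      ierr = MatIncreaseOverlap(A,1,is,2);CHKERRQ(ierr);      increase the overlap by 2 levels
      ierr = ISDestroy(&is[0]);CHKERRQ(ierr);
*/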
37: /*
38: Sample message format:
39: If a processor A wants processor B to process some elements corresponding
40: to index sets is[1], is[5]
41: mesg [0] = 2 (no of index sets in the mesg)
42: -----------
43: mesg [1] = 1 => is[1]
44:      mesg [2] = size of is[1] (number of indices)
45:   -----------
46:      mesg [3] = 5   => is[5]
47:      mesg [4] = size of is[5] (number of indices)
48:   -----------
49:      mesg [5]
50: mesg [n] data(is[1])
51: -----------
52: mesg[n+1]
53: mesg[m] data(is[5])
54: -----------
56: Notes:
57: nrqs - no of requests sent (or to be sent out)
58:   nrqr - no of requests received (which have to be, or which have been, processed)
59: */
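/*
    A hedged sketch of how a request buffer laid out as above is decoded (the variable names
    here are illustrative assumptions; the real parsing is done in
    MatIncreaseOverlap_MPIBAIJ_Once and MatIncreaseOverlap_MPIBAIJ_Receive below):

      PetscInt nis = mesg[0];               number of index sets packed in this message
      PetscInt off = 1 + 2*nis;             data blocks start right after the header
      PetscInt k,is_no,len;
      for (k=1; k<=nis; k++) {
        is_no = mesg[2*k-1];                which index set the block belongs to
        len   = mesg[2*k];                  number of row indices in the block
        ...  rows for is[is_no] are mesg[off], ..., mesg[off+len-1]  ...
        off += len;
      }
*/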
62: PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
63: {
64: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
65: const PetscInt **idx,*idx_i;
66: PetscInt *n,*w3,*w4,**data,len;
68: PetscMPIInt size,rank,tag1,tag2,*w2,*w1,nrqr;
69: PetscInt Mbs,i,j,k,**rbuf,row,proc=-1,nrqs,msz,**outdat,**ptr;
70: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
71: PetscMPIInt *onodes1,*olengths1,*onodes2,*olengths2;
72: PetscBT *table;
73: MPI_Comm comm;
74: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
75: MPI_Status *s_status,*recv_status;
76: char *t_p;
79: PetscObjectGetComm((PetscObject)C,&comm);
80: size = c->size;
81: rank = c->rank;
82: Mbs = c->Mbs;
84: PetscObjectGetNewTag((PetscObject)C,&tag1);
85: PetscObjectGetNewTag((PetscObject)C,&tag2);
87: PetscMalloc2(imax+1,&idx,imax,&n);
89: for (i=0; i<imax; i++) {
90: ISGetIndices(is[i],&idx[i]);
91: ISGetLocalSize(is[i],&n[i]);
92: }
94:   /* evaluate communication - mesg to whom, length of mesg, and buffer space
95:      required. Based on this, buffers are allocated, and data copied into them */
96: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
97: for (i=0; i<imax; i++) {
98: PetscMemzero(w4,size*sizeof(PetscInt)); /* initialise work vector*/
99: idx_i = idx[i];
100: len = n[i];
101: for (j=0; j<len; j++) {
102: row = idx_i[j];
103: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
104: PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);
105: w4[proc]++;
106: }
107: for (j=0; j<size; j++) {
108: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
109: }
110: }
112: nrqs = 0; /* no of outgoing messages */
113:   msz      = 0; /* total mesg length (for all procs) */
114: w1[rank] = 0; /* no mesg sent to itself */
115: w3[rank] = 0;
116: for (i=0; i<size; i++) {
117: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
118: }
119: /* pa - is list of processors to communicate with */
120: PetscMalloc1(nrqs+1,&pa);
121: for (i=0,j=0; i<size; i++) {
122: if (w1[i]) {pa[j] = i; j++;}
123: }
125: /* Each message would have a header = 1 + 2*(no of IS) + data */
126: for (i=0; i<nrqs; i++) {
127: j = pa[i];
128: w1[j] += w2[j] + 2*w3[j];
129: msz += w1[j];
130: }
132:   /* Determine the number of messages to expect, their lengths, and their from-ids */
133: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
134: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
136: /* Now post the Irecvs corresponding to these messages */
137: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
139: /* Allocate Memory for outgoing messages */
140: PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
141: PetscMemzero(outdat,size*sizeof(PetscInt*));
142: PetscMemzero(ptr,size*sizeof(PetscInt*));
143: {
144: PetscInt *iptr = tmp,ict = 0;
145: for (i=0; i<nrqs; i++) {
146: j = pa[i];
147: iptr += ict;
148: outdat[j] = iptr;
149: ict = w1[j];
150: }
151: }
153: /* Form the outgoing messages */
154:   /* plug in the headers */
155: for (i=0; i<nrqs; i++) {
156: j = pa[i];
157: outdat[j][0] = 0;
158: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
159: ptr[j] = outdat[j] + 2*w3[j] + 1;
160: }
162: /* Memory for doing local proc's work*/
163: {
164: PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mbs*imax,&d_p, (Mbs/PETSC_BITS_PER_BYTE+1)*imax,&t_p);
166: for (i=0; i<imax; i++) {
167: table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
168: data[i] = d_p + (Mbs)*i;
169: }
170: }
172: /* Parse the IS and update local tables and the outgoing buf with the data*/
173: {
174: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
175: PetscBT table_i;
177: for (i=0; i<imax; i++) {
178: PetscMemzero(ctr,size*sizeof(PetscInt));
179: n_i = n[i];
180: table_i = table[i];
181: idx_i = idx[i];
182: data_i = data[i];
183: isz_i = isz[i];
184: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
185: row = idx_i[j];
186: PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);
187: if (proc != rank) { /* copy to the outgoing buffer */
188: ctr[proc]++;
189: *ptr[proc] = row;
190: ptr[proc]++;
191: } else { /* Update the local table */
192: if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
193: }
194: }
195: /* Update the headers for the current IS */
196: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
197: if ((ctr_j = ctr[j])) {
198: outdat_j = outdat[j];
199: k = ++outdat_j[0];
200: outdat_j[2*k] = ctr_j;
201: outdat_j[2*k-1] = i;
202: }
203: }
204: isz[i] = isz_i;
205: }
206: }
208: /* Now post the sends */
209: PetscMalloc1(nrqs+1,&s_waits1);
210: for (i=0; i<nrqs; ++i) {
211: j = pa[i];
212: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
213: }
215: /* No longer need the original indices*/
216: for (i=0; i<imax; ++i) {
217: ISRestoreIndices(is[i],idx+i);
218: }
219: PetscFree2(idx,n);
221: for (i=0; i<imax; ++i) {
222: ISDestroy(&is[i]);
223: }
225: /* Do Local work*/
226: MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);
228: /* Receive messages*/
229: PetscMalloc1(nrqr+1,&recv_status);
230: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
232: PetscMalloc1(nrqs+1,&s_status);
233: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
235: /* Phase 1 sends are complete - deallocate buffers */
236: PetscFree4(outdat,ptr,tmp,ctr);
237: PetscFree4(w1,w2,w3,w4);
239: PetscMalloc1(nrqr+1,&xdata);
240: PetscMalloc1(nrqr+1,&isz1);
241: MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
242: PetscFree(rbuf[0]);
243: PetscFree(rbuf);
245: /* Send the data back*/
246:   /* Do a global reduction to know the buffer space required for incoming messages */
247: {
248: PetscMPIInt *rw1;
250: PetscCalloc1(size,&rw1);
252: for (i=0; i<nrqr; ++i) {
253: proc = recv_status[i].MPI_SOURCE;
254: if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
255: rw1[proc] = isz1[i];
256: }
258: PetscFree(onodes1);
259: PetscFree(olengths1);
261:     /* Determine the number of messages to expect, their lengths, and their from-ids */
262: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
263: PetscFree(rw1);
264: }
265: /* Now post the Irecvs corresponding to these messages */
266: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
268: /* Now post the sends */
269: PetscMalloc1(nrqr+1,&s_waits2);
270: for (i=0; i<nrqr; ++i) {
271: j = recv_status[i].MPI_SOURCE;
272: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
273: }
275: /* receive work done on other processors*/
276: {
277: PetscMPIInt idex;
278: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
279: PetscBT table_i;
280: MPI_Status *status2;
282: PetscMalloc1(PetscMax(nrqr,nrqs)+1,&status2);
283: for (i=0; i<nrqs; ++i) {
284: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
285: /* Process the message*/
286: rbuf2_i = rbuf2[idex];
287: ct1 = 2*rbuf2_i[0]+1;
288: jmax = rbuf2[idex][0];
289: for (j=1; j<=jmax; j++) {
290: max = rbuf2_i[2*j];
291: is_no = rbuf2_i[2*j-1];
292: isz_i = isz[is_no];
293: data_i = data[is_no];
294: table_i = table[is_no];
295: for (k=0; k<max; k++,ct1++) {
296: row = rbuf2_i[ct1];
297: if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
298: }
299: isz[is_no] = isz_i;
300: }
301: }
302: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
303: PetscFree(status2);
304: }
306: for (i=0; i<imax; ++i) {
307: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);
308: }
311: PetscFree(onodes2);
312: PetscFree(olengths2);
314: PetscFree(pa);
315: PetscFree(rbuf2[0]);
316: PetscFree(rbuf2);
317: PetscFree(s_waits1);
318: PetscFree(r_waits1);
319: PetscFree(s_waits2);
320: PetscFree(r_waits2);
321: PetscFree5(table,data,isz,d_p,t_p);
322: PetscFree(s_status);
323: PetscFree(recv_status);
324: PetscFree(xdata[0]);
325: PetscFree(xdata);
326: PetscFree(isz1);
327: return(0);
328: }
332: /*
333:   MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap() to do
334:   the work on the local processor.
336:     Inputs:
337:      C      - the MATMPIBAIJ matrix;
338:      imax   - total no of index sets processed at a time;
339:      table  - an array of PetscBT bit tables, each of size Mbs bits.
341: Output:
342: isz - array containing the count of the solution elements corresponding
343: to each index set;
344: data - pointer to the solutions
345: */
346: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
347: {
348: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
349: Mat A = c->A,B = c->B;
350: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
351: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
352: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
353: PetscBT table_i;
356: rstart = c->rstartbs;
357: cstart = c->cstartbs;
358: ai = a->i;
359: aj = a->j;
360: bi = b->i;
361: bj = b->j;
362: garray = c->garray;
365: for (i=0; i<imax; i++) {
366: data_i = data[i];
367: table_i = table[i];
368: isz_i = isz[i];
369: for (j=0,max=isz[i]; j<max; j++) {
370: row = data_i[j] - rstart;
371: start = ai[row];
372: end = ai[row+1];
373: for (k=start; k<end; k++) { /* Amat */
374: val = aj[k] + cstart;
375: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
376: }
377: start = bi[row];
378: end = bi[row+1];
379: for (k=start; k<end; k++) { /* Bmat */
380: val = garray[bj[k]];
381: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
382: }
383: }
384: isz[i] = isz_i;
385: }
386: return(0);
387: }
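/*
    The loops above rely on the PetscBT bit-table idiom used throughout this file:
    PetscBTLookupSet() returns the previous value of a bit and then sets it, so a value is
    appended to the output list only the first time it is encountered.  A self-contained,
    hedged sketch of that idiom (the names below are assumptions for illustration only):

      PetscBT  seen;
      PetscInt candidates[] = {3,7,3,1,7},out[5],nout = 0,k;

      ierr = PetscBTCreate(64,&seen);CHKERRQ(ierr);
      for (k=0; k<5; k++) {
        if (!PetscBTLookupSet(seen,candidates[k])) out[nout++] = candidates[k];
      }
      ierr = PetscBTDestroy(&seen);CHKERRQ(ierr);

    Afterwards out[] holds 3,7,1 and nout is 3, i.e. the duplicates have been filtered out.
*/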
390: /*
391:   MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
392: and return the output
394: Input:
395: C - the matrix
396: nrqr - no of messages being processed.
397:      rbuf - an array of pointers to the received requests
399: Output:
400: xdata - array of messages to be sent back
401: isz1 - size of each message
403:   For better efficiency perhaps we should malloc each xdata[i] separately;
404:   then if a realloc is required we would only need to copy the data for that one row
405:   rather than for all previous rows, as happens now where a single large chunk of
406:   memory is used.
408: */
409: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
410: {
411: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
412: Mat A = c->A,B = c->B;
413: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
415: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
416: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
417: PetscInt val,max1,max2,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
418: PetscInt *rbuf_i,kmax,rbuf_0;
419: PetscBT xtable;
422: Mbs = c->Mbs;
423: rstart = c->rstartbs;
424: cstart = c->cstartbs;
425: ai = a->i;
426: aj = a->j;
427: bi = b->i;
428: bj = b->j;
429: garray = c->garray;
432: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
433: rbuf_i = rbuf[i];
434: rbuf_0 = rbuf_i[0];
435: ct += rbuf_0;
436: for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
437: }
439: if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
440: else max1 = 1;
441: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
442: PetscMalloc1(mem_estimate,&xdata[0]);
443: ++no_malloc;
444: PetscBTCreate(Mbs,&xtable);
445: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
447: ct3 = 0;
448:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
449: rbuf_i = rbuf[i];
450: rbuf_0 = rbuf_i[0];
451: ct1 = 2*rbuf_0+1;
452: ct2 = ct1;
453: ct3 += ct1;
454: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
455: PetscBTMemzero(Mbs,xtable);
456: oct2 = ct2;
457: kmax = rbuf_i[2*j];
458: for (k=0; k<kmax; k++,ct1++) {
459: row = rbuf_i[ct1];
460: if (!PetscBTLookupSet(xtable,row)) {
461: if (!(ct3 < mem_estimate)) {
462: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
463: PetscMalloc1(new_estimate,&tmp);
464: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
465: PetscFree(xdata[0]);
466: xdata[0] = tmp;
467: mem_estimate = new_estimate; ++no_malloc;
468: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
469: }
470: xdata[i][ct2++] = row;
471: ct3++;
472: }
473: }
474: for (k=oct2,max2=ct2; k<max2; k++) {
475: row = xdata[i][k] - rstart;
476: start = ai[row];
477: end = ai[row+1];
478: for (l=start; l<end; l++) {
479: val = aj[l] + cstart;
480: if (!PetscBTLookupSet(xtable,val)) {
481: if (!(ct3 < mem_estimate)) {
482: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
483: PetscMalloc1(new_estimate,&tmp);
484: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
485: PetscFree(xdata[0]);
486: xdata[0] = tmp;
487: mem_estimate = new_estimate; ++no_malloc;
488: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
489: }
490: xdata[i][ct2++] = val;
491: ct3++;
492: }
493: }
494: start = bi[row];
495: end = bi[row+1];
496: for (l=start; l<end; l++) {
497: val = garray[bj[l]];
498: if (!PetscBTLookupSet(xtable,val)) {
499: if (!(ct3 < mem_estimate)) {
500: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
501: PetscMalloc1(new_estimate,&tmp);
502: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
503: PetscFree(xdata[0]);
504: xdata[0] = tmp;
505: mem_estimate = new_estimate; ++no_malloc;
506: for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
507: }
508: xdata[i][ct2++] = val;
509: ct3++;
510: }
511: }
512: }
513: /* Update the header*/
514: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
515: xdata[i][2*j-1] = rbuf_i[2*j-1];
516: }
517: xdata[i][0] = rbuf_0;
518: xdata[i+1] = xdata[i] + ct2;
519: isz1[i] = ct2; /* size of each message */
520: }
521: PetscBTDestroy(&xtable);
522: PetscInfo3(C,"Allocated %D bytes, required %D, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
523: return(0);
524: }
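/*
    The three identical reallocation blocks in the routine above implement a single pattern:
    when the shared arena xdata[0] runs out, grow it by roughly 1.5x, copy the old contents
    over, and rebase the per-message pointers xdata[1..i], since they all alias the same
    allocation.  A condensed sketch of that pattern (not a drop-in helper):

      if (ct3 >= mem_estimate) {
        new_estimate = (PetscInt)(1.5*mem_estimate) + 1;
        ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
        ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
        xdata[0]     = tmp;
        mem_estimate = new_estimate;
        for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
      }
*/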
528: PetscErrorCode MatGetSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
529: {
530: IS *isrow_new,*iscol_new;
531: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
533: PetscInt nmax,nstages_local,nstages,i,pos,max_no,ncol,nrow,N=C->cmap->N,bs=C->rmap->bs;
534: PetscBool colflag,*allcolumns,*allrows;
537:   /* The compression and expansion should be avoided. It does not flag
538:      errors and might change the indices, hence it is buggy */
539: PetscMalloc2(ismax+1,&isrow_new,ismax+1,&iscol_new);
540: ISCompressIndicesGeneral(N,C->rmap->n,bs,ismax,isrow,isrow_new);
541: ISCompressIndicesGeneral(N,C->cmap->n,bs,ismax,iscol,iscol_new);
543: /* Check for special case: each processor gets entire matrix columns */
544: PetscMalloc2(ismax+1,&allcolumns,ismax+1,&allrows);
545: for (i=0; i<ismax; i++) {
546: ISIdentity(iscol[i],&colflag);
547: ISGetLocalSize(iscol[i],&ncol);
548: if (colflag && ncol == C->cmap->N) allcolumns[i] = PETSC_TRUE;
549: else allcolumns[i] = PETSC_FALSE;
551: ISIdentity(isrow[i],&colflag);
552: ISGetLocalSize(isrow[i],&nrow);
553: if (colflag && nrow == C->rmap->N) allrows[i] = PETSC_TRUE;
554: else allrows[i] = PETSC_FALSE;
555: }
557: /* Allocate memory to hold all the submatrices */
558: if (scall != MAT_REUSE_MATRIX) {
559: PetscMalloc1(ismax+1,submat);
560: }
561: /* Determine the number of stages through which submatrices are done */
562: nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt));
563: if (!nmax) nmax = 1;
564: nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
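  /* e.g. (illustrative, assuming a 4-byte PetscInt): with c->Nbs = 1,000,000 block columns,
     nmax = 20,000,000/(1,000,000*4) = 5, so ismax = 12 index sets would be handled in
     nstages_local = 3 stages of at most 5 submatrices each */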
566: /* Make sure every processor loops through the nstages */
567: MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
568: for (i=0,pos=0; i<nstages; i++) {
569: if (pos+nmax <= ismax) max_no = nmax;
570: else if (pos == ismax) max_no = 0;
571: else max_no = ismax-pos;
572: MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,allrows+pos,allcolumns+pos,*submat+pos);
573: pos += max_no;
574: }
576: for (i=0; i<ismax; i++) {
577: ISDestroy(&isrow_new[i]);
578: ISDestroy(&iscol_new[i]);
579: }
580: PetscFree2(isrow_new,iscol_new);
581: PetscFree2(allcolumns,allrows);
582: return(0);
583: }
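/*
    A minimal usage sketch of the public interface that dispatches to the routine above
    (illustrative only; the assembled MATMPIBAIJ matrix C, the stride index sets, and ierr
    are assumptions made for the example):

      IS  rowis,colis;
      Mat *submat;

      ierr = ISCreateStride(PETSC_COMM_SELF,4,0,1,&rowis);CHKERRQ(ierr);
      ierr = ISCreateStride(PETSC_COMM_SELF,4,0,1,&colis);CHKERRQ(ierr);
      ierr = MatGetSubMatrices(C,1,&rowis,&colis,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
      ...  use the sequential matrix submat[0]  ...
      ierr = MatDestroyMatrices(1,&submat);CHKERRQ(ierr);
      ierr = ISDestroy(&rowis);CHKERRQ(ierr);
      ierr = ISDestroy(&colis);CHKERRQ(ierr);

    The point indices are converted to block indices by ISCompressIndicesGeneral() above, so
    the index sets should select complete blocks of size bs.
*/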
585: #if defined(PETSC_USE_CTABLE)
588: PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
589: {
590: PetscInt nGlobalNd = proc_gnode[size];
591: PetscMPIInt fproc;
595: PetscMPIIntCast((PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)),&fproc);
596: if (fproc > size) fproc = size;
597: while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
598: if (row < proc_gnode[fproc]) fproc--;
599: else fproc++;
600: }
601: *rank = fproc;
602: return(0);
603: }
604: #endif
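/*
    PetscGetProc() above guesses the owning process of a block row by assuming the rows are
    spread evenly across the processes, then walks the ownership ranges proc_gnode[] up or
    down until proc_gnode[fproc] <= row < proc_gnode[fproc+1].  As an illustrative example
    (the numbers are assumptions): with size = 3 and proc_gnode = {0,10,30,60}, row 25 gives
    the initial guess fproc = (PetscInt)(25*3/60.0 + 0.5) = 1, which already satisfies
    10 <= 25 < 30, so no walking is needed.
*/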
606: /* -------------------------------------------------------------------------*/
607: /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
610: PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allrows,PetscBool *allcolumns,Mat *submats)
611: {
612: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
613: Mat A = c->A;
614: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
615: const PetscInt **irow,**icol,*irow_i;
616: PetscInt *nrow,*ncol,*w3,*w4,start;
618: PetscMPIInt size,tag0,tag1,tag2,tag3,*w1,*w2,nrqr,idex,end,proc;
619: PetscInt **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,**rbuf1,row;
620: PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
621: PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
622: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
623: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
624: PetscInt bs =C->rmap->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
625: PetscInt cstart = c->cstartbs,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
626: PetscInt *bmap = c->garray,ctmp,rstart=c->rstartbs;
627: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3,*s_waits3;
628: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2,*r_status3;
629: MPI_Comm comm;
630: PetscBool flag;
631: PetscMPIInt *onodes1,*olengths1;
632: PetscBool ijonly=c->ijonly; /* private flag indicates only matrix data structures are requested */
634: /* variables below are used for the matrix numerical values - case of !ijonly */
635: MPI_Request *r_waits4,*s_waits4;
636: MPI_Status *r_status4,*s_status4;
637: MatScalar **rbuf4,**sbuf_aa,*vals,*mat_a = NULL,*sbuf_aa_i,*vworkA = NULL,*vworkB = NULL;
638: MatScalar *a_a=a->a,*b_a=b->a;
640: #if defined(PETSC_USE_CTABLE)
641: PetscInt tt;
642: PetscTable *rmap,*cmap,rmap_i,cmap_i=NULL;
643: #else
644: PetscInt **cmap,*cmap_i=NULL,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
645: #endif
648: PetscObjectGetComm((PetscObject)C,&comm);
649: tag0 = ((PetscObject)C)->tag;
650: size = c->size;
651: rank = c->rank;
653: /* Get some new tags to keep the communication clean */
654: PetscObjectGetNewTag((PetscObject)C,&tag1);
655: PetscObjectGetNewTag((PetscObject)C,&tag2);
656: PetscObjectGetNewTag((PetscObject)C,&tag3);
658: #if defined(PETSC_USE_CTABLE)
659: PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);
660: #else
661: PetscMalloc5(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol,Mbs+1,&rtable);
662:   /* Create lookup table for the mapping: row -> proc */
663: for (i=0,j=0; i<size; i++) {
664: jmax = C->rmap->range[i+1]/bs;
665: for (; j<jmax; j++) rtable[j] = i;
666: }
667: #endif
669: for (i=0; i<ismax; i++) {
670: if (allrows[i]) {
671: irow[i] = NULL;
672: nrow[i] = C->rmap->N/bs;
673: } else {
674: ISGetIndices(isrow[i],&irow[i]);
675: ISGetLocalSize(isrow[i],&nrow[i]);
676: }
678: if (allcolumns[i]) {
679: icol[i] = NULL;
680: ncol[i] = C->cmap->N/bs;
681: } else {
682: ISGetIndices(iscol[i],&icol[i]);
683: ISGetLocalSize(iscol[i],&ncol[i]);
684: }
685: }
687:   /* evaluate communication - mesg to whom, length of mesg, and buffer space
688:      required. Based on this, buffers are allocated, and data copied into them */
689: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
690: for (i=0; i<ismax; i++) {
691: PetscMemzero(w4,size*sizeof(PetscInt)); /* initialise work vector*/
692: jmax = nrow[i];
693: irow_i = irow[i];
694: for (j=0; j<jmax; j++) {
695: if (allrows[i]) row = j;
696: else row = irow_i[j];
698: #if defined(PETSC_USE_CTABLE)
699: PetscGetProc(row,size,c->rangebs,&proc);
700: #else
701: proc = rtable[row];
702: #endif
703: w4[proc]++;
704: }
705: for (j=0; j<size; j++) {
706: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
707: }
708: }
710: nrqs = 0; /* no of outgoing messages */
711:   msz      = 0; /* total mesg length for all procs */
712:   w1[rank] = 0; /* no mesg sent to itself */
713: w3[rank] = 0;
714: for (i=0; i<size; i++) {
715: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
716: }
717: PetscMalloc1(nrqs+1,&pa); /*(proc -array)*/
718: for (i=0,j=0; i<size; i++) {
719: if (w1[i]) { pa[j] = i; j++; }
720: }
722: /* Each message would have a header = 1 + 2*(no of IS) + data */
723: for (i=0; i<nrqs; i++) {
724: j = pa[i];
725: w1[j] += w2[j] + 2* w3[j];
726: msz += w1[j];
727: }
729:   /* Determine the number of messages to expect, their lengths, and their from-ids */
730: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
731: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
733: /* Now post the Irecvs corresponding to these messages */
734: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
736: PetscFree(onodes1);
737: PetscFree(olengths1);
739: /* Allocate Memory for outgoing messages */
740: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
741: PetscMemzero(sbuf1,size*sizeof(PetscInt*));
742: PetscMemzero(ptr,size*sizeof(PetscInt*));
743: {
744: PetscInt *iptr = tmp,ict = 0;
745: for (i=0; i<nrqs; i++) {
746: j = pa[i];
747: iptr += ict;
748: sbuf1[j] = iptr;
749: ict = w1[j];
750: }
751: }
753: /* Form the outgoing messages */
754: /* Initialise the header space */
755: for (i=0; i<nrqs; i++) {
756: j = pa[i];
757: sbuf1[j][0] = 0;
758: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
759: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
760: }
762: /* Parse the isrow and copy data into outbuf */
763: for (i=0; i<ismax; i++) {
764: PetscMemzero(ctr,size*sizeof(PetscInt));
765: irow_i = irow[i];
766: jmax = nrow[i];
767: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
768: if (allrows[i]) row = j;
769: else row = irow_i[j];
771: #if defined(PETSC_USE_CTABLE)
772: PetscGetProc(row,size,c->rangebs,&proc);
773: #else
774: proc = rtable[row];
775: #endif
776: if (proc != rank) { /* copy to the outgoing buf*/
777: ctr[proc]++;
778: *ptr[proc] = row;
779: ptr[proc]++;
780: }
781: }
782: /* Update the headers for the current IS */
783: for (j=0; j<size; j++) { /* Can Optimise this loop too */
784: if ((ctr_j = ctr[j])) {
785: sbuf1_j = sbuf1[j];
786: k = ++sbuf1_j[0];
787: sbuf1_j[2*k] = ctr_j;
788: sbuf1_j[2*k-1] = i;
789: }
790: }
791: }
793: /* Now post the sends */
794: PetscMalloc1(nrqs+1,&s_waits1);
795: for (i=0; i<nrqs; ++i) {
796: j = pa[i];
797: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
798: }
800:   /* Post Receives to capture the buffer size */
801: PetscMalloc1(nrqs+1,&r_waits2);
802: PetscMalloc1(nrqs+1,&rbuf2);
803: rbuf2[0] = tmp + msz;
804: for (i=1; i<nrqs; ++i) {
805: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
806: }
807: for (i=0; i<nrqs; ++i) {
808: j = pa[i];
809: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
810: }
812: /* Send to other procs the buf size they should allocate */
814: /* Receive messages*/
815: PetscMalloc1(nrqr+1,&s_waits2);
816: PetscMalloc1(nrqr+1,&r_status1);
817: PetscMalloc3(nrqr+1,&sbuf2,nrqr,&req_size,nrqr,&req_source);
818: {
819: Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
820: PetscInt *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
822: for (i=0; i<nrqr; ++i) {
823: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
825: req_size[idex] = 0;
826: rbuf1_i = rbuf1[idex];
827: start = 2*rbuf1_i[0] + 1;
828: MPI_Get_count(r_status1+i,MPIU_INT,&end);
829: PetscMalloc1(end,&sbuf2[idex]);
830: sbuf2_i = sbuf2[idex];
831: for (j=start; j<end; j++) {
832: id = rbuf1_i[j] - rstart;
833: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
834: sbuf2_i[j] = ncols;
835: req_size[idex] += ncols;
836: }
837: req_source[idex] = r_status1[i].MPI_SOURCE;
838: /* form the header */
839: sbuf2_i[0] = req_size[idex];
840: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
841: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
842: }
843: }
844: PetscFree(r_status1);
845: PetscFree(r_waits1);
847: /* recv buffer sizes */
848: /* Receive messages*/
849: PetscMalloc1(nrqs+1,&rbuf3);
850: PetscMalloc1(nrqs+1,&r_waits3);
851: PetscMalloc1(nrqs+1,&r_status2);
852: if (!ijonly) {
853: PetscMalloc1(nrqs+1,&rbuf4);
854: PetscMalloc1(nrqs+1,&r_waits4);
855: }
857: for (i=0; i<nrqs; ++i) {
858: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
859: PetscMalloc1(rbuf2[idex][0],&rbuf3[idex]);
860: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
861: if (!ijonly) {
862: PetscMalloc1(rbuf2[idex][0]*bs2,&rbuf4[idex]);
863: MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
864: }
865: }
866: PetscFree(r_status2);
867: PetscFree(r_waits2);
869: /* Wait on sends1 and sends2 */
870: PetscMalloc1(nrqs+1,&s_status1);
871: PetscMalloc1(nrqr+1,&s_status2);
873: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
874: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
875: PetscFree(s_status1);
876: PetscFree(s_status2);
877: PetscFree(s_waits1);
878: PetscFree(s_waits2);
880: /* Now allocate buffers for a->j, and send them off */
881: PetscMalloc1(nrqr+1,&sbuf_aj);
882: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
883: PetscMalloc1(j+1,&sbuf_aj[0]);
884: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
886: PetscMalloc1(nrqr+1,&s_waits3);
887: {
888: for (i=0; i<nrqr; i++) {
889: rbuf1_i = rbuf1[i];
890: sbuf_aj_i = sbuf_aj[i];
891: ct1 = 2*rbuf1_i[0] + 1;
892: ct2 = 0;
893: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
894: kmax = rbuf1[i][2*j];
895: for (k=0; k<kmax; k++,ct1++) {
896: row = rbuf1_i[ct1] - rstart;
897: nzA = a_i[row+1] - a_i[row];
898: nzB = b_i[row+1] - b_i[row];
899: ncols = nzA + nzB;
900: cworkA = a_j + a_i[row];
901: cworkB = b_j + b_i[row];
903: /* load the column indices for this row into cols*/
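            /* the off-diagonal (B) columns are globally numbered via bmap[]: those below
               cstart are copied first, then the diagonal (A) columns shifted by cstart,
               then the remaining B columns, so the row ends up with increasing global
               block-column indices */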
904: cols = sbuf_aj_i + ct2;
905: for (l=0; l<nzB; l++) {
906: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
907: else break;
908: }
909: imark = l;
910: for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l];
911: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
912: ct2 += ncols;
913: }
914: }
915: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
916: }
917: }
918: PetscMalloc1(nrqs+1,&r_status3);
919: PetscMalloc1(nrqr+1,&s_status3);
921: /* Allocate buffers for a->a, and send them off */
922: if (!ijonly) {
923: PetscMalloc1(nrqr+1,&sbuf_aa);
924: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
925: PetscMalloc1((j+1)*bs2,&sbuf_aa[0]);
926: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
928: PetscMalloc1(nrqr+1,&s_waits4);
929: {
930: for (i=0; i<nrqr; i++) {
931: rbuf1_i = rbuf1[i];
932: sbuf_aa_i = sbuf_aa[i];
933: ct1 = 2*rbuf1_i[0]+1;
934: ct2 = 0;
935: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
936: kmax = rbuf1_i[2*j];
937: for (k=0; k<kmax; k++,ct1++) {
938: row = rbuf1_i[ct1] - rstart;
939: nzA = a_i[row+1] - a_i[row];
940: nzB = b_i[row+1] - b_i[row];
941: ncols = nzA + nzB;
942: cworkB = b_j + b_i[row];
943: vworkA = a_a + a_i[row]*bs2;
944: vworkB = b_a + b_i[row]*bs2;
946: /* load the column values for this row into vals*/
947: vals = sbuf_aa_i+ct2*bs2;
948: for (l=0; l<nzB; l++) {
949: if ((bmap[cworkB[l]]) < cstart) {
950: PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
951: } else break;
952: }
953: imark = l;
954: for (l=0; l<nzA; l++) {
955: PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));
956: }
957: for (l=imark; l<nzB; l++) {
958: PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
959: }
960: ct2 += ncols;
961: }
962: }
963: MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);
964: }
965: }
966: PetscMalloc1(nrqs+1,&r_status4);
967: PetscMalloc1(nrqr+1,&s_status4);
968: }
969: PetscFree(rbuf1[0]);
970: PetscFree(rbuf1);
972: /* Form the matrix */
973: /* create col map: global col of C -> local col of submatrices */
974: {
975: const PetscInt *icol_i;
976: #if defined(PETSC_USE_CTABLE)
977: PetscMalloc1(1+ismax,&cmap);
978: for (i=0; i<ismax; i++) {
979: if (!allcolumns[i]) {
980: PetscTableCreate(ncol[i]+1,c->Nbs+1,&cmap[i]);
981: jmax = ncol[i];
982: icol_i = icol[i];
983: cmap_i = cmap[i];
984: for (j=0; j<jmax; j++) {
985: PetscTableAdd(cmap_i,icol_i[j]+1,j+1,INSERT_VALUES);
986: }
987: } else {
988: cmap[i] = NULL;
989: }
990: }
991: #else
992: PetscMalloc1(ismax,&cmap);
993: for (i=0; i<ismax; i++) {
994: if (!allcolumns[i]) {
995: PetscCalloc1(c->Nbs,&cmap[i]);
996: jmax = ncol[i];
997: icol_i = icol[i];
998: cmap_i = cmap[i];
999: for (j=0; j<jmax; j++) {
1000: cmap_i[icol_i[j]] = j+1;
1001: }
1002: } else { /* allcolumns[i] */
1003: cmap[i] = NULL;
1004: }
1005: }
1006: #endif
1007: }
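  /* Note on the column maps just built: in both variants the stored value is the local
     column plus one, so that a lookup result of 0 can mean "this global column is not part
     of iscol[i]"; consumers below subtract one (tcol - 1) before using the value */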
1009: /* Create lens which is required for MatCreate... */
1010: for (i=0,j=0; i<ismax; i++) j += nrow[i];
1011: PetscMalloc((1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt),&lens);
1012: lens[0] = (PetscInt*)(lens + ismax);
1013: PetscMemzero(lens[0],j*sizeof(PetscInt));
1014: for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1016: /* Update lens from local data */
1017: for (i=0; i<ismax; i++) {
1018: jmax = nrow[i];
1019: if (!allcolumns[i]) cmap_i = cmap[i];
1020: irow_i = irow[i];
1021: lens_i = lens[i];
1022: for (j=0; j<jmax; j++) {
1023: if (allrows[i]) row = j;
1024: else row = irow_i[j];
1026: #if defined(PETSC_USE_CTABLE)
1027: PetscGetProc(row,size,c->rangebs,&proc);
1028: #else
1029: proc = rtable[row];
1030: #endif
1031: if (proc == rank) {
1032: /* Get indices from matA and then from matB */
1033: row = row - rstart;
1034: nzA = a_i[row+1] - a_i[row];
1035: nzB = b_i[row+1] - b_i[row];
1036: cworkA = a_j + a_i[row];
1037: cworkB = b_j + b_i[row];
1038: if (!allcolumns[i]) {
1039: #if defined(PETSC_USE_CTABLE)
1040: for (k=0; k<nzA; k++) {
1041: PetscTableFind(cmap_i,cstart+cworkA[k]+1,&tt);
1042: if (tt) lens_i[j]++;
1043: }
1044: for (k=0; k<nzB; k++) {
1045: PetscTableFind(cmap_i,bmap[cworkB[k]]+1,&tt);
1046: if (tt) lens_i[j]++;
1047: }
1049: #else
1050: for (k=0; k<nzA; k++) {
1051: if (cmap_i[cstart + cworkA[k]]) lens_i[j]++;
1052: }
1053: for (k=0; k<nzB; k++) {
1054: if (cmap_i[bmap[cworkB[k]]]) lens_i[j]++;
1055: }
1056: #endif
1057: } else { /* allcolumns */
1058: lens_i[j] = nzA + nzB;
1059: }
1060: }
1061: }
1062: }
1063: #if defined(PETSC_USE_CTABLE)
1064: /* Create row map*/
1065: PetscMalloc1(1+ismax,&rmap);
1066: for (i=0; i<ismax; i++) {
1067: PetscTableCreate(nrow[i]+1,c->Mbs+1,&rmap[i]);
1068: }
1069: #else
1070: /* Create row map*/
1071: PetscMalloc((1+ismax)*sizeof(PetscInt*)+ ismax*Mbs*sizeof(PetscInt),&rmap);
1072: rmap[0] = (PetscInt*)(rmap + ismax);
1073: PetscMemzero(rmap[0],ismax*Mbs*sizeof(PetscInt));
1074: for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + Mbs;
1075: #endif
1076: for (i=0; i<ismax; i++) {
1077: irow_i = irow[i];
1078: jmax = nrow[i];
1079: #if defined(PETSC_USE_CTABLE)
1080: rmap_i = rmap[i];
1081: for (j=0; j<jmax; j++) {
1082: if (allrows[i]) {
1083: PetscTableAdd(rmap_i,j+1,j+1,INSERT_VALUES);
1084: } else {
1085: PetscTableAdd(rmap_i,irow_i[j]+1,j+1,INSERT_VALUES);
1086: }
1087: }
1088: #else
1089: rmap_i = rmap[i];
1090: for (j=0; j<jmax; j++) {
1091: if (allrows[i]) rmap_i[j] = j;
1092: else rmap_i[irow_i[j]] = j;
1093: }
1094: #endif
1095: }
1097: /* Update lens from offproc data */
1098: {
1099: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1100: PetscMPIInt ii;
1102: for (tmp2=0; tmp2<nrqs; tmp2++) {
1103: MPI_Waitany(nrqs,r_waits3,&ii,r_status3+tmp2);
1104: idex = pa[ii];
1105: sbuf1_i = sbuf1[idex];
1106: jmax = sbuf1_i[0];
1107: ct1 = 2*jmax+1;
1108: ct2 = 0;
1109: rbuf2_i = rbuf2[ii];
1110: rbuf3_i = rbuf3[ii];
1111: for (j=1; j<=jmax; j++) {
1112: is_no = sbuf1_i[2*j-1];
1113: max1 = sbuf1_i[2*j];
1114: lens_i = lens[is_no];
1115: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1116: rmap_i = rmap[is_no];
1117: for (k=0; k<max1; k++,ct1++) {
1118: #if defined(PETSC_USE_CTABLE)
1119: PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
1120: row--;
1121: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1122: #else
1123: row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1124: #endif
1125: max2 = rbuf2_i[ct1];
1126: for (l=0; l<max2; l++,ct2++) {
1127: if (!allcolumns[is_no]) {
1128: #if defined(PETSC_USE_CTABLE)
1129: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tt);
1130: if (tt) lens_i[row]++;
1131: #else
1132: if (cmap_i[rbuf3_i[ct2]]) lens_i[row]++;
1133: #endif
1134: } else { /* allcolumns */
1135: lens_i[row]++;
1136: }
1137: }
1138: }
1139: }
1140: }
1141: }
1142: PetscFree(r_status3);
1143: PetscFree(r_waits3);
1144: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1145: PetscFree(s_status3);
1146: PetscFree(s_waits3);
1148: /* Create the submatrices */
1149: if (scall == MAT_REUSE_MATRIX) {
1150: if (ijonly) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP," MAT_REUSE_MATRIX and ijonly is not supported yet");
1151: /*
1152: Assumes new rows are same length as the old rows, hence bug!
1153: */
1154: for (i=0; i<ismax; i++) {
1155: mat = (Mat_SeqBAIJ*)(submats[i]->data);
1156: if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || C->rmap->bs != bs)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1157: PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(PetscInt),&flag);
1158: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
1159: /* Initial matrix as if empty */
1160: PetscMemzero(mat->ilen,mat->mbs*sizeof(PetscInt));
1162: submats[i]->factortype = C->factortype;
1163: }
1164: } else {
1165: PetscInt bs_tmp;
1166: if (ijonly) bs_tmp = 1;
1167: else bs_tmp = bs;
1168: for (i=0; i<ismax; i++) {
1169: MatCreate(PETSC_COMM_SELF,submats+i);
1170: MatSetSizes(submats[i],nrow[i]*bs_tmp,ncol[i]*bs_tmp,nrow[i]*bs_tmp,ncol[i]*bs_tmp);
1171: MatSetType(submats[i],((PetscObject)A)->type_name);
1172: MatSeqBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]);
1173: MatSeqSBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]); /* this subroutine is used by SBAIJ routines */
1174: }
1175: }
1177: /* Assemble the matrices */
1178: /* First assemble the local rows */
1179: {
1180: PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i;
1181: MatScalar *imat_a = NULL;
1183: for (i=0; i<ismax; i++) {
1184: mat = (Mat_SeqBAIJ*)submats[i]->data;
1185: imat_ilen = mat->ilen;
1186: imat_j = mat->j;
1187: imat_i = mat->i;
1188: if (!ijonly) imat_a = mat->a;
1189: if (!allcolumns[i]) cmap_i = cmap[i];
1190: rmap_i = rmap[i];
1191: irow_i = irow[i];
1192: jmax = nrow[i];
1193: for (j=0; j<jmax; j++) {
1194: if (allrows[i]) row = j;
1195: else row = irow_i[j];
1197: #if defined(PETSC_USE_CTABLE)
1198: PetscGetProc(row,size,c->rangebs,&proc);
1199: #else
1200: proc = rtable[row];
1201: #endif
1202: if (proc == rank) {
1203: row = row - rstart;
1204: nzA = a_i[row+1] - a_i[row];
1205: nzB = b_i[row+1] - b_i[row];
1206: cworkA = a_j + a_i[row];
1207: cworkB = b_j + b_i[row];
1208: if (!ijonly) {
1209: vworkA = a_a + a_i[row]*bs2;
1210: vworkB = b_a + b_i[row]*bs2;
1211: }
1212: #if defined(PETSC_USE_CTABLE)
1213: PetscTableFind(rmap_i,row+rstart+1,&row);
1214: row--;
1215: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1216: #else
1217: row = rmap_i[row + rstart];
1218: #endif
1219: mat_i = imat_i[row];
1220: if (!ijonly) mat_a = imat_a + mat_i*bs2;
1221: mat_j = imat_j + mat_i;
1222: ilen_row = imat_ilen[row];
1224: /* load the column indices for this row into cols*/
1225: if (!allcolumns[i]) {
1226: for (l=0; l<nzB; l++) {
1227: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1228: #if defined(PETSC_USE_CTABLE)
1229: PetscTableFind(cmap_i,ctmp+1,&tcol);
1230: if (tcol) {
1231: #else
1232: if ((tcol = cmap_i[ctmp])) {
1233: #endif
1234: *mat_j++ = tcol - 1;
1235: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1236: mat_a += bs2;
1237: ilen_row++;
1238: }
1239: } else break;
1240: }
1241: imark = l;
1242: for (l=0; l<nzA; l++) {
1243: #if defined(PETSC_USE_CTABLE)
1244: PetscTableFind(cmap_i,cstart+cworkA[l]+1,&tcol);
1245: if (tcol) {
1246: #else
1247: if ((tcol = cmap_i[cstart + cworkA[l]])) {
1248: #endif
1249: *mat_j++ = tcol - 1;
1250: if (!ijonly) {
1251: PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1252: mat_a += bs2;
1253: }
1254: ilen_row++;
1255: }
1256: }
1257: for (l=imark; l<nzB; l++) {
1258: #if defined(PETSC_USE_CTABLE)
1259: PetscTableFind(cmap_i,bmap[cworkB[l]]+1,&tcol);
1260: if (tcol) {
1261: #else
1262: if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1263: #endif
1264: *mat_j++ = tcol - 1;
1265: if (!ijonly) {
1266: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1267: mat_a += bs2;
1268: }
1269: ilen_row++;
1270: }
1271: }
1272: } else { /* allcolumns */
1273: for (l=0; l<nzB; l++) {
1274: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1275: *mat_j++ = ctmp;
1276: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1277: mat_a += bs2;
1278: ilen_row++;
1279: } else break;
1280: }
1281: imark = l;
1282: for (l=0; l<nzA; l++) {
1283: *mat_j++ = cstart+cworkA[l];
1284: if (!ijonly) {
1285: PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1286: mat_a += bs2;
1287: }
1288: ilen_row++;
1289: }
1290: for (l=imark; l<nzB; l++) {
1291: *mat_j++ = bmap[cworkB[l]];
1292: if (!ijonly) {
1293: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1294: mat_a += bs2;
1295: }
1296: ilen_row++;
1297: }
1298: }
1299: imat_ilen[row] = ilen_row;
1300: }
1301: }
1302: }
1303: }
1305:   /* Now assemble the off-proc rows */
1306: {
1307: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1308: PetscInt *imat_j,*imat_i;
1309: MatScalar *imat_a = NULL,*rbuf4_i = NULL;
1310: PetscMPIInt ii;
1312: for (tmp2=0; tmp2<nrqs; tmp2++) {
1313: if (ijonly) ii = tmp2;
1314: else {
1315: MPI_Waitany(nrqs,r_waits4,&ii,r_status4+tmp2);
1316: }
1317: idex = pa[ii];
1318: sbuf1_i = sbuf1[idex];
1319: jmax = sbuf1_i[0];
1320: ct1 = 2*jmax + 1;
1321: ct2 = 0;
1322: rbuf2_i = rbuf2[ii];
1323: rbuf3_i = rbuf3[ii];
1324: if (!ijonly) rbuf4_i = rbuf4[ii];
1325: for (j=1; j<=jmax; j++) {
1326: is_no = sbuf1_i[2*j-1];
1327: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1328: rmap_i = rmap[is_no];
1329: mat = (Mat_SeqBAIJ*)submats[is_no]->data;
1330: imat_ilen = mat->ilen;
1331: imat_j = mat->j;
1332: imat_i = mat->i;
1333: if (!ijonly) imat_a = mat->a;
1334: max1 = sbuf1_i[2*j];
1335: for (k=0; k<max1; k++,ct1++) {
1336: row = sbuf1_i[ct1];
1337: #if defined(PETSC_USE_CTABLE)
1338: PetscTableFind(rmap_i,row+1,&row);
1339: row--;
1340: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1341: #else
1342: row = rmap_i[row];
1343: #endif
1344: ilen = imat_ilen[row];
1345: mat_i = imat_i[row];
1346: if (!ijonly) mat_a = imat_a + mat_i*bs2;
1347: mat_j = imat_j + mat_i;
1348: max2 = rbuf2_i[ct1];
1350: if (!allcolumns[is_no]) {
1351: for (l=0; l<max2; l++,ct2++) {
1352: #if defined(PETSC_USE_CTABLE)
1353: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1354: if (tcol) {
1355: #else
1356: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1357: #endif
1358: *mat_j++ = tcol - 1;
1359: if (!ijonly) {
1360: PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1361: mat_a += bs2;
1362: }
1363: ilen++;
1364: }
1365: }
1366: } else { /* allcolumns */
1367: for (l=0; l<max2; l++,ct2++) {
1368: *mat_j++ = rbuf3_i[ct2];
1369: if (!ijonly) {
1370: PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1371: mat_a += bs2;
1372: }
1373: ilen++;
1374: }
1375: }
1376: imat_ilen[row] = ilen;
1377: }
1378: }
1379: }
1380: }
1381: /* sort the rows */
1382: {
1383: PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i;
1384: MatScalar *imat_a = NULL;
1385: MatScalar *work;
1387: PetscMalloc1(bs2,&work);
1388: for (i=0; i<ismax; i++) {
1389: mat = (Mat_SeqBAIJ*)submats[i]->data;
1390: imat_ilen = mat->ilen;
1391: imat_j = mat->j;
1392: imat_i = mat->i;
1393: if (!ijonly) imat_a = mat->a;
1394: if (allcolumns[i]) continue;
1395: jmax = nrow[i];
1396: for (j=0; j<jmax; j++) {
1397: mat_i = imat_i[j];
1398: if (!ijonly) mat_a = imat_a + mat_i*bs2;
1399: mat_j = imat_j + mat_i;
1400: ilen_row = imat_ilen[j];
1401: if (!ijonly) {PetscSortIntWithDataArray(ilen_row,mat_j,mat_a,bs2*sizeof(MatScalar),work);}
1402: else {PetscSortInt(ilen_row,mat_j);}
1403: }
1404: }
1405: PetscFree(work);
1406: }
1407: if (!ijonly) {
1408: PetscFree(r_status4);
1409: PetscFree(r_waits4);
1410: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1411: PetscFree(s_waits4);
1412: PetscFree(s_status4);
1413: }
1415: /* Restore the indices */
1416: for (i=0; i<ismax; i++) {
1417: if (!allrows[i]) {
1418: ISRestoreIndices(isrow[i],irow+i);
1419: }
1420: if (!allcolumns[i]) {
1421: ISRestoreIndices(iscol[i],icol+i);
1422: }
1423: }
1425: /* Destroy allocated memory */
1426: #if defined(PETSC_USE_CTABLE)
1427: PetscFree4(irow,icol,nrow,ncol);
1428: #else
1429: PetscFree5(irow,icol,nrow,ncol,rtable);
1430: #endif
1431: PetscFree4(w1,w2,w3,w4);
1432: PetscFree(pa);
1434: PetscFree4(sbuf1,ptr,tmp,ctr);
1435: PetscFree(sbuf1);
1436: PetscFree(rbuf2);
1437: for (i=0; i<nrqr; ++i) {
1438: PetscFree(sbuf2[i]);
1439: }
1440: for (i=0; i<nrqs; ++i) {
1441: PetscFree(rbuf3[i]);
1442: }
1443: PetscFree3(sbuf2,req_size,req_source);
1444: PetscFree(rbuf3);
1445: PetscFree(sbuf_aj[0]);
1446: PetscFree(sbuf_aj);
1447: if (!ijonly) {
1448: for (i=0; i<nrqs; ++i) {PetscFree(rbuf4[i]);}
1449: PetscFree(rbuf4);
1450: PetscFree(sbuf_aa[0]);
1451: PetscFree(sbuf_aa);
1452: }
1454: #if defined(PETSC_USE_CTABLE)
1455: for (i=0; i<ismax; i++) {
1456: PetscTableDestroy((PetscTable*)&rmap[i]);
1457: }
1458: #endif
1459: PetscFree(rmap);
1461: for (i=0; i<ismax; i++) {
1462: if (!allcolumns[i]) {
1463: #if defined(PETSC_USE_CTABLE)
1464: PetscTableDestroy((PetscTable*)&cmap[i]);
1465: #else
1466: PetscFree(cmap[i]);
1467: #endif
1468: }
1469: }
1470: PetscFree(cmap);
1471: PetscFree(lens);
1473: for (i=0; i<ismax; i++) {
1474: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1475: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1476: }
1478: c->ijonly = PETSC_FALSE; /* set back to the default */
1479: return(0);
1480: }