Actual source code: mpiov.c
1: /*
2: Routines to compute overlapping regions of a parallel MPI matrix
3: and to find submatrices that are shared across processors.
4: */
5: #include <../src/mat/impls/aij/seq/aij.h>
6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
7: #include <petscbt.h>
8: #include <petscsf.h>
10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
21: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
22: {
23: PetscInt i;
26: for (i=0; i<ov; ++i) {
27: MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
28: }
29: return 0;
30: }
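/*
   Illustrative usage sketch (not part of this file): callers reach the routine above
   through the public PETSc interface MatIncreaseOverlap(), e.g.

     IS is[1];
     ISCreateGeneral(PETSC_COMM_SELF,nlocal,rows,PETSC_COPY_VALUES,&is[0]);
     MatIncreaseOverlap(A,1,is,2);    (grow the index set by two levels of overlap)

   where A is assumed to be a parallel MATMPIAIJ matrix and nlocal/rows are provided by the caller.
*/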
32: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
33: {
34: PetscInt i;
37: for (i=0; i<ov; ++i) {
38: MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);
39: }
40: return 0;
41: }
43: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
44: {
45: MPI_Comm comm;
46: PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
47: PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
48: PetscInt nrecvrows,*sbsizes = NULL,*sbdata = NULL;
49: const PetscInt *indices_i,**indices;
50: PetscLayout rmap;
51: PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom,owner;
52: PetscSF sf;
53: PetscSFNode *remote;
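  /* Outline of this routine (descriptive note):
     1) flatten all nidx index sets and locate the rows owned by other ranks;
     2) build the send/receive pattern with PetscCommBuildTwoSided();
     3) ship the <IS id, row> requests to the owning ranks through a PetscSF;
     4) the owners expand those rows (MatIncreaseOverlap_MPIAIJ_Send_Scalable) and a
        second PetscSF returns the expanded column lists;
     5) the returned columns are merged into the index sets in
        MatIncreaseOverlap_MPIAIJ_Receive_Scalable, while rows owned locally are
        expanded by MatIncreaseOverlap_MPIAIJ_Local_Scalable. */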
55: PetscObjectGetComm((PetscObject)mat,&comm);
56: MPI_Comm_rank(comm,&rank);
57: MPI_Comm_size(comm,&size);
58: /* get row map to determine where rows should be going */
59: MatGetLayouts(mat,&rmap,NULL);
60: /* retrieve the IS data and put it all together so that we
61: * can optimize communication
62: * */
63: PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);
64: for (i=0,tlength=0; i<nidx; i++) {
65: ISGetLocalSize(is[i],&length[i]);
66: tlength += length[i];
67: ISGetIndices(is[i],&indices[i]);
68: }
69: /* find these rows on remote processors */
70: PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);
71: PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);
72: nrrows = 0;
73: for (i=0; i<nidx; i++) {
74: length_i = length[i];
75: indices_i = indices[i];
76: for (j=0; j<length_i; j++) {
77: owner = -1;
78: PetscLayoutFindOwner(rmap,indices_i[j],&owner);
79: /* remote processors */
80: if (owner != rank) {
81: tosizes_temp[owner]++; /* number of rows to owner */
82: rrow_ranks[nrrows] = owner; /* processor */
83: rrow_isids[nrrows] = i; /* is id */
84: remoterows[nrrows++] = indices_i[j]; /* row */
85: }
86: }
87: ISRestoreIndices(is[i],&indices[i]);
88: }
89: PetscFree2(*(PetscInt***)&indices,length);
90: /* test if we need to exchange messages
91: * generally speaking, we do not need to exchange
92: * data when overlap is 1
93: * */
94: MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);
95: /* there are no messages to exchange;
96: * this usually corresponds to overlap 1
97: * */
98: if (!reducednrrows) {
99: PetscFree3(toranks,tosizes,tosizes_temp);
100: PetscFree3(remoterows,rrow_ranks,rrow_isids);
101: MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
102: return 0;
103: }
104: nto = 0;
105: /* send sizes and ranks for building a two-sided communication */
106: for (i=0; i<size; i++) {
107: if (tosizes_temp[i]) {
108: tosizes[nto*2] = tosizes_temp[i]*2; /* size */
109: tosizes_temp[i] = nto; /* a map from processor to index */
110: toranks[nto++] = i; /* processor */
111: }
112: }
113: PetscMalloc1(nto+1,&toffsets);
114: toffsets[0] = 0;
115: for (i=0; i<nto; i++) {
116: toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */
117: tosizes[2*i+1] = toffsets[i]; /* offsets to send */
118: }
119: /* send information to other processors */
120: PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);
121: nrecvrows = 0;
122: for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
123: PetscMalloc1(nrecvrows,&remote);
124: nrecvrows = 0;
125: for (i=0; i<nfrom; i++) {
126: for (j=0; j<fromsizes[2*i]; j++) {
127: remote[nrecvrows].rank = fromranks[i];
128: remote[nrecvrows++].index = fromsizes[2*i+1]+j;
129: }
130: }
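  /* Descriptive note: each PetscSFNode addresses a slot in the sending rank's todata
     buffer (rank = fromranks[i], index = offset within that rank's send buffer), so the
     PetscSFBcast below delivers the <IS id, row> request pairs straight into fromdata. */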
131: PetscSFCreate(comm,&sf);
132: PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
133: /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
134: PetscSFSetType(sf,PETSCSFBASIC);
135: PetscSFSetFromOptions(sf);
136: /* message pairs <IS id, row> */
137: PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);
138: for (i=0; i<nrrows; i++) {
139: owner = rrow_ranks[i]; /* processor */
140: j = tosizes_temp[owner]; /* index */
141: todata[toffsets[j]++] = rrow_isids[i];
142: todata[toffsets[j]++] = remoterows[i];
143: }
144: PetscFree3(toranks,tosizes,tosizes_temp);
145: PetscFree3(remoterows,rrow_ranks,rrow_isids);
146: PetscFree(toffsets);
147: PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);
148: PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);
149: PetscSFDestroy(&sf);
150: /* send rows belonging to remote processors so that we can get the overlapping data back */
151: MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);
152: PetscFree2(todata,fromdata);
153: PetscFree(fromsizes);
154: PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);
155: PetscFree(fromranks);
156: nrecvrows = 0;
157: for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
158: PetscCalloc1(nrecvrows,&todata);
159: PetscMalloc1(nrecvrows,&remote);
160: nrecvrows = 0;
161: for (i=0; i<nto; i++) {
162: for (j=0; j<tosizes[2*i]; j++) {
163: remote[nrecvrows].rank = toranks[i];
164: remote[nrecvrows++].index = tosizes[2*i+1]+j;
165: }
166: }
167: PetscSFCreate(comm,&sf);
168: PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
169: /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
170: PetscSFSetType(sf,PETSCSFBASIC);
171: PetscSFSetFromOptions(sf);
172: /* overlap communication and computation */
173: PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);
174: MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
175: PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);
176: PetscSFDestroy(&sf);
177: PetscFree2(sbdata,sbsizes);
178: MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);
179: PetscFree(toranks);
180: PetscFree(tosizes);
181: PetscFree(todata);
182: return 0;
183: }
185: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
186: {
187: PetscInt *isz,isz_i,i,j,is_id, data_size;
188: PetscInt col,lsize,max_lsize,*indices_temp, *indices_i;
189: const PetscInt *indices_i_temp;
190: MPI_Comm *iscomms;
192: max_lsize = 0;
193: PetscMalloc1(nidx,&isz);
194: for (i=0; i<nidx; i++) {
195: ISGetLocalSize(is[i],&lsize);
196: max_lsize = lsize>max_lsize ? lsize:max_lsize;
197: isz[i] = lsize;
198: }
199: PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);
200: for (i=0; i<nidx; i++) {
201: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);
202: ISGetIndices(is[i],&indices_i_temp);
203: PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);
204: ISRestoreIndices(is[i],&indices_i_temp);
205: ISDestroy(&is[i]);
206: }
207: /* retrieve information to get row id and its overlap */
208: for (i=0; i<nrecvs;) {
209: is_id = recvdata[i++];
210: data_size = recvdata[i++];
211: indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
212: isz_i = isz[is_id];
213: for (j=0; j< data_size; j++) {
214: col = recvdata[i++];
215: indices_i[isz_i++] = col;
216: }
217: isz[is_id] = isz_i;
218: }
219: /* remove duplicate entries */
220: for (i=0; i<nidx; i++) {
221: indices_i = indices_temp+(max_lsize+nrecvs)*i;
222: isz_i = isz[i];
223: PetscSortRemoveDupsInt(&isz_i,indices_i);
224: ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);
225: PetscCommDestroy(&iscomms[i]);
226: }
227: PetscFree(isz);
228: PetscFree2(indices_temp,iscomms);
229: return 0;
230: }
232: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
233: {
234: PetscLayout rmap,cmap;
235: PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
236: PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
237: PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
238: const PetscInt *gcols,*ai,*aj,*bi,*bj;
239: Mat amat,bmat;
240: PetscMPIInt rank;
241: PetscBool done;
242: MPI_Comm comm;
244: PetscObjectGetComm((PetscObject)mat,&comm);
245: MPI_Comm_rank(comm,&rank);
246: MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
247: /* Even if the matrix is symmetric, we still treat it as nonsymmetric here */
248: MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
250: MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
252: /* total number of nonzero values is used to estimate the memory usage in the next step */
253: tnz = ai[an]+bi[bn];
254: MatGetLayouts(mat,&rmap,&cmap);
255: PetscLayoutGetRange(rmap,&rstart,NULL);
256: PetscLayoutGetRange(cmap,&cstart,NULL);
257: /* to find the longest message */
258: max_fszs = 0;
259: for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
260: /* is there a better way to estimate the number of nonzeros in the matrix? */
261: PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);
262: for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
263: rows_pos = 0;
264: totalrows = 0;
265: for (i=0; i<nfrom; i++) {
266: PetscArrayzero(rows_pos_i,nidx);
267: /* group data together */
268: for (j=0; j<fromsizes[2*i]; j+=2) {
269: is_id = fromrows[rows_pos++];/* no of is */
270: rows_i = rows_data[is_id];
271: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
272: }
273: /* estimate a space to avoid multiple allocations */
274: for (j=0; j<nidx; j++) {
275: indvc_ij = 0;
276: rows_i = rows_data[j];
277: for (l=0; l<rows_pos_i[j]; l++) {
278: row = rows_i[l]-rstart;
279: start = ai[row];
280: end = ai[row+1];
281: for (k=start; k<end; k++) { /* Amat */
282: col = aj[k] + cstart;
283: indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
284: }
285: start = bi[row];
286: end = bi[row+1];
287: for (k=start; k<end; k++) { /* Bmat */
288: col = gcols[bj[k]];
289: indices_tmp[indvc_ij++] = col;
290: }
291: }
292: PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
293: indv_counts[i*nidx+j] = indvc_ij;
294: totalrows += indvc_ij;
295: }
296: }
297: /* message triple <IS id, number of rows, rows> */
298: PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);
299: totalrows = 0;
300: rows_pos = 0;
301: /* regroup the requested rows again, reusing the logic above, to fill the send buffer */
302: for (i=0;i<nfrom;i++) {
303: PetscArrayzero(rows_pos_i,nidx);
304: for (j=0; j<fromsizes[2*i]; j+=2) {
305: is_id = fromrows[rows_pos++];
306: rows_i = rows_data[is_id];
307: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
308: }
309: /* add data */
310: for (j=0; j<nidx; j++) {
311: if (!indv_counts[i*nidx+j]) continue;
312: indvc_ij = 0;
313: sbdata[totalrows++] = j;
314: sbdata[totalrows++] = indv_counts[i*nidx+j];
315: sbsizes[2*i] += 2;
316: rows_i = rows_data[j];
317: for (l=0; l<rows_pos_i[j]; l++) {
318: row = rows_i[l]-rstart;
319: start = ai[row];
320: end = ai[row+1];
321: for (k=start; k<end; k++) { /* Amat */
322: col = aj[k] + cstart;
323: indices_tmp[indvc_ij++] = col;
324: }
325: start = bi[row];
326: end = bi[row+1];
327: for (k=start; k<end; k++) { /* Bmat */
328: col = gcols[bj[k]];
329: indices_tmp[indvc_ij++] = col;
330: }
331: }
332: PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
333: sbsizes[2*i] += indvc_ij;
334: PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);
335: totalrows += indvc_ij;
336: }
337: }
338: PetscMalloc1(nfrom+1,&offsets);
339: offsets[0] = 0;
340: for (i=0; i<nfrom; i++) {
341: offsets[i+1] = offsets[i] + sbsizes[2*i];
342: sbsizes[2*i+1] = offsets[i];
343: }
344: PetscFree(offsets);
345: if (sbrowsizes) *sbrowsizes = sbsizes;
346: if (sbrows) *sbrows = sbdata;
347: PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);
348: MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
349: MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
350: return 0;
351: }
353: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
354: {
355: const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
356: PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
357: PetscInt lsize,lsize_tmp;
358: PetscMPIInt rank,owner;
359: Mat amat,bmat;
360: PetscBool done;
361: PetscLayout cmap,rmap;
362: MPI_Comm comm;
364: PetscObjectGetComm((PetscObject)mat,&comm);
365: MPI_Comm_rank(comm,&rank);
366: MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
367: MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
369: MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
371: /* is this a safe way to compute the number of nonzero values? */
372: tnz = ai[an]+bi[bn];
373: MatGetLayouts(mat,&rmap,&cmap);
374: PetscLayoutGetRange(rmap,&rstart,NULL);
375: PetscLayoutGetRange(cmap,&cstart,NULL);
376: /* this estimates memory better than the old implementation,
377: * which used the global size of the matrix
378: * */
379: PetscMalloc1(tnz,&indices_temp);
380: for (i=0; i<nidx; i++) {
381: MPI_Comm iscomm;
383: ISGetLocalSize(is[i],&lsize);
384: ISGetIndices(is[i],&indices);
385: lsize_tmp = 0;
386: for (j=0; j<lsize; j++) {
387: owner = -1;
388: row = indices[j];
389: PetscLayoutFindOwner(rmap,row,&owner);
390: if (owner != rank) continue;
391: /* local number */
392: row -= rstart;
393: start = ai[row];
394: end = ai[row+1];
395: for (k=start; k<end; k++) { /* Amat */
396: col = aj[k] + cstart;
397: indices_temp[lsize_tmp++] = col;
398: }
399: start = bi[row];
400: end = bi[row+1];
401: for (k=start; k<end; k++) { /* Bmat */
402: col = gcols[bj[k]];
403: indices_temp[lsize_tmp++] = col;
404: }
405: }
406: ISRestoreIndices(is[i],&indices);
407: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);
408: ISDestroy(&is[i]);
409: PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);
410: ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);
411: PetscCommDestroy(&iscomm);
412: }
413: PetscFree(indices_temp);
414: MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
415: MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
416: return 0;
417: }
419: /*
420: Sample message format:
421: If a processor A wants processor B to process some elements corresponding
422: to index sets is[1],is[5]
423: mesg [0] = 2 (no of index sets in the mesg)
424: -----------
425: mesg [1] = 1 => is[1]
426: mesg [2] = sizeof(is[1]);
427: -----------
428: mesg [3] = 5 => is[5]
429: mesg [4] = sizeof(is[5]);
430: -----------
431: mesg [5]
432: mesg [n] data(is[1])
433: -----------
434: mesg[n+1]
435: mesg[m] data(is[5])
436: -----------
438: Notes:
439: nrqs - no of requests sent (or to be sent out)
440: nrqr - no of requests received (which have to be or which have been processed)
441: */
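/*
   Concrete example of the layout above (illustrative only): if this rank asks a
   neighbour for rows {3,7} of is[1] and row {12} of is[5], the outgoing message is

     mesg = { 2,  1, 2,  5, 1,  3, 7,  12 }

   i.e. 2 index sets; is[1] contributes 2 rows and is[5] contributes 1 row; the row
   numbers themselves follow the header.
*/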
442: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
443: {
444: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
445: PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
446: const PetscInt **idx,*idx_i;
447: PetscInt *n,**data,len;
448: #if defined(PETSC_USE_CTABLE)
449: PetscTable *table_data,table_data_i;
450: PetscInt *tdata,tcount,tcount_max;
451: #else
452: PetscInt *data_i,*d_p;
453: #endif
454: PetscMPIInt size,rank,tag1,tag2,proc = 0;
455: PetscInt M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
456: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
457: PetscBT *table;
458: MPI_Comm comm;
459: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
460: MPI_Status *recv_status;
461: MPI_Comm *iscomms;
462: char *t_p;
464: PetscObjectGetComm((PetscObject)C,&comm);
465: size = c->size;
466: rank = c->rank;
467: M = C->rmap->N;
469: PetscObjectGetNewTag((PetscObject)C,&tag1);
470: PetscObjectGetNewTag((PetscObject)C,&tag2);
472: PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);
474: for (i=0; i<imax; i++) {
475: ISGetIndices(is[i],&idx[i]);
476: ISGetLocalSize(is[i],&n[i]);
477: }
479: /* evaluate communication - mesg to who, length of mesg, and buffer space
480: required. Based on this, buffers are allocated, and data copied into them */
481: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
482: for (i=0; i<imax; i++) {
483: PetscArrayzero(w4,size); /* initialise work vector*/
484: idx_i = idx[i];
485: len = n[i];
486: for (j=0; j<len; j++) {
487: row = idx_i[j];
489: PetscLayoutFindOwner(C->rmap,row,&proc);
490: w4[proc]++;
491: }
492: for (j=0; j<size; j++) {
493: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
494: }
495: }
497: nrqs = 0; /* no of outgoing messages */
498: msz = 0; /* total mesg length (for all procs) */
499: w1[rank] = 0; /* no mesg sent to itself */
500: w3[rank] = 0;
501: for (i=0; i<size; i++) {
502: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
503: }
504: /* pa - is list of processors to communicate with */
505: PetscMalloc1(nrqs,&pa);
506: for (i=0,j=0; i<size; i++) {
507: if (w1[i]) {pa[j] = i; j++;}
508: }
510: /* Each message would have a header = 1 + 2*(no of IS) + data */
511: for (i=0; i<nrqs; i++) {
512: j = pa[i];
513: w1[j] += w2[j] + 2*w3[j];
514: msz += w1[j];
515: }
517: /* Determine the number of messages to expect, their lengths, and the from-ids */
518: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
519: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
521: /* Now post the Irecvs corresponding to these messages */
522: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
524: /* Allocate Memory for outgoing messages */
525: PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
526: PetscArrayzero(outdat,size);
527: PetscArrayzero(ptr,size);
529: {
530: PetscInt *iptr = tmp,ict = 0;
531: for (i=0; i<nrqs; i++) {
532: j = pa[i];
533: iptr += ict;
534: outdat[j] = iptr;
535: ict = w1[j];
536: }
537: }
539: /* Form the outgoing messages */
540: /* plug in the headers */
541: for (i=0; i<nrqs; i++) {
542: j = pa[i];
543: outdat[j][0] = 0;
544: PetscArrayzero(outdat[j]+1,2*w3[j]);
545: ptr[j] = outdat[j] + 2*w3[j] + 1;
546: }
548: /* Memory for doing local proc's work */
549: {
550: PetscInt M_BPB_imax = 0;
551: #if defined(PETSC_USE_CTABLE)
552: PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);
553: PetscMalloc1(imax,&table_data);
554: for (i=0; i<imax; i++) {
555: PetscTableCreate(n[i],M,&table_data[i]);
556: }
557: PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);
558: for (i=0; i<imax; i++) {
559: table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
560: }
561: #else
562: PetscInt Mimax = 0;
563: PetscIntMultError(M,imax, &Mimax);
564: PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);
565: PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);
566: for (i=0; i<imax; i++) {
567: table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
568: data[i] = d_p + M*i;
569: }
570: #endif
571: }
573: /* Parse the IS and update local tables and the outgoing buf with the data */
574: {
575: PetscInt n_i,isz_i,*outdat_j,ctr_j;
576: PetscBT table_i;
578: for (i=0; i<imax; i++) {
579: PetscArrayzero(ctr,size);
580: n_i = n[i];
581: table_i = table[i];
582: idx_i = idx[i];
583: #if defined(PETSC_USE_CTABLE)
584: table_data_i = table_data[i];
585: #else
586: data_i = data[i];
587: #endif
588: isz_i = isz[i];
589: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
590: row = idx_i[j];
591: PetscLayoutFindOwner(C->rmap,row,&proc);
592: if (proc != rank) { /* copy to the outgoing buffer */
593: ctr[proc]++;
594: *ptr[proc] = row;
595: ptr[proc]++;
596: } else if (!PetscBTLookupSet(table_i,row)) {
597: #if defined(PETSC_USE_CTABLE)
598: PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);
599: #else
600: data_i[isz_i] = row; /* Update the local table */
601: #endif
602: isz_i++;
603: }
604: }
605: /* Update the headers for the current IS */
606: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
607: if ((ctr_j = ctr[j])) {
608: outdat_j = outdat[j];
609: k = ++outdat_j[0];
610: outdat_j[2*k] = ctr_j;
611: outdat_j[2*k-1] = i;
612: }
613: }
614: isz[i] = isz_i;
615: }
616: }
618: /* Now post the sends */
619: PetscMalloc1(nrqs,&s_waits1);
620: for (i=0; i<nrqs; ++i) {
621: j = pa[i];
622: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
623: }
625: /* No longer need the original indices */
626: PetscMalloc1(imax,&iscomms);
627: for (i=0; i<imax; ++i) {
628: ISRestoreIndices(is[i],idx+i);
629: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);
630: }
631: PetscFree2(*(PetscInt***)&idx,n);
633: for (i=0; i<imax; ++i) {
634: ISDestroy(&is[i]);
635: }
637: /* Do Local work */
638: #if defined(PETSC_USE_CTABLE)
639: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);
640: #else
641: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);
642: #endif
644: /* Receive messages */
645: PetscMalloc1(nrqr,&recv_status);
646: MPI_Waitall(nrqr,r_waits1,recv_status);
647: MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);
649: /* Phase 1 sends are complete - deallocate buffers */
650: PetscFree4(outdat,ptr,tmp,ctr);
651: PetscFree4(w1,w2,w3,w4);
653: PetscMalloc1(nrqr,&xdata);
654: PetscMalloc1(nrqr,&isz1);
655: MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
656: PetscFree(rbuf[0]);
657: PetscFree(rbuf);
659: /* Send the data back */
660: /* Determine the buffer space required for the incoming (reply) messages */
661: {
662: PetscMPIInt *rw1;
664: PetscCalloc1(size,&rw1);
666: for (i=0; i<nrqr; ++i) {
667: proc = recv_status[i].MPI_SOURCE;
670: rw1[proc] = isz1[i];
671: }
672: PetscFree(onodes1);
673: PetscFree(olengths1);
675: /* Determine the number of messages to expect, their lengths, and the from-ids */
676: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
677: PetscFree(rw1);
678: }
679: /* Now post the Irecvs corresponding to these messages */
680: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
682: /* Now post the sends */
683: PetscMalloc1(nrqr,&s_waits2);
684: for (i=0; i<nrqr; ++i) {
685: j = recv_status[i].MPI_SOURCE;
686: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
687: }
689: /* receive work done on other processors */
690: {
691: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax;
692: PetscMPIInt idex;
693: PetscBT table_i;
695: for (i=0; i<nrqs; ++i) {
696: MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE);
697: /* Process the message */
698: rbuf2_i = rbuf2[idex];
699: ct1 = 2*rbuf2_i[0]+1;
700: jmax = rbuf2[idex][0];
701: for (j=1; j<=jmax; j++) {
702: max = rbuf2_i[2*j];
703: is_no = rbuf2_i[2*j-1];
704: isz_i = isz[is_no];
705: table_i = table[is_no];
706: #if defined(PETSC_USE_CTABLE)
707: table_data_i = table_data[is_no];
708: #else
709: data_i = data[is_no];
710: #endif
711: for (k=0; k<max; k++,ct1++) {
712: row = rbuf2_i[ct1];
713: if (!PetscBTLookupSet(table_i,row)) {
714: #if defined(PETSC_USE_CTABLE)
715: PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);
716: #else
717: data_i[isz_i] = row;
718: #endif
719: isz_i++;
720: }
721: }
722: isz[is_no] = isz_i;
723: }
724: }
726: MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);
727: }
729: #if defined(PETSC_USE_CTABLE)
730: tcount_max = 0;
731: for (i=0; i<imax; ++i) {
732: table_data_i = table_data[i];
733: PetscTableGetCount(table_data_i,&tcount);
734: if (tcount_max < tcount) tcount_max = tcount;
735: }
736: PetscMalloc1(tcount_max+1,&tdata);
737: #endif
739: for (i=0; i<imax; ++i) {
740: #if defined(PETSC_USE_CTABLE)
741: PetscTablePosition tpos;
742: table_data_i = table_data[i];
744: PetscTableGetHeadPosition(table_data_i,&tpos);
745: while (tpos) {
746: PetscTableGetNext(table_data_i,&tpos,&k,&j);
747: tdata[--j] = --k;
748: }
749: ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);
750: #else
751: ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);
752: #endif
753: PetscCommDestroy(&iscomms[i]);
754: }
756: PetscFree(iscomms);
757: PetscFree(onodes2);
758: PetscFree(olengths2);
760: PetscFree(pa);
761: PetscFree(rbuf2[0]);
762: PetscFree(rbuf2);
763: PetscFree(s_waits1);
764: PetscFree(r_waits1);
765: PetscFree(s_waits2);
766: PetscFree(r_waits2);
767: PetscFree(recv_status);
768: if (xdata) {
769: PetscFree(xdata[0]);
770: PetscFree(xdata);
771: }
772: PetscFree(isz1);
773: #if defined(PETSC_USE_CTABLE)
774: for (i=0; i<imax; i++) {
775: PetscTableDestroy((PetscTable*)&table_data[i]);
776: }
777: PetscFree(table_data);
778: PetscFree(tdata);
779: PetscFree4(table,data,isz,t_p);
780: #else
781: PetscFree5(table,data,isz,d_p,t_p);
782: #endif
783: return 0;
784: }
786: /*
787: MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
788: the work on the local processor.
790: Inputs:
791: C - MAT_MPIAIJ;
792: imax - total no of index sets processed at a time;
793: table - an array of char - size = m bits.
795: Output:
796: isz - array containing the count of the solution elements corresponding
797: to each index set;
798: data or table_data - pointer to the solutions
799: */
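/*
   Descriptive note: this routine grows each index set by one level using only local
   data; every row already in a set contributes the column indices of its diagonal (A)
   and off-diagonal (B) blocks, and the bit table prevents a column from being added twice.
*/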
800: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
801: {
802: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
803: Mat A = c->A,B = c->B;
804: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
805: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
806: PetscInt *bi,*bj,*garray,i,j,k,row,isz_i;
807: PetscBT table_i;
808: #if defined(PETSC_USE_CTABLE)
809: PetscTable table_data_i;
810: PetscTablePosition tpos;
811: PetscInt tcount,*tdata;
812: #else
813: PetscInt *data_i;
814: #endif
816: rstart = C->rmap->rstart;
817: cstart = C->cmap->rstart;
818: ai = a->i;
819: aj = a->j;
820: bi = b->i;
821: bj = b->j;
822: garray = c->garray;
824: for (i=0; i<imax; i++) {
825: #if defined(PETSC_USE_CTABLE)
826: /* copy existing entries of table_data_i into tdata[] */
827: table_data_i = table_data[i];
828: PetscTableGetCount(table_data_i,&tcount);
831: PetscMalloc1(tcount,&tdata);
832: PetscTableGetHeadPosition(table_data_i,&tpos);
833: while (tpos) {
834: PetscTableGetNext(table_data_i,&tpos,&row,&j);
835: tdata[--j] = --row;
837: }
838: #else
839: data_i = data[i];
840: #endif
841: table_i = table[i];
842: isz_i = isz[i];
843: max = isz[i];
845: for (j=0; j<max; j++) {
846: #if defined(PETSC_USE_CTABLE)
847: row = tdata[j] - rstart;
848: #else
849: row = data_i[j] - rstart;
850: #endif
851: start = ai[row];
852: end = ai[row+1];
853: for (k=start; k<end; k++) { /* Amat */
854: val = aj[k] + cstart;
855: if (!PetscBTLookupSet(table_i,val)) {
856: #if defined(PETSC_USE_CTABLE)
857: PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);
858: #else
859: data_i[isz_i] = val;
860: #endif
861: isz_i++;
862: }
863: }
864: start = bi[row];
865: end = bi[row+1];
866: for (k=start; k<end; k++) { /* Bmat */
867: val = garray[bj[k]];
868: if (!PetscBTLookupSet(table_i,val)) {
869: #if defined(PETSC_USE_CTABLE)
870: PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);
871: #else
872: data_i[isz_i] = val;
873: #endif
874: isz_i++;
875: }
876: }
877: }
878: isz[i] = isz_i;
880: #if defined(PETSC_USE_CTABLE)
881: PetscFree(tdata);
882: #endif
883: }
884: return 0;
885: }
887: /*
888: MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
889: and return the output
891: Input:
892: C - the matrix
893: nrqr - no of messages being processed.
894: rbuf - an array of pointers to the received requests
896: Output:
897: xdata - array of messages to be sent back
898: isz1 - size of each message
900: For better efficiency perhaps we should malloc each xdata[i] separately;
901: then if a realloc is required we need only copy the data for that one row
902: rather than all previous rows, as happens now where a single large chunk of
903: memory is used.
905: */
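/*
   Descriptive note: each reply xdata[i] mirrors the request layout of rbuf[i]; entry 0
   holds the number of index sets, followed by <IS id, count> header pairs and then, per
   index set, the columns reachable from the requested rows (duplicates removed via xtable).
*/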
906: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
907: {
908: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
909: Mat A = c->A,B = c->B;
910: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
911: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
912: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
913: PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
914: PetscInt *rbuf_i,kmax,rbuf_0;
915: PetscBT xtable;
917: m = C->rmap->N;
918: rstart = C->rmap->rstart;
919: cstart = C->cmap->rstart;
920: ai = a->i;
921: aj = a->j;
922: bi = b->i;
923: bj = b->j;
924: garray = c->garray;
926: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
927: rbuf_i = rbuf[i];
928: rbuf_0 = rbuf_i[0];
929: ct += rbuf_0;
930: for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
931: }
933: if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
934: else max1 = 1;
935: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
936: if (nrqr) {
937: PetscMalloc1(mem_estimate,&xdata[0]);
938: ++no_malloc;
939: }
940: PetscBTCreate(m,&xtable);
941: PetscArrayzero(isz1,nrqr);
943: ct3 = 0;
944: for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
945: rbuf_i = rbuf[i];
946: rbuf_0 = rbuf_i[0];
947: ct1 = 2*rbuf_0+1;
948: ct2 = ct1;
949: ct3 += ct1;
950: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
951: PetscBTMemzero(m,xtable);
952: oct2 = ct2;
953: kmax = rbuf_i[2*j];
954: for (k=0; k<kmax; k++,ct1++) {
955: row = rbuf_i[ct1];
956: if (!PetscBTLookupSet(xtable,row)) {
957: if (!(ct3 < mem_estimate)) {
958: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
959: PetscMalloc1(new_estimate,&tmp);
960: PetscArraycpy(tmp,xdata[0],mem_estimate);
961: PetscFree(xdata[0]);
962: xdata[0] = tmp;
963: mem_estimate = new_estimate; ++no_malloc;
964: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
965: }
966: xdata[i][ct2++] = row;
967: ct3++;
968: }
969: }
970: for (k=oct2,max2=ct2; k<max2; k++) {
971: row = xdata[i][k] - rstart;
972: start = ai[row];
973: end = ai[row+1];
974: for (l=start; l<end; l++) {
975: val = aj[l] + cstart;
976: if (!PetscBTLookupSet(xtable,val)) {
977: if (!(ct3 < mem_estimate)) {
978: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
979: PetscMalloc1(new_estimate,&tmp);
980: PetscArraycpy(tmp,xdata[0],mem_estimate);
981: PetscFree(xdata[0]);
982: xdata[0] = tmp;
983: mem_estimate = new_estimate; ++no_malloc;
984: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
985: }
986: xdata[i][ct2++] = val;
987: ct3++;
988: }
989: }
990: start = bi[row];
991: end = bi[row+1];
992: for (l=start; l<end; l++) {
993: val = garray[bj[l]];
994: if (!PetscBTLookupSet(xtable,val)) {
995: if (!(ct3 < mem_estimate)) {
996: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
997: PetscMalloc1(new_estimate,&tmp);
998: PetscArraycpy(tmp,xdata[0],mem_estimate);
999: PetscFree(xdata[0]);
1000: xdata[0] = tmp;
1001: mem_estimate = new_estimate; ++no_malloc;
1002: for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1003: }
1004: xdata[i][ct2++] = val;
1005: ct3++;
1006: }
1007: }
1008: }
1009: /* Update the header*/
1010: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1011: xdata[i][2*j-1] = rbuf_i[2*j-1];
1012: }
1013: xdata[i][0] = rbuf_0;
1014: if (i+1<nrqr) xdata[i+1] = xdata[i] + ct2;
1015: isz1[i] = ct2; /* size of each message */
1016: }
1017: PetscBTDestroy(&xtable);
1018: PetscInfo(C,"Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n",mem_estimate,ct3,no_malloc);
1019: return 0;
1020: }
1021: /* -------------------------------------------------------------------------*/
1022: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1023: /*
1024: Every processor gets the entire matrix
1025: */
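/*
   Descriptive note: with MAT_INITIAL_MATRIX the row lengths and then the column indices
   of every row are replicated on all ranks with two MPI_Allgatherv calls, and a sequential
   AIJ matrix of the full global size is assembled on each rank; with flag == MAT_GET_VALUES
   a further MPI_Allgatherv replicates the numerical values as well.
*/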
1026: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1027: {
1028: Mat B;
1029: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
1030: Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1031: PetscMPIInt size,rank,*recvcounts = NULL,*displs = NULL;
1032: PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1033: PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1035: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1036: MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);
1037: if (scall == MAT_INITIAL_MATRIX) {
1038: /* ----------------------------------------------------------------
1039: Tell every processor the number of nonzeros per row
1040: */
1041: PetscMalloc1(A->rmap->N,&lens);
1042: for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1043: lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1044: }
1045: PetscMalloc2(size,&recvcounts,size,&displs);
1046: for (i=0; i<size; i++) {
1047: recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1048: displs[i] = A->rmap->range[i];
1049: }
1050: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1051: /* ---------------------------------------------------------------
1052: Create the sequential matrix of the same type as the local block diagonal
1053: */
1054: MatCreate(PETSC_COMM_SELF,&B);
1055: MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);
1056: MatSetBlockSizesFromMats(B,A,A);
1057: MatSetType(B,((PetscObject)a->A)->type_name);
1058: MatSeqAIJSetPreallocation(B,0,lens);
1059: PetscCalloc1(2,Bin);
1060: **Bin = B;
1061: b = (Mat_SeqAIJ*)B->data;
1063: /*--------------------------------------------------------------------
1064: Copy my part of matrix column indices over
1065: */
1066: sendcount = ad->nz + bd->nz;
1067: jsendbuf = b->j + b->i[rstarts[rank]];
1068: a_jsendbuf = ad->j;
1069: b_jsendbuf = bd->j;
1070: n = A->rmap->rend - A->rmap->rstart;
1071: cnt = 0;
1072: for (i=0; i<n; i++) {
1073: /* put in lower diagonal portion */
1074: m = bd->i[i+1] - bd->i[i];
1075: while (m > 0) {
1076: /* is it above diagonal (in bd (compressed) numbering) */
1077: if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1078: jsendbuf[cnt++] = garray[*b_jsendbuf++];
1079: m--;
1080: }
1082: /* put in diagonal portion */
1083: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1084: jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1085: }
1087: /* put in upper diagonal portion */
1088: while (m-- > 0) {
1089: jsendbuf[cnt++] = garray[*b_jsendbuf++];
1090: }
1091: }
1094: /*--------------------------------------------------------------------
1095: Gather all column indices to all processors
1096: */
1097: for (i=0; i<size; i++) {
1098: recvcounts[i] = 0;
1099: for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1100: recvcounts[i] += lens[j];
1101: }
1102: }
1103: displs[0] = 0;
1104: for (i=1; i<size; i++) {
1105: displs[i] = displs[i-1] + recvcounts[i-1];
1106: }
1107: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1108: /*--------------------------------------------------------------------
1109: Assemble the matrix into usable form (note numerical values not yet set)
1110: */
1111: /* set the b->ilen (length of each row) values */
1112: PetscArraycpy(b->ilen,lens,A->rmap->N);
1113: /* set the b->i indices */
1114: b->i[0] = 0;
1115: for (i=1; i<=A->rmap->N; i++) {
1116: b->i[i] = b->i[i-1] + lens[i-1];
1117: }
1118: PetscFree(lens);
1119: MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
1120: MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
1122: } else {
1123: B = **Bin;
1124: b = (Mat_SeqAIJ*)B->data;
1125: }
1127: /*--------------------------------------------------------------------
1128: Copy my part of matrix numerical values into the values location
1129: */
1130: if (flag == MAT_GET_VALUES) {
1131: const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
1132: MatScalar *sendbuf,*recvbuf;
1134: MatSeqAIJGetArrayRead(a->A,&ada);
1135: MatSeqAIJGetArrayRead(a->B,&bda);
1136: sendcount = ad->nz + bd->nz;
1137: sendbuf = b->a + b->i[rstarts[rank]];
1138: a_sendbuf = ada;
1139: b_sendbuf = bda;
1140: b_sendj = bd->j;
1141: n = A->rmap->rend - A->rmap->rstart;
1142: cnt = 0;
1143: for (i=0; i<n; i++) {
1144: /* put in lower diagonal portion */
1145: m = bd->i[i+1] - bd->i[i];
1146: while (m > 0) {
1147: /* is it above diagonal (in bd (compressed) numbering) */
1148: if (garray[*b_sendj] > A->rmap->rstart + i) break;
1149: sendbuf[cnt++] = *b_sendbuf++;
1150: m--;
1151: b_sendj++;
1152: }
1154: /* put in diagonal portion */
1155: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1156: sendbuf[cnt++] = *a_sendbuf++;
1157: }
1159: /* put in upper diagonal portion */
1160: while (m-- > 0) {
1161: sendbuf[cnt++] = *b_sendbuf++;
1162: b_sendj++;
1163: }
1164: }
1167: /* -----------------------------------------------------------------
1168: Gather all numerical values to all processors
1169: */
1170: if (!recvcounts) {
1171: PetscMalloc2(size,&recvcounts,size,&displs);
1172: }
1173: for (i=0; i<size; i++) {
1174: recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1175: }
1176: displs[0] = 0;
1177: for (i=1; i<size; i++) {
1178: displs[i] = displs[i-1] + recvcounts[i-1];
1179: }
1180: recvbuf = b->a;
1181: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1182: MatSeqAIJRestoreArrayRead(a->A,&ada);
1183: MatSeqAIJRestoreArrayRead(a->B,&bda);
1184: } /* endof (flag == MAT_GET_VALUES) */
1185: PetscFree2(recvcounts,displs);
1187: if (A->symmetric) {
1188: MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);
1189: } else if (A->hermitian) {
1190: MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);
1191: } else if (A->structurally_symmetric) {
1192: MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);
1193: }
1194: return 0;
1195: }
1197: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1198: {
1199: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
1200: Mat submat,A = c->A,B = c->B;
1201: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1202: PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1203: PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1204: const PetscInt *icol,*irow;
1205: PetscInt nrow,ncol,start;
1206: PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1207: PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1208: PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1209: PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1210: PetscInt *lens,rmax,ncols,*cols,Crow;
1211: #if defined(PETSC_USE_CTABLE)
1212: PetscTable cmap,rmap;
1213: PetscInt *cmap_loc,*rmap_loc;
1214: #else
1215: PetscInt *cmap,*rmap;
1216: #endif
1217: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1218: PetscInt *cworkB,lwrite,*subcols,*row2proc;
1219: PetscScalar *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
1220: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1221: MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4;
1222: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1223: MPI_Status *r_status3 = NULL,*r_status4,*s_status4;
1224: MPI_Comm comm;
1225: PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1226: PetscMPIInt *onodes1,*olengths1,idex,end;
1227: Mat_SubSppt *smatis1;
1228: PetscBool isrowsorted,iscolsorted;
1233: MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);
1234: MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);
1235: PetscObjectGetComm((PetscObject)C,&comm);
1236: size = c->size;
1237: rank = c->rank;
1239: ISSorted(iscol[0],&iscolsorted);
1240: ISSorted(isrow[0],&isrowsorted);
1241: ISGetIndices(isrow[0],&irow);
1242: ISGetLocalSize(isrow[0],&nrow);
1243: if (allcolumns) {
1244: icol = NULL;
1245: ncol = C->cmap->N;
1246: } else {
1247: ISGetIndices(iscol[0],&icol);
1248: ISGetLocalSize(iscol[0],&ncol);
1249: }
1251: if (scall == MAT_INITIAL_MATRIX) {
1252: PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1254: /* Get some new tags to keep the communication clean */
1255: tag1 = ((PetscObject)C)->tag;
1256: PetscObjectGetNewTag((PetscObject)C,&tag2);
1257: PetscObjectGetNewTag((PetscObject)C,&tag3);
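    /* Descriptive note on the communication rounds below: tag1 carries the requested row
       numbers, tag2 returns the per-row nonzero counts (reply sizes), tag3 returns the
       column indices, and tag4 (created later) returns the numerical values. */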
1259: /* evaluate communication - mesg to who, length of mesg, and buffer space
1260: required. Based on this, buffers are allocated, and data copied into them */
1261: PetscCalloc2(size,&w1,size,&w2);
1262: PetscMalloc1(nrow,&row2proc);
1264: /* w1[proc] = num of rows owned by proc -- to be requested */
1265: proc = 0;
1266: nrqs = 0; /* num of outgoing messages */
1267: for (j=0; j<nrow; j++) {
1268: row = irow[j];
1269: if (!isrowsorted) proc = 0;
1270: while (row >= C->rmap->range[proc+1]) proc++;
1271: w1[proc]++;
1272: row2proc[j] = proc; /* map row index to proc */
1274: if (proc != rank && !w2[proc]) {
1275: w2[proc] = 1; nrqs++;
1276: }
1277: }
1278: w1[rank] = 0; /* rows owned by self will not be requested */
1280: PetscMalloc1(nrqs,&pa); /*(proc -array)*/
1281: for (proc=0,j=0; proc<size; proc++) {
1282: if (w1[proc]) { pa[j++] = proc;}
1283: }
1285: /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1286: msz = 0; /* total mesg length (for all procs) */
1287: for (i=0; i<nrqs; i++) {
1288: proc = pa[i];
1289: w1[proc] += 3;
1290: msz += w1[proc];
1291: }
1292: PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz);
1294: /* Determine nrqr, the number of messages to expect, their lengths, and the from-ids */
1295: /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1296: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
1298: /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1299: Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1300: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
1302: /* Now post the Irecvs corresponding to these messages */
1303: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
1305: PetscFree(onodes1);
1306: PetscFree(olengths1);
1308: /* Allocate Memory for outgoing messages */
1309: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
1310: PetscArrayzero(sbuf1,size);
1311: PetscArrayzero(ptr,size);
1313: /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1314: iptr = tmp;
1315: for (i=0; i<nrqs; i++) {
1316: proc = pa[i];
1317: sbuf1[proc] = iptr;
1318: iptr += w1[proc];
1319: }
1321: /* Form the outgoing messages */
1322: /* Initialize the header space */
1323: for (i=0; i<nrqs; i++) {
1324: proc = pa[i];
1325: PetscArrayzero(sbuf1[proc],3);
1326: ptr[proc] = sbuf1[proc] + 3;
1327: }
1329: /* Parse the isrow and copy data into outbuf */
1330: PetscArrayzero(ctr,size);
1331: for (j=0; j<nrow; j++) { /* parse the indices of each IS */
1332: proc = row2proc[j];
1333: if (proc != rank) { /* copy to the outgoing buf*/
1334: *ptr[proc] = irow[j];
1335: ctr[proc]++; ptr[proc]++;
1336: }
1337: }
1339: /* Update the headers for the current IS */
1340: for (j=0; j<size; j++) { /* Can Optimise this loop too */
1341: if ((ctr_j = ctr[j])) {
1342: sbuf1_j = sbuf1[j];
1343: k = ++sbuf1_j[0];
1344: sbuf1_j[2*k] = ctr_j;
1345: sbuf1_j[2*k-1] = 0;
1346: }
1347: }
1349: /* Now post the sends */
1350: PetscMalloc1(nrqs,&s_waits1);
1351: for (i=0; i<nrqs; ++i) {
1352: proc = pa[i];
1353: MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);
1354: }
1356: /* Post Receives to capture the buffer size */
1357: PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2);
1358: PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);
1360: if (nrqs) rbuf2[0] = tmp + msz;
1361: for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1363: for (i=0; i<nrqs; ++i) {
1364: proc = pa[i];
1365: MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);
1366: }
1368: PetscFree2(w1,w2);
1370: /* Send to other procs the buf size they should allocate */
1371: /* Receive messages*/
1372: PetscMalloc1(nrqr,&r_status1);
1373: PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
1375: MPI_Waitall(nrqr,r_waits1,r_status1);
1376: for (i=0; i<nrqr; ++i) {
1377: req_size[i] = 0;
1378: rbuf1_i = rbuf1[i];
1379: start = 2*rbuf1_i[0] + 1;
1380: MPI_Get_count(r_status1+i,MPIU_INT,&end);
1381: PetscMalloc1(end,&sbuf2[i]);
1382: sbuf2_i = sbuf2[i];
1383: for (j=start; j<end; j++) {
1384: k = rbuf1_i[j] - rstart;
1385: ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1386: sbuf2_i[j] = ncols;
1387: req_size[i] += ncols;
1388: }
1389: req_source1[i] = r_status1[i].MPI_SOURCE;
1391: /* form the header */
1392: sbuf2_i[0] = req_size[i];
1393: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1395: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
1396: }
1398: PetscFree(r_status1);
1399: PetscFree(r_waits1);
1401: /* rbuf2 is received, Post recv column indices a->j */
1402: MPI_Waitall(nrqs,r_waits2,r_status2);
1404: PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3);
1405: for (i=0; i<nrqs; ++i) {
1406: PetscMalloc1(rbuf2[i][0],&rbuf3[i]);
1407: req_source2[i] = r_status2[i].MPI_SOURCE;
1408: MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
1409: }
1411: /* Wait on sends1 and sends2 */
1412: PetscMalloc1(nrqs,&s_status1);
1413: MPI_Waitall(nrqs,s_waits1,s_status1);
1414: PetscFree(s_waits1);
1415: PetscFree(s_status1);
1417: MPI_Waitall(nrqr,s_waits2,s_status2);
1418: PetscFree4(r_status2,s_waits2,r_waits2,s_status2);
1420: /* Now allocate sending buffers for a->j, and send them off */
1421: PetscMalloc1(nrqr,&sbuf_aj);
1422: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1423: if (nrqr) PetscMalloc1(j,&sbuf_aj[0]);
1424: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1426: for (i=0; i<nrqr; i++) { /* for each requested message */
1427: rbuf1_i = rbuf1[i];
1428: sbuf_aj_i = sbuf_aj[i];
1429: ct1 = 2*rbuf1_i[0] + 1;
1430: ct2 = 0;
1433: kmax = rbuf1[i][2];
1434: for (k=0; k<kmax; k++,ct1++) { /* for each row */
1435: row = rbuf1_i[ct1] - rstart;
1436: nzA = ai[row+1] - ai[row];
1437: nzB = bi[row+1] - bi[row];
1438: ncols = nzA + nzB;
1439: cworkA = aj + ai[row]; cworkB = bj + bi[row];
1441: /* load the column indices for this row into cols*/
1442: cols = sbuf_aj_i + ct2;
1444: lwrite = 0;
1445: for (l=0; l<nzB; l++) {
1446: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1447: }
1448: for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1449: for (l=0; l<nzB; l++) {
1450: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1451: }
1453: ct2 += ncols;
1454: }
1455: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
1456: }
1458: /* create column map (cmap): global col of C -> local col of submat */
1459: #if defined(PETSC_USE_CTABLE)
1460: if (!allcolumns) {
1461: PetscTableCreate(ncol,C->cmap->N,&cmap);
1462: PetscCalloc1(C->cmap->n,&cmap_loc);
1463: for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1464: if (icol[j] >= cstart && icol[j] <cend) {
1465: cmap_loc[icol[j] - cstart] = j+1;
1466: } else { /* use PetscTable for non-local col indices */
1467: PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);
1468: }
1469: }
1470: } else {
1471: cmap = NULL;
1472: cmap_loc = NULL;
1473: }
1474: PetscCalloc1(C->rmap->n,&rmap_loc);
1475: #else
1476: if (!allcolumns) {
1477: PetscCalloc1(C->cmap->N,&cmap);
1478: for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1479: } else {
1480: cmap = NULL;
1481: }
1482: #endif
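    /* Descriptive note: cmap/cmap_loc store the submatrix column index shifted by one, so a
       lookup that returns 0 means "column not in the submatrix"; the --tcol used further
       below undoes the shift before insertion. */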
1484: /* Create lens for MatSeqAIJSetPreallocation() */
1485: PetscCalloc1(nrow,&lens);
1487: /* Compute lens from local part of C */
1488: for (j=0; j<nrow; j++) {
1489: row = irow[j];
1490: proc = row2proc[j];
1491: if (proc == rank) {
1492: /* diagonal part A = c->A */
1493: ncols = ai[row-rstart+1] - ai[row-rstart];
1494: cols = aj + ai[row-rstart];
1495: if (!allcolumns) {
1496: for (k=0; k<ncols; k++) {
1497: #if defined(PETSC_USE_CTABLE)
1498: tcol = cmap_loc[cols[k]];
1499: #else
1500: tcol = cmap[cols[k]+cstart];
1501: #endif
1502: if (tcol) lens[j]++;
1503: }
1504: } else { /* allcolumns */
1505: lens[j] = ncols;
1506: }
1508: /* off-diagonal part B = c->B */
1509: ncols = bi[row-rstart+1] - bi[row-rstart];
1510: cols = bj + bi[row-rstart];
1511: if (!allcolumns) {
1512: for (k=0; k<ncols; k++) {
1513: #if defined(PETSC_USE_CTABLE)
1514: PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);
1515: #else
1516: tcol = cmap[bmap[cols[k]]];
1517: #endif
1518: if (tcol) lens[j]++;
1519: }
1520: } else { /* allcolumns */
1521: lens[j] += ncols;
1522: }
1523: }
1524: }
1526: /* Create row map (rmap): global row of C -> local row of submat */
1527: #if defined(PETSC_USE_CTABLE)
1528: PetscTableCreate(nrow,C->rmap->N,&rmap);
1529: for (j=0; j<nrow; j++) {
1530: row = irow[j];
1531: proc = row2proc[j];
1532: if (proc == rank) { /* a local row */
1533: rmap_loc[row - rstart] = j;
1534: } else {
1535: PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);
1536: }
1537: }
1538: #else
1539: PetscCalloc1(C->rmap->N,&rmap);
1540: for (j=0; j<nrow; j++) {
1541: rmap[irow[j]] = j;
1542: }
1543: #endif
1545: /* Update lens from offproc data */
1546: /* recv a->j is done */
1547: MPI_Waitall(nrqs,r_waits3,r_status3);
1548: for (i=0; i<nrqs; i++) {
1549: proc = pa[i];
1550: sbuf1_i = sbuf1[proc];
1552: ct1 = 2 + 1;
1553: ct2 = 0;
1554: rbuf2_i = rbuf2[i]; /* received length of C->j */
1555: rbuf3_i = rbuf3[i]; /* received C->j */
1558: max1 = sbuf1_i[2];
1559: for (k=0; k<max1; k++,ct1++) {
1560: #if defined(PETSC_USE_CTABLE)
1561: PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);
1562: row--;
1564: #else
1565: row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1566: #endif
1567: /* Now, store row index of submat in sbuf1_i[ct1] */
1568: sbuf1_i[ct1] = row;
1570: nnz = rbuf2_i[ct1];
1571: if (!allcolumns) {
1572: for (l=0; l<nnz; l++,ct2++) {
1573: #if defined(PETSC_USE_CTABLE)
1574: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1575: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1576: } else {
1577: PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);
1578: }
1579: #else
1580: tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1581: #endif
1582: if (tcol) lens[row]++;
1583: }
1584: } else { /* allcolumns */
1585: lens[row] += nnz;
1586: }
1587: }
1588: }
1589: MPI_Waitall(nrqr,s_waits3,s_status3);
1590: PetscFree4(r_waits3,s_waits3,r_status3,s_status3);
1592: /* Create the submatrices */
1593: MatCreate(PETSC_COMM_SELF,&submat);
1594: MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);
1596: ISGetBlockSize(isrow[0],&i);
1597: ISGetBlockSize(iscol[0],&j);
1598: MatSetBlockSizes(submat,i,j);
1599: MatSetType(submat,((PetscObject)A)->type_name);
1600: MatSeqAIJSetPreallocation(submat,0,lens);
1602: /* create struct Mat_SubSppt and attach it to submat */
1603: PetscNew(&smatis1);
1604: subc = (Mat_SeqAIJ*)submat->data;
1605: subc->submatis1 = smatis1;
1607: smatis1->id = 0;
1608: smatis1->nrqs = nrqs;
1609: smatis1->nrqr = nrqr;
1610: smatis1->rbuf1 = rbuf1;
1611: smatis1->rbuf2 = rbuf2;
1612: smatis1->rbuf3 = rbuf3;
1613: smatis1->sbuf2 = sbuf2;
1614: smatis1->req_source2 = req_source2;
1616: smatis1->sbuf1 = sbuf1;
1617: smatis1->ptr = ptr;
1618: smatis1->tmp = tmp;
1619: smatis1->ctr = ctr;
1621: smatis1->pa = pa;
1622: smatis1->req_size = req_size;
1623: smatis1->req_source1 = req_source1;
1625: smatis1->allcolumns = allcolumns;
1626: smatis1->singleis = PETSC_TRUE;
1627: smatis1->row2proc = row2proc;
1628: smatis1->rmap = rmap;
1629: smatis1->cmap = cmap;
1630: #if defined(PETSC_USE_CTABLE)
1631: smatis1->rmap_loc = rmap_loc;
1632: smatis1->cmap_loc = cmap_loc;
1633: #endif
1635: smatis1->destroy = submat->ops->destroy;
1636: submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1637: submat->factortype = C->factortype;
1639: /* compute rmax */
1640: rmax = 0;
1641: for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1643: } else { /* scall == MAT_REUSE_MATRIX */
1644: submat = submats[0];
1647: subc = (Mat_SeqAIJ*)submat->data;
1648: rmax = subc->rmax;
1649: smatis1 = subc->submatis1;
1650: nrqs = smatis1->nrqs;
1651: nrqr = smatis1->nrqr;
1652: rbuf1 = smatis1->rbuf1;
1653: rbuf2 = smatis1->rbuf2;
1654: rbuf3 = smatis1->rbuf3;
1655: req_source2 = smatis1->req_source2;
1657: sbuf1 = smatis1->sbuf1;
1658: sbuf2 = smatis1->sbuf2;
1659: ptr = smatis1->ptr;
1660: tmp = smatis1->tmp;
1661: ctr = smatis1->ctr;
1663: pa = smatis1->pa;
1664: req_size = smatis1->req_size;
1665: req_source1 = smatis1->req_source1;
1667: allcolumns = smatis1->allcolumns;
1668: row2proc = smatis1->row2proc;
1669: rmap = smatis1->rmap;
1670: cmap = smatis1->cmap;
1671: #if defined(PETSC_USE_CTABLE)
1672: rmap_loc = smatis1->rmap_loc;
1673: cmap_loc = smatis1->cmap_loc;
1674: #endif
1675: }
1677: /* Post recv matrix values */
1678: PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals);
1679: PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4);
1680: PetscObjectGetNewTag((PetscObject)C,&tag4);
1681: for (i=0; i<nrqs; ++i) {
1682: PetscMalloc1(rbuf2[i][0],&rbuf4[i]);
1683: MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
1684: }
1686: /* Allocate sending buffers for a->a, and send them off */
1687: PetscMalloc1(nrqr,&sbuf_aa);
1688: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1689: if (nrqr) PetscMalloc1(j,&sbuf_aa[0]);
1690: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1692: for (i=0; i<nrqr; i++) {
1693: rbuf1_i = rbuf1[i];
1694: sbuf_aa_i = sbuf_aa[i];
1695: ct1 = 2*rbuf1_i[0]+1;
1696: ct2 = 0;
1699: kmax = rbuf1_i[2];
1700: for (k=0; k<kmax; k++,ct1++) {
1701: row = rbuf1_i[ct1] - rstart;
1702: nzA = ai[row+1] - ai[row];
1703: nzB = bi[row+1] - bi[row];
1704: ncols = nzA + nzB;
1705: cworkB = bj + bi[row];
1706: vworkA = a_a + ai[row];
1707: vworkB = b_a + bi[row];
1709: /* load the column values for this row into vals*/
1710: vals = sbuf_aa_i + ct2;
1712: lwrite = 0;
1713: for (l=0; l<nzB; l++) {
1714: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1715: }
1716: for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1717: for (l=0; l<nzB; l++) {
1718: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1719: }
1721: ct2 += ncols;
1722: }
1723: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
1724: }
1726: /* Assemble submat */
1727: /* First assemble the local rows */
1728: for (j=0; j<nrow; j++) {
1729: row = irow[j];
1730: proc = row2proc[j];
1731: if (proc == rank) {
1732: Crow = row - rstart; /* local row index of C */
1733: #if defined(PETSC_USE_CTABLE)
1734: row = rmap_loc[Crow]; /* row index of submat */
1735: #else
1736: row = rmap[row];
1737: #endif
1739: if (allcolumns) {
1740: /* diagonal part A = c->A */
1741: ncols = ai[Crow+1] - ai[Crow];
1742: cols = aj + ai[Crow];
1743: vals = a_a + ai[Crow];
1744: i = 0;
1745: for (k=0; k<ncols; k++) {
1746: subcols[i] = cols[k] + cstart;
1747: subvals[i++] = vals[k];
1748: }
1750: /* off-diagonal part B = c->B */
1751: ncols = bi[Crow+1] - bi[Crow];
1752: cols = bj + bi[Crow];
1753: vals = b_a + bi[Crow];
1754: for (k=0; k<ncols; k++) {
1755: subcols[i] = bmap[cols[k]];
1756: subvals[i++] = vals[k];
1757: }
1759: MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);
1761: } else { /* !allcolumns */
1762: #if defined(PETSC_USE_CTABLE)
1763: /* diagonal part A = c->A */
1764: ncols = ai[Crow+1] - ai[Crow];
1765: cols = aj + ai[Crow];
1766: vals = a_a + ai[Crow];
1767: i = 0;
1768: for (k=0; k<ncols; k++) {
1769: tcol = cmap_loc[cols[k]];
1770: if (tcol) {
1771: subcols[i] = --tcol;
1772: subvals[i++] = vals[k];
1773: }
1774: }
1776: /* off-diagonal part B = c->B */
1777: ncols = bi[Crow+1] - bi[Crow];
1778: cols = bj + bi[Crow];
1779: vals = b_a + bi[Crow];
1780: for (k=0; k<ncols; k++) {
1781: PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);
1782: if (tcol) {
1783: subcols[i] = --tcol;
1784: subvals[i++] = vals[k];
1785: }
1786: }
1787: #else
1788: /* diagonal part A = c->A */
1789: ncols = ai[Crow+1] - ai[Crow];
1790: cols = aj + ai[Crow];
1791: vals = a_a + ai[Crow];
1792: i = 0;
1793: for (k=0; k<ncols; k++) {
1794: tcol = cmap[cols[k]+cstart];
1795: if (tcol) {
1796: subcols[i] = --tcol;
1797: subvals[i++] = vals[k];
1798: }
1799: }
1801: /* off-diagonal part B = c->B */
1802: ncols = bi[Crow+1] - bi[Crow];
1803: cols = bj + bi[Crow];
1804: vals = b_a + bi[Crow];
1805: for (k=0; k<ncols; k++) {
1806: tcol = cmap[bmap[cols[k]]];
1807: if (tcol) {
1808: subcols[i] = --tcol;
1809: subvals[i++] = vals[k];
1810: }
1811: }
1812: #endif
1813: MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);
1814: }
1815: }
1816: }
1818: /* Now assemble the off-proc rows */
1819: for (i=0; i<nrqs; i++) { /* for each requested message */
1820: /* recv values from other processes */
1821: MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);
1822: proc = pa[idex];
1823: sbuf1_i = sbuf1[proc];
1825: ct1 = 2 + 1;
1826: ct2 = 0; /* count of received C->j */
1827: ct3 = 0; /* count of received C->j that will be inserted into submat */
1828:     rbuf2_i = rbuf2[idex]; /* lengths of the received C->j from other processes */
1829:     rbuf3_i = rbuf3[idex]; /* received C->j (column indices) from other processes */
1830:     rbuf4_i = rbuf4[idex]; /* received C->a (values) from other processes */
1833: max1 = sbuf1_i[2]; /* num of rows */
1834:     for (k=0; k<max1; k++,ct1++) { /* for each received row */
1835: row = sbuf1_i[ct1]; /* row index of submat */
1836: if (!allcolumns) {
1837: idex = 0;
1838: if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1839: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1840:           for (l=0; l<nnz; l++,ct2++) { /* for each received column */
1841: #if defined(PETSC_USE_CTABLE)
1842: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1843: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1844: } else {
1845: PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);
1846: }
1847: #else
1848: tcol = cmap[rbuf3_i[ct2]];
1849: #endif
1850: if (tcol) {
1851: subcols[idex] = --tcol; /* may not be sorted */
1852: subvals[idex++] = rbuf4_i[ct2];
1854:               /* We receive all the column entries of this row of C, but only a subset of them needs to be inserted into submat.
1855:                For reuse, we replace the received C->j with the index that should be inserted into submat */
1856: if (iscolsorted) rbuf3_i[ct3++] = ct2;
1857: }
1858: }
1859: MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);
1860: } else { /* scall == MAT_REUSE_MATRIX */
1861: submat = submats[0];
1862: subc = (Mat_SeqAIJ*)submat->data;
1864: nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1865: for (l=0; l<nnz; l++) {
1866: ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1867: subvals[idex++] = rbuf4_i[ct2];
1868: }
1870: bj = subc->j + subc->i[row]; /* sorted column indices */
1871: MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);
1872: }
1873: } else { /* allcolumns */
1874: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1875: MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);
1876: ct2 += nnz;
1877: }
1878: }
1879: }
1881:   /* sends of a->a are done */
1882: MPI_Waitall(nrqr,s_waits4,s_status4);
1883: PetscFree4(r_waits4,s_waits4,r_status4,s_status4);
1885: MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);
1886: MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);
1887: submats[0] = submat;
1889: /* Restore the indices */
1890: ISRestoreIndices(isrow[0],&irow);
1891: if (!allcolumns) {
1892: ISRestoreIndices(iscol[0],&icol);
1893: }
1895: /* Destroy allocated memory */
1896: for (i=0; i<nrqs; ++i) {
1897: PetscFree(rbuf4[i]);
1898: }
1899: PetscFree3(rbuf4,subcols,subvals);
1900: if (sbuf_aa) {
1901: PetscFree(sbuf_aa[0]);
1902: PetscFree(sbuf_aa);
1903: }
1905: if (scall == MAT_INITIAL_MATRIX) {
1906: PetscFree(lens);
1907: if (sbuf_aj) {
1908: PetscFree(sbuf_aj[0]);
1909: PetscFree(sbuf_aj);
1910: }
1911: }
1912: MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);
1913: MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);
1914: return 0;
1915: }
1917: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1918: {
1919: PetscInt ncol;
1920: PetscBool colflag,allcolumns=PETSC_FALSE;
1922: /* Allocate memory to hold all the submatrices */
1923: if (scall == MAT_INITIAL_MATRIX) {
1924: PetscCalloc1(2,submat);
1925: }
1927:   /* Check for special case: each processor gets the entire set of matrix columns */
1928: ISIdentity(iscol[0],&colflag);
1929: ISGetLocalSize(iscol[0],&ncol);
1930: if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1932: MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);
1933: return 0;
1934: }
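/*
   A minimal calling sketch (illustrative only, not part of this file), assuming an assembled MPIAIJ
   matrix C: one row IS per rank plus an identity column IS over all N columns is what makes the
   allcolumns fast path above apply (PCSetUp_ASM() additionally sets C->submat_singleis so that
   MatCreateSubMatrices_MPIAIJ() dispatches to this SingleIS routine).

     IS       rowis[1], colis[1];
     Mat     *subs;
     PetscInt rstart, rend, N;

     MatGetOwnershipRange(C, &rstart, &rend);
     MatGetSize(C, NULL, &N);
     ISCreateStride(PETSC_COMM_SELF, rend - rstart, rstart, 1, &rowis[0]);  // locally owned rows
     ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &colis[0]);                   // identity IS => allcolumns
     MatCreateSubMatrices(C, 1, rowis, colis, MAT_INITIAL_MATRIX, &subs);
     ...
     MatDestroySubMatrices(1, &subs);
     ISDestroy(&rowis[0]);
     ISDestroy(&colis[0]);
*/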
1936: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1937: {
1938: PetscInt nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1939: PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE;
1940: Mat_SeqAIJ *subc;
1941: Mat_SubSppt *smat;
1943: /* Check for special case: each processor has a single IS */
1944: if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1945: MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);
1946:     C->submat_singleis = PETSC_FALSE; /* restore the default value in case C is later used with non-single IS */
1947: return 0;
1948: }
1950:   /* Determine nmax, the maximum number of submatrices that can be extracted per stage */
1951: if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1952: else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1953: if (!nmax) nmax = 1;
1955: if (scall == MAT_INITIAL_MATRIX) {
1956: /* Collect global wantallmatrix and nstages */
1957: if (ismax == 1 && C->rmap->N == C->cmap->N) {
1958: ISIdentity(*isrow,&rowflag);
1959: ISIdentity(*iscol,&colflag);
1960: ISGetLocalSize(*isrow,&nrow);
1961: ISGetLocalSize(*iscol,&ncol);
1962: if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1963: wantallmatrix = PETSC_TRUE;
1965: PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);
1966: }
1967: }
1969:     /* Determine the number of stages over which the submatrices are extracted.
1970:        Each stage extracts at most nmax submatrices.
1971:        nmax is determined by the matrix column dimension:
1972:        if the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1973:     */
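    /* For example (assuming 4-byte PetscInt): with C->cmap->N = 1,000,000, nmax = 20e6/(1e6*4) = 5,
       so ismax = 12 index sets would be processed in nstages = 12/5 + 1 = 3 stages. */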
1974: nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
1976: in[0] = -1*(PetscInt)wantallmatrix;
1977: in[1] = nstages;
1978: MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
1979: wantallmatrix = (PetscBool)(-out[0]);
1980: nstages = out[1]; /* Make sure every processor loops through the global nstages */
1982: } else { /* MAT_REUSE_MATRIX */
1983: if (ismax) {
1984: subc = (Mat_SeqAIJ*)(*submat)[0]->data;
1985: smat = subc->submatis1;
1986: } else { /* (*submat)[0] is a dummy matrix */
1987: smat = (Mat_SubSppt*)(*submat)[0]->data;
1988: }
1989: if (!smat) {
1990: /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
1991: wantallmatrix = PETSC_TRUE;
1992: } else if (smat->singleis) {
1993: MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);
1994: return 0;
1995: } else {
1996: nstages = smat->nstages;
1997: }
1998: }
2000: if (wantallmatrix) {
2001: MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);
2002: return 0;
2003: }
2005: /* Allocate memory to hold all the submatrices and dummy submatrices */
2006: if (scall == MAT_INITIAL_MATRIX) {
2007: PetscCalloc1(ismax+nstages,submat);
2008: }
2010: for (i=0,pos=0; i<nstages; i++) {
2011: if (pos+nmax <= ismax) max_no = nmax;
2012: else if (pos >= ismax) max_no = 0;
2013: else max_no = ismax-pos;
2015: MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);
2016: if (!max_no) {
2017: if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2018: smat = (Mat_SubSppt*)(*submat)[pos]->data;
2019: smat->nstages = nstages;
2020: }
2021: pos++; /* advance to next dummy matrix if any */
2022: } else pos += max_no;
2023: }
2025: if (ismax && scall == MAT_INITIAL_MATRIX) {
2026: /* save nstages for reuse */
2027: subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2028: smat = subc->submatis1;
2029: smat->nstages = nstages;
2030: }
2031: return 0;
2032: }
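/*
   A minimal calling sketch (illustrative only, not part of this file), assuming an assembled MPIAIJ
   matrix C and sequential index sets isr[2], isc[2] already created on PETSC_COMM_SELF:

     Mat *subs;
     MatCreateSubMatrices(C, 2, isr, isc, MAT_INITIAL_MATRIX, &subs);  // allocates and fills subs[]
     ...                                                               // change values of C, same nonzero pattern
     MatCreateSubMatrices(C, 2, isr, isc, MAT_REUSE_MATRIX, &subs);    // refills the existing subs[]
     MatDestroySubMatrices(2, &subs);
*/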
2034: /* -------------------------------------------------------------------------*/
2035: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2036: {
2037: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
2038: Mat A = c->A;
2039: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2040: const PetscInt **icol,**irow;
2041: PetscInt *nrow,*ncol,start;
2042: PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2043: PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2044: PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2045: PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2046: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2047: #if defined(PETSC_USE_CTABLE)
2048: PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
2049: #else
2050: PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2051: #endif
2052: const PetscInt *irow_i;
2053: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2054: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2055: MPI_Request *r_waits4,*s_waits3,*s_waits4;
2056: MPI_Comm comm;
2057: PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2058: PetscMPIInt *onodes1,*olengths1,end;
2059: PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2060: Mat_SubSppt *smat_i;
2061: PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2062: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2064: PetscObjectGetComm((PetscObject)C,&comm);
2065: size = c->size;
2066: rank = c->rank;
2068: PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);
2069: PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);
2071: for (i=0; i<ismax; i++) {
2072: ISSorted(iscol[i],&issorted[i]);
2073: if (!issorted[i]) iscsorted = issorted[i];
2075: ISSorted(isrow[i],&issorted[i]);
2077: ISGetIndices(isrow[i],&irow[i]);
2078: ISGetLocalSize(isrow[i],&nrow[i]);
2080: /* Check for special case: allcolumn */
2081: ISIdentity(iscol[i],&colflag);
2082: ISGetLocalSize(iscol[i],&ncol[i]);
2083: if (colflag && ncol[i] == C->cmap->N) {
2084: allcolumns[i] = PETSC_TRUE;
2085: icol[i] = NULL;
2086: } else {
2087: allcolumns[i] = PETSC_FALSE;
2088: ISGetIndices(iscol[i],&icol[i]);
2089: }
2090: }
2092: if (scall == MAT_REUSE_MATRIX) {
2093:     /* Assumes the new rows are the same length as the old rows */
2094: for (i=0; i<ismax; i++) {
2096: subc = (Mat_SeqAIJ*)submats[i]->data;
2099:       /* Initialize the matrix as if it were empty */
2100: PetscArrayzero(subc->ilen,submats[i]->rmap->n);
2102: smat_i = subc->submatis1;
2104: nrqs = smat_i->nrqs;
2105: nrqr = smat_i->nrqr;
2106: rbuf1 = smat_i->rbuf1;
2107: rbuf2 = smat_i->rbuf2;
2108: rbuf3 = smat_i->rbuf3;
2109: req_source2 = smat_i->req_source2;
2111: sbuf1 = smat_i->sbuf1;
2112: sbuf2 = smat_i->sbuf2;
2113: ptr = smat_i->ptr;
2114: tmp = smat_i->tmp;
2115: ctr = smat_i->ctr;
2117: pa = smat_i->pa;
2118: req_size = smat_i->req_size;
2119: req_source1 = smat_i->req_source1;
2121: allcolumns[i] = smat_i->allcolumns;
2122: row2proc[i] = smat_i->row2proc;
2123: rmap[i] = smat_i->rmap;
2124: cmap[i] = smat_i->cmap;
2125: }
2127: if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2129: smat_i = (Mat_SubSppt*)submats[0]->data;
2131: nrqs = smat_i->nrqs;
2132: nrqr = smat_i->nrqr;
2133: rbuf1 = smat_i->rbuf1;
2134: rbuf2 = smat_i->rbuf2;
2135: rbuf3 = smat_i->rbuf3;
2136: req_source2 = smat_i->req_source2;
2138: sbuf1 = smat_i->sbuf1;
2139: sbuf2 = smat_i->sbuf2;
2140: ptr = smat_i->ptr;
2141: tmp = smat_i->tmp;
2142: ctr = smat_i->ctr;
2144: pa = smat_i->pa;
2145: req_size = smat_i->req_size;
2146: req_source1 = smat_i->req_source1;
2148: allcolumns[0] = PETSC_FALSE;
2149: }
2150: } else { /* scall == MAT_INITIAL_MATRIX */
2151: /* Get some new tags to keep the communication clean */
2152: PetscObjectGetNewTag((PetscObject)C,&tag2);
2153: PetscObjectGetNewTag((PetscObject)C,&tag3);
2155:     /* evaluate communication - message to whom, length of message, and buffer space
2156:        required. Based on this, buffers are allocated, and data copied into them */
2157: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4); /* mesg size, initialize work vectors */
2159: for (i=0; i<ismax; i++) {
2160: jmax = nrow[i];
2161: irow_i = irow[i];
2163: PetscMalloc1(jmax,&row2proc_i);
2164: row2proc[i] = row2proc_i;
2166: if (issorted[i]) proc = 0;
2167: for (j=0; j<jmax; j++) {
2168: if (!issorted[i]) proc = 0;
2169: row = irow_i[j];
2170: while (row >= C->rmap->range[proc+1]) proc++;
2171: w4[proc]++;
2172: row2proc_i[j] = proc; /* map row index to proc */
2173: }
2174: for (j=0; j<size; j++) {
2175: if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}
2176: }
2177: }
2179: nrqs = 0; /* no of outgoing messages */
2180: msz = 0; /* total mesg length (for all procs) */
2181: w1[rank] = 0; /* no mesg sent to self */
2182: w3[rank] = 0;
2183: for (i=0; i<size; i++) {
2184: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2185: }
2186:     PetscMalloc1(nrqs,&pa); /* (proc-array) */
2187: for (i=0,j=0; i<size; i++) {
2188: if (w1[i]) { pa[j] = i; j++; }
2189: }
2191: /* Each message would have a header = 1 + 2*(no of IS) + data */
2192: for (i=0; i<nrqs; i++) {
2193: j = pa[i];
2194: w1[j] += w2[j] + 2* w3[j];
2195: msz += w1[j];
2196: }
2197: PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz);
2199:     /* Determine the number of messages to expect, their lengths, and the from-ids */
2200: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
2201: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
2203: /* Now post the Irecvs corresponding to these messages */
2204: PetscObjectGetNewTag((PetscObject)C,&tag0);
2205: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
2207: /* Allocate Memory for outgoing messages */
2208: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
2209: PetscArrayzero(sbuf1,size);
2210: PetscArrayzero(ptr,size);
2212: {
2213: PetscInt *iptr = tmp;
2214: k = 0;
2215: for (i=0; i<nrqs; i++) {
2216: j = pa[i];
2217: iptr += k;
2218: sbuf1[j] = iptr;
2219: k = w1[j];
2220: }
2221: }
2223: /* Form the outgoing messages. Initialize the header space */
2224: for (i=0; i<nrqs; i++) {
2225: j = pa[i];
2226: sbuf1[j][0] = 0;
2227: PetscArrayzero(sbuf1[j]+1,2*w3[j]);
2228: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
2229: }
2231: /* Parse the isrow and copy data into outbuf */
2232: for (i=0; i<ismax; i++) {
2233: row2proc_i = row2proc[i];
2234: PetscArrayzero(ctr,size);
2235: irow_i = irow[i];
2236: jmax = nrow[i];
2237: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
2238: proc = row2proc_i[j];
2239: if (proc != rank) { /* copy to the outgoing buf*/
2240: ctr[proc]++;
2241: *ptr[proc] = irow_i[j];
2242: ptr[proc]++;
2243: }
2244: }
2245: /* Update the headers for the current IS */
2246: for (j=0; j<size; j++) { /* Can Optimise this loop too */
2247: if ((ctr_j = ctr[j])) {
2248: sbuf1_j = sbuf1[j];
2249: k = ++sbuf1_j[0];
2250: sbuf1_j[2*k] = ctr_j;
2251: sbuf1_j[2*k-1] = i;
2252: }
2253: }
2254: }
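    /* At this point each outgoing buffer sbuf1[j] has the layout
         [ nis | is_1, cnt_1 | ... | is_nis, cnt_nis | rows for is_1 | ... | rows for is_nis ],
       where nis = sbuf1[j][0] is the number of index sets that request rows owned by proc j and
       (is_k, cnt_k) = (sbuf1[j][2k-1], sbuf1[j][2k]) give the IS id and its row count. */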
2256: /* Now post the sends */
2257: PetscMalloc1(nrqs,&s_waits1);
2258: for (i=0; i<nrqs; ++i) {
2259: j = pa[i];
2260: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
2261: }
2263: /* Post Receives to capture the buffer size */
2264: PetscMalloc1(nrqs,&r_waits2);
2265: PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);
2266: if (nrqs) rbuf2[0] = tmp + msz;
2267: for (i=1; i<nrqs; ++i) {
2268: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2269: }
2270: for (i=0; i<nrqs; ++i) {
2271: j = pa[i];
2272: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);
2273: }
2275: /* Send to other procs the buf size they should allocate */
2276: /* Receive messages*/
2277: PetscMalloc1(nrqr,&s_waits2);
2278: PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
2279: {
2280: PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2281: PetscInt *sbuf2_i;
2283: MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE);
2284: for (i=0; i<nrqr; ++i) {
2285: req_size[i] = 0;
2286: rbuf1_i = rbuf1[i];
2287: start = 2*rbuf1_i[0] + 1;
2288: end = olengths1[i];
2289: PetscMalloc1(end,&sbuf2[i]);
2290: sbuf2_i = sbuf2[i];
2291: for (j=start; j<end; j++) {
2292: id = rbuf1_i[j] - rstart;
2293: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2294: sbuf2_i[j] = ncols;
2295: req_size[i] += ncols;
2296: }
2297: req_source1[i] = onodes1[i];
2298: /* form the header */
2299: sbuf2_i[0] = req_size[i];
2300: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2302: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
2303: }
2304: }
2306: PetscFree(onodes1);
2307: PetscFree(olengths1);
2308: PetscFree(r_waits1);
2309: PetscFree4(w1,w2,w3,w4);
2311: /* Receive messages*/
2312: PetscMalloc1(nrqs,&r_waits3);
2313: MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE);
2314: for (i=0; i<nrqs; ++i) {
2315: PetscMalloc1(rbuf2[i][0],&rbuf3[i]);
2316: req_source2[i] = pa[i];
2317: MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
2318: }
2319: PetscFree(r_waits2);
2321: /* Wait on sends1 and sends2 */
2322: MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);
2323: MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);
2324: PetscFree(s_waits1);
2325: PetscFree(s_waits2);
2327: /* Now allocate sending buffers for a->j, and send them off */
2328: PetscMalloc1(nrqr,&sbuf_aj);
2329: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2330: if (nrqr) PetscMalloc1(j,&sbuf_aj[0]);
2331: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2333: PetscMalloc1(nrqr,&s_waits3);
2334: {
2335: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2336: PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2337: PetscInt cend = C->cmap->rend;
2338: PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2340: for (i=0; i<nrqr; i++) {
2341: rbuf1_i = rbuf1[i];
2342: sbuf_aj_i = sbuf_aj[i];
2343: ct1 = 2*rbuf1_i[0] + 1;
2344: ct2 = 0;
2345: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2346: kmax = rbuf1[i][2*j];
2347: for (k=0; k<kmax; k++,ct1++) {
2348: row = rbuf1_i[ct1] - rstart;
2349: nzA = a_i[row+1] - a_i[row];
2350: nzB = b_i[row+1] - b_i[row];
2351: ncols = nzA + nzB;
2352: cworkA = a_j + a_i[row];
2353: cworkB = b_j + b_i[row];
2355: /* load the column indices for this row into cols */
2356: cols = sbuf_aj_i + ct2;
2358: lwrite = 0;
2359: for (l=0; l<nzB; l++) {
2360: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2361: }
2362: for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2363: for (l=0; l<nzB; l++) {
2364: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2365: }
2367: ct2 += ncols;
2368: }
2369: }
2370: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
2371: }
2372: }
2374: /* create col map: global col of C -> local col of submatrices */
2375: {
2376: const PetscInt *icol_i;
2377: #if defined(PETSC_USE_CTABLE)
2378: for (i=0; i<ismax; i++) {
2379: if (!allcolumns[i]) {
2380: PetscTableCreate(ncol[i],C->cmap->N,&cmap[i]);
2382: jmax = ncol[i];
2383: icol_i = icol[i];
2384: cmap_i = cmap[i];
2385: for (j=0; j<jmax; j++) {
2386: PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);
2387: }
2388: } else cmap[i] = NULL;
2389: }
2390: #else
2391: for (i=0; i<ismax; i++) {
2392: if (!allcolumns[i]) {
2393: PetscCalloc1(C->cmap->N,&cmap[i]);
2394: jmax = ncol[i];
2395: icol_i = icol[i];
2396: cmap_i = cmap[i];
2397: for (j=0; j<jmax; j++) {
2398: cmap_i[icol_i[j]] = j+1;
2399: }
2400: } else cmap[i] = NULL;
2401: }
2402: #endif
2403: }
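  /* Note: the column maps store localcol+1, so a lookup result of 0 means "column not in this submatrix";
     this is why the code below tests tcol and then uses --tcol / tcol-1. */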
2405: /* Create lens which is required for MatCreate... */
2406: for (i=0,j=0; i<ismax; i++) j += nrow[i];
2407: PetscMalloc1(ismax,&lens);
2409: if (ismax) {
2410: PetscCalloc1(j,&lens[0]);
2411: }
2412: for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2414: /* Update lens from local data */
2415: for (i=0; i<ismax; i++) {
2416: row2proc_i = row2proc[i];
2417: jmax = nrow[i];
2418: if (!allcolumns[i]) cmap_i = cmap[i];
2419: irow_i = irow[i];
2420: lens_i = lens[i];
2421: for (j=0; j<jmax; j++) {
2422: row = irow_i[j];
2423: proc = row2proc_i[j];
2424: if (proc == rank) {
2425: MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);
2426: if (!allcolumns[i]) {
2427: for (k=0; k<ncols; k++) {
2428: #if defined(PETSC_USE_CTABLE)
2429: PetscTableFind(cmap_i,cols[k]+1,&tcol);
2430: #else
2431: tcol = cmap_i[cols[k]];
2432: #endif
2433: if (tcol) lens_i[j]++;
2434: }
2435: } else { /* allcolumns */
2436: lens_i[j] = ncols;
2437: }
2438: MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);
2439: }
2440: }
2441: }
2443: /* Create row map: global row of C -> local row of submatrices */
2444: #if defined(PETSC_USE_CTABLE)
2445: for (i=0; i<ismax; i++) {
2446: PetscTableCreate(nrow[i],C->rmap->N,&rmap[i]);
2447: irow_i = irow[i];
2448: jmax = nrow[i];
2449: for (j=0; j<jmax; j++) {
2450: PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);
2451: }
2452: }
2453: #else
2454: for (i=0; i<ismax; i++) {
2455: PetscCalloc1(C->rmap->N,&rmap[i]);
2456: rmap_i = rmap[i];
2457: irow_i = irow[i];
2458: jmax = nrow[i];
2459: for (j=0; j<jmax; j++) {
2460: rmap_i[irow_i[j]] = j;
2461: }
2462: }
2463: #endif
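  /* Note: with PETSC_USE_CTABLE the row map stores localrow+1 (hence the row-- after each PetscTableFind),
     while the plain-array variant stores the 0-based local row directly. */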
2465:   /* Update lens from off-proc data */
2466: {
2467: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2469: MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE);
2470: for (tmp2=0; tmp2<nrqs; tmp2++) {
2471: sbuf1_i = sbuf1[pa[tmp2]];
2472: jmax = sbuf1_i[0];
2473: ct1 = 2*jmax+1;
2474: ct2 = 0;
2475: rbuf2_i = rbuf2[tmp2];
2476: rbuf3_i = rbuf3[tmp2];
2477: for (j=1; j<=jmax; j++) {
2478: is_no = sbuf1_i[2*j-1];
2479: max1 = sbuf1_i[2*j];
2480: lens_i = lens[is_no];
2481: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2482: rmap_i = rmap[is_no];
2483: for (k=0; k<max1; k++,ct1++) {
2484: #if defined(PETSC_USE_CTABLE)
2485: PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
2486: row--;
2488: #else
2489:           row = rmap_i[sbuf1_i[ct1]]; /* the local row index in the submatrix */
2490: #endif
2491: max2 = rbuf2_i[ct1];
2492: for (l=0; l<max2; l++,ct2++) {
2493: if (!allcolumns[is_no]) {
2494: #if defined(PETSC_USE_CTABLE)
2495: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
2496: #else
2497: tcol = cmap_i[rbuf3_i[ct2]];
2498: #endif
2499: if (tcol) lens_i[row]++;
2500: } else { /* allcolumns */
2501: lens_i[row]++; /* lens_i[row] += max2 ? */
2502: }
2503: }
2504: }
2505: }
2506: }
2507: }
2508: PetscFree(r_waits3);
2509: MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE);
2510: PetscFree(s_waits3);
2512: /* Create the submatrices */
2513: for (i=0; i<ismax; i++) {
2514: PetscInt rbs,cbs;
2516: ISGetBlockSize(isrow[i],&rbs);
2517: ISGetBlockSize(iscol[i],&cbs);
2519: MatCreate(PETSC_COMM_SELF,submats+i);
2520: MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);
2522: MatSetBlockSizes(submats[i],rbs,cbs);
2523: MatSetType(submats[i],((PetscObject)A)->type_name);
2524: MatSeqAIJSetPreallocation(submats[i],0,lens[i]);
2526:     /* create struct Mat_SubSppt and attach it to submat */
2527: PetscNew(&smat_i);
2528: subc = (Mat_SeqAIJ*)submats[i]->data;
2529: subc->submatis1 = smat_i;
2531: smat_i->destroy = submats[i]->ops->destroy;
2532: submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2533: submats[i]->factortype = C->factortype;
2535: smat_i->id = i;
2536: smat_i->nrqs = nrqs;
2537: smat_i->nrqr = nrqr;
2538: smat_i->rbuf1 = rbuf1;
2539: smat_i->rbuf2 = rbuf2;
2540: smat_i->rbuf3 = rbuf3;
2541: smat_i->sbuf2 = sbuf2;
2542: smat_i->req_source2 = req_source2;
2544: smat_i->sbuf1 = sbuf1;
2545: smat_i->ptr = ptr;
2546: smat_i->tmp = tmp;
2547: smat_i->ctr = ctr;
2549: smat_i->pa = pa;
2550: smat_i->req_size = req_size;
2551: smat_i->req_source1 = req_source1;
2553: smat_i->allcolumns = allcolumns[i];
2554: smat_i->singleis = PETSC_FALSE;
2555: smat_i->row2proc = row2proc[i];
2556: smat_i->rmap = rmap[i];
2557: smat_i->cmap = cmap[i];
2558: }
2560:   if (!ismax) { /* Create a dummy submats[0] so that the communication struct can be reused */
2561: MatCreate(PETSC_COMM_SELF,&submats[0]);
2562: MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);
2563: MatSetType(submats[0],MATDUMMY);
2565:     /* create struct Mat_SubSppt and attach it to submat */
2566: PetscNewLog(submats[0],&smat_i);
2567: submats[0]->data = (void*)smat_i;
2569: smat_i->destroy = submats[0]->ops->destroy;
2570: submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2571: submats[0]->factortype = C->factortype;
2573: smat_i->id = 0;
2574: smat_i->nrqs = nrqs;
2575: smat_i->nrqr = nrqr;
2576: smat_i->rbuf1 = rbuf1;
2577: smat_i->rbuf2 = rbuf2;
2578: smat_i->rbuf3 = rbuf3;
2579: smat_i->sbuf2 = sbuf2;
2580: smat_i->req_source2 = req_source2;
2582: smat_i->sbuf1 = sbuf1;
2583: smat_i->ptr = ptr;
2584: smat_i->tmp = tmp;
2585: smat_i->ctr = ctr;
2587: smat_i->pa = pa;
2588: smat_i->req_size = req_size;
2589: smat_i->req_source1 = req_source1;
2591: smat_i->allcolumns = PETSC_FALSE;
2592: smat_i->singleis = PETSC_FALSE;
2593: smat_i->row2proc = NULL;
2594: smat_i->rmap = NULL;
2595: smat_i->cmap = NULL;
2596: }
2598: if (ismax) PetscFree(lens[0]);
2599: PetscFree(lens);
2600: if (sbuf_aj) {
2601: PetscFree(sbuf_aj[0]);
2602: PetscFree(sbuf_aj);
2603: }
2605:   } /* end of scall == MAT_INITIAL_MATRIX */
2607: /* Post recv matrix values */
2608: PetscObjectGetNewTag((PetscObject)C,&tag4);
2609: PetscMalloc1(nrqs,&rbuf4);
2610: PetscMalloc1(nrqs,&r_waits4);
2611: for (i=0; i<nrqs; ++i) {
2612: PetscMalloc1(rbuf2[i][0],&rbuf4[i]);
2613: MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
2614: }
2616: /* Allocate sending buffers for a->a, and send them off */
2617: PetscMalloc1(nrqr,&sbuf_aa);
2618: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2619: if (nrqr) PetscMalloc1(j,&sbuf_aa[0]);
2620: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2622: PetscMalloc1(nrqr,&s_waits4);
2623: {
2624: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2625: PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2626: PetscInt cend = C->cmap->rend;
2627: PetscInt *b_j = b->j;
2628: PetscScalar *vworkA,*vworkB,*a_a,*b_a;
2630: MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);
2631: MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);
2632: for (i=0; i<nrqr; i++) {
2633: rbuf1_i = rbuf1[i];
2634: sbuf_aa_i = sbuf_aa[i];
2635: ct1 = 2*rbuf1_i[0]+1;
2636: ct2 = 0;
2637: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2638: kmax = rbuf1_i[2*j];
2639: for (k=0; k<kmax; k++,ct1++) {
2640: row = rbuf1_i[ct1] - rstart;
2641: nzA = a_i[row+1] - a_i[row];
2642: nzB = b_i[row+1] - b_i[row];
2643: ncols = nzA + nzB;
2644: cworkB = b_j + b_i[row];
2645: vworkA = a_a + a_i[row];
2646: vworkB = b_a + b_i[row];
2648: /* load the column values for this row into vals*/
2649: vals = sbuf_aa_i+ct2;
2651: lwrite = 0;
2652: for (l=0; l<nzB; l++) {
2653: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2654: }
2655: for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2656: for (l=0; l<nzB; l++) {
2657: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2658: }
2660: ct2 += ncols;
2661: }
2662: }
2663: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
2664: }
2665: MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);
2666: MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);
2667: }
2669: /* Assemble the matrices */
2670: /* First assemble the local rows */
2671: for (i=0; i<ismax; i++) {
2672: row2proc_i = row2proc[i];
2673: subc = (Mat_SeqAIJ*)submats[i]->data;
2674: imat_ilen = subc->ilen;
2675: imat_j = subc->j;
2676: imat_i = subc->i;
2677: imat_a = subc->a;
2679: if (!allcolumns[i]) cmap_i = cmap[i];
2680: rmap_i = rmap[i];
2681: irow_i = irow[i];
2682: jmax = nrow[i];
2683: for (j=0; j<jmax; j++) {
2684: row = irow_i[j];
2685: proc = row2proc_i[j];
2686: if (proc == rank) {
2687: old_row = row;
2688: #if defined(PETSC_USE_CTABLE)
2689: PetscTableFind(rmap_i,row+1,&row);
2690: row--;
2691: #else
2692: row = rmap_i[row];
2693: #endif
2694: ilen_row = imat_ilen[row];
2695: MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
2696: mat_i = imat_i[row];
2697: mat_a = imat_a + mat_i;
2698: mat_j = imat_j + mat_i;
2699: if (!allcolumns[i]) {
2700: for (k=0; k<ncols; k++) {
2701: #if defined(PETSC_USE_CTABLE)
2702: PetscTableFind(cmap_i,cols[k]+1,&tcol);
2703: #else
2704: tcol = cmap_i[cols[k]];
2705: #endif
2706: if (tcol) {
2707: *mat_j++ = tcol - 1;
2708: *mat_a++ = vals[k];
2709: ilen_row++;
2710: }
2711: }
2712: } else { /* allcolumns */
2713: for (k=0; k<ncols; k++) {
2714: *mat_j++ = cols[k]; /* global col index! */
2715: *mat_a++ = vals[k];
2716: ilen_row++;
2717: }
2718: }
2719: MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
2721: imat_ilen[row] = ilen_row;
2722: }
2723: }
2724: }
2726:   /* Now assemble the off-proc rows */
2727: MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE);
2728: for (tmp2=0; tmp2<nrqs; tmp2++) {
2729: sbuf1_i = sbuf1[pa[tmp2]];
2730: jmax = sbuf1_i[0];
2731: ct1 = 2*jmax + 1;
2732: ct2 = 0;
2733: rbuf2_i = rbuf2[tmp2];
2734: rbuf3_i = rbuf3[tmp2];
2735: rbuf4_i = rbuf4[tmp2];
2736: for (j=1; j<=jmax; j++) {
2737: is_no = sbuf1_i[2*j-1];
2738: rmap_i = rmap[is_no];
2739: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2740: subc = (Mat_SeqAIJ*)submats[is_no]->data;
2741: imat_ilen = subc->ilen;
2742: imat_j = subc->j;
2743: imat_i = subc->i;
2744: imat_a = subc->a;
2745: max1 = sbuf1_i[2*j];
2746: for (k=0; k<max1; k++,ct1++) {
2747: row = sbuf1_i[ct1];
2748: #if defined(PETSC_USE_CTABLE)
2749: PetscTableFind(rmap_i,row+1,&row);
2750: row--;
2751: #else
2752: row = rmap_i[row];
2753: #endif
2754: ilen = imat_ilen[row];
2755: mat_i = imat_i[row];
2756: mat_a = imat_a + mat_i;
2757: mat_j = imat_j + mat_i;
2758: max2 = rbuf2_i[ct1];
2759: if (!allcolumns[is_no]) {
2760: for (l=0; l<max2; l++,ct2++) {
2761: #if defined(PETSC_USE_CTABLE)
2762: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
2763: #else
2764: tcol = cmap_i[rbuf3_i[ct2]];
2765: #endif
2766: if (tcol) {
2767: *mat_j++ = tcol - 1;
2768: *mat_a++ = rbuf4_i[ct2];
2769: ilen++;
2770: }
2771: }
2772: } else { /* allcolumns */
2773: for (l=0; l<max2; l++,ct2++) {
2774: *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2775: *mat_a++ = rbuf4_i[ct2];
2776: ilen++;
2777: }
2778: }
2779: imat_ilen[row] = ilen;
2780: }
2781: }
2782: }
2784: if (!iscsorted) { /* sort column indices of the rows */
2785: for (i=0; i<ismax; i++) {
2786: subc = (Mat_SeqAIJ*)submats[i]->data;
2787: imat_j = subc->j;
2788: imat_i = subc->i;
2789: imat_a = subc->a;
2790: imat_ilen = subc->ilen;
2792: if (allcolumns[i]) continue;
2793: jmax = nrow[i];
2794: for (j=0; j<jmax; j++) {
2795: mat_i = imat_i[j];
2796: mat_a = imat_a + mat_i;
2797: mat_j = imat_j + mat_i;
2798: PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);
2799: }
2800: }
2801: }
2803: PetscFree(r_waits4);
2804: MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE);
2805: PetscFree(s_waits4);
2807: /* Restore the indices */
2808: for (i=0; i<ismax; i++) {
2809: ISRestoreIndices(isrow[i],irow+i);
2810: if (!allcolumns[i]) {
2811: ISRestoreIndices(iscol[i],icol+i);
2812: }
2813: }
2815: for (i=0; i<ismax; i++) {
2816: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
2817: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
2818: }
2820: /* Destroy allocated memory */
2821: if (sbuf_aa) {
2822: PetscFree(sbuf_aa[0]);
2823: PetscFree(sbuf_aa);
2824: }
2825: PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);
2827: for (i=0; i<nrqs; ++i) {
2828: PetscFree(rbuf4[i]);
2829: }
2830: PetscFree(rbuf4);
2832: PetscFree4(row2proc,cmap,rmap,allcolumns);
2833: return 0;
2834: }
2836: /*
2837: Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2838: Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2839:   of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2840: If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2841: After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2842: state, and needs to be "assembled" later by compressing B's column space.
2844: This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2845: Following this call, C->A & C->B have been created, even if empty.
2846: */
2847: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2848: {
2849: /* If making this function public, change the error returned in this function away from _PLIB. */
2850: Mat_MPIAIJ *aij;
2851: Mat_SeqAIJ *Baij;
2852: PetscBool seqaij,Bdisassembled;
2853: PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2854: PetscScalar v;
2855: const PetscInt *rowindices,*colindices;
2857: /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2858: if (A) {
2859: PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);
2861: if (rowemb) {
2862: ISGetLocalSize(rowemb,&m);
2864: } else {
2865: if (C->rmap->n != A->rmap->n) {
2866: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2867: }
2868: }
2869: if (dcolemb) {
2870: ISGetLocalSize(dcolemb,&n);
2872: } else {
2874: }
2875: }
2876: if (B) {
2877: PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);
2879: if (rowemb) {
2880: ISGetLocalSize(rowemb,&m);
2882: } else {
2883: if (C->rmap->n != B->rmap->n) {
2884: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2885: }
2886: }
2887: if (ocolemb) {
2888: ISGetLocalSize(ocolemb,&n);
2890: } else {
2892: }
2893: }
2895: aij = (Mat_MPIAIJ*)C->data;
2896: if (!aij->A) {
2897: /* Mimic parts of MatMPIAIJSetPreallocation() */
2898: MatCreate(PETSC_COMM_SELF,&aij->A);
2899: MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);
2900: MatSetBlockSizesFromMats(aij->A,C,C);
2901: MatSetType(aij->A,MATSEQAIJ);
2902: PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);
2903: }
2904: if (A) {
2905: MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);
2906: } else {
2907: MatSetUp(aij->A);
2908: }
2909: if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2910: /*
2911: If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2912: need to "disassemble" B -- convert it to using C's global indices.
2913: To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2915: If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2916:      we call MatZeroEntries() on aij->B first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2918: TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2919: At least avoid calling MatSetValues() and the implied searches?
2920: */
2922: if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2923: #if defined(PETSC_USE_CTABLE)
2924: PetscTableDestroy(&aij->colmap);
2925: #else
2926: PetscFree(aij->colmap);
2927:     /* A bit of a HACK: ideally we should deal with the aij->B case all in one code block below. */
2928: if (aij->B) {
2929: PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));
2930: }
2931: #endif
2932: ngcol = 0;
2933: if (aij->lvec) {
2934: VecGetSize(aij->lvec,&ngcol);
2935: }
2936: if (aij->garray) {
2937: PetscFree(aij->garray);
2938: PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));
2939: }
2940: VecDestroy(&aij->lvec);
2941: VecScatterDestroy(&aij->Mvctx);
2942: }
2943: if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2944: MatDestroy(&aij->B);
2945: }
2946: if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2947: MatZeroEntries(aij->B);
2948: }
2949: }
2950: Bdisassembled = PETSC_FALSE;
2951: if (!aij->B) {
2952: MatCreate(PETSC_COMM_SELF,&aij->B);
2953: PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);
2954: MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);
2955: MatSetBlockSizesFromMats(aij->B,B,B);
2956: MatSetType(aij->B,MATSEQAIJ);
2957: Bdisassembled = PETSC_TRUE;
2958: }
2959: if (B) {
2960: Baij = (Mat_SeqAIJ*)B->data;
2961: if (pattern == DIFFERENT_NONZERO_PATTERN) {
2962: PetscMalloc1(B->rmap->n,&nz);
2963: for (i=0; i<B->rmap->n; i++) {
2964: nz[i] = Baij->i[i+1] - Baij->i[i];
2965: }
2966: MatSeqAIJSetPreallocation(aij->B,0,nz);
2967: PetscFree(nz);
2968: }
2970: PetscLayoutGetRange(C->rmap,&rstart,&rend);
2971: shift = rend-rstart;
2972: count = 0;
2973: rowindices = NULL;
2974: colindices = NULL;
2975: if (rowemb) {
2976: ISGetIndices(rowemb,&rowindices);
2977: }
2978: if (ocolemb) {
2979: ISGetIndices(ocolemb,&colindices);
2980: }
2981: for (i=0; i<B->rmap->n; i++) {
2982: PetscInt row;
2983: row = i;
2984: if (rowindices) row = rowindices[i];
2985: for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
2986: col = Baij->j[count];
2987: if (colindices) col = colindices[col];
2988: if (Bdisassembled && col>=rstart) col += shift;
2989: v = Baij->a[count];
2990: MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);
2991: ++count;
2992: }
2993: }
2994: /* No assembly for aij->B is necessary. */
2995: /* FIXME: set aij->B's nonzerostate correctly. */
2996: } else {
2997: MatSetUp(aij->B);
2998: }
2999: C->preallocated = PETSC_TRUE;
3000: C->was_assembled = PETSC_FALSE;
3001: C->assembled = PETSC_FALSE;
3002: /*
3003: C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3004: Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3005: */
3006: return 0;
3007: }
3009: /*
3010: B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
3011: */
3012: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3013: {
3014: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3018: /* FIXME: make sure C is assembled */
3019: *A = aij->A;
3020: *B = aij->B;
3021: /* Note that we don't incref *A and *B, so be careful! */
3022: return 0;
3023: }
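/*
   A small illustrative sketch of that interpretation: with aij = (Mat_MPIAIJ*)C->data, an entry of
   *B stored at local column bcol corresponds to global column aij->garray[bcol] of C.
*/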
3025: /*
3026: Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3027: NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3028: */
3029: PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3030: PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3031: PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3032: PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3033: PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3034: {
3035: PetscMPIInt size,flag;
3036: PetscInt i,ii,cismax,ispar;
3037: Mat *A,*B;
3038: IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3040: if (!ismax) return 0;
3042: for (i = 0, cismax = 0; i < ismax; ++i) {
3043: MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);
3045: MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3046: if (size > 1) ++cismax;
3047: }
3049: /*
3050: If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3051:     ispar is nonzero if any rank in C's comm has a parallel IS.
3052: */
3053: MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
3054: if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3055: (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);
3056: return 0;
3057: }
3059: /* if (ispar) */
3060: /*
3061: Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3062: These are used to extract the off-diag portion of the resulting parallel matrix.
3063: The row IS for the off-diag portion is the same as for the diag portion,
3064: so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3065: */
3066: PetscMalloc2(cismax,&cisrow,cismax,&ciscol);
3067: PetscMalloc1(cismax,&ciscol_p);
3068: for (i = 0, ii = 0; i < ismax; ++i) {
3069: MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);
3070: if (size > 1) {
3071: /*
3072: TODO: This is the part that's ***NOT SCALABLE***.
3073: To fix this we need to extract just the indices of C's nonzero columns
3074: that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3075: part of iscol[i] -- without actually computing ciscol[ii]. This also has
3076: to be done without serializing on the IS list, so, most likely, it is best
3077: done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3078: */
3079: ISGetNonlocalIS(iscol[i],&(ciscol[ii]));
3080: /* Now we have to
3081: (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3082: were sorted on each rank, concatenated they might no longer be sorted;
3083: (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3084: indices in the nondecreasing order to the original index positions.
3085: If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3086: */
3087: ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);
3088: ISSort(ciscol[ii]);
3089: ++ii;
3090: }
3091: }
3092: PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);
3093: for (i = 0, ii = 0; i < ismax; ++i) {
3094: PetscInt j,issize;
3095: const PetscInt *indices;
3097: /*
3098: Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3099: */
3100: ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);
3101: ISSort(isrow[i]);
3102: ISGetLocalSize(isrow[i],&issize);
3103: ISGetIndices(isrow[i],&indices);
3104: for (j = 1; j < issize; ++j) {
3106: }
3107: ISRestoreIndices(isrow[i],&indices);
3108: ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);
3109: ISSort(iscol[i]);
3110: ISGetLocalSize(iscol[i],&issize);
3111: ISGetIndices(iscol[i],&indices);
3112: for (j = 1; j < issize; ++j) {
3114: }
3115: ISRestoreIndices(iscol[i],&indices);
3116: MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);
3117: if (size > 1) {
3118: cisrow[ii] = isrow[i];
3119: ++ii;
3120: }
3121: }
3122: /*
3123: Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3124: array of sequential matrices underlying the resulting parallel matrices.
3125: Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3126: contain duplicates.
3128: There are as many diag matrices as there are original index sets. There are only as many parallel
3129: and off-diag matrices, as there are parallel (comm size > 1) index sets.
3131: ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3132: - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3133: and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3134:      will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3135: with A[i] and B[ii] extracted from the corresponding MPI submat.
3136: - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3137: will have a different order from what getsubmats_seq expects. To handle this case -- indicated
3138: by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3139:      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3140: values into A[i] and B[ii] sitting inside the corresponding submat.
3141: - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3142: A[i], B[ii], AA[i] or BB[ii] matrices.
3143: */
3144: /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3145: if (scall == MAT_INITIAL_MATRIX) {
3146: PetscMalloc1(ismax,submat);
3147: }
3149: /* Now obtain the sequential A and B submatrices separately. */
3150: /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3151: (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);
3152: (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);
3154: /*
3155: If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3156: matrices A & B have been extracted directly into the parallel matrices containing them, or
3157: simply into the sequential matrix identical with the corresponding A (if size == 1).
3158: Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3159: to have the same sparsity pattern.
3160: Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3161: must be constructed for C. This is done by setseqmat(s).
3162: */
3163: for (i = 0, ii = 0; i < ismax; ++i) {
3164: /*
3165: TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3166: That way we can avoid sorting and computing permutations when reusing.
3167: To this end:
3168: - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3169: - if caching arrays to hold the ISs, make and compose a container for them so that it can
3170: be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3171: */
3172: MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3174: MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);
3175: /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3176: if (size > 1) {
3177: if (scall == MAT_INITIAL_MATRIX) {
3178: MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);
3179: MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
3180: MatSetType((*submat)[i],MATMPIAIJ);
3181: PetscLayoutSetUp((*submat)[i]->rmap);
3182: PetscLayoutSetUp((*submat)[i]->cmap);
3183: }
3184: /*
3185: For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3186: */
3187: {
3188: Mat AA = A[i],BB = B[ii];
3190: if (AA || BB) {
3191: setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);
3192: MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);
3193: MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);
3194: }
3195: MatDestroy(&AA);
3196: }
3197: ISDestroy(ciscol+ii);
3198: ISDestroy(ciscol_p+ii);
3199: ++ii;
3200: } else { /* if (size == 1) */
3201: if (scall == MAT_REUSE_MATRIX) {
3202: MatDestroy(&(*submat)[i]);
3203: }
3204: if (isrow_p[i] || iscol_p[i]) {
3205: MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);
3206: setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);
3207:         /* Otherwise A is extracted straight into (*submat)[i]. */
3208:         /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3209: MatDestroy(A+i);
3210: } else (*submat)[i] = A[i];
3211: }
3212: ISDestroy(&isrow_p[i]);
3213: ISDestroy(&iscol_p[i]);
3214: }
3215: PetscFree2(cisrow,ciscol);
3216: PetscFree2(isrow_p,iscol_p);
3217: PetscFree(ciscol_p);
3218: PetscFree(A);
3219: MatDestroySubMatrices(cismax,&B);
3220: return 0;
3221: }
3223: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3224: {
3225: MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);
3226: return 0;
3227: }
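/*
   A minimal calling sketch (illustrative only, not part of this file) of the MPI path above, assuming
   an assembled MPIAIJ matrix C and length-1 IS arrays isrow[], iscol[] whose entries live on a
   communicator with more than one rank (for example C's own communicator):

     Mat *subs;
     MatCreateSubMatricesMPI(C, 1, isrow, iscol, MAT_INITIAL_MATRIX, &subs);  // subs[0] is MPIAIJ on isrow[0]'s comm
     ...
     MatDestroySubMatrices(1, &subs);
*/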