Actual source code: mpiov.c
petsc-3.10.5 2019-03-28
1: /*
2: Routines to compute overlapping regions of a parallel MPI matrix
  3:   and to find submatrices that are shared across processors.
4: */
5: #include <../src/mat/impls/aij/seq/aij.h>
6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
7: #include <petscbt.h>
8: #include <petscsf.h>
10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
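/*
   Both entry points below apply their single-level overlap routine ov times.
   Illustrative usage sketch (for illustration only: it assumes C is an assembled MPIAIJ
   matrix and the caller has created the index sets; these routines are normally reached
   through the public MatIncreaseOverlap() interface rather than called directly):

      IS is[2];
      ... build is[0], is[1] ...
      MatIncreaseOverlap(C,2,is,1);
*/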
22: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
23: {
25: PetscInt i;
28: if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29: for (i=0; i<ov; ++i) {
30: MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
31: }
32: return(0);
33: }
35: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
36: {
38: PetscInt i;
41: if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
42: for (i=0; i<ov; ++i) {
43: MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);
44: }
45: return(0);
46: }
49: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
50: {
52: MPI_Comm comm;
53: PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
54: PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
55: PetscInt nrecvrows,*sbsizes = 0,*sbdata = 0;
56: const PetscInt *indices_i,**indices;
57: PetscLayout rmap;
58: PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom;
59: PetscSF sf;
60: PetscSFNode *remote;
63: PetscObjectGetComm((PetscObject)mat,&comm);
64: MPI_Comm_rank(comm,&rank);
65: MPI_Comm_size(comm,&size);
66: /* get row map to determine where rows should be going */
67: MatGetLayouts(mat,&rmap,NULL);
68: /* retrieve IS data and put all together so that we
69: * can optimize communication
70: * */
71: PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);
72: for (i=0,tlength=0; i<nidx; i++){
73: ISGetLocalSize(is[i],&length[i]);
74: tlength += length[i];
75: ISGetIndices(is[i],&indices[i]);
76: }
77: /* find these rows on remote processors */
78: PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);
79: PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);
80: nrrows = 0;
81: for (i=0; i<nidx; i++) {
82: length_i = length[i];
83: indices_i = indices[i];
84: for (j=0; j<length_i; j++) {
85: owner = -1;
86: PetscLayoutFindOwner(rmap,indices_i[j],&owner);
87: /* remote processors */
88: if (owner != rank) {
89: tosizes_temp[owner]++; /* number of rows to owner */
90: rrow_ranks[nrrows] = owner; /* processor */
91: rrow_isids[nrrows] = i; /* is id */
92: remoterows[nrrows++] = indices_i[j]; /* row */
93: }
94: }
95: ISRestoreIndices(is[i],&indices[i]);
96: }
97: PetscFree2(*(PetscInt***)&indices,length);
98: /* test if we need to exchange messages
99: * generally speaking, we do not need to exchange
100: * data when overlap is 1
101: * */
102: MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);
 103:   /* we do not have any messages;
 104:    * this usually corresponds to overlap 1
 105:    * */
106: if (!reducednrrows) {
107: PetscFree3(toranks,tosizes,tosizes_temp);
108: PetscFree3(remoterows,rrow_ranks,rrow_isids);
109: MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
110: return(0);
111: }
112: nto = 0;
 113:   /* send sizes and ranks for building a two-sided communication */
114: for (i=0; i<size; i++) {
115: if (tosizes_temp[i]) {
116: tosizes[nto*2] = tosizes_temp[i]*2; /* size */
117: tosizes_temp[i] = nto; /* a map from processor to index */
118: toranks[nto++] = i; /* processor */
119: }
120: }
121: PetscCalloc1(nto+1,&toffsets);
122: for (i=0; i<nto; i++) {
123: toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */
124: tosizes[2*i+1] = toffsets[i]; /* offsets to send */
125: }
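  /* tosizes[2*i] is the number of integers destined for toranks[i] (two per requested row) and
     tosizes[2*i+1] is the offset of that block inside todata; both values are shipped to the
     receiver by PetscCommBuildTwoSided() below so it can address the sender's buffer */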
126: /* send information to other processors */
127: PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);
128: nrecvrows = 0;
129: for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
130: PetscMalloc1(nrecvrows,&remote);
131: nrecvrows = 0;
132: for (i=0; i<nfrom; i++) {
133: for (j=0; j<fromsizes[2*i]; j++) {
134: remote[nrecvrows].rank = fromranks[i];
135: remote[nrecvrows++].index = fromsizes[2*i+1]+j;
136: }
137: }
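  /* each receive slot references <sending rank, position in that rank's todata buffer>, so the
     PetscSFBcast below pulls the <is id, row> pairs from the senders' todata into fromdata */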
138: PetscSFCreate(comm,&sf);
139: PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
 140:   /* use two-sided communication by default since OpenMPI has some bugs in its one-sided implementation */
141: PetscSFSetType(sf,PETSCSFBASIC);
142: PetscSFSetFromOptions(sf);
143: /* message pair <no of is, row> */
144: PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);
145: for (i=0; i<nrrows; i++) {
146: owner = rrow_ranks[i]; /* processor */
147: j = tosizes_temp[owner]; /* index */
148: todata[toffsets[j]++] = rrow_isids[i];
149: todata[toffsets[j]++] = remoterows[i];
150: }
151: PetscFree3(toranks,tosizes,tosizes_temp);
152: PetscFree3(remoterows,rrow_ranks,rrow_isids);
153: PetscFree(toffsets);
154: PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);
155: PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);
156: PetscSFDestroy(&sf);
 157:   /* send rows belonging to remote processors so that we can get the overlapping data back */
158: MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);
159: PetscFree2(todata,fromdata);
160: PetscFree(fromsizes);
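  /* the communication pattern now reverses: ranks that received row requests (fromranks) send
     back the overlap data they computed, so a second two-sided exchange rebuilds the to/from lists */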
161: PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);
162: PetscFree(fromranks);
163: nrecvrows = 0;
164: for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
165: PetscCalloc1(nrecvrows,&todata);
166: PetscMalloc1(nrecvrows,&remote);
167: nrecvrows = 0;
168: for (i=0; i<nto; i++) {
169: for (j=0; j<tosizes[2*i]; j++) {
170: remote[nrecvrows].rank = toranks[i];
171: remote[nrecvrows++].index = tosizes[2*i+1]+j;
172: }
173: }
174: PetscSFCreate(comm,&sf);
175: PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
 176:   /* use two-sided communication by default since OpenMPI has some bugs in its one-sided implementation */
177: PetscSFSetType(sf,PETSCSFBASIC);
178: PetscSFSetFromOptions(sf);
179: /* overlap communication and computation */
180: PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);
181: MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
182: PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);
183: PetscSFDestroy(&sf);
184: PetscFree2(sbdata,sbsizes);
185: MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);
186: PetscFree(toranks);
187: PetscFree(tosizes);
188: PetscFree(todata);
189: return(0);
190: }
192: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
193: {
194: PetscInt *isz,isz_i,i,j,is_id, data_size;
195: PetscInt col,lsize,max_lsize,*indices_temp, *indices_i;
196: const PetscInt *indices_i_temp;
197: MPI_Comm *iscomms;
198: PetscErrorCode ierr;
201: max_lsize = 0;
202: PetscMalloc1(nidx,&isz);
203: for (i=0; i<nidx; i++){
204: ISGetLocalSize(is[i],&lsize);
205: max_lsize = lsize>max_lsize ? lsize:max_lsize;
206: isz[i] = lsize;
207: }
208: PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);
209: for (i=0; i<nidx; i++){
210: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);
211: ISGetIndices(is[i],&indices_i_temp);
212: PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);
213: ISRestoreIndices(is[i],&indices_i_temp);
214: ISDestroy(&is[i]);
215: }
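  /* indices_temp is laid out as nidx blocks of (max_lsize+nrecvs) entries, one block per IS;
     each block already holds the original indices of that IS and has room for every received row */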
216: /* retrieve information to get row id and its overlap */
217: for (i=0; i<nrecvs; ){
218: is_id = recvdata[i++];
219: data_size = recvdata[i++];
220: indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
221: isz_i = isz[is_id];
222: for (j=0; j< data_size; j++){
223: col = recvdata[i++];
224: indices_i[isz_i++] = col;
225: }
226: isz[is_id] = isz_i;
227: }
 228:   /* remove duplicate entries */
229: for (i=0; i<nidx; i++){
230: indices_i = indices_temp+(max_lsize+nrecvs)*i;
231: isz_i = isz[i];
232: PetscSortRemoveDupsInt(&isz_i,indices_i);
233: ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);
234: PetscCommDestroy(&iscomms[i]);
235: }
236: PetscFree(isz);
237: PetscFree2(indices_temp,iscomms);
238: return(0);
239: }
241: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
242: {
243: PetscLayout rmap,cmap;
244: PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
245: PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
246: PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
247: const PetscInt *gcols,*ai,*aj,*bi,*bj;
248: Mat amat,bmat;
249: PetscMPIInt rank;
250: PetscBool done;
251: MPI_Comm comm;
252: PetscErrorCode ierr;
255: PetscObjectGetComm((PetscObject)mat,&comm);
256: MPI_Comm_rank(comm,&rank);
257: MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
 258:   /* Even if the matrix is symmetric, we still do not exploit the symmetry here */
259: MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
 260:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
261: MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
 262:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
263: /* total number of nonzero values is used to estimate the memory usage in the next step */
264: tnz = ai[an]+bi[bn];
265: MatGetLayouts(mat,&rmap,&cmap);
266: PetscLayoutGetRange(rmap,&rstart,NULL);
267: PetscLayoutGetRange(cmap,&cstart,NULL);
268: /* to find the longest message */
269: max_fszs = 0;
270: for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
 271:   /* is there a better way to estimate the number of nonzeros in the mat? */
272: PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);
273: for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
274: rows_pos = 0;
275: totalrows = 0;
276: for (i=0; i<nfrom; i++){
277: PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);
278: /* group data together */
279: for (j=0; j<fromsizes[2*i]; j+=2){
280: is_id = fromrows[rows_pos++];/* no of is */
281: rows_i = rows_data[is_id];
282: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
283: }
284: /* estimate a space to avoid multiple allocations */
285: for (j=0; j<nidx; j++){
286: indvc_ij = 0;
287: rows_i = rows_data[j];
288: for (l=0; l<rows_pos_i[j]; l++){
289: row = rows_i[l]-rstart;
290: start = ai[row];
291: end = ai[row+1];
292: for (k=start; k<end; k++){ /* Amat */
293: col = aj[k] + cstart;
294: indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
295: }
296: start = bi[row];
297: end = bi[row+1];
298: for (k=start; k<end; k++) { /* Bmat */
299: col = gcols[bj[k]];
300: indices_tmp[indvc_ij++] = col;
301: }
302: }
303: PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
304: indv_counts[i*nidx+j] = indvc_ij;
305: totalrows += indvc_ij;
306: }
307: }
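  /* the pass above only counted the unique overlap columns per <sender, IS> pair (indv_counts)
     so that sbdata can be allocated once; the pass below regroups the requested rows and writes
     the actual <is id, count, columns> triples */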
308: /* message triple <no of is, number of rows, rows> */
309: PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);
310: totalrows = 0;
311: rows_pos = 0;
 312:   /* repeat the grouping of requested rows by IS, this time writing out the data */
313: for (i=0;i<nfrom;i++){
314: PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);
315: for (j=0; j<fromsizes[2*i]; j+=2){
316: is_id = fromrows[rows_pos++];
317: rows_i = rows_data[is_id];
318: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
319: }
320: /* add data */
321: for (j=0; j<nidx; j++){
322: if (!indv_counts[i*nidx+j]) continue;
323: indvc_ij = 0;
324: sbdata[totalrows++] = j;
325: sbdata[totalrows++] = indv_counts[i*nidx+j];
326: sbsizes[2*i] += 2;
327: rows_i = rows_data[j];
328: for (l=0; l<rows_pos_i[j]; l++){
329: row = rows_i[l]-rstart;
330: start = ai[row];
331: end = ai[row+1];
332: for (k=start; k<end; k++){ /* Amat */
333: col = aj[k] + cstart;
334: indices_tmp[indvc_ij++] = col;
335: }
336: start = bi[row];
337: end = bi[row+1];
338: for (k=start; k<end; k++) { /* Bmat */
339: col = gcols[bj[k]];
340: indices_tmp[indvc_ij++] = col;
341: }
342: }
343: PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
344: sbsizes[2*i] += indvc_ij;
345: PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);
346: totalrows += indvc_ij;
347: }
348: }
349: PetscCalloc1(nfrom+1,&offsets);
350: for (i=0; i<nfrom; i++){
351: offsets[i+1] = offsets[i] + sbsizes[2*i];
352: sbsizes[2*i+1] = offsets[i];
353: }
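  /* as in the first exchange, sbsizes[2*i+1] records the offset of rank fromranks[i]'s reply
     block inside sbdata; the caller hands these pairs to PetscCommBuildTwoSided() to route the
     replies back */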
354: PetscFree(offsets);
355: if (sbrowsizes) *sbrowsizes = sbsizes;
356: if (sbrows) *sbrows = sbdata;
357: PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);
358: MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
359: MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
360: return(0);
361: }
363: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
364: {
365: const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
366: PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
367: PetscInt lsize,lsize_tmp,owner;
368: PetscMPIInt rank;
369: Mat amat,bmat;
370: PetscBool done;
371: PetscLayout cmap,rmap;
372: MPI_Comm comm;
373: PetscErrorCode ierr;
376: PetscObjectGetComm((PetscObject)mat,&comm);
377: MPI_Comm_rank(comm,&rank);
378: MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
379: MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
 380:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
381: MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
 382:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
 383:   /* is this a safe way to compute the number of nonzero values? */
384: tnz = ai[an]+bi[bn];
385: MatGetLayouts(mat,&rmap,&cmap);
386: PetscLayoutGetRange(rmap,&rstart,NULL);
387: PetscLayoutGetRange(cmap,&cstart,NULL);
 388:   /* this is a better way to estimate memory than the old implementation,
 389:    * which used the global size of the matrix
 390:    * */
391: PetscMalloc1(tnz,&indices_temp);
392: for (i=0; i<nidx; i++) {
393: MPI_Comm iscomm;
395: ISGetLocalSize(is[i],&lsize);
396: ISGetIndices(is[i],&indices);
397: lsize_tmp = 0;
398: for (j=0; j<lsize; j++) {
399: owner = -1;
400: row = indices[j];
401: PetscLayoutFindOwner(rmap,row,&owner);
402: if (owner != rank) continue;
403: /* local number */
404: row -= rstart;
405: start = ai[row];
406: end = ai[row+1];
407: for (k=start; k<end; k++) { /* Amat */
408: col = aj[k] + cstart;
409: indices_temp[lsize_tmp++] = col;
410: }
411: start = bi[row];
412: end = bi[row+1];
413: for (k=start; k<end; k++) { /* Bmat */
414: col = gcols[bj[k]];
415: indices_temp[lsize_tmp++] = col;
416: }
417: }
418: ISRestoreIndices(is[i],&indices);
419: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);
420: ISDestroy(&is[i]);
421: PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);
422: ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);
423: PetscCommDestroy(&iscomm);
424: }
425: PetscFree(indices_temp);
426: MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
427: MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
428: return(0);
429: }
432: /*
433: Sample message format:
434: If a processor A wants processor B to process some elements corresponding
435: to index sets is[1],is[5]
436: mesg [0] = 2 (no of index sets in the mesg)
437: -----------
438: mesg [1] = 1 => is[1]
439: mesg [2] = sizeof(is[1]);
440: -----------
441: mesg [3] = 5 => is[5]
442: mesg [4] = sizeof(is[5]);
443: -----------
444: mesg [5]
 445:      mesg [n]   data(is[1])
446: -----------
447: mesg[n+1]
448: mesg[m] data(is[5])
449: -----------
451: Notes:
452: nrqs - no of requests sent (or to be sent out)
 453:   nrqr - no of requests received (which have to be, or which have been, processed)
454: */
455: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
456: {
457: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
458: PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
459: const PetscInt **idx,*idx_i;
460: PetscInt *n,**data,len;
461: #if defined(PETSC_USE_CTABLE)
462: PetscTable *table_data,table_data_i;
463: PetscInt *tdata,tcount,tcount_max;
464: #else
465: PetscInt *data_i,*d_p;
466: #endif
468: PetscMPIInt size,rank,tag1,tag2;
469: PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
470: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
471: PetscBT *table;
472: MPI_Comm comm;
473: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
474: MPI_Status *s_status,*recv_status;
475: MPI_Comm *iscomms;
476: char *t_p;
479: PetscObjectGetComm((PetscObject)C,&comm);
480: size = c->size;
481: rank = c->rank;
482: M = C->rmap->N;
484: PetscObjectGetNewTag((PetscObject)C,&tag1);
485: PetscObjectGetNewTag((PetscObject)C,&tag2);
487: PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);
489: for (i=0; i<imax; i++) {
490: ISGetIndices(is[i],&idx[i]);
491: ISGetLocalSize(is[i],&n[i]);
492: }
494: /* evaluate communication - mesg to who,length of mesg, and buffer space
495: required. Based on this, buffers are allocated, and data copied into them */
496: PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
497: PetscMemzero(w1,size*sizeof(PetscMPIInt)); /* initialise work vector*/
498: PetscMemzero(w2,size*sizeof(PetscMPIInt)); /* initialise work vector*/
499: PetscMemzero(w3,size*sizeof(PetscMPIInt)); /* initialise work vector*/
500: for (i=0; i<imax; i++) {
501: PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialise work vector*/
502: idx_i = idx[i];
503: len = n[i];
504: for (j=0; j<len; j++) {
505: row = idx_i[j];
506: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
507: PetscLayoutFindOwner(C->rmap,row,&proc);
508: w4[proc]++;
509: }
510: for (j=0; j<size; j++) {
511: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
512: }
513: }
515: nrqs = 0; /* no of outgoing messages */
 516:   msz      = 0;              /* total mesg length (for all procs) */
 517:   w1[rank] = 0;              /* no mesg sent to itself */
518: w3[rank] = 0;
519: for (i=0; i<size; i++) {
520: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
521: }
522: /* pa - is list of processors to communicate with */
523: PetscMalloc1(nrqs+1,&pa);
524: for (i=0,j=0; i<size; i++) {
525: if (w1[i]) {pa[j] = i; j++;}
526: }
528: /* Each message would have a header = 1 + 2*(no of IS) + data */
529: for (i=0; i<nrqs; i++) {
530: j = pa[i];
531: w1[j] += w2[j] + 2*w3[j];
532: msz += w1[j];
533: }
 535:   /* Determine the number of messages to expect, their lengths, and their from-ids */
536: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
537: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
539: /* Now post the Irecvs corresponding to these messages */
540: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
542: /* Allocate Memory for outgoing messages */
543: PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
544: PetscMemzero(outdat,size*sizeof(PetscInt*));
545: PetscMemzero(ptr,size*sizeof(PetscInt*));
547: {
548: PetscInt *iptr = tmp,ict = 0;
549: for (i=0; i<nrqs; i++) {
550: j = pa[i];
551: iptr += ict;
552: outdat[j] = iptr;
553: ict = w1[j];
554: }
555: }
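  /* iptr is advanced by the previous w1[] value, so outdat[pa[0]] starts at tmp and every
     destination gets a contiguous slice of length w1[] inside the single tmp buffer */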
557: /* Form the outgoing messages */
558: /* plug in the headers */
559: for (i=0; i<nrqs; i++) {
560: j = pa[i];
561: outdat[j][0] = 0;
562: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
563: ptr[j] = outdat[j] + 2*w3[j] + 1;
564: }
566: /* Memory for doing local proc's work */
567: {
568: PetscInt M_BPB_imax = 0;
569: #if defined(PETSC_USE_CTABLE)
570: PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);
571: PetscMalloc1(imax,&table_data);
572: for (i=0; i<imax; i++) {
573: PetscTableCreate(n[i]+1,M+1,&table_data[i]);
574: }
575: PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);
576: for (i=0; i<imax; i++) {
577: table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
578: }
579: #else
580: PetscInt Mimax = 0;
581: PetscIntMultError(M,imax, &Mimax);
582: PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);
583: PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);
584: for (i=0; i<imax; i++) {
585: table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
586: data[i] = d_p + M*i;
587: }
588: #endif
589: }
591: /* Parse the IS and update local tables and the outgoing buf with the data */
592: {
593: PetscInt n_i,isz_i,*outdat_j,ctr_j;
594: PetscBT table_i;
596: for (i=0; i<imax; i++) {
597: PetscMemzero(ctr,size*sizeof(PetscInt));
598: n_i = n[i];
599: table_i = table[i];
600: idx_i = idx[i];
601: #if defined(PETSC_USE_CTABLE)
602: table_data_i = table_data[i];
603: #else
604: data_i = data[i];
605: #endif
606: isz_i = isz[i];
607: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
608: row = idx_i[j];
609: PetscLayoutFindOwner(C->rmap,row,&proc);
610: if (proc != rank) { /* copy to the outgoing buffer */
611: ctr[proc]++;
612: *ptr[proc] = row;
613: ptr[proc]++;
614: } else if (!PetscBTLookupSet(table_i,row)) {
615: #if defined(PETSC_USE_CTABLE)
616: PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);
617: #else
618: data_i[isz_i] = row; /* Update the local table */
619: #endif
620: isz_i++;
621: }
622: }
623: /* Update the headers for the current IS */
624: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
625: if ((ctr_j = ctr[j])) {
626: outdat_j = outdat[j];
627: k = ++outdat_j[0];
628: outdat_j[2*k] = ctr_j;
629: outdat_j[2*k-1] = i;
630: }
631: }
632: isz[i] = isz_i;
633: }
634: }
636: /* Now post the sends */
637: PetscMalloc1(nrqs+1,&s_waits1);
638: for (i=0; i<nrqs; ++i) {
639: j = pa[i];
640: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
641: }
643: /* No longer need the original indices */
644: PetscMalloc1(imax,&iscomms);
645: for (i=0; i<imax; ++i) {
646: ISRestoreIndices(is[i],idx+i);
647: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);
648: }
649: PetscFree2(*(PetscInt***)&idx,n);
651: for (i=0; i<imax; ++i) {
652: ISDestroy(&is[i]);
653: }
655: /* Do Local work */
656: #if defined(PETSC_USE_CTABLE)
657: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);
658: #else
659: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);
660: #endif
662: /* Receive messages */
663: PetscMalloc1(nrqr+1,&recv_status);
664: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
666: PetscMalloc1(nrqs+1,&s_status);
667: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
669: /* Phase 1 sends are complete - deallocate buffers */
670: PetscFree4(outdat,ptr,tmp,ctr);
671: PetscFree4(w1,w2,w3,w4);
673: PetscMalloc1(nrqr+1,&xdata);
674: PetscMalloc1(nrqr+1,&isz1);
675: MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
676: PetscFree(rbuf[0]);
677: PetscFree(rbuf);
680: /* Send the data back */
681: /* Do a global reduction to know the buffer space req for incoming messages */
682: {
683: PetscMPIInt *rw1;
685: PetscCalloc1(size,&rw1);
687: for (i=0; i<nrqr; ++i) {
688: proc = recv_status[i].MPI_SOURCE;
690: if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
691: rw1[proc] = isz1[i];
692: }
693: PetscFree(onodes1);
694: PetscFree(olengths1);
 696:     /* Determine the number of messages to expect, their lengths, and their from-ids */
697: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
698: PetscFree(rw1);
699: }
700: /* Now post the Irecvs corresponding to these messages */
701: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
703: /* Now post the sends */
704: PetscMalloc1(nrqr+1,&s_waits2);
705: for (i=0; i<nrqr; ++i) {
706: j = recv_status[i].MPI_SOURCE;
707: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
708: }
710: /* receive work done on other processors */
711: {
712: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax;
713: PetscMPIInt idex;
714: PetscBT table_i;
715: MPI_Status *status2;
717: PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);
718: for (i=0; i<nrqs; ++i) {
719: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
720: /* Process the message */
721: rbuf2_i = rbuf2[idex];
722: ct1 = 2*rbuf2_i[0]+1;
723: jmax = rbuf2[idex][0];
724: for (j=1; j<=jmax; j++) {
725: max = rbuf2_i[2*j];
726: is_no = rbuf2_i[2*j-1];
727: isz_i = isz[is_no];
728: table_i = table[is_no];
729: #if defined(PETSC_USE_CTABLE)
730: table_data_i = table_data[is_no];
731: #else
732: data_i = data[is_no];
733: #endif
734: for (k=0; k<max; k++,ct1++) {
735: row = rbuf2_i[ct1];
736: if (!PetscBTLookupSet(table_i,row)) {
737: #if defined(PETSC_USE_CTABLE)
738: PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);
739: #else
740: data_i[isz_i] = row;
741: #endif
742: isz_i++;
743: }
744: }
745: isz[is_no] = isz_i;
746: }
747: }
749: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
750: PetscFree(status2);
751: }
753: #if defined(PETSC_USE_CTABLE)
754: tcount_max = 0;
755: for (i=0; i<imax; ++i) {
756: table_data_i = table_data[i];
757: PetscTableGetCount(table_data_i,&tcount);
758: if (tcount_max < tcount) tcount_max = tcount;
759: }
760: PetscMalloc1(tcount_max+1,&tdata);
761: #endif
763: for (i=0; i<imax; ++i) {
764: #if defined(PETSC_USE_CTABLE)
765: PetscTablePosition tpos;
766: table_data_i = table_data[i];
768: PetscTableGetHeadPosition(table_data_i,&tpos);
769: while (tpos) {
770: PetscTableGetNext(table_data_i,&tpos,&k,&j);
771: tdata[--j] = --k;
772: }
773: ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);
774: #else
775: ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);
776: #endif
777: PetscCommDestroy(&iscomms[i]);
778: }
780: PetscFree(iscomms);
781: PetscFree(onodes2);
782: PetscFree(olengths2);
784: PetscFree(pa);
785: PetscFree(rbuf2[0]);
786: PetscFree(rbuf2);
787: PetscFree(s_waits1);
788: PetscFree(r_waits1);
789: PetscFree(s_waits2);
790: PetscFree(r_waits2);
791: PetscFree(s_status);
792: PetscFree(recv_status);
793: PetscFree(xdata[0]);
794: PetscFree(xdata);
795: PetscFree(isz1);
796: #if defined(PETSC_USE_CTABLE)
797: for (i=0; i<imax; i++) {
798: PetscTableDestroy((PetscTable*)&table_data[i]);
799: }
800: PetscFree(table_data);
801: PetscFree(tdata);
802: PetscFree4(table,data,isz,t_p);
803: #else
804: PetscFree5(table,data,isz,d_p,t_p);
805: #endif
806: return(0);
807: }
809: /*
 810:      MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
811: the work on the local processor.
813: Inputs:
814: C - MAT_MPIAIJ;
815: imax - total no of index sets processed at a time;
816: table - an array of char - size = m bits.
818: Output:
819: isz - array containing the count of the solution elements corresponding
820: to each index set;
821: data or table_data - pointer to the solutions
822: */
823: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
824: {
825: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
826: Mat A = c->A,B = c->B;
827: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
828: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
829: PetscInt *bi,*bj,*garray,i,j,k,row,isz_i;
830: PetscBT table_i;
831: #if defined(PETSC_USE_CTABLE)
832: PetscTable table_data_i;
833: PetscErrorCode ierr;
834: PetscTablePosition tpos;
835: PetscInt tcount,*tdata;
836: #else
837: PetscInt *data_i;
838: #endif
841: rstart = C->rmap->rstart;
842: cstart = C->cmap->rstart;
843: ai = a->i;
844: aj = a->j;
845: bi = b->i;
846: bj = b->j;
847: garray = c->garray;
849: for (i=0; i<imax; i++) {
850: #if defined(PETSC_USE_CTABLE)
851: /* copy existing entries of table_data_i into tdata[] */
852: table_data_i = table_data[i];
853: PetscTableGetCount(table_data_i,&tcount);
854: if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);
856: PetscMalloc1(tcount,&tdata);
857: PetscTableGetHeadPosition(table_data_i,&tpos);
858: while (tpos) {
859: PetscTableGetNext(table_data_i,&tpos,&row,&j);
860: tdata[--j] = --row;
861: if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
862: }
863: #else
864: data_i = data[i];
865: #endif
866: table_i = table[i];
867: isz_i = isz[i];
868: max = isz[i];
870: for (j=0; j<max; j++) {
871: #if defined(PETSC_USE_CTABLE)
872: row = tdata[j] - rstart;
873: #else
874: row = data_i[j] - rstart;
875: #endif
876: start = ai[row];
877: end = ai[row+1];
878: for (k=start; k<end; k++) { /* Amat */
879: val = aj[k] + cstart;
880: if (!PetscBTLookupSet(table_i,val)) {
881: #if defined(PETSC_USE_CTABLE)
882: PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);
883: #else
884: data_i[isz_i] = val;
885: #endif
886: isz_i++;
887: }
888: }
889: start = bi[row];
890: end = bi[row+1];
891: for (k=start; k<end; k++) { /* Bmat */
892: val = garray[bj[k]];
893: if (!PetscBTLookupSet(table_i,val)) {
894: #if defined(PETSC_USE_CTABLE)
895: PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);
896: #else
897: data_i[isz_i] = val;
898: #endif
899: isz_i++;
900: }
901: }
902: }
903: isz[i] = isz_i;
905: #if defined(PETSC_USE_CTABLE)
906: PetscFree(tdata);
907: #endif
908: }
909: return(0);
910: }
912: /*
 913:      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
914: and return the output
916: Input:
917: C - the matrix
918: nrqr - no of messages being processed.
 919:   rbuf - an array of pointers to the received requests
921: Output:
922: xdata - array of messages to be sent back
923: isz1 - size of each message
 925:   For better efficiency perhaps we should malloc each xdata[i] separately;
 926:   then if a realloc is required we need only copy the data for that one row
 927:   rather than all previous rows, as it is now where a single large chunk of
 928:   memory is used.
930: */
931: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
932: {
933: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
934: Mat A = c->A,B = c->B;
935: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
937: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
938: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
939: PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
940: PetscInt *rbuf_i,kmax,rbuf_0;
941: PetscBT xtable;
944: m = C->rmap->N;
945: rstart = C->rmap->rstart;
946: cstart = C->cmap->rstart;
947: ai = a->i;
948: aj = a->j;
949: bi = b->i;
950: bj = b->j;
951: garray = c->garray;
954: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
955: rbuf_i = rbuf[i];
956: rbuf_0 = rbuf_i[0];
957: ct += rbuf_0;
958: for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
959: }
961: if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
962: else max1 = 1;
963: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
964: PetscMalloc1(mem_estimate,&xdata[0]);
965: ++no_malloc;
966: PetscBTCreate(m,&xtable);
967: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
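  /* the per-message buffers xdata[i] are carved sequentially out of the single allocation
     xdata[0]; whenever mem_estimate is exceeded the whole block is reallocated and the xdata[]
     pointers are rebuilt from isz1[] */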
969: ct3 = 0;
 970:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
971: rbuf_i = rbuf[i];
972: rbuf_0 = rbuf_i[0];
973: ct1 = 2*rbuf_0+1;
974: ct2 = ct1;
975: ct3 += ct1;
976: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
977: PetscBTMemzero(m,xtable);
978: oct2 = ct2;
979: kmax = rbuf_i[2*j];
980: for (k=0; k<kmax; k++,ct1++) {
981: row = rbuf_i[ct1];
982: if (!PetscBTLookupSet(xtable,row)) {
983: if (!(ct3 < mem_estimate)) {
984: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
985: PetscMalloc1(new_estimate,&tmp);
986: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
987: PetscFree(xdata[0]);
988: xdata[0] = tmp;
989: mem_estimate = new_estimate; ++no_malloc;
990: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
991: }
992: xdata[i][ct2++] = row;
993: ct3++;
994: }
995: }
996: for (k=oct2,max2=ct2; k<max2; k++) {
997: row = xdata[i][k] - rstart;
998: start = ai[row];
999: end = ai[row+1];
1000: for (l=start; l<end; l++) {
1001: val = aj[l] + cstart;
1002: if (!PetscBTLookupSet(xtable,val)) {
1003: if (!(ct3 < mem_estimate)) {
1004: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1005: PetscMalloc1(new_estimate,&tmp);
1006: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
1007: PetscFree(xdata[0]);
1008: xdata[0] = tmp;
1009: mem_estimate = new_estimate; ++no_malloc;
1010: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1011: }
1012: xdata[i][ct2++] = val;
1013: ct3++;
1014: }
1015: }
1016: start = bi[row];
1017: end = bi[row+1];
1018: for (l=start; l<end; l++) {
1019: val = garray[bj[l]];
1020: if (!PetscBTLookupSet(xtable,val)) {
1021: if (!(ct3 < mem_estimate)) {
1022: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1023: PetscMalloc1(new_estimate,&tmp);
1024: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
1025: PetscFree(xdata[0]);
1026: xdata[0] = tmp;
1027: mem_estimate = new_estimate; ++no_malloc;
1028: for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1029: }
1030: xdata[i][ct2++] = val;
1031: ct3++;
1032: }
1033: }
1034: }
1035: /* Update the header*/
1036: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1037: xdata[i][2*j-1] = rbuf_i[2*j-1];
1038: }
1039: xdata[i][0] = rbuf_0;
1040: xdata[i+1] = xdata[i] + ct2;
1041: isz1[i] = ct2; /* size of each message */
1042: }
1043: PetscBTDestroy(&xtable);
1044: PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
1045: return(0);
1046: }
1047: /* -------------------------------------------------------------------------*/
1048: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1049: /*
1050: Every processor gets the entire matrix
1051: */
1052: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1053: {
1054: Mat B;
1055: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
1056: Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1058: PetscMPIInt size,rank,*recvcounts = 0,*displs = 0;
1059: PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1060: PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1061: MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
1064: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1065: MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);
1067: if (scall == MAT_INITIAL_MATRIX) {
1068: /* ----------------------------------------------------------------
1069: Tell every processor the number of nonzeros per row
1070: */
1071: PetscMalloc1(A->rmap->N,&lens);
1072: for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1073: lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1074: }
1075: PetscMalloc2(size,&recvcounts,size,&displs);
1076: for (i=0; i<size; i++) {
1077: recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1078: displs[i] = A->rmap->range[i];
1079: }
1080: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1081: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1082: #else
1083: sendcount = A->rmap->rend - A->rmap->rstart;
1084: MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1085: #endif
1086: /* ---------------------------------------------------------------
1087: Create the sequential matrix of the same type as the local block diagonal
1088: */
1089: MatCreate(PETSC_COMM_SELF,&B);
1090: MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);
1091: MatSetBlockSizesFromMats(B,A,A);
1092: MatSetType(B,((PetscObject)a->A)->type_name);
1093: MatSeqAIJSetPreallocation(B,0,lens);
1094: PetscCalloc1(2,Bin);
1095: **Bin = B;
1096: b = (Mat_SeqAIJ*)B->data;
1098: /*--------------------------------------------------------------------
1099: Copy my part of matrix column indices over
1100: */
1101: sendcount = ad->nz + bd->nz;
1102: jsendbuf = b->j + b->i[rstarts[rank]];
1103: a_jsendbuf = ad->j;
1104: b_jsendbuf = bd->j;
1105: n = A->rmap->rend - A->rmap->rstart;
1106: cnt = 0;
1107: for (i=0; i<n; i++) {
1109: /* put in lower diagonal portion */
1110: m = bd->i[i+1] - bd->i[i];
1111: while (m > 0) {
1112: /* is it above diagonal (in bd (compressed) numbering) */
1113: if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1114: jsendbuf[cnt++] = garray[*b_jsendbuf++];
1115: m--;
1116: }
1118: /* put in diagonal portion */
1119: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1120: jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1121: }
1123: /* put in upper diagonal portion */
1124: while (m-- > 0) {
1125: jsendbuf[cnt++] = garray[*b_jsendbuf++];
1126: }
1127: }
1128: if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
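    /* note the three-way split above: off-diagonal (B) columns whose global index is at or below
       the diagonal entry come first, then the diagonal-block (A) columns shifted to global
       numbering using the row start (which assumes matching row/column layouts), then the
       remaining B columns */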
1130: /*--------------------------------------------------------------------
1131: Gather all column indices to all processors
1132: */
1133: for (i=0; i<size; i++) {
1134: recvcounts[i] = 0;
1135: for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1136: recvcounts[i] += lens[j];
1137: }
1138: }
1139: displs[0] = 0;
1140: for (i=1; i<size; i++) {
1141: displs[i] = displs[i-1] + recvcounts[i-1];
1142: }
1143: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1144: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1145: #else
1146: MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1147: #endif
1148: /*--------------------------------------------------------------------
 1149:        Assemble the matrix into usable form (note numerical values not yet set)
1150: */
1151: /* set the b->ilen (length of each row) values */
1152: PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));
1153: /* set the b->i indices */
1154: b->i[0] = 0;
1155: for (i=1; i<=A->rmap->N; i++) {
1156: b->i[i] = b->i[i-1] + lens[i-1];
1157: }
1158: PetscFree(lens);
1159: MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
1160: MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
1162: } else {
1163: B = **Bin;
1164: b = (Mat_SeqAIJ*)B->data;
1165: }
1167: /*--------------------------------------------------------------------
1168: Copy my part of matrix numerical values into the values location
1169: */
1170: if (flag == MAT_GET_VALUES) {
1171: sendcount = ad->nz + bd->nz;
1172: sendbuf = b->a + b->i[rstarts[rank]];
1173: a_sendbuf = ad->a;
1174: b_sendbuf = bd->a;
1175: b_sendj = bd->j;
1176: n = A->rmap->rend - A->rmap->rstart;
1177: cnt = 0;
1178: for (i=0; i<n; i++) {
1180: /* put in lower diagonal portion */
1181: m = bd->i[i+1] - bd->i[i];
1182: while (m > 0) {
1183: /* is it above diagonal (in bd (compressed) numbering) */
1184: if (garray[*b_sendj] > A->rmap->rstart + i) break;
1185: sendbuf[cnt++] = *b_sendbuf++;
1186: m--;
1187: b_sendj++;
1188: }
1190: /* put in diagonal portion */
1191: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1192: sendbuf[cnt++] = *a_sendbuf++;
1193: }
1195: /* put in upper diagonal portion */
1196: while (m-- > 0) {
1197: sendbuf[cnt++] = *b_sendbuf++;
1198: b_sendj++;
1199: }
1200: }
1201: if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1203: /* -----------------------------------------------------------------
1204: Gather all numerical values to all processors
1205: */
1206: if (!recvcounts) {
1207: PetscMalloc2(size,&recvcounts,size,&displs);
1208: }
1209: for (i=0; i<size; i++) {
1210: recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1211: }
1212: displs[0] = 0;
1213: for (i=1; i<size; i++) {
1214: displs[i] = displs[i-1] + recvcounts[i-1];
1215: }
1216: recvbuf = b->a;
1217: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1218: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1219: #else
1220: MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1221: #endif
 1222:   } /* end of (flag == MAT_GET_VALUES) */
1223: PetscFree2(recvcounts,displs);
1225: if (A->symmetric) {
1226: MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);
1227: } else if (A->hermitian) {
1228: MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);
1229: } else if (A->structurally_symmetric) {
1230: MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);
1231: }
1232: return(0);
1233: }
1235: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1236: {
1237: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
1238: Mat submat,A = c->A,B = c->B;
1239: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1240: PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1241: PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1242: const PetscInt *icol,*irow;
1243: PetscInt nrow,ncol,start;
1245: PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1246: PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1247: PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1248: PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1249: PetscInt *lens,rmax,ncols,*cols,Crow;
1250: #if defined(PETSC_USE_CTABLE)
1251: PetscTable cmap,rmap;
1252: PetscInt *cmap_loc,*rmap_loc;
1253: #else
1254: PetscInt *cmap,*rmap;
1255: #endif
1256: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1257: PetscInt *cworkB,lwrite,*subcols,*row2proc;
1258: PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
1259: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1260: MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4;
1261: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1262: MPI_Status *r_status3 = NULL,*r_status4,*s_status4;
1263: MPI_Comm comm;
1264: PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1265: PetscMPIInt *onodes1,*olengths1,idex,end;
1266: Mat_SubSppt *smatis1;
1267: PetscBool isrowsorted,iscolsorted;
1270: if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1272: PetscObjectGetComm((PetscObject)C,&comm);
1273: size = c->size;
1274: rank = c->rank;
1276: ISSorted(iscol[0],&iscolsorted);
1277: ISSorted(isrow[0],&isrowsorted);
1278: ISGetIndices(isrow[0],&irow);
1279: ISGetLocalSize(isrow[0],&nrow);
1280: if (allcolumns) {
1281: icol = NULL;
1282: ncol = C->cmap->N;
1283: } else {
1284: ISGetIndices(iscol[0],&icol);
1285: ISGetLocalSize(iscol[0],&ncol);
1286: }
1288: if (scall == MAT_INITIAL_MATRIX) {
1289: PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1291: /* Get some new tags to keep the communication clean */
1292: tag1 = ((PetscObject)C)->tag;
1293: PetscObjectGetNewTag((PetscObject)C,&tag2);
1294: PetscObjectGetNewTag((PetscObject)C,&tag3);
1296: /* evaluate communication - mesg to who, length of mesg, and buffer space
1297: required. Based on this, buffers are allocated, and data copied into them */
1298: PetscCalloc2(size,&w1,size,&w2);
1299: PetscMalloc1(nrow,&row2proc);
1301: /* w1[proc] = num of rows owned by proc -- to be requested */
1302: proc = 0;
1303: nrqs = 0; /* num of outgoing messages */
1304: for (j=0; j<nrow; j++) {
1305: row = irow[j];
1306: if (!isrowsorted) proc = 0;
1307: while (row >= C->rmap->range[proc+1]) proc++;
1308: w1[proc]++;
1309: row2proc[j] = proc; /* map row index to proc */
1311: if (proc != rank && !w2[proc]) {
1312: w2[proc] = 1; nrqs++;
1313: }
1314: }
1315: w1[rank] = 0; /* rows owned by self will not be requested */
1317: PetscMalloc1(nrqs+1,&pa); /*(proc -array)*/
1318: for (proc=0,j=0; proc<size; proc++) {
1319: if (w1[proc]) { pa[j++] = proc;}
1320: }
 1322:     /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1323: msz = 0; /* total mesg length (for all procs) */
1324: for (i=0; i<nrqs; i++) {
1325: proc = pa[i];
1326: w1[proc] += 3;
1327: msz += w1[proc];
1328: }
1329: PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);
 1331:     /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
1332: /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1333: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
1335: /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1336: Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1337: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
1339: /* Now post the Irecvs corresponding to these messages */
1340: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
1342: PetscFree(onodes1);
1343: PetscFree(olengths1);
1345: /* Allocate Memory for outgoing messages */
1346: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
1347: PetscMemzero(sbuf1,size*sizeof(PetscInt*));
1348: PetscMemzero(ptr,size*sizeof(PetscInt*));
 1350:     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1351: iptr = tmp;
1352: for (i=0; i<nrqs; i++) {
1353: proc = pa[i];
1354: sbuf1[proc] = iptr;
1355: iptr += w1[proc];
1356: }
1358: /* Form the outgoing messages */
1359: /* Initialize the header space */
1360: for (i=0; i<nrqs; i++) {
1361: proc = pa[i];
1362: PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));
1363: ptr[proc] = sbuf1[proc] + 3;
1364: }
1366: /* Parse the isrow and copy data into outbuf */
1367: PetscMemzero(ctr,size*sizeof(PetscInt));
1368: for (j=0; j<nrow; j++) { /* parse the indices of each IS */
1369: proc = row2proc[j];
1370: if (proc != rank) { /* copy to the outgoing buf*/
1371: *ptr[proc] = irow[j];
1372: ctr[proc]++; ptr[proc]++;
1373: }
1374: }
1376: /* Update the headers for the current IS */
1377: for (j=0; j<size; j++) { /* Can Optimise this loop too */
1378: if ((ctr_j = ctr[j])) {
1379: sbuf1_j = sbuf1[j];
1380: k = ++sbuf1_j[0];
1381: sbuf1_j[2*k] = ctr_j;
1382: sbuf1_j[2*k-1] = 0;
1383: }
1384: }
1386: /* Now post the sends */
1387: PetscMalloc1(nrqs+1,&s_waits1);
1388: for (i=0; i<nrqs; ++i) {
1389: proc = pa[i];
1390: MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);
1391: }
1393: /* Post Receives to capture the buffer size */
1394: PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);
1395: PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);
1397: rbuf2[0] = tmp + msz;
1398: for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1400: for (i=0; i<nrqs; ++i) {
1401: proc = pa[i];
1402: MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);
1403: }
1405: PetscFree2(w1,w2);
1407: /* Send to other procs the buf size they should allocate */
1408: /* Receive messages*/
1409: PetscMalloc1(nrqr+1,&r_status1);
1410: PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
1412: MPI_Waitall(nrqr,r_waits1,r_status1);
1413: for (i=0; i<nrqr; ++i) {
1414: req_size[i] = 0;
1415: rbuf1_i = rbuf1[i];
1416: start = 2*rbuf1_i[0] + 1;
1417: MPI_Get_count(r_status1+i,MPIU_INT,&end);
1418: PetscMalloc1(end+1,&sbuf2[i]);
1419: sbuf2_i = sbuf2[i];
1420: for (j=start; j<end; j++) {
1421: k = rbuf1_i[j] - rstart;
1422: ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1423: sbuf2_i[j] = ncols;
1424: req_size[i] += ncols;
1425: }
1426: req_source1[i] = r_status1[i].MPI_SOURCE;
1428: /* form the header */
1429: sbuf2_i[0] = req_size[i];
1430: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1432: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
1433: }
1435: PetscFree(r_status1);
1436: PetscFree(r_waits1);
 1438:     /* rbuf2 is received; post recvs for column indices a->j */
1439: MPI_Waitall(nrqs,r_waits2,r_status2);
1441: PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);
1442: for (i=0; i<nrqs; ++i) {
1443: PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);
1444: req_source2[i] = r_status2[i].MPI_SOURCE;
1445: MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
1446: }
1448: /* Wait on sends1 and sends2 */
1449: PetscMalloc1(nrqs+1,&s_status1);
1450: MPI_Waitall(nrqs,s_waits1,s_status1);
1451: PetscFree(s_waits1);
1452: PetscFree(s_status1);
1454: MPI_Waitall(nrqr,s_waits2,s_status2);
1455: PetscFree4(r_status2,s_waits2,r_waits2,s_status2);
1457: /* Now allocate sending buffers for a->j, and send them off */
1458: PetscMalloc1(nrqr+1,&sbuf_aj);
1459: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1460: PetscMalloc1(j+1,&sbuf_aj[0]);
1461: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1463: for (i=0; i<nrqr; i++) { /* for each requested message */
1464: rbuf1_i = rbuf1[i];
1465: sbuf_aj_i = sbuf_aj[i];
1466: ct1 = 2*rbuf1_i[0] + 1;
1467: ct2 = 0;
1468: /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */
1470: kmax = rbuf1[i][2];
1471: for (k=0; k<kmax; k++,ct1++) { /* for each row */
1472: row = rbuf1_i[ct1] - rstart;
1473: nzA = ai[row+1] - ai[row];
1474: nzB = bi[row+1] - bi[row];
1475: ncols = nzA + nzB;
1476: cworkA = aj + ai[row]; cworkB = bj + bi[row];
1478: /* load the column indices for this row into cols*/
1479: cols = sbuf_aj_i + ct2;
1481: lwrite = 0;
1482: for (l=0; l<nzB; l++) {
1483: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1484: }
1485: for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1486: for (l=0; l<nzB; l++) {
1487: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1488: }
1490: ct2 += ncols;
1491: }
1492: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
1493: }
1495: /* create column map (cmap): global col of C -> local col of submat */
1496: #if defined(PETSC_USE_CTABLE)
1497: if (!allcolumns) {
1498: PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);
1499: PetscCalloc1(C->cmap->n,&cmap_loc);
1500: for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1501: if (icol[j] >= cstart && icol[j] <cend) {
1502: cmap_loc[icol[j] - cstart] = j+1;
1503: } else { /* use PetscTable for non-local col indices */
1504: PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);
1505: }
1506: }
1507: } else {
1508: cmap = NULL;
1509: cmap_loc = NULL;
1510: }
1511: PetscCalloc1(C->rmap->n,&rmap_loc);
1512: #else
1513: if (!allcolumns) {
1514: PetscCalloc1(C->cmap->N,&cmap);
1515: for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1516: } else {
1517: cmap = NULL;
1518: }
1519: #endif
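    /* in either build the stored value is the local submatrix column index plus one, so a lookup
       of zero below means the global column is not part of the submatrix (in the PetscTable
       variant, cmap_loc covers the locally owned columns) */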
1521: /* Create lens for MatSeqAIJSetPreallocation() */
1522: PetscCalloc1(nrow,&lens);
1524: /* Compute lens from local part of C */
1525: for (j=0; j<nrow; j++) {
1526: row = irow[j];
1527: proc = row2proc[j];
1528: if (proc == rank) {
1529: /* diagonal part A = c->A */
1530: ncols = ai[row-rstart+1] - ai[row-rstart];
1531: cols = aj + ai[row-rstart];
1532: if (!allcolumns) {
1533: for (k=0; k<ncols; k++) {
1534: #if defined(PETSC_USE_CTABLE)
1535: tcol = cmap_loc[cols[k]];
1536: #else
1537: tcol = cmap[cols[k]+cstart];
1538: #endif
1539: if (tcol) lens[j]++;
1540: }
1541: } else { /* allcolumns */
1542: lens[j] = ncols;
1543: }
1545: /* off-diagonal part B = c->B */
1546: ncols = bi[row-rstart+1] - bi[row-rstart];
1547: cols = bj + bi[row-rstart];
1548: if (!allcolumns) {
1549: for (k=0; k<ncols; k++) {
1550: #if defined(PETSC_USE_CTABLE)
1551: PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);
1552: #else
1553: tcol = cmap[bmap[cols[k]]];
1554: #endif
1555: if (tcol) lens[j]++;
1556: }
1557: } else { /* allcolumns */
1558: lens[j] += ncols;
1559: }
1560: }
1561: }
1563: /* Create row map (rmap): global row of C -> local row of submat */
1564: #if defined(PETSC_USE_CTABLE)
1565: PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);
1566: for (j=0; j<nrow; j++) {
1567: row = irow[j];
1568: proc = row2proc[j];
1569: if (proc == rank) { /* a local row */
1570: rmap_loc[row - rstart] = j;
1571: } else {
1572: PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);
1573: }
1574: }
1575: #else
1576: PetscCalloc1(C->rmap->N,&rmap);
1577: for (j=0; j<nrow; j++) {
1578: rmap[irow[j]] = j;
1579: }
1580: #endif
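    /* note the convention difference: the PetscTable variant stores the submatrix row plus one
       (zero meaning "not found"), while the plain array variant stores the 0-based row directly */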
1582: /* Update lens from offproc data */
1583: /* recv a->j is done */
1584: MPI_Waitall(nrqs,r_waits3,r_status3);
1585: for (i=0; i<nrqs; i++) {
1586: proc = pa[i];
1587: sbuf1_i = sbuf1[proc];
1588: /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1589: ct1 = 2 + 1;
1590: ct2 = 0;
1591: rbuf2_i = rbuf2[i]; /* received length of C->j */
1592: rbuf3_i = rbuf3[i]; /* received C->j */
1594: /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1595: max1 = sbuf1_i[2];
1596: for (k=0; k<max1; k++,ct1++) {
1597: #if defined(PETSC_USE_CTABLE)
1598: PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);
1599: row--;
1600: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1601: #else
1602: row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1603: #endif
1604: /* Now, store row index of submat in sbuf1_i[ct1] */
1605: sbuf1_i[ct1] = row;
1607: nnz = rbuf2_i[ct1];
1608: if (!allcolumns) {
1609: for (l=0; l<nnz; l++,ct2++) {
1610: #if defined(PETSC_USE_CTABLE)
1611: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1612: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1613: } else {
1614: PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);
1615: }
1616: #else
1617: tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1618: #endif
1619: if (tcol) lens[row]++;
1620: }
1621: } else { /* allcolumns */
1622: lens[row] += nnz;
1623: }
1624: }
1625: }
1626: MPI_Waitall(nrqr,s_waits3,s_status3);
1627: PetscFree4(r_waits3,s_waits3,r_status3,s_status3);
1629: /* Create the submatrices */
1630: MatCreate(PETSC_COMM_SELF,&submat);
1631: MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);
1633: ISGetBlockSize(isrow[0],&i);
1634: ISGetBlockSize(iscol[0],&j);
1635: MatSetBlockSizes(submat,i,j);
1636: MatSetType(submat,((PetscObject)A)->type_name);
1637: MatSeqAIJSetPreallocation(submat,0,lens);
 1639:     /* create struct Mat_SubSppt and attach it to submat */
1640: PetscNew(&smatis1);
1641: subc = (Mat_SeqAIJ*)submat->data;
1642: subc->submatis1 = smatis1;
1644: smatis1->id = 0;
1645: smatis1->nrqs = nrqs;
1646: smatis1->nrqr = nrqr;
1647: smatis1->rbuf1 = rbuf1;
1648: smatis1->rbuf2 = rbuf2;
1649: smatis1->rbuf3 = rbuf3;
1650: smatis1->sbuf2 = sbuf2;
1651: smatis1->req_source2 = req_source2;
1653: smatis1->sbuf1 = sbuf1;
1654: smatis1->ptr = ptr;
1655: smatis1->tmp = tmp;
1656: smatis1->ctr = ctr;
1658: smatis1->pa = pa;
1659: smatis1->req_size = req_size;
1660: smatis1->req_source1 = req_source1;
1662: smatis1->allcolumns = allcolumns;
1663: smatis1->singleis = PETSC_TRUE;
1664: smatis1->row2proc = row2proc;
1665: smatis1->rmap = rmap;
1666: smatis1->cmap = cmap;
1667: #if defined(PETSC_USE_CTABLE)
1668: smatis1->rmap_loc = rmap_loc;
1669: smatis1->cmap_loc = cmap_loc;
1670: #endif
1672: smatis1->destroy = submat->ops->destroy;
1673: submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1674: submat->factortype = C->factortype;
1676: /* compute rmax */
1677: rmax = 0;
1678: for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1680: } else { /* scall == MAT_REUSE_MATRIX */
1681: submat = submats[0];
1682: if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1684: subc = (Mat_SeqAIJ*)submat->data;
1685: rmax = subc->rmax;
1686: smatis1 = subc->submatis1;
1687: nrqs = smatis1->nrqs;
1688: nrqr = smatis1->nrqr;
1689: rbuf1 = smatis1->rbuf1;
1690: rbuf2 = smatis1->rbuf2;
1691: rbuf3 = smatis1->rbuf3;
1692: req_source2 = smatis1->req_source2;
1694: sbuf1 = smatis1->sbuf1;
1695: sbuf2 = smatis1->sbuf2;
1696: ptr = smatis1->ptr;
1697: tmp = smatis1->tmp;
1698: ctr = smatis1->ctr;
1700: pa = smatis1->pa;
1701: req_size = smatis1->req_size;
1702: req_source1 = smatis1->req_source1;
1704: allcolumns = smatis1->allcolumns;
1705: row2proc = smatis1->row2proc;
1706: rmap = smatis1->rmap;
1707: cmap = smatis1->cmap;
1708: #if defined(PETSC_USE_CTABLE)
1709: rmap_loc = smatis1->rmap_loc;
1710: cmap_loc = smatis1->cmap_loc;
1711: #endif
1712: }
1714: /* Post recv matrix values */
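  /* rbuf2[i][0] is the header sent back by req_source2[i]: the total number of matrix entries
     that process will send, so it sizes the scalar receive buffer rbuf4[i] below. */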
1715: PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);
1716: PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);
1717: PetscObjectGetNewTag((PetscObject)C,&tag4);
1718: for (i=0; i<nrqs; ++i) {
1719: PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);
1720: MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
1721: }
1723: /* Allocate sending buffers for a->a, and send them off */
1724: PetscMalloc1(nrqr+1,&sbuf_aa);
1725: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1726: PetscMalloc1(j+1,&sbuf_aa[0]);
1727: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
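  /* sbuf_aa[0] is one contiguous buffer of length sum(req_size); sbuf_aa[i] are offsets into it,
     one block of values per requesting process, mirroring the layout used for the column-index sends. */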
1729: for (i=0; i<nrqr; i++) {
1730: rbuf1_i = rbuf1[i];
1731: sbuf_aa_i = sbuf_aa[i];
1732: ct1 = 2*rbuf1_i[0]+1;
1733: ct2 = 0;
1734: /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1736: kmax = rbuf1_i[2];
1737: for (k=0; k<kmax; k++,ct1++) {
1738: row = rbuf1_i[ct1] - rstart;
1739: nzA = ai[row+1] - ai[row];
1740: nzB = bi[row+1] - bi[row];
1741: ncols = nzA + nzB;
1742: cworkB = bj + bi[row];
1743: vworkA = a_a + ai[row];
1744: vworkB = b_a + bi[row];
1746: /* load the column values for this row into vals */
1747: vals = sbuf_aa_i + ct2;
1749: lwrite = 0;
1750: for (l=0; l<nzB; l++) {
1751: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1752: }
1753: for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1754: for (l=0; l<nzB; l++) {
1755: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1756: }
1758: ct2 += ncols;
1759: }
1760: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
1761: }
1763: /* Assemble submat */
1764: /* First assemble the local rows */
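  /* Rows owned by this rank are filled directly from the local diagonal block A and off-diagonal
     block B of C: row indices are translated through rmap (or rmap_loc with PETSC_USE_CTABLE),
     columns through cmap/cmap_loc (or, in the allcolumns case, stored as global indices, with bmap
     mapping B's local columns to global), and inserted with MatSetValues_SeqAIJ(). */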
1765: for (j=0; j<nrow; j++) {
1766: row = irow[j];
1767: proc = row2proc[j];
1768: if (proc == rank) {
1769: Crow = row - rstart; /* local row index of C */
1770: #if defined(PETSC_USE_CTABLE)
1771: row = rmap_loc[Crow]; /* row index of submat */
1772: #else
1773: row = rmap[row];
1774: #endif
1776: if (allcolumns) {
1777: /* diagonal part A = c->A */
1778: ncols = ai[Crow+1] - ai[Crow];
1779: cols = aj + ai[Crow];
1780: vals = a->a + ai[Crow];
1781: i = 0;
1782: for (k=0; k<ncols; k++) {
1783: subcols[i] = cols[k] + cstart;
1784: subvals[i++] = vals[k];
1785: }
1787: /* off-diagonal part B = c->B */
1788: ncols = bi[Crow+1] - bi[Crow];
1789: cols = bj + bi[Crow];
1790: vals = b->a + bi[Crow];
1791: for (k=0; k<ncols; k++) {
1792: subcols[i] = bmap[cols[k]];
1793: subvals[i++] = vals[k];
1794: }
1796: MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);
1798: } else { /* !allcolumns */
1799: #if defined(PETSC_USE_CTABLE)
1800: /* diagonal part A = c->A */
1801: ncols = ai[Crow+1] - ai[Crow];
1802: cols = aj + ai[Crow];
1803: vals = a->a + ai[Crow];
1804: i = 0;
1805: for (k=0; k<ncols; k++) {
1806: tcol = cmap_loc[cols[k]];
1807: if (tcol) {
1808: subcols[i] = --tcol;
1809: subvals[i++] = vals[k];
1810: }
1811: }
1813: /* off-diagonal part B = c->B */
1814: ncols = bi[Crow+1] - bi[Crow];
1815: cols = bj + bi[Crow];
1816: vals = b->a + bi[Crow];
1817: for (k=0; k<ncols; k++) {
1818: PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);
1819: if (tcol) {
1820: subcols[i] = --tcol;
1821: subvals[i++] = vals[k];
1822: }
1823: }
1824: #else
1825: /* diagonal part A = c->A */
1826: ncols = ai[Crow+1] - ai[Crow];
1827: cols = aj + ai[Crow];
1828: vals = a->a + ai[Crow];
1829: i = 0;
1830: for (k=0; k<ncols; k++) {
1831: tcol = cmap[cols[k]+cstart];
1832: if (tcol) {
1833: subcols[i] = --tcol;
1834: subvals[i++] = vals[k];
1835: }
1836: }
1838: /* off-diagonal part B = c->B */
1839: ncols = bi[Crow+1] - bi[Crow];
1840: cols = bj + bi[Crow];
1841: vals = b->a + bi[Crow];
1842: for (k=0; k<ncols; k++) {
1843: tcol = cmap[bmap[cols[k]]];
1844: if (tcol) {
1845: subcols[i] = --tcol;
1846: subvals[i++] = vals[k];
1847: }
1848: }
1849: #endif
1850: MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);
1851: }
1852: }
1853: }
1855: /* Now assemble the off-proc rows */
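  /* Replies are processed in arrival order via MPI_Waitany(). ct2 walks the received column/value
     buffers; when iscol is sorted, ct3 records which received positions were actually inserted, so
     that a later MAT_REUSE_MATRIX call can copy the values straight into the already-sorted rows
     of submat. */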
1856: for (i=0; i<nrqs; i++) { /* for each requested message */
1857: /* recv values from other processes */
1858: MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);
1859: proc = pa[idex];
1860: sbuf1_i = sbuf1[proc];
1861: /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
1862: ct1 = 2 + 1;
1863: ct2 = 0; /* count of received C->j */
1864: ct3 = 0; /* count of received C->j that will be inserted into submat */
1865: rbuf2_i = rbuf2[idex]; /* lengths of C->j received from other processes */
1866: rbuf3_i = rbuf3[idex]; /* C->j received from other processes */
1867: rbuf4_i = rbuf4[idex]; /* C->a received from other processes */
1869: /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1870: max1 = sbuf1_i[2]; /* num of rows */
1871: for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1872: row = sbuf1_i[ct1]; /* row index of submat */
1873: if (!allcolumns) {
1874: idex = 0;
1875: if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1876: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1877: for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1878: #if defined(PETSC_USE_CTABLE)
1879: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1880: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1881: } else {
1882: PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);
1883: }
1884: #else
1885: tcol = cmap[rbuf3_i[ct2]];
1886: #endif
1887: if (tcol) {
1888: subcols[idex] = --tcol; /* may not be sorted */
1889: subvals[idex++] = rbuf4_i[ct2];
1891: /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
1892: For reuse, we replace received C->j with index that should be inserted to submat */
1893: if (iscolsorted) rbuf3_i[ct3++] = ct2;
1894: }
1895: }
1896: MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);
1897: } else { /* scall == MAT_REUSE_MATRIX */
1898: submat = submats[0];
1899: subc = (Mat_SeqAIJ*)submat->data;
1901: nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1902: for (l=0; l<nnz; l++) {
1903: ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1904: subvals[idex++] = rbuf4_i[ct2];
1905: }
1907: bj = subc->j + subc->i[row]; /* sorted column indices */
1908: MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);
1909: }
1910: } else { /* allcolumns */
1911: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1912: MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);
1913: ct2 += nnz;
1914: }
1915: }
1916: }
1918: /* sends of a->a are done */
1919: MPI_Waitall(nrqr,s_waits4,s_status4);
1920: PetscFree4(r_waits4,s_waits4,r_status4,s_status4);
1922: MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);
1923: MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);
1924: submats[0] = submat;
1926: /* Restore the indices */
1927: ISRestoreIndices(isrow[0],&irow);
1928: if (!allcolumns) {
1929: ISRestoreIndices(iscol[0],&icol);
1930: }
1932: /* Destroy allocated memory */
1933: for (i=0; i<nrqs; ++i) {
1934: PetscFree3(rbuf4[i],subcols,subvals);
1935: }
1936: PetscFree3(rbuf4,subcols,subvals);
1937: PetscFree(sbuf_aa[0]);
1938: PetscFree(sbuf_aa);
1940: if (scall == MAT_INITIAL_MATRIX) {
1941: PetscFree(lens);
1942: PetscFree(sbuf_aj[0]);
1943: PetscFree(sbuf_aj);
1944: }
1945: return(0);
1946: }
1948: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1949: {
1951: PetscInt ncol;
1952: PetscBool colflag,allcolumns=PETSC_FALSE;
1955: /* Allocate memory to hold all the submatrices */
1956: if (scall == MAT_INITIAL_MATRIX) {
1957: PetscCalloc1(2,submat);
1958: }
1960: /* Check for special case: each processor gets entire matrix columns */
1961: ISIdentity(iscol[0],&colflag);
1962: ISGetLocalSize(iscol[0],&ncol);
1963: if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1965: MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);
1966: return(0);
1967: }
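/*
   Illustrative caller (a minimal sketch; nlocal, rstart0 and N are hypothetical names and error
   checking is omitted -- this is not code taken from this file). MatCreateSubMatrices() dispatches
   to MatCreateSubMatrices_MPIAIJ() below, which takes this single-IS shortcut only when
   C->submat_singleis has been set (e.g. by PCSetUp_ASM()):

     IS  isrow,iscol;
     Mat *submats;

     ISCreateStride(PETSC_COMM_SELF,nlocal,rstart0,1,&isrow);               rows wanted by this rank
     ISCreateStride(PETSC_COMM_SELF,N,0,1,&iscol);                          identity over all columns -> allcolumns branch
     MatCreateSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submats);
     ...
     MatCreateSubMatrices(C,1,&isrow,&iscol,MAT_REUSE_MATRIX,&submats);     second call reuses the cached Mat_SubSppt
     MatDestroySubMatrices(1,&submats);
     ISDestroy(&isrow);
     ISDestroy(&iscol);
*/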
1969: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1970: {
1972: PetscInt nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1973: PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE;
1974: Mat_SeqAIJ *subc;
1975: Mat_SubSppt *smat;
1978: /* Check for special case: each processor has a single IS */
1979: if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
1980: MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);
1981: C->submat_singleis = PETSC_FALSE; /* restore its default value in case C is later used with non-single IS */
1982: return(0);
1983: }
1985: /* Collect global wantallmatrix and nstages */
1986: if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1987: else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1988: if (!nmax) nmax = 1;
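  /* e.g. with 8-byte PetscInt and C->cmap->N = 2,500,000 columns, nmax = 20e6/(2.5e6*8) = 1, i.e.
     one submatrix per stage; a 100,000-column matrix would allow nmax = 25 submatrices per stage. */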
1990: if (scall == MAT_INITIAL_MATRIX) {
1991: /* Collect global wantallmatrix and nstages */
1992: if (ismax == 1 && C->rmap->N == C->cmap->N) {
1993: ISIdentity(*isrow,&rowflag);
1994: ISIdentity(*iscol,&colflag);
1995: ISGetLocalSize(*isrow,&nrow);
1996: ISGetLocalSize(*iscol,&ncol);
1997: if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1998: wantallmatrix = PETSC_TRUE;
2000: PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);
2001: }
2002: }
2004: /* Determine the number of stages in which the submatrices are extracted.
2005: Each stage extracts at most nmax submatrices.
2006: nmax is determined by the matrix column dimension:
2007: if the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2008: */
2009: nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
2011: in[0] = -1*(PetscInt)wantallmatrix;
2012: in[1] = nstages;
2013: MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
2014: wantallmatrix = (PetscBool)(-out[0]);
2015: nstages = out[1]; /* Make sure every processor loops through the global nstages */
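  /* The MPI_MAX reduction doubles as a logical AND for wantallmatrix: in[0] is -1 only where the
     fast path applies, so out[0] stays -1 only if every rank agrees; nstages takes the global
     maximum so that all ranks execute the same number of stages (and of collective calls). */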
2017: } else { /* MAT_REUSE_MATRIX */
2018: if (ismax) {
2019: subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2020: smat = subc->submatis1;
2021: } else { /* (*submat)[0] is a dummy matrix */
2022: smat = (Mat_SubSppt*)(*submat)[0]->data;
2023: }
2024: if (!smat) {
2025: /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2026: wantallmatrix = PETSC_TRUE;
2027: } else if (smat->singleis) {
2028: MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);
2029: return(0);
2030: } else {
2031: nstages = smat->nstages;
2032: }
2033: }
2035: if (wantallmatrix) {
2036: MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);
2037: return(0);
2038: }
2040: /* Allocate memory to hold all the submatrices and dummy submatrices */
2041: if (scall == MAT_INITIAL_MATRIX) {
2042: PetscCalloc1(ismax+nstages,submat);
2043: }
2045: for (i=0,pos=0; i<nstages; i++) {
2046: if (pos+nmax <= ismax) max_no = nmax;
2047: else if (pos >= ismax) max_no = 0;
2048: else max_no = ismax-pos;
2050: MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);
2051: if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2052: smat = (Mat_SubSppt*)(*submat)[pos]->data; pos++;
2053: smat->nstages = nstages;
2054: }
2055: pos += max_no;
2056: }
2058: if (ismax && scall == MAT_INITIAL_MATRIX) {
2059: /* save nstages for reuse */
2060: subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2061: smat = subc->submatis1;
2062: smat->nstages = nstages;
2063: }
2064: return(0);
2065: }
2067: /* -------------------------------------------------------------------------*/
2068: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2069: {
2070: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
2071: Mat A = c->A;
2072: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2073: const PetscInt **icol,**irow;
2074: PetscInt *nrow,*ncol,start;
2076: PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2077: PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2078: PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2079: PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2080: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2081: #if defined(PETSC_USE_CTABLE)
2082: PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
2083: #else
2084: PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2085: #endif
2086: const PetscInt *irow_i;
2087: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2088: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2089: MPI_Request *r_waits4,*s_waits3,*s_waits4;
2090: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2091: MPI_Status *r_status3,*r_status4,*s_status4;
2092: MPI_Comm comm;
2093: PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2094: PetscMPIInt *onodes1,*olengths1,end;
2095: PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2096: Mat_SubSppt *smat_i;
2097: PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2098: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2101: PetscObjectGetComm((PetscObject)C,&comm);
2102: size = c->size;
2103: rank = c->rank;
2105: PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);
2106: PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);
2108: for (i=0; i<ismax; i++) {
2109: ISSorted(iscol[i],&issorted[i]);
2110: if (!issorted[i]) iscsorted = issorted[i];
2112: ISSorted(isrow[i],&issorted[i]);
2114: ISGetIndices(isrow[i],&irow[i]);
2115: ISGetLocalSize(isrow[i],&nrow[i]);
2117: /* Check for special case: allcolumn */
2118: ISIdentity(iscol[i],&colflag);
2119: ISGetLocalSize(iscol[i],&ncol[i]);
2120: if (colflag && ncol[i] == C->cmap->N) {
2121: allcolumns[i] = PETSC_TRUE;
2122: icol[i] = NULL;
2123: } else {
2124: allcolumns[i] = PETSC_FALSE;
2125: ISGetIndices(iscol[i],&icol[i]);
2126: }
2127: }
2129: if (scall == MAT_REUSE_MATRIX) {
2130: /* Assumes new rows are same length as the old rows */
2131: for (i=0; i<ismax; i++) {
2132: if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2133: subc = (Mat_SeqAIJ*)submats[i]->data;
2134: if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
2136: /* Initialize the matrix as if it were empty */
2137: PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));
2139: smat_i = subc->submatis1;
2141: nrqs = smat_i->nrqs;
2142: nrqr = smat_i->nrqr;
2143: rbuf1 = smat_i->rbuf1;
2144: rbuf2 = smat_i->rbuf2;
2145: rbuf3 = smat_i->rbuf3;
2146: req_source2 = smat_i->req_source2;
2148: sbuf1 = smat_i->sbuf1;
2149: sbuf2 = smat_i->sbuf2;
2150: ptr = smat_i->ptr;
2151: tmp = smat_i->tmp;
2152: ctr = smat_i->ctr;
2154: pa = smat_i->pa;
2155: req_size = smat_i->req_size;
2156: req_source1 = smat_i->req_source1;
2158: allcolumns[i] = smat_i->allcolumns;
2159: row2proc[i] = smat_i->row2proc;
2160: rmap[i] = smat_i->rmap;
2161: cmap[i] = smat_i->cmap;
2162: }
2164: if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
2165: if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2166: smat_i = (Mat_SubSppt*)submats[0]->data;
2168: nrqs = smat_i->nrqs;
2169: nrqr = smat_i->nrqr;
2170: rbuf1 = smat_i->rbuf1;
2171: rbuf2 = smat_i->rbuf2;
2172: rbuf3 = smat_i->rbuf3;
2173: req_source2 = smat_i->req_source2;
2175: sbuf1 = smat_i->sbuf1;
2176: sbuf2 = smat_i->sbuf2;
2177: ptr = smat_i->ptr;
2178: tmp = smat_i->tmp;
2179: ctr = smat_i->ctr;
2181: pa = smat_i->pa;
2182: req_size = smat_i->req_size;
2183: req_source1 = smat_i->req_source1;
2185: allcolumns[0] = PETSC_FALSE;
2186: }
2187: } else { /* scall == MAT_INITIAL_MATRIX */
2188: /* Get some new tags to keep the communication clean */
2189: PetscObjectGetNewTag((PetscObject)C,&tag2);
2190: PetscObjectGetNewTag((PetscObject)C,&tag3);
2192: /* evaluate communication - message to whom, message length, and buffer space
2193: required. Based on this, buffers are allocated, and data copied into them */
2194: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4); /* mesg size, initialize work vectors */
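  /* Work vectors: w4[p] counts rows of the current IS owned by process p (reset after each IS);
     w3[p] counts how many ISs request at least one row from p; w1[p] accumulates the total number
     of rows requested from p (the header length is added to it further below); w2[p] becomes 1 if
     any message goes to p at all. */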
2196: for (i=0; i<ismax; i++) {
2197: jmax = nrow[i];
2198: irow_i = irow[i];
2200: PetscMalloc1(jmax,&row2proc_i);
2201: row2proc[i] = row2proc_i;
2203: if (issorted[i]) proc = 0;
2204: for (j=0; j<jmax; j++) {
2205: if (!issorted[i]) proc = 0;
2206: row = irow_i[j];
2207: while (row >= C->rmap->range[proc+1]) proc++;
2208: w4[proc]++;
2209: row2proc_i[j] = proc; /* map row index to proc */
2210: }
2211: for (j=0; j<size; j++) {
2212: if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}
2213: }
2214: }
2216: nrqs = 0; /* no of outgoing messages */
2217: msz = 0; /* total mesg length (for all procs) */
2218: w1[rank] = 0; /* no mesg sent to self */
2219: w3[rank] = 0;
2220: for (i=0; i<size; i++) {
2221: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2222: }
2223: PetscMalloc1(nrqs+1,&pa); /*(proc -array)*/
2224: for (i=0,j=0; i<size; i++) {
2225: if (w1[i]) { pa[j] = i; j++; }
2226: }
2228: /* Each message would have a header = 1 + 2*(no of IS) + data */
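  /* Concretely, sbuf1[j][0] holds the number of ISs requesting rows from process j, followed by
     pairs (IS id, row count) at sbuf1[j][2*k-1], sbuf1[j][2*k]; the requested row indices start
     at offset 2*w3[j]+1 (see ptr[j] below). */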
2229: for (i=0; i<nrqs; i++) {
2230: j = pa[i];
2231: w1[j] += w2[j] + 2* w3[j];
2232: msz += w1[j];
2233: }
2234: PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);
2236: /* Determine the number of messages to expect, their lengths, and their from-ids */
2237: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
2238: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
2240: /* Now post the Irecvs corresponding to these messages */
2241: tag0 = ((PetscObject)C)->tag;
2242: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
2244: PetscFree(onodes1);
2245: PetscFree(olengths1);
2247: /* Allocate Memory for outgoing messages */
2248: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
2249: PetscMemzero(sbuf1,size*sizeof(PetscInt*));
2250: PetscMemzero(ptr,size*sizeof(PetscInt*));
2252: {
2253: PetscInt *iptr = tmp;
2254: k = 0;
2255: for (i=0; i<nrqs; i++) {
2256: j = pa[i];
2257: iptr += k;
2258: sbuf1[j] = iptr;
2259: k = w1[j];
2260: }
2261: }
2263: /* Form the outgoing messages. Initialize the header space */
2264: for (i=0; i<nrqs; i++) {
2265: j = pa[i];
2266: sbuf1[j][0] = 0;
2267: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
2268: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
2269: }
2271: /* Parse the isrow and copy data into outbuf */
2272: for (i=0; i<ismax; i++) {
2273: row2proc_i = row2proc[i];
2274: PetscMemzero(ctr,size*sizeof(PetscInt));
2275: irow_i = irow[i];
2276: jmax = nrow[i];
2277: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
2278: proc = row2proc_i[j];
2279: if (proc != rank) { /* copy to the outgoing buf */
2280: ctr[proc]++;
2281: *ptr[proc] = irow_i[j];
2282: ptr[proc]++;
2283: }
2284: }
2285: /* Update the headers for the current IS */
2286: for (j=0; j<size; j++) { /* Can Optimise this loop too */
2287: if ((ctr_j = ctr[j])) {
2288: sbuf1_j = sbuf1[j];
2289: k = ++sbuf1_j[0];
2290: sbuf1_j[2*k] = ctr_j;
2291: sbuf1_j[2*k-1] = i;
2292: }
2293: }
2294: }
2296: /* Now post the sends */
2297: PetscMalloc1(nrqs+1,&s_waits1);
2298: for (i=0; i<nrqs; ++i) {
2299: j = pa[i];
2300: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
2301: }
2303: /* Post Receives to capture the buffer size */
2304: PetscMalloc1(nrqs+1,&r_waits2);
2305: PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);
2306: rbuf2[0] = tmp + msz;
2307: for (i=1; i<nrqs; ++i) {
2308: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2309: }
2310: for (i=0; i<nrqs; ++i) {
2311: j = pa[i];
2312: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);
2313: }
2315: /* Send to other procs the buf size they should allocate */
2316: /* Receive messages*/
2317: PetscMalloc1(nrqr+1,&s_waits2);
2318: PetscMalloc1(nrqr+1,&r_status1);
2319: PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
2320: {
2321: PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2322: PetscInt *sbuf2_i;
2324: MPI_Waitall(nrqr,r_waits1,r_status1);
2325: for (i=0; i<nrqr; ++i) {
2326: req_size[i] = 0;
2327: rbuf1_i = rbuf1[i];
2328: start = 2*rbuf1_i[0] + 1;
2329: MPI_Get_count(r_status1+i,MPIU_INT,&end);
2330: PetscMalloc1(end+1,&sbuf2[i]);
2331: sbuf2_i = sbuf2[i];
2332: for (j=start; j<end; j++) {
2333: id = rbuf1_i[j] - rstart;
2334: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2335: sbuf2_i[j] = ncols;
2336: req_size[i] += ncols;
2337: }
2338: req_source1[i] = r_status1[i].MPI_SOURCE;
2339: /* form the header */
2340: sbuf2_i[0] = req_size[i];
2341: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2343: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
2344: }
2345: }
2346: PetscFree(r_status1);
2347: PetscFree(r_waits1);
2348: PetscFree4(w1,w2,w3,w4);
2350: /* Receive messages*/
2351: PetscMalloc1(nrqs+1,&r_waits3);
2352: PetscMalloc1(nrqs+1,&r_status2);
2354: MPI_Waitall(nrqs,r_waits2,r_status2);
2355: for (i=0; i<nrqs; ++i) {
2356: PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);
2357: req_source2[i] = r_status2[i].MPI_SOURCE;
2358: MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
2359: }
2360: PetscFree(r_status2);
2361: PetscFree(r_waits2);
2363: /* Wait on sends1 and sends2 */
2364: PetscMalloc1(nrqs+1,&s_status1);
2365: PetscMalloc1(nrqr+1,&s_status2);
2367: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
2368: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
2369: PetscFree(s_status1);
2370: PetscFree(s_status2);
2371: PetscFree(s_waits1);
2372: PetscFree(s_waits2);
2374: /* Now allocate sending buffers for a->j, and send them off */
2375: PetscMalloc1(nrqr+1,&sbuf_aj);
2376: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2377: PetscMalloc1(j+1,&sbuf_aj[0]);
2378: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2380: PetscMalloc1(nrqr+1,&s_waits3);
2381: {
2382: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2383: PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2384: PetscInt cend = C->cmap->rend;
2385: PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2387: for (i=0; i<nrqr; i++) {
2388: rbuf1_i = rbuf1[i];
2389: sbuf_aj_i = sbuf_aj[i];
2390: ct1 = 2*rbuf1_i[0] + 1;
2391: ct2 = 0;
2392: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2393: kmax = rbuf1[i][2*j];
2394: for (k=0; k<kmax; k++,ct1++) {
2395: row = rbuf1_i[ct1] - rstart;
2396: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
2397: ncols = nzA + nzB;
2398: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
2400: /* load the column indices for this row into cols */
2401: cols = sbuf_aj_i + ct2;
2403: lwrite = 0;
2404: for (l=0; l<nzB; l++) {
2405: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2406: }
2407: for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2408: for (l=0; l<nzB; l++) {
2409: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2410: }
2412: ct2 += ncols;
2413: }
2414: }
2415: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
2416: }
2417: }
2418: PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);
2420: /* create col map: global col of C -> local col of submatrices */
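  /* The stored value is the local column index plus one, so that 0 can mean "column not selected";
     lookups below undo the shift with tcol-1. */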
2421: {
2422: const PetscInt *icol_i;
2423: #if defined(PETSC_USE_CTABLE)
2424: for (i=0; i<ismax; i++) {
2425: if (!allcolumns[i]) {
2426: PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);
2428: jmax = ncol[i];
2429: icol_i = icol[i];
2430: cmap_i = cmap[i];
2431: for (j=0; j<jmax; j++) {
2432: PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);
2433: }
2434: } else cmap[i] = NULL;
2435: }
2436: #else
2437: for (i=0; i<ismax; i++) {
2438: if (!allcolumns[i]) {
2439: PetscCalloc1(C->cmap->N,&cmap[i]);
2440: jmax = ncol[i];
2441: icol_i = icol[i];
2442: cmap_i = cmap[i];
2443: for (j=0; j<jmax; j++) {
2444: cmap_i[icol_i[j]] = j+1;
2445: }
2446: } else cmap[i] = NULL;
2447: }
2448: #endif
2449: }
2451: /* Create lens which is required for MatCreate... */
2452: for (i=0,j=0; i<ismax; i++) j += nrow[i];
2453: PetscMalloc1(ismax,&lens);
2455: if (ismax) {
2456: PetscCalloc1(j,&lens[0]);
2457: }
2458: for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2460: /* Update lens from local data */
2461: for (i=0; i<ismax; i++) {
2462: row2proc_i = row2proc[i];
2463: jmax = nrow[i];
2464: if (!allcolumns[i]) cmap_i = cmap[i];
2465: irow_i = irow[i];
2466: lens_i = lens[i];
2467: for (j=0; j<jmax; j++) {
2468: row = irow_i[j];
2469: proc = row2proc_i[j];
2470: if (proc == rank) {
2471: MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);
2472: if (!allcolumns[i]) {
2473: for (k=0; k<ncols; k++) {
2474: #if defined(PETSC_USE_CTABLE)
2475: PetscTableFind(cmap_i,cols[k]+1,&tcol);
2476: #else
2477: tcol = cmap_i[cols[k]];
2478: #endif
2479: if (tcol) lens_i[j]++;
2480: }
2481: } else { /* allcolumns */
2482: lens_i[j] = ncols;
2483: }
2484: MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);
2485: }
2486: }
2487: }
2489: /* Create row map: global row of C -> local row of submatrices */
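  /* With PETSC_USE_CTABLE the stored value is the local row plus one (results of PetscTableFind()
     are decremented after lookup); the plain-array variant stores the local row directly. */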
2490: #if defined(PETSC_USE_CTABLE)
2491: for (i=0; i<ismax; i++) {
2492: PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);
2493: irow_i = irow[i];
2494: jmax = nrow[i];
2495: for (j=0; j<jmax; j++) {
2496: PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);
2497: }
2498: }
2499: #else
2500: for (i=0; i<ismax; i++) {
2501: PetscCalloc1(C->rmap->N,&rmap[i]);
2502: rmap_i = rmap[i];
2503: irow_i = irow[i];
2504: jmax = nrow[i];
2505: for (j=0; j<jmax; j++) {
2506: rmap_i[irow_i[j]] = j;
2507: }
2508: }
2509: #endif
2511: /* Update lens from off-proc data */
2512: {
2513: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2515: MPI_Waitall(nrqs,r_waits3,r_status3);
2516: for (tmp2=0; tmp2<nrqs; tmp2++) {
2517: sbuf1_i = sbuf1[pa[tmp2]];
2518: jmax = sbuf1_i[0];
2519: ct1 = 2*jmax+1;
2520: ct2 = 0;
2521: rbuf2_i = rbuf2[tmp2];
2522: rbuf3_i = rbuf3[tmp2];
2523: for (j=1; j<=jmax; j++) {
2524: is_no = sbuf1_i[2*j-1];
2525: max1 = sbuf1_i[2*j];
2526: lens_i = lens[is_no];
2527: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2528: rmap_i = rmap[is_no];
2529: for (k=0; k<max1; k++,ct1++) {
2530: #if defined(PETSC_USE_CTABLE)
2531: PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
2532: row--;
2533: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2534: #else
2535: row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new (sub)matrix */
2536: #endif
2537: max2 = rbuf2_i[ct1];
2538: for (l=0; l<max2; l++,ct2++) {
2539: if (!allcolumns[is_no]) {
2540: #if defined(PETSC_USE_CTABLE)
2541: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
2542: #else
2543: tcol = cmap_i[rbuf3_i[ct2]];
2544: #endif
2545: if (tcol) lens_i[row]++;
2546: } else { /* allcolumns */
2547: lens_i[row]++; /* lens_i[row] += max2 ? */
2548: }
2549: }
2550: }
2551: }
2552: }
2553: }
2554: PetscFree(r_waits3);
2555: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
2556: PetscFree2(r_status3,s_status3);
2557: PetscFree(s_waits3);
2559: /* Create the submatrices */
2560: for (i=0; i<ismax; i++) {
2561: PetscInt rbs,cbs;
2563: ISGetBlockSize(isrow[i],&rbs);
2564: ISGetBlockSize(iscol[i],&cbs);
2566: MatCreate(PETSC_COMM_SELF,submats+i);
2567: MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);
2569: MatSetBlockSizes(submats[i],rbs,cbs);
2570: MatSetType(submats[i],((PetscObject)A)->type_name);
2571: MatSeqAIJSetPreallocation(submats[i],0,lens[i]);
2573: /* create struct Mat_SubSppt and attach it to submat */
2574: PetscNew(&smat_i);
2575: subc = (Mat_SeqAIJ*)submats[i]->data;
2576: subc->submatis1 = smat_i;
2578: smat_i->destroy = submats[i]->ops->destroy;
2579: submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2580: submats[i]->factortype = C->factortype;
2582: smat_i->id = i;
2583: smat_i->nrqs = nrqs;
2584: smat_i->nrqr = nrqr;
2585: smat_i->rbuf1 = rbuf1;
2586: smat_i->rbuf2 = rbuf2;
2587: smat_i->rbuf3 = rbuf3;
2588: smat_i->sbuf2 = sbuf2;
2589: smat_i->req_source2 = req_source2;
2591: smat_i->sbuf1 = sbuf1;
2592: smat_i->ptr = ptr;
2593: smat_i->tmp = tmp;
2594: smat_i->ctr = ctr;
2596: smat_i->pa = pa;
2597: smat_i->req_size = req_size;
2598: smat_i->req_source1 = req_source1;
2600: smat_i->allcolumns = allcolumns[i];
2601: smat_i->singleis = PETSC_FALSE;
2602: smat_i->row2proc = row2proc[i];
2603: smat_i->rmap = rmap[i];
2604: smat_i->cmap = cmap[i];
2605: }
2607: if (!ismax) { /* Create a dummy submats[0] to carry the reuse struct Mat_SubSppt */
2608: MatCreate(PETSC_COMM_SELF,&submats[0]);
2609: MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);
2610: MatSetType(submats[0],MATDUMMY);
2612: /* create struct Mat_SubSppt and attach it to the dummy submat */
2613: PetscNewLog(submats[0],&smat_i);
2614: submats[0]->data = (void*)smat_i;
2616: smat_i->destroy = submats[0]->ops->destroy;
2617: submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2618: submats[0]->factortype = C->factortype;
2620: smat_i->id = 0;
2621: smat_i->nrqs = nrqs;
2622: smat_i->nrqr = nrqr;
2623: smat_i->rbuf1 = rbuf1;
2624: smat_i->rbuf2 = rbuf2;
2625: smat_i->rbuf3 = rbuf3;
2626: smat_i->sbuf2 = sbuf2;
2627: smat_i->req_source2 = req_source2;
2629: smat_i->sbuf1 = sbuf1;
2630: smat_i->ptr = ptr;
2631: smat_i->tmp = tmp;
2632: smat_i->ctr = ctr;
2634: smat_i->pa = pa;
2635: smat_i->req_size = req_size;
2636: smat_i->req_source1 = req_source1;
2638: smat_i->allcolumns = PETSC_FALSE;
2639: smat_i->singleis = PETSC_FALSE;
2640: smat_i->row2proc = NULL;
2641: smat_i->rmap = NULL;
2642: smat_i->cmap = NULL;
2643: }
2645: if (ismax) {PetscFree(lens[0]);}
2646: PetscFree(lens);
2647: PetscFree(sbuf_aj[0]);
2648: PetscFree(sbuf_aj);
2650: } /* end of scall == MAT_INITIAL_MATRIX */
2652: /* Post recv matrix values */
2653: PetscObjectGetNewTag((PetscObject)C,&tag4);
2654: PetscMalloc1(nrqs+1,&rbuf4);
2655: PetscMalloc1(nrqs+1,&r_waits4);
2656: PetscMalloc1(nrqs+1,&r_status4);
2657: PetscMalloc1(nrqr+1,&s_status4);
2658: for (i=0; i<nrqs; ++i) {
2659: PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);
2660: MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
2661: }
2663: /* Allocate sending buffers for a->a, and send them off */
2664: PetscMalloc1(nrqr+1,&sbuf_aa);
2665: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2666: PetscMalloc1(j+1,&sbuf_aa[0]);
2667: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2669: PetscMalloc1(nrqr+1,&s_waits4);
2670: {
2671: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2672: PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2673: PetscInt cend = C->cmap->rend;
2674: PetscInt *b_j = b->j;
2675: PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
2677: for (i=0; i<nrqr; i++) {
2678: rbuf1_i = rbuf1[i];
2679: sbuf_aa_i = sbuf_aa[i];
2680: ct1 = 2*rbuf1_i[0]+1;
2681: ct2 = 0;
2682: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2683: kmax = rbuf1_i[2*j];
2684: for (k=0; k<kmax; k++,ct1++) {
2685: row = rbuf1_i[ct1] - rstart;
2686: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
2687: ncols = nzA + nzB;
2688: cworkB = b_j + b_i[row];
2689: vworkA = a_a + a_i[row];
2690: vworkB = b_a + b_i[row];
2692: /* load the column values for this row into vals */
2693: vals = sbuf_aa_i+ct2;
2695: lwrite = 0;
2696: for (l=0; l<nzB; l++) {
2697: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2698: }
2699: for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2700: for (l=0; l<nzB; l++) {
2701: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2702: }
2704: ct2 += ncols;
2705: }
2706: }
2707: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
2708: }
2709: }
2711: /* Assemble the matrices */
2712: /* First assemble the local rows */
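  /* Locally owned rows are written straight into each submatrix's CSR arrays (subc->i/j/a) and
     subc->ilen is updated per row, bypassing MatSetValues(); in the allcolumns case ncol[i] equals
     C's global column count, so C's global column indices are valid submatrix column indices and
     are stored as-is. */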
2713: for (i=0; i<ismax; i++) {
2714: row2proc_i = row2proc[i];
2715: subc = (Mat_SeqAIJ*)submats[i]->data;
2716: imat_ilen = subc->ilen;
2717: imat_j = subc->j;
2718: imat_i = subc->i;
2719: imat_a = subc->a;
2721: if (!allcolumns[i]) cmap_i = cmap[i];
2722: rmap_i = rmap[i];
2723: irow_i = irow[i];
2724: jmax = nrow[i];
2725: for (j=0; j<jmax; j++) {
2726: row = irow_i[j];
2727: proc = row2proc_i[j];
2728: if (proc == rank) {
2729: old_row = row;
2730: #if defined(PETSC_USE_CTABLE)
2731: PetscTableFind(rmap_i,row+1,&row);
2732: row--;
2733: #else
2734: row = rmap_i[row];
2735: #endif
2736: ilen_row = imat_ilen[row];
2737: MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
2738: mat_i = imat_i[row];
2739: mat_a = imat_a + mat_i;
2740: mat_j = imat_j + mat_i;
2741: if (!allcolumns[i]) {
2742: for (k=0; k<ncols; k++) {
2743: #if defined(PETSC_USE_CTABLE)
2744: PetscTableFind(cmap_i,cols[k]+1,&tcol);
2745: #else
2746: tcol = cmap_i[cols[k]];
2747: #endif
2748: if (tcol) {
2749: *mat_j++ = tcol - 1;
2750: *mat_a++ = vals[k];
2751: ilen_row++;
2752: }
2753: }
2754: } else { /* allcolumns */
2755: for (k=0; k<ncols; k++) {
2756: *mat_j++ = cols[k]; /* global col index! */
2757: *mat_a++ = vals[k];
2758: ilen_row++;
2759: }
2760: }
2761: MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
2763: imat_ilen[row] = ilen_row;
2764: }
2765: }
2766: }
2768: /* Now assemble the off-proc rows */
2769: MPI_Waitall(nrqs,r_waits4,r_status4);
2770: for (tmp2=0; tmp2<nrqs; tmp2++) {
2771: sbuf1_i = sbuf1[pa[tmp2]];
2772: jmax = sbuf1_i[0];
2773: ct1 = 2*jmax + 1;
2774: ct2 = 0;
2775: rbuf2_i = rbuf2[tmp2];
2776: rbuf3_i = rbuf3[tmp2];
2777: rbuf4_i = rbuf4[tmp2];
2778: for (j=1; j<=jmax; j++) {
2779: is_no = sbuf1_i[2*j-1];
2780: rmap_i = rmap[is_no];
2781: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2782: subc = (Mat_SeqAIJ*)submats[is_no]->data;
2783: imat_ilen = subc->ilen;
2784: imat_j = subc->j;
2785: imat_i = subc->i;
2786: imat_a = subc->a;
2787: max1 = sbuf1_i[2*j];
2788: for (k=0; k<max1; k++,ct1++) {
2789: row = sbuf1_i[ct1];
2790: #if defined(PETSC_USE_CTABLE)
2791: PetscTableFind(rmap_i,row+1,&row);
2792: row--;
2793: #else
2794: row = rmap_i[row];
2795: #endif
2796: ilen = imat_ilen[row];
2797: mat_i = imat_i[row];
2798: mat_a = imat_a + mat_i;
2799: mat_j = imat_j + mat_i;
2800: max2 = rbuf2_i[ct1];
2801: if (!allcolumns[is_no]) {
2802: for (l=0; l<max2; l++,ct2++) {
2803: #if defined(PETSC_USE_CTABLE)
2804: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
2805: #else
2806: tcol = cmap_i[rbuf3_i[ct2]];
2807: #endif
2808: if (tcol) {
2809: *mat_j++ = tcol - 1;
2810: *mat_a++ = rbuf4_i[ct2];
2811: ilen++;
2812: }
2813: }
2814: } else { /* allcolumns */
2815: for (l=0; l<max2; l++,ct2++) {
2816: *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2817: *mat_a++ = rbuf4_i[ct2];
2818: ilen++;
2819: }
2820: }
2821: imat_ilen[row] = ilen;
2822: }
2823: }
2824: }
2826: if (!iscsorted) { /* sort column indices of the rows */
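  /* If any iscol was unsorted, columns were inserted above in IS order; sort each row's
     (column,value) pairs back into ascending column order before final assembly. */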
2827: for (i=0; i<ismax; i++) {
2828: subc = (Mat_SeqAIJ*)submats[i]->data;
2829: imat_j = subc->j;
2830: imat_i = subc->i;
2831: imat_a = subc->a;
2832: imat_ilen = subc->ilen;
2834: if (allcolumns[i]) continue;
2835: jmax = nrow[i];
2836: for (j=0; j<jmax; j++) {
2837: mat_i = imat_i[j];
2838: mat_a = imat_a + mat_i;
2839: mat_j = imat_j + mat_i;
2840: PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);
2841: }
2842: }
2843: }
2845: PetscFree(r_status4);
2846: PetscFree(r_waits4);
2847: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
2848: PetscFree(s_waits4);
2849: PetscFree(s_status4);
2851: /* Restore the indices */
2852: for (i=0; i<ismax; i++) {
2853: ISRestoreIndices(isrow[i],irow+i);
2854: if (!allcolumns[i]) {
2855: ISRestoreIndices(iscol[i],icol+i);
2856: }
2857: }
2859: for (i=0; i<ismax; i++) {
2860: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
2861: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
2862: }
2864: /* Destroy allocated memory */
2865: PetscFree(sbuf_aa[0]);
2866: PetscFree(sbuf_aa);
2867: PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);
2869: for (i=0; i<nrqs; ++i) {
2870: PetscFree(rbuf4[i]);
2871: }
2872: PetscFree(rbuf4);
2874: PetscFree4(row2proc,cmap,rmap,allcolumns);
2875: return(0);
2876: }
2878: /*
2879: Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2880: Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2881: of [0,m), the range of dcolemb is in [0,n), and the range of ocolemb is in [0,N-n).
2882: If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2883: After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2884: state, and needs to be "assembled" later by compressing B's column space.
2886: This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2887: Following this call, C->A & C->B have been created, even if empty.
2888: */
2889: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2890: {
2891: /* If making this function public, change the error returned in this function away from _PLIB. */
2893: Mat_MPIAIJ *aij;
2894: Mat_SeqAIJ *Baij;
2895: PetscBool seqaij,Bdisassembled;
2896: PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2897: PetscScalar v;
2898: const PetscInt *rowindices,*colindices;
2901: /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2902: if (A) {
2903: PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);
2904: if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2905: if (rowemb) {
2906: ISGetLocalSize(rowemb,&m);
2907: if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2908: } else {
2909: if (C->rmap->n != A->rmap->n) {
2910: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2911: }
2912: }
2913: if (dcolemb) {
2914: ISGetLocalSize(dcolemb,&n);
2915: if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2916: } else {
2917: if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2918: }
2919: }
2920: if (B) {
2921: PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);
2922: if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2923: if (rowemb) {
2924: ISGetLocalSize(rowemb,&m);
2925: if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2926: } else {
2927: if (C->rmap->n != B->rmap->n) {
2928: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2929: }
2930: }
2931: if (ocolemb) {
2932: ISGetLocalSize(ocolemb,&n);
2933: if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2934: } else {
2935: if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2936: }
2937: }
2939: aij = (Mat_MPIAIJ*)C->data;
2940: if (!aij->A) {
2941: /* Mimic parts of MatMPIAIJSetPreallocation() */
2942: MatCreate(PETSC_COMM_SELF,&aij->A);
2943: MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);
2944: MatSetBlockSizesFromMats(aij->A,C,C);
2945: MatSetType(aij->A,MATSEQAIJ);
2946: PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);
2947: }
2948: if (A) {
2949: MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);
2950: } else {
2951: MatSetUp(aij->A);
2952: }
2953: if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2954: /*
2955: If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2956: need to "disassemble" B -- convert it to using C's global indices.
2957: To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2959: If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2960: we MatZeroEntries(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2962: TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2963: At least avoid calling MatSetValues() and the implied searches?
2964: */
2966: if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2967: #if defined(PETSC_USE_CTABLE)
2968: PetscTableDestroy(&aij->colmap);
2969: #else
2970: PetscFree(aij->colmap);
2971: /* A bit of a HACK: ideally we should handle the aij->B case all in one code block below. */
2972: if (aij->B) {
2973: PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));
2974: }
2975: #endif
2976: ngcol = 0;
2977: if (aij->lvec) {
2978: VecGetSize(aij->lvec,&ngcol);
2979: }
2980: if (aij->garray) {
2981: PetscFree(aij->garray);
2982: PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));
2983: }
2984: VecDestroy(&aij->lvec);
2985: VecScatterDestroy(&aij->Mvctx);
2986: }
2987: if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2988: MatDestroy(&aij->B);
2989: }
2990: if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2991: MatZeroEntries(aij->B);
2992: }
2993: }
2994: Bdisassembled = PETSC_FALSE;
2995: if (!aij->B) {
2996: MatCreate(PETSC_COMM_SELF,&aij->B);
2997: PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);
2998: MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);
2999: MatSetBlockSizesFromMats(aij->B,B,B);
3000: MatSetType(aij->B,MATSEQAIJ);
3001: Bdisassembled = PETSC_TRUE;
3002: }
3003: if (B) {
3004: Baij = (Mat_SeqAIJ*)B->data;
3005: if (pattern == DIFFERENT_NONZERO_PATTERN) {
3006: PetscMalloc1(B->rmap->n,&nz);
3007: for (i=0; i<B->rmap->n; i++) {
3008: nz[i] = Baij->i[i+1] - Baij->i[i];
3009: }
3010: MatSeqAIJSetPreallocation(aij->B,0,nz);
3011: PetscFree(nz);
3012: }
3014: PetscLayoutGetRange(C->rmap,&rstart,&rend);
3015: shift = rend-rstart;
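  /* B's off-diagonal column indices live in [0,N-n); indices at or beyond rstart are shifted up by
     the size of the locally owned block (shift = rend-rstart, taken from C's row layout here) to
     recover global column indices, which is what the "disassembled" state of C expects. */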
3016: count = 0;
3017: rowindices = NULL;
3018: colindices = NULL;
3019: if (rowemb) {
3020: ISGetIndices(rowemb,&rowindices);
3021: }
3022: if (ocolemb) {
3023: ISGetIndices(ocolemb,&colindices);
3024: }
3025: for (i=0; i<B->rmap->n; i++) {
3026: PetscInt row;
3027: row = i;
3028: if (rowindices) row = rowindices[i];
3029: for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3030: col = Baij->j[count];
3031: if (colindices) col = colindices[col];
3032: if (Bdisassembled && col>=rstart) col += shift;
3033: v = Baij->a[count];
3034: MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);
3035: ++count;
3036: }
3037: }
3038: /* No assembly for aij->B is necessary. */
3039: /* FIXME: set aij->B's nonzerostate correctly. */
3040: } else {
3041: MatSetUp(aij->B);
3042: }
3043: C->preallocated = PETSC_TRUE;
3044: C->was_assembled = PETSC_FALSE;
3045: C->assembled = PETSC_FALSE;
3046: /*
3047: C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3048: Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3049: */
3050: return(0);
3051: }
3053: /*
3054: B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
3055: */
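/* For example, an entry stored at local column lc of the returned B corresponds to global column
   aij->garray[lc] of C. */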
3056: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3057: {
3058: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3063: /* FIXME: make sure C is assembled */
3064: *A = aij->A;
3065: *B = aij->B;
3066: /* Note that we don't incref *A and *B, so be careful! */
3067: return(0);
3068: }
3070: /*
3071: Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3072: NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3073: */
3074: PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3075: PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3076: PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3077: PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3078: PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3079: {
3081: PetscMPIInt isize,flag;
3082: PetscInt i,ii,cismax,ispar;
3083: Mat *A,*B;
3084: IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3087: if (!ismax) return(0);
3089: for (i = 0, cismax = 0; i < ismax; ++i) {
3090: PetscMPIInt isize;
3091: MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);
3092: if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3093: MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);
3094: if (isize > 1) ++cismax;
3095: }
3097: /*
3098: If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3099: ispar counts the number of parallel ISs across C's comm.
3100: */
3101: MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
3102: if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3103: (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);
3104: return(0);
3105: }
3107: /* if (ispar) */
3108: /*
3109: Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3110: These are used to extract the off-diag portion of the resulting parallel matrix.
3111: The row IS for the off-diag portion is the same as for the diag portion,
3112: so we merely alias the row IS (without incrementing its reference count), while skipping those that are sequential.
3113: */
3114: PetscMalloc2(cismax,&cisrow,cismax,&ciscol);
3115: PetscMalloc1(cismax,&ciscol_p);
3116: for (i = 0, ii = 0; i < ismax; ++i) {
3117: MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
3118: if (isize > 1) {
3119: /*
3120: TODO: This is the part that's ***NOT SCALABLE***.
3121: To fix this we need to extract just the indices of C's nonzero columns
3122: that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3123: part of iscol[i] -- without actually computing ciscol[ii]. This also has
3124: to be done without serializing on the IS list, so, most likely, it is best
3125: done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3126: */
3127: ISGetNonlocalIS(iscol[i],&(ciscol[ii]));
3128: /* Now we have to
3129: (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3130: were sorted on each rank, concatenated they might no longer be sorted;
3131: (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3132: indices in the nondecreasing order to the original index positions.
3133: If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3134: */
3135: ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);
3136: ISSort(ciscol[ii]);
3137: ++ii;
3138: }
3139: }
3140: PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);
3141: for (i = 0, ii = 0; i < ismax; ++i) {
3142: PetscInt j,issize;
3143: const PetscInt *indices;
3145: /*
3146: Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3147: */
3148: ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);
3149: ISSort(isrow[i]);
3150: ISGetLocalSize(isrow[i],&issize);
3151: ISGetIndices(isrow[i],&indices);
3152: for (j = 1; j < issize; ++j) {
3153: if (indices[j] == indices[j-1]) {
3154: SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3155: }
3156: }
3157: ISRestoreIndices(isrow[i],&indices);
3160: ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);
3161: ISSort(iscol[i]);
3162: ISGetLocalSize(iscol[i],&issize);
3163: ISGetIndices(iscol[i],&indices);
3164: for (j = 1; j < issize; ++j) {
3165: if (indices[j-1] == indices[j]) {
3166: SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3167: }
3168: }
3169: ISRestoreIndices(iscol[i],&indices);
3170: MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
3171: if (isize > 1) {
3172: cisrow[ii] = isrow[i];
3173: ++ii;
3174: }
3175: }
3176: /*
3177: Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3178: array of sequential matrices underlying the resulting parallel matrices.
3179: Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3180: contain duplicates.
3182: There are as many diag matrices as there are original index sets. There are only as many parallel
3183: and off-diag matrices as there are parallel (comm size > 1) index sets.
3185: ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3186: - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3187: and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3188: will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3189: with A[i] and B[ii] extracted from the corresponding MPI submat.
3190: - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3191: will have a different order from what getsubmats_seq expects. To handle this case -- indicated
3192: by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3193: (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3194: values into A[i] and B[ii] sitting inside the corresponding submat.
3195: - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3196: A[i], B[ii], AA[i] or BB[ii] matrices.
3197: */
3198: /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3199: if (scall == MAT_INITIAL_MATRIX) {
3200: PetscMalloc1(ismax,submat);
3201: }
3203: /* Now obtain the sequential A and B submatrices separately. */
3204: /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3205: (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);
3206: (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);
3208: /*
3209: If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3210: matrices A & B have been extracted directly into the parallel matrices containing them, or
3211: simply into the sequential matrix identical with the corresponding A (if isize == 1).
3212: Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3213: to have the same sparsity pattern.
3214: Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3215: must be constructed for C. This is done by setseqmat(s).
3216: */
3217: for (i = 0, ii = 0; i < ismax; ++i) {
3218: /*
3219: TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3220: That way we can avoid sorting and computing permutations when reusing.
3221: To this end:
3222: - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3223: - if caching arrays to hold the ISs, make and compose a container for them so that it can
3224: be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3225: */
3226: MatStructure pattern;
3227: pattern = DIFFERENT_NONZERO_PATTERN;
3229: MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
3230: /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3231: if (isize > 1) {
3232: if (scall == MAT_INITIAL_MATRIX) {
3233: MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);
3234: MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
3235: MatSetType((*submat)[i],MATMPIAIJ);
3236: PetscLayoutSetUp((*submat)[i]->rmap);
3237: PetscLayoutSetUp((*submat)[i]->cmap);
3238: }
3239: /*
3240: For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3241: */
3242: {
3243: Mat AA,BB;
3244: AA = A[i];
3245: BB = B[ii];
3246: if (AA || BB) {
3247: setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);
3248: MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);
3249: MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);
3250: }
3252: MatDestroy(&AA);
3253: }
3254: ISDestroy(ciscol+ii);
3255: ISDestroy(ciscol_p+ii);
3256: ++ii;
3257: } else { /* if (isize == 1) */
3258: if (scall == MAT_REUSE_MATRIX) {
3259: MatDestroy(&(*submat)[i]);
3260: }
3261: if (isrow_p[i] || iscol_p[i]) {
3262: MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);
3263: setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);
3264: /* Otherwise A is extracted straight into (*submat)[i]. */
3265: /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3266: MatDestroy(A+i);
3267: } else (*submat)[i] = A[i];
3268: }
3269: ISDestroy(&isrow_p[i]);
3270: ISDestroy(&iscol_p[i]);
3271: }
3272: PetscFree2(cisrow,ciscol);
3273: PetscFree2(isrow_p,iscol_p);
3274: PetscFree(ciscol_p);
3275: PetscFree(A);
3276: MatDestroySubMatrices(cismax,&B);
3277: return(0);
3278: }
3280: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3281: {
3285: MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);
3286: return(0);
3287: }