Actual source code: mpiov.c
petsc-3.7.3 2016-08-01
2: /*
3: Routines to compute overlapping regions of a parallel MPI matrix
4:   and to find submatrices that are shared across processors.
5: */
6: #include <../src/mat/impls/aij/seq/aij.h>
7: #include <../src/mat/impls/aij/mpi/mpiaij.h>
8: #include <petscbt.h>
9: #include <petscsf.h>
11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
13: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
14: extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
20: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
25: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
26: {
28: PetscInt i;
31: if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
32: for (i=0; i<ov; ++i) {
33: MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
34: }
35: return(0);
36: }
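/*
   A minimal usage sketch (not part of this file) of the public entry point that
   dispatches to the routine above for MPIAIJ matrices; the sizes nlocal/rstart and
   the stride IS below are illustrative assumptions:

     IS is[1];
     ISCreateStride(PETSC_COMM_SELF,nlocal,rstart,1,&is[0]);   (seed: locally owned rows)
     MatIncreaseOverlap(A,1,is,2);                             (grow each IS by 2 levels of overlap)
     ISDestroy(&is[0]);

   This is the pattern PCASM uses to build overlapping subdomains; each level of
   overlap is obtained by one call to MatIncreaseOverlap_MPIAIJ_Once above.
*/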
40: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
41: {
43: PetscInt i;
46: if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
47: for (i=0; i<ov; ++i) {
48: MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);
49: }
50: return(0);
51: }
56: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
57: {
58: PetscErrorCode ierr;
59: MPI_Comm comm;
60: PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
61: PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
62: PetscInt nrecvrows,*sbsizes = 0,*sbdata = 0;
63: const PetscInt *indices_i,**indices;
64: PetscLayout rmap;
65: PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom;
66: PetscSF sf;
67: PetscSFNode *remote;
70: PetscObjectGetComm((PetscObject)mat,&comm);
71: MPI_Comm_rank(comm,&rank);
72: MPI_Comm_size(comm,&size);
73: /* get row map to determine where rows should be going */
74: MatGetLayouts(mat,&rmap,PETSC_NULL);
75: /* retrieve IS data and put it all together so that we
76: * can optimize communication
77: * */
78: PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);
79: for (i=0,tlength=0; i<nidx; i++){
80: ISGetLocalSize(is[i],&length[i]);
81: tlength += length[i];
82: ISGetIndices(is[i],&indices[i]);
83: }
84: /* find these rows on remote processors */
85: PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);
86: PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);
87: nrrows = 0;
88: for (i=0; i<nidx; i++){
89: length_i = length[i];
90: indices_i = indices[i];
91: for (j=0; j<length_i; j++){
92: owner = -1;
93: PetscLayoutFindOwner(rmap,indices_i[j],&owner);
94: /* remote processors */
95: if (owner != rank){
96: tosizes_temp[owner]++; /* number of rows to owner */
97: rrow_ranks[nrrows] = owner; /* processor */
98: rrow_isids[nrrows] = i; /* is id */
99: remoterows[nrrows++] = indices_i[j]; /* row */
100: }
101: }
102: ISRestoreIndices(is[i],&indices[i]);
103: }
104: PetscFree2(indices,length);
105: /* test if we need to exchange messages
106: * generally speaking, we do not need to exchange
107: * data when overlap is 1
108: * */
109: MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);
110: /* we do not have any messages;
111: * this usually corresponds to the overlap-1 case
112: * */
113: if (!reducednrrows){
114: PetscFree3(toranks,tosizes,tosizes_temp);
115: PetscFree3(remoterows,rrow_ranks,rrow_isids);
116: MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
117: return(0);
118: }
119: nto = 0;
120: /* send sizes and ranks for building a two-sided communication */
121: for (i=0; i<size; i++){
122: if (tosizes_temp[i]){
123: tosizes[nto*2] = tosizes_temp[i]*2; /* size */
124: tosizes_temp[i] = nto; /* a map from processor to index */
125: toranks[nto++] = i; /* processor */
126: }
127: }
128: PetscCalloc1(nto+1,&toffsets);
129: for (i=0; i<nto; i++){
130: toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */
131: tosizes[2*i+1] = toffsets[i]; /* offsets to send */
132: }
133: /* send information to other processors */
134: PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);
135: nrecvrows = 0;
136: for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
137: PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);
138: nrecvrows = 0;
139: for (i=0; i<nfrom; i++){
140: for (j=0; j<fromsizes[2*i]; j++){
141: remote[nrecvrows].rank = fromranks[i];
142: remote[nrecvrows++].index = fromsizes[2*i+1]+j;
143: }
144: }
145: PetscSFCreate(comm,&sf);
146: PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
147: /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
148: PetscSFSetType(sf,PETSCSFBASIC);
149: PetscSFSetFromOptions(sf);
150: /* message pair <IS id, row> */
151: PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);
152: for (i=0; i<nrrows; i++){
153: owner = rrow_ranks[i]; /* processor */
154: j = tosizes_temp[owner]; /* index */
155: todata[toffsets[j]++] = rrow_isids[i];
156: todata[toffsets[j]++] = remoterows[i];
157: }
158: PetscFree3(toranks,tosizes,tosizes_temp);
159: PetscFree3(remoterows,rrow_ranks,rrow_isids);
160: PetscFree(toffsets);
161: PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);
162: PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);
163: PetscSFDestroy(&sf);
164: /* send rows belonging to the remote processors so that we can get the overlapping data back */
165: MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);
166: PetscFree2(todata,fromdata);
167: PetscFree(fromsizes);
168: PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);
169: PetscFree(fromranks);
170: nrecvrows = 0;
171: for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
172: PetscCalloc1(nrecvrows,&todata);
173: PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);
174: nrecvrows = 0;
175: for (i=0; i<nto; i++){
176: for (j=0; j<tosizes[2*i]; j++){
177: remote[nrecvrows].rank = toranks[i];
178: remote[nrecvrows++].index = tosizes[2*i+1]+j;
179: }
180: }
181: PetscSFCreate(comm,&sf);
182: PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
183: /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
184: PetscSFSetType(sf,PETSCSFBASIC);
185: PetscSFSetFromOptions(sf);
186: /* overlap communication and computation */
187: PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);
188: MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
189: PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);
190: PetscSFDestroy(&sf);
191: PetscFree2(sbdata,sbsizes);
192: MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);
193: PetscFree(toranks);
194: PetscFree(tosizes);
195: PetscFree(todata);
196: return(0);
197: }
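/*
   A sketch (with made-up numbers, not taken from this file) of the request buffer
   built above: todata is a concatenation, one block per destination rank, of
   <IS id, global row> pairs.  Suppose this rank needs remote rows 7 and 9 for is[0]
   and remote row 12 for is[2], all owned by rank 3.  Then for the block destined
   to rank 3:

     tosizes[2*j]   = 6                        (number of integers: 3 pairs)
     tosizes[2*j+1] = offset of the block in todata
     todata block   = { 0,7,  0,9,  2,12 }     (<IS id, row> pairs)

   PetscCommBuildTwoSided() tells rank 3 to expect those 6 integers; rank 3 replies
   through MatIncreaseOverlap_MPIAIJ_Send_Scalable with the column footprints of
   rows 7, 9 and 12, which MatIncreaseOverlap_MPIAIJ_Receive_Scalable merges back
   into is[].
*/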
201: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
202: {
203: PetscInt *isz,isz_i,i,j,is_id, data_size;
204: PetscInt col,lsize,max_lsize,*indices_temp, *indices_i;
205: const PetscInt *indices_i_temp;
206: PetscErrorCode ierr;
209: max_lsize = 0;
210: PetscMalloc(nidx*sizeof(PetscInt),&isz);
211: for (i=0; i<nidx; i++){
212: ISGetLocalSize(is[i],&lsize);
213: max_lsize = lsize>max_lsize ? lsize:max_lsize;
214: isz[i] = lsize;
215: }
216: PetscMalloc((max_lsize+nrecvs)*nidx*sizeof(PetscInt),&indices_temp);
217: for (i=0; i<nidx; i++){
218: ISGetIndices(is[i],&indices_i_temp);
219: PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);
220: ISRestoreIndices(is[i],&indices_i_temp);
221: ISDestroy(&is[i]);
222: }
223: /* retrieve information to get row id and its overlap */
224: for (i=0; i<nrecvs; ){
225: is_id = recvdata[i++];
226: data_size = recvdata[i++];
227: indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
228: isz_i = isz[is_id];
229: for (j=0; j< data_size; j++){
230: col = recvdata[i++];
231: indices_i[isz_i++] = col;
232: }
233: isz[is_id] = isz_i;
234: }
235: /* remove duplicate entries */
236: for (i=0; i<nidx; i++){
237: indices_i = indices_temp+(max_lsize+nrecvs)*i;
238: isz_i = isz[i];
239: PetscSortRemoveDupsInt(&isz_i,indices_i);
240: ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);
241: }
242: PetscFree(isz);
243: PetscFree(indices_temp);
244: return(0);
245: }
249: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
250: {
251: PetscLayout rmap,cmap;
252: PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
253: PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
254: PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
255: const PetscInt *gcols,*ai,*aj,*bi,*bj;
256: Mat amat,bmat;
257: PetscMPIInt rank;
258: PetscBool done;
259: MPI_Comm comm;
260: PetscErrorCode ierr;
263: PetscObjectGetComm((PetscObject)mat,&comm);
264: MPI_Comm_rank(comm,&rank);
265: MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
266: /* Even if the matrix is symmetric, we still treat it as nonsymmetric */
267: MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
268: if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
269: MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
270: if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
271: /* total number of nonzero values is used to estimate the memory usage in the next step */
272: tnz = ai[an]+bi[bn];
273: MatGetLayouts(mat,&rmap,&cmap);
274: PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);
275: PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);
276: /* to find the longest message */
277: max_fszs = 0;
278: for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
279: /* is there a better way to estimate the number of nonzeros in the matrix? */
280: PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);
281: for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
282: rows_pos = 0;
283: totalrows = 0;
284: for (i=0; i<nfrom; i++){
285: PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);
286: /* group data together */
287: for (j=0; j<fromsizes[2*i]; j+=2){
288: is_id = fromrows[rows_pos++];/* no of is */
289: rows_i = rows_data[is_id];
290: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
291: }
292: /* estimate a space to avoid multiple allocations */
293: for (j=0; j<nidx; j++){
294: indvc_ij = 0;
295: rows_i = rows_data[j];
296: for (l=0; l<rows_pos_i[j]; l++){
297: row = rows_i[l]-rstart;
298: start = ai[row];
299: end = ai[row+1];
300: for (k=start; k<end; k++){ /* Amat */
301: col = aj[k] + cstart;
302: indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
303: }
304: start = bi[row];
305: end = bi[row+1];
306: for (k=start; k<end; k++) { /* Bmat */
307: col = gcols[bj[k]];
308: indices_tmp[indvc_ij++] = col;
309: }
310: }
311: PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
312: indv_counts[i*nidx+j] = indvc_ij;
313: totalrows += indvc_ij;
314: }
315: }
316: /* message triple <IS id, number of rows, rows> */
317: PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);
318: totalrows = 0;
319: rows_pos = 0;
320: /* repeat the grouping pass above, this time packing the send buffer */
321: for (i=0;i<nfrom;i++){
322: PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);
323: for (j=0; j<fromsizes[2*i]; j+=2){
324: is_id = fromrows[rows_pos++];
325: rows_i = rows_data[is_id];
326: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
327: }
328: /* add data */
329: for (j=0; j<nidx; j++){
330: if (!indv_counts[i*nidx+j]) continue;
331: indvc_ij = 0;
332: sbdata[totalrows++] = j;
333: sbdata[totalrows++] = indv_counts[i*nidx+j];
334: sbsizes[2*i] += 2;
335: rows_i = rows_data[j];
336: for (l=0; l<rows_pos_i[j]; l++){
337: row = rows_i[l]-rstart;
338: start = ai[row];
339: end = ai[row+1];
340: for (k=start; k<end; k++){ /* Amat */
341: col = aj[k] + cstart;
342: indices_tmp[indvc_ij++] = col;
343: }
344: start = bi[row];
345: end = bi[row+1];
346: for (k=start; k<end; k++) { /* Bmat */
347: col = gcols[bj[k]];
348: indices_tmp[indvc_ij++] = col;
349: }
350: }
351: PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
352: sbsizes[2*i] += indvc_ij;
353: PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);
354: totalrows += indvc_ij;
355: }
356: }
357: PetscCalloc1(nfrom+1,&offsets);
358: for (i=0; i<nfrom; i++){
359: offsets[i+1] = offsets[i] + sbsizes[2*i];
360: sbsizes[2*i+1] = offsets[i];
361: }
362: PetscFree(offsets);
363: if (sbrowsizes) *sbrowsizes = sbsizes;
364: if (sbrows) *sbrows = sbdata;
365: PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);
366: MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
367: MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
368: return(0);
369: }
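/*
   A sketch (made-up numbers) of the reply buffer assembled above: for each
   requesting rank i, sbdata holds one block per index set that has data, in the
   form <IS id, number of columns, sorted column indices>.  Continuing the request
   example sketched earlier: if the owned rows 7 and 9 (asked for by is[0]) have
   the combined global column footprint {6,7,8,9,20} and row 12 (asked for by
   is[2]) has {12,30}, the block for that requester is

     sbdata block   = { 0,5, 6,7,8,9,20,   2,2, 12,30 }
     sbsizes[2*i]   = 11                   (2 header entries + 5 columns, plus 2 + 2)
     sbsizes[2*i+1] = offset of this block within sbdata

   These <size, offset> pairs are what the second PetscCommBuildTwoSided() call in
   MatIncreaseOverlap_MPIAIJ_Once_Scalable exchanges before the data itself is sent.
*/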
373: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
374: {
375: const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
376: PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
377: PetscInt lsize,lsize_tmp,owner;
378: PetscMPIInt rank;
379: Mat amat,bmat;
380: PetscBool done;
381: PetscLayout cmap,rmap;
382: MPI_Comm comm;
383: PetscErrorCode ierr;
386: PetscObjectGetComm((PetscObject)mat,&comm);
387: MPI_Comm_rank(comm,&rank);
388: MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
389: MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
390: if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
391: MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
392: if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
393: /* is this a safe way to compute the number of nonzero values? */
394: tnz = ai[an]+bi[bn];
395: MatGetLayouts(mat,&rmap,&cmap);
396: PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);
397: PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);
398: /* this is a better way to estimate memory than the old implementation,
399: * which used the global size of the matrix
400: * */
401: PetscMalloc(sizeof(PetscInt)*tnz,&indices_temp);
402: for (i=0; i<nidx; i++) {
403: ISGetLocalSize(is[i],&lsize);
404: ISGetIndices(is[i],&indices);
405: lsize_tmp = 0;
406: for (j=0; j<lsize; j++) {
407: owner = -1;
408: row = indices[j];
409: PetscLayoutFindOwner(rmap,row,&owner);
410: if (owner != rank) continue;
411: /* local number */
412: row -= rstart;
413: start = ai[row];
414: end = ai[row+1];
415: for (k=start; k<end; k++) { /* Amat */
416: col = aj[k] + cstart;
417: indices_temp[lsize_tmp++] = col;
418: }
419: start = bi[row];
420: end = bi[row+1];
421: for (k=start; k<end; k++) { /* Bmat */
422: col = gcols[bj[k]];
423: indices_temp[lsize_tmp++] = col;
424: }
425: }
426: ISRestoreIndices(is[i],&indices);
427: ISDestroy(&is[i]);
428: PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);
429: ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);
430: }
431: PetscFree(indices_temp);
432: MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
433: MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
434: return(0);
435: }
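/*
   The key step above is recovering global column indices from the two local
   blocks of the MPIAIJ matrix.  A compressed sketch of the pattern, using the same
   variable names as the routine (ai/aj and bi/bj are the CSR arrays of the
   diagonal block amat and the off-diagonal block bmat; gcols maps bmat's
   compressed columns back to global columns):

     for (k=ai[row]; k<ai[row+1]; k++) col = aj[k] + cstart;    (diagonal block)
     for (k=bi[row]; k<bi[row+1]; k++) col = gcols[bj[k]];      (off-diagonal block)

   After PetscSortRemoveDupsInt() each is[i] therefore contains the sorted global
   column footprint of its locally owned rows, i.e. one more level of overlap.
*/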
438: /*
439:   Sample message format:
440:   If a processor A wants processor B to process some elements corresponding
441:   to index sets is[1] and is[5]
442:   mesg [0] = 2   (no of index sets in the mesg)
443:   -----------
444:   mesg [1] = 1   => is[1]
445:   mesg [2] = sizeof(is[1]);
446:   -----------
447:   mesg [3] = 5   => is[5]
448:   mesg [4] = sizeof(is[5]);
449:   -----------
450:   mesg [5]
451:   mesg [n]       data(is[1])
452:   -----------
453:   mesg[n+1]
454:   mesg[m]        data(is[5])
455:   -----------
457:   Notes:
458:   nrqs - no of requests sent (or to be sent out)
459:   nrqr - no of requests received (which have to be, or which have been, processed)
460: */
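/*
   A concrete instance of the format above (made-up numbers): if processor A asks
   processor B to handle rows 3 and 4 of is[1] and row 10 of is[5], the outgoing
   message is

     mesg[] = { 2,  1,2,  5,1,  3,4,10 }

   where 2 is the number of index sets, (1,2) and (5,1) are the <IS id, row count>
   header pairs, and 3, 4, 10 are the row indices grouped by IS.  The header is
   thus 1 + 2*(no of IS) entries long, which matches the sizing code below
   (w1[j] += w2[j] + 2*w3[j]).
*/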
463: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
464: {
465: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
466: PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
467: const PetscInt **idx,*idx_i;
468: PetscInt *n,**data,len;
470: PetscMPIInt size,rank,tag1,tag2;
471: PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
472: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
473: PetscBT *table;
474: MPI_Comm comm;
475: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
476: MPI_Status *s_status,*recv_status;
477: char *t_p;
480: PetscObjectGetComm((PetscObject)C,&comm);
481: size = c->size;
482: rank = c->rank;
483: M = C->rmap->N;
485: PetscObjectGetNewTag((PetscObject)C,&tag1);
486: PetscObjectGetNewTag((PetscObject)C,&tag2);
488: PetscMalloc2(imax,&idx,imax,&n);
490: for (i=0; i<imax; i++) {
491: ISGetIndices(is[i],&idx[i]);
492: ISGetLocalSize(is[i],&n[i]);
493: }
495: /* evaluate communication - mesg to who, length of mesg, and buffer space
496: required. Based on this, buffers are allocated, and data copied into them */
497: PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
498: PetscMemzero(w1,size*sizeof(PetscMPIInt)); /* initialise work vector*/
499: PetscMemzero(w2,size*sizeof(PetscMPIInt)); /* initialise work vector*/
500: PetscMemzero(w3,size*sizeof(PetscMPIInt)); /* initialise work vector*/
501: for (i=0; i<imax; i++) {
502: PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialise work vector*/
503: idx_i = idx[i];
504: len = n[i];
505: for (j=0; j<len; j++) {
506: row = idx_i[j];
507: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
508: PetscLayoutFindOwner(C->rmap,row,&proc);
509: w4[proc]++;
510: }
511: for (j=0; j<size; j++) {
512: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
513: }
514: }
516: nrqs = 0; /* no of outgoing messages */
517: msz = 0; /* total mesg length (for all procs) */
518: w1[rank] = 0; /* no mesg sent to itself */
519: w3[rank] = 0;
520: for (i=0; i<size; i++) {
521: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
522: }
523: /* pa - is list of processors to communicate with */
524: PetscMalloc1(nrqs+1,&pa);
525: for (i=0,j=0; i<size; i++) {
526: if (w1[i]) {pa[j] = i; j++;}
527: }
529: /* Each message would have a header = 1 + 2*(no of IS) + data */
530: for (i=0; i<nrqs; i++) {
531: j = pa[i];
532: w1[j] += w2[j] + 2*w3[j];
533: msz += w1[j];
534: }
536: /* Determine the number of messages to expect, their lengths, and the from-ids */
537: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
538: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
540: /* Now post the Irecvs corresponding to these messages */
541: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
543: /* Allocate Memory for outgoing messages */
544: PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
545: PetscMemzero(outdat,size*sizeof(PetscInt*));
546: PetscMemzero(ptr,size*sizeof(PetscInt*));
548: {
549: PetscInt *iptr = tmp,ict = 0;
550: for (i=0; i<nrqs; i++) {
551: j = pa[i];
552: iptr += ict;
553: outdat[j] = iptr;
554: ict = w1[j];
555: }
556: }
558: /* Form the outgoing messages */
559: /* plug in the headers */
560: for (i=0; i<nrqs; i++) {
561: j = pa[i];
562: outdat[j][0] = 0;
563: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
564: ptr[j] = outdat[j] + 2*w3[j] + 1;
565: }
567: /* Memory for doing local proc's work */
568: {
569: PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);
571: for (i=0; i<imax; i++) {
572: table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
573: data[i] = d_p + M*i;
574: }
575: }
577: /* Parse the IS and update local tables and the outgoing buf with the data */
578: {
579: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
580: PetscBT table_i;
582: for (i=0; i<imax; i++) {
583: PetscMemzero(ctr,size*sizeof(PetscInt));
584: n_i = n[i];
585: table_i = table[i];
586: idx_i = idx[i];
587: data_i = data[i];
588: isz_i = isz[i];
589: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
590: row = idx_i[j];
591: PetscLayoutFindOwner(C->rmap,row,&proc);
592: if (proc != rank) { /* copy to the outgoing buffer */
593: ctr[proc]++;
594: *ptr[proc] = row;
595: ptr[proc]++;
596: } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
597: }
598: /* Update the headers for the current IS */
599: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
600: if ((ctr_j = ctr[j])) {
601: outdat_j = outdat[j];
602: k = ++outdat_j[0];
603: outdat_j[2*k] = ctr_j;
604: outdat_j[2*k-1] = i;
605: }
606: }
607: isz[i] = isz_i;
608: }
609: }
611: /* Now post the sends */
612: PetscMalloc1(nrqs+1,&s_waits1);
613: for (i=0; i<nrqs; ++i) {
614: j = pa[i];
615: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
616: }
618: /* No longer need the original indices */
619: for (i=0; i<imax; ++i) {
620: ISRestoreIndices(is[i],idx+i);
621: }
622: PetscFree2(idx,n);
624: for (i=0; i<imax; ++i) {
625: ISDestroy(&is[i]);
626: }
628: /* Do Local work */
629: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);
631: /* Receive messages */
632: PetscMalloc1(nrqr+1,&recv_status);
633: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
635: PetscMalloc1(nrqs+1,&s_status);
636: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
638: /* Phase 1 sends are complete - deallocate buffers */
639: PetscFree4(outdat,ptr,tmp,ctr);
640: PetscFree4(w1,w2,w3,w4);
642: PetscMalloc1(nrqr+1,&xdata);
643: PetscMalloc1(nrqr+1,&isz1);
644: MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
645: PetscFree(rbuf[0]);
646: PetscFree(rbuf);
649: /* Send the data back */
650: /* Do a global reduction to know the buffer space req for incoming messages */
651: {
652: PetscMPIInt *rw1;
654: PetscCalloc1(size,&rw1);
656: for (i=0; i<nrqr; ++i) {
657: proc = recv_status[i].MPI_SOURCE;
659: if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
660: rw1[proc] = isz1[i];
661: }
662: PetscFree(onodes1);
663: PetscFree(olengths1);
665: /* Determine the number of messages to expect, their lengths, and the from-ids */
666: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
667: PetscFree(rw1);
668: }
669: /* Now post the Irecvs corresponding to these messages */
670: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
672: /* Now post the sends */
673: PetscMalloc1(nrqr+1,&s_waits2);
674: for (i=0; i<nrqr; ++i) {
675: j = recv_status[i].MPI_SOURCE;
676: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
677: }
679: /* receive work done on other processors */
680: {
681: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
682: PetscMPIInt idex;
683: PetscBT table_i;
684: MPI_Status *status2;
686: PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);
687: for (i=0; i<nrqs; ++i) {
688: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
689: /* Process the message */
690: rbuf2_i = rbuf2[idex];
691: ct1 = 2*rbuf2_i[0]+1;
692: jmax = rbuf2[idex][0];
693: for (j=1; j<=jmax; j++) {
694: max = rbuf2_i[2*j];
695: is_no = rbuf2_i[2*j-1];
696: isz_i = isz[is_no];
697: data_i = data[is_no];
698: table_i = table[is_no];
699: for (k=0; k<max; k++,ct1++) {
700: row = rbuf2_i[ct1];
701: if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
702: }
703: isz[is_no] = isz_i;
704: }
705: }
707: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
708: PetscFree(status2);
709: }
711: for (i=0; i<imax; ++i) {
712: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);
713: }
715: PetscFree(onodes2);
716: PetscFree(olengths2);
718: PetscFree(pa);
719: PetscFree(rbuf2[0]);
720: PetscFree(rbuf2);
721: PetscFree(s_waits1);
722: PetscFree(r_waits1);
723: PetscFree(s_waits2);
724: PetscFree(r_waits2);
725: PetscFree5(table,data,isz,d_p,t_p);
726: PetscFree(s_status);
727: PetscFree(recv_status);
728: PetscFree(xdata[0]);
729: PetscFree(xdata);
730: PetscFree(isz1);
731: return(0);
732: }
736: /*
737: MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
738: the work on the local processor.
740: Inputs:
741: C - MAT_MPIAIJ;
742: imax - total no of index sets processed at a time;
743: table - an array of char - size = m bits.
745: Output:
746: isz - array containing the count of the solution elements corresponding
747: to each index set;
748: data - pointer to the solutions
749: */
750: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
751: {
752: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
753: Mat A = c->A,B = c->B;
754: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
755: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
756: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
757: PetscBT table_i;
760: rstart = C->rmap->rstart;
761: cstart = C->cmap->rstart;
762: ai = a->i;
763: aj = a->j;
764: bi = b->i;
765: bj = b->j;
766: garray = c->garray;
769: for (i=0; i<imax; i++) {
770: data_i = data[i];
771: table_i = table[i];
772: isz_i = isz[i];
773: for (j=0,max=isz[i]; j<max; j++) {
774: row = data_i[j] - rstart;
775: start = ai[row];
776: end = ai[row+1];
777: for (k=start; k<end; k++) { /* Amat */
778: val = aj[k] + cstart;
779: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
780: }
781: start = bi[row];
782: end = bi[row+1];
783: for (k=start; k<end; k++) { /* Bmat */
784: val = garray[bj[k]];
785: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
786: }
787: }
788: isz[i] = isz_i;
789: }
790: return(0);
791: }
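/*
   The dedup idiom above relies on PetscBTLookupSet(table,val), which sets bit val
   and returns its previous state, so a value is appended to data_i at most once.
   A compressed sketch of the kernel (same names as above):

     for each seed row r already in data_i:          row = r - rstart
       columns of the diagonal block:      val = aj[k] + cstart
       columns of the off-diagonal block:  val = garray[bj[k]]
       if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;

   Note that data_i grows in place while it is scanned, but only the entries that
   were present on entry are expanded, because the loop bound max is fixed at the
   old isz[i].
*/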
795: /*
796: MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
797: and return the output
799: Input:
800: C - the matrix
801: nrqr - no of messages being processed.
802: rbuf - an array of pointers to the received requests
804: Output:
805: xdata - array of messages to be sent back
806: isz1 - size of each message
808: For better efficiency perhaps we should malloc each xdata[i] separately;
809: then if a realloc is required we would only need to copy the data for that one row
810: rather than all previous rows, as happens now where a single large chunk of
811: memory is used.
813: */
814: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
815: {
816: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
817: Mat A = c->A,B = c->B;
818: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
820: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
821: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
822: PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
823: PetscInt *rbuf_i,kmax,rbuf_0;
824: PetscBT xtable;
827: m = C->rmap->N;
828: rstart = C->rmap->rstart;
829: cstart = C->cmap->rstart;
830: ai = a->i;
831: aj = a->j;
832: bi = b->i;
833: bj = b->j;
834: garray = c->garray;
837: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
838: rbuf_i = rbuf[i];
839: rbuf_0 = rbuf_i[0];
840: ct += rbuf_0;
841: for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
842: }
844: if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
845: else max1 = 1;
846: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
847: PetscMalloc1(mem_estimate,&xdata[0]);
848: ++no_malloc;
849: PetscBTCreate(m,&xtable);
850: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
852: ct3 = 0;
853: for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
854: rbuf_i = rbuf[i];
855: rbuf_0 = rbuf_i[0];
856: ct1 = 2*rbuf_0+1;
857: ct2 = ct1;
858: ct3 += ct1;
859: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
860: PetscBTMemzero(m,xtable);
861: oct2 = ct2;
862: kmax = rbuf_i[2*j];
863: for (k=0; k<kmax; k++,ct1++) {
864: row = rbuf_i[ct1];
865: if (!PetscBTLookupSet(xtable,row)) {
866: if (!(ct3 < mem_estimate)) {
867: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
868: PetscMalloc1(new_estimate,&tmp);
869: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
870: PetscFree(xdata[0]);
871: xdata[0] = tmp;
872: mem_estimate = new_estimate; ++no_malloc;
873: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
874: }
875: xdata[i][ct2++] = row;
876: ct3++;
877: }
878: }
879: for (k=oct2,max2=ct2; k<max2; k++) {
880: row = xdata[i][k] - rstart;
881: start = ai[row];
882: end = ai[row+1];
883: for (l=start; l<end; l++) {
884: val = aj[l] + cstart;
885: if (!PetscBTLookupSet(xtable,val)) {
886: if (!(ct3 < mem_estimate)) {
887: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
888: PetscMalloc1(new_estimate,&tmp);
889: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
890: PetscFree(xdata[0]);
891: xdata[0] = tmp;
892: mem_estimate = new_estimate; ++no_malloc;
893: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
894: }
895: xdata[i][ct2++] = val;
896: ct3++;
897: }
898: }
899: start = bi[row];
900: end = bi[row+1];
901: for (l=start; l<end; l++) {
902: val = garray[bj[l]];
903: if (!PetscBTLookupSet(xtable,val)) {
904: if (!(ct3 < mem_estimate)) {
905: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
906: PetscMalloc1(new_estimate,&tmp);
907: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
908: PetscFree(xdata[0]);
909: xdata[0] = tmp;
910: mem_estimate = new_estimate; ++no_malloc;
911: for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
912: }
913: xdata[i][ct2++] = val;
914: ct3++;
915: }
916: }
917: }
918: /* Update the header*/
919: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
920: xdata[i][2*j-1] = rbuf_i[2*j-1];
921: }
922: xdata[i][0] = rbuf_0;
923: xdata[i+1] = xdata[i] + ct2;
924: isz1[i] = ct2; /* size of each message */
925: }
926: PetscBTDestroy(&xtable);
927: PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
928: return(0);
929: }
930: /* -------------------------------------------------------------------------*/
931: extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
932: extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
933: /*
934: Every processor gets the entire matrix
935: */
938: PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
939: {
940: Mat B;
941: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
942: Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
944: PetscMPIInt size,rank,*recvcounts = 0,*displs = 0;
945: PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
946: PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
947: MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
950: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
951: MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);
953: if (scall == MAT_INITIAL_MATRIX) {
954: /* ----------------------------------------------------------------
955: Tell every processor the number of nonzeros per row
956: */
957: PetscMalloc1(A->rmap->N,&lens);
958: for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
959: lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
960: }
961: PetscMalloc2(size,&recvcounts,size,&displs);
962: for (i=0; i<size; i++) {
963: recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
964: displs[i] = A->rmap->range[i];
965: }
966: #if defined(PETSC_HAVE_MPI_IN_PLACE)
967: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
968: #else
969: sendcount = A->rmap->rend - A->rmap->rstart;
970: MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
971: #endif
972: /* ---------------------------------------------------------------
973: Create the sequential matrix of the same type as the local block diagonal
974: */
975: MatCreate(PETSC_COMM_SELF,&B);
976: MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);
977: MatSetBlockSizesFromMats(B,A,A);
978: MatSetType(B,((PetscObject)a->A)->type_name);
979: MatSeqAIJSetPreallocation(B,0,lens);
980: PetscMalloc1(1,Bin);
981: **Bin = B;
982: b = (Mat_SeqAIJ*)B->data;
984: /*--------------------------------------------------------------------
985: Copy my part of matrix column indices over
986: */
987: sendcount = ad->nz + bd->nz;
988: jsendbuf = b->j + b->i[rstarts[rank]];
989: a_jsendbuf = ad->j;
990: b_jsendbuf = bd->j;
991: n = A->rmap->rend - A->rmap->rstart;
992: cnt = 0;
993: for (i=0; i<n; i++) {
995: /* put in lower diagonal portion */
996: m = bd->i[i+1] - bd->i[i];
997: while (m > 0) {
998: /* is it above diagonal (in bd (compressed) numbering) */
999: if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1000: jsendbuf[cnt++] = garray[*b_jsendbuf++];
1001: m--;
1002: }
1004: /* put in diagonal portion */
1005: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1006: jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1007: }
1009: /* put in upper diagonal portion */
1010: while (m-- > 0) {
1011: jsendbuf[cnt++] = garray[*b_jsendbuf++];
1012: }
1013: }
1014: if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1016: /*--------------------------------------------------------------------
1017: Gather all column indices to all processors
1018: */
1019: for (i=0; i<size; i++) {
1020: recvcounts[i] = 0;
1021: for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1022: recvcounts[i] += lens[j];
1023: }
1024: }
1025: displs[0] = 0;
1026: for (i=1; i<size; i++) {
1027: displs[i] = displs[i-1] + recvcounts[i-1];
1028: }
1029: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1030: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1031: #else
1032: MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1033: #endif
1034: /*--------------------------------------------------------------------
1035: Assemble the matrix into usable form (note numerical values not yet set)
1036: */
1037: /* set the b->ilen (length of each row) values */
1038: PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));
1039: /* set the b->i indices */
1040: b->i[0] = 0;
1041: for (i=1; i<=A->rmap->N; i++) {
1042: b->i[i] = b->i[i-1] + lens[i-1];
1043: }
1044: PetscFree(lens);
1045: MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
1046: MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
1048: } else {
1049: B = **Bin;
1050: b = (Mat_SeqAIJ*)B->data;
1051: }
1053: /*--------------------------------------------------------------------
1054: Copy my part of matrix numerical values into the values location
1055: */
1056: if (flag == MAT_GET_VALUES) {
1057: sendcount = ad->nz + bd->nz;
1058: sendbuf = b->a + b->i[rstarts[rank]];
1059: a_sendbuf = ad->a;
1060: b_sendbuf = bd->a;
1061: b_sendj = bd->j;
1062: n = A->rmap->rend - A->rmap->rstart;
1063: cnt = 0;
1064: for (i=0; i<n; i++) {
1066: /* put in lower diagonal portion */
1067: m = bd->i[i+1] - bd->i[i];
1068: while (m > 0) {
1069: /* is it above diagonal (in bd (compressed) numbering) */
1070: if (garray[*b_sendj] > A->rmap->rstart + i) break;
1071: sendbuf[cnt++] = *b_sendbuf++;
1072: m--;
1073: b_sendj++;
1074: }
1076: /* put in diagonal portion */
1077: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1078: sendbuf[cnt++] = *a_sendbuf++;
1079: }
1081: /* put in upper diagonal portion */
1082: while (m-- > 0) {
1083: sendbuf[cnt++] = *b_sendbuf++;
1084: b_sendj++;
1085: }
1086: }
1087: if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1089: /* -----------------------------------------------------------------
1090: Gather all numerical values to all processors
1091: */
1092: if (!recvcounts) {
1093: PetscMalloc2(size,&recvcounts,size,&displs);
1094: }
1095: for (i=0; i<size; i++) {
1096: recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1097: }
1098: displs[0] = 0;
1099: for (i=1; i<size; i++) {
1100: displs[i] = displs[i-1] + recvcounts[i-1];
1101: }
1102: recvbuf = b->a;
1103: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1104: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1105: #else
1106: MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1107: #endif
1108: } /* endof (flag == MAT_GET_VALUES) */
1109: PetscFree2(recvcounts,displs);
1111: if (A->symmetric) {
1112: MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);
1113: } else if (A->hermitian) {
1114: MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);
1115: } else if (A->structurally_symmetric) {
1116: MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);
1117: }
1118: return(0);
1119: }
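/*
   The gathers above (row lengths, column indices, and numerical values) use the
   in-place variant of MPI_Allgatherv: every rank owns a slice of a globally
   replicated array, fills only its own slice, and receives all other slices in
   place.  A minimal sketch of the pattern (recvcounts/displs per rank come from
   A->rmap->range exactly as computed above; lens stands for any of the gathered
   arrays):

     MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,
                    lens,recvcounts,displs,MPIU_INT,comm);

   When MPI_IN_PLACE is not available the code falls back to passing the local
   slice (lens+rstart, sendcount) explicitly, which is what the #else branches do.
*/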
1125: PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1126: {
1128: PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
1129: PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
1133: /*
1134: Check for special case: each processor gets entire matrix
1135: */
1136: if (ismax == 1 && C->rmap->N == C->cmap->N) {
1137: ISIdentity(*isrow,&rowflag);
1138: ISIdentity(*iscol,&colflag);
1139: ISGetLocalSize(*isrow,&nrow);
1140: ISGetLocalSize(*iscol,&ncol);
1141: if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1142: wantallmatrix = PETSC_TRUE;
1144: PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);
1145: }
1146: }
1147: MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));
1148: if (twantallmatrix) {
1149: MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);
1150: return(0);
1151: }
1153: /* Allocate memory to hold all the submatrices */
1154: if (scall != MAT_REUSE_MATRIX) {
1155: PetscMalloc1(ismax+1,submat);
1156: }
1158: /* Check for special case: each processor gets entire matrix columns */
1159: PetscMalloc1(ismax+1,&allcolumns);
1160: for (i=0; i<ismax; i++) {
1161: ISIdentity(iscol[i],&colflag);
1162: ISGetLocalSize(iscol[i],&ncol);
1163: if (colflag && ncol == C->cmap->N) {
1164: allcolumns[i] = PETSC_TRUE;
1165: } else {
1166: allcolumns[i] = PETSC_FALSE;
1167: }
1168: }
1170: /* Determine the number of stages through which submatrices are done */
1171: nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1173: /*
1174: Each stage will extract nmax submatrices.
1175: nmax is determined by the matrix column dimension.
1176: If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1177: */
1178: if (!nmax) nmax = 1;
1179: nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
1181: /* Make sure every processor loops through the nstages */
1182: MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
1184: for (i=0,pos=0; i<nstages; i++) {
1185: if (pos+nmax <= ismax) max_no = nmax;
1186: else if (pos == ismax) max_no = 0;
1187: else max_no = ismax-pos;
1188: MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);
1189: pos += max_no;
1190: }
1192: PetscFree(allcolumns);
1193: return(0);
1194: }
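/*
   A minimal usage sketch (not part of this file) of the public interface whose
   MPIAIJ implementation is above; the row/column arrays (rows, cols) and sizes
   (nr, nc) are illustrative assumptions:

     IS  rowis,colis;
     Mat *submat;
     ISCreateGeneral(PETSC_COMM_SELF,nr,rows,PETSC_COPY_VALUES,&rowis);
     ISCreateGeneral(PETSC_COMM_SELF,nc,cols,PETSC_COPY_VALUES,&colis);
     MatGetSubMatrices(C,1,&rowis,&colis,MAT_INITIAL_MATRIX,&submat);
     ...                                  (submat[0] is a sequential AIJ matrix)
     MatDestroyMatrices(1,&submat);

   Each rank receives its own sequential submatrices; the staging loop above only
   limits how many are extracted per call to MatGetSubMatrices_MPIAIJ_Local.
*/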
1196: /* -------------------------------------------------------------------------*/
1199: PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
1200: {
1201: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
1202: Mat A = c->A;
1203: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
1204: const PetscInt **icol,**irow;
1205: PetscInt *nrow,*ncol,start;
1207: PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
1208: PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
1209: PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
1210: PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
1211: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
1212: #if defined(PETSC_USE_CTABLE)
1213: PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
1214: #else
1215: PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
1216: #endif
1217: const PetscInt *irow_i;
1218: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
1219: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1220: MPI_Request *r_waits4,*s_waits3,*s_waits4;
1221: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
1222: MPI_Status *r_status3,*r_status4,*s_status4;
1223: MPI_Comm comm;
1224: PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
1225: PetscMPIInt *onodes1,*olengths1;
1226: PetscMPIInt idex,idex2,end;
1229: PetscObjectGetComm((PetscObject)C,&comm);
1230: tag0 = ((PetscObject)C)->tag;
1231: size = c->size;
1232: rank = c->rank;
1234: /* Get some new tags to keep the communication clean */
1235: PetscObjectGetNewTag((PetscObject)C,&tag1);
1236: PetscObjectGetNewTag((PetscObject)C,&tag2);
1237: PetscObjectGetNewTag((PetscObject)C,&tag3);
1239: PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);
1241: for (i=0; i<ismax; i++) {
1242: ISGetIndices(isrow[i],&irow[i]);
1243: ISGetLocalSize(isrow[i],&nrow[i]);
1244: if (allcolumns[i]) {
1245: icol[i] = NULL;
1246: ncol[i] = C->cmap->N;
1247: } else {
1248: ISGetIndices(iscol[i],&icol[i]);
1249: ISGetLocalSize(iscol[i],&ncol[i]);
1250: }
1251: }
1253: /* evaluate communication - mesg to who, length of mesg, and buffer space
1254: required. Based on this, buffers are allocated, and data copied into them*/
1255: PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4); /* mesg size */
1256: PetscMemzero(w1,size*sizeof(PetscMPIInt)); /* initialize work vector*/
1257: PetscMemzero(w2,size*sizeof(PetscMPIInt)); /* initialize work vector*/
1258: PetscMemzero(w3,size*sizeof(PetscMPIInt)); /* initialize work vector*/
1259: for (i=0; i<ismax; i++) {
1260: PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialize work vector*/
1261: jmax = nrow[i];
1262: irow_i = irow[i];
1263: for (j=0; j<jmax; j++) {
1264: l = 0;
1265: row = irow_i[j];
1266: while (row >= C->rmap->range[l+1]) l++;
1267: proc = l;
1268: w4[proc]++;
1269: }
1270: for (j=0; j<size; j++) {
1271: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
1272: }
1273: }
1275: nrqs = 0; /* no of outgoing messages */
1276: msz = 0; /* total mesg length (for all procs) */
1277: w1[rank] = 0; /* no mesg sent to self */
1278: w3[rank] = 0;
1279: for (i=0; i<size; i++) {
1280: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
1281: }
1282: PetscMalloc1(nrqs+1,&pa); /*(proc -array)*/
1283: for (i=0,j=0; i<size; i++) {
1284: if (w1[i]) { pa[j] = i; j++; }
1285: }
1287: /* Each message would have a header = 1 + 2*(no of IS) + data */
1288: for (i=0; i<nrqs; i++) {
1289: j = pa[i];
1290: w1[j] += w2[j] + 2* w3[j];
1291: msz += w1[j];
1292: }
1293: PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);
1295: /* Determine the number of messages to expect, their lengths, and the from-ids */
1296: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
1297: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
1299: /* Now post the Irecvs corresponding to these messages */
1300: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
1302: PetscFree(onodes1);
1303: PetscFree(olengths1);
1305: /* Allocate Memory for outgoing messages */
1306: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
1307: PetscMemzero(sbuf1,size*sizeof(PetscInt*));
1308: PetscMemzero(ptr,size*sizeof(PetscInt*));
1310: {
1311: PetscInt *iptr = tmp,ict = 0;
1312: for (i=0; i<nrqs; i++) {
1313: j = pa[i];
1314: iptr += ict;
1315: sbuf1[j] = iptr;
1316: ict = w1[j];
1317: }
1318: }
1320: /* Form the outgoing messages */
1321: /* Initialize the header space */
1322: for (i=0; i<nrqs; i++) {
1323: j = pa[i];
1324: sbuf1[j][0] = 0;
1325: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
1326: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
1327: }
1329: /* Parse the isrow and copy data into outbuf */
1330: for (i=0; i<ismax; i++) {
1331: PetscMemzero(ctr,size*sizeof(PetscInt));
1332: irow_i = irow[i];
1333: jmax = nrow[i];
1334: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
1335: l = 0;
1336: row = irow_i[j];
1337: while (row >= C->rmap->range[l+1]) l++;
1338: proc = l;
1339: if (proc != rank) { /* copy to the outgoing buf*/
1340: ctr[proc]++;
1341: *ptr[proc] = row;
1342: ptr[proc]++;
1343: }
1344: }
1345: /* Update the headers for the current IS */
1346: for (j=0; j<size; j++) { /* Can Optimise this loop too */
1347: if ((ctr_j = ctr[j])) {
1348: sbuf1_j = sbuf1[j];
1349: k = ++sbuf1_j[0];
1350: sbuf1_j[2*k] = ctr_j;
1351: sbuf1_j[2*k-1] = i;
1352: }
1353: }
1354: }
1356: /* Now post the sends */
1357: PetscMalloc1(nrqs+1,&s_waits1);
1358: for (i=0; i<nrqs; ++i) {
1359: j = pa[i];
1360: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
1361: }
1363: /* Post Receives to capture the buffer size */
1364: PetscMalloc1(nrqs+1,&r_waits2);
1365: PetscMalloc1(nrqs+1,&rbuf2);
1366: rbuf2[0] = tmp + msz;
1367: for (i=1; i<nrqs; ++i) {
1368: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
1369: }
1370: for (i=0; i<nrqs; ++i) {
1371: j = pa[i];
1372: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
1373: }
1375: /* Send to other procs the buf size they should allocate */
1378: /* Receive messages*/
1379: PetscMalloc1(nrqr+1,&s_waits2);
1380: PetscMalloc1(nrqr+1,&r_status1);
1381: PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);
1382: {
1383: Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
1384: PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
1385: PetscInt *sbuf2_i;
1387: for (i=0; i<nrqr; ++i) {
1388: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
1390: req_size[idex] = 0;
1391: rbuf1_i = rbuf1[idex];
1392: start = 2*rbuf1_i[0] + 1;
1393: MPI_Get_count(r_status1+i,MPIU_INT,&end);
1394: PetscMalloc1(end+1,&sbuf2[idex]);
1395: sbuf2_i = sbuf2[idex];
1396: for (j=start; j<end; j++) {
1397: id = rbuf1_i[j] - rstart;
1398: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
1399: sbuf2_i[j] = ncols;
1400: req_size[idex] += ncols;
1401: }
1402: req_source[idex] = r_status1[i].MPI_SOURCE;
1403: /* form the header */
1404: sbuf2_i[0] = req_size[idex];
1405: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1407: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
1408: }
1409: }
1410: PetscFree(r_status1);
1411: PetscFree(r_waits1);
1413: /* recv buffer sizes */
1414: /* Receive messages*/
1416: PetscMalloc1(nrqs+1,&rbuf3);
1417: PetscMalloc1(nrqs+1,&rbuf4);
1418: PetscMalloc1(nrqs+1,&r_waits3);
1419: PetscMalloc1(nrqs+1,&r_waits4);
1420: PetscMalloc1(nrqs+1,&r_status2);
1422: for (i=0; i<nrqs; ++i) {
1423: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
1424: PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);
1425: PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);
1426: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
1427: MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
1428: }
1429: PetscFree(r_status2);
1430: PetscFree(r_waits2);
1432: /* Wait on sends1 and sends2 */
1433: PetscMalloc1(nrqs+1,&s_status1);
1434: PetscMalloc1(nrqr+1,&s_status2);
1436: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
1437: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
1438: PetscFree(s_status1);
1439: PetscFree(s_status2);
1440: PetscFree(s_waits1);
1441: PetscFree(s_waits2);
1443: /* Now allocate buffers for a->j, and send them off */
1444: PetscMalloc1(nrqr+1,&sbuf_aj);
1445: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1446: PetscMalloc1(j+1,&sbuf_aj[0]);
1447: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1449: PetscMalloc1(nrqr+1,&s_waits3);
1450: {
1451: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1452: PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1453: PetscInt cend = C->cmap->rend;
1454: PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1456: for (i=0; i<nrqr; i++) {
1457: rbuf1_i = rbuf1[i];
1458: sbuf_aj_i = sbuf_aj[i];
1459: ct1 = 2*rbuf1_i[0] + 1;
1460: ct2 = 0;
1461: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1462: kmax = rbuf1[i][2*j];
1463: for (k=0; k<kmax; k++,ct1++) {
1464: row = rbuf1_i[ct1] - rstart;
1465: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1466: ncols = nzA + nzB;
1467: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1469: /* load the column indices for this row into cols*/
1470: cols = sbuf_aj_i + ct2;
1472: lwrite = 0;
1473: for (l=0; l<nzB; l++) {
1474: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1475: }
1476: for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1477: for (l=0; l<nzB; l++) {
1478: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1479: }
1481: ct2 += ncols;
1482: }
1483: }
1484: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
1485: }
1486: }
1487: PetscMalloc1(nrqs+1,&r_status3);
1488: PetscMalloc1(nrqr+1,&s_status3);
1490: /* Allocate buffers for a->a, and send them off */
1491: PetscMalloc1(nrqr+1,&sbuf_aa);
1492: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1493: PetscMalloc1(j+1,&sbuf_aa[0]);
1494: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1496: PetscMalloc1(nrqr+1,&s_waits4);
1497: {
1498: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1499: PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1500: PetscInt cend = C->cmap->rend;
1501: PetscInt *b_j = b->j;
1502: PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1504: for (i=0; i<nrqr; i++) {
1505: rbuf1_i = rbuf1[i];
1506: sbuf_aa_i = sbuf_aa[i];
1507: ct1 = 2*rbuf1_i[0]+1;
1508: ct2 = 0;
1509: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1510: kmax = rbuf1_i[2*j];
1511: for (k=0; k<kmax; k++,ct1++) {
1512: row = rbuf1_i[ct1] - rstart;
1513: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1514: ncols = nzA + nzB;
1515: cworkB = b_j + b_i[row];
1516: vworkA = a_a + a_i[row];
1517: vworkB = b_a + b_i[row];
1519: /* load the column values for this row into vals*/
1520: vals = sbuf_aa_i+ct2;
1522: lwrite = 0;
1523: for (l=0; l<nzB; l++) {
1524: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1525: }
1526: for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1527: for (l=0; l<nzB; l++) {
1528: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1529: }
1531: ct2 += ncols;
1532: }
1533: }
1534: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);
1535: }
1536: }
1537: PetscFree(rbuf1[0]);
1538: PetscFree(rbuf1);
1539: PetscMalloc1(nrqs+1,&r_status4);
1540: PetscMalloc1(nrqr+1,&s_status4);
1542: /* Form the matrix */
1543: /* create col map: global col of C -> local col of submatrices */
1544: {
1545: const PetscInt *icol_i;
1546: #if defined(PETSC_USE_CTABLE)
1547: PetscMalloc1(1+ismax,&cmap);
1548: for (i=0; i<ismax; i++) {
1549: if (!allcolumns[i]) {
1550: PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);
1552: jmax = ncol[i];
1553: icol_i = icol[i];
1554: cmap_i = cmap[i];
1555: for (j=0; j<jmax; j++) {
1556: PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);
1557: }
1558: } else {
1559: cmap[i] = NULL;
1560: }
1561: }
1562: #else
1563: PetscMalloc1(ismax,&cmap);
1564: for (i=0; i<ismax; i++) {
1565: if (!allcolumns[i]) {
1566: PetscMalloc1(C->cmap->N,&cmap[i]);
1567: PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));
1568: jmax = ncol[i];
1569: icol_i = icol[i];
1570: cmap_i = cmap[i];
1571: for (j=0; j<jmax; j++) {
1572: cmap_i[icol_i[j]] = j+1;
1573: }
1574: } else {
1575: cmap[i] = NULL;
1576: }
1577: }
1578: #endif
1579: }
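/*
   Note on the +1 convention used for cmap (and for rmap in the PETSC_USE_CTABLE
   case below): column indices are stored as local_index+1 so that a lookup result
   of 0 can mean "this global column is not in submatrix i"; the assembly code
   later tests "if (tcol)" and stores "tcol - 1".  A tiny worked example for the
   array version (made-up numbers): with iscol[i] = {4,9,17}, cmap_i[4] = 1,
   cmap_i[9] = 2, cmap_i[17] = 3 and every other entry is 0.
*/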
1581: /* Create lens which is required for MatCreate... */
1582: for (i=0,j=0; i<ismax; i++) j += nrow[i];
1583: PetscMalloc1(ismax,&lens);
1584: if (ismax) {
1585: PetscMalloc1(j,&lens[0]);
1586: PetscMemzero(lens[0],j*sizeof(PetscInt));
1587: }
1588: for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1590: /* Update lens from local data */
1591: for (i=0; i<ismax; i++) {
1592: jmax = nrow[i];
1593: if (!allcolumns[i]) cmap_i = cmap[i];
1594: irow_i = irow[i];
1595: lens_i = lens[i];
1596: for (j=0; j<jmax; j++) {
1597: l = 0;
1598: row = irow_i[j];
1599: while (row >= C->rmap->range[l+1]) l++;
1600: proc = l;
1601: if (proc == rank) {
1602: MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);
1603: if (!allcolumns[i]) {
1604: for (k=0; k<ncols; k++) {
1605: #if defined(PETSC_USE_CTABLE)
1606: PetscTableFind(cmap_i,cols[k]+1,&tcol);
1607: #else
1608: tcol = cmap_i[cols[k]];
1609: #endif
1610: if (tcol) lens_i[j]++;
1611: }
1612: } else { /* allcolumns */
1613: lens_i[j] = ncols;
1614: }
1615: MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);
1616: }
1617: }
1618: }
1620: /* Create row map: global row of C -> local row of submatrices */
1621: #if defined(PETSC_USE_CTABLE)
1622: PetscMalloc1(1+ismax,&rmap);
1623: for (i=0; i<ismax; i++) {
1624: PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);
1625: irow_i = irow[i];
1626: jmax = nrow[i];
1627: for (j=0; j<jmax; j++) {
1628: PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);
1629: }
1630: }
1631: #else
1632: PetscMalloc1(ismax,&rmap);
1633: if (ismax) {
1634: PetscMalloc1(ismax*C->rmap->N,&rmap[0]);
1635: PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));
1636: }
1637: for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
1638: for (i=0; i<ismax; i++) {
1639: rmap_i = rmap[i];
1640: irow_i = irow[i];
1641: jmax = nrow[i];
1642: for (j=0; j<jmax; j++) {
1643: rmap_i[irow_i[j]] = j;
1644: }
1645: }
1646: #endif
1648: /* Update lens from offproc data */
1649: {
1650: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1652: for (tmp2=0; tmp2<nrqs; tmp2++) {
1653: MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);
1654: idex = pa[idex2];
1655: sbuf1_i = sbuf1[idex];
1656: jmax = sbuf1_i[0];
1657: ct1 = 2*jmax+1;
1658: ct2 = 0;
1659: rbuf2_i = rbuf2[idex2];
1660: rbuf3_i = rbuf3[idex2];
1661: for (j=1; j<=jmax; j++) {
1662: is_no = sbuf1_i[2*j-1];
1663: max1 = sbuf1_i[2*j];
1664: lens_i = lens[is_no];
1665: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1666: rmap_i = rmap[is_no];
1667: for (k=0; k<max1; k++,ct1++) {
1668: #if defined(PETSC_USE_CTABLE)
1669: PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
1670: row--;
1671: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1672: #else
1673: row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1674: #endif
1675: max2 = rbuf2_i[ct1];
1676: for (l=0; l<max2; l++,ct2++) {
1677: if (!allcolumns[is_no]) {
1678: #if defined(PETSC_USE_CTABLE)
1679: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1680: #else
1681: tcol = cmap_i[rbuf3_i[ct2]];
1682: #endif
1683: if (tcol) lens_i[row]++;
1684: } else { /* allcolumns */
1685: lens_i[row]++; /* lens_i[row] += max2 ? */
1686: }
1687: }
1688: }
1689: }
1690: }
1691: }
1692: PetscFree(r_status3);
1693: PetscFree(r_waits3);
1694: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1695: PetscFree(s_status3);
1696: PetscFree(s_waits3);
1698: /* Create the submatrices */
1699: if (scall == MAT_REUSE_MATRIX) {
1700: PetscBool flag;
1702: /*
1703: Assumes the new rows are the same length as the old rows, hence bug!
1704: */
1705: for (i=0; i<ismax; i++) {
1706: mat = (Mat_SeqAIJ*)(submats[i]->data);
1707: if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1708: PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);
1709: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1710: /* Initialize the matrix as if empty */
1711: PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));
1713: submats[i]->factortype = C->factortype;
1714: }
1715: } else {
1716: for (i=0; i<ismax; i++) {
1717: PetscInt rbs,cbs;
1718: ISGetBlockSize(isrow[i],&rbs);
1719: ISGetBlockSize(iscol[i],&cbs);
1721: MatCreate(PETSC_COMM_SELF,submats+i);
1722: MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);
1724: MatSetBlockSizes(submats[i],rbs,cbs);
1725: MatSetType(submats[i],((PetscObject)A)->type_name);
1726: MatSeqAIJSetPreallocation(submats[i],0,lens[i]);
1727: }
1728: }
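/* Note: in the MAT_INITIAL_MATRIX branch above, lens[i] holds the exact nonzero count of every
   local row of submats[i] (accumulated from the local rows and the off-process replies), so
   MatSeqAIJSetPreallocation() allocates exactly once and the assembly below performs no
   additional mallocs. */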
1730: /* Assemble the matrices */
1731: /* First assemble the local rows */
1732: {
1733: PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1734: PetscScalar *imat_a;
1736: for (i=0; i<ismax; i++) {
1737: mat = (Mat_SeqAIJ*)submats[i]->data;
1738: imat_ilen = mat->ilen;
1739: imat_j = mat->j;
1740: imat_i = mat->i;
1741: imat_a = mat->a;
1743: if (!allcolumns[i]) cmap_i = cmap[i];
1744: rmap_i = rmap[i];
1745: irow_i = irow[i];
1746: jmax = nrow[i];
1747: for (j=0; j<jmax; j++) {
1748: l = 0;
1749: row = irow_i[j];
1750: while (row >= C->rmap->range[l+1]) l++;
1751: proc = l;
1752: if (proc == rank) {
1753: old_row = row;
1754: #if defined(PETSC_USE_CTABLE)
1755: PetscTableFind(rmap_i,row+1,&row);
1756: row--;
1757: #else
1758: row = rmap_i[row];
1759: #endif
1760: ilen_row = imat_ilen[row];
1761: MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1762: mat_i = imat_i[row];
1763: mat_a = imat_a + mat_i;
1764: mat_j = imat_j + mat_i;
1765: if (!allcolumns[i]) {
1766: for (k=0; k<ncols; k++) {
1767: #if defined(PETSC_USE_CTABLE)
1768: PetscTableFind(cmap_i,cols[k]+1,&tcol);
1769: #else
1770: tcol = cmap_i[cols[k]];
1771: #endif
1772: if (tcol) {
1773: *mat_j++ = tcol - 1;
1774: *mat_a++ = vals[k];
1775: ilen_row++;
1776: }
1777: }
1778: } else { /* allcolumns */
1779: for (k=0; k<ncols; k++) {
1780: *mat_j++ = cols[k]; /* global col index! */
1781: *mat_a++ = vals[k];
1782: ilen_row++;
1783: }
1784: }
1785: MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1787: imat_ilen[row] = ilen_row;
1788: }
1789: }
1790: }
1791: }
1793: /* Now assemble the off-proc rows */
1794: {
1795: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1796: PetscInt *imat_j,*imat_i;
1797: PetscScalar *imat_a,*rbuf4_i;
1799: for (tmp2=0; tmp2<nrqs; tmp2++) {
1800: MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);
1801: idex = pa[idex2];
1802: sbuf1_i = sbuf1[idex];
1803: jmax = sbuf1_i[0];
1804: ct1 = 2*jmax + 1;
1805: ct2 = 0;
1806: rbuf2_i = rbuf2[idex2];
1807: rbuf3_i = rbuf3[idex2];
1808: rbuf4_i = rbuf4[idex2];
1809: for (j=1; j<=jmax; j++) {
1810: is_no = sbuf1_i[2*j-1];
1811: rmap_i = rmap[is_no];
1812: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1813: mat = (Mat_SeqAIJ*)submats[is_no]->data;
1814: imat_ilen = mat->ilen;
1815: imat_j = mat->j;
1816: imat_i = mat->i;
1817: imat_a = mat->a;
1818: max1 = sbuf1_i[2*j];
1819: for (k=0; k<max1; k++,ct1++) {
1820: row = sbuf1_i[ct1];
1821: #if defined(PETSC_USE_CTABLE)
1822: PetscTableFind(rmap_i,row+1,&row);
1823: row--;
1824: #else
1825: row = rmap_i[row];
1826: #endif
1827: ilen = imat_ilen[row];
1828: mat_i = imat_i[row];
1829: mat_a = imat_a + mat_i;
1830: mat_j = imat_j + mat_i;
1831: max2 = rbuf2_i[ct1];
1832: if (!allcolumns[is_no]) {
1833: for (l=0; l<max2; l++,ct2++) {
1835: #if defined(PETSC_USE_CTABLE)
1836: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1837: #else
1838: tcol = cmap_i[rbuf3_i[ct2]];
1839: #endif
1840: if (tcol) {
1841: *mat_j++ = tcol - 1;
1842: *mat_a++ = rbuf4_i[ct2];
1843: ilen++;
1844: }
1845: }
1846: } else { /* allcolumns */
1847: for (l=0; l<max2; l++,ct2++) {
1848: *mat_j++ = rbuf3_i[ct2]; /* same as the global column index in C */
1849: *mat_a++ = rbuf4_i[ct2];
1850: ilen++;
1851: }
1852: }
1853: imat_ilen[row] = ilen;
1854: }
1855: }
1856: }
1857: }
1859: /* sort the rows */
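/* Column indices gathered from other processes arrive in no particular order, so every row of
   each submatrix is sorted here, permuting the values alongside the indices. The allcolumns
   case is skipped below because there the global column indices were already inserted in
   increasing order. */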
1860: {
1861: PetscInt *imat_ilen,*imat_j,*imat_i;
1862: PetscScalar *imat_a;
1864: for (i=0; i<ismax; i++) {
1865: mat = (Mat_SeqAIJ*)submats[i]->data;
1866: imat_j = mat->j;
1867: imat_i = mat->i;
1868: imat_a = mat->a;
1869: imat_ilen = mat->ilen;
1871: if (allcolumns[i]) continue;
1872: jmax = nrow[i];
1873: for (j=0; j<jmax; j++) {
1874: PetscInt ilen;
1876: mat_i = imat_i[j];
1877: mat_a = imat_a + mat_i;
1878: mat_j = imat_j + mat_i;
1879: ilen = imat_ilen[j];
1880: PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);
1881: }
1882: }
1883: }
1885: PetscFree(r_status4);
1886: PetscFree(r_waits4);
1887: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1888: PetscFree(s_waits4);
1889: PetscFree(s_status4);
1891: /* Restore the indices */
1892: for (i=0; i<ismax; i++) {
1893: ISRestoreIndices(isrow[i],irow+i);
1894: if (!allcolumns[i]) {
1895: ISRestoreIndices(iscol[i],icol+i);
1896: }
1897: }
1899: /* Destroy allocated memory */
1900: PetscFree4(irow,icol,nrow,ncol);
1901: PetscFree4(w1,w2,w3,w4);
1902: PetscFree(pa);
1904: PetscFree4(sbuf1,ptr,tmp,ctr);
1905: PetscFree(rbuf2);
1906: for (i=0; i<nrqr; ++i) {
1907: PetscFree(sbuf2[i]);
1908: }
1909: for (i=0; i<nrqs; ++i) {
1910: PetscFree(rbuf3[i]);
1911: PetscFree(rbuf4[i]);
1912: }
1914: PetscFree3(sbuf2,req_size,req_source);
1915: PetscFree(rbuf3);
1916: PetscFree(rbuf4);
1917: PetscFree(sbuf_aj[0]);
1918: PetscFree(sbuf_aj);
1919: PetscFree(sbuf_aa[0]);
1920: PetscFree(sbuf_aa);
1922: #if defined(PETSC_USE_CTABLE)
1923: for (i=0; i<ismax; i++) {PetscTableDestroy((PetscTable*)&rmap[i]);}
1924: #else
1925: if (ismax) {PetscFree(rmap[0]);}
1926: #endif
1927: PetscFree(rmap);
1929: for (i=0; i<ismax; i++) {
1930: if (!allcolumns[i]) {
1931: #if defined(PETSC_USE_CTABLE)
1932: PetscTableDestroy((PetscTable*)&cmap[i]);
1933: #else
1934: PetscFree(cmap[i]);
1935: #endif
1936: }
1937: }
1938: PetscFree(cmap);
1939: if (ismax) {PetscFree(lens[0]);}
1940: PetscFree(lens);
1942: for (i=0; i<ismax; i++) {
1943: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1944: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1945: }
1946: return(0);
1947: }
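/*
   A minimal caller-side sketch of how the extraction code above is reached through the public
   entry point MatGetSubMatrices(): each process requests one submatrix described by sequential
   ISs, here its own diagonal block. The helper name is hypothetical (not part of PETSc) and
   error checking is omitted to match the listing style.
*/
#include <petscmat.h>

static PetscErrorCode ExampleGetDiagonalBlock(Mat C)
{
  PetscInt rstart,rend;
  IS       rows,cols;
  Mat      *submat;

  /* the locally owned row range of C defines the block this process extracts */
  MatGetOwnershipRange(C,&rstart,&rend);
  ISCreateStride(PETSC_COMM_SELF,rend-rstart,rstart,1,&rows);
  ISCreateStride(PETSC_COMM_SELF,rend-rstart,rstart,1,&cols);
  /* all ISs are sequential, so the request is served by the MPIAIJ extraction routine above */
  MatGetSubMatrices(C,1,&rows,&cols,MAT_INITIAL_MATRIX,&submat);
  /* ... use submat[0], a SEQAIJ matrix with rend-rstart rows and columns ... */
  MatDestroyMatrices(1,&submat);
  ISDestroy(&rows);
  ISDestroy(&cols);
  return(0);
}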
1949: /*
1950: Permute A & B into C's *local* index space using rowemb, dcolemb for A and rowemb, ocolemb for B.
1951: Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
1952: of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
1953: If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
1954: After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
1955: state, and needs to be "assembled" later by compressing B's column space.
1957: This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
1958: Following this call, C->A & C->B have been created, even if empty.
1959: */
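/* Within this file the routine below is used as the 'setseqmats' callback of
   MatGetSubMatricesMPI_MPIXAIJ() (see MatGetSubMatricesMPI_MPIAIJ() at the end of the file). */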
1962: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
1963: {
1964: /* If this function is ever made public, change the errors returned here away from PETSC_ERR_PLIB. */
1966: Mat_MPIAIJ *aij;
1967: Mat_SeqAIJ *Baij;
1968: PetscBool seqaij,Bdisassembled;
1969: PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
1970: PetscScalar v;
1971: const PetscInt *rowindices,*colindices;
1974: /* Check to make sure the component matrices (and embeddings) are compatible with C. */
1975: if (A) {
1976: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);
1977: if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
1978: if (rowemb) {
1979: ISGetLocalSize(rowemb,&m);
1980: if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
1981: } else {
1982: if (C->rmap->n != A->rmap->n) {
1983: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
1984: }
1985: }
1986: if (dcolemb) {
1987: ISGetLocalSize(dcolemb,&n);
1988: if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
1989: } else {
1990: if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
1991: }
1992: }
1993: if (B) {
1994: PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);
1995: if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
1996: if (rowemb) {
1997: ISGetLocalSize(rowemb,&m);
1998: if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
1999: } else {
2000: if (C->rmap->n != B->rmap->n) {
2001: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2002: }
2003: }
2004: if (ocolemb) {
2005: ISGetLocalSize(ocolemb,&n);
2006: if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2007: } else {
2008: if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2009: }
2010: }
2012: aij = (Mat_MPIAIJ*)(C->data);
2013: if (!aij->A) {
2014: /* Mimic parts of MatMPIAIJSetPreallocation() */
2015: MatCreate(PETSC_COMM_SELF,&aij->A);
2016: MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);
2017: MatSetBlockSizesFromMats(aij->A,C,C);
2018: MatSetType(aij->A,MATSEQAIJ);
2019: PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);
2020: }
2021: if (A) {
2022: MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);
2023: } else {
2024: MatSetUp(aij->A);
2025: }
2026: if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2027: /*
2028: If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2029: need to "disassemble" B -- convert it to using C's global indices.
2030: To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2032: If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2033: we MatZeroEntries(aij->B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2035: TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2036: At least avoid calling MatSetValues() and the implied searches?
2037: */
2039: if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2040: #if defined(PETSC_USE_CTABLE)
2041: PetscTableDestroy(&aij->colmap);
2042: #else
2043: PetscFree(aij->colmap);
2044: /* A bit of a HACK: ideally we should handle the aij->B case all in one code block below. */
2045: if (aij->B) {
2046: PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));
2047: }
2048: #endif
2049: ngcol = 0;
2050: if (aij->lvec) {
2051: VecGetSize(aij->lvec,&ngcol);
2052: }
2053: if (aij->garray) {
2054: PetscFree(aij->garray);
2055: PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));
2056: }
2057: VecDestroy(&aij->lvec);
2058: VecScatterDestroy(&aij->Mvctx);
2059: }
2060: if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2061: MatDestroy(&aij->B);
2062: }
2063: if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2064: MatZeroEntries(aij->B);
2065: }
2066: }
2067: Bdisassembled = PETSC_FALSE;
2068: if (!aij->B) {
2069: MatCreate(PETSC_COMM_SELF,&aij->B);
2070: PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);
2071: MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);
2072: MatSetBlockSizesFromMats(aij->B,B,B);
2073: MatSetType(aij->B,MATSEQAIJ);
2074: Bdisassembled = PETSC_TRUE;
2075: }
2076: if (B) {
2077: Baij = (Mat_SeqAIJ*)(B->data);
2078: if (pattern == DIFFERENT_NONZERO_PATTERN) {
2079: PetscMalloc1(B->rmap->n,&nz);
2080: for (i=0; i<B->rmap->n; i++) {
2081: nz[i] = Baij->i[i+1] - Baij->i[i];
2082: }
2083: MatSeqAIJSetPreallocation(aij->B,0,nz);
2084: PetscFree(nz);
2085: }
2087: PetscLayoutGetRange(C->rmap,&rstart,&rend);
2088: shift = rend-rstart;
2089: count = 0;
2090: rowindices = NULL;
2091: colindices = NULL;
2092: if (rowemb) {
2093: ISGetIndices(rowemb,&rowindices);
2094: }
2095: if (ocolemb) {
2096: ISGetIndices(ocolemb,&colindices);
2097: }
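/* In the loop below each of B's columns is first mapped through ocolemb (if given) into [0,N-n),
   i.e. C's global column space with the local diagonal block [rstart,rend) removed; if aij->B was
   created fresh above (Bdisassembled), indices >= rstart are then shifted up by rend-rstart so
   that B ends up holding true global column indices -- the "disassembled" state described in the
   header comment. */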
2098: for (i=0; i<B->rmap->n; i++) {
2099: PetscInt row;
2100: row = i;
2101: if (rowindices) row = rowindices[i];
2102: for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
2103: col = Baij->j[count];
2104: if (colindices) col = colindices[col];
2105: if (Bdisassembled && col>=rstart) col += shift;
2106: v = Baij->a[count];
2107: MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);
2108: ++count;
2109: }
2110: }
2111: /* No assembly for aij->B is necessary. */
2112: /* FIXME: set aij->B's nonzerostate correctly. */
2113: } else {
2114: MatSetUp(aij->B);
2115: }
2116: C->preallocated = PETSC_TRUE;
2117: C->was_assembled = PETSC_FALSE;
2118: C->assembled = PETSC_FALSE;
2119: /*
2120: C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2121: Furthermore, its nonzerostate will need to be based on those of aij->A and aij->B.
2122: */
2123: return(0);
2124: }
2128: /*
2129: B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
2130: */
2131: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
2132: {
2133: Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
2138: /* FIXME: make sure C is assembled */
2139: *A = aij->A;
2140: *B = aij->B;
2141: /* Note that we don't incref *A and *B, so be careful! */
2142: return(0);
2143: }
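/*
   A small sketch of how a caller interprets B's compact column indices: the public
   MatMPIAIJGetSeqAIJ() exposes the same aij->A/aij->B pair together with the colmap (garray)
   mentioned above. The helper name is hypothetical (not part of PETSc) and error checking is
   omitted to match the listing style.
*/
static PetscErrorCode ExamplePrintOffDiagEntries(Mat C)
{
  Mat               Ad,Ao;
  const PetscInt    *colmap,*cols;
  const PetscScalar *vals;
  PetscInt          i,j,m,ncols,rstart;

  MatMPIAIJGetSeqAIJ(C,&Ad,&Ao,&colmap); /* Ad (the diagonal block) is not used in this sketch */
  MatGetLocalSize(Ao,&m,NULL);           /* Ao's column indices are local, in [0,N-n) */
  MatGetOwnershipRange(C,&rstart,NULL);
  for (i=0; i<m; i++) {
    MatGetRow(Ao,i,&ncols,&cols,&vals);
    for (j=0; j<ncols; j++) {
      /* colmap (the garray) translates Ao's compact column index into C's global column */
      PetscPrintf(PETSC_COMM_SELF,"row %D col %D val %g\n",rstart+i,colmap[cols[j]],(double)PetscRealPart(vals[j]));
    }
    MatRestoreRow(Ao,i,&ncols,&cols,&vals);
  }
  return(0);
}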
2145: /*
2146: Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
2147: NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
2148: */
2151: PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
2152: PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
2153: PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
2154: PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
2155: PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
2156: {
2158: PetscMPIInt isize,flag;
2159: PetscInt i,ii,cismax,ispar,ciscol_localsize;
2160: Mat *A,*B;
2161: IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
2164: if (!ismax) return(0);
2166: for (i = 0, cismax = 0; i < ismax; ++i) {
2167: PetscMPIInt isize;
2168: MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);
2169: if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
2170: MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);
2171: if (isize > 1) ++cismax;
2172: }
2173: /*
2174: If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
2175: ispar is the maximum of cismax over C's ranks, so it is nonzero iff some rank holds a parallel IS.
2176: */
2177: MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
2178: if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
2179: (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);
2180: return(0);
2181: }
2183: /* if (ispar) */
2184: /*
2185: Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
2186: These are used to extract the off-diag portion of the resulting parallel matrix.
2187: The row IS for the off-diag portion is the same as for the diag portion,
2188: so we merely alias (without increfing) the row IS, while skipping those that are sequential.
2189: */
2190: PetscMalloc2(cismax,&cisrow,cismax,&ciscol);
2191: PetscMalloc1(cismax,&ciscol_p);
2192: for (i = 0, ii = 0; i < ismax; ++i) {
2193: MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
2194: if (isize > 1) {
2195: /*
2196: TODO: This is the part that's ***NOT SCALABLE***.
2197: To fix this we need to extract just the indices of C's nonzero columns
2198: that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
2199: part of iscol[i] -- without actually computing ciscol[ii]. This also has
2200: to be done without serializing on the IS list, so, most likely, it is best
2201: done by rewriting MatGetSubMatrices_MPIAIJ() directly.
2202: */
2203: ISGetNonlocalIS(iscol[i],&(ciscol[ii]));
2204: /* Now we have to
2205: (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
2206: were sorted on each rank, concatenated they might no longer be sorted;
2207: (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
2208: indices in the nondecreasing order to the original index positions.
2209: If ciscol[ii] is strictly increasing, the permutation IS is NULL.
2210: */
2211: ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);
2212: ISSort(ciscol[ii]);
2213: ++ii;
2214: }
2215: }
2216: PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);
2217: for (i = 0, ii = 0; i < ismax; ++i) {
2218: PetscInt j,issize;
2219: const PetscInt *indices;
2221: /*
2222: Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
2223: */
2224: ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);
2225: ISSort(isrow[i]);
2226: ISGetLocalSize(isrow[i],&issize);
2227: ISGetIndices(isrow[i],&indices);
2228: for (j = 1; j < issize; ++j) {
2229: if (indices[j] == indices[j-1]) {
2230: SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
2231: }
2232: }
2233: ISRestoreIndices(isrow[i],&indices);
2236: ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);
2237: ISSort(iscol[i]);
2238: ISGetLocalSize(iscol[i],&issize);
2239: ISGetIndices(iscol[i],&indices);
2240: for (j = 1; j < issize; ++j) {
2241: if (indices[j-1] == indices[j]) {
2242: SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
2243: }
2244: }
2245: ISRestoreIndices(iscol[i],&indices);
2246: MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
2247: if (isize > 1) {
2248: cisrow[ii] = isrow[i];
2249: ++ii;
2250: }
2251: }
2252: /*
2253: Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
2254: array of sequential matrices underlying the resulting parallel matrices.
2255: Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
2256: contain duplicates.
2258: There are as many diag matrices as there are original index sets. There are only as many parallel
2259: and off-diag matrices as there are parallel (comm size > 1) index sets.
2261: ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
2262: - If the array of MPI matrices already exists and is being reused, we need to allocate the array
2263: and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
2264: will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
2265: with A[i] and B[ii] extracted from the corresponding MPI submat.
2266: - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
2267: will have a different order from what getsubmats_seq expects. To handle this case -- indicated
2268: by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
2269: (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
2270: values into A[i] and B[ii] sitting inside the corresponding submat.
2271: - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
2272: A[i], B[ii], AA[i] or BB[ii] matrices.
2273: */
2274: /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
2275: if (scall == MAT_INITIAL_MATRIX) {
2276: PetscMalloc1(ismax,submat);
2277: /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
2278: } else {
2279: PetscMalloc1(ismax,&A);
2280: PetscMalloc1(cismax,&B);
2281: /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */
2282: for (i = 0, ii = 0; i < ismax; ++i) {
2283: MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
2284: if (isize > 1) {
2285: Mat AA,BB;
2286: (*getlocalmats)((*submat)[i],&AA,&BB);
2287: if (!isrow_p[i] && !iscol_p[i]) {
2288: A[i] = AA;
2289: } else {
2290: /* TODO: extract A[i] composed on AA. */
2291: MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);
2292: }
2293: if (!isrow_p[i] && !ciscol_p[ii]) {
2294: B[ii] = BB;
2295: } else {
2296: /* TODO: extract B[ii] composed on BB. */
2297: MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);
2298: }
2299: ++ii;
2300: } else {
2301: if (!isrow_p[i] && !iscol_p[i]) {
2302: A[i] = (*submat)[i];
2303: } else {
2304: /* TODO: extract A[i] composed on (*submat)[i]. */
2305: MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);
2306: }
2307: }
2308: }
2309: }
2310: /* Now obtain the sequential A and B submatrices separately. */
2311: (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);
2312: /* Workaround (no clean fix yet): the local column size of B[i]
2313: * can differ from the size of ciscol[i], because B[i]'s column size
2314: * is only fixed at assembly time, while ciscol[i] is computed as
2315: * the complement of iscol[i]. A better fix would keep only the
2316: * nonzero column indices when computing the complement ciscol[i].
2317: * Until then, force the column sizes to agree below.
2318: * */
2319: if (scall == MAT_REUSE_MATRIX) {
2320: for (i=0; i<cismax; i++) {
2321: ISGetLocalSize(ciscol[i],&ciscol_localsize);
2322: B[i]->cmap->n = ciscol_localsize;
2323: }
2324: }
2325: (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);
2326: /*
2327: If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
2328: matrices A & B have been extracted directly into the parallel matrices containing them, or
2329: simply into the sequential matrix identical with the corresponding A (if isize == 1).
2330: Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
2331: to have the same sparsity pattern.
2332: Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
2333: must be constructed for C. This is done by setseqmat(s).
2334: */
2335: for (i = 0, ii = 0; i < ismax; ++i) {
2336: /*
2337: TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
2338: That way we can avoid sorting and computing permutations when reusing.
2339: To this end:
2340: - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
2341: - if caching arrays to hold the ISs, make and compose a container for them so that it can
2342: be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
2343: */
2344: MatStructure pattern;
2345: if (scall == MAT_INITIAL_MATRIX) {
2346: pattern = DIFFERENT_NONZERO_PATTERN;
2347: } else {
2348: pattern = SAME_NONZERO_PATTERN;
2349: }
2350: MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);
2351: /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
2352: if (isize > 1) {
2353: if (scall == MAT_INITIAL_MATRIX) {
2354: MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);
2355: MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
2356: MatSetType((*submat)[i],MATMPIAIJ);
2357: PetscLayoutSetUp((*submat)[i]->rmap);
2358: PetscLayoutSetUp((*submat)[i]->cmap);
2359: }
2360: /*
2361: For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
2362: */
2363: {
2364: Mat AA,BB;
2365: AA = NULL;
2366: if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
2367: BB = NULL;
2368: if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
2369: if (AA || BB) {
2370: setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);
2371: MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);
2372: MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);
2373: }
2374: if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
2375: /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
2376: MatDestroy(&AA);
2377: }
2378: if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
2379: /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[ii]) && MAT_INITIAL_MATRIX */
2380: MatDestroy(&BB);
2381: }
2382: }
2383: ISDestroy(ciscol+ii);
2384: ISDestroy(ciscol_p+ii);
2385: ++ii;
2386: } else { /* if (isize == 1) */
2387: if (scall == MAT_INITIAL_MATRIX) {
2388: if (isrow_p[i] || iscol_p[i]) {
2389: MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);
2390: } else (*submat)[i] = A[i];
2391: }
2392: if (isrow_p[i] || iscol_p[i]) {
2393: setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);
2394: /* Otherwise A is extracted straight into (*submat)[i]. */
2395: /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
2396: MatDestroy(A+i);
2397: }
2398: }
2399: ISDestroy(&isrow_p[i]);
2400: ISDestroy(&iscol_p[i]);
2401: }
2402: PetscFree2(cisrow,ciscol);
2403: PetscFree2(isrow_p,iscol_p);
2404: PetscFree(ciscol_p);
2405: PetscFree(A);
2406: PetscFree(B);
2407: return(0);
2408: }
2414: PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2415: {
2419: MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);
2420: return(0);
2421: }
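/*
   A minimal caller-side sketch of the public entry point MatGetSubMatricesMPI() implemented above:
   a single, duplicate-free index set living on C's communicator yields one parallel MPIAIJ
   submatrix, in contrast to MatGetSubMatrices(), which requires sequential ISs and returns
   sequential matrices. The helper name is hypothetical (not part of PETSc) and error checking is
   omitted to match the listing style.
*/
static PetscErrorCode ExampleExtractParallelBlock(Mat C,PetscInt nlocal,const PetscInt rows[])
{
  IS  is;
  Mat *submat;

  /* one parallel IS over C's communicator selects both the rows and the columns of the block;
     note that the routine above sorts the input ISs in place */
  ISCreateGeneral(PetscObjectComm((PetscObject)C),nlocal,rows,PETSC_COPY_VALUES,&is);
  MatGetSubMatricesMPI(C,1,&is,&is,MAT_INITIAL_MATRIX,&submat);
  /* ... use submat[0], an MPIAIJ matrix distributed over the same communicator ... */
  MatDestroy(&submat[0]);
  PetscFree(submat);
  ISDestroy(&is);
  return(0);
}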