Actual source code: mpiov.c
1: /*
2: Routines to compute overlapping regions of a parallel MPI matrix
3: and to find submatrices that are shared across processes.
4: */
5: #include <../src/mat/impls/aij/seq/aij.h>
6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
7: #include <petscbt.h>
8: #include <petscsf.h>
10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *);
12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);
21: /*
22: Takes a general IS and builds a block version of the IS that contains the given IS plus any needed values to fill out the blocks
24: The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the block size, which would offer higher performance,
25: but that is a very major recoding job.
27: Possible scalability issues with this routine because it allocates space proportional to Nmax-Nmin
28: */
29: static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
30: {
31: PetscFunctionBegin;
32: for (PetscInt i = 0; i < imax; i++) {
33: if (!is[i]) break;
34: PetscInt n = 0, N, Nmax, Nmin;
35: const PetscInt *idx;
36: PetscInt *nidx = NULL;
37: MPI_Comm comm;
38: PetscBT bt;
40: PetscCall(ISGetLocalSize(is[i], &N));
41: if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
42: PetscCall(ISGetIndices(is[i], &idx));
43: PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax));
44: Nmin = Nmin / bs;
45: Nmax = Nmax / bs;
46: PetscCall(PetscBTCreate(Nmax - Nmin, &bt));
47: for (PetscInt j = 0; j < N; j++) {
48: if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
49: }
50: PetscCall(PetscMalloc1(n, &nidx));
51: n = 0;
52: PetscCall(PetscBTMemzero(Nmax - Nmin, bt));
53: for (PetscInt j = 0; j < N; j++) {
54: if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
55: }
56: PetscCall(PetscBTDestroy(&bt));
57: PetscCall(ISRestoreIndices(is[i], &idx));
58: }
59: PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm));
60: PetscCall(ISDestroy(is + i));
61: PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i));
62: }
63: PetscFunctionReturn(PETSC_SUCCESS);
64: }
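/*
   Editorial example (not part of the original source), illustrating the effect of
   ISAdjustForBlockSize(): with bs = 2, an IS holding the point indices {1, 4, 5} touches
   blocks 1/2 = 0 and 4/2 = 5/2 = 2, so the routine replaces it with the block IS whose
   block indices are {0, 2}, i.e. the point indices {0, 1, 4, 5}. A minimal sketch:

     IS       is;
     PetscInt idx[] = {1, 4, 5};
     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, 3, idx, PETSC_COPY_VALUES, &is));
     PetscCall(ISAdjustForBlockSize(2, 1, &is));   // is is now a block IS with bs = 2 and blocks {0, 2}
     PetscCall(ISDestroy(&is));
*/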
66: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
67: {
68: PetscInt i;
70: PetscFunctionBegin;
71: PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
72: for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
73: if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
74: PetscFunctionReturn(PETSC_SUCCESS);
75: }
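/*
   Editorial sketch (not part of the original source): MatIncreaseOverlap_MPIAIJ() is the
   MATMPIAIJ back-end of the public MatIncreaseOverlap() interface. A typical call, assuming
   A is an assembled MATMPIAIJ and is[0] holds the rows of a local subdomain (both are
   hypothetical here), grows the index set by two levels of overlap:

     IS is[1];
     // ... fill is[0] with the rows of the local subdomain ...
     PetscCall(MatIncreaseOverlap(A, 1, is, 2));   // applies the _Once pass ov = 2 times, then adjusts for block size
*/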
77: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
78: {
79: PetscInt i;
81: PetscFunctionBegin;
82: PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
83: for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
84: if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
85: PetscFunctionReturn(PETSC_SUCCESS);
86: }
88: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
89: {
90: MPI_Comm comm;
91: PetscInt *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
92: PetscInt *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
93: PetscInt nrecvrows, *sbsizes = NULL, *sbdata = NULL;
94: const PetscInt *indices_i, **indices;
95: PetscLayout rmap;
96: PetscMPIInt rank, size, *toranks, *fromranks, nto, nfrom, owner;
97: PetscSF sf;
98: PetscSFNode *remote;
100: PetscFunctionBegin;
101: PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
102: PetscCallMPI(MPI_Comm_rank(comm, &rank));
103: PetscCallMPI(MPI_Comm_size(comm, &size));
104: /* get row map to determine where rows should be going */
105: PetscCall(MatGetLayouts(mat, &rmap, NULL));
106: /* retrieve IS data and put it all together so that we
107: * can optimize communication
108: */
109: PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
110: for (i = 0, tlength = 0; i < nidx; i++) {
111: PetscCall(ISGetLocalSize(is[i], &length[i]));
112: tlength += length[i];
113: PetscCall(ISGetIndices(is[i], &indices[i]));
114: }
115: /* find these rows on remote processors */
116: PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
117: PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
118: nrrows = 0;
119: for (i = 0; i < nidx; i++) {
120: length_i = length[i];
121: indices_i = indices[i];
122: for (j = 0; j < length_i; j++) {
123: owner = -1;
124: PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
125: /* remote processors */
126: if (owner != rank) {
127: tosizes_temp[owner]++; /* number of rows to owner */
128: rrow_ranks[nrrows] = owner; /* processor */
129: rrow_isids[nrrows] = i; /* is id */
130: remoterows[nrrows++] = indices_i[j]; /* row */
131: }
132: }
133: PetscCall(ISRestoreIndices(is[i], &indices[i]));
134: }
135: PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
136: /* test whether we need to exchange messages;
137: * generally speaking, we do not need to exchange
138: * data when the overlap is 1
139: */
140: PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
141: /* we do not have any messages to exchange;
142: * this usually corresponds to overlap 1
143: */
144: if (!reducednrrows) {
145: PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
146: PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
147: PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
148: PetscFunctionReturn(PETSC_SUCCESS);
149: }
150: nto = 0;
151: /* send sizes and ranks for building a two-sided communication */
152: for (i = 0; i < size; i++) {
153: if (tosizes_temp[i]) {
154: tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
155: tosizes_temp[i] = nto; /* a map from processor to index */
156: toranks[nto++] = i; /* processor */
157: }
158: }
159: PetscCall(PetscMalloc1(nto + 1, &toffsets));
160: toffsets[0] = 0;
161: for (i = 0; i < nto; i++) {
162: toffsets[i + 1] = toffsets[i] + tosizes[2 * i]; /* offsets */
163: tosizes[2 * i + 1] = toffsets[i]; /* offsets to send */
164: }
165: /* send information to other processors */
166: PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
167: nrecvrows = 0;
168: for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
169: PetscCall(PetscMalloc1(nrecvrows, &remote));
170: nrecvrows = 0;
171: for (i = 0; i < nfrom; i++) {
172: for (j = 0; j < fromsizes[2 * i]; j++) {
173: remote[nrecvrows].rank = fromranks[i];
174: remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
175: }
176: }
177: PetscCall(PetscSFCreate(comm, &sf));
178: PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
179: /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
180: PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
181: PetscCall(PetscSFSetFromOptions(sf));
182: /* message pairs <IS id, row> */
183: PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
184: for (i = 0; i < nrrows; i++) {
185: owner = rrow_ranks[i]; /* processor */
186: j = tosizes_temp[owner]; /* index */
187: todata[toffsets[j]++] = rrow_isids[i];
188: todata[toffsets[j]++] = remoterows[i];
189: }
190: PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
191: PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
192: PetscCall(PetscFree(toffsets));
193: PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
194: PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
195: PetscCall(PetscSFDestroy(&sf));
196: /* compute the overlap for the rows requested by remote processes so the data can be sent back */
197: PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
198: PetscCall(PetscFree2(todata, fromdata));
199: PetscCall(PetscFree(fromsizes));
200: PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
201: PetscCall(PetscFree(fromranks));
202: nrecvrows = 0;
203: for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
204: PetscCall(PetscCalloc1(nrecvrows, &todata));
205: PetscCall(PetscMalloc1(nrecvrows, &remote));
206: nrecvrows = 0;
207: for (i = 0; i < nto; i++) {
208: for (j = 0; j < tosizes[2 * i]; j++) {
209: remote[nrecvrows].rank = toranks[i];
210: remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
211: }
212: }
213: PetscCall(PetscSFCreate(comm, &sf));
214: PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
215: /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
216: PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
217: PetscCall(PetscSFSetFromOptions(sf));
218: /* overlap communication and computation */
219: PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
220: PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
221: PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
222: PetscCall(PetscSFDestroy(&sf));
223: PetscCall(PetscFree2(sbdata, sbsizes));
224: PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
225: PetscCall(PetscFree(toranks));
226: PetscCall(PetscFree(tosizes));
227: PetscCall(PetscFree(todata));
228: PetscFunctionReturn(PETSC_SUCCESS);
229: }
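/*
   Editorial sketch (not part of the original source) of the communication pattern used above:
   (1) PetscCommBuildTwoSided() discovers who sends to this rank and how much, then (2) a
   PetscSF broadcast moves the payload. Stripped to its essentials, sending one PetscInt to
   each destination listed in toranks[] (comm, nto, toranks, tosizes assumed already set up):

     PetscMPIInt nfrom, *fromranks;
     PetscInt   *fromsizes;
     PetscCall(PetscCommBuildTwoSided(comm, 1, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
     // fromranks[]/fromsizes[] now describe the incoming messages; a PetscSF whose leaves are the
     // receive slots and whose roots are the senders' buffers then moves the data with
     // PetscSFBcastBegin()/PetscSFBcastEnd(), exactly as done twice in the routine above.
*/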
231: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
232: {
233: PetscInt *isz, isz_i, i, j, is_id, data_size;
234: PetscInt col, lsize, max_lsize, *indices_temp, *indices_i;
235: const PetscInt *indices_i_temp;
236: MPI_Comm *iscomms;
238: PetscFunctionBegin;
239: max_lsize = 0;
240: PetscCall(PetscMalloc1(nidx, &isz));
241: for (i = 0; i < nidx; i++) {
242: PetscCall(ISGetLocalSize(is[i], &lsize));
243: max_lsize = lsize > max_lsize ? lsize : max_lsize;
244: isz[i] = lsize;
245: }
246: PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
247: for (i = 0; i < nidx; i++) {
248: PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
249: PetscCall(ISGetIndices(is[i], &indices_i_temp));
250: PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(indices_temp, i * (max_lsize + nrecvs)), indices_i_temp, isz[i]));
251: PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
252: PetscCall(ISDestroy(&is[i]));
253: }
254: /* unpack the received data: for each block, the IS id, its size, and the overlapping indices */
255: for (i = 0; i < nrecvs;) {
256: is_id = recvdata[i++];
257: data_size = recvdata[i++];
258: indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
259: isz_i = isz[is_id];
260: for (j = 0; j < data_size; j++) {
261: col = recvdata[i++];
262: indices_i[isz_i++] = col;
263: }
264: isz[is_id] = isz_i;
265: }
266: /* remove duplicate entries */
267: for (i = 0; i < nidx; i++) {
268: indices_i = PetscSafePointerPlusOffset(indices_temp, (max_lsize + nrecvs) * i);
269: isz_i = isz[i];
270: PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
271: PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
272: PetscCall(PetscCommDestroy(&iscomms[i]));
273: }
274: PetscCall(PetscFree(isz));
275: PetscCall(PetscFree2(indices_temp, iscomms));
276: PetscFunctionReturn(PETSC_SUCCESS);
277: }
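/*
   Editorial note (not part of the original source): PetscSortRemoveDupsInt(), used above to
   merge the received overlap indices into each IS, sorts the array in place and shrinks the
   count to the number of unique entries, e.g.

     PetscInt n = 5, v[] = {7, 3, 7, 1, 3};
     PetscCall(PetscSortRemoveDupsInt(&n, v));   // v begins with {1, 3, 7}, n == 3
*/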
279: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
280: {
281: PetscLayout rmap, cmap;
282: PetscInt i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
283: PetscInt is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
284: PetscInt *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
285: const PetscInt *gcols, *ai, *aj, *bi, *bj;
286: Mat amat, bmat;
287: PetscMPIInt rank;
288: PetscBool done;
289: MPI_Comm comm;
291: PetscFunctionBegin;
292: PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
293: PetscCallMPI(MPI_Comm_rank(comm, &rank));
294: PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
295: /* Even if the matrix is symmetric, we do not exploit symmetry here */
296: PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
297: PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
298: PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
299: PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
300: /* total number of nonzero values is used to estimate the memory usage in the next step */
301: tnz = ai[an] + bi[bn];
302: PetscCall(MatGetLayouts(mat, &rmap, &cmap));
303: PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
304: PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
305: /* to find the longest message */
306: max_fszs = 0;
307: for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
308: /* is there a better way to estimate the number of nonzeros in the matrix? */
309: PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
310: for (i = 0; i < nidx; i++) rows_data[i] = PetscSafePointerPlusOffset(rows_data_ptr, max_fszs * i);
311: rows_pos = 0;
312: totalrows = 0;
313: for (i = 0; i < nfrom; i++) {
314: PetscCall(PetscArrayzero(rows_pos_i, nidx));
315: /* group data together */
316: for (j = 0; j < fromsizes[2 * i]; j += 2) {
317: is_id = fromrows[rows_pos++]; /* no of is */
318: rows_i = rows_data[is_id];
319: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
320: }
321: /* estimate a space to avoid multiple allocations */
322: for (j = 0; j < nidx; j++) {
323: indvc_ij = 0;
324: rows_i = rows_data[j];
325: for (l = 0; l < rows_pos_i[j]; l++) {
326: row = rows_i[l] - rstart;
327: start = ai[row];
328: end = ai[row + 1];
329: for (k = start; k < end; k++) { /* Amat */
330: col = aj[k] + cstart;
331: indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
332: }
333: start = bi[row];
334: end = bi[row + 1];
335: for (k = start; k < end; k++) { /* Bmat */
336: col = gcols[bj[k]];
337: indices_tmp[indvc_ij++] = col;
338: }
339: }
340: PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
341: indv_counts[i * nidx + j] = indvc_ij;
342: totalrows += indvc_ij;
343: }
344: }
345: /* message triple <IS id, number of rows, rows> */
346: PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
347: totalrows = 0;
348: rows_pos = 0;
349: /* repeat the grouping pass, this time filling the send buffer */
350: for (i = 0; i < nfrom; i++) {
351: PetscCall(PetscArrayzero(rows_pos_i, nidx));
352: for (j = 0; j < fromsizes[2 * i]; j += 2) {
353: is_id = fromrows[rows_pos++];
354: rows_i = rows_data[is_id];
355: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
356: }
357: /* add data */
358: for (j = 0; j < nidx; j++) {
359: if (!indv_counts[i * nidx + j]) continue;
360: indvc_ij = 0;
361: sbdata[totalrows++] = j;
362: sbdata[totalrows++] = indv_counts[i * nidx + j];
363: sbsizes[2 * i] += 2;
364: rows_i = rows_data[j];
365: for (l = 0; l < rows_pos_i[j]; l++) {
366: row = rows_i[l] - rstart;
367: start = ai[row];
368: end = ai[row + 1];
369: for (k = start; k < end; k++) { /* Amat */
370: col = aj[k] + cstart;
371: indices_tmp[indvc_ij++] = col;
372: }
373: start = bi[row];
374: end = bi[row + 1];
375: for (k = start; k < end; k++) { /* Bmat */
376: col = gcols[bj[k]];
377: indices_tmp[indvc_ij++] = col;
378: }
379: }
380: PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
381: sbsizes[2 * i] += indvc_ij;
382: PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
383: totalrows += indvc_ij;
384: }
385: }
386: PetscCall(PetscMalloc1(nfrom + 1, &offsets));
387: offsets[0] = 0;
388: for (i = 0; i < nfrom; i++) {
389: offsets[i + 1] = offsets[i] + sbsizes[2 * i];
390: sbsizes[2 * i + 1] = offsets[i];
391: }
392: PetscCall(PetscFree(offsets));
393: if (sbrowsizes) *sbrowsizes = sbsizes;
394: if (sbrows) *sbrows = sbdata;
395: PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
396: PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
397: PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
398: PetscFunctionReturn(PETSC_SUCCESS);
399: }
401: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
402: {
403: const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
404: PetscInt tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
405: PetscInt lsize, lsize_tmp;
406: PetscMPIInt rank, owner;
407: Mat amat, bmat;
408: PetscBool done;
409: PetscLayout cmap, rmap;
410: MPI_Comm comm;
412: PetscFunctionBegin;
413: PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
414: PetscCallMPI(MPI_Comm_rank(comm, &rank));
415: PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
416: PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
417: PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
418: PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
419: PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
420: /* is this a safe way to compute the number of nonzero values? */
421: tnz = ai[an] + bi[bn];
422: PetscCall(MatGetLayouts(mat, &rmap, &cmap));
423: PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
424: PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
425: /* this estimates memory better than the old implementation,
426: * which used the global size of the matrix
427: */
428: PetscCall(PetscMalloc1(tnz, &indices_temp));
429: for (i = 0; i < nidx; i++) {
430: MPI_Comm iscomm;
432: PetscCall(ISGetLocalSize(is[i], &lsize));
433: PetscCall(ISGetIndices(is[i], &indices));
434: lsize_tmp = 0;
435: for (j = 0; j < lsize; j++) {
436: owner = -1;
437: row = indices[j];
438: PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
439: if (owner != rank) continue;
440: /* local number */
441: row -= rstart;
442: start = ai[row];
443: end = ai[row + 1];
444: for (k = start; k < end; k++) { /* Amat */
445: col = aj[k] + cstart;
446: indices_temp[lsize_tmp++] = col;
447: }
448: start = bi[row];
449: end = bi[row + 1];
450: for (k = start; k < end; k++) { /* Bmat */
451: col = gcols[bj[k]];
452: indices_temp[lsize_tmp++] = col;
453: }
454: }
455: PetscCall(ISRestoreIndices(is[i], &indices));
456: PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
457: PetscCall(ISDestroy(&is[i]));
458: PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
459: PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
460: PetscCall(PetscCommDestroy(&iscomm));
461: }
462: PetscCall(PetscFree(indices_temp));
463: PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
464: PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
465: PetscFunctionReturn(PETSC_SUCCESS);
466: }
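/*
   Editorial sketch (not part of the original source) of the row-traversal idiom used in this
   routine and throughout the file: the global column indices of a locally owned row grow of an
   MPIAIJ matrix are the diagonal-block (A) columns shifted by cstart plus the off-diagonal-block
   (B) columns mapped through the garray returned by MatMPIAIJGetSeqAIJ(). Here mat, grow, rstart
   and cstart are assumed in scope and the row is assumed to have at most 64 nonzeros:

     Mat             Ad, Ao;
     const PetscInt *gcols, *ad_i, *ad_j, *ao_i, *ao_j;
     PetscInt        an, bn, ncols = 0, cols[64], lrow = grow - rstart;
     PetscBool       done;
     PetscCall(MatMPIAIJGetSeqAIJ(mat, &Ad, &Ao, &gcols));
     PetscCall(MatGetRowIJ(Ad, 0, PETSC_FALSE, PETSC_FALSE, &an, &ad_i, &ad_j, &done));
     PetscCall(MatGetRowIJ(Ao, 0, PETSC_FALSE, PETSC_FALSE, &bn, &ao_i, &ao_j, &done));
     for (PetscInt k = ad_i[lrow]; k < ad_i[lrow + 1]; k++) cols[ncols++] = ad_j[k] + cstart; // diagonal block
     for (PetscInt k = ao_i[lrow]; k < ao_i[lrow + 1]; k++) cols[ncols++] = gcols[ao_j[k]];   // off-diagonal block
     PetscCall(MatRestoreRowIJ(Ad, 0, PETSC_FALSE, PETSC_FALSE, &an, &ad_i, &ad_j, &done));
     PetscCall(MatRestoreRowIJ(Ao, 0, PETSC_FALSE, PETSC_FALSE, &bn, &ao_i, &ao_j, &done));
*/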
468: /*
469: Sample message format:
470: If processor A wants processor B to process some elements corresponding
471: to index sets is[1] and is[5]:
472: mesg[0] = 2 (number of index sets in the message)
473: -----------
474: mesg[1] = 1 => is[1]
475: mesg[2] = sizeof(is[1]);
476: -----------
477: mesg[3] = 5 => is[5]
478: mesg[4] = sizeof(is[5]);
479: -----------
480: mesg[5] ...
481: mesg[n] = data of is[1]
482: -----------
483: mesg[n+1] ...
484: mesg[m] = data of is[5]
485: -----------
487: Notes:
488: nrqs - number of requests sent (or to be sent out)
489: nrqr - number of requests received (which have to be, or have been, processed)
490: */
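/*
   Editorial worked example of the format above (hypothetical values): if processor A asks
   processor B to process rows {3, 7} for is[1] and row {10} for is[5], the message is

     mesg = [ 2,        number of index sets in the message
              1, 2,     is[1], with 2 rows in the data section
              5, 1,     is[5], with 1 row in the data section
              3, 7,     data of is[1]
              10 ]      data of is[5]
*/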
491: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
492: {
493: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
494: PetscMPIInt *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
495: const PetscInt **idx, *idx_i;
496: PetscInt *n, **data, len;
497: #if defined(PETSC_USE_CTABLE)
498: PetscHMapI *table_data, table_data_i;
499: PetscInt *tdata, tcount, tcount_max;
500: #else
501: PetscInt *data_i, *d_p;
502: #endif
503: PetscMPIInt size, rank, tag1, tag2, proc = 0;
504: PetscInt M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
505: PetscInt *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
506: PetscBT *table;
507: MPI_Comm comm;
508: MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
509: MPI_Status *recv_status;
510: MPI_Comm *iscomms;
511: char *t_p;
513: PetscFunctionBegin;
514: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
515: size = c->size;
516: rank = c->rank;
517: M = C->rmap->N;
519: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
520: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
522: PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));
524: for (i = 0; i < imax; i++) {
525: PetscCall(ISGetIndices(is[i], &idx[i]));
526: PetscCall(ISGetLocalSize(is[i], &n[i]));
527: }
529: /* evaluate communication - message to whom, length of message, and buffer space
530: required. Based on this, buffers are allocated, and data copied into them */
531: PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
532: for (i = 0; i < imax; i++) {
533: PetscCall(PetscArrayzero(w4, size)); /* initialize work vector */
534: idx_i = idx[i];
535: len = n[i];
536: for (j = 0; j < len; j++) {
537: row = idx_i[j];
538: PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
539: PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
540: w4[proc]++;
541: }
542: for (j = 0; j < size; j++) {
543: if (w4[j]) {
544: w1[j] += w4[j];
545: w3[j]++;
546: }
547: }
548: }
550: nrqs = 0; /* no of outgoing messages */
551: msz = 0; /* total mesg length (for all procs) */
552: w1[rank] = 0; /* no mesg sent to itself */
553: w3[rank] = 0;
554: for (i = 0; i < size; i++) {
555: if (w1[i]) {
556: w2[i] = 1;
557: nrqs++;
558: } /* there exists a message to proc i */
559: }
560: /* pa - list of processors to communicate with */
561: PetscCall(PetscMalloc1(nrqs, &pa));
562: for (i = 0, j = 0; i < size; i++) {
563: if (w1[i]) {
564: pa[j] = i;
565: j++;
566: }
567: }
569: /* Each message would have a header = 1 + 2*(no of IS) + data */
570: for (i = 0; i < nrqs; i++) {
571: j = pa[i];
572: w1[j] += w2[j] + 2 * w3[j];
573: msz += w1[j];
574: }
576: /* Determine the number of messages to expect, their lengths, and the from-ids */
577: PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
578: PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
580: /* Now post the Irecvs corresponding to these messages */
581: PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));
583: /* Allocate Memory for outgoing messages */
584: PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
585: PetscCall(PetscArrayzero(outdat, size));
586: PetscCall(PetscArrayzero(ptr, size));
588: {
589: PetscInt *iptr = tmp, ict = 0;
590: for (i = 0; i < nrqs; i++) {
591: j = pa[i];
592: iptr += ict;
593: outdat[j] = iptr;
594: ict = w1[j];
595: }
596: }
598: /* Form the outgoing messages */
599: /* plug in the headers */
600: for (i = 0; i < nrqs; i++) {
601: j = pa[i];
602: outdat[j][0] = 0;
603: PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
604: ptr[j] = outdat[j] + 2 * w3[j] + 1;
605: }
607: /* Memory for doing local proc's work */
608: {
609: PetscInt M_BPB_imax = 0;
610: #if defined(PETSC_USE_CTABLE)
611: PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
612: PetscCall(PetscMalloc1(imax, &table_data));
613: for (i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i));
614: PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
615: for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
616: #else
617: PetscInt Mimax = 0;
618: PetscCall(PetscIntMultError(M, imax, &Mimax));
619: PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
620: PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
621: for (i = 0; i < imax; i++) {
622: table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
623: data[i] = d_p + M * i;
624: }
625: #endif
626: }
628: /* Parse the IS and update local tables and the outgoing buf with the data */
629: {
630: PetscInt n_i, isz_i, *outdat_j, ctr_j;
631: PetscBT table_i;
633: for (i = 0; i < imax; i++) {
634: PetscCall(PetscArrayzero(ctr, size));
635: n_i = n[i];
636: table_i = table[i];
637: idx_i = idx[i];
638: #if defined(PETSC_USE_CTABLE)
639: table_data_i = table_data[i];
640: #else
641: data_i = data[i];
642: #endif
643: isz_i = isz[i];
644: for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
645: row = idx_i[j];
646: PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
647: if (proc != rank) { /* copy to the outgoing buffer */
648: ctr[proc]++;
649: *ptr[proc] = row;
650: ptr[proc]++;
651: } else if (!PetscBTLookupSet(table_i, row)) {
652: #if defined(PETSC_USE_CTABLE)
653: PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
654: #else
655: data_i[isz_i] = row; /* Update the local table */
656: #endif
657: isz_i++;
658: }
659: }
660: /* Update the headers for the current IS */
661: for (j = 0; j < size; j++) { /* Could optimize this loop by using pa[] */
662: if ((ctr_j = ctr[j])) {
663: outdat_j = outdat[j];
664: k = ++outdat_j[0];
665: outdat_j[2 * k] = ctr_j;
666: outdat_j[2 * k - 1] = i;
667: }
668: }
669: isz[i] = isz_i;
670: }
671: }
673: /* Now post the sends */
674: PetscCall(PetscMalloc1(nrqs, &s_waits1));
675: for (i = 0; i < nrqs; ++i) {
676: j = pa[i];
677: PetscCallMPI(MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
678: }
680: /* No longer need the original indices */
681: PetscCall(PetscMalloc1(imax, &iscomms));
682: for (i = 0; i < imax; ++i) {
683: PetscCall(ISRestoreIndices(is[i], idx + i));
684: PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
685: }
686: PetscCall(PetscFree2(*(PetscInt ***)&idx, n));
688: for (i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));
690: /* Do Local work */
691: #if defined(PETSC_USE_CTABLE)
692: PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
693: #else
694: PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
695: #endif
697: /* Receive messages */
698: PetscCall(PetscMalloc1(nrqr, &recv_status));
699: PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
700: PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
702: /* Phase 1 sends are complete - deallocate buffers */
703: PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
704: PetscCall(PetscFree4(w1, w2, w3, w4));
706: PetscCall(PetscMalloc1(nrqr, &xdata));
707: PetscCall(PetscMalloc1(nrqr, &isz1));
708: PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
709: PetscCall(PetscFree(rbuf[0]));
710: PetscCall(PetscFree(rbuf));
712: /* Send the data back */
713: /* Do a global reduction to determine the buffer space required for incoming messages */
714: {
715: PetscMPIInt *rw1;
717: PetscCall(PetscCalloc1(size, &rw1));
719: for (i = 0; i < nrqr; ++i) {
720: proc = recv_status[i].MPI_SOURCE;
722: PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
723: rw1[proc] = isz1[i];
724: }
725: PetscCall(PetscFree(onodes1));
726: PetscCall(PetscFree(olengths1));
728: /* Determine the number of messages to expect, their lengths, and the from-ids */
729: PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
730: PetscCall(PetscFree(rw1));
731: }
732: /* Now post the Irecvs corresponding to these messages */
733: PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));
735: /* Now post the sends */
736: PetscCall(PetscMalloc1(nrqr, &s_waits2));
737: for (i = 0; i < nrqr; ++i) {
738: j = recv_status[i].MPI_SOURCE;
739: PetscCallMPI(MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i));
740: }
742: /* receive work done on other processors */
743: {
744: PetscInt is_no, ct1, max, *rbuf2_i, isz_i, jmax;
745: PetscMPIInt idex;
746: PetscBT table_i;
748: for (i = 0; i < nrqs; ++i) {
749: PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
750: /* Process the message */
751: rbuf2_i = rbuf2[idex];
752: ct1 = 2 * rbuf2_i[0] + 1;
753: jmax = rbuf2[idex][0];
754: for (j = 1; j <= jmax; j++) {
755: max = rbuf2_i[2 * j];
756: is_no = rbuf2_i[2 * j - 1];
757: isz_i = isz[is_no];
758: table_i = table[is_no];
759: #if defined(PETSC_USE_CTABLE)
760: table_data_i = table_data[is_no];
761: #else
762: data_i = data[is_no];
763: #endif
764: for (k = 0; k < max; k++, ct1++) {
765: row = rbuf2_i[ct1];
766: if (!PetscBTLookupSet(table_i, row)) {
767: #if defined(PETSC_USE_CTABLE)
768: PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
769: #else
770: data_i[isz_i] = row;
771: #endif
772: isz_i++;
773: }
774: }
775: isz[is_no] = isz_i;
776: }
777: }
779: PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
780: }
782: #if defined(PETSC_USE_CTABLE)
783: tcount_max = 0;
784: for (i = 0; i < imax; ++i) {
785: table_data_i = table_data[i];
786: PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
787: if (tcount_max < tcount) tcount_max = tcount;
788: }
789: PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
790: #endif
792: for (i = 0; i < imax; ++i) {
793: #if defined(PETSC_USE_CTABLE)
794: PetscHashIter tpos;
796: table_data_i = table_data[i];
797: PetscHashIterBegin(table_data_i, tpos);
798: while (!PetscHashIterAtEnd(table_data_i, tpos)) {
799: PetscHashIterGetKey(table_data_i, tpos, k);
800: PetscHashIterGetVal(table_data_i, tpos, j);
801: PetscHashIterNext(table_data_i, tpos);
802: tdata[--j] = --k;
803: }
804: PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
805: #else
806: PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
807: #endif
808: PetscCall(PetscCommDestroy(&iscomms[i]));
809: }
811: PetscCall(PetscFree(iscomms));
812: PetscCall(PetscFree(onodes2));
813: PetscCall(PetscFree(olengths2));
815: PetscCall(PetscFree(pa));
816: PetscCall(PetscFree(rbuf2[0]));
817: PetscCall(PetscFree(rbuf2));
818: PetscCall(PetscFree(s_waits1));
819: PetscCall(PetscFree(r_waits1));
820: PetscCall(PetscFree(s_waits2));
821: PetscCall(PetscFree(r_waits2));
822: PetscCall(PetscFree(recv_status));
823: if (xdata) {
824: PetscCall(PetscFree(xdata[0]));
825: PetscCall(PetscFree(xdata));
826: }
827: PetscCall(PetscFree(isz1));
828: #if defined(PETSC_USE_CTABLE)
829: for (i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i));
830: PetscCall(PetscFree(table_data));
831: PetscCall(PetscFree(tdata));
832: PetscCall(PetscFree4(table, data, isz, t_p));
833: #else
834: PetscCall(PetscFree5(table, data, isz, d_p, t_p));
835: #endif
836: PetscFunctionReturn(PETSC_SUCCESS);
837: }
839: /*
840: MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap() to do
841: the work on the local processor.
843: Inputs:
844: C - the MATMPIAIJ matrix;
845: imax - total number of index sets processed at a time;
846: table - an array of PetscBT bit tables, each of size M (the global number of rows) bits.
848: Output:
849: isz - array containing the count of the solution elements corresponding
850: to each index set;
851: data or table_data - pointer to the solutions
852: */
853: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data)
854: {
855: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
856: Mat A = c->A, B = c->B;
857: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
858: PetscInt start, end, val, max, rstart, cstart, *ai, *aj;
859: PetscInt *bi, *bj, *garray, i, j, k, row, isz_i;
860: PetscBT table_i;
861: #if defined(PETSC_USE_CTABLE)
862: PetscHMapI table_data_i;
863: PetscHashIter tpos;
864: PetscInt tcount, *tdata;
865: #else
866: PetscInt *data_i;
867: #endif
869: PetscFunctionBegin;
870: rstart = C->rmap->rstart;
871: cstart = C->cmap->rstart;
872: ai = a->i;
873: aj = a->j;
874: bi = b->i;
875: bj = b->j;
876: garray = c->garray;
878: for (i = 0; i < imax; i++) {
879: #if defined(PETSC_USE_CTABLE)
880: /* copy existing entries of table_data_i into tdata[] */
881: table_data_i = table_data[i];
882: PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
883: PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);
885: PetscCall(PetscMalloc1(tcount, &tdata));
886: PetscHashIterBegin(table_data_i, tpos);
887: while (!PetscHashIterAtEnd(table_data_i, tpos)) {
888: PetscHashIterGetKey(table_data_i, tpos, row);
889: PetscHashIterGetVal(table_data_i, tpos, j);
890: PetscHashIterNext(table_data_i, tpos);
891: tdata[--j] = --row;
892: PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
893: }
894: #else
895: data_i = data[i];
896: #endif
897: table_i = table[i];
898: isz_i = isz[i];
899: max = isz[i];
901: for (j = 0; j < max; j++) {
902: #if defined(PETSC_USE_CTABLE)
903: row = tdata[j] - rstart;
904: #else
905: row = data_i[j] - rstart;
906: #endif
907: start = ai[row];
908: end = ai[row + 1];
909: for (k = start; k < end; k++) { /* Amat */
910: val = aj[k] + cstart;
911: if (!PetscBTLookupSet(table_i, val)) {
912: #if defined(PETSC_USE_CTABLE)
913: PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
914: #else
915: data_i[isz_i] = val;
916: #endif
917: isz_i++;
918: }
919: }
920: start = bi[row];
921: end = bi[row + 1];
922: for (k = start; k < end; k++) { /* Bmat */
923: val = garray[bj[k]];
924: if (!PetscBTLookupSet(table_i, val)) {
925: #if defined(PETSC_USE_CTABLE)
926: PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
927: #else
928: data_i[isz_i] = val;
929: #endif
930: isz_i++;
931: }
932: }
933: }
934: isz[i] = isz_i;
936: #if defined(PETSC_USE_CTABLE)
937: PetscCall(PetscFree(tdata));
938: #endif
939: }
940: PetscFunctionReturn(PETSC_SUCCESS);
941: }
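/*
   Editorial sketch (not part of the original source) of the PetscBT idiom used above to avoid
   duplicates: PetscBTLookupSet() returns the previous value of the bit and then sets it, so a
   zero return means the value is seen for the first time and should be recorded.

     PetscBT  seen;
     PetscInt vals[] = {4, 9, 4, 2}, uniq[4], n = 0;   // hypothetical input
     PetscCall(PetscBTCreate(16, &seen));              // one bit for each possible value 0..15
     for (PetscInt k = 0; k < 4; k++)
       if (!PetscBTLookupSet(seen, vals[k])) uniq[n++] = vals[k];   // uniq ends up as {4, 9, 2}
     PetscCall(PetscBTDestroy(&seen));
*/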
943: /*
944: MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
945: and return the output
947: Input:
948: C - the matrix
949: nrqr - no of messages being processed.
950: rbuf - an array of pointers to the received requests
952: Output:
953: xdata - array of messages to be sent back
954: isz1 - size of each message
956: For better efficiency perhaps we should malloc each xdata[i] separately;
957: then, if a realloc is required, we would need to copy only the data for that one message
958: rather than for all previous ones, as happens now where a single large chunk of
959: memory is used.
961: */
962: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
963: {
964: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
965: Mat A = c->A, B = c->B;
966: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
967: PetscInt rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
968: PetscInt row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
969: PetscInt val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
970: PetscInt *rbuf_i, kmax, rbuf_0;
971: PetscBT xtable;
973: PetscFunctionBegin;
974: m = C->rmap->N;
975: rstart = C->rmap->rstart;
976: cstart = C->cmap->rstart;
977: ai = a->i;
978: aj = a->j;
979: bi = b->i;
980: bj = b->j;
981: garray = c->garray;
983: for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
984: rbuf_i = rbuf[i];
985: rbuf_0 = rbuf_i[0];
986: ct += rbuf_0;
987: for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
988: }
990: if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
991: else max1 = 1;
992: mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
993: if (nrqr) {
994: PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
995: ++no_malloc;
996: }
997: PetscCall(PetscBTCreate(m, &xtable));
998: PetscCall(PetscArrayzero(isz1, nrqr));
1000: ct3 = 0;
1001: for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
1002: rbuf_i = rbuf[i];
1003: rbuf_0 = rbuf_i[0];
1004: ct1 = 2 * rbuf_0 + 1;
1005: ct2 = ct1;
1006: ct3 += ct1;
1007: for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
1008: PetscCall(PetscBTMemzero(m, xtable));
1009: oct2 = ct2;
1010: kmax = rbuf_i[2 * j];
1011: for (k = 0; k < kmax; k++, ct1++) {
1012: row = rbuf_i[ct1];
1013: if (!PetscBTLookupSet(xtable, row)) {
1014: if (!(ct3 < mem_estimate)) {
1015: new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1016: PetscCall(PetscMalloc1(new_estimate, &tmp));
1017: PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1018: PetscCall(PetscFree(xdata[0]));
1019: xdata[0] = tmp;
1020: mem_estimate = new_estimate;
1021: ++no_malloc;
1022: for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1023: }
1024: xdata[i][ct2++] = row;
1025: ct3++;
1026: }
1027: }
1028: for (k = oct2, max2 = ct2; k < max2; k++) {
1029: row = xdata[i][k] - rstart;
1030: start = ai[row];
1031: end = ai[row + 1];
1032: for (l = start; l < end; l++) {
1033: val = aj[l] + cstart;
1034: if (!PetscBTLookupSet(xtable, val)) {
1035: if (!(ct3 < mem_estimate)) {
1036: new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1037: PetscCall(PetscMalloc1(new_estimate, &tmp));
1038: PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1039: PetscCall(PetscFree(xdata[0]));
1040: xdata[0] = tmp;
1041: mem_estimate = new_estimate;
1042: ++no_malloc;
1043: for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1044: }
1045: xdata[i][ct2++] = val;
1046: ct3++;
1047: }
1048: }
1049: start = bi[row];
1050: end = bi[row + 1];
1051: for (l = start; l < end; l++) {
1052: val = garray[bj[l]];
1053: if (!PetscBTLookupSet(xtable, val)) {
1054: if (!(ct3 < mem_estimate)) {
1055: new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1056: PetscCall(PetscMalloc1(new_estimate, &tmp));
1057: PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1058: PetscCall(PetscFree(xdata[0]));
1059: xdata[0] = tmp;
1060: mem_estimate = new_estimate;
1061: ++no_malloc;
1062: for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1063: }
1064: xdata[i][ct2++] = val;
1065: ct3++;
1066: }
1067: }
1068: }
1069: /* Update the header*/
1070: xdata[i][2 * j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1071: xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
1072: }
1073: xdata[i][0] = rbuf_0;
1074: if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
1075: isz1[i] = ct2; /* size of each message */
1076: }
1077: PetscCall(PetscBTDestroy(&xtable));
1078: PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
1079: PetscFunctionReturn(PETSC_SUCCESS);
1080: }
1082: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1083: /*
1084: Every processor gets the entire matrix
1085: */
1086: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
1087: {
1088: Mat B;
1089: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1090: Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
1091: PetscMPIInt size, rank;
1092: PetscInt sendcount, i, *rstarts = A->rmap->range, n, cnt, j, nrecv = 0;
1093: PetscInt m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;
1095: PetscFunctionBegin;
1096: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1097: PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
1098: if (scall == MAT_INITIAL_MATRIX) {
1099: /* Tell every processor the number of nonzeros per row */
1100: PetscCall(PetscCalloc1(A->rmap->N, &lens));
1101: for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
1103: /* All MPI processes get the same matrix */
1104: PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, lens, A->rmap->N, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));
1106: /* Create the sequential matrix of the same type as the local block diagonal */
1107: PetscCall(MatCreate(PETSC_COMM_SELF, &B));
1108: PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
1109: PetscCall(MatSetBlockSizesFromMats(B, A, A));
1110: PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
1111: PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
1112: PetscCall(PetscCalloc1(2, Bin));
1113: **Bin = B;
1114: b = (Mat_SeqAIJ *)B->data;
1116: /* zero column space */
1117: nrecv = 0;
1118: for (i = 0; i < size; i++) {
1119: for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) nrecv += lens[j];
1120: }
1121: PetscCall(PetscArrayzero(b->j, nrecv));
1123: /* Copy my part of matrix column indices over */
1124: sendcount = ad->nz + bd->nz;
1125: jsendbuf = PetscSafePointerPlusOffset(b->j, b->i[rstarts[rank]]);
1126: a_jsendbuf = ad->j;
1127: b_jsendbuf = bd->j;
1128: n = A->rmap->rend - A->rmap->rstart;
1129: cnt = 0;
1130: for (i = 0; i < n; i++) {
1131: /* put in lower diagonal portion */
1132: m = bd->i[i + 1] - bd->i[i];
1133: while (m > 0) {
1134: /* is it above diagonal (in bd (compressed) numbering) */
1135: if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1136: jsendbuf[cnt++] = garray[*b_jsendbuf++];
1137: m--;
1138: }
1140: /* put in diagonal portion */
1141: for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1143: /* put in upper diagonal portion */
1144: while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
1145: }
1146: PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);
1148: /* send column indices, b->j was zeroed */
1149: PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, b->j, nrecv, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));
1151: /* Assemble the matrix into usable form (numerical values not yet set) */
1152: /* set the b->ilen (length of each row) values */
1153: PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
1154: /* set the b->i indices */
1155: b->i[0] = 0;
1156: for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
1157: PetscCall(PetscFree(lens));
1158: PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1159: PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1161: } else {
1162: B = **Bin;
1163: b = (Mat_SeqAIJ *)B->data;
1164: }
1166: /* Copy my part of matrix numerical values into the values location */
1167: if (flag == MAT_GET_VALUES) {
1168: const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
1169: MatScalar *sendbuf;
1171: /* initialize b->a */
1172: PetscCall(PetscArrayzero(b->a, b->nz));
1174: PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
1175: PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
1176: sendcount = ad->nz + bd->nz;
1177: sendbuf = PetscSafePointerPlusOffset(b->a, b->i[rstarts[rank]]);
1178: a_sendbuf = ada;
1179: b_sendbuf = bda;
1180: b_sendj = bd->j;
1181: n = A->rmap->rend - A->rmap->rstart;
1182: cnt = 0;
1183: for (i = 0; i < n; i++) {
1184: /* put in lower diagonal portion */
1185: m = bd->i[i + 1] - bd->i[i];
1186: while (m > 0) {
1187: /* is it above diagonal (in bd (compressed) numbering) */
1188: if (garray[*b_sendj] > A->rmap->rstart + i) break;
1189: sendbuf[cnt++] = *b_sendbuf++;
1190: m--;
1191: b_sendj++;
1192: }
1194: /* put in diagonal portion */
1195: for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;
1197: /* put in upper diagonal portion */
1198: while (m-- > 0) {
1199: sendbuf[cnt++] = *b_sendbuf++;
1200: b_sendj++;
1201: }
1202: }
1203: PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);
1205: /* send values, b->a was zeroed */
1206: PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, b->a, b->nz, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)A)));
1208: PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
1209: PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
1210: } /* end of (flag == MAT_GET_VALUES) */
1212: PetscCall(MatPropagateSymmetryOptions(A, B));
1213: PetscFunctionReturn(PETSC_SUCCESS);
1214: }
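/*
   Editorial sketch (not part of the original source): a direct call to the routine above,
   assuming A is an assembled MATMPIAIJ. Every rank receives its own sequential copy of the
   whole matrix in Bseq[0]; the cleanup shown is one possibility, since the actual callers
   inside PETSc may use a dedicated destroy path.

     Mat *Bseq;
     PetscCall(MatCreateSubMatrix_MPIAIJ_All(A, MAT_GET_VALUES, MAT_INITIAL_MATRIX, &Bseq));
     // Bseq[0] is a sequential matrix (same type as the local diagonal block) of global size M x N on every rank
     PetscCall(MatDestroy(&Bseq[0]));
     PetscCall(PetscFree(Bseq));
*/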
1216: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1217: {
1218: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
1219: Mat submat, A = c->A, B = c->B;
1220: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1221: PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1222: PetscInt cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1223: const PetscInt *icol, *irow;
1224: PetscInt nrow, ncol, start;
1225: PetscMPIInt rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr;
1226: PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc;
1227: PetscInt nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr;
1228: PetscInt **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz;
1229: PetscInt *lens, rmax, ncols, *cols, Crow;
1230: #if defined(PETSC_USE_CTABLE)
1231: PetscHMapI cmap, rmap;
1232: PetscInt *cmap_loc, *rmap_loc;
1233: #else
1234: PetscInt *cmap, *rmap;
1235: #endif
1236: PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i;
1237: PetscInt *cworkB, lwrite, *subcols, *row2proc;
1238: PetscScalar *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1239: MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1240: MPI_Request *r_waits4, *s_waits3 = NULL, *s_waits4;
1241: MPI_Status *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1242: MPI_Status *r_status3 = NULL, *r_status4, *s_status4;
1243: MPI_Comm comm;
1244: PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1245: PetscMPIInt *onodes1, *olengths1, idex, end;
1246: Mat_SubSppt *smatis1;
1247: PetscBool isrowsorted, iscolsorted;
1249: PetscFunctionBegin;
1252: PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1");
1253: PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
1254: PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a));
1255: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
1256: size = c->size;
1257: rank = c->rank;
1259: PetscCall(ISSorted(iscol[0], &iscolsorted));
1260: PetscCall(ISSorted(isrow[0], &isrowsorted));
1261: PetscCall(ISGetIndices(isrow[0], &irow));
1262: PetscCall(ISGetLocalSize(isrow[0], &nrow));
1263: if (allcolumns) {
1264: icol = NULL;
1265: ncol = C->cmap->N;
1266: } else {
1267: PetscCall(ISGetIndices(iscol[0], &icol));
1268: PetscCall(ISGetLocalSize(iscol[0], &ncol));
1269: }
1271: if (scall == MAT_INITIAL_MATRIX) {
1272: PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;
1274: /* Get some new tags to keep the communication clean */
1275: tag1 = ((PetscObject)C)->tag;
1276: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
1277: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
1279: /* evaluate communication - message to whom, length of message, and buffer space
1280: required. Based on this, buffers are allocated, and data copied into them */
1281: PetscCall(PetscCalloc2(size, &w1, size, &w2));
1282: PetscCall(PetscMalloc1(nrow, &row2proc));
1284: /* w1[proc] = num of rows owned by proc -- to be requested */
1285: proc = 0;
1286: nrqs = 0; /* num of outgoing messages */
1287: for (j = 0; j < nrow; j++) {
1288: row = irow[j];
1289: if (!isrowsorted) proc = 0;
1290: while (row >= C->rmap->range[proc + 1]) proc++;
1291: w1[proc]++;
1292: row2proc[j] = proc; /* map row index to proc */
1294: if (proc != rank && !w2[proc]) {
1295: w2[proc] = 1;
1296: nrqs++;
1297: }
1298: }
1299: w1[rank] = 0; /* rows owned by self will not be requested */
1301: PetscCall(PetscMalloc1(nrqs, &pa)); /* proc array */
1302: for (proc = 0, j = 0; proc < size; proc++) {
1303: if (w1[proc]) pa[j++] = proc;
1304: }
1306: /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1307: msz = 0; /* total mesg length (for all procs) */
1308: for (i = 0; i < nrqs; i++) {
1309: proc = pa[i];
1310: w1[proc] += 3;
1311: msz += w1[proc];
1312: }
1313: PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));
1315: /* Determine nrqr, the number of messages to expect, their lengths, and the from-ids */
1316: /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1317: PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
1319: /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1320: Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1321: PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
1323: /* Now post the Irecvs corresponding to these messages */
1324: PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
1326: PetscCall(PetscFree(onodes1));
1327: PetscCall(PetscFree(olengths1));
1329: /* Allocate Memory for outgoing messages */
1330: PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
1331: PetscCall(PetscArrayzero(sbuf1, size));
1332: PetscCall(PetscArrayzero(ptr, size));
1334: /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1335: iptr = tmp;
1336: for (i = 0; i < nrqs; i++) {
1337: proc = pa[i];
1338: sbuf1[proc] = iptr;
1339: iptr += w1[proc];
1340: }
1342: /* Form the outgoing messages */
1343: /* Initialize the header space */
1344: for (i = 0; i < nrqs; i++) {
1345: proc = pa[i];
1346: PetscCall(PetscArrayzero(sbuf1[proc], 3));
1347: ptr[proc] = sbuf1[proc] + 3;
1348: }
1350: /* Parse the isrow and copy data into outbuf */
1351: PetscCall(PetscArrayzero(ctr, size));
1352: for (j = 0; j < nrow; j++) { /* parse the indices of each IS */
1353: proc = row2proc[j];
1354: if (proc != rank) { /* copy to the outgoing buf*/
1355: *ptr[proc] = irow[j];
1356: ctr[proc]++;
1357: ptr[proc]++;
1358: }
1359: }
1361: /* Update the headers for the current IS */
1362: for (j = 0; j < size; j++) { /* Could optimize this loop too */
1363: if ((ctr_j = ctr[j])) {
1364: sbuf1_j = sbuf1[j];
1365: k = ++sbuf1_j[0];
1366: sbuf1_j[2 * k] = ctr_j;
1367: sbuf1_j[2 * k - 1] = 0;
1368: }
1369: }
1371: /* Now post the sends */
1372: PetscCall(PetscMalloc1(nrqs, &s_waits1));
1373: for (i = 0; i < nrqs; ++i) {
1374: proc = pa[i];
1375: PetscCallMPI(MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i));
1376: }
1378: /* Post Receives to capture the buffer size */
1379: PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2));
1380: PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
1382: if (nrqs) rbuf2[0] = tmp + msz;
1383: for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
1385: for (i = 0; i < nrqs; ++i) {
1386: proc = pa[i];
1387: PetscCallMPI(MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i));
1388: }
1390: PetscCall(PetscFree2(w1, w2));
1392: /* Send to other procs the buf size they should allocate */
1393: /* Receive messages*/
1394: PetscCall(PetscMalloc1(nrqr, &r_status1));
1395: PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
1397: PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1));
1398: for (i = 0; i < nrqr; ++i) {
1399: req_size[i] = 0;
1400: rbuf1_i = rbuf1[i];
1401: start = 2 * rbuf1_i[0] + 1;
1402: PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end));
1403: PetscCall(PetscMalloc1(end, &sbuf2[i]));
1404: sbuf2_i = sbuf2[i];
1405: for (j = start; j < end; j++) {
1406: k = rbuf1_i[j] - rstart;
1407: ncols = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1408: sbuf2_i[j] = ncols;
1409: req_size[i] += ncols;
1410: }
1411: req_source1[i] = r_status1[i].MPI_SOURCE;
1413: /* form the header */
1414: sbuf2_i[0] = req_size[i];
1415: for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
1417: PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
1418: }
1420: PetscCall(PetscFree(r_status1));
1421: PetscCall(PetscFree(r_waits1));
1423: /* rbuf2 has been received; post receives for the column indices a->j */
1424: PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2));
1426: PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3));
1427: for (i = 0; i < nrqs; ++i) {
1428: PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
1429: req_source2[i] = r_status2[i].MPI_SOURCE;
1430: PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
1431: }
1433: /* Wait on sends1 and sends2 */
1434: PetscCall(PetscMalloc1(nrqs, &s_status1));
1435: PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1));
1436: PetscCall(PetscFree(s_waits1));
1437: PetscCall(PetscFree(s_status1));
1439: PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2));
1440: PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2));
1442: /* Now allocate sending buffers for a->j, and send them off */
1443: PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
1444: for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1445: if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
1446: for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
1448: for (i = 0; i < nrqr; i++) { /* for each requested message */
1449: rbuf1_i = rbuf1[i];
1450: sbuf_aj_i = sbuf_aj[i];
1451: ct1 = 2 * rbuf1_i[0] + 1;
1452: ct2 = 0;
1453: /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1455: kmax = rbuf1[i][2];
1456: for (k = 0; k < kmax; k++, ct1++) { /* for each row */
1457: row = rbuf1_i[ct1] - rstart;
1458: nzA = ai[row + 1] - ai[row];
1459: nzB = bi[row + 1] - bi[row];
1460: ncols = nzA + nzB;
1461: cworkA = PetscSafePointerPlusOffset(aj, ai[row]);
1462: cworkB = PetscSafePointerPlusOffset(bj, bi[row]);
1464: /* load the column indices for this row into cols*/
1465: cols = PetscSafePointerPlusOffset(sbuf_aj_i, ct2);
1467: lwrite = 0;
1468: for (l = 0; l < nzB; l++) {
1469: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1470: }
1471: for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1472: for (l = 0; l < nzB; l++) {
1473: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1474: }
1476: ct2 += ncols;
1477: }
1478: PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
1479: }
1481: /* create column map (cmap): global col of C -> local col of submat */
1482: #if defined(PETSC_USE_CTABLE)
1483: if (!allcolumns) {
1484: PetscCall(PetscHMapICreateWithSize(ncol, &cmap));
1485: PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc));
1486: for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1487: if (icol[j] >= cstart && icol[j] < cend) {
1488: cmap_loc[icol[j] - cstart] = j + 1;
1489: } else { /* use PetscHMapI for non-local col indices */
1490: PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1));
1491: }
1492: }
1493: } else {
1494: cmap = NULL;
1495: cmap_loc = NULL;
1496: }
1497: PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc));
1498: #else
1499: if (!allcolumns) {
1500: PetscCall(PetscCalloc1(C->cmap->N, &cmap));
1501: for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1502: } else {
1503: cmap = NULL;
1504: }
1505: #endif
1507: /* Create lens for MatSeqAIJSetPreallocation() */
1508: PetscCall(PetscCalloc1(nrow, &lens));
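  /* lens[j] will hold, for submat row j, the exact number of selected columns: the local
     contributions are counted here and the off-process contributions are added once rbuf3
     arrives, so MatSeqAIJSetPreallocation() below allocates precisely what is inserted later */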
1510: /* Compute lens from local part of C */
1511: for (j = 0; j < nrow; j++) {
1512: row = irow[j];
1513: proc = row2proc[j];
1514: if (proc == rank) {
1515: /* diagonal part A = c->A */
1516: ncols = ai[row - rstart + 1] - ai[row - rstart];
1517: cols = PetscSafePointerPlusOffset(aj, ai[row - rstart]);
1518: if (!allcolumns) {
1519: for (k = 0; k < ncols; k++) {
1520: #if defined(PETSC_USE_CTABLE)
1521: tcol = cmap_loc[cols[k]];
1522: #else
1523: tcol = cmap[cols[k] + cstart];
1524: #endif
1525: if (tcol) lens[j]++;
1526: }
1527: } else { /* allcolumns */
1528: lens[j] = ncols;
1529: }
1531: /* off-diagonal part B = c->B */
1532: ncols = bi[row - rstart + 1] - bi[row - rstart];
1533: cols = PetscSafePointerPlusOffset(bj, bi[row - rstart]);
1534: if (!allcolumns) {
1535: for (k = 0; k < ncols; k++) {
1536: #if defined(PETSC_USE_CTABLE)
1537: PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1538: #else
1539: tcol = cmap[bmap[cols[k]]];
1540: #endif
1541: if (tcol) lens[j]++;
1542: }
1543: } else { /* allcolumns */
1544: lens[j] += ncols;
1545: }
1546: }
1547: }
1549: /* Create row map (rmap): global row of C -> local row of submat */
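  /* The hash-map entries follow the same +1 convention as cmap; rmap_loc[] (locally owned rows)
     and the dense-array variant store the submat row index directly, since those lookups never
     need a "missing" sentinel */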
1550: #if defined(PETSC_USE_CTABLE)
1551: PetscCall(PetscHMapICreateWithSize(nrow, &rmap));
1552: for (j = 0; j < nrow; j++) {
1553: row = irow[j];
1554: proc = row2proc[j];
1555: if (proc == rank) { /* a local row */
1556: rmap_loc[row - rstart] = j;
1557: } else {
1558: PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1));
1559: }
1560: }
1561: #else
1562: PetscCall(PetscCalloc1(C->rmap->N, &rmap));
1563: for (j = 0; j < nrow; j++) rmap[irow[j]] = j;
1564: #endif
1566: /* Update lens from offproc data */
1567:   /* wait until the receives of a->j (rbuf3) have completed */
1568: PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3));
1569: for (i = 0; i < nrqs; i++) {
1570: proc = pa[i];
1571: sbuf1_i = sbuf1[proc];
1572: /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1573: ct1 = 2 + 1;
1574: ct2 = 0;
1575: rbuf2_i = rbuf2[i]; /* received length of C->j */
1576: rbuf3_i = rbuf3[i]; /* received C->j */
1578: /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1579: max1 = sbuf1_i[2];
1580: for (k = 0; k < max1; k++, ct1++) {
1581: #if defined(PETSC_USE_CTABLE)
1582: PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row));
1583: row--;
1584: PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
1585: #else
1586: row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1587: #endif
1588: /* Now, store row index of submat in sbuf1_i[ct1] */
1589: sbuf1_i[ct1] = row;
1591: nnz = rbuf2_i[ct1];
1592: if (!allcolumns) {
1593: for (l = 0; l < nnz; l++, ct2++) {
1594: #if defined(PETSC_USE_CTABLE)
1595: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1596: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1597: } else {
1598: PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1599: }
1600: #else
1601: tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1602: #endif
1603: if (tcol) lens[row]++;
1604: }
1605: } else { /* allcolumns */
1606: lens[row] += nnz;
1607: }
1608: }
1609: }
1610: PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3));
1611: PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3));
1613: /* Create the submatrices */
1614: PetscCall(MatCreate(PETSC_COMM_SELF, &submat));
1615: PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE));
1617: PetscCall(ISGetBlockSize(isrow[0], &i));
1618: PetscCall(ISGetBlockSize(iscol[0], &j));
1619: if (i > 1 || j > 1) PetscCall(MatSetBlockSizes(submat, i, j));
1620: PetscCall(MatSetType(submat, ((PetscObject)A)->type_name));
1621: PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens));
1623:     /* create struct Mat_SubSppt and attach it to submat */
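    /* Mat_SubSppt caches the communication pattern and receive buffers built above so that a
       later call with MAT_REUSE_MATRIX can refill the same submatrix without redoing the setup;
       it is released by MatDestroySubMatrix_SeqAIJ(), which is installed as the destroy routine
       below */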
1624: PetscCall(PetscNew(&smatis1));
1625: subc = (Mat_SeqAIJ *)submat->data;
1626: subc->submatis1 = smatis1;
1628: smatis1->id = 0;
1629: smatis1->nrqs = nrqs;
1630: smatis1->nrqr = nrqr;
1631: smatis1->rbuf1 = rbuf1;
1632: smatis1->rbuf2 = rbuf2;
1633: smatis1->rbuf3 = rbuf3;
1634: smatis1->sbuf2 = sbuf2;
1635: smatis1->req_source2 = req_source2;
1637: smatis1->sbuf1 = sbuf1;
1638: smatis1->ptr = ptr;
1639: smatis1->tmp = tmp;
1640: smatis1->ctr = ctr;
1642: smatis1->pa = pa;
1643: smatis1->req_size = req_size;
1644: smatis1->req_source1 = req_source1;
1646: smatis1->allcolumns = allcolumns;
1647: smatis1->singleis = PETSC_TRUE;
1648: smatis1->row2proc = row2proc;
1649: smatis1->rmap = rmap;
1650: smatis1->cmap = cmap;
1651: #if defined(PETSC_USE_CTABLE)
1652: smatis1->rmap_loc = rmap_loc;
1653: smatis1->cmap_loc = cmap_loc;
1654: #endif
1656: smatis1->destroy = submat->ops->destroy;
1657: submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1658: submat->factortype = C->factortype;
1660: /* compute rmax */
1661: rmax = 0;
1662: for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);
1664: } else { /* scall == MAT_REUSE_MATRIX */
1665: submat = submats[0];
1666:     PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix, wrong size");
1668: subc = (Mat_SeqAIJ *)submat->data;
1669: rmax = subc->rmax;
1670: smatis1 = subc->submatis1;
1671: nrqs = smatis1->nrqs;
1672: nrqr = smatis1->nrqr;
1673: rbuf1 = smatis1->rbuf1;
1674: rbuf2 = smatis1->rbuf2;
1675: rbuf3 = smatis1->rbuf3;
1676: req_source2 = smatis1->req_source2;
1678: sbuf1 = smatis1->sbuf1;
1679: sbuf2 = smatis1->sbuf2;
1680: ptr = smatis1->ptr;
1681: tmp = smatis1->tmp;
1682: ctr = smatis1->ctr;
1684: pa = smatis1->pa;
1685: req_size = smatis1->req_size;
1686: req_source1 = smatis1->req_source1;
1688: allcolumns = smatis1->allcolumns;
1689: row2proc = smatis1->row2proc;
1690: rmap = smatis1->rmap;
1691: cmap = smatis1->cmap;
1692: #if defined(PETSC_USE_CTABLE)
1693: rmap_loc = smatis1->rmap_loc;
1694: cmap_loc = smatis1->cmap_loc;
1695: #endif
1696: }
1698: /* Post recv matrix values */
1699: PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals));
1700: PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4));
1701: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
1702: for (i = 0; i < nrqs; ++i) {
1703: PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
1704: PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
1705: }
1707: /* Allocate sending buffers for a->a, and send them off */
1708: PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
1709: for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1710: if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
1711: for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
1713: for (i = 0; i < nrqr; i++) {
1714: rbuf1_i = rbuf1[i];
1715: sbuf_aa_i = sbuf_aa[i];
1716: ct1 = 2 * rbuf1_i[0] + 1;
1717: ct2 = 0;
1718: /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1720: kmax = rbuf1_i[2];
1721: for (k = 0; k < kmax; k++, ct1++) {
1722: row = rbuf1_i[ct1] - rstart;
1723: nzA = ai[row + 1] - ai[row];
1724: nzB = bi[row + 1] - bi[row];
1725: ncols = nzA + nzB;
1726: cworkB = PetscSafePointerPlusOffset(bj, bi[row]);
1727: vworkA = PetscSafePointerPlusOffset(a_a, ai[row]);
1728: vworkB = PetscSafePointerPlusOffset(b_a, bi[row]);
1730:       /* load the column values for this row into vals */
1731: vals = PetscSafePointerPlusOffset(sbuf_aa_i, ct2);
1733: lwrite = 0;
1734: for (l = 0; l < nzB; l++) {
1735: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1736: }
1737: for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1738: for (l = 0; l < nzB; l++) {
1739: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1740: }
1742: ct2 += ncols;
1743: }
1744: PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
1745: }
1747: /* Assemble submat */
1748: /* First assemble the local rows */
1749: for (j = 0; j < nrow; j++) {
1750: row = irow[j];
1751: proc = row2proc[j];
1752: if (proc == rank) {
1753: Crow = row - rstart; /* local row index of C */
1754: #if defined(PETSC_USE_CTABLE)
1755: row = rmap_loc[Crow]; /* row index of submat */
1756: #else
1757: row = rmap[row];
1758: #endif
1760: if (allcolumns) {
1761: /* diagonal part A = c->A */
1762: ncols = ai[Crow + 1] - ai[Crow];
1763: cols = PetscSafePointerPlusOffset(aj, ai[Crow]);
1764: vals = PetscSafePointerPlusOffset(a_a, ai[Crow]);
1765: i = 0;
1766: for (k = 0; k < ncols; k++) {
1767: subcols[i] = cols[k] + cstart;
1768: subvals[i++] = vals[k];
1769: }
1771: /* off-diagonal part B = c->B */
1772: ncols = bi[Crow + 1] - bi[Crow];
1773: cols = PetscSafePointerPlusOffset(bj, bi[Crow]);
1774: vals = PetscSafePointerPlusOffset(b_a, bi[Crow]);
1775: for (k = 0; k < ncols; k++) {
1776: subcols[i] = bmap[cols[k]];
1777: subvals[i++] = vals[k];
1778: }
1780: PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1782: } else { /* !allcolumns */
1783: #if defined(PETSC_USE_CTABLE)
1784: /* diagonal part A = c->A */
1785: ncols = ai[Crow + 1] - ai[Crow];
1786: cols = PetscSafePointerPlusOffset(aj, ai[Crow]);
1787: vals = PetscSafePointerPlusOffset(a_a, ai[Crow]);
1788: i = 0;
1789: for (k = 0; k < ncols; k++) {
1790: tcol = cmap_loc[cols[k]];
1791: if (tcol) {
1792: subcols[i] = --tcol;
1793: subvals[i++] = vals[k];
1794: }
1795: }
1797: /* off-diagonal part B = c->B */
1798: ncols = bi[Crow + 1] - bi[Crow];
1799: cols = PetscSafePointerPlusOffset(bj, bi[Crow]);
1800: vals = PetscSafePointerPlusOffset(b_a, bi[Crow]);
1801: for (k = 0; k < ncols; k++) {
1802: PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1803: if (tcol) {
1804: subcols[i] = --tcol;
1805: subvals[i++] = vals[k];
1806: }
1807: }
1808: #else
1809: /* diagonal part A = c->A */
1810: ncols = ai[Crow + 1] - ai[Crow];
1811: cols = aj + ai[Crow];
1812: vals = a_a + ai[Crow];
1813: i = 0;
1814: for (k = 0; k < ncols; k++) {
1815: tcol = cmap[cols[k] + cstart];
1816: if (tcol) {
1817: subcols[i] = --tcol;
1818: subvals[i++] = vals[k];
1819: }
1820: }
1822: /* off-diagonal part B = c->B */
1823: ncols = bi[Crow + 1] - bi[Crow];
1824: cols = bj + bi[Crow];
1825: vals = b_a + bi[Crow];
1826: for (k = 0; k < ncols; k++) {
1827: tcol = cmap[bmap[cols[k]]];
1828: if (tcol) {
1829: subcols[i] = --tcol;
1830: subvals[i++] = vals[k];
1831: }
1832: }
1833: #endif
1834: PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1835: }
1836: }
1837: }
1839: /* Now assemble the off-proc rows */
1840: for (i = 0; i < nrqs; i++) { /* for each requested message */
1841: /* recv values from other processes */
1842: PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i));
1843: proc = pa[idex];
1844: sbuf1_i = sbuf1[proc];
1845: /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1846: ct1 = 2 + 1;
1847: ct2 = 0; /* count of received C->j */
1848: ct3 = 0; /* count of received C->j that will be inserted into submat */
1849:     rbuf2_i = rbuf2[idex]; /* received lengths of C->j from the sending process */
1850:     rbuf3_i = rbuf3[idex]; /* received C->j from the sending process */
1851:     rbuf4_i = rbuf4[idex]; /* received C->a from the sending process */
1853: /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1854: max1 = sbuf1_i[2]; /* num of rows */
1855:     for (k = 0; k < max1; k++, ct1++) { /* for each received row */
1856: row = sbuf1_i[ct1]; /* row index of submat */
1857: if (!allcolumns) {
1858: idex = 0;
1859: if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1860: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1861:           for (l = 0; l < nnz; l++, ct2++) { /* for each received column */
1862: #if defined(PETSC_USE_CTABLE)
1863: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1864: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1865: } else {
1866: PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1867: }
1868: #else
1869: tcol = cmap[rbuf3_i[ct2]];
1870: #endif
1871: if (tcol) {
1872: subcols[idex] = --tcol; /* may not be sorted */
1873: subvals[idex++] = rbuf4_i[ct2];
1875:               /* We receive an entire row of C (all of its column indices), but only a subset of it needs to be inserted into submat.
1876:                  For reuse, we replace the received C->j with the index that should be inserted into submat */
1877: if (iscolsorted) rbuf3_i[ct3++] = ct2;
1878: }
1879: }
1880: PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES));
1881: } else { /* scall == MAT_REUSE_MATRIX */
1882: submat = submats[0];
1883: subc = (Mat_SeqAIJ *)submat->data;
1885: nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1886: for (l = 0; l < nnz; l++) {
1887: ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1888: subvals[idex++] = rbuf4_i[ct2];
1889: }
1891: bj = subc->j + subc->i[row]; /* sorted column indices */
1892: PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES));
1893: }
1894: } else { /* allcolumns */
1895: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1896: PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, PetscSafePointerPlusOffset(rbuf3_i, ct2), PetscSafePointerPlusOffset(rbuf4_i, ct2), INSERT_VALUES));
1897: ct2 += nnz;
1898: }
1899: }
1900: }
1902:   /* sends of a->a are done */
1903: PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4));
1904: PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4));
1906: PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY));
1907: PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
1908: submats[0] = submat;
1910: /* Restore the indices */
1911: PetscCall(ISRestoreIndices(isrow[0], &irow));
1912: if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol));
1914: /* Destroy allocated memory */
1915: for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
1916: PetscCall(PetscFree3(rbuf4, subcols, subvals));
1917: if (sbuf_aa) {
1918: PetscCall(PetscFree(sbuf_aa[0]));
1919: PetscCall(PetscFree(sbuf_aa));
1920: }
1922: if (scall == MAT_INITIAL_MATRIX) {
1923: PetscCall(PetscFree(lens));
1924: if (sbuf_aj) {
1925: PetscCall(PetscFree(sbuf_aj[0]));
1926: PetscCall(PetscFree(sbuf_aj));
1927: }
1928: }
1929: PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
1930: PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
1931: PetscFunctionReturn(PETSC_SUCCESS);
1932: }
1934: static PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1935: {
1936: PetscInt ncol;
1937: PetscBool colflag, allcolumns = PETSC_FALSE;
1939: PetscFunctionBegin;
1940: /* Allocate memory to hold all the submatrices */
1941: if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));
1943:   /* Check for special case: each processor requests all of the matrix columns */
1944: PetscCall(ISIdentity(iscol[0], &colflag));
1945: PetscCall(ISGetLocalSize(iscol[0], &ncol));
1946: if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1948: PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
1949: PetscFunctionReturn(PETSC_SUCCESS);
1950: }
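/*
   A hedged, caller-level sketch (illustrative only) of how these routines are typically driven
   through the public MatCreateSubMatrices() interface for an MPIAIJ matrix C; nloc and rstart
   are made-up placeholders for the rows wanted by this rank:

     IS   is;
     Mat *submats;
     PetscCall(ISCreateStride(PETSC_COMM_SELF, nloc, rstart, 1, &is));              // rows wanted by this rank
     PetscCall(MatCreateSubMatrices(C, 1, &is, &is, MAT_INITIAL_MATRIX, &submats)); // builds a sequential AIJ submatrix
     PetscCall(MatCreateSubMatrices(C, 1, &is, &is, MAT_REUSE_MATRIX, &submats));   // refill after C's values change
     PetscCall(MatDestroySubMatrices(1, &submats));
     PetscCall(ISDestroy(&is));
*/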
1952: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1953: {
1954: PetscInt nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2];
1955: PetscBool rowflag, colflag, wantallmatrix = PETSC_FALSE;
1956: Mat_SeqAIJ *subc;
1957: Mat_SubSppt *smat;
1959: PetscFunctionBegin;
1960: /* Check for special case: each processor has a single IS */
1961: if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1962: PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
1963:     C->submat_singleis = PETSC_FALSE; /* restore the default value in case C is later used with non-single IS */
1964: PetscFunctionReturn(PETSC_SUCCESS);
1965: }
1967: /* Collect global wantallmatrix and nstages */
1968: if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
1969: else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
1970: if (!nmax) nmax = 1;
1972: if (scall == MAT_INITIAL_MATRIX) {
1973: /* Collect global wantallmatrix and nstages */
1974: if (ismax == 1 && C->rmap->N == C->cmap->N) {
1975: PetscCall(ISIdentity(*isrow, &rowflag));
1976: PetscCall(ISIdentity(*iscol, &colflag));
1977: PetscCall(ISGetLocalSize(*isrow, &nrow));
1978: PetscCall(ISGetLocalSize(*iscol, &ncol));
1979: if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1980: wantallmatrix = PETSC_TRUE;
1982: PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
1983: }
1984: }
1986: /* Determine the number of stages through which submatrices are done
1987: Each stage will extract nmax submatrices.
1988: nmax is determined by the matrix column dimension.
1989: If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1990: */
1991: nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
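    /* Illustrative arithmetic (assuming 4-byte PetscInt): with C->cmap->N = 1,000,000 we get
       nmax = 20*1000000/(1000000*4) = 5, so a request for ismax = 12 submatrices is processed
       in nstages = 12/5 + 1 = 3 stages */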
1993: in[0] = -1 * (PetscInt)wantallmatrix;
1994: in[1] = nstages;
1995: PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
1996: wantallmatrix = (PetscBool)(-out[0]);
1997: nstages = out[1]; /* Make sure every processor loops through the global nstages */
1999: } else { /* MAT_REUSE_MATRIX */
2000: if (ismax) {
2001: subc = (Mat_SeqAIJ *)(*submat)[0]->data;
2002: smat = subc->submatis1;
2003: } else { /* (*submat)[0] is a dummy matrix */
2004: smat = (Mat_SubSppt *)(*submat)[0]->data;
2005: }
2006: if (!smat) {
2007: /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2008: wantallmatrix = PETSC_TRUE;
2009: } else if (smat->singleis) {
2010: PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
2011: PetscFunctionReturn(PETSC_SUCCESS);
2012: } else {
2013: nstages = smat->nstages;
2014: }
2015: }
2017: if (wantallmatrix) {
2018: PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
2019: PetscFunctionReturn(PETSC_SUCCESS);
2020: }
2022: /* Allocate memory to hold all the submatrices and dummy submatrices */
2023: if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));
2025: for (i = 0, pos = 0; i < nstages; i++) {
2026: if (pos + nmax <= ismax) max_no = nmax;
2027: else if (pos >= ismax) max_no = 0;
2028: else max_no = ismax - pos;
2030: PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, PetscSafePointerPlusOffset(isrow, pos), PetscSafePointerPlusOffset(iscol, pos), scall, *submat + pos));
2031: if (!max_no) {
2032: if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2033: smat = (Mat_SubSppt *)(*submat)[pos]->data;
2034: smat->nstages = nstages;
2035: }
2036: pos++; /* advance to next dummy matrix if any */
2037: } else pos += max_no;
2038: }
2040: if (ismax && scall == MAT_INITIAL_MATRIX) {
2041: /* save nstages for reuse */
2042: subc = (Mat_SeqAIJ *)(*submat)[0]->data;
2043: smat = subc->submatis1;
2044: smat->nstages = nstages;
2045: }
2046: PetscFunctionReturn(PETSC_SUCCESS);
2047: }
2049: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2050: {
2051: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
2052: Mat A = c->A;
2053: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2054: const PetscInt **icol, **irow;
2055: PetscInt *nrow, *ncol, start;
2056: PetscMPIInt rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr;
2057: PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1;
2058: PetscInt nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol;
2059: PetscInt **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2060: PetscInt **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2061: #if defined(PETSC_USE_CTABLE)
2062: PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i;
2063: #else
2064: PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2065: #endif
2066: const PetscInt *irow_i;
2067: PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2068: MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2069: MPI_Request *r_waits4, *s_waits3, *s_waits4;
2070: MPI_Comm comm;
2071: PetscScalar **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2072: PetscMPIInt *onodes1, *olengths1, end;
2073: PetscInt **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2074: Mat_SubSppt *smat_i;
2075: PetscBool *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2076: PetscInt *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen;
2078: PetscFunctionBegin;
2079: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2080: size = c->size;
2081: rank = c->rank;
2083: PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns));
2084: PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted));
2086: for (i = 0; i < ismax; i++) {
2087: PetscCall(ISSorted(iscol[i], &issorted[i]));
2088: if (!issorted[i]) iscsorted = issorted[i];
2090: PetscCall(ISSorted(isrow[i], &issorted[i]));
2092: PetscCall(ISGetIndices(isrow[i], &irow[i]));
2093: PetscCall(ISGetLocalSize(isrow[i], &nrow[i]));
2095: /* Check for special case: allcolumn */
2096: PetscCall(ISIdentity(iscol[i], &colflag));
2097: PetscCall(ISGetLocalSize(iscol[i], &ncol[i]));
2098: if (colflag && ncol[i] == C->cmap->N) {
2099: allcolumns[i] = PETSC_TRUE;
2100: icol[i] = NULL;
2101: } else {
2102: allcolumns[i] = PETSC_FALSE;
2103: PetscCall(ISGetIndices(iscol[i], &icol[i]));
2104: }
2105: }
2107: if (scall == MAT_REUSE_MATRIX) {
2108: /* Assumes new rows are same length as the old rows */
2109: for (i = 0; i < ismax; i++) {
2110: PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i);
2111: subc = (Mat_SeqAIJ *)submats[i]->data;
2112:       PetscCheck(submats[i]->rmap->n == nrow[i] && submats[i]->cmap->n == ncol[i], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix, wrong size");
2114:       /* Initialize the matrix as if it were empty */
2115: PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n));
2117: smat_i = subc->submatis1;
2119: nrqs = smat_i->nrqs;
2120: nrqr = smat_i->nrqr;
2121: rbuf1 = smat_i->rbuf1;
2122: rbuf2 = smat_i->rbuf2;
2123: rbuf3 = smat_i->rbuf3;
2124: req_source2 = smat_i->req_source2;
2126: sbuf1 = smat_i->sbuf1;
2127: sbuf2 = smat_i->sbuf2;
2128: ptr = smat_i->ptr;
2129: tmp = smat_i->tmp;
2130: ctr = smat_i->ctr;
2132: pa = smat_i->pa;
2133: req_size = smat_i->req_size;
2134: req_source1 = smat_i->req_source1;
2136: allcolumns[i] = smat_i->allcolumns;
2137: row2proc[i] = smat_i->row2proc;
2138: rmap[i] = smat_i->rmap;
2139: cmap[i] = smat_i->cmap;
2140: }
2142: if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2143: PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse");
2144: smat_i = (Mat_SubSppt *)submats[0]->data;
2146: nrqs = smat_i->nrqs;
2147: nrqr = smat_i->nrqr;
2148: rbuf1 = smat_i->rbuf1;
2149: rbuf2 = smat_i->rbuf2;
2150: rbuf3 = smat_i->rbuf3;
2151: req_source2 = smat_i->req_source2;
2153: sbuf1 = smat_i->sbuf1;
2154: sbuf2 = smat_i->sbuf2;
2155: ptr = smat_i->ptr;
2156: tmp = smat_i->tmp;
2157: ctr = smat_i->ctr;
2159: pa = smat_i->pa;
2160: req_size = smat_i->req_size;
2161: req_source1 = smat_i->req_source1;
2163: allcolumns[0] = PETSC_FALSE;
2164: }
2165: } else { /* scall == MAT_INITIAL_MATRIX */
2166: /* Get some new tags to keep the communication clean */
2167: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
2168: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
2170:     /* Evaluate the communication: message to whom, message length, and buffer space
2171:        required. Based on this, buffers are allocated, and data copied into them */
2172: PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */
2174: for (i = 0; i < ismax; i++) {
2175: jmax = nrow[i];
2176: irow_i = irow[i];
2178: PetscCall(PetscMalloc1(jmax, &row2proc_i));
2179: row2proc[i] = row2proc_i;
2181: if (issorted[i]) proc = 0;
2182: for (j = 0; j < jmax; j++) {
2183: if (!issorted[i]) proc = 0;
2184: row = irow_i[j];
2185: while (row >= C->rmap->range[proc + 1]) proc++;
2186: w4[proc]++;
2187: row2proc_i[j] = proc; /* map row index to proc */
2188: }
2189: for (j = 0; j < size; j++) {
2190: if (w4[j]) {
2191: w1[j] += w4[j];
2192: w3[j]++;
2193: w4[j] = 0;
2194: }
2195: }
2196: }
2198: nrqs = 0; /* no of outgoing messages */
2199: msz = 0; /* total mesg length (for all procs) */
2200: w1[rank] = 0; /* no mesg sent to self */
2201: w3[rank] = 0;
2202: for (i = 0; i < size; i++) {
2203: if (w1[i]) {
2204: w2[i] = 1;
2205: nrqs++;
2206: } /* there exists a message to proc i */
2207: }
2208:     PetscCall(PetscMalloc1(nrqs, &pa)); /* proc array */
2209: for (i = 0, j = 0; i < size; i++) {
2210: if (w1[i]) {
2211: pa[j] = i;
2212: j++;
2213: }
2214: }
2216: /* Each message would have a header = 1 + 2*(no of IS) + data */
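    /* Concretely, the message sent to proc j is laid out as
         sbuf1[j][0]              = number of ISs that have rows owned by proc j,
         sbuf1[j][1 .. 2*w3[j]]   = (IS index, row count) pairs, one pair per such IS,
         sbuf1[j][2*w3[j]+1 ...]  = the requested global row indices, grouped by IS;
       ptr[j] tracks the current write position inside the data part */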
2217: for (i = 0; i < nrqs; i++) {
2218: j = pa[i];
2219: w1[j] += w2[j] + 2 * w3[j];
2220: msz += w1[j];
2221: }
2222: PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));
2224:     /* Determine the number of messages to expect, their lengths, and their from-ids */
2225: PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
2226: PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
2228: /* Now post the Irecvs corresponding to these messages */
2229: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0));
2230: PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
2232: /* Allocate Memory for outgoing messages */
2233: PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
2234: PetscCall(PetscArrayzero(sbuf1, size));
2235: PetscCall(PetscArrayzero(ptr, size));
2237: {
2238: PetscInt *iptr = tmp;
2239: k = 0;
2240: for (i = 0; i < nrqs; i++) {
2241: j = pa[i];
2242: iptr += k;
2243: sbuf1[j] = iptr;
2244: k = w1[j];
2245: }
2246: }
2248: /* Form the outgoing messages. Initialize the header space */
2249: for (i = 0; i < nrqs; i++) {
2250: j = pa[i];
2251: sbuf1[j][0] = 0;
2252: PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]));
2253: ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2254: }
2256: /* Parse the isrow and copy data into outbuf */
2257: for (i = 0; i < ismax; i++) {
2258: row2proc_i = row2proc[i];
2259: PetscCall(PetscArrayzero(ctr, size));
2260: irow_i = irow[i];
2261: jmax = nrow[i];
2262: for (j = 0; j < jmax; j++) { /* parse the indices of each IS */
2263: proc = row2proc_i[j];
2264:         if (proc != rank) { /* copy to the outgoing buf */
2265: ctr[proc]++;
2266: *ptr[proc] = irow_i[j];
2267: ptr[proc]++;
2268: }
2269: }
2270: /* Update the headers for the current IS */
2271: for (j = 0; j < size; j++) { /* Can Optimise this loop too */
2272: if ((ctr_j = ctr[j])) {
2273: sbuf1_j = sbuf1[j];
2274: k = ++sbuf1_j[0];
2275: sbuf1_j[2 * k] = ctr_j;
2276: sbuf1_j[2 * k - 1] = i;
2277: }
2278: }
2279: }
2281: /* Now post the sends */
2282: PetscCall(PetscMalloc1(nrqs, &s_waits1));
2283: for (i = 0; i < nrqs; ++i) {
2284: j = pa[i];
2285: PetscCallMPI(MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i));
2286: }
2288: /* Post Receives to capture the buffer size */
2289: PetscCall(PetscMalloc1(nrqs, &r_waits2));
2290: PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
2291: if (nrqs) rbuf2[0] = tmp + msz;
2292: for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
2293: for (i = 0; i < nrqs; ++i) {
2294: j = pa[i];
2295: PetscCallMPI(MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i));
2296: }
2298: /* Send to other procs the buf size they should allocate */
2299:     /* Receive messages */
2300: PetscCall(PetscMalloc1(nrqr, &s_waits2));
2301: PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
2302: {
2303: PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2304: PetscInt *sbuf2_i;
2306: PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE));
2307: for (i = 0; i < nrqr; ++i) {
2308: req_size[i] = 0;
2309: rbuf1_i = rbuf1[i];
2310: start = 2 * rbuf1_i[0] + 1;
2311: end = olengths1[i];
2312: PetscCall(PetscMalloc1(end, &sbuf2[i]));
2313: sbuf2_i = sbuf2[i];
2314: for (j = start; j < end; j++) {
2315: id = rbuf1_i[j] - rstart;
2316: ncols = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2317: sbuf2_i[j] = ncols;
2318: req_size[i] += ncols;
2319: }
2320: req_source1[i] = onodes1[i];
2321: /* form the header */
2322: sbuf2_i[0] = req_size[i];
2323: for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
2325: PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
2326: }
2327: }
2329: PetscCall(PetscFree(onodes1));
2330: PetscCall(PetscFree(olengths1));
2331: PetscCall(PetscFree(r_waits1));
2332: PetscCall(PetscFree4(w1, w2, w3, w4));
2334:     /* Receive messages */
2335: PetscCall(PetscMalloc1(nrqs, &r_waits3));
2336: PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE));
2337: for (i = 0; i < nrqs; ++i) {
2338: PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
2339: req_source2[i] = pa[i];
2340: PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
2341: }
2342: PetscCall(PetscFree(r_waits2));
2344: /* Wait on sends1 and sends2 */
2345: PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
2346: PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
2347: PetscCall(PetscFree(s_waits1));
2348: PetscCall(PetscFree(s_waits2));
2350: /* Now allocate sending buffers for a->j, and send them off */
2351: PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
2352: for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2353: if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
2354: for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
2356: PetscCall(PetscMalloc1(nrqr, &s_waits3));
2357: {
2358: PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2359: PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2360: PetscInt cend = C->cmap->rend;
2361: PetscInt *a_j = a->j, *b_j = b->j, ctmp;
2363: for (i = 0; i < nrqr; i++) {
2364: rbuf1_i = rbuf1[i];
2365: sbuf_aj_i = sbuf_aj[i];
2366: ct1 = 2 * rbuf1_i[0] + 1;
2367: ct2 = 0;
2368: for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2369: kmax = rbuf1[i][2 * j];
2370: for (k = 0; k < kmax; k++, ct1++) {
2371: row = rbuf1_i[ct1] - rstart;
2372: nzA = a_i[row + 1] - a_i[row];
2373: nzB = b_i[row + 1] - b_i[row];
2374: ncols = nzA + nzB;
2375: cworkA = PetscSafePointerPlusOffset(a_j, a_i[row]);
2376: cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]);
2378: /* load the column indices for this row into cols */
2379: cols = sbuf_aj_i + ct2;
2381: lwrite = 0;
2382: for (l = 0; l < nzB; l++) {
2383: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2384: }
2385: for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2386: for (l = 0; l < nzB; l++) {
2387: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2388: }
2390: ct2 += ncols;
2391: }
2392: }
2393: PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
2394: }
2395: }
2397: /* create col map: global col of C -> local col of submatrices */
2398: {
2399: const PetscInt *icol_i;
2400: #if defined(PETSC_USE_CTABLE)
2401: for (i = 0; i < ismax; i++) {
2402: if (!allcolumns[i]) {
2403: PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i));
2405: jmax = ncol[i];
2406: icol_i = icol[i];
2407: cmap_i = cmap[i];
2408: for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1));
2409: } else cmap[i] = NULL;
2410: }
2411: #else
2412: for (i = 0; i < ismax; i++) {
2413: if (!allcolumns[i]) {
2414: PetscCall(PetscCalloc1(C->cmap->N, &cmap[i]));
2415: jmax = ncol[i];
2416: icol_i = icol[i];
2417: cmap_i = cmap[i];
2418: for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2419: } else cmap[i] = NULL;
2420: }
2421: #endif
2422: }
2424: /* Create lens which is required for MatCreate... */
2425: for (i = 0, j = 0; i < ismax; i++) j += nrow[i];
2426: PetscCall(PetscMalloc1(ismax, &lens));
2428: if (ismax) PetscCall(PetscCalloc1(j, &lens[0]));
2429: for (i = 1; i < ismax; i++) lens[i] = PetscSafePointerPlusOffset(lens[i - 1], nrow[i - 1]);
2431: /* Update lens from local data */
2432: for (i = 0; i < ismax; i++) {
2433: row2proc_i = row2proc[i];
2434: jmax = nrow[i];
2435: if (!allcolumns[i]) cmap_i = cmap[i];
2436: irow_i = irow[i];
2437: lens_i = lens[i];
2438: for (j = 0; j < jmax; j++) {
2439: row = irow_i[j];
2440: proc = row2proc_i[j];
2441: if (proc == rank) {
2442: PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2443: if (!allcolumns[i]) {
2444: for (k = 0; k < ncols; k++) {
2445: #if defined(PETSC_USE_CTABLE)
2446: PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2447: #else
2448: tcol = cmap_i[cols[k]];
2449: #endif
2450: if (tcol) lens_i[j]++;
2451: }
2452: } else { /* allcolumns */
2453: lens_i[j] = ncols;
2454: }
2455: PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2456: }
2457: }
2458: }
2460: /* Create row map: global row of C -> local row of submatrices */
2461: #if defined(PETSC_USE_CTABLE)
2462: for (i = 0; i < ismax; i++) {
2463: PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i));
2464: irow_i = irow[i];
2465: jmax = nrow[i];
2466: for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1));
2467: }
2468: #else
2469: for (i = 0; i < ismax; i++) {
2470: PetscCall(PetscCalloc1(C->rmap->N, &rmap[i]));
2471: rmap_i = rmap[i];
2472: irow_i = irow[i];
2473: jmax = nrow[i];
2474: for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2475: }
2476: #endif
2478: /* Update lens from offproc data */
2479: {
2480: PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;
2482: PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE));
2483: for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2484: sbuf1_i = sbuf1[pa[tmp2]];
2485: jmax = sbuf1_i[0];
2486: ct1 = 2 * jmax + 1;
2487: ct2 = 0;
2488: rbuf2_i = rbuf2[tmp2];
2489: rbuf3_i = rbuf3[tmp2];
2490: for (j = 1; j <= jmax; j++) {
2491: is_no = sbuf1_i[2 * j - 1];
2492: max1 = sbuf1_i[2 * j];
2493: lens_i = lens[is_no];
2494: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2495: rmap_i = rmap[is_no];
2496: for (k = 0; k < max1; k++, ct1++) {
2497: #if defined(PETSC_USE_CTABLE)
2498: PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row));
2499: row--;
2500: PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
2501: #else
2502:           row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new submatrix */
2503: #endif
2504: max2 = rbuf2_i[ct1];
2505: for (l = 0; l < max2; l++, ct2++) {
2506: if (!allcolumns[is_no]) {
2507: #if defined(PETSC_USE_CTABLE)
2508: PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2509: #else
2510: tcol = cmap_i[rbuf3_i[ct2]];
2511: #endif
2512: if (tcol) lens_i[row]++;
2513: } else { /* allcolumns */
2514: lens_i[row]++; /* lens_i[row] += max2 ? */
2515: }
2516: }
2517: }
2518: }
2519: }
2520: }
2521: PetscCall(PetscFree(r_waits3));
2522: PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE));
2523: PetscCall(PetscFree(s_waits3));
2525: /* Create the submatrices */
2526: for (i = 0; i < ismax; i++) {
2527: PetscInt rbs, cbs;
2529: PetscCall(ISGetBlockSize(isrow[i], &rbs));
2530: PetscCall(ISGetBlockSize(iscol[i], &cbs));
2532: PetscCall(MatCreate(PETSC_COMM_SELF, submats + i));
2533: PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE));
2535: if (rbs > 1 || cbs > 1) PetscCall(MatSetBlockSizes(submats[i], rbs, cbs));
2536: PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name));
2537: PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i]));
2539:     /* create struct Mat_SubSppt and attach it to submat */
2540: PetscCall(PetscNew(&smat_i));
2541: subc = (Mat_SeqAIJ *)submats[i]->data;
2542: subc->submatis1 = smat_i;
2544: smat_i->destroy = submats[i]->ops->destroy;
2545: submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2546: submats[i]->factortype = C->factortype;
2548: smat_i->id = i;
2549: smat_i->nrqs = nrqs;
2550: smat_i->nrqr = nrqr;
2551: smat_i->rbuf1 = rbuf1;
2552: smat_i->rbuf2 = rbuf2;
2553: smat_i->rbuf3 = rbuf3;
2554: smat_i->sbuf2 = sbuf2;
2555: smat_i->req_source2 = req_source2;
2557: smat_i->sbuf1 = sbuf1;
2558: smat_i->ptr = ptr;
2559: smat_i->tmp = tmp;
2560: smat_i->ctr = ctr;
2562: smat_i->pa = pa;
2563: smat_i->req_size = req_size;
2564: smat_i->req_source1 = req_source1;
2566: smat_i->allcolumns = allcolumns[i];
2567: smat_i->singleis = PETSC_FALSE;
2568: smat_i->row2proc = row2proc[i];
2569: smat_i->rmap = rmap[i];
2570: smat_i->cmap = cmap[i];
2571: }
2573:   if (!ismax) { /* Create a dummy submats[0] to carry the reuse struct (Mat_SubSppt) */
2574: PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0]));
2575: PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE));
2576: PetscCall(MatSetType(submats[0], MATDUMMY));
2578:     /* create struct Mat_SubSppt and attach it to submat */
2579: PetscCall(PetscNew(&smat_i));
2580: submats[0]->data = (void *)smat_i;
2582: smat_i->destroy = submats[0]->ops->destroy;
2583: submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2584: submats[0]->factortype = C->factortype;
2586: smat_i->id = 0;
2587: smat_i->nrqs = nrqs;
2588: smat_i->nrqr = nrqr;
2589: smat_i->rbuf1 = rbuf1;
2590: smat_i->rbuf2 = rbuf2;
2591: smat_i->rbuf3 = rbuf3;
2592: smat_i->sbuf2 = sbuf2;
2593: smat_i->req_source2 = req_source2;
2595: smat_i->sbuf1 = sbuf1;
2596: smat_i->ptr = ptr;
2597: smat_i->tmp = tmp;
2598: smat_i->ctr = ctr;
2600: smat_i->pa = pa;
2601: smat_i->req_size = req_size;
2602: smat_i->req_source1 = req_source1;
2604: smat_i->allcolumns = PETSC_FALSE;
2605: smat_i->singleis = PETSC_FALSE;
2606: smat_i->row2proc = NULL;
2607: smat_i->rmap = NULL;
2608: smat_i->cmap = NULL;
2609: }
2611: if (ismax) PetscCall(PetscFree(lens[0]));
2612: PetscCall(PetscFree(lens));
2613: if (sbuf_aj) {
2614: PetscCall(PetscFree(sbuf_aj[0]));
2615: PetscCall(PetscFree(sbuf_aj));
2616: }
2618:   } /* end of scall == MAT_INITIAL_MATRIX */
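  /* From here on the MAT_INITIAL_MATRIX and MAT_REUSE_MATRIX paths are shared: the numerical
     values a->a are always re-exchanged (tag4), using the message sizes cached in rbuf2[] and
     req_size[] */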
2620: /* Post recv matrix values */
2621: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
2622: PetscCall(PetscMalloc1(nrqs, &rbuf4));
2623: PetscCall(PetscMalloc1(nrqs, &r_waits4));
2624: for (i = 0; i < nrqs; ++i) {
2625: PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
2626: PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
2627: }
2629: /* Allocate sending buffers for a->a, and send them off */
2630: PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
2631: for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2632: if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
2633: for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
2635: PetscCall(PetscMalloc1(nrqr, &s_waits4));
2636: {
2637: PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2638: PetscInt cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2639: PetscInt cend = C->cmap->rend;
2640: PetscInt *b_j = b->j;
2641: PetscScalar *vworkA, *vworkB, *a_a, *b_a;
2643: PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
2644: PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a));
2645: for (i = 0; i < nrqr; i++) {
2646: rbuf1_i = rbuf1[i];
2647: sbuf_aa_i = sbuf_aa[i];
2648: ct1 = 2 * rbuf1_i[0] + 1;
2649: ct2 = 0;
2650: for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2651: kmax = rbuf1_i[2 * j];
2652: for (k = 0; k < kmax; k++, ct1++) {
2653: row = rbuf1_i[ct1] - rstart;
2654: nzA = a_i[row + 1] - a_i[row];
2655: nzB = b_i[row + 1] - b_i[row];
2656: ncols = nzA + nzB;
2657: cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]);
2658: vworkA = PetscSafePointerPlusOffset(a_a, a_i[row]);
2659: vworkB = PetscSafePointerPlusOffset(b_a, b_i[row]);
2662:           /* load the column values for this row into vals */
2662: vals = sbuf_aa_i + ct2;
2664: lwrite = 0;
2665: for (l = 0; l < nzB; l++) {
2666: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2667: }
2668: for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2669: for (l = 0; l < nzB; l++) {
2670: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2671: }
2673: ct2 += ncols;
2674: }
2675: }
2676: PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
2677: }
2678: PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
2679: PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a));
2680: }
2682: /* Assemble the matrices */
2683: /* First assemble the local rows */
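  /* The values are written directly into each submatrix's CSR arrays (subc->i/j/a) at offset
     imat_i[row], with imat_ilen[row] tracking how many entries of that row have been filled;
     this bypasses MatSetValues() entirely */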
2684: for (i = 0; i < ismax; i++) {
2685: row2proc_i = row2proc[i];
2686: subc = (Mat_SeqAIJ *)submats[i]->data;
2687: imat_ilen = subc->ilen;
2688: imat_j = subc->j;
2689: imat_i = subc->i;
2690: imat_a = subc->a;
2692: if (!allcolumns[i]) cmap_i = cmap[i];
2693: rmap_i = rmap[i];
2694: irow_i = irow[i];
2695: jmax = nrow[i];
2696: for (j = 0; j < jmax; j++) {
2697: row = irow_i[j];
2698: proc = row2proc_i[j];
2699: if (proc == rank) {
2700: old_row = row;
2701: #if defined(PETSC_USE_CTABLE)
2702: PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2703: row--;
2704: #else
2705: row = rmap_i[row];
2706: #endif
2707: ilen_row = imat_ilen[row];
2708: PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2709: mat_i = imat_i[row];
2710: mat_a = imat_a + mat_i;
2711: mat_j = imat_j + mat_i;
2712: if (!allcolumns[i]) {
2713: for (k = 0; k < ncols; k++) {
2714: #if defined(PETSC_USE_CTABLE)
2715: PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2716: #else
2717: tcol = cmap_i[cols[k]];
2718: #endif
2719: if (tcol) {
2720: *mat_j++ = tcol - 1;
2721: *mat_a++ = vals[k];
2722: ilen_row++;
2723: }
2724: }
2725: } else { /* allcolumns */
2726: for (k = 0; k < ncols; k++) {
2727: *mat_j++ = cols[k]; /* global col index! */
2728: *mat_a++ = vals[k];
2729: ilen_row++;
2730: }
2731: }
2732: PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2734: imat_ilen[row] = ilen_row;
2735: }
2736: }
2737: }
2739:   /* Now assemble the off-proc rows */
2740: PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE));
2741: for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2742: sbuf1_i = sbuf1[pa[tmp2]];
2743: jmax = sbuf1_i[0];
2744: ct1 = 2 * jmax + 1;
2745: ct2 = 0;
2746: rbuf2_i = rbuf2[tmp2];
2747: rbuf3_i = rbuf3[tmp2];
2748: rbuf4_i = rbuf4[tmp2];
2749: for (j = 1; j <= jmax; j++) {
2750: is_no = sbuf1_i[2 * j - 1];
2751: rmap_i = rmap[is_no];
2752: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2753: subc = (Mat_SeqAIJ *)submats[is_no]->data;
2754: imat_ilen = subc->ilen;
2755: imat_j = subc->j;
2756: imat_i = subc->i;
2757: imat_a = subc->a;
2758: max1 = sbuf1_i[2 * j];
2759: for (k = 0; k < max1; k++, ct1++) {
2760: row = sbuf1_i[ct1];
2761: #if defined(PETSC_USE_CTABLE)
2762: PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2763: row--;
2764: #else
2765: row = rmap_i[row];
2766: #endif
2767: ilen = imat_ilen[row];
2768: mat_i = imat_i[row];
2769: mat_a = PetscSafePointerPlusOffset(imat_a, mat_i);
2770: mat_j = PetscSafePointerPlusOffset(imat_j, mat_i);
2771: max2 = rbuf2_i[ct1];
2772: if (!allcolumns[is_no]) {
2773: for (l = 0; l < max2; l++, ct2++) {
2774: #if defined(PETSC_USE_CTABLE)
2775: PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2776: #else
2777: tcol = cmap_i[rbuf3_i[ct2]];
2778: #endif
2779: if (tcol) {
2780: *mat_j++ = tcol - 1;
2781: *mat_a++ = rbuf4_i[ct2];
2782: ilen++;
2783: }
2784: }
2785: } else { /* allcolumns */
2786: for (l = 0; l < max2; l++, ct2++) {
2787: *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2788: *mat_a++ = rbuf4_i[ct2];
2789: ilen++;
2790: }
2791: }
2792: imat_ilen[row] = ilen;
2793: }
2794: }
2795: }
2797: if (!iscsorted) { /* sort column indices of the rows */
2798: for (i = 0; i < ismax; i++) {
2799: subc = (Mat_SeqAIJ *)submats[i]->data;
2800: imat_j = subc->j;
2801: imat_i = subc->i;
2802: imat_a = subc->a;
2803: imat_ilen = subc->ilen;
2805: if (allcolumns[i]) continue;
2806: jmax = nrow[i];
2807: for (j = 0; j < jmax; j++) {
2808: mat_i = imat_i[j];
2809: mat_a = imat_a + mat_i;
2810: mat_j = imat_j + mat_i;
2811: PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a));
2812: }
2813: }
2814: }
2816: PetscCall(PetscFree(r_waits4));
2817: PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE));
2818: PetscCall(PetscFree(s_waits4));
2820: /* Restore the indices */
2821: for (i = 0; i < ismax; i++) {
2822: PetscCall(ISRestoreIndices(isrow[i], irow + i));
2823: if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i));
2824: }
2826: for (i = 0; i < ismax; i++) {
2827: PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY));
2828: PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY));
2829: }
2831: /* Destroy allocated memory */
2832: if (sbuf_aa) {
2833: PetscCall(PetscFree(sbuf_aa[0]));
2834: PetscCall(PetscFree(sbuf_aa));
2835: }
2836: PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted));
2838: for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
2839: PetscCall(PetscFree(rbuf4));
2841: PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns));
2842: PetscFunctionReturn(PETSC_SUCCESS);
2843: }
2845: /*
2846:   Permute A & B into C's *local* index space using rowemb, dcolemb for A and rowemb, ocolemb for B.
2847:   The embeddings are assumed to be injections, which implies that the range of rowemb is a subset
2848:   of [0,m), the range of dcolemb lies in [0,n), and the range of ocolemb lies in [0,N-n).
2849: If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2850: After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2851: state, and needs to be "assembled" later by compressing B's column space.
2853: This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2854: Following this call, C->A & C->B have been created, even if empty.
2855: */
2856: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2857: {
2858: /* If making this function public, change the error returned in this function away from _PLIB. */
2859: Mat_MPIAIJ *aij;
2860: Mat_SeqAIJ *Baij;
2861: PetscBool seqaij, Bdisassembled;
2862: PetscInt m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count;
2863: PetscScalar v;
2864: const PetscInt *rowindices, *colindices;
2866: PetscFunctionBegin;
2867: /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2868: if (A) {
2869: PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij));
2870: PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
2871: if (rowemb) {
2872: PetscCall(ISGetLocalSize(rowemb, &m));
2873: PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2874: } else {
2875: PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2876: }
2877: if (dcolemb) {
2878: PetscCall(ISGetLocalSize(dcolemb, &n));
2879: PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n);
2880: } else {
2881: PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2882: }
2883: }
2884: if (B) {
2885: PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij));
2886: PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
2887: if (rowemb) {
2888: PetscCall(ISGetLocalSize(rowemb, &m));
2889:       PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, B->rmap->n);
2890: } else {
2891: PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2892: }
2893: if (ocolemb) {
2894: PetscCall(ISGetLocalSize(ocolemb, &n));
2895: PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n);
2896: } else {
2897: PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2898: }
2899: }
2901: aij = (Mat_MPIAIJ *)C->data;
2902: if (!aij->A) {
2903: /* Mimic parts of MatMPIAIJSetPreallocation() */
2904: PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A));
2905: PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n));
2906: PetscCall(MatSetBlockSizesFromMats(aij->A, C, C));
2907: PetscCall(MatSetType(aij->A, MATSEQAIJ));
2908: }
2909: if (A) {
2910: PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A));
2911: } else {
2912: PetscCall(MatSetUp(aij->A));
2913: }
2914: if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2915: /*
2916: If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2917: need to "disassemble" B -- convert it to using C's global indices.
2918: To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2920: If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2921:        we call MatZeroEntries() on aij->B first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2923: TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2924: At least avoid calling MatSetValues() and the implied searches?
2925: */
2927: if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2928: #if defined(PETSC_USE_CTABLE)
2929: PetscCall(PetscHMapIDestroy(&aij->colmap));
2930: #else
2931: PetscCall(PetscFree(aij->colmap));
2932:     /* A bit of a HACK: ideally we should handle the aij->B case all in one code block below. */
2933: #endif
2934: ngcol = 0;
2935: if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol));
2936: if (aij->garray) PetscCall(PetscFree(aij->garray));
2937: PetscCall(VecDestroy(&aij->lvec));
2938: PetscCall(VecScatterDestroy(&aij->Mvctx));
2939: }
2940: if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B));
2941: if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B));
2942: }
2943: Bdisassembled = PETSC_FALSE;
2944: if (!aij->B) {
2945: PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B));
2946: PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N));
2947: PetscCall(MatSetBlockSizesFromMats(aij->B, B, B));
2948: PetscCall(MatSetType(aij->B, MATSEQAIJ));
2949: Bdisassembled = PETSC_TRUE;
2950: }
2951: if (B) {
2952: Baij = (Mat_SeqAIJ *)B->data;
2953: if (pattern == DIFFERENT_NONZERO_PATTERN) {
2954: PetscCall(PetscMalloc1(B->rmap->n, &nz));
2955: for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2956: PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz));
2957: PetscCall(PetscFree(nz));
2958: }
2960: PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend));
2961: shift = rend - rstart;
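    /* While B is freshly (re)created (Bdisassembled), any of its column indices at or beyond
       rstart is shifted by shift = rend - rstart, mapping B's compacted local column numbering
       [0, N-n) into C's global column space, i.e. the "disassembled" state described in the
       header comment */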
2962: count = 0;
2963: rowindices = NULL;
2964: colindices = NULL;
2965: if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices));
2966: if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices));
2967: for (i = 0; i < B->rmap->n; i++) {
2968: PetscInt row;
2969: row = i;
2970: if (rowindices) row = rowindices[i];
2971: for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2972: col = Baij->j[count];
2973: if (colindices) col = colindices[col];
2974: if (Bdisassembled && col >= rstart) col += shift;
2975: v = Baij->a[count];
2976: PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES));
2977: ++count;
2978: }
2979: }
2980: /* No assembly for aij->B is necessary. */
2981: /* FIXME: set aij->B's nonzerostate correctly. */
2982: } else {
2983: PetscCall(MatSetUp(aij->B));
2984: }
2985: C->preallocated = PETSC_TRUE;
2986: C->was_assembled = PETSC_FALSE;
2987: C->assembled = PETSC_FALSE;
2988: /*
2989: C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2990: Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
2991: */
2992: PetscFunctionReturn(PETSC_SUCCESS);
2993: }
2995: /*
2996: B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
2997: */
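/*
   A minimal illustrative sketch (not itself part of this code path): translating one of B's local
   column indices back to a global column of C via garray, for some stored-entry index k; gcol is
   then the global column index of B's k-th stored entry:

     Mat_MPIAIJ *aij  = (Mat_MPIAIJ *)C->data;
     Mat_SeqAIJ *Bseq = (Mat_SeqAIJ *)aij->B->data;
     PetscInt    gcol = aij->garray[Bseq->j[k]];
*/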
2998: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
2999: {
3000: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
3002: PetscFunctionBegin;
3003: PetscAssertPointer(A, 2);
3004: PetscAssertPointer(B, 3);
3005: /* FIXME: make sure C is assembled */
3006: *A = aij->A;
3007: *B = aij->B;
3008: /* Note that we don't incref *A and *B, so be careful! */
3009: PetscFunctionReturn(PETSC_SUCCESS);
3010: }
3012: /*
3013: Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3014: NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3015: */
3016: static PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
3017: {
3018: PetscMPIInt size, flag;
3019: PetscInt i, ii, cismax, ispar;
3020: Mat *A, *B;
3021: IS *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;
3023: PetscFunctionBegin;
3024: if (!ismax) PetscFunctionReturn(PETSC_SUCCESS);
3026: for (i = 0, cismax = 0; i < ismax; ++i) {
3027: PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
3028: PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3029: PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3030: if (size > 1) ++cismax;
3031: }
3033: /*
3034: If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3035: ispar counts the number of parallel ISs across C's comm.
3036: */
3037: PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
3038: if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3039: PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
3040: PetscFunctionReturn(PETSC_SUCCESS);
3041: }
3043: /* if (ispar) */
3044: /*
3045: Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3046: These are used to extract the off-diag portion of the resulting parallel matrix.
3047: The row IS for the off-diag portion is the same as for the diag portion,
3048: so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3049: */
3050: PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
3051: PetscCall(PetscMalloc1(cismax, &ciscol_p));
3052: for (i = 0, ii = 0; i < ismax; ++i) {
3053: PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3054: if (size > 1) {
3055: /*
3056: TODO: This is the part that's ***NOT SCALABLE***.
3057: To fix this we need to extract just the indices of C's nonzero columns
3058: that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3059: part of iscol[i] -- without actually computing ciscol[ii]. This also has
3060: to be done without serializing on the IS list, so, most likely, it is best
3061: done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3062: */
3063: PetscCall(ISGetNonlocalIS(iscol[i], &ciscol[ii]));
3064: /* Now we have to
3065: (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3066: were sorted on each rank, concatenated they might no longer be sorted;
3067: (b) use ISSortPermutation() to construct ciscol_p, the mapping from the
3068: indices in nondecreasing order back to their original positions.
3069: If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3070: */
3071: PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
3072: PetscCall(ISSort(ciscol[ii]));
3073: ++ii;
3074: }
3075: }
3076: PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
3077: for (i = 0, ii = 0; i < ismax; ++i) {
3078: PetscInt j, issize;
3079: const PetscInt *indices;
3081: /*
3082: Permute the indices into nondecreasing order. Reject row and col ISs that contain duplicate indices.
3083: */
3084: PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
3085: PetscCall(ISSort(isrow[i]));
3086: PetscCall(ISGetLocalSize(isrow[i], &issize));
3087: PetscCall(ISGetIndices(isrow[i], &indices));
3088: for (j = 1; j < issize; ++j) {
3089: PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3090: }
3091: PetscCall(ISRestoreIndices(isrow[i], &indices));
3092: PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
3093: PetscCall(ISSort(iscol[i]));
3094: PetscCall(ISGetLocalSize(iscol[i], &issize));
3095: PetscCall(ISGetIndices(iscol[i], &indices));
3096: for (j = 1; j < issize; ++j) {
3097: PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3098: }
3099: PetscCall(ISRestoreIndices(iscol[i], &indices));
3100: PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3101: if (size > 1) {
3102: cisrow[ii] = isrow[i];
3103: ++ii;
3104: }
3105: }
3106: /*
3107: Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3108: array of sequential matrices underlying the resulting parallel matrices.
3109: Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3110: contain duplicates.
3112: There are as many diag matrices as there are original index sets. There are only as many parallel
3113: and off-diag matrices as there are parallel (comm size > 1) index sets.
3115: ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3116: - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3117: and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3118: will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3119: with A[i] and B[ii] extracted from the corresponding MPI submat.
3120: - However, if the row indices, A's column indices, or B's column indices are not sorted, the extracted A[i] & B[ii]
3121: will have a different order from what getsubmats_seq expects. To handle this case -- indicated
3122: by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3123: (or retrieve the previously composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3124: values into A[i] and B[ii] sitting inside the corresponding submat.
3125: - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3126: A[i], B[ii], AA[i] or BB[ii] matrices.
3127: */
3128: /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3129: if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));
3131: /* Now obtain the sequential A and B submatrices separately. */
3132: /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3133: PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
3134: PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));
3136: /*
3137: If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3138: matrices A & B have been extracted directly into the parallel matrices containing them, or
3139: simply into the sequential matrix identical with the corresponding A (if size == 1).
3140: Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3141: to have the same sparsity pattern.
3142: Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3143: must be constructed for C. This is done by setseqmat(s).
3144: */
3145: for (i = 0, ii = 0; i < ismax; ++i) {
3146: /*
3147: TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3148: That way we can avoid sorting and computing permutations when reusing.
3149: To this end:
3150: - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3151: - if caching arrays to hold the ISs, make and compose a container for them so that it can
3152: be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3153: */
3154: MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3156: PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3157: /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3158: if (size > 1) {
3159: if (scall == MAT_INITIAL_MATRIX) {
3160: PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
3161: PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
3162: PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
3163: PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
3164: PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
3165: }
3166: /*
3167: For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3168: */
3169: {
3170: Mat AA = A[i], BB = B[ii];
3172: if (AA || BB) {
3173: PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
3174: PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
3175: PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
3176: }
3177: PetscCall(MatDestroy(&AA));
3178: }
3179: PetscCall(ISDestroy(ciscol + ii));
3180: PetscCall(ISDestroy(ciscol_p + ii));
3181: ++ii;
3182: } else { /* if (size == 1) */
3183: if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
3184: if (isrow_p[i] || iscol_p[i]) {
3185: PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
3186: PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
3187: /* Otherwise A is extracted straight into (*submat)[i]. */
3188: /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3189: PetscCall(MatDestroy(A + i));
3190: } else (*submat)[i] = A[i];
3191: }
3192: PetscCall(ISDestroy(&isrow_p[i]));
3193: PetscCall(ISDestroy(&iscol_p[i]));
3194: }
3195: PetscCall(PetscFree2(cisrow, ciscol));
3196: PetscCall(PetscFree2(isrow_p, iscol_p));
3197: PetscCall(PetscFree(ciscol_p));
3198: PetscCall(PetscFree(A));
3199: PetscCall(MatDestroySubMatrices(cismax, &B));
3200: PetscFunctionReturn(PETSC_SUCCESS);
3201: }
3203: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
3204: {
3205: PetscFunctionBegin;
3206: PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
3207: PetscFunctionReturn(PETSC_SUCCESS);
3208: }
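/*
   Usage sketch (illustrative only): extracting a parallel submatrix through the public
   MatCreateSubMatricesMPI() interface, for which MatCreateSubMatricesMPI_MPIAIJ() above serves
   as the MATMPIAIJ implementation. Assumes C is an assembled MATMPIAIJ matrix; the function
   name ExtractEveryOtherRow_Example is hypothetical.
*/
static PetscErrorCode ExtractEveryOtherRow_Example(Mat C, Mat **submat)
{
  PetscInt rstart, rend, n;
  IS       isrow, iscol;

  PetscFunctionBegin;
  PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
  n = (rend - rstart) / 2;
  /* Every other locally owned row/column; the ISs live on C's communicator, so they are parallel */
  PetscCall(ISCreateStride(PetscObjectComm((PetscObject)C), n, rstart, 2, &isrow));
  PetscCall(ISCreateStride(PetscObjectComm((PetscObject)C), n, rstart, 2, &iscol));
  PetscCall(MatCreateSubMatricesMPI(C, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, submat));
  /* (*submat)[0] now lives on the IS communicator; free it later with MatDestroySubMatrices(1, submat) */
  PetscCall(ISDestroy(&isrow));
  PetscCall(ISDestroy(&iscol));
  PetscFunctionReturn(PETSC_SUCCESS);
}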