Actual source code: mpiov.c
1: /*
2: Routines to compute overlapping regions of a parallel MPI matrix
3: and to find submatrices that were shared across processors.
4: */
5: #include <../src/mat/impls/aij/seq/aij.h>
6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
7: #include <petscbt.h>
8: #include <petscsf.h>
10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *);
12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);
21: /*
22:    Takes a general IS and builds a block version of the IS that contains the given IS plus any values needed to fill out the blocks.
24:    The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the bs, which would offer higher performance, but
25:    that is a very major recoding job.
27:    This routine has possible scalability issues because it allocates space proportional to Nmax-Nmin.
28: */
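/*
   Illustrative example (added for clarity, not part of the original source): with bs = 2, an IS holding
   the indices {3, 4, 7} touches the blocks {1, 2, 3} (idx/bs with duplicates removed), so the routine
   replaces it by a block IS built from those block indices via ISCreateBlock().
*/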
29: static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
30: {
31: PetscFunctionBegin;
32: for (PetscInt i = 0; i < imax; i++) {
33: if (!is[i]) break;
34: PetscInt n = 0, N, Nmax, Nmin;
35: const PetscInt *idx;
36: PetscInt *nidx = NULL;
37: MPI_Comm comm;
38: PetscBT bt;
40: PetscCall(ISGetLocalSize(is[i], &N));
41: if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
42: PetscCall(ISGetIndices(is[i], &idx));
43: PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax));
44: Nmin = Nmin / bs;
45: Nmax = Nmax / bs;
46: PetscCall(PetscBTCreate(Nmax - Nmin, &bt));
47: for (PetscInt j = 0; j < N; j++) {
48: if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
49: }
50: PetscCall(PetscMalloc1(n, &nidx));
51: n = 0;
52: PetscCall(PetscBTMemzero(Nmax - Nmin, bt));
53: for (PetscInt j = 0; j < N; j++) {
54: if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
55: }
56: PetscCall(PetscBTDestroy(&bt));
57: PetscCall(ISRestoreIndices(is[i], &idx));
58: }
59: PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm));
60: PetscCall(ISDestroy(is + i));
61: PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i));
62: }
63: PetscFunctionReturn(PETSC_SUCCESS);
64: }
66: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
67: {
68: PetscInt i;
70: PetscFunctionBegin;
71: PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
72: for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
73: if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
74: PetscFunctionReturn(PETSC_SUCCESS);
75: }
77: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
78: {
79: PetscInt i;
81: PetscFunctionBegin;
82: PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
83: for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
84: if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
85: PetscFunctionReturn(PETSC_SUCCESS);
86: }
88: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
89: {
90: MPI_Comm comm;
91: PetscInt *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
92: PetscInt *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
93: PetscInt nrecvrows, *sbsizes = NULL, *sbdata = NULL;
94: const PetscInt *indices_i, **indices;
95: PetscLayout rmap;
96: PetscMPIInt rank, size, *toranks, *fromranks, nto, nfrom, owner;
97: PetscSF sf;
98: PetscSFNode *remote;
100: PetscFunctionBegin;
101: PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
102: PetscCallMPI(MPI_Comm_rank(comm, &rank));
103: PetscCallMPI(MPI_Comm_size(comm, &size));
104: /* get row map to determine where rows should be going */
105: PetscCall(MatGetLayouts(mat, &rmap, NULL));
106:   /* retrieve IS data and put it all together so that we
107: * can optimize communication
108: * */
109: PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
110: for (i = 0, tlength = 0; i < nidx; i++) {
111: PetscCall(ISGetLocalSize(is[i], &length[i]));
112: tlength += length[i];
113: PetscCall(ISGetIndices(is[i], &indices[i]));
114: }
115: /* find these rows on remote processors */
116: PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
117: PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
118: nrrows = 0;
119: for (i = 0; i < nidx; i++) {
120: length_i = length[i];
121: indices_i = indices[i];
122: for (j = 0; j < length_i; j++) {
123: owner = -1;
124: PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
125: /* remote processors */
126: if (owner != rank) {
127: tosizes_temp[owner]++; /* number of rows to owner */
128: rrow_ranks[nrrows] = owner; /* processor */
129: rrow_isids[nrrows] = i; /* is id */
130: remoterows[nrrows++] = indices_i[j]; /* row */
131: }
132: }
133: PetscCall(ISRestoreIndices(is[i], &indices[i]));
134: }
135: PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
136: /* test if we need to exchange messages
137: * generally speaking, we do not need to exchange
138: * data when overlap is 1
139: * */
140: PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
141: /* we do not have any messages
142: * It usually corresponds to overlap 1
143: * */
144: if (!reducednrrows) {
145: PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
146: PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
147: PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
148: PetscFunctionReturn(PETSC_SUCCESS);
149: }
150: nto = 0;
151: /* send sizes and ranks for building a two-sided communication */
152: for (i = 0; i < size; i++) {
153: if (tosizes_temp[i]) {
154: tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
155: tosizes_temp[i] = nto; /* a map from processor to index */
156: toranks[nto++] = i; /* processor */
157: }
158: }
159: PetscCall(PetscMalloc1(nto + 1, &toffsets));
160: toffsets[0] = 0;
161: for (i = 0; i < nto; i++) {
162: toffsets[i + 1] = toffsets[i] + tosizes[2 * i]; /* offsets */
163: tosizes[2 * i + 1] = toffsets[i]; /* offsets to send */
164: }
165: /* send information to other processors */
166: PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
167: nrecvrows = 0;
168: for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
169: PetscCall(PetscMalloc1(nrecvrows, &remote));
170: nrecvrows = 0;
171: for (i = 0; i < nfrom; i++) {
172: for (j = 0; j < fromsizes[2 * i]; j++) {
173: remote[nrecvrows].rank = fromranks[i];
174: remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
175: }
176: }
177: PetscCall(PetscSFCreate(comm, &sf));
178: PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
179:   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
180: PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
181: PetscCall(PetscSFSetFromOptions(sf));
182: /* message pair <no of is, row> */
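  /* Illustrative layout (added for clarity, not part of the original source): todata holds consecutive
     (is index, global row) pairs grouped by destination rank, e.g. {0, 17, 2, 40} asks the owner to
     expand row 17 for is[0] and row 40 for is[2]. */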
183: PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
184: for (i = 0; i < nrrows; i++) {
185: owner = rrow_ranks[i]; /* processor */
186: j = tosizes_temp[owner]; /* index */
187: todata[toffsets[j]++] = rrow_isids[i];
188: todata[toffsets[j]++] = remoterows[i];
189: }
190: PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
191: PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
192: PetscCall(PetscFree(toffsets));
193: PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
194: PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
195: PetscCall(PetscSFDestroy(&sf));
196:   /* send rows belonging to remote processors so that we can get the overlapping data back */
197: PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
198: PetscCall(PetscFree2(todata, fromdata));
199: PetscCall(PetscFree(fromsizes));
200: PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
201: PetscCall(PetscFree(fromranks));
202: nrecvrows = 0;
203: for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
204: PetscCall(PetscCalloc1(nrecvrows, &todata));
205: PetscCall(PetscMalloc1(nrecvrows, &remote));
206: nrecvrows = 0;
207: for (i = 0; i < nto; i++) {
208: for (j = 0; j < tosizes[2 * i]; j++) {
209: remote[nrecvrows].rank = toranks[i];
210: remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
211: }
212: }
213: PetscCall(PetscSFCreate(comm, &sf));
214: PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
215:   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
216: PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
217: PetscCall(PetscSFSetFromOptions(sf));
218: /* overlap communication and computation */
219: PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
220: PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
221: PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
222: PetscCall(PetscSFDestroy(&sf));
223: PetscCall(PetscFree2(sbdata, sbsizes));
224: PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
225: PetscCall(PetscFree(toranks));
226: PetscCall(PetscFree(tosizes));
227: PetscCall(PetscFree(todata));
228: PetscFunctionReturn(PETSC_SUCCESS);
229: }
231: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
232: {
233: PetscInt *isz, isz_i, i, j, is_id, data_size;
234: PetscInt col, lsize, max_lsize, *indices_temp, *indices_i;
235: const PetscInt *indices_i_temp;
236: MPI_Comm *iscomms;
238: PetscFunctionBegin;
239: max_lsize = 0;
240: PetscCall(PetscMalloc1(nidx, &isz));
241: for (i = 0; i < nidx; i++) {
242: PetscCall(ISGetLocalSize(is[i], &lsize));
243: max_lsize = lsize > max_lsize ? lsize : max_lsize;
244: isz[i] = lsize;
245: }
246: PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
247: for (i = 0; i < nidx; i++) {
248: PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
249: PetscCall(ISGetIndices(is[i], &indices_i_temp));
250: PetscCall(PetscArraycpy(indices_temp + i * (max_lsize + nrecvs), indices_i_temp, isz[i]));
251: PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
252: PetscCall(ISDestroy(&is[i]));
253: }
254: /* retrieve information to get row id and its overlap */
255: for (i = 0; i < nrecvs;) {
256: is_id = recvdata[i++];
257: data_size = recvdata[i++];
258: indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
259: isz_i = isz[is_id];
260: for (j = 0; j < data_size; j++) {
261: col = recvdata[i++];
262: indices_i[isz_i++] = col;
263: }
264: isz[is_id] = isz_i;
265: }
266:   /* remove duplicate entries */
267: for (i = 0; i < nidx; i++) {
268: indices_i = indices_temp + (max_lsize + nrecvs) * i;
269: isz_i = isz[i];
270: PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
271: PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
272: PetscCall(PetscCommDestroy(&iscomms[i]));
273: }
274: PetscCall(PetscFree(isz));
275: PetscCall(PetscFree2(indices_temp, iscomms));
276: PetscFunctionReturn(PETSC_SUCCESS);
277: }
279: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
280: {
281: PetscLayout rmap, cmap;
282: PetscInt i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
283: PetscInt is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
284: PetscInt *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
285: const PetscInt *gcols, *ai, *aj, *bi, *bj;
286: Mat amat, bmat;
287: PetscMPIInt rank;
288: PetscBool done;
289: MPI_Comm comm;
291: PetscFunctionBegin;
292: PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
293: PetscCallMPI(MPI_Comm_rank(comm, &rank));
294: PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
295:   /* Even if the mat is symmetric, we still treat it as if it were not symmetric */
296: PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
297: PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
298: PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
299: PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
300: /* total number of nonzero values is used to estimate the memory usage in the next step */
301: tnz = ai[an] + bi[bn];
302: PetscCall(MatGetLayouts(mat, &rmap, &cmap));
303: PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
304: PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
305: /* to find the longest message */
306: max_fszs = 0;
307: for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
308:   /* is there a better way to estimate the number of nonzeros in the mat? */
309: PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
310: for (i = 0; i < nidx; i++) rows_data[i] = rows_data_ptr + max_fszs * i;
311: rows_pos = 0;
312: totalrows = 0;
313: for (i = 0; i < nfrom; i++) {
314: PetscCall(PetscArrayzero(rows_pos_i, nidx));
315: /* group data together */
316: for (j = 0; j < fromsizes[2 * i]; j += 2) {
317: is_id = fromrows[rows_pos++]; /* no of is */
318: rows_i = rows_data[is_id];
319: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
320: }
321:     /* estimate the space needed to avoid multiple allocations */
322: for (j = 0; j < nidx; j++) {
323: indvc_ij = 0;
324: rows_i = rows_data[j];
325: for (l = 0; l < rows_pos_i[j]; l++) {
326: row = rows_i[l] - rstart;
327: start = ai[row];
328: end = ai[row + 1];
329: for (k = start; k < end; k++) { /* Amat */
330: col = aj[k] + cstart;
331: indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
332: }
333: start = bi[row];
334: end = bi[row + 1];
335: for (k = start; k < end; k++) { /* Bmat */
336: col = gcols[bj[k]];
337: indices_tmp[indvc_ij++] = col;
338: }
339: }
340: PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
341: indv_counts[i * nidx + j] = indvc_ij;
342: totalrows += indvc_ij;
343: }
344: }
345: /* message triple <no of is, number of rows, rows> */
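  /* Illustrative layout (added for clarity, not part of the original source): for each index set with a
     nonempty overlap the reply stores <is index, count, sorted column indices>, e.g. {0, 3, 5, 9, 11}
     for is[0] whose overlap consists of the columns 5, 9 and 11. */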
346: PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
347: totalrows = 0;
348: rows_pos = 0;
349:   /* regroup the rows as above, this time also filling in the data to be sent back */
350: for (i = 0; i < nfrom; i++) {
351: PetscCall(PetscArrayzero(rows_pos_i, nidx));
352: for (j = 0; j < fromsizes[2 * i]; j += 2) {
353: is_id = fromrows[rows_pos++];
354: rows_i = rows_data[is_id];
355: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
356: }
357: /* add data */
358: for (j = 0; j < nidx; j++) {
359: if (!indv_counts[i * nidx + j]) continue;
360: indvc_ij = 0;
361: sbdata[totalrows++] = j;
362: sbdata[totalrows++] = indv_counts[i * nidx + j];
363: sbsizes[2 * i] += 2;
364: rows_i = rows_data[j];
365: for (l = 0; l < rows_pos_i[j]; l++) {
366: row = rows_i[l] - rstart;
367: start = ai[row];
368: end = ai[row + 1];
369: for (k = start; k < end; k++) { /* Amat */
370: col = aj[k] + cstart;
371: indices_tmp[indvc_ij++] = col;
372: }
373: start = bi[row];
374: end = bi[row + 1];
375: for (k = start; k < end; k++) { /* Bmat */
376: col = gcols[bj[k]];
377: indices_tmp[indvc_ij++] = col;
378: }
379: }
380: PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
381: sbsizes[2 * i] += indvc_ij;
382: PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
383: totalrows += indvc_ij;
384: }
385: }
386: PetscCall(PetscMalloc1(nfrom + 1, &offsets));
387: offsets[0] = 0;
388: for (i = 0; i < nfrom; i++) {
389: offsets[i + 1] = offsets[i] + sbsizes[2 * i];
390: sbsizes[2 * i + 1] = offsets[i];
391: }
392: PetscCall(PetscFree(offsets));
393: if (sbrowsizes) *sbrowsizes = sbsizes;
394: if (sbrows) *sbrows = sbdata;
395: PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
396: PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
397: PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
398: PetscFunctionReturn(PETSC_SUCCESS);
399: }
401: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
402: {
403: const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
404: PetscInt tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
405: PetscInt lsize, lsize_tmp;
406: PetscMPIInt rank, owner;
407: Mat amat, bmat;
408: PetscBool done;
409: PetscLayout cmap, rmap;
410: MPI_Comm comm;
412: PetscFunctionBegin;
413: PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
414: PetscCallMPI(MPI_Comm_rank(comm, &rank));
415: PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
416: PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
417: PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
418: PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
419: PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
420:   /* is this a safe way to compute the number of nonzero values? */
421: tnz = ai[an] + bi[bn];
422: PetscCall(MatGetLayouts(mat, &rmap, &cmap));
423: PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
424: PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
425:   /* this is a better way to estimate memory than the old implementation,
426:    * which used the global size of the matrix
427: * */
428: PetscCall(PetscMalloc1(tnz, &indices_temp));
429: for (i = 0; i < nidx; i++) {
430: MPI_Comm iscomm;
432: PetscCall(ISGetLocalSize(is[i], &lsize));
433: PetscCall(ISGetIndices(is[i], &indices));
434: lsize_tmp = 0;
435: for (j = 0; j < lsize; j++) {
436: owner = -1;
437: row = indices[j];
438: PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
439: if (owner != rank) continue;
440: /* local number */
441: row -= rstart;
442: start = ai[row];
443: end = ai[row + 1];
444: for (k = start; k < end; k++) { /* Amat */
445: col = aj[k] + cstart;
446: indices_temp[lsize_tmp++] = col;
447: }
448: start = bi[row];
449: end = bi[row + 1];
450: for (k = start; k < end; k++) { /* Bmat */
451: col = gcols[bj[k]];
452: indices_temp[lsize_tmp++] = col;
453: }
454: }
455: PetscCall(ISRestoreIndices(is[i], &indices));
456: PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
457: PetscCall(ISDestroy(&is[i]));
458: PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
459: PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
460: PetscCall(PetscCommDestroy(&iscomm));
461: }
462: PetscCall(PetscFree(indices_temp));
463: PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
464: PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
465: PetscFunctionReturn(PETSC_SUCCESS);
466: }
468: /*
469: Sample message format:
470:   If processor A wants processor B to process some elements corresponding
471:   to index sets is[1] and is[5], the message looks like:
472: mesg [0] = 2 (no of index sets in the mesg)
473: -----------
474: mesg [1] = 1 => is[1]
475: mesg [2] = sizeof(is[1]);
476: -----------
477: mesg [3] = 5 => is[5]
478: mesg [4] = sizeof(is[5]);
479: -----------
480: mesg [5]
481:   mesg [n]  data(is[1])
482: -----------
483: mesg[n+1]
484: mesg[m] data(is[5])
485: -----------
487: Notes:
488: nrqs - no of requests sent (or to be sent out)
489: nrqr - no of requests received (which have to be or which have been processed)
490: */
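/*
   Concrete instance of the format above (added for clarity, not part of the original source): if A requests
   rows {10, 12} for is[1] and row {30} for is[5] from B, the outgoing message is

      mesg = {2,  1, 2,  5, 1,  10, 12,  30}

   i.e. a header of length 1 + 2*(no of index sets) followed by the concatenated row lists.
*/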
491: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
492: {
493: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
494: PetscMPIInt *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
495: const PetscInt **idx, *idx_i;
496: PetscInt *n, **data, len;
497: #if defined(PETSC_USE_CTABLE)
498: PetscHMapI *table_data, table_data_i;
499: PetscInt *tdata, tcount, tcount_max;
500: #else
501: PetscInt *data_i, *d_p;
502: #endif
503: PetscMPIInt size, rank, tag1, tag2, proc = 0;
504: PetscInt M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
505: PetscInt *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
506: PetscBT *table;
507: MPI_Comm comm;
508: MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
509: MPI_Status *recv_status;
510: MPI_Comm *iscomms;
511: char *t_p;
513: PetscFunctionBegin;
514: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
515: size = c->size;
516: rank = c->rank;
517: M = C->rmap->N;
519: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
520: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
522: PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));
524: for (i = 0; i < imax; i++) {
525: PetscCall(ISGetIndices(is[i], &idx[i]));
526: PetscCall(ISGetLocalSize(is[i], &n[i]));
527: }
529:   /* evaluate communication - mesg to who, length of mesg, and buffer space
530: required. Based on this, buffers are allocated, and data copied into them */
531: PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
532: for (i = 0; i < imax; i++) {
533: PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/
534: idx_i = idx[i];
535: len = n[i];
536: for (j = 0; j < len; j++) {
537: row = idx_i[j];
538: PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
539: PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
540: w4[proc]++;
541: }
542: for (j = 0; j < size; j++) {
543: if (w4[j]) {
544: w1[j] += w4[j];
545: w3[j]++;
546: }
547: }
548: }
550: nrqs = 0; /* no of outgoing messages */
551:   msz      = 0; /* total mesg length (for all procs) */
552:   w1[rank] = 0; /* no mesg sent to itself */
553: w3[rank] = 0;
554: for (i = 0; i < size; i++) {
555: if (w1[i]) {
556: w2[i] = 1;
557: nrqs++;
558: } /* there exists a message to proc i */
559: }
560: /* pa - is list of processors to communicate with */
561: PetscCall(PetscMalloc1(nrqs, &pa));
562: for (i = 0, j = 0; i < size; i++) {
563: if (w1[i]) {
564: pa[j] = i;
565: j++;
566: }
567: }
569: /* Each message would have a header = 1 + 2*(no of IS) + data */
570: for (i = 0; i < nrqs; i++) {
571: j = pa[i];
572: w1[j] += w2[j] + 2 * w3[j];
573: msz += w1[j];
574: }
576:   /* Determine the number of messages to expect, their lengths, and their from-ids */
577: PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
578: PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
580: /* Now post the Irecvs corresponding to these messages */
581: PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));
583: /* Allocate Memory for outgoing messages */
584: PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
585: PetscCall(PetscArrayzero(outdat, size));
586: PetscCall(PetscArrayzero(ptr, size));
588: {
589: PetscInt *iptr = tmp, ict = 0;
590: for (i = 0; i < nrqs; i++) {
591: j = pa[i];
592: iptr += ict;
593: outdat[j] = iptr;
594: ict = w1[j];
595: }
596: }
598: /* Form the outgoing messages */
599: /* plug in the headers */
600: for (i = 0; i < nrqs; i++) {
601: j = pa[i];
602: outdat[j][0] = 0;
603: PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
604: ptr[j] = outdat[j] + 2 * w3[j] + 1;
605: }
607: /* Memory for doing local proc's work */
608: {
609: PetscInt M_BPB_imax = 0;
610: #if defined(PETSC_USE_CTABLE)
611: PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
612: PetscCall(PetscMalloc1(imax, &table_data));
613: for (i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i));
614: PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
615: for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
616: #else
617: PetscInt Mimax = 0;
618: PetscCall(PetscIntMultError(M, imax, &Mimax));
619: PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
620: PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
621: for (i = 0; i < imax; i++) {
622: table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
623: data[i] = d_p + M * i;
624: }
625: #endif
626: }
628: /* Parse the IS and update local tables and the outgoing buf with the data */
629: {
630: PetscInt n_i, isz_i, *outdat_j, ctr_j;
631: PetscBT table_i;
633: for (i = 0; i < imax; i++) {
634: PetscCall(PetscArrayzero(ctr, size));
635: n_i = n[i];
636: table_i = table[i];
637: idx_i = idx[i];
638: #if defined(PETSC_USE_CTABLE)
639: table_data_i = table_data[i];
640: #else
641: data_i = data[i];
642: #endif
643: isz_i = isz[i];
644: for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
645: row = idx_i[j];
646: PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
647: if (proc != rank) { /* copy to the outgoing buffer */
648: ctr[proc]++;
649: *ptr[proc] = row;
650: ptr[proc]++;
651: } else if (!PetscBTLookupSet(table_i, row)) {
652: #if defined(PETSC_USE_CTABLE)
653: PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
654: #else
655: data_i[isz_i] = row; /* Update the local table */
656: #endif
657: isz_i++;
658: }
659: }
660: /* Update the headers for the current IS */
661: for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
662: if ((ctr_j = ctr[j])) {
663: outdat_j = outdat[j];
664: k = ++outdat_j[0];
665: outdat_j[2 * k] = ctr_j;
666: outdat_j[2 * k - 1] = i;
667: }
668: }
669: isz[i] = isz_i;
670: }
671: }
673: /* Now post the sends */
674: PetscCall(PetscMalloc1(nrqs, &s_waits1));
675: for (i = 0; i < nrqs; ++i) {
676: j = pa[i];
677: PetscCallMPI(MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
678: }
680: /* No longer need the original indices */
681: PetscCall(PetscMalloc1(imax, &iscomms));
682: for (i = 0; i < imax; ++i) {
683: PetscCall(ISRestoreIndices(is[i], idx + i));
684: PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
685: }
686: PetscCall(PetscFree2(*(PetscInt ***)&idx, n));
688: for (i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));
690: /* Do Local work */
691: #if defined(PETSC_USE_CTABLE)
692: PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
693: #else
694: PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
695: #endif
697: /* Receive messages */
698: PetscCall(PetscMalloc1(nrqr, &recv_status));
699: PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
700: PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
702: /* Phase 1 sends are complete - deallocate buffers */
703: PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
704: PetscCall(PetscFree4(w1, w2, w3, w4));
706: PetscCall(PetscMalloc1(nrqr, &xdata));
707: PetscCall(PetscMalloc1(nrqr, &isz1));
708: PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
709: PetscCall(PetscFree(rbuf[0]));
710: PetscCall(PetscFree(rbuf));
712: /* Send the data back */
713: /* Do a global reduction to know the buffer space req for incoming messages */
714: {
715: PetscMPIInt *rw1;
717: PetscCall(PetscCalloc1(size, &rw1));
719: for (i = 0; i < nrqr; ++i) {
720: proc = recv_status[i].MPI_SOURCE;
722: PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
723: rw1[proc] = isz1[i];
724: }
725: PetscCall(PetscFree(onodes1));
726: PetscCall(PetscFree(olengths1));
728:     /* Determine the number of messages to expect, their lengths, and their from-ids */
729: PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
730: PetscCall(PetscFree(rw1));
731: }
732: /* Now post the Irecvs corresponding to these messages */
733: PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));
735: /* Now post the sends */
736: PetscCall(PetscMalloc1(nrqr, &s_waits2));
737: for (i = 0; i < nrqr; ++i) {
738: j = recv_status[i].MPI_SOURCE;
739: PetscCallMPI(MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i));
740: }
742: /* receive work done on other processors */
743: {
744: PetscInt is_no, ct1, max, *rbuf2_i, isz_i, jmax;
745: PetscMPIInt idex;
746: PetscBT table_i;
748: for (i = 0; i < nrqs; ++i) {
749: PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
750: /* Process the message */
751: rbuf2_i = rbuf2[idex];
752: ct1 = 2 * rbuf2_i[0] + 1;
753: jmax = rbuf2[idex][0];
754: for (j = 1; j <= jmax; j++) {
755: max = rbuf2_i[2 * j];
756: is_no = rbuf2_i[2 * j - 1];
757: isz_i = isz[is_no];
758: table_i = table[is_no];
759: #if defined(PETSC_USE_CTABLE)
760: table_data_i = table_data[is_no];
761: #else
762: data_i = data[is_no];
763: #endif
764: for (k = 0; k < max; k++, ct1++) {
765: row = rbuf2_i[ct1];
766: if (!PetscBTLookupSet(table_i, row)) {
767: #if defined(PETSC_USE_CTABLE)
768: PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
769: #else
770: data_i[isz_i] = row;
771: #endif
772: isz_i++;
773: }
774: }
775: isz[is_no] = isz_i;
776: }
777: }
779: PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
780: }
782: #if defined(PETSC_USE_CTABLE)
783: tcount_max = 0;
784: for (i = 0; i < imax; ++i) {
785: table_data_i = table_data[i];
786: PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
787: if (tcount_max < tcount) tcount_max = tcount;
788: }
789: PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
790: #endif
792: for (i = 0; i < imax; ++i) {
793: #if defined(PETSC_USE_CTABLE)
794: PetscHashIter tpos;
796: table_data_i = table_data[i];
797: PetscHashIterBegin(table_data_i, tpos);
798: while (!PetscHashIterAtEnd(table_data_i, tpos)) {
799: PetscHashIterGetKey(table_data_i, tpos, k);
800: PetscHashIterGetVal(table_data_i, tpos, j);
801: PetscHashIterNext(table_data_i, tpos);
802: tdata[--j] = --k;
803: }
804: PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
805: #else
806: PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
807: #endif
808: PetscCall(PetscCommDestroy(&iscomms[i]));
809: }
811: PetscCall(PetscFree(iscomms));
812: PetscCall(PetscFree(onodes2));
813: PetscCall(PetscFree(olengths2));
815: PetscCall(PetscFree(pa));
816: PetscCall(PetscFree(rbuf2[0]));
817: PetscCall(PetscFree(rbuf2));
818: PetscCall(PetscFree(s_waits1));
819: PetscCall(PetscFree(r_waits1));
820: PetscCall(PetscFree(s_waits2));
821: PetscCall(PetscFree(r_waits2));
822: PetscCall(PetscFree(recv_status));
823: if (xdata) {
824: PetscCall(PetscFree(xdata[0]));
825: PetscCall(PetscFree(xdata));
826: }
827: PetscCall(PetscFree(isz1));
828: #if defined(PETSC_USE_CTABLE)
829: for (i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i));
830: PetscCall(PetscFree(table_data));
831: PetscCall(PetscFree(tdata));
832: PetscCall(PetscFree4(table, data, isz, t_p));
833: #else
834: PetscCall(PetscFree5(table, data, isz, d_p, t_p));
835: #endif
836: PetscFunctionReturn(PETSC_SUCCESS);
837: }
839: /*
840:     MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap(), to do
841: the work on the local processor.
843: Inputs:
844: C - MAT_MPIAIJ;
845: imax - total no of index sets processed at a time;
846: table - an array of char - size = m bits.
848: Output:
849: isz - array containing the count of the solution elements corresponding
850: to each index set;
851: data or table_data - pointer to the solutions
852: */
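/*
   Note added for clarity (not part of the original source): in the PETSC_USE_CTABLE branch the hash map
   stores both keys and values shifted by one (row + 1 -> position + 1), so that 0 can safely serve as the
   "not present" default when querying the map.
*/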
853: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data)
854: {
855: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
856: Mat A = c->A, B = c->B;
857: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
858: PetscInt start, end, val, max, rstart, cstart, *ai, *aj;
859: PetscInt *bi, *bj, *garray, i, j, k, row, isz_i;
860: PetscBT table_i;
861: #if defined(PETSC_USE_CTABLE)
862: PetscHMapI table_data_i;
863: PetscHashIter tpos;
864: PetscInt tcount, *tdata;
865: #else
866: PetscInt *data_i;
867: #endif
869: PetscFunctionBegin;
870: rstart = C->rmap->rstart;
871: cstart = C->cmap->rstart;
872: ai = a->i;
873: aj = a->j;
874: bi = b->i;
875: bj = b->j;
876: garray = c->garray;
878: for (i = 0; i < imax; i++) {
879: #if defined(PETSC_USE_CTABLE)
880: /* copy existing entries of table_data_i into tdata[] */
881: table_data_i = table_data[i];
882: PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
883: PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);
885: PetscCall(PetscMalloc1(tcount, &tdata));
886: PetscHashIterBegin(table_data_i, tpos);
887: while (!PetscHashIterAtEnd(table_data_i, tpos)) {
888: PetscHashIterGetKey(table_data_i, tpos, row);
889: PetscHashIterGetVal(table_data_i, tpos, j);
890: PetscHashIterNext(table_data_i, tpos);
891: tdata[--j] = --row;
892: PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
893: }
894: #else
895: data_i = data[i];
896: #endif
897: table_i = table[i];
898: isz_i = isz[i];
899: max = isz[i];
901: for (j = 0; j < max; j++) {
902: #if defined(PETSC_USE_CTABLE)
903: row = tdata[j] - rstart;
904: #else
905: row = data_i[j] - rstart;
906: #endif
907: start = ai[row];
908: end = ai[row + 1];
909: for (k = start; k < end; k++) { /* Amat */
910: val = aj[k] + cstart;
911: if (!PetscBTLookupSet(table_i, val)) {
912: #if defined(PETSC_USE_CTABLE)
913: PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
914: #else
915: data_i[isz_i] = val;
916: #endif
917: isz_i++;
918: }
919: }
920: start = bi[row];
921: end = bi[row + 1];
922: for (k = start; k < end; k++) { /* Bmat */
923: val = garray[bj[k]];
924: if (!PetscBTLookupSet(table_i, val)) {
925: #if defined(PETSC_USE_CTABLE)
926: PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
927: #else
928: data_i[isz_i] = val;
929: #endif
930: isz_i++;
931: }
932: }
933: }
934: isz[i] = isz_i;
936: #if defined(PETSC_USE_CTABLE)
937: PetscCall(PetscFree(tdata));
938: #endif
939: }
940: PetscFunctionReturn(PETSC_SUCCESS);
941: }
943: /*
944: MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
945: and return the output
947: Input:
948: C - the matrix
949: nrqr - no of messages being processed.
950: rbuf - an array of pointers to the received requests
952: Output:
953: xdata - array of messages to be sent back
954: isz1 - size of each message
956:      For better efficiency perhaps we should malloc each xdata[i] separately;
957:   then, if a remalloc is required, we would only need to copy the data for that one row
958:   rather than for all previous rows, as happens now where a single large chunk of
959:   memory is used.
961: */
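/*
   Illustrative reply layout (added for clarity, not part of the original source): each xdata[i] mirrors the
   header of the request it answers, e.g. {2,  1, 3,  5, 2,  <3 rows for is[1]>, <2 rows for is[5]>};
   all xdata[i] live inside the single buffer allocated as xdata[0].
*/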
962: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
963: {
964: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
965: Mat A = c->A, B = c->B;
966: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
967: PetscInt rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
968: PetscInt row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
969: PetscInt val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
970: PetscInt *rbuf_i, kmax, rbuf_0;
971: PetscBT xtable;
973: PetscFunctionBegin;
974: m = C->rmap->N;
975: rstart = C->rmap->rstart;
976: cstart = C->cmap->rstart;
977: ai = a->i;
978: aj = a->j;
979: bi = b->i;
980: bj = b->j;
981: garray = c->garray;
983: for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
984: rbuf_i = rbuf[i];
985: rbuf_0 = rbuf_i[0];
986: ct += rbuf_0;
987: for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
988: }
990: if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
991: else max1 = 1;
992: mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
993: if (nrqr) {
994: PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
995: ++no_malloc;
996: }
997: PetscCall(PetscBTCreate(m, &xtable));
998: PetscCall(PetscArrayzero(isz1, nrqr));
1000: ct3 = 0;
1001:   for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
1002: rbuf_i = rbuf[i];
1003: rbuf_0 = rbuf_i[0];
1004: ct1 = 2 * rbuf_0 + 1;
1005: ct2 = ct1;
1006: ct3 += ct1;
1007:     for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i */
1008: PetscCall(PetscBTMemzero(m, xtable));
1009: oct2 = ct2;
1010: kmax = rbuf_i[2 * j];
1011: for (k = 0; k < kmax; k++, ct1++) {
1012: row = rbuf_i[ct1];
1013: if (!PetscBTLookupSet(xtable, row)) {
1014: if (!(ct3 < mem_estimate)) {
1015: new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1016: PetscCall(PetscMalloc1(new_estimate, &tmp));
1017: PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1018: PetscCall(PetscFree(xdata[0]));
1019: xdata[0] = tmp;
1020: mem_estimate = new_estimate;
1021: ++no_malloc;
1022: for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1023: }
1024: xdata[i][ct2++] = row;
1025: ct3++;
1026: }
1027: }
1028: for (k = oct2, max2 = ct2; k < max2; k++) {
1029: row = xdata[i][k] - rstart;
1030: start = ai[row];
1031: end = ai[row + 1];
1032: for (l = start; l < end; l++) {
1033: val = aj[l] + cstart;
1034: if (!PetscBTLookupSet(xtable, val)) {
1035: if (!(ct3 < mem_estimate)) {
1036: new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1037: PetscCall(PetscMalloc1(new_estimate, &tmp));
1038: PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1039: PetscCall(PetscFree(xdata[0]));
1040: xdata[0] = tmp;
1041: mem_estimate = new_estimate;
1042: ++no_malloc;
1043: for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1044: }
1045: xdata[i][ct2++] = val;
1046: ct3++;
1047: }
1048: }
1049: start = bi[row];
1050: end = bi[row + 1];
1051: for (l = start; l < end; l++) {
1052: val = garray[bj[l]];
1053: if (!PetscBTLookupSet(xtable, val)) {
1054: if (!(ct3 < mem_estimate)) {
1055: new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
1056: PetscCall(PetscMalloc1(new_estimate, &tmp));
1057: PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
1058: PetscCall(PetscFree(xdata[0]));
1059: xdata[0] = tmp;
1060: mem_estimate = new_estimate;
1061: ++no_malloc;
1062: for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
1063: }
1064: xdata[i][ct2++] = val;
1065: ct3++;
1066: }
1067: }
1068: }
1069:       /* Update the header */
1070:       xdata[i][2 * j] = ct2 - oct2; /* Undo the vector isz1 and use only a var */
1071: xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
1072: }
1073: xdata[i][0] = rbuf_0;
1074: if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
1075: isz1[i] = ct2; /* size of each message */
1076: }
1077: PetscCall(PetscBTDestroy(&xtable));
1078: PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
1079: PetscFunctionReturn(PETSC_SUCCESS);
1080: }
1082: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
1083: /*
1084: Every processor gets the entire matrix
1085: */
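/*
   Note added for clarity (not part of the original source): the result is returned as (*Bin)[0], a sequential
   matrix on PETSC_COMM_SELF, so after this call every rank holds its own full copy of A.
*/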
1086: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
1087: {
1088: Mat B;
1089: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1090: Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
1091: PetscMPIInt size, rank, *recvcounts = NULL, *displs = NULL;
1092: PetscInt sendcount, i, *rstarts = A->rmap->range, n, cnt, j;
1093: PetscInt m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;
1095: PetscFunctionBegin;
1096: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1097: PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
1098: if (scall == MAT_INITIAL_MATRIX) {
1099: /* Tell every processor the number of nonzeros per row */
1100: PetscCall(PetscMalloc1(A->rmap->N, &lens));
1101: for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
1102: PetscCall(PetscMalloc2(size, &recvcounts, size, &displs));
1103: for (i = 0; i < size; i++) {
1104: recvcounts[i] = A->rmap->range[i + 1] - A->rmap->range[i];
1105: displs[i] = A->rmap->range[i];
1106: }
1107: PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, lens, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
1108: /* Create the sequential matrix of the same type as the local block diagonal */
1109: PetscCall(MatCreate(PETSC_COMM_SELF, &B));
1110: PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
1111: PetscCall(MatSetBlockSizesFromMats(B, A, A));
1112: PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
1113: PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
1114: PetscCall(PetscCalloc1(2, Bin));
1115: **Bin = B;
1116: b = (Mat_SeqAIJ *)B->data;
1118: /* Copy my part of matrix column indices over */
1119: sendcount = ad->nz + bd->nz;
1120: jsendbuf = b->j + b->i[rstarts[rank]];
1121: a_jsendbuf = ad->j;
1122: b_jsendbuf = bd->j;
1123: n = A->rmap->rend - A->rmap->rstart;
1124: cnt = 0;
1125: for (i = 0; i < n; i++) {
1126: /* put in lower diagonal portion */
1127: m = bd->i[i + 1] - bd->i[i];
1128: while (m > 0) {
1129: /* is it above diagonal (in bd (compressed) numbering) */
1130: if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1131: jsendbuf[cnt++] = garray[*b_jsendbuf++];
1132: m--;
1133: }
1135: /* put in diagonal portion */
1136: for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1138: /* put in upper diagonal portion */
1139: while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
1140: }
1141: PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);
1143: /* Gather all column indices to all processors */
1144: for (i = 0; i < size; i++) {
1145: recvcounts[i] = 0;
1146: for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) recvcounts[i] += lens[j];
1147: }
1148: displs[0] = 0;
1149: for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
1150: PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, b->j, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
1151:     /* Assemble the matrix into usable form (note numerical values not yet set) */
1152: /* set the b->ilen (length of each row) values */
1153: PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
1154: /* set the b->i indices */
1155: b->i[0] = 0;
1156: for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
1157: PetscCall(PetscFree(lens));
1158: PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1159: PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1161: } else {
1162: B = **Bin;
1163: b = (Mat_SeqAIJ *)B->data;
1164: }
1166: /* Copy my part of matrix numerical values into the values location */
1167: if (flag == MAT_GET_VALUES) {
1168: const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
1169: MatScalar *sendbuf, *recvbuf;
1171: PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
1172: PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
1173: sendcount = ad->nz + bd->nz;
1174: sendbuf = b->a + b->i[rstarts[rank]];
1175: a_sendbuf = ada;
1176: b_sendbuf = bda;
1177: b_sendj = bd->j;
1178: n = A->rmap->rend - A->rmap->rstart;
1179: cnt = 0;
1180: for (i = 0; i < n; i++) {
1181: /* put in lower diagonal portion */
1182: m = bd->i[i + 1] - bd->i[i];
1183: while (m > 0) {
1184: /* is it above diagonal (in bd (compressed) numbering) */
1185: if (garray[*b_sendj] > A->rmap->rstart + i) break;
1186: sendbuf[cnt++] = *b_sendbuf++;
1187: m--;
1188: b_sendj++;
1189: }
1191: /* put in diagonal portion */
1192: for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;
1194: /* put in upper diagonal portion */
1195: while (m-- > 0) {
1196: sendbuf[cnt++] = *b_sendbuf++;
1197: b_sendj++;
1198: }
1199: }
1200: PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);
1202: /* Gather all numerical values to all processors */
1203: if (!recvcounts) PetscCall(PetscMalloc2(size, &recvcounts, size, &displs));
1204: for (i = 0; i < size; i++) recvcounts[i] = b->i[rstarts[i + 1]] - b->i[rstarts[i]];
1205: displs[0] = 0;
1206: for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
1207: recvbuf = b->a;
1208: PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_SCALAR, PetscObjectComm((PetscObject)A)));
1209: PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
1210: PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
1211:   } /* end of (flag == MAT_GET_VALUES) */
1212: PetscCall(PetscFree2(recvcounts, displs));
1214: PetscCall(MatPropagateSymmetryOptions(A, B));
1215: PetscFunctionReturn(PETSC_SUCCESS);
1216: }
1218: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
1219: {
1220: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
1221: Mat submat, A = c->A, B = c->B;
1222: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
1223: PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
1224: PetscInt cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
1225: const PetscInt *icol, *irow;
1226: PetscInt nrow, ncol, start;
1227: PetscMPIInt rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr;
1228: PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc;
1229: PetscInt nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr;
1230: PetscInt **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz;
1231: PetscInt *lens, rmax, ncols, *cols, Crow;
1232: #if defined(PETSC_USE_CTABLE)
1233: PetscHMapI cmap, rmap;
1234: PetscInt *cmap_loc, *rmap_loc;
1235: #else
1236: PetscInt *cmap, *rmap;
1237: #endif
1238: PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i;
1239: PetscInt *cworkB, lwrite, *subcols, *row2proc;
1240: PetscScalar *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
1241: MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
1242: MPI_Request *r_waits4, *s_waits3 = NULL, *s_waits4;
1243: MPI_Status *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
1244: MPI_Status *r_status3 = NULL, *r_status4, *s_status4;
1245: MPI_Comm comm;
1246: PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
1247: PetscMPIInt *onodes1, *olengths1, idex, end;
1248: Mat_SubSppt *smatis1;
1249: PetscBool isrowsorted, iscolsorted;
1251: PetscFunctionBegin;
1254: PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1");
1255: PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
1256: PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a));
1257: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
1258: size = c->size;
1259: rank = c->rank;
1261: PetscCall(ISSorted(iscol[0], &iscolsorted));
1262: PetscCall(ISSorted(isrow[0], &isrowsorted));
1263: PetscCall(ISGetIndices(isrow[0], &irow));
1264: PetscCall(ISGetLocalSize(isrow[0], &nrow));
1265: if (allcolumns) {
1266: icol = NULL;
1267: ncol = C->cmap->N;
1268: } else {
1269: PetscCall(ISGetIndices(iscol[0], &icol));
1270: PetscCall(ISGetLocalSize(iscol[0], &ncol));
1271: }
1273: if (scall == MAT_INITIAL_MATRIX) {
1274: PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;
1276: /* Get some new tags to keep the communication clean */
1277: tag1 = ((PetscObject)C)->tag;
1278: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
1279: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
1281: /* evaluate communication - mesg to who, length of mesg, and buffer space
1282: required. Based on this, buffers are allocated, and data copied into them */
1283: PetscCall(PetscCalloc2(size, &w1, size, &w2));
1284: PetscCall(PetscMalloc1(nrow, &row2proc));
1286: /* w1[proc] = num of rows owned by proc -- to be requested */
1287: proc = 0;
1288: nrqs = 0; /* num of outgoing messages */
1289: for (j = 0; j < nrow; j++) {
1290: row = irow[j];
1291: if (!isrowsorted) proc = 0;
1292: while (row >= C->rmap->range[proc + 1]) proc++;
1293: w1[proc]++;
1294: row2proc[j] = proc; /* map row index to proc */
1296: if (proc != rank && !w2[proc]) {
1297: w2[proc] = 1;
1298: nrqs++;
1299: }
1300: }
1301: w1[rank] = 0; /* rows owned by self will not be requested */
1303: PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
1304: for (proc = 0, j = 0; proc < size; proc++) {
1305: if (w1[proc]) pa[j++] = proc;
1306: }
1308:     /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
1309: msz = 0; /* total mesg length (for all procs) */
1310: for (i = 0; i < nrqs; i++) {
1311: proc = pa[i];
1312: w1[proc] += 3;
1313: msz += w1[proc];
1314: }
1315: PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));
1317:     /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
1318: /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1319: PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
1321: /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1322: Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1323: PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
1325: /* Now post the Irecvs corresponding to these messages */
1326: PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
1328: PetscCall(PetscFree(onodes1));
1329: PetscCall(PetscFree(olengths1));
1331: /* Allocate Memory for outgoing messages */
1332: PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
1333: PetscCall(PetscArrayzero(sbuf1, size));
1334: PetscCall(PetscArrayzero(ptr, size));
1336:     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1337: iptr = tmp;
1338: for (i = 0; i < nrqs; i++) {
1339: proc = pa[i];
1340: sbuf1[proc] = iptr;
1341: iptr += w1[proc];
1342: }
1344: /* Form the outgoing messages */
1345: /* Initialize the header space */
1346: for (i = 0; i < nrqs; i++) {
1347: proc = pa[i];
1348: PetscCall(PetscArrayzero(sbuf1[proc], 3));
1349: ptr[proc] = sbuf1[proc] + 3;
1350: }
1352: /* Parse the isrow and copy data into outbuf */
1353: PetscCall(PetscArrayzero(ctr, size));
1354: for (j = 0; j < nrow; j++) { /* parse the indices of each IS */
1355: proc = row2proc[j];
1356: if (proc != rank) { /* copy to the outgoing buf*/
1357: *ptr[proc] = irow[j];
1358: ctr[proc]++;
1359: ptr[proc]++;
1360: }
1361: }
1363: /* Update the headers for the current IS */
1364: for (j = 0; j < size; j++) { /* Can Optimise this loop too */
1365: if ((ctr_j = ctr[j])) {
1366: sbuf1_j = sbuf1[j];
1367: k = ++sbuf1_j[0];
1368: sbuf1_j[2 * k] = ctr_j;
1369: sbuf1_j[2 * k - 1] = 0;
1370: }
1371: }
1373: /* Now post the sends */
1374: PetscCall(PetscMalloc1(nrqs, &s_waits1));
1375: for (i = 0; i < nrqs; ++i) {
1376: proc = pa[i];
1377: PetscCallMPI(MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i));
1378: }
1380: /* Post Receives to capture the buffer size */
1381: PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2));
1382: PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
1384: if (nrqs) rbuf2[0] = tmp + msz;
1385: for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
1387: for (i = 0; i < nrqs; ++i) {
1388: proc = pa[i];
1389: PetscCallMPI(MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i));
1390: }
1392: PetscCall(PetscFree2(w1, w2));
1394: /* Send to other procs the buf size they should allocate */
1395: /* Receive messages*/
1396: PetscCall(PetscMalloc1(nrqr, &r_status1));
1397: PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
1399: PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1));
1400: for (i = 0; i < nrqr; ++i) {
1401: req_size[i] = 0;
1402: rbuf1_i = rbuf1[i];
1403: start = 2 * rbuf1_i[0] + 1;
1404: PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end));
1405: PetscCall(PetscMalloc1(end, &sbuf2[i]));
1406: sbuf2_i = sbuf2[i];
1407: for (j = start; j < end; j++) {
1408: k = rbuf1_i[j] - rstart;
1409: ncols = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
1410: sbuf2_i[j] = ncols;
1411: req_size[i] += ncols;
1412: }
1413: req_source1[i] = r_status1[i].MPI_SOURCE;
1415: /* form the header */
1416: sbuf2_i[0] = req_size[i];
1417: for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
1419: PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
1420: }
1422: PetscCall(PetscFree(r_status1));
1423: PetscCall(PetscFree(r_waits1));
1425:     /* rbuf2 is received; post receives for column indices a->j */
1426: PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2));
1428: PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3));
1429: for (i = 0; i < nrqs; ++i) {
1430: PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
1431: req_source2[i] = r_status2[i].MPI_SOURCE;
1432: PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
1433: }
1435: /* Wait on sends1 and sends2 */
1436: PetscCall(PetscMalloc1(nrqs, &s_status1));
1437: PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1));
1438: PetscCall(PetscFree(s_waits1));
1439: PetscCall(PetscFree(s_status1));
1441: PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2));
1442: PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2));
1444: /* Now allocate sending buffers for a->j, and send them off */
1445: PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
1446: for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1447: if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
1448: for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
1450: for (i = 0; i < nrqr; i++) { /* for each requested message */
1451: rbuf1_i = rbuf1[i];
1452: sbuf_aj_i = sbuf_aj[i];
1453: ct1 = 2 * rbuf1_i[0] + 1;
1454: ct2 = 0;
1455: /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1457: kmax = rbuf1[i][2];
1458: for (k = 0; k < kmax; k++, ct1++) { /* for each row */
1459: row = rbuf1_i[ct1] - rstart;
1460: nzA = ai[row + 1] - ai[row];
1461: nzB = bi[row + 1] - bi[row];
1462: ncols = nzA + nzB;
1463: cworkA = aj + ai[row];
1464: cworkB = bj ? bj + bi[row] : NULL;
1466: /* load the column indices for this row into cols*/
1467: cols = sbuf_aj_i + ct2;
1469: lwrite = 0;
1470: for (l = 0; l < nzB; l++) {
1471: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1472: }
1473: for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1474: for (l = 0; l < nzB; l++) {
1475: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1476: }
1478: ct2 += ncols;
1479: }
1480: PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
1481: }
1483: /* create column map (cmap): global col of C -> local col of submat */
1484: #if defined(PETSC_USE_CTABLE)
1485: if (!allcolumns) {
1486: PetscCall(PetscHMapICreateWithSize(ncol, &cmap));
1487: PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc));
1488: for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
1489: if (icol[j] >= cstart && icol[j] < cend) {
1490: cmap_loc[icol[j] - cstart] = j + 1;
1491: } else { /* use PetscHMapI for non-local col indices */
1492: PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1));
1493: }
1494: }
1495: } else {
1496: cmap = NULL;
1497: cmap_loc = NULL;
1498: }
1499: PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc));
1500: #else
1501: if (!allcolumns) {
1502: PetscCall(PetscCalloc1(C->cmap->N, &cmap));
1503: for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
1504: } else {
1505: cmap = NULL;
1506: }
1507: #endif
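  /* Lookup convention for cmap: the map stores j + 1 so that 0 can stand for "column not in iscol".
     A sketch for a hypothetical global column gcol of C:
       with PETSC_USE_CTABLE:  tcol = (cstart <= gcol < cend) ? cmap_loc[gcol - cstart]
                                                              : value stored for key gcol + 1 (0 if absent)
       without:                tcol = cmap[gcol]
       tcol != 0  ->  the submat column is tcol - 1;    tcol == 0  ->  the entry is dropped. */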
1509: /* Create lens for MatSeqAIJSetPreallocation() */
1510: PetscCall(PetscCalloc1(nrow, &lens));
1512: /* Compute lens from local part of C */
1513: for (j = 0; j < nrow; j++) {
1514: row = irow[j];
1515: proc = row2proc[j];
1516: if (proc == rank) {
1517: /* diagonal part A = c->A */
1518: ncols = ai[row - rstart + 1] - ai[row - rstart];
1519: cols = aj + ai[row - rstart];
1520: if (!allcolumns) {
1521: for (k = 0; k < ncols; k++) {
1522: #if defined(PETSC_USE_CTABLE)
1523: tcol = cmap_loc[cols[k]];
1524: #else
1525: tcol = cmap[cols[k] + cstart];
1526: #endif
1527: if (tcol) lens[j]++;
1528: }
1529: } else { /* allcolumns */
1530: lens[j] = ncols;
1531: }
1533: /* off-diagonal part B = c->B */
1534: ncols = bi[row - rstart + 1] - bi[row - rstart];
1535: cols = bj ? bj + bi[row - rstart] : NULL;
1536: if (!allcolumns) {
1537: for (k = 0; k < ncols; k++) {
1538: #if defined(PETSC_USE_CTABLE)
1539: PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1540: #else
1541: tcol = cmap[bmap[cols[k]]];
1542: #endif
1543: if (tcol) lens[j]++;
1544: }
1545: } else { /* allcolumns */
1546: lens[j] += ncols;
1547: }
1548: }
1549: }
1551: /* Create row map (rmap): global row of C -> local row of submat */
1552: #if defined(PETSC_USE_CTABLE)
1553: PetscCall(PetscHMapICreateWithSize(nrow, &rmap));
1554: for (j = 0; j < nrow; j++) {
1555: row = irow[j];
1556: proc = row2proc[j];
1557: if (proc == rank) { /* a local row */
1558: rmap_loc[row - rstart] = j;
1559: } else {
1560: PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1));
1561: }
1562: }
1563: #else
1564: PetscCall(PetscCalloc1(C->rmap->N, &rmap));
1565: for (j = 0; j < nrow; j++) rmap[irow[j]] = j;
1566: #endif
1568: /* Update lens from offproc data */
1569: /* receives of a->j are done */
1570: PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3));
1571: for (i = 0; i < nrqs; i++) {
1572: proc = pa[i];
1573: sbuf1_i = sbuf1[proc];
1574: /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1575: ct1 = 2 + 1;
1576: ct2 = 0;
1577: rbuf2_i = rbuf2[i]; /* received length of C->j */
1578: rbuf3_i = rbuf3[i]; /* received C->j */
1580: /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1581: max1 = sbuf1_i[2];
1582: for (k = 0; k < max1; k++, ct1++) {
1583: #if defined(PETSC_USE_CTABLE)
1584: PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row));
1585: row--;
1586: PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
1587: #else
1588: row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1589: #endif
1590: /* Now, store row index of submat in sbuf1_i[ct1] */
1591: sbuf1_i[ct1] = row;
1593: nnz = rbuf2_i[ct1];
1594: if (!allcolumns) {
1595: for (l = 0; l < nnz; l++, ct2++) {
1596: #if defined(PETSC_USE_CTABLE)
1597: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1598: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1599: } else {
1600: PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1601: }
1602: #else
1603: tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1604: #endif
1605: if (tcol) lens[row]++;
1606: }
1607: } else { /* allcolumns */
1608: lens[row] += nnz;
1609: }
1610: }
1611: }
1612: PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3));
1613: PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3));
1615: /* Create the submatrices */
1616: PetscCall(MatCreate(PETSC_COMM_SELF, &submat));
1617: PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE));
1619: PetscCall(ISGetBlockSize(isrow[0], &i));
1620: PetscCall(ISGetBlockSize(iscol[0], &j));
1621: if (i > 1 || j > 1) PetscCall(MatSetBlockSizes(submat, i, j));
1622: PetscCall(MatSetType(submat, ((PetscObject)A)->type_name));
1623: PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens));
1625: /* create struct Mat_SubSppt and attach it to submat */
1626: PetscCall(PetscNew(&smatis1));
1627: subc = (Mat_SeqAIJ *)submat->data;
1628: subc->submatis1 = smatis1;
1630: smatis1->id = 0;
1631: smatis1->nrqs = nrqs;
1632: smatis1->nrqr = nrqr;
1633: smatis1->rbuf1 = rbuf1;
1634: smatis1->rbuf2 = rbuf2;
1635: smatis1->rbuf3 = rbuf3;
1636: smatis1->sbuf2 = sbuf2;
1637: smatis1->req_source2 = req_source2;
1639: smatis1->sbuf1 = sbuf1;
1640: smatis1->ptr = ptr;
1641: smatis1->tmp = tmp;
1642: smatis1->ctr = ctr;
1644: smatis1->pa = pa;
1645: smatis1->req_size = req_size;
1646: smatis1->req_source1 = req_source1;
1648: smatis1->allcolumns = allcolumns;
1649: smatis1->singleis = PETSC_TRUE;
1650: smatis1->row2proc = row2proc;
1651: smatis1->rmap = rmap;
1652: smatis1->cmap = cmap;
1653: #if defined(PETSC_USE_CTABLE)
1654: smatis1->rmap_loc = rmap_loc;
1655: smatis1->cmap_loc = cmap_loc;
1656: #endif
1658: smatis1->destroy = submat->ops->destroy;
1659: submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1660: submat->factortype = C->factortype;
1662: /* compute rmax */
1663: rmax = 0;
1664: for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);
1666: } else { /* scall == MAT_REUSE_MATRIX */
1667: submat = submats[0];
1668: PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
1670: subc = (Mat_SeqAIJ *)submat->data;
1671: rmax = subc->rmax;
1672: smatis1 = subc->submatis1;
1673: nrqs = smatis1->nrqs;
1674: nrqr = smatis1->nrqr;
1675: rbuf1 = smatis1->rbuf1;
1676: rbuf2 = smatis1->rbuf2;
1677: rbuf3 = smatis1->rbuf3;
1678: req_source2 = smatis1->req_source2;
1680: sbuf1 = smatis1->sbuf1;
1681: sbuf2 = smatis1->sbuf2;
1682: ptr = smatis1->ptr;
1683: tmp = smatis1->tmp;
1684: ctr = smatis1->ctr;
1686: pa = smatis1->pa;
1687: req_size = smatis1->req_size;
1688: req_source1 = smatis1->req_source1;
1690: allcolumns = smatis1->allcolumns;
1691: row2proc = smatis1->row2proc;
1692: rmap = smatis1->rmap;
1693: cmap = smatis1->cmap;
1694: #if defined(PETSC_USE_CTABLE)
1695: rmap_loc = smatis1->rmap_loc;
1696: cmap_loc = smatis1->cmap_loc;
1697: #endif
1698: }
1700: /* Post recv matrix values */
1701: PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals));
1702: PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4));
1703: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
1704: for (i = 0; i < nrqs; ++i) {
1705: PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
1706: PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
1707: }
1709: /* Allocate sending buffers for a->a, and send them off */
1710: PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
1711: for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
1712: if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
1713: for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
1715: for (i = 0; i < nrqr; i++) {
1716: rbuf1_i = rbuf1[i];
1717: sbuf_aa_i = sbuf_aa[i];
1718: ct1 = 2 * rbuf1_i[0] + 1;
1719: ct2 = 0;
1720: /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1722: kmax = rbuf1_i[2];
1723: for (k = 0; k < kmax; k++, ct1++) {
1724: row = rbuf1_i[ct1] - rstart;
1725: nzA = ai[row + 1] - ai[row];
1726: nzB = bi[row + 1] - bi[row];
1727: ncols = nzA + nzB;
1728: cworkB = bj ? bj + bi[row] : NULL;
1729: vworkA = a_a + ai[row];
1730: vworkB = b_a ? b_a + bi[row] : NULL;
1732: /* load the column values for this row into vals */
1733: vals = sbuf_aa_i + ct2;
1735: lwrite = 0;
1736: for (l = 0; l < nzB; l++) {
1737: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1738: }
1739: for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
1740: for (l = 0; l < nzB; l++) {
1741: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1742: }
1744: ct2 += ncols;
1745: }
1746: PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
1747: }
1749: /* Assemble submat */
1750: /* First assemble the local rows */
1751: for (j = 0; j < nrow; j++) {
1752: row = irow[j];
1753: proc = row2proc[j];
1754: if (proc == rank) {
1755: Crow = row - rstart; /* local row index of C */
1756: #if defined(PETSC_USE_CTABLE)
1757: row = rmap_loc[Crow]; /* row index of submat */
1758: #else
1759: row = rmap[row];
1760: #endif
1762: if (allcolumns) {
1763: /* diagonal part A = c->A */
1764: ncols = ai[Crow + 1] - ai[Crow];
1765: cols = aj + ai[Crow];
1766: vals = a_a + ai[Crow];
1767: i = 0;
1768: for (k = 0; k < ncols; k++) {
1769: subcols[i] = cols[k] + cstart;
1770: subvals[i++] = vals[k];
1771: }
1773: /* off-diagonal part B = c->B */
1774: ncols = bi[Crow + 1] - bi[Crow];
1775: cols = bj + bi[Crow];
1776: vals = b_a + bi[Crow];
1777: for (k = 0; k < ncols; k++) {
1778: subcols[i] = bmap[cols[k]];
1779: subvals[i++] = vals[k];
1780: }
1782: PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1784: } else { /* !allcolumns */
1785: #if defined(PETSC_USE_CTABLE)
1786: /* diagonal part A = c->A */
1787: ncols = ai[Crow + 1] - ai[Crow];
1788: cols = aj + ai[Crow];
1789: vals = a_a + ai[Crow];
1790: i = 0;
1791: for (k = 0; k < ncols; k++) {
1792: tcol = cmap_loc[cols[k]];
1793: if (tcol) {
1794: subcols[i] = --tcol;
1795: subvals[i++] = vals[k];
1796: }
1797: }
1799: /* off-diagonal part B = c->B */
1800: ncols = bi[Crow + 1] - bi[Crow];
1801: cols = bj ? bj + bi[Crow] : NULL;
1802: vals = b_a ? b_a + bi[Crow] : NULL;
1803: for (k = 0; k < ncols; k++) {
1804: PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
1805: if (tcol) {
1806: subcols[i] = --tcol;
1807: subvals[i++] = vals[k];
1808: }
1809: }
1810: #else
1811: /* diagonal part A = c->A */
1812: ncols = ai[Crow + 1] - ai[Crow];
1813: cols = aj + ai[Crow];
1814: vals = a_a + ai[Crow];
1815: i = 0;
1816: for (k = 0; k < ncols; k++) {
1817: tcol = cmap[cols[k] + cstart];
1818: if (tcol) {
1819: subcols[i] = --tcol;
1820: subvals[i++] = vals[k];
1821: }
1822: }
1824: /* off-diagonal part B = c->B */
1825: ncols = bi[Crow + 1] - bi[Crow];
1826: cols = bj + bi[Crow];
1827: vals = b_a + bi[Crow];
1828: for (k = 0; k < ncols; k++) {
1829: tcol = cmap[bmap[cols[k]]];
1830: if (tcol) {
1831: subcols[i] = --tcol;
1832: subvals[i++] = vals[k];
1833: }
1834: }
1835: #endif
1836: PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
1837: }
1838: }
1839: }
1841: /* Now assemble the off-proc rows */
1842: for (i = 0; i < nrqs; i++) { /* for each requested message */
1843: /* recv values from other processes */
1844: PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i));
1845: proc = pa[idex];
1846: sbuf1_i = sbuf1[proc];
1847: /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1848: ct1 = 2 + 1;
1849: ct2 = 0; /* count of received C->j */
1850: ct3 = 0; /* count of received C->j that will be inserted into submat */
1851: rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
1852: rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
1853: rbuf4_i = rbuf4[idex]; /* received C->a from other processes */
1855: /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1856: max1 = sbuf1_i[2]; /* num of rows */
1857: for (k = 0; k < max1; k++, ct1++) { /* for each received row */
1858: row = sbuf1_i[ct1]; /* row index of submat */
1859: if (!allcolumns) {
1860: idex = 0;
1861: if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1862: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1863: for (l = 0; l < nnz; l++, ct2++) { /* for each received column */
1864: #if defined(PETSC_USE_CTABLE)
1865: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
1866: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1867: } else {
1868: PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
1869: }
1870: #else
1871: tcol = cmap[rbuf3_i[ct2]];
1872: #endif
1873: if (tcol) {
1874: subcols[idex] = --tcol; /* may not be sorted */
1875: subvals[idex++] = rbuf4_i[ct2];
1877: /* We receive all the column indices of a row of C, but only a subset of them belongs in submat.
1878: For reuse, we overwrite the received C->j with the buffer positions of the entries that go into submat */
1879: if (iscolsorted) rbuf3_i[ct3++] = ct2;
1880: }
1881: }
1882: PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES));
1883: } else { /* scall == MAT_REUSE_MATRIX */
1884: submat = submats[0];
1885: subc = (Mat_SeqAIJ *)submat->data;
1887: nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
1888: for (l = 0; l < nnz; l++) {
1889: ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1890: subvals[idex++] = rbuf4_i[ct2];
1891: }
1893: bj = subc->j + subc->i[row]; /* sorted column indices */
1894: PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES));
1895: }
1896: } else { /* allcolumns */
1897: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1898: PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, rbuf3_i + ct2, rbuf4_i + ct2, INSERT_VALUES));
1899: ct2 += nnz;
1900: }
1901: }
1902: }
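  /* A sketch of the reuse trick applied above for sorted iscol (hypothetical received row): on the
     MAT_INITIAL_MATRIX pass the received column list rbuf3_i for a row might be {4, 9, 12, 20}, of
     which only global columns 9 and 20 belong to iscol; their positions {1, 3} within the received
     buffers are written back into rbuf3_i.  On a MAT_REUSE_MATRIX pass the sparsity pattern is
     already fixed, so the values are gathered directly as rbuf4_i[1], rbuf4_i[3] and inserted
     against the stored, sorted column indices of submat. */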
1904: /* sends of a->a are done */
1905: PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4));
1906: PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4));
1908: PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY));
1909: PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
1910: submats[0] = submat;
1912: /* Restore the indices */
1913: PetscCall(ISRestoreIndices(isrow[0], &irow));
1914: if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol));
1916: /* Destroy allocated memory */
1917: for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
1918: PetscCall(PetscFree3(rbuf4, subcols, subvals));
1919: if (sbuf_aa) {
1920: PetscCall(PetscFree(sbuf_aa[0]));
1921: PetscCall(PetscFree(sbuf_aa));
1922: }
1924: if (scall == MAT_INITIAL_MATRIX) {
1925: PetscCall(PetscFree(lens));
1926: if (sbuf_aj) {
1927: PetscCall(PetscFree(sbuf_aj[0]));
1928: PetscCall(PetscFree(sbuf_aj));
1929: }
1930: }
1931: PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
1932: PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
1933: PetscFunctionReturn(PETSC_SUCCESS);
1934: }
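/* A minimal caller-side sketch of the single-IS path handled above and below (assumptions: C is a
   MATMPIAIJ matrix and every rank supplies exactly one row/column IS pair, as PCSetUp_ASM() does
   when it sets C->submat_singleis):

     Mat *submats;
     IS   isrow, iscol;   ... one local IS pair per rank ...
     PetscCall(MatCreateSubMatrices(C, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, &submats));
     ... work with the sequential submats[0] ...
     PetscCall(MatCreateSubMatrices(C, 1, &isrow, &iscol, MAT_REUSE_MATRIX, &submats));
     PetscCall(MatDestroySubMatrices(1, &submats));
*/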
1936: static PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1937: {
1938: PetscInt ncol;
1939: PetscBool colflag, allcolumns = PETSC_FALSE;
1941: PetscFunctionBegin;
1942: /* Allocate memory to hold all the submatrices */
1943: if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));
1945: /* Check for special case: each process requests all of the matrix columns */
1946: PetscCall(ISIdentity(iscol[0], &colflag));
1947: PetscCall(ISGetLocalSize(iscol[0], &ncol));
1948: if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1950: PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
1951: PetscFunctionReturn(PETSC_SUCCESS);
1952: }
1954: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
1955: {
1956: PetscInt nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2];
1957: PetscBool rowflag, colflag, wantallmatrix = PETSC_FALSE;
1958: Mat_SeqAIJ *subc;
1959: Mat_SubSppt *smat;
1961: PetscFunctionBegin;
1962: /* Check for special case: each processor has a single IS */
1963: if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1964: PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
1965: C->submat_singleis = PETSC_FALSE; /* restore the default value in case C is later used with a non-single IS */
1966: PetscFunctionReturn(PETSC_SUCCESS);
1967: }
1969: /* Determine nmax, the maximum number of submatrices extracted per stage */
1970: if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
1971: else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
1972: if (!nmax) nmax = 1;
1974: if (scall == MAT_INITIAL_MATRIX) {
1975: /* Collect global wantallmatrix and nstages */
1976: if (ismax == 1 && C->rmap->N == C->cmap->N) {
1977: PetscCall(ISIdentity(*isrow, &rowflag));
1978: PetscCall(ISIdentity(*iscol, &colflag));
1979: PetscCall(ISGetLocalSize(*isrow, &nrow));
1980: PetscCall(ISGetLocalSize(*iscol, &ncol));
1981: if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
1982: wantallmatrix = PETSC_TRUE;
1984: PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
1985: }
1986: }
1988: /* Determine the number of stages in which the submatrices are extracted.
1989: Each stage extracts at most nmax submatrices.
1990: nmax is determined by the matrix column dimension:
1991: if the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
1992: */
1993: nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
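    /* A worked example of the staging arithmetic (hypothetical sizes, 4-byte PetscInt):
       with C->cmap->N = 1,000,000 columns, nmax = 20,000,000 / (1,000,000 * 4) = 5 submatrices per
       stage, so ismax = 12 requests give nstages = 12/5 + 1 = 3 local stages.  The MPI_MAX
       reduction below makes every rank loop over the same, global, number of stages. */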
1995: in[0] = -1 * (PetscInt)wantallmatrix;
1996: in[1] = nstages;
1997: PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
1998: wantallmatrix = (PetscBool)(-out[0]);
1999: nstages = out[1]; /* Make sure every processor loops through the global nstages */
2001: } else { /* MAT_REUSE_MATRIX */
2002: if (ismax) {
2003: subc = (Mat_SeqAIJ *)(*submat)[0]->data;
2004: smat = subc->submatis1;
2005: } else { /* (*submat)[0] is a dummy matrix */
2006: smat = (Mat_SubSppt *)(*submat)[0]->data;
2007: }
2008: if (!smat) {
2009: /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2010: wantallmatrix = PETSC_TRUE;
2011: } else if (smat->singleis) {
2012: PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
2013: PetscFunctionReturn(PETSC_SUCCESS);
2014: } else {
2015: nstages = smat->nstages;
2016: }
2017: }
2019: if (wantallmatrix) {
2020: PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
2021: PetscFunctionReturn(PETSC_SUCCESS);
2022: }
2024: /* Allocate memory to hold all the submatrices and dummy submatrices */
2025: if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));
2027: for (i = 0, pos = 0; i < nstages; i++) {
2028: if (pos + nmax <= ismax) max_no = nmax;
2029: else if (pos >= ismax) max_no = 0;
2030: else max_no = ismax - pos;
2032: PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos));
2033: if (!max_no) {
2034: if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2035: smat = (Mat_SubSppt *)(*submat)[pos]->data;
2036: smat->nstages = nstages;
2037: }
2038: pos++; /* advance to next dummy matrix if any */
2039: } else pos += max_no;
2040: }
2042: if (ismax && scall == MAT_INITIAL_MATRIX) {
2043: /* save nstages for reuse */
2044: subc = (Mat_SeqAIJ *)(*submat)[0]->data;
2045: smat = subc->submatis1;
2046: smat->nstages = nstages;
2047: }
2048: PetscFunctionReturn(PETSC_SUCCESS);
2049: }
2051: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
2052: {
2053: Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
2054: Mat A = c->A;
2055: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
2056: const PetscInt **icol, **irow;
2057: PetscInt *nrow, *ncol, start;
2058: PetscMPIInt rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr;
2059: PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1;
2060: PetscInt nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol;
2061: PetscInt **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2;
2062: PetscInt **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
2063: #if defined(PETSC_USE_CTABLE)
2064: PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i;
2065: #else
2066: PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
2067: #endif
2068: const PetscInt *irow_i;
2069: PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
2070: MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
2071: MPI_Request *r_waits4, *s_waits3, *s_waits4;
2072: MPI_Comm comm;
2073: PetscScalar **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
2074: PetscMPIInt *onodes1, *olengths1, end;
2075: PetscInt **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
2076: Mat_SubSppt *smat_i;
2077: PetscBool *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
2078: PetscInt *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen;
2080: PetscFunctionBegin;
2081: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
2082: size = c->size;
2083: rank = c->rank;
2085: PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns));
2086: PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted));
2088: for (i = 0; i < ismax; i++) {
2089: PetscCall(ISSorted(iscol[i], &issorted[i]));
2090: if (!issorted[i]) iscsorted = issorted[i];
2092: PetscCall(ISSorted(isrow[i], &issorted[i]));
2094: PetscCall(ISGetIndices(isrow[i], &irow[i]));
2095: PetscCall(ISGetLocalSize(isrow[i], &nrow[i]));
2097: /* Check for special case: allcolumn */
2098: PetscCall(ISIdentity(iscol[i], &colflag));
2099: PetscCall(ISGetLocalSize(iscol[i], &ncol[i]));
2100: if (colflag && ncol[i] == C->cmap->N) {
2101: allcolumns[i] = PETSC_TRUE;
2102: icol[i] = NULL;
2103: } else {
2104: allcolumns[i] = PETSC_FALSE;
2105: PetscCall(ISGetIndices(iscol[i], &icol[i]));
2106: }
2107: }
2109: if (scall == MAT_REUSE_MATRIX) {
2110: /* Assumes the new rows are the same length as the old rows */
2111: for (i = 0; i < ismax; i++) {
2112: PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i);
2113: subc = (Mat_SeqAIJ *)submats[i]->data;
2114: PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");
2116: /* Initialize the reused matrix as if it were empty */
2117: PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n));
2119: smat_i = subc->submatis1;
2121: nrqs = smat_i->nrqs;
2122: nrqr = smat_i->nrqr;
2123: rbuf1 = smat_i->rbuf1;
2124: rbuf2 = smat_i->rbuf2;
2125: rbuf3 = smat_i->rbuf3;
2126: req_source2 = smat_i->req_source2;
2128: sbuf1 = smat_i->sbuf1;
2129: sbuf2 = smat_i->sbuf2;
2130: ptr = smat_i->ptr;
2131: tmp = smat_i->tmp;
2132: ctr = smat_i->ctr;
2134: pa = smat_i->pa;
2135: req_size = smat_i->req_size;
2136: req_source1 = smat_i->req_source1;
2138: allcolumns[i] = smat_i->allcolumns;
2139: row2proc[i] = smat_i->row2proc;
2140: rmap[i] = smat_i->rmap;
2141: cmap[i] = smat_i->cmap;
2142: }
2144: if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2145: PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse");
2146: smat_i = (Mat_SubSppt *)submats[0]->data;
2148: nrqs = smat_i->nrqs;
2149: nrqr = smat_i->nrqr;
2150: rbuf1 = smat_i->rbuf1;
2151: rbuf2 = smat_i->rbuf2;
2152: rbuf3 = smat_i->rbuf3;
2153: req_source2 = smat_i->req_source2;
2155: sbuf1 = smat_i->sbuf1;
2156: sbuf2 = smat_i->sbuf2;
2157: ptr = smat_i->ptr;
2158: tmp = smat_i->tmp;
2159: ctr = smat_i->ctr;
2161: pa = smat_i->pa;
2162: req_size = smat_i->req_size;
2163: req_source1 = smat_i->req_source1;
2165: allcolumns[0] = PETSC_FALSE;
2166: }
2167: } else { /* scall == MAT_INITIAL_MATRIX */
2168: /* Get some new tags to keep the communication clean */
2169: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
2170: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
2172: /* Evaluate communication - message to whom, length of message, and buffer space
2173: required. Based on this, buffers are allocated, and data copied into them */
2174: PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */
2176: for (i = 0; i < ismax; i++) {
2177: jmax = nrow[i];
2178: irow_i = irow[i];
2180: PetscCall(PetscMalloc1(jmax, &row2proc_i));
2181: row2proc[i] = row2proc_i;
2183: if (issorted[i]) proc = 0;
2184: for (j = 0; j < jmax; j++) {
2185: if (!issorted[i]) proc = 0;
2186: row = irow_i[j];
2187: while (row >= C->rmap->range[proc + 1]) proc++;
2188: w4[proc]++;
2189: row2proc_i[j] = proc; /* map row index to proc */
2190: }
2191: for (j = 0; j < size; j++) {
2192: if (w4[j]) {
2193: w1[j] += w4[j];
2194: w3[j]++;
2195: w4[j] = 0;
2196: }
2197: }
2198: }
2200: nrqs = 0; /* no of outgoing messages */
2201: msz = 0; /* total mesg length (for all procs) */
2202: w1[rank] = 0; /* no mesg sent to self */
2203: w3[rank] = 0;
2204: for (i = 0; i < size; i++) {
2205: if (w1[i]) {
2206: w2[i] = 1;
2207: nrqs++;
2208: } /* there exists a message to proc i */
2209: }
2210: PetscCall(PetscMalloc1(nrqs, &pa)); /* (proc array) */
2211: for (i = 0, j = 0; i < size; i++) {
2212: if (w1[i]) {
2213: pa[j] = i;
2214: j++;
2215: }
2216: }
2218: /* Each message would have a header = 1 + 2*(no of IS) + data */
2219: for (i = 0; i < nrqs; i++) {
2220: j = pa[i];
2221: w1[j] += w2[j] + 2 * w3[j];
2222: msz += w1[j];
2223: }
2224: PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));
2226: /* Determine the number of messages to expect, their lengths, and their from-ids */
2227: PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
2228: PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));
2230: /* Now post the Irecvs corresponding to these messages */
2231: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0));
2232: PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));
2234: /* Allocate Memory for outgoing messages */
2235: PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
2236: PetscCall(PetscArrayzero(sbuf1, size));
2237: PetscCall(PetscArrayzero(ptr, size));
2239: {
2240: PetscInt *iptr = tmp;
2241: k = 0;
2242: for (i = 0; i < nrqs; i++) {
2243: j = pa[i];
2244: iptr += k;
2245: sbuf1[j] = iptr;
2246: k = w1[j];
2247: }
2248: }
2250: /* Form the outgoing messages. Initialize the header space */
2251: for (i = 0; i < nrqs; i++) {
2252: j = pa[i];
2253: sbuf1[j][0] = 0;
2254: PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j]));
2255: ptr[j] = sbuf1[j] + 2 * w3[j] + 1;
2256: }
2258: /* Parse the isrow and copy data into outbuf */
2259: for (i = 0; i < ismax; i++) {
2260: row2proc_i = row2proc[i];
2261: PetscCall(PetscArrayzero(ctr, size));
2262: irow_i = irow[i];
2263: jmax = nrow[i];
2264: for (j = 0; j < jmax; j++) { /* parse the indices of each IS */
2265: proc = row2proc_i[j];
2266: if (proc != rank) { /* copy to the outgoing buf */
2267: ctr[proc]++;
2268: *ptr[proc] = irow_i[j];
2269: ptr[proc]++;
2270: }
2271: }
2272: /* Update the headers for the current IS */
2273: for (j = 0; j < size; j++) { /* could optimize this loop too */
2274: if ((ctr_j = ctr[j])) {
2275: sbuf1_j = sbuf1[j];
2276: k = ++sbuf1_j[0];
2277: sbuf1_j[2 * k] = ctr_j;
2278: sbuf1_j[2 * k - 1] = i;
2279: }
2280: }
2281: }
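    /* Layout of an outgoing request sbuf1[j] as formed above (format only, no new fields):
         sbuf1[j][0]                      number of ISs that need rows owned by proc j
         sbuf1[j][2k-1], sbuf1[j][2k]     (IS index, number of requested rows) for the k-th such IS
         sbuf1[j][2*w3[j]+1 ...]          the requested global row indices, grouped by IS
       A receiver unpacks it the same way the loops over rbuf1 do:
         nis = rbuf1_i[0]; the data part starts at index 2*nis + 1, and the k-th header pair
         (rbuf1_i[2k-1], rbuf1_i[2k]) gives the IS index and its row count. */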
2283: /* Now post the sends */
2284: PetscCall(PetscMalloc1(nrqs, &s_waits1));
2285: for (i = 0; i < nrqs; ++i) {
2286: j = pa[i];
2287: PetscCallMPI(MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i));
2288: }
2290: /* Post Receives to capture the buffer size */
2291: PetscCall(PetscMalloc1(nrqs, &r_waits2));
2292: PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));
2293: if (nrqs) rbuf2[0] = tmp + msz;
2294: for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];
2295: for (i = 0; i < nrqs; ++i) {
2296: j = pa[i];
2297: PetscCallMPI(MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i));
2298: }
2300: /* Send to other procs the buf size they should allocate */
2301: /* Receive messages */
2302: PetscCall(PetscMalloc1(nrqr, &s_waits2));
2303: PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));
2304: {
2305: PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart;
2306: PetscInt *sbuf2_i;
2308: PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE));
2309: for (i = 0; i < nrqr; ++i) {
2310: req_size[i] = 0;
2311: rbuf1_i = rbuf1[i];
2312: start = 2 * rbuf1_i[0] + 1;
2313: end = olengths1[i];
2314: PetscCall(PetscMalloc1(end, &sbuf2[i]));
2315: sbuf2_i = sbuf2[i];
2316: for (j = start; j < end; j++) {
2317: id = rbuf1_i[j] - rstart;
2318: ncols = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id];
2319: sbuf2_i[j] = ncols;
2320: req_size[i] += ncols;
2321: }
2322: req_source1[i] = onodes1[i];
2323: /* form the header */
2324: sbuf2_i[0] = req_size[i];
2325: for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];
2327: PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
2328: }
2329: }
2331: PetscCall(PetscFree(onodes1));
2332: PetscCall(PetscFree(olengths1));
2333: PetscCall(PetscFree(r_waits1));
2334: PetscCall(PetscFree4(w1, w2, w3, w4));
2336: /* Receive messages */
2337: PetscCall(PetscMalloc1(nrqs, &r_waits3));
2338: PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE));
2339: for (i = 0; i < nrqs; ++i) {
2340: PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
2341: req_source2[i] = pa[i];
2342: PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
2343: }
2344: PetscCall(PetscFree(r_waits2));
2346: /* Wait on sends1 and sends2 */
2347: PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));
2348: PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
2349: PetscCall(PetscFree(s_waits1));
2350: PetscCall(PetscFree(s_waits2));
2352: /* Now allocate sending buffers for a->j, and send them off */
2353: PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
2354: for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2355: if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
2356: for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];
2358: PetscCall(PetscMalloc1(nrqr, &s_waits3));
2359: {
2360: PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite;
2361: PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2362: PetscInt cend = C->cmap->rend;
2363: PetscInt *a_j = a->j, *b_j = b->j, ctmp;
2365: for (i = 0; i < nrqr; i++) {
2366: rbuf1_i = rbuf1[i];
2367: sbuf_aj_i = sbuf_aj[i];
2368: ct1 = 2 * rbuf1_i[0] + 1;
2369: ct2 = 0;
2370: for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2371: kmax = rbuf1[i][2 * j];
2372: for (k = 0; k < kmax; k++, ct1++) {
2373: row = rbuf1_i[ct1] - rstart;
2374: nzA = a_i[row + 1] - a_i[row];
2375: nzB = b_i[row + 1] - b_i[row];
2376: ncols = nzA + nzB;
2377: cworkA = a_j + a_i[row];
2378: cworkB = b_j ? b_j + b_i[row] : NULL;
2380: /* load the column indices for this row into cols */
2381: cols = sbuf_aj_i + ct2;
2383: lwrite = 0;
2384: for (l = 0; l < nzB; l++) {
2385: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2386: }
2387: for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2388: for (l = 0; l < nzB; l++) {
2389: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2390: }
2392: ct2 += ncols;
2393: }
2394: }
2395: PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
2396: }
2397: }
2399: /* create col map: global col of C -> local col of submatrices */
2400: {
2401: const PetscInt *icol_i;
2402: #if defined(PETSC_USE_CTABLE)
2403: for (i = 0; i < ismax; i++) {
2404: if (!allcolumns[i]) {
2405: PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i));
2407: jmax = ncol[i];
2408: icol_i = icol[i];
2409: cmap_i = cmap[i];
2410: for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1));
2411: } else cmap[i] = NULL;
2412: }
2413: #else
2414: for (i = 0; i < ismax; i++) {
2415: if (!allcolumns[i]) {
2416: PetscCall(PetscCalloc1(C->cmap->N, &cmap[i]));
2417: jmax = ncol[i];
2418: icol_i = icol[i];
2419: cmap_i = cmap[i];
2420: for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1;
2421: } else cmap[i] = NULL;
2422: }
2423: #endif
2424: }
2426: /* Create lens, which is required for MatSeqAIJSetPreallocation() */
2427: for (i = 0, j = 0; i < ismax; i++) j += nrow[i];
2428: PetscCall(PetscMalloc1(ismax, &lens));
2430: if (ismax) PetscCall(PetscCalloc1(j, &lens[0]));
2431: for (i = 1; i < ismax; i++) lens[i] = lens[i - 1] + nrow[i - 1];
2433: /* Update lens from local data */
2434: for (i = 0; i < ismax; i++) {
2435: row2proc_i = row2proc[i];
2436: jmax = nrow[i];
2437: if (!allcolumns[i]) cmap_i = cmap[i];
2438: irow_i = irow[i];
2439: lens_i = lens[i];
2440: for (j = 0; j < jmax; j++) {
2441: row = irow_i[j];
2442: proc = row2proc_i[j];
2443: if (proc == rank) {
2444: PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2445: if (!allcolumns[i]) {
2446: for (k = 0; k < ncols; k++) {
2447: #if defined(PETSC_USE_CTABLE)
2448: PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2449: #else
2450: tcol = cmap_i[cols[k]];
2451: #endif
2452: if (tcol) lens_i[j]++;
2453: }
2454: } else { /* allcolumns */
2455: lens_i[j] = ncols;
2456: }
2457: PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL));
2458: }
2459: }
2460: }
2462: /* Create row map: global row of C -> local row of submatrices */
2463: #if defined(PETSC_USE_CTABLE)
2464: for (i = 0; i < ismax; i++) {
2465: PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i));
2466: irow_i = irow[i];
2467: jmax = nrow[i];
2468: for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1));
2469: }
2470: #else
2471: for (i = 0; i < ismax; i++) {
2472: PetscCall(PetscCalloc1(C->rmap->N, &rmap[i]));
2473: rmap_i = rmap[i];
2474: irow_i = irow[i];
2475: jmax = nrow[i];
2476: for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j;
2477: }
2478: #endif
2480: /* Update lens from offproc data */
2481: {
2482: PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i;
2484: PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE));
2485: for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2486: sbuf1_i = sbuf1[pa[tmp2]];
2487: jmax = sbuf1_i[0];
2488: ct1 = 2 * jmax + 1;
2489: ct2 = 0;
2490: rbuf2_i = rbuf2[tmp2];
2491: rbuf3_i = rbuf3[tmp2];
2492: for (j = 1; j <= jmax; j++) {
2493: is_no = sbuf1_i[2 * j - 1];
2494: max1 = sbuf1_i[2 * j];
2495: lens_i = lens[is_no];
2496: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2497: rmap_i = rmap[is_no];
2498: for (k = 0; k < max1; k++, ct1++) {
2499: #if defined(PETSC_USE_CTABLE)
2500: PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row));
2501: row--;
2502: PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
2503: #else
2504: row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new submatrix */
2505: #endif
2506: max2 = rbuf2_i[ct1];
2507: for (l = 0; l < max2; l++, ct2++) {
2508: if (!allcolumns[is_no]) {
2509: #if defined(PETSC_USE_CTABLE)
2510: PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2511: #else
2512: tcol = cmap_i[rbuf3_i[ct2]];
2513: #endif
2514: if (tcol) lens_i[row]++;
2515: } else { /* allcolumns */
2516: lens_i[row]++; /* lens_i[row] += max2 ? */
2517: }
2518: }
2519: }
2520: }
2521: }
2522: }
2523: PetscCall(PetscFree(r_waits3));
2524: PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE));
2525: PetscCall(PetscFree(s_waits3));
2527: /* Create the submatrices */
2528: for (i = 0; i < ismax; i++) {
2529: PetscInt rbs, cbs;
2531: PetscCall(ISGetBlockSize(isrow[i], &rbs));
2532: PetscCall(ISGetBlockSize(iscol[i], &cbs));
2534: PetscCall(MatCreate(PETSC_COMM_SELF, submats + i));
2535: PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE));
2537: if (rbs > 1 || cbs > 1) PetscCall(MatSetBlockSizes(submats[i], rbs, cbs));
2538: PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name));
2539: PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i]));
2541: /* create struct Mat_SubSppt and attach it to submat */
2542: PetscCall(PetscNew(&smat_i));
2543: subc = (Mat_SeqAIJ *)submats[i]->data;
2544: subc->submatis1 = smat_i;
2546: smat_i->destroy = submats[i]->ops->destroy;
2547: submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2548: submats[i]->factortype = C->factortype;
2550: smat_i->id = i;
2551: smat_i->nrqs = nrqs;
2552: smat_i->nrqr = nrqr;
2553: smat_i->rbuf1 = rbuf1;
2554: smat_i->rbuf2 = rbuf2;
2555: smat_i->rbuf3 = rbuf3;
2556: smat_i->sbuf2 = sbuf2;
2557: smat_i->req_source2 = req_source2;
2559: smat_i->sbuf1 = sbuf1;
2560: smat_i->ptr = ptr;
2561: smat_i->tmp = tmp;
2562: smat_i->ctr = ctr;
2564: smat_i->pa = pa;
2565: smat_i->req_size = req_size;
2566: smat_i->req_source1 = req_source1;
2568: smat_i->allcolumns = allcolumns[i];
2569: smat_i->singleis = PETSC_FALSE;
2570: smat_i->row2proc = row2proc[i];
2571: smat_i->rmap = rmap[i];
2572: smat_i->cmap = cmap[i];
2573: }
2575: if (!ismax) { /* Create a dummy submats[0] to carry the struct needed for reuse */
2576: PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0]));
2577: PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE));
2578: PetscCall(MatSetType(submats[0], MATDUMMY));
2580: /* create struct Mat_SubSppt and attach it to submat */
2581: PetscCall(PetscNew(&smat_i));
2582: submats[0]->data = (void *)smat_i;
2584: smat_i->destroy = submats[0]->ops->destroy;
2585: submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2586: submats[0]->factortype = C->factortype;
2588: smat_i->id = 0;
2589: smat_i->nrqs = nrqs;
2590: smat_i->nrqr = nrqr;
2591: smat_i->rbuf1 = rbuf1;
2592: smat_i->rbuf2 = rbuf2;
2593: smat_i->rbuf3 = rbuf3;
2594: smat_i->sbuf2 = sbuf2;
2595: smat_i->req_source2 = req_source2;
2597: smat_i->sbuf1 = sbuf1;
2598: smat_i->ptr = ptr;
2599: smat_i->tmp = tmp;
2600: smat_i->ctr = ctr;
2602: smat_i->pa = pa;
2603: smat_i->req_size = req_size;
2604: smat_i->req_source1 = req_source1;
2606: smat_i->allcolumns = PETSC_FALSE;
2607: smat_i->singleis = PETSC_FALSE;
2608: smat_i->row2proc = NULL;
2609: smat_i->rmap = NULL;
2610: smat_i->cmap = NULL;
2611: }
2613: if (ismax) PetscCall(PetscFree(lens[0]));
2614: PetscCall(PetscFree(lens));
2615: if (sbuf_aj) {
2616: PetscCall(PetscFree(sbuf_aj[0]));
2617: PetscCall(PetscFree(sbuf_aj));
2618: }
2620: } /* end of scall == MAT_INITIAL_MATRIX */
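  /* Everything stashed in Mat_SubSppt above (request buffers, size arrays, proc lists, row/column
     maps) is what a MAT_REUSE_MATRIX call needs in order to skip the symbolic communication and go
     straight to exchanging matrix values below. */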
2622: /* Post recv matrix values */
2623: PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
2624: PetscCall(PetscMalloc1(nrqs, &rbuf4));
2625: PetscCall(PetscMalloc1(nrqs, &r_waits4));
2626: for (i = 0; i < nrqs; ++i) {
2627: PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
2628: PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
2629: }
2631: /* Allocate sending buffers for a->a, and send them off */
2632: PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
2633: for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
2634: if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0]));
2635: for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];
2637: PetscCall(PetscMalloc1(nrqr, &s_waits4));
2638: {
2639: PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite;
2640: PetscInt cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray;
2641: PetscInt cend = C->cmap->rend;
2642: PetscInt *b_j = b->j;
2643: PetscScalar *vworkA, *vworkB, *a_a, *b_a;
2645: PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
2646: PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a));
2647: for (i = 0; i < nrqr; i++) {
2648: rbuf1_i = rbuf1[i];
2649: sbuf_aa_i = sbuf_aa[i];
2650: ct1 = 2 * rbuf1_i[0] + 1;
2651: ct2 = 0;
2652: for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) {
2653: kmax = rbuf1_i[2 * j];
2654: for (k = 0; k < kmax; k++, ct1++) {
2655: row = rbuf1_i[ct1] - rstart;
2656: nzA = a_i[row + 1] - a_i[row];
2657: nzB = b_i[row + 1] - b_i[row];
2658: ncols = nzA + nzB;
2659: cworkB = b_j ? b_j + b_i[row] : NULL;
2660: vworkA = a_a + a_i[row];
2661: vworkB = b_a ? b_a + b_i[row] : NULL;
2663: /* load the column values for this row into vals */
2664: vals = sbuf_aa_i + ct2;
2666: lwrite = 0;
2667: for (l = 0; l < nzB; l++) {
2668: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2669: }
2670: for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
2671: for (l = 0; l < nzB; l++) {
2672: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2673: }
2675: ct2 += ncols;
2676: }
2677: }
2678: PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
2679: }
2680: PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
2681: PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a));
2682: }
2684: /* Assemble the matrices */
2685: /* First assemble the local rows */
2686: for (i = 0; i < ismax; i++) {
2687: row2proc_i = row2proc[i];
2688: subc = (Mat_SeqAIJ *)submats[i]->data;
2689: imat_ilen = subc->ilen;
2690: imat_j = subc->j;
2691: imat_i = subc->i;
2692: imat_a = subc->a;
2694: if (!allcolumns[i]) cmap_i = cmap[i];
2695: rmap_i = rmap[i];
2696: irow_i = irow[i];
2697: jmax = nrow[i];
2698: for (j = 0; j < jmax; j++) {
2699: row = irow_i[j];
2700: proc = row2proc_i[j];
2701: if (proc == rank) {
2702: old_row = row;
2703: #if defined(PETSC_USE_CTABLE)
2704: PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2705: row--;
2706: #else
2707: row = rmap_i[row];
2708: #endif
2709: ilen_row = imat_ilen[row];
2710: PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2711: mat_i = imat_i[row];
2712: mat_a = imat_a + mat_i;
2713: mat_j = imat_j + mat_i;
2714: if (!allcolumns[i]) {
2715: for (k = 0; k < ncols; k++) {
2716: #if defined(PETSC_USE_CTABLE)
2717: PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol));
2718: #else
2719: tcol = cmap_i[cols[k]];
2720: #endif
2721: if (tcol) {
2722: *mat_j++ = tcol - 1;
2723: *mat_a++ = vals[k];
2724: ilen_row++;
2725: }
2726: }
2727: } else { /* allcolumns */
2728: for (k = 0; k < ncols; k++) {
2729: *mat_j++ = cols[k]; /* global col index! */
2730: *mat_a++ = vals[k];
2731: ilen_row++;
2732: }
2733: }
2734: PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals));
2736: imat_ilen[row] = ilen_row;
2737: }
2738: }
2739: }
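  /* Sketch of the direct insertion used above and below (hypothetical submatrix row r):
     imat_i[r] is the offset of the preallocated row inside imat_j/imat_a, and imat_ilen[r] counts
     how many entries of that row have been filled so far, so appending column c with value v is
       imat_j[imat_i[r] + imat_ilen[r]] = c;
       imat_a[imat_i[r] + imat_ilen[r]] = v;
       imat_ilen[r]++;
     which avoids the per-entry search a MatSetValues() call would perform. */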
2741: /* Now assemble the off-proc rows */
2742: PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE));
2743: for (tmp2 = 0; tmp2 < nrqs; tmp2++) {
2744: sbuf1_i = sbuf1[pa[tmp2]];
2745: jmax = sbuf1_i[0];
2746: ct1 = 2 * jmax + 1;
2747: ct2 = 0;
2748: rbuf2_i = rbuf2[tmp2];
2749: rbuf3_i = rbuf3[tmp2];
2750: rbuf4_i = rbuf4[tmp2];
2751: for (j = 1; j <= jmax; j++) {
2752: is_no = sbuf1_i[2 * j - 1];
2753: rmap_i = rmap[is_no];
2754: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2755: subc = (Mat_SeqAIJ *)submats[is_no]->data;
2756: imat_ilen = subc->ilen;
2757: imat_j = subc->j;
2758: imat_i = subc->i;
2759: imat_a = subc->a;
2760: max1 = sbuf1_i[2 * j];
2761: for (k = 0; k < max1; k++, ct1++) {
2762: row = sbuf1_i[ct1];
2763: #if defined(PETSC_USE_CTABLE)
2764: PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row));
2765: row--;
2766: #else
2767: row = rmap_i[row];
2768: #endif
2769: ilen = imat_ilen[row];
2770: mat_i = imat_i[row];
2771: mat_a = imat_a ? imat_a + mat_i : NULL;
2772: mat_j = imat_j ? imat_j + mat_i : NULL;
2773: max2 = rbuf2_i[ct1];
2774: if (!allcolumns[is_no]) {
2775: for (l = 0; l < max2; l++, ct2++) {
2776: #if defined(PETSC_USE_CTABLE)
2777: PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol));
2778: #else
2779: tcol = cmap_i[rbuf3_i[ct2]];
2780: #endif
2781: if (tcol) {
2782: *mat_j++ = tcol - 1;
2783: *mat_a++ = rbuf4_i[ct2];
2784: ilen++;
2785: }
2786: }
2787: } else { /* allcolumns */
2788: for (l = 0; l < max2; l++, ct2++) {
2789: *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2790: *mat_a++ = rbuf4_i[ct2];
2791: ilen++;
2792: }
2793: }
2794: imat_ilen[row] = ilen;
2795: }
2796: }
2797: }
2799: if (!iscsorted) { /* sort column indices of the rows */
2800: for (i = 0; i < ismax; i++) {
2801: subc = (Mat_SeqAIJ *)submats[i]->data;
2802: imat_j = subc->j;
2803: imat_i = subc->i;
2804: imat_a = subc->a;
2805: imat_ilen = subc->ilen;
2807: if (allcolumns[i]) continue;
2808: jmax = nrow[i];
2809: for (j = 0; j < jmax; j++) {
2810: mat_i = imat_i[j];
2811: mat_a = imat_a + mat_i;
2812: mat_j = imat_j + mat_i;
2813: PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a));
2814: }
2815: }
2816: }
2818: PetscCall(PetscFree(r_waits4));
2819: PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE));
2820: PetscCall(PetscFree(s_waits4));
2822: /* Restore the indices */
2823: for (i = 0; i < ismax; i++) {
2824: PetscCall(ISRestoreIndices(isrow[i], irow + i));
2825: if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i));
2826: }
2828: for (i = 0; i < ismax; i++) {
2829: PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY));
2830: PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY));
2831: }
2833: /* Destroy allocated memory */
2834: if (sbuf_aa) {
2835: PetscCall(PetscFree(sbuf_aa[0]));
2836: PetscCall(PetscFree(sbuf_aa));
2837: }
2838: PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted));
2840: for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
2841: PetscCall(PetscFree(rbuf4));
2843: PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns));
2844: PetscFunctionReturn(PETSC_SUCCESS);
2845: }
2847: /*
2848: Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2849: Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2850: of [0,m), the range of dcolemb is in [0,n), and the range of ocolemb is in [0,N-n).
2851: If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2852: After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2853: state, and needs to be "assembled" later by compressing B's column space.
2855: This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2856: Following this call, C->A & C->B have been created, even if empty.
2857: */
2858: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B)
2859: {
2860: /* If making this function public, change the error returned in this function away from _PLIB. */
2861: Mat_MPIAIJ *aij;
2862: Mat_SeqAIJ *Baij;
2863: PetscBool seqaij, Bdisassembled;
2864: PetscInt m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count;
2865: PetscScalar v;
2866: const PetscInt *rowindices, *colindices;
2868: PetscFunctionBegin;
2869: /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2870: if (A) {
2871: PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij));
2872: PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
2873: if (rowemb) {
2874: PetscCall(ISGetLocalSize(rowemb, &m));
2875: PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n);
2876: } else {
2877: PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2878: }
2879: if (dcolemb) {
2880: PetscCall(ISGetLocalSize(dcolemb, &n));
2881: PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n);
2882: } else {
2883: PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2884: }
2885: }
2886: if (B) {
2887: PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij));
2888: PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
2889: if (rowemb) {
2890: PetscCall(ISGetLocalSize(rowemb, &m));
2891: PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, B->rmap->n);
2892: } else {
2893: PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2894: }
2895: if (ocolemb) {
2896: PetscCall(ISGetLocalSize(ocolemb, &n));
2897: PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n);
2898: } else {
2899: PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2900: }
2901: }
2903: aij = (Mat_MPIAIJ *)C->data;
2904: if (!aij->A) {
2905: /* Mimic parts of MatMPIAIJSetPreallocation() */
2906: PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A));
2907: PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n));
2908: PetscCall(MatSetBlockSizesFromMats(aij->A, C, C));
2909: PetscCall(MatSetType(aij->A, MATSEQAIJ));
2910: }
2911: if (A) {
2912: PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A));
2913: } else {
2914: PetscCall(MatSetUp(aij->A));
2915: }
2916: if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2917: /*
2918: If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2919: need to "disassemble" B -- convert it to using C's global indices.
2920: To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2922: If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2923: we MatZeroEntries(aij->B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2925: TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2926: At least avoid calling MatSetValues() and the implied searches?
2927: */
2929: if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2930: #if defined(PETSC_USE_CTABLE)
2931: PetscCall(PetscHMapIDestroy(&aij->colmap));
2932: #else
2933: PetscCall(PetscFree(aij->colmap));
2934: /* A bit of a HACK: ideally we should deal with the aij->B case all in one code block below. */
2935: #endif
2936: ngcol = 0;
2937: if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol));
2938: if (aij->garray) PetscCall(PetscFree(aij->garray));
2939: PetscCall(VecDestroy(&aij->lvec));
2940: PetscCall(VecScatterDestroy(&aij->Mvctx));
2941: }
2942: if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B));
2943: if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B));
2944: }
2945: Bdisassembled = PETSC_FALSE;
2946: if (!aij->B) {
2947: PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B));
2948: PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N));
2949: PetscCall(MatSetBlockSizesFromMats(aij->B, B, B));
2950: PetscCall(MatSetType(aij->B, MATSEQAIJ));
2951: Bdisassembled = PETSC_TRUE;
2952: }
2953: if (B) {
2954: Baij = (Mat_SeqAIJ *)B->data;
2955: if (pattern == DIFFERENT_NONZERO_PATTERN) {
2956: PetscCall(PetscMalloc1(B->rmap->n, &nz));
2957: for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i];
2958: PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz));
2959: PetscCall(PetscFree(nz));
2960: }
2962: PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend));
2963: shift = rend - rstart;
2964: count = 0;
2965: rowindices = NULL;
2966: colindices = NULL;
2967: if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices));
2968: if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices));
2969: for (i = 0; i < B->rmap->n; i++) {
2970: PetscInt row;
2971: row = i;
2972: if (rowindices) row = rowindices[i];
2973: for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) {
2974: col = Baij->j[count];
2975: if (colindices) col = colindices[col];
2976: if (Bdisassembled && col >= rstart) col += shift;
2977: v = Baij->a[count];
2978: PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES));
2979: ++count;
2980: }
2981: }
2982: /* No assembly for aij->B is necessary. */
2983: /* FIXME: set aij->B's nonzerostate correctly. */
2984: } else {
2985: PetscCall(MatSetUp(aij->B));
2986: }
2987: C->preallocated = PETSC_TRUE;
2988: C->was_assembled = PETSC_FALSE;
2989: C->assembled = PETSC_FALSE;
2990: /*
2991: C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
2992: Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
2993: */
2994: PetscFunctionReturn(PETSC_SUCCESS);
2995: }
2997: /*
2998: B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
2999: */
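/* A small hypothetical example: if C has N = 10 global columns and n = 4 local columns, and the
   off-diagonal block B touches global columns {0, 7, 9}, then garray = {0, 7, 9} and B stores those
   columns under local indices {0, 1, 2}; the global column of an entry B(i, jloc) is garray[jloc]. */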
3000: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B)
3001: {
3002: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data;
3004: PetscFunctionBegin;
3005: PetscAssertPointer(A, 2);
3006: PetscAssertPointer(B, 3);
3007: /* FIXME: make sure C is assembled */
3008: *A = aij->A;
3009: *B = aij->B;
3010: /* Note that we don't incref *A and *B, so be careful! */
3011: PetscFunctionReturn(PETSC_SUCCESS);
3012: }
3014: /*
3015: Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3016: NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3017: */
3018: static PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
3019: {
3020: PetscMPIInt size, flag;
3021: PetscInt i, ii, cismax, ispar;
3022: Mat *A, *B;
3023: IS *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;
3025: PetscFunctionBegin;
3026: if (!ismax) PetscFunctionReturn(PETSC_SUCCESS);
3028: for (i = 0, cismax = 0; i < ismax; ++i) {
3029: PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
3030: PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3031: PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3032: if (size > 1) ++cismax;
3033: }
3035: /*
3036: If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3037: ispar is the maximum of cismax over C's ranks, so it is nonzero iff some rank holds a parallel IS.
3038: */
3039: PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
3040: if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3041: PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
3042: PetscFunctionReturn(PETSC_SUCCESS);
3043: }
3045: /* if (ispar) */
3046: /*
3047: Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3048: These are used to extract the off-diag portion of the resulting parallel matrix.
3049: The row IS for the off-diag portion is the same as for the diag portion,
3050: so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3051: */
3052: PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
3053: PetscCall(PetscMalloc1(cismax, &ciscol_p));
3054: for (i = 0, ii = 0; i < ismax; ++i) {
3055: PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3056: if (size > 1) {
3057: /*
3058: TODO: This is the part that's ***NOT SCALABLE***.
3059: To fix this we need to extract just the indices of C's nonzero columns
3060: that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3061: part of iscol[i] -- without actually computing ciscol[ii]. This also has
3062: to be done without serializing on the IS list, so, most likely, it is best
3063: done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3064: */
3065: PetscCall(ISGetNonlocalIS(iscol[i], &(ciscol[ii])));
3066: /* Now we have to
3067: (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3068: were sorted on each rank, concatenated they might no longer be sorted;
3069: (b) use ISSortPermutation() to construct ciscol_p, the mapping from the
3070: indices in nondecreasing order to the original index positions.
3071: If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3072: (See the worked example after this loop.) */
3073: PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
3074: PetscCall(ISSort(ciscol[ii]));
3075: ++ii;
3076: }
3077: }
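/*
   Worked example (added for illustration, following the description in the comment above): suppose the
   concatenated nonlocal column indices give ciscol[ii] = {7, 3, 5}. ISSort() turns the IS into {3, 5, 7},
   and ISSortPermutation() yields ciscol_p[ii] = {1, 2, 0}, i.e. the k-th entry of the sorted IS originally
   sat at position ciscol_p[ii][k]. Had ciscol[ii] already been strictly increasing, ciscol_p[ii] would be
   NULL and no value permutation would be needed later.
*/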
3078: PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
3079: for (i = 0, ii = 0; i < ismax; ++i) {
3080: PetscInt j, issize;
3081: const PetscInt *indices;
3083: /*
3084: Sort the row and column indices into nondecreasing order, recording the permutations, and reject row or column ISs that contain duplicates.
3085: */
3086: PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
3087: PetscCall(ISSort(isrow[i]));
3088: PetscCall(ISGetLocalSize(isrow[i], &issize));
3089: PetscCall(ISGetIndices(isrow[i], &indices));
3090: for (j = 1; j < issize; ++j) {
3091: PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3092: }
3093: PetscCall(ISRestoreIndices(isrow[i], &indices));
3094: PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
3095: PetscCall(ISSort(iscol[i]));
3096: PetscCall(ISGetLocalSize(iscol[i], &issize));
3097: PetscCall(ISGetIndices(iscol[i], &indices));
3098: for (j = 1; j < issize; ++j) {
3099: PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
3100: }
3101: PetscCall(ISRestoreIndices(iscol[i], &indices));
3102: PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3103: if (size > 1) {
3104: cisrow[ii] = isrow[i];
3105: ++ii;
3106: }
3107: }
3108: /*
3109: Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3110: array of sequential matrices underlying the resulting parallel matrices.
3111: Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3112: contain duplicates.
3114: There are as many diag matrices as there are original index sets. There are only as many parallel
3115: and off-diag matrices as there are parallel (comm size > 1) index sets.
3117: Arrays that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3118: - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3119: and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3120: will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3121: with A[i] and B[ii] extracted from the corresponding MPI submat.
3122: - However, if the rows, A's column indices, or B's column indices are not sorted, the extracted A[i] & B[ii]
3123: will have a different order from what getsubmats_seq expects. To handle this case -- indicated
3124: by a non-NULL isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i] and B[ii] --> BB[ii]
3125: (or retrieve the composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3126: values into the A[i] and B[ii] sitting inside the corresponding submat.
3127: - If no reuse is taking place, then getsubmats_seq will allocate the A&B arrays and create the corresponding
3128: A[i], B[ii], AA[i], or BB[ii] matrices.
3129: */
3130: /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3131: if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));
3133: /* Now obtain the sequential A and B submatrices separately. */
3134: /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3135: PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
3136: PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));
3138: /*
3139: If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3140: matrices A & B have been extracted directly into the parallel matrices containing them, or
3141: simply into the sequential matrix identical with the corresponding A (if size == 1).
3142: Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3143: to have the same sparsity pattern.
3144: Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3145: must be constructed for C. This is done by setseqmat(s).
3146: */
3147: for (i = 0, ii = 0; i < ismax; ++i) {
3148: /*
3149: TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3150: That way we can avoid sorting and computing permutations when reusing.
3151: To this end:
3152: - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3153: - if caching arrays to hold the ISs, make and compose a container for them so that it can
3154: be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3155: */
3156: MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3158: PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
3159: /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3160: if (size > 1) {
3161: if (scall == MAT_INITIAL_MATRIX) {
3162: PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
3163: PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
3164: PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
3165: PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
3166: PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
3167: }
3168: /*
3169: For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3170: */
3171: {
3172: Mat AA = A[i], BB = B[ii];
3174: if (AA || BB) {
3175: PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
3176: PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
3177: PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
3178: }
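/* Only AA (aliasing A[i]) is destroyed here; BB aliases B[ii], which is released collectively below via MatDestroySubMatrices(). */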
3179: PetscCall(MatDestroy(&AA));
3180: }
3181: PetscCall(ISDestroy(ciscol + ii));
3182: PetscCall(ISDestroy(ciscol_p + ii));
3183: ++ii;
3184: } else { /* if (size == 1) */
3185: if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
3186: if (isrow_p[i] || iscol_p[i]) {
3187: PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
3188: PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
3189: /* Otherwise A is extracted straight into (*submat)[i]. */
3190: /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3191: PetscCall(MatDestroy(A + i));
3192: } else (*submat)[i] = A[i];
3193: }
3194: PetscCall(ISDestroy(&isrow_p[i]));
3195: PetscCall(ISDestroy(&iscol_p[i]));
3196: }
3197: PetscCall(PetscFree2(cisrow, ciscol));
3198: PetscCall(PetscFree2(isrow_p, iscol_p));
3199: PetscCall(PetscFree(ciscol_p));
3200: PetscCall(PetscFree(A));
3201: PetscCall(MatDestroySubMatrices(cismax, &B));
3202: PetscFunctionReturn(PETSC_SUCCESS);
3203: }
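/*
   The AIJ instantiation below binds the generic driver above to the MPIAIJ-specific callbacks:
   sequential extraction (MatCreateSubMatrices_MPIAIJ), access to the local diag/off-diag pieces
   (MatGetSeqMats_MPIAIJ), and the embedding of Seq pieces into the parallel result
   (MatSetSeqMat_SeqAIJ / MatSetSeqMats_MPIAIJ).
*/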
3205: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
3206: {
3207: PetscFunctionBegin;
3208: PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
3209: PetscFunctionReturn(PETSC_SUCCESS);
3210: }
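/*
   Usage sketch (added for illustration, not part of the original source): extracting one parallel
   submatrix of an assembled MPIAIJ matrix C through the public MatCreateSubMatricesMPI() entry point,
   which dispatches to the routine above for MPIAIJ. The local counts `nrows`/`ncols` and the index
   arrays `rows`/`cols` are hypothetical placeholders.

     IS   isrow[1], iscol[1];
     Mat *submats;
     PetscCall(ISCreateGeneral(PETSC_COMM_WORLD, nrows, rows, PETSC_COPY_VALUES, &isrow[0]));
     PetscCall(ISCreateGeneral(PETSC_COMM_WORLD, ncols, cols, PETSC_COPY_VALUES, &iscol[0]));
     PetscCall(MatCreateSubMatricesMPI(C, 1, isrow, iscol, MAT_INITIAL_MATRIX, &submats));
     // ... use submats[0]; it is a parallel (MPIAIJ) matrix because the ISs live on PETSC_COMM_WORLD ...
     PetscCall(MatDestroySubMatrices(1, &submats));
     PetscCall(ISDestroy(&isrow[0]));
     PetscCall(ISDestroy(&iscol[0]));
*/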