Actual source code: blas_cyclic.c
1: #include "blas_cyclic.h"
2: #if PetscDefined(HAVE_CXX)
3: #include "cupm/blas_cyclic_cupm.h"
4: #endif
5: #include <petsc/private/vecimpl.h>
6: #include <petsc/private/matimpl.h>
7: #include <petscblaslapack.h>
9: PetscLogEvent AXPBY_Cyc, DMV_Cyc, DSV_Cyc, TRSV_Cyc, GEMV_Cyc, HEMV_Cyc;
11: #define VecCheckAllEntriesFirstRank(a, arg) PetscCheck((a)->map->range[1] == (a)->map->N, PetscObjectComm((PetscObject)(a)), PETSC_ERR_ARG_SIZ, "Vector argument # %d does not have all of its entries on the first rank", (arg))
12: #define MatCheckAllEntriesFirstRank(a, arg) \
13: PetscCheck(((a)->rmap->range[1] == (a)->rmap->N) && ((a)->cmap->range[1] == (a)->cmap->N), PetscObjectComm((PetscObject)(a)), PETSC_ERR_ARG_SIZ, "Matrix argument # %d does not have all of its entries on the first rank", (arg))
15: // Takes y_stride argument because this is also used for updating a row of a MatDense
16: static inline void AXPBY_Private(PetscInt m, PetscScalar alpha, const PetscScalar x[], PetscScalar beta, PetscScalar y[], PetscInt y_stride)
17: {
18: for (PetscInt i = 0; i < m; i++) y[i * y_stride] = alpha * x[i] + beta * y[i * y_stride];
19: }
21: static PetscErrorCode AXPBYCylic_Private(PetscInt m, PetscInt oldest, PetscInt next, PetscScalar alpha, const PetscScalar x[], PetscScalar beta, PetscScalar y[], PetscInt y_stride)
22: {
23: PetscInt i_oldest = oldest % m;
24: PetscInt i_next = ((next - 1) % m) + 1;
26: PetscFunctionBegin;
27: if (next - oldest == m) {
28: AXPBY_Private(m, alpha, x, beta, y, y_stride);
29: } else if (i_next > i_oldest) {
30: AXPBY_Private(i_next - i_oldest, alpha, &x[i_oldest], beta, &y[i_oldest * y_stride], y_stride);
31: } else {
32: AXPBY_Private(i_next, alpha, x, beta, y, y_stride);
33: AXPBY_Private(m - i_oldest, alpha, &x[i_oldest], beta, &y[i_oldest * y_stride], y_stride);
34: }
35: PetscFunctionReturn(PETSC_SUCCESS);
36: }
38: PETSC_INTERN PetscErrorCode VecAXPBYCyclic(PetscInt oldest, PetscInt next, PetscScalar alpha, Vec x, PetscScalar beta, Vec y)
39: {
40: const PetscScalar *x_;
41: PetscScalar *y_;
42: PetscInt m, m_local;
43: PetscMemType x_memtype, y_memtype;
44: PetscBool on_host = PETSC_FALSE;
46: PetscFunctionBegin;
49: PetscCheckSameComm(x, 4, y, 6);
50: VecCheckSameSize(x, 4, y, 6);
51: VecCheckAllEntriesFirstRank(x, 4);
52: VecCheckAllEntriesFirstRank(y, 6);
53: PetscCall(VecGetSize(x, &m));
54: PetscCall(VecGetLocalSize(x, &m_local));
55: if (!m) PetscFunctionReturn(PETSC_SUCCESS);
56: PetscCall(PetscLogEventBegin(AXPBY_Cyc, NULL, NULL, NULL, NULL));
57: PetscCall(VecGetArrayReadAndMemType(x, &x_, &x_memtype));
58: PetscCall(VecGetArrayAndMemType(y, &y_, &y_memtype));
59: if (PetscMemTypeDevice(x_memtype) && PetscMemTypeDevice(y_memtype)) {
60: #if PetscDefined(HAVE_CXX)
61: if (m_local == m) PetscCall(AXPBYCyclic_CUPM_Private(m, oldest, next, alpha, x_, beta, y_, 1));
62: #else
63: SETERRQ(PetscObjectComm((PetscObject)x), PETSC_ERR_PLIB, "Memtype device needs C++ support");
64: #endif
65: } else if (m_local == m) on_host = PETSC_TRUE;
66: PetscCall(VecRestoreArrayReadAndMemType(x, &x_));
67: PetscCall(VecRestoreArrayAndMemType(y, &y_));
68: if (on_host) {
69: PetscCall(VecGetArrayRead(x, &x_));
70: PetscCall(VecGetArray(y, &y_));
71: PetscCall(AXPBYCylic_Private(m, oldest, next, alpha, x_, beta, y_, 1));
72: PetscCall(VecRestoreArray(y, &y_));
73: PetscCall(VecRestoreArrayRead(x, &x_));
74: }
75: PetscCall(PetscLogEventEnd(AXPBY_Cyc, NULL, NULL, NULL, NULL));
76: PetscFunctionReturn(PETSC_SUCCESS);
77: }
79: static inline void DMV_Private(PetscBool hermitian_transpose, PetscInt m, PetscScalar alpha, const PetscScalar A[], const PetscScalar x[], PetscScalar beta, PetscScalar y[])
80: {
81: if (!hermitian_transpose) {
82: for (PetscInt i = 0; i < m; i++) y[i] = alpha * A[i] * x[i] + beta * y[i];
83: } else {
84: for (PetscInt i = 0; i < m; i++) y[i] = alpha * PetscConj(A[i]) * x[i] + beta * y[i];
85: }
86: }
88: static PetscErrorCode DMVCylic_Private(PetscBool hermitian_transpose, PetscInt m, PetscInt oldest, PetscInt next, PetscScalar alpha, const PetscScalar A[], const PetscScalar x[], PetscScalar beta, PetscScalar y[])
89: {
90: PetscInt i_oldest = oldest % m;
91: PetscInt i_next = ((next - 1) % m) + 1;
93: PetscFunctionBegin;
94: if (next - oldest == m) {
95: DMV_Private(hermitian_transpose, m, alpha, A, x, beta, y);
96: } else if (i_next > i_oldest) {
97: DMV_Private(hermitian_transpose, i_next - i_oldest, alpha, &A[i_oldest], &x[i_oldest], beta, &y[i_oldest]);
98: } else {
99: DMV_Private(hermitian_transpose, i_next, alpha, A, x, beta, y);
100: DMV_Private(hermitian_transpose, m - i_oldest, alpha, &A[i_oldest], &x[i_oldest], beta, &y[i_oldest]);
101: }
102: PetscFunctionReturn(PETSC_SUCCESS);
103: }
105: PETSC_INTERN PetscErrorCode VecDMVCyclic(PetscBool hermitian_transpose, PetscInt oldest, PetscInt next, PetscScalar alpha, Vec A, Vec x, PetscScalar beta, Vec y)
106: {
107: const PetscScalar *A_;
108: const PetscScalar *x_;
109: PetscScalar *y_;
110: PetscInt m, m_local;
111: PetscMemType A_memtype, x_memtype, y_memtype;
112: PetscBool on_host = PETSC_FALSE;
114: PetscFunctionBegin;
118: PetscCheckSameComm(A, 5, x, 6);
119: PetscCheckSameComm(A, 5, y, 8);
120: VecCheckSameSize(A, 5, x, 6);
121: VecCheckSameSize(A, 5, y, 8);
122: VecCheckAllEntriesFirstRank(A, 5);
123: VecCheckAllEntriesFirstRank(x, 6);
124: VecCheckAllEntriesFirstRank(y, 8);
125: PetscCall(VecGetSize(A, &m));
126: PetscCall(VecGetLocalSize(A, &m_local));
127: if (!m) PetscFunctionReturn(PETSC_SUCCESS);
128: PetscCall(PetscLogEventBegin(DMV_Cyc, NULL, NULL, NULL, NULL));
129: PetscCall(VecGetArrayReadAndMemType(A, &A_, &A_memtype));
130: PetscCall(VecGetArrayReadAndMemType(x, &x_, &x_memtype));
131: PetscCall(VecGetArrayAndMemType(y, &y_, &y_memtype));
132: if (PetscMemTypeDevice(A_memtype) && PetscMemTypeDevice(x_memtype) && PetscMemTypeDevice(y_memtype)) {
133: #if PetscDefined(HAVE_CXX)
134: if (m_local == m) PetscCall(DMVCyclic_CUPM_Private(hermitian_transpose, m, oldest, next, alpha, A_, x_, beta, y_));
135: #else
136: SETERRQ(PetscObjectComm((PetscObject)x), PETSC_ERR_PLIB, "Memtype device needs C++ support");
137: #endif
138: } else if (m_local == m) on_host = PETSC_TRUE;
139: PetscCall(VecRestoreArrayAndMemType(y, &y_));
140: PetscCall(VecRestoreArrayReadAndMemType(x, &x_));
141: PetscCall(VecRestoreArrayReadAndMemType(A, &A_));
142: if (on_host) {
143: PetscCall(VecGetArrayRead(A, &A_));
144: PetscCall(VecGetArrayRead(x, &x_));
145: PetscCall(VecGetArray(y, &y_));
146: PetscCall(DMVCylic_Private(hermitian_transpose, m, oldest, next, alpha, A_, x_, beta, y_));
147: PetscCall(VecRestoreArray(y, &y_));
148: PetscCall(VecRestoreArrayRead(x, &x_));
149: PetscCall(VecRestoreArrayRead(A, &A_));
150: }
151: PetscCall(PetscLogEventEnd(DMV_Cyc, NULL, NULL, NULL, NULL));
152: PetscFunctionReturn(PETSC_SUCCESS);
153: }
155: static inline void DSV_Private(PetscBool hermitian_transpose, PetscInt m, const PetscScalar A[], const PetscScalar x[], PetscScalar y[])
156: {
157: if (x != y) {
158: if (!hermitian_transpose) {
159: for (PetscInt i = 0; i < m; i++) y[i] = x[i] / A[i];
160: } else {
161: for (PetscInt i = 0; i < m; i++) y[i] = x[i] / PetscConj(A[i]);
162: }
163: } else {
164: if (!hermitian_transpose) {
165: for (PetscInt i = 0; i < m; i++) y[i] = y[i] / A[i];
166: } else {
167: for (PetscInt i = 0; i < m; i++) y[i] = y[i] / PetscConj(A[i]);
168: }
169: }
170: }
172: static PetscErrorCode DSVCyclic_Private(PetscBool hermitian_transpose, PetscInt m, PetscInt oldest, PetscInt next, const PetscScalar A[], const PetscScalar x[], PetscScalar y[])
173: {
174: PetscInt i_oldest = oldest % m;
175: PetscInt i_next = ((next - 1) % m) + 1;
177: PetscFunctionBegin;
178: if (next - oldest == m) {
179: DSV_Private(hermitian_transpose, m, A, x, y);
180: } else if (i_next > i_oldest) {
181: DSV_Private(hermitian_transpose, i_next - i_oldest, &A[i_oldest], &x[i_oldest], &y[i_oldest]);
182: } else {
183: DSV_Private(hermitian_transpose, i_next, A, x, y);
184: DSV_Private(hermitian_transpose, m - i_oldest, &A[i_oldest], &x[i_oldest], &y[i_oldest]);
185: }
186: PetscFunctionReturn(PETSC_SUCCESS);
187: }
189: PETSC_INTERN PetscErrorCode VecDSVCyclic(PetscBool hermitian_transpose, PetscInt oldest, PetscInt next, Vec A, Vec x, Vec y)
190: {
191: const PetscScalar *A_;
192: const PetscScalar *x_ = NULL;
193: PetscScalar *y_;
194: PetscInt m, m_local;
195: PetscMemType A_memtype, x_memtype, y_memtype;
196: PetscBool on_host = PETSC_FALSE;
198: PetscFunctionBegin;
202: PetscCheckSameComm(A, 4, x, 5);
203: PetscCheckSameComm(A, 4, y, 6);
204: VecCheckSameSize(A, 4, x, 5);
205: VecCheckSameSize(A, 4, y, 6);
206: VecCheckAllEntriesFirstRank(A, 4);
207: VecCheckAllEntriesFirstRank(x, 5);
208: VecCheckAllEntriesFirstRank(y, 6);
209: PetscCall(VecGetSize(A, &m));
210: PetscCall(VecGetLocalSize(A, &m_local));
211: if (!m) PetscFunctionReturn(PETSC_SUCCESS);
212: PetscCall(PetscLogEventBegin(DSV_Cyc, NULL, NULL, NULL, NULL));
213: PetscCall(VecGetArrayReadAndMemType(A, &A_, &A_memtype));
214: PetscCall(VecGetArrayAndMemType(y, &y_, &y_memtype));
215: if (x == y) {
216: x_ = y_;
217: x_memtype = y_memtype;
218: } else {
219: PetscCall(VecGetArrayReadAndMemType(x, &x_, &x_memtype));
220: }
221: if (PetscMemTypeDevice(A_memtype) && PetscMemTypeDevice(x_memtype) && PetscMemTypeDevice(y_memtype)) {
222: #if PetscDefined(HAVE_CXX)
223: if (m_local == m) PetscCall(DSVCyclic_CUPM_Private(hermitian_transpose, m, oldest, next, A_, x_, y_));
224: #else
225: SETERRQ(PetscObjectComm((PetscObject)x), PETSC_ERR_PLIB, "Memtype device needs C++ support");
226: #endif
227: } else if (m_local == m) on_host = PETSC_TRUE;
228: if (x != y) PetscCall(VecRestoreArrayReadAndMemType(x, &x_));
229: PetscCall(VecRestoreArrayAndMemType(y, &y_));
230: PetscCall(VecRestoreArrayReadAndMemType(A, &A_));
231: if (on_host) {
232: PetscCall(VecGetArrayRead(A, &A_));
233: PetscCall(VecGetArray(y, &y_));
234: if (x == y) {
235: x_ = y_;
236: } else {
237: PetscCall(VecGetArrayRead(x, &x_));
238: }
239: PetscCall(DSVCyclic_Private(hermitian_transpose, m, oldest, next, A_, x_, y_));
240: if (x != y) PetscCall(VecRestoreArrayRead(x, &x_));
241: PetscCall(VecRestoreArray(y, &y_));
242: PetscCall(VecRestoreArrayRead(A, &A_));
243: }
244: PetscCall(PetscLogEventEnd(DSV_Cyc, NULL, NULL, NULL, NULL));
245: PetscFunctionReturn(PETSC_SUCCESS);
246: }
248: static PetscErrorCode TRSVCyclic_Private(PetscBool hermitian_transpose, PetscInt m, PetscInt oldest, PetscInt next, const PetscScalar A[], PetscInt lda, const PetscScalar x[], PetscScalar y[])
249: {
250: PetscBLASInt b_one = 1, blda, bm;
251: PetscBLASInt i_oldest, i_next;
252: PetscScalar minus_one = -1.0, one = 1.0;
254: PetscFunctionBegin;
255: PetscCall(PetscBLASIntCast(lda, &blda));
256: PetscCall(PetscBLASIntCast(m, &bm));
257: PetscCall(PetscBLASIntCast(oldest % m, &i_oldest));
258: PetscCall(PetscBLASIntCast(((next - 1) % m) + 1, &i_next));
259: if (i_next > i_oldest) {
260: PetscBLASInt bn = i_next - i_oldest;
261: const char *trans = hermitian_transpose ? "C" : "N";
263: if (x != y) PetscCall(PetscArraycpy(&y[i_oldest], &x[i_oldest], bn));
264: PetscCallBLAS("BLAStrsv", BLAStrsv_("U", trans, "N", &bn, &A[i_oldest * (lda + 1)], &blda, &y[i_oldest], &b_one));
265: } else {
266: PetscBLASInt bn = bm - i_oldest;
267: if (x != y) {
268: PetscCall(PetscArraycpy(y, x, i_next));
269: PetscCall(PetscArraycpy(&y[i_oldest], &x[i_oldest], bn));
270: }
271: if (!hermitian_transpose) {
272: if (i_next > 0) PetscCallBLAS("BLAStrsv", BLAStrsv_("U", "N", "N", &i_next, A, &blda, y, &b_one));
273: if (i_next > 0 && bn > 0) PetscCallBLAS("BLASgemv", BLASgemv_("N", &bn, &i_next, &minus_one, &A[i_oldest], &blda, y, &b_one, &one, &y[i_oldest], &b_one));
274: if (bn > 0) PetscCallBLAS("BLAStrsv", BLAStrsv_("U", "N", "N", &bn, &A[i_oldest * (lda + 1)], &blda, &y[i_oldest], &b_one));
275: } else {
276: if (bn > 0) PetscCallBLAS("BLAStrsv", BLAStrsv_("U", "C", "N", &bn, &A[i_oldest * (lda + 1)], &blda, &y[i_oldest], &b_one));
277: if (i_next > 0 && bn > 0) PetscCallBLAS("BLASgemv", BLASgemv_("C", &bn, &i_next, &minus_one, &A[i_oldest], &blda, &y[i_oldest], &b_one, &one, y, &b_one));
278: if (i_next > 0) PetscCallBLAS("BLAStrsv", BLAStrsv_("U", "C", "N", &i_next, A, &blda, y, &b_one));
279: }
280: }
281: PetscFunctionReturn(PETSC_SUCCESS);
282: }
284: PETSC_INTERN PetscErrorCode MatSeqDenseTRSVCyclic(PetscBool hermitian_transpose, PetscInt oldest, PetscInt next, Mat A, Vec x, Vec y)
285: {
286: const PetscScalar *A_;
287: const PetscScalar *x_ = NULL;
288: PetscScalar *y_;
289: PetscInt m, m_local, lda;
290: PetscMemType A_memtype, x_memtype, y_memtype;
291: PetscBool on_host = PETSC_FALSE;
293: PetscFunctionBegin;
297: PetscCheckSameComm(A, 4, x, 5);
298: PetscCheckSameComm(A, 4, y, 6);
299: VecCheckMatCompatible(A, x, 5, y, 6);
300: MatCheckAllEntriesFirstRank(A, 4);
301: VecCheckAllEntriesFirstRank(x, 5);
302: VecCheckAllEntriesFirstRank(y, 6);
303: PetscCall(VecGetSize(x, &m));
304: PetscCall(VecGetLocalSize(x, &m_local));
305: if (!m) PetscFunctionReturn(PETSC_SUCCESS);
306: PetscCall(PetscLogEventBegin(TRSV_Cyc, NULL, NULL, NULL, NULL));
307: PetscCall(MatDenseGetLDA(A, &lda));
308: PetscCall(MatDenseGetArrayReadAndMemType(A, &A_, &A_memtype));
309: PetscCall(VecGetArrayAndMemType(y, &y_, &y_memtype));
310: if (x == y) {
311: x_ = y_;
312: x_memtype = y_memtype;
313: } else {
314: PetscCall(VecGetArrayReadAndMemType(x, &x_, &x_memtype));
315: }
316: if (PetscMemTypeDevice(A_memtype) && PetscMemTypeDevice(x_memtype) && PetscMemTypeDevice(y_memtype)) {
317: #if PetscDefined(HAVE_CXX)
318: if (m_local == m) PetscCall(TRSVCyclic_CUPM_Private(hermitian_transpose, m, oldest, next, A_, lda, x_, y_));
319: #else
320: SETERRQ(PetscObjectComm((PetscObject)x), PETSC_ERR_PLIB, "Memtype device needs C++ support");
321: #endif
322: } else if (m_local == m) on_host = PETSC_TRUE;
323: if (x != y) PetscCall(VecRestoreArrayReadAndMemType(x, &x_));
324: PetscCall(VecRestoreArrayAndMemType(y, &y_));
325: PetscCall(MatDenseRestoreArrayReadAndMemType(A, &A_));
326: if (on_host) {
327: PetscCall(MatDenseGetArrayRead(A, &A_));
328: PetscCall(VecGetArray(y, &y_));
329: if (x == y) {
330: x_ = y_;
331: } else {
332: PetscCall(VecGetArrayRead(x, &x_));
333: }
334: PetscCall(TRSVCyclic_Private(hermitian_transpose, m, oldest, next, A_, lda, x_, y_));
335: if (x != y) PetscCall(VecRestoreArrayRead(x, &x_));
336: PetscCall(VecRestoreArray(y, &y_));
337: PetscCall(MatDenseRestoreArrayRead(A, &A_));
338: }
339: PetscCall(PetscLogEventEnd(TRSV_Cyc, NULL, NULL, NULL, NULL));
340: PetscFunctionReturn(PETSC_SUCCESS);
341: }
343: static PetscErrorCode HEMVCyclic_Private(PetscInt m, PetscInt oldest, PetscInt next, PetscScalar alpha, const PetscScalar A[], PetscInt lda, const PetscScalar x[], PetscScalar beta, PetscScalar y[])
344: {
345: PetscBLASInt b_one = 1, blda, bm;
346: PetscBLASInt i_oldest, i_next;
347: PetscScalar one = 1.0;
349: PetscFunctionBegin;
350: PetscCall(PetscBLASIntCast(lda, &blda));
351: PetscCall(PetscBLASIntCast(m, &bm));
352: PetscCall(PetscBLASIntCast(oldest % m, &i_oldest));
353: PetscCall(PetscBLASIntCast(((next - 1) % m) + 1, &i_next));
354: if (i_next > i_oldest) {
355: PetscBLASInt bn = i_next - i_oldest;
357: PetscCallBLAS("BLAShemv", BLAShemv_("U", &bn, &alpha, &A[i_oldest * (lda + 1)], &blda, &x[i_oldest], &b_one, &beta, &y[i_oldest], &b_one));
358: } else {
359: PetscBLASInt bn = bm - i_oldest;
360: if (i_next > 0) PetscCallBLAS("BLAShemv", BLAShemv_("U", &i_next, &alpha, A, &blda, x, &b_one, &beta, y, &b_one));
361: if (bn > 0) PetscCallBLAS("BLAShemv", BLAShemv_("U", &bn, &alpha, &A[i_oldest * (lda + 1)], &blda, &x[i_oldest], &b_one, &beta, &y[i_oldest], &b_one));
362: if (i_next > 0 && bn > 0) {
363: PetscCallBLAS("BLASgemv", BLASgemv_("N", &bn, &i_next, &alpha, &A[i_oldest], &blda, x, &b_one, &one, &y[i_oldest], &b_one));
364: PetscCallBLAS("BLASgemv", BLASgemv_("C", &bn, &i_next, &alpha, &A[i_oldest], &blda, &x[i_oldest], &b_one, &one, y, &b_one));
365: }
366: }
367: PetscFunctionReturn(PETSC_SUCCESS);
368: }
370: PETSC_INTERN PetscErrorCode MatSeqDenseHEMVCyclic(PetscInt oldest, PetscInt next, PetscScalar alpha, Mat A, Vec x, PetscScalar beta, Vec y)
371: {
372: const PetscScalar *A_;
373: const PetscScalar *x_ = NULL;
374: PetscScalar *y_;
375: PetscInt m, m_local, lda;
376: PetscMemType A_memtype, x_memtype, y_memtype;
377: PetscBool on_host = PETSC_FALSE;
379: PetscFunctionBegin;
383: PetscCheckSameComm(A, 4, x, 5);
384: PetscCheckSameComm(A, 4, y, 7);
385: VecCheckMatCompatible(A, x, 5, y, 7);
386: MatCheckAllEntriesFirstRank(A, 4);
387: VecCheckAllEntriesFirstRank(x, 5);
388: VecCheckAllEntriesFirstRank(y, 7);
389: PetscCall(VecGetSize(x, &m));
390: PetscCall(VecGetLocalSize(x, &m_local));
391: if (!m) PetscFunctionReturn(PETSC_SUCCESS);
392: PetscCall(PetscLogEventBegin(HEMV_Cyc, NULL, NULL, NULL, NULL));
393: PetscCall(MatDenseGetLDA(A, &lda));
394: PetscCall(MatDenseGetArrayReadAndMemType(A, &A_, &A_memtype));
395: PetscCall(VecGetArrayReadAndMemType(x, &x_, &x_memtype));
396: PetscCall(VecGetArrayAndMemType(y, &y_, &y_memtype));
397: if (PetscMemTypeDevice(A_memtype) && PetscMemTypeDevice(x_memtype) && PetscMemTypeDevice(y_memtype)) {
398: #if PetscDefined(HAVE_CXX)
399: if (m_local == m) PetscCall(HEMVCyclic_CUPM_Private(m, oldest, next, alpha, A_, lda, x_, beta, y_));
400: #else
401: SETERRQ(PetscObjectComm((PetscObject)x), PETSC_ERR_PLIB, "Memtype device needs C++ support");
402: #endif
403: } else if (m_local == m) on_host = PETSC_TRUE;
404: PetscCall(VecRestoreArrayAndMemType(y, &y_));
405: PetscCall(VecRestoreArrayReadAndMemType(x, &x_));
406: PetscCall(MatDenseRestoreArrayReadAndMemType(A, &A_));
407: if (on_host) {
408: PetscCall(MatDenseGetArrayRead(A, &A_));
409: PetscCall(VecGetArrayRead(x, &x_));
410: PetscCall(VecGetArray(y, &y_));
411: PetscCall(HEMVCyclic_Private(m, oldest, next, alpha, A_, lda, x_, beta, y_));
412: PetscCall(VecRestoreArray(y, &y_));
413: PetscCall(VecRestoreArrayRead(x, &x_));
414: PetscCall(MatDenseRestoreArrayRead(A, &A_));
415: }
416: PetscCall(PetscLogEventEnd(HEMV_Cyc, NULL, NULL, NULL, NULL));
417: PetscFunctionReturn(PETSC_SUCCESS);
418: }
420: static PetscErrorCode GEMVCyclic_Private(PetscBool hermitian_transpose, PetscInt m, PetscInt oldest, PetscInt next, PetscScalar alpha, const PetscScalar A[], PetscInt lda, const PetscScalar x[], PetscScalar beta, PetscScalar y[])
421: {
422: PetscBLASInt b_one = 1, blda, bm;
423: PetscBLASInt i_oldest, i_next;
424: PetscScalar one = 1.0;
425: const char *trans = hermitian_transpose ? "C" : "N";
427: PetscFunctionBegin;
428: PetscCall(PetscBLASIntCast(lda, &blda));
429: PetscCall(PetscBLASIntCast(m, &bm));
430: PetscCall(PetscBLASIntCast(oldest % m, &i_oldest));
431: PetscCall(PetscBLASIntCast(((next - 1) % m) + 1, &i_next));
432: if (next - oldest == m) {
433: PetscCallBLAS("BLASgemv", BLASgemv_(trans, &bm, &bm, &alpha, A, &blda, x, &b_one, &beta, y, &b_one));
434: } else if (i_next > i_oldest) {
435: PetscBLASInt bn = i_next - i_oldest;
437: PetscCallBLAS("BLASgemv", BLASgemv_(trans, &bn, &bn, &alpha, &A[i_oldest * (lda + 1)], &blda, &x[i_oldest], &b_one, &beta, &y[i_oldest], &b_one));
438: } else {
439: PetscBLASInt bn = bm - i_oldest;
440: if (i_next > 0) PetscCallBLAS("BLASgemv", BLASgemv_(trans, &i_next, &i_next, &alpha, A, &blda, x, &b_one, &beta, y, &b_one));
441: if (bn > 0) PetscCallBLAS("BLASgemv", BLASgemv_(trans, &bn, &bn, &alpha, &A[i_oldest * (lda + 1)], &blda, &x[i_oldest], &b_one, &beta, &y[i_oldest], &b_one));
442: if (i_next > 0 && bn > 0) {
443: if (!hermitian_transpose) {
444: PetscCallBLAS("BLASgemv", BLASgemv_("N", &bn, &i_next, &alpha, &A[i_oldest], &blda, x, &b_one, &one, &y[i_oldest], &b_one));
445: PetscCallBLAS("BLASgemv", BLASgemv_("N", &i_next, &bn, &alpha, &A[i_oldest * lda], &blda, &x[i_oldest], &b_one, &one, y, &b_one));
446: } else {
447: PetscCallBLAS("BLASgemv", BLASgemv_("C", &i_next, &bn, &alpha, &A[i_oldest * lda], &blda, x, &b_one, &one, &y[i_oldest], &b_one));
448: PetscCallBLAS("BLASgemv", BLASgemv_("C", &bn, &i_next, &alpha, &A[i_oldest], &blda, &x[i_oldest], &b_one, &one, y, &b_one));
449: }
450: }
451: }
452: PetscFunctionReturn(PETSC_SUCCESS);
453: }
455: PETSC_INTERN PetscErrorCode MatSeqDenseGEMVCyclic(PetscBool hermitian_transpose, PetscInt oldest, PetscInt next, PetscScalar alpha, Mat A, Vec x, PetscScalar beta, Vec y)
456: {
457: const PetscScalar *A_;
458: const PetscScalar *x_ = NULL;
459: PetscScalar *y_;
460: PetscInt m, m_local, lda;
461: PetscMemType A_memtype, x_memtype, y_memtype;
462: PetscBool on_host = PETSC_FALSE;
464: PetscFunctionBegin;
468: PetscCheckSameComm(A, 5, x, 6);
469: PetscCheckSameComm(A, 5, y, 8);
470: VecCheckMatCompatible(A, x, 6, y, 8);
471: MatCheckAllEntriesFirstRank(A, 5);
472: VecCheckAllEntriesFirstRank(x, 6);
473: VecCheckAllEntriesFirstRank(y, 8);
474: PetscCall(VecGetSize(x, &m));
475: PetscCall(VecGetLocalSize(x, &m_local));
476: if (!m) PetscFunctionReturn(PETSC_SUCCESS);
477: PetscCall(PetscLogEventBegin(GEMV_Cyc, NULL, NULL, NULL, NULL));
478: PetscCall(MatDenseGetLDA(A, &lda));
479: PetscCall(MatDenseGetArrayReadAndMemType(A, &A_, &A_memtype));
480: PetscCall(VecGetArrayReadAndMemType(x, &x_, &x_memtype));
481: PetscCall(VecGetArrayAndMemType(y, &y_, &y_memtype));
482: if (PetscMemTypeDevice(A_memtype) && PetscMemTypeDevice(x_memtype) && PetscMemTypeDevice(y_memtype)) {
483: #if PetscDefined(HAVE_CXX)
484: if (m_local == m) PetscCall(GEMVCyclic_CUPM_Private(hermitian_transpose, m, oldest, next, alpha, A_, lda, x_, beta, y_));
485: #else
486: SETERRQ(PetscObjectComm((PetscObject)x), PETSC_ERR_PLIB, "Memtype device needs C++ support");
487: #endif
488: } else if (m_local == m) on_host = PETSC_TRUE;
489: PetscCall(VecRestoreArrayAndMemType(y, &y_));
490: PetscCall(VecRestoreArrayReadAndMemType(x, &x_));
491: PetscCall(MatDenseRestoreArrayReadAndMemType(A, &A_));
492: if (on_host) {
493: PetscCall(MatDenseGetArrayRead(A, &A_));
494: PetscCall(VecGetArrayRead(x, &x_));
495: PetscCall(VecGetArray(y, &y_));
496: PetscCall(GEMVCyclic_Private(hermitian_transpose, m, oldest, next, alpha, A_, lda, x_, beta, y_));
497: PetscCall(VecRestoreArray(y, &y_));
498: PetscCall(VecRestoreArrayRead(x, &x_));
499: PetscCall(MatDenseRestoreArrayRead(A, &A_));
500: }
501: PetscCall(PetscLogEventEnd(GEMV_Cyc, NULL, NULL, NULL, NULL));
502: PetscFunctionReturn(PETSC_SUCCESS);
503: }
505: PETSC_INTERN PetscErrorCode MatSeqDenseRowAXPBYCyclic(PetscInt oldest, PetscInt next, PetscScalar alpha, Vec x, PetscScalar beta, Mat Y, PetscInt row)
506: {
507: const PetscScalar *x_ = NULL;
508: PetscScalar *y_;
509: PetscInt m, m_local, ldy;
510: PetscMemType x_memtype, y_memtype;
511: PetscBool on_host = PETSC_FALSE;
513: PetscFunctionBegin;
516: PetscCheckSameComm(x, 4, Y, 6);
517: VecCheckMatCompatible(Y, x, 4, x, 4);
518: VecCheckAllEntriesFirstRank(x, 4);
519: MatCheckAllEntriesFirstRank(Y, 6);
520: PetscCall(VecGetSize(x, &m));
521: PetscCall(VecGetLocalSize(x, &m_local));
522: if (!m) PetscFunctionReturn(PETSC_SUCCESS);
523: PetscCall(PetscLogEventBegin(AXPBY_Cyc, NULL, NULL, NULL, NULL));
524: PetscCall(MatDenseGetLDA(Y, &ldy));
525: PetscCall(VecGetArrayReadAndMemType(x, &x_, &x_memtype));
526: PetscCall(MatDenseGetArrayAndMemType(Y, &y_, &y_memtype));
527: if (PetscMemTypeDevice(x_memtype) && PetscMemTypeDevice(y_memtype)) {
528: #if PetscDefined(HAVE_CXX)
529: if (m_local == m) PetscCall(AXPBYCyclic_CUPM_Private(m, oldest, next, alpha, x_, beta, &y_[row % m], ldy));
530: #else
531: SETERRQ(PetscObjectComm((PetscObject)x), PETSC_ERR_PLIB, "Memtype device needs C++ support");
532: #endif
533: } else if (m_local == m) on_host = PETSC_TRUE;
534: PetscCall(MatDenseRestoreArrayAndMemType(Y, &y_));
535: PetscCall(VecRestoreArrayReadAndMemType(x, &x_));
536: if (on_host) {
537: PetscCall(VecGetArrayRead(x, &x_));
538: PetscCall(MatDenseGetArray(Y, &y_));
539: PetscCall(AXPBYCylic_Private(m, oldest, next, alpha, x_, beta, &y_[row % m], ldy));
540: PetscCall(MatDenseRestoreArray(Y, &y_));
541: PetscCall(VecRestoreArrayRead(x, &x_));
542: }
543: PetscCall(PetscLogEventEnd(AXPBY_Cyc, NULL, NULL, NULL, NULL));
544: PetscFunctionReturn(PETSC_SUCCESS);
545: }
547: PETSC_INTERN PetscErrorCode MatMultColumnRange(Mat A, Vec xx, Vec yy, PetscInt c_start, PetscInt c_end)
548: {
549: PetscFunctionBegin;
550: PetscCall(PetscLogEventBegin(MAT_Mult, A, NULL, NULL, NULL));
551: PetscUseMethod(A, "MatMultColumnRange_C", (Mat, Vec, Vec, PetscInt, PetscInt), (A, xx, yy, c_start, c_end));
552: PetscCall(PetscLogEventEnd(MAT_Mult, A, NULL, NULL, NULL));
553: PetscFunctionReturn(PETSC_SUCCESS);
554: }
556: PETSC_INTERN PetscErrorCode MatMultAddColumnRange(Mat A, Vec xx, Vec zz, Vec yy, PetscInt c_start, PetscInt c_end)
557: {
558: PetscFunctionBegin;
559: PetscCall(PetscLogEventBegin(MAT_MultAdd, A, NULL, NULL, NULL));
560: PetscUseMethod(A, "MatMultAddColumnRange_C", (Mat, Vec, Vec, Vec, PetscInt, PetscInt), (A, xx, zz, yy, c_start, c_end));
561: PetscCall(PetscLogEventEnd(MAT_MultAdd, A, NULL, NULL, NULL));
562: PetscFunctionReturn(PETSC_SUCCESS);
563: }
565: PETSC_INTERN PetscErrorCode MatMultHermitianTransposeColumnRange(Mat A, Vec xx, Vec yy, PetscInt c_start, PetscInt c_end)
566: {
567: PetscFunctionBegin;
568: PetscCall(PetscLogEventBegin(MAT_MultHermitianTranspose, A, NULL, NULL, NULL));
569: PetscUseMethod(A, "MatMultHermitianTransposeColumnRange_C", (Mat, Vec, Vec, PetscInt, PetscInt), (A, xx, yy, c_start, c_end));
570: PetscCall(PetscLogEventEnd(MAT_MultHermitianTranspose, A, NULL, NULL, NULL));
571: PetscFunctionReturn(PETSC_SUCCESS);
572: }
574: PETSC_INTERN PetscErrorCode MatMultHermitianTransposeAddColumnRange(Mat A, Vec xx, Vec zz, Vec yy, PetscInt c_start, PetscInt c_end)
575: {
576: PetscFunctionBegin;
577: PetscCall(PetscLogEventBegin(MAT_MultHermitianTransposeAdd, A, NULL, NULL, NULL));
578: PetscUseMethod(A, "MatMultHermitianTransposeAddColumnRange_C", (Mat, Vec, Vec, Vec, PetscInt, PetscInt), (A, xx, zz, yy, c_start, c_end));
579: PetscCall(PetscLogEventEnd(MAT_MultHermitianTransposeAdd, A, NULL, NULL, NULL));
580: PetscFunctionReturn(PETSC_SUCCESS);
581: }