Actual source code: dvec2.c
1: /*
2: Defines some vector operation functions that are shared by
3: sequential and parallel vectors.
4: */
5: #include <../src/vec/vec/impls/dvecimpl.h>
6: #include <petsc/private/kernels/petscaxpy.h>
8: #if defined(PETSC_USE_FORTRAN_KERNEL_MDOT)
9: #include <../src/vec/vec/impls/seq/ftn-kernels/fmdot.h>
10: PetscErrorCode VecMDot_Seq(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
11: {
12: const PetscInt n = xin->map->n;
13: PetscInt i = nv, nv_rem = nv & 0x3;
14: PetscScalar sum0 = 0., sum1 = 0., sum2 = 0., sum3;
15: const PetscScalar *yy0, *yy1, *yy2, *yy3, *x;
16: Vec *yy = (Vec *)yin;
18: PetscFunctionBegin;
19: PetscCall(VecGetArrayRead(xin, &x));
20: switch (nv_rem) {
21: case 3:
22: PetscCall(VecGetArrayRead(yy[0], &yy0));
23: PetscCall(VecGetArrayRead(yy[1], &yy1));
24: PetscCall(VecGetArrayRead(yy[2], &yy2));
25: fortranmdot3_(x, yy0, yy1, yy2, &n, &sum0, &sum1, &sum2);
26: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
27: PetscCall(VecRestoreArrayRead(yy[1], &yy1));
28: PetscCall(VecRestoreArrayRead(yy[2], &yy2));
29: z[0] = sum0;
30: z[1] = sum1;
31: z[2] = sum2;
32: break;
33: case 2:
34: PetscCall(VecGetArrayRead(yy[0], &yy0));
35: PetscCall(VecGetArrayRead(yy[1], &yy1));
36: fortranmdot2_(x, yy0, yy1, &n, &sum0, &sum1);
37: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
38: PetscCall(VecRestoreArrayRead(yy[1], &yy1));
39: z[0] = sum0;
40: z[1] = sum1;
41: break;
42: case 1:
43: PetscCall(VecGetArrayRead(yy[0], &yy0));
44: fortranmdot1_(x, yy0, &n, &sum0);
45: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
46: z[0] = sum0;
47: break;
48: case 0:
49: break;
50: }
51: z += nv_rem;
52: i -= nv_rem;
53: yy += nv_rem;
55: while (i > 0) {
56: sum0 = 0.;
57: sum1 = 0.;
58: sum2 = 0.;
59: sum3 = 0.;
60: PetscCall(VecGetArrayRead(yy[0], &yy0));
61: PetscCall(VecGetArrayRead(yy[1], &yy1));
62: PetscCall(VecGetArrayRead(yy[2], &yy2));
63: PetscCall(VecGetArrayRead(yy[3], &yy3));
64: fortranmdot4_(x, yy0, yy1, yy2, yy3, &n, &sum0, &sum1, &sum2, &sum3);
65: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
66: PetscCall(VecRestoreArrayRead(yy[1], &yy1));
67: PetscCall(VecRestoreArrayRead(yy[2], &yy2));
68: PetscCall(VecRestoreArrayRead(yy[3], &yy3));
69: yy += 4;
70: z[0] = sum0;
71: z[1] = sum1;
72: z[2] = sum2;
73: z[3] = sum3;
74: z += 4;
75: i -= 4;
76: }
77: PetscCall(VecRestoreArrayRead(xin, &x));
78: PetscCall(PetscLogFlops(PetscMax(nv * (2.0 * n - 1), 0.0)));
79: PetscFunctionReturn(PETSC_SUCCESS);
80: }
82: #else
83: PetscErrorCode VecMDot_Seq(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
84: {
85: const PetscInt n = xin->map->n;
86: PetscInt i = nv, j = n, nv_rem = nv & 0x3, j_rem;
87: PetscScalar sum0 = 0.0, sum1 = 0.0, sum2 = 0.0, sum3, x0, x1, x2, x3;
88: const PetscScalar *yy0, *yy1, *yy2, *yy3, *x, *xbase;
89: const Vec *yy = (Vec *)yin;
91: PetscFunctionBegin;
92: if (n == 0) {
93: PetscCall(PetscArrayzero(z, nv));
94: PetscFunctionReturn(PETSC_SUCCESS);
95: }
96: PetscCall(VecGetArrayRead(xin, &xbase));
97: x = xbase;
98: switch (nv_rem) {
99: case 3:
100: PetscCall(VecGetArrayRead(yy[0], &yy0));
101: PetscCall(VecGetArrayRead(yy[1], &yy1));
102: PetscCall(VecGetArrayRead(yy[2], &yy2));
103: switch (j_rem = j & 0x3) {
104: case 3:
105: x2 = x[2];
106: sum0 += x2 * PetscConj(yy0[2]);
107: sum1 += x2 * PetscConj(yy1[2]);
108: sum2 += x2 * PetscConj(yy2[2]); /* fall through */
109: case 2:
110: x1 = x[1];
111: sum0 += x1 * PetscConj(yy0[1]);
112: sum1 += x1 * PetscConj(yy1[1]);
113: sum2 += x1 * PetscConj(yy2[1]); /* fall through */
114: case 1:
115: x0 = x[0];
116: sum0 += x0 * PetscConj(yy0[0]);
117: sum1 += x0 * PetscConj(yy1[0]);
118: sum2 += x0 * PetscConj(yy2[0]); /* fall through */
119: case 0:
120: x += j_rem;
121: yy0 += j_rem;
122: yy1 += j_rem;
123: yy2 += j_rem;
124: j -= j_rem;
125: break;
126: }
127: while (j > 0) {
128: x0 = x[0];
129: x1 = x[1];
130: x2 = x[2];
131: x3 = x[3];
132: x += 4;
134: sum0 += x0 * PetscConj(yy0[0]) + x1 * PetscConj(yy0[1]) + x2 * PetscConj(yy0[2]) + x3 * PetscConj(yy0[3]);
135: yy0 += 4;
136: sum1 += x0 * PetscConj(yy1[0]) + x1 * PetscConj(yy1[1]) + x2 * PetscConj(yy1[2]) + x3 * PetscConj(yy1[3]);
137: yy1 += 4;
138: sum2 += x0 * PetscConj(yy2[0]) + x1 * PetscConj(yy2[1]) + x2 * PetscConj(yy2[2]) + x3 * PetscConj(yy2[3]);
139: yy2 += 4;
140: j -= 4;
141: }
142: z[0] = sum0;
143: z[1] = sum1;
144: z[2] = sum2;
145: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
146: PetscCall(VecRestoreArrayRead(yy[1], &yy1));
147: PetscCall(VecRestoreArrayRead(yy[2], &yy2));
148: break;
149: case 2:
150: PetscCall(VecGetArrayRead(yy[0], &yy0));
151: PetscCall(VecGetArrayRead(yy[1], &yy1));
152: switch (j_rem = j & 0x3) {
153: case 3:
154: x2 = x[2];
155: sum0 += x2 * PetscConj(yy0[2]);
156: sum1 += x2 * PetscConj(yy1[2]); /* fall through */
157: case 2:
158: x1 = x[1];
159: sum0 += x1 * PetscConj(yy0[1]);
160: sum1 += x1 * PetscConj(yy1[1]); /* fall through */
161: case 1:
162: x0 = x[0];
163: sum0 += x0 * PetscConj(yy0[0]);
164: sum1 += x0 * PetscConj(yy1[0]); /* fall through */
165: case 0:
166: x += j_rem;
167: yy0 += j_rem;
168: yy1 += j_rem;
169: j -= j_rem;
170: break;
171: }
172: while (j > 0) {
173: x0 = x[0];
174: x1 = x[1];
175: x2 = x[2];
176: x3 = x[3];
177: x += 4;
179: sum0 += x0 * PetscConj(yy0[0]) + x1 * PetscConj(yy0[1]) + x2 * PetscConj(yy0[2]) + x3 * PetscConj(yy0[3]);
180: yy0 += 4;
181: sum1 += x0 * PetscConj(yy1[0]) + x1 * PetscConj(yy1[1]) + x2 * PetscConj(yy1[2]) + x3 * PetscConj(yy1[3]);
182: yy1 += 4;
183: j -= 4;
184: }
185: z[0] = sum0;
186: z[1] = sum1;
188: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
189: PetscCall(VecRestoreArrayRead(yy[1], &yy1));
190: break;
191: case 1:
192: PetscCall(VecGetArrayRead(yy[0], &yy0));
193: switch (j_rem = j & 0x3) {
194: case 3:
195: x2 = x[2];
196: sum0 += x2 * PetscConj(yy0[2]); /* fall through */
197: case 2:
198: x1 = x[1];
199: sum0 += x1 * PetscConj(yy0[1]); /* fall through */
200: case 1:
201: x0 = x[0];
202: sum0 += x0 * PetscConj(yy0[0]); /* fall through */
203: case 0:
204: x += j_rem;
205: yy0 += j_rem;
206: j -= j_rem;
207: break;
208: }
209: while (j > 0) {
210: sum0 += x[0] * PetscConj(yy0[0]) + x[1] * PetscConj(yy0[1]) + x[2] * PetscConj(yy0[2]) + x[3] * PetscConj(yy0[3]);
211: yy0 += 4;
212: j -= 4;
213: x += 4;
214: }
215: z[0] = sum0;
217: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
218: break;
219: case 0:
220: break;
221: }
222: z += nv_rem;
223: i -= nv_rem;
224: yy += nv_rem;
226: while (i > 0) {
227: sum0 = 0.;
228: sum1 = 0.;
229: sum2 = 0.;
230: sum3 = 0.;
231: PetscCall(VecGetArrayRead(yy[0], &yy0));
232: PetscCall(VecGetArrayRead(yy[1], &yy1));
233: PetscCall(VecGetArrayRead(yy[2], &yy2));
234: PetscCall(VecGetArrayRead(yy[3], &yy3));
236: j = n;
237: x = xbase;
238: switch (j_rem = j & 0x3) {
239: case 3:
240: x2 = x[2];
241: sum0 += x2 * PetscConj(yy0[2]);
242: sum1 += x2 * PetscConj(yy1[2]);
243: sum2 += x2 * PetscConj(yy2[2]);
244: sum3 += x2 * PetscConj(yy3[2]); /* fall through */
245: case 2:
246: x1 = x[1];
247: sum0 += x1 * PetscConj(yy0[1]);
248: sum1 += x1 * PetscConj(yy1[1]);
249: sum2 += x1 * PetscConj(yy2[1]);
250: sum3 += x1 * PetscConj(yy3[1]); /* fall through */
251: case 1:
252: x0 = x[0];
253: sum0 += x0 * PetscConj(yy0[0]);
254: sum1 += x0 * PetscConj(yy1[0]);
255: sum2 += x0 * PetscConj(yy2[0]);
256: sum3 += x0 * PetscConj(yy3[0]); /* fall through */
257: case 0:
258: x += j_rem;
259: yy0 += j_rem;
260: yy1 += j_rem;
261: yy2 += j_rem;
262: yy3 += j_rem;
263: j -= j_rem;
264: break;
265: }
266: while (j > 0) {
267: x0 = x[0];
268: x1 = x[1];
269: x2 = x[2];
270: x3 = x[3];
271: x += 4;
273: sum0 += x0 * PetscConj(yy0[0]) + x1 * PetscConj(yy0[1]) + x2 * PetscConj(yy0[2]) + x3 * PetscConj(yy0[3]);
274: yy0 += 4;
275: sum1 += x0 * PetscConj(yy1[0]) + x1 * PetscConj(yy1[1]) + x2 * PetscConj(yy1[2]) + x3 * PetscConj(yy1[3]);
276: yy1 += 4;
277: sum2 += x0 * PetscConj(yy2[0]) + x1 * PetscConj(yy2[1]) + x2 * PetscConj(yy2[2]) + x3 * PetscConj(yy2[3]);
278: yy2 += 4;
279: sum3 += x0 * PetscConj(yy3[0]) + x1 * PetscConj(yy3[1]) + x2 * PetscConj(yy3[2]) + x3 * PetscConj(yy3[3]);
280: yy3 += 4;
281: j -= 4;
282: }
283: z[0] = sum0;
284: z[1] = sum1;
285: z[2] = sum2;
286: z[3] = sum3;
287: z += 4;
288: i -= 4;
289: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
290: PetscCall(VecRestoreArrayRead(yy[1], &yy1));
291: PetscCall(VecRestoreArrayRead(yy[2], &yy2));
292: PetscCall(VecRestoreArrayRead(yy[3], &yy3));
293: yy += 4;
294: }
295: PetscCall(VecRestoreArrayRead(xin, &xbase));
296: PetscCall(PetscLogFlops(PetscMax(nv * (2.0 * n - 1), 0.0)));
297: PetscFunctionReturn(PETSC_SUCCESS);
298: }
299: #endif
301: /* ----------------------------------------------------------------------------*/
302: PetscErrorCode VecMTDot_Seq(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
303: {
304: const PetscInt n = xin->map->n;
305: PetscInt i = nv, j = n, nv_rem = nv & 0x3, j_rem;
306: PetscScalar sum0 = 0., sum1 = 0., sum2 = 0., sum3, x0, x1, x2, x3;
307: const PetscScalar *yy0, *yy1, *yy2, *yy3, *x, *xbase;
308: const Vec *yy = (Vec *)yin;
310: PetscFunctionBegin;
311: PetscCall(VecGetArrayRead(xin, &xbase));
312: x = xbase;
314: switch (nv_rem) {
315: case 3:
316: PetscCall(VecGetArrayRead(yy[0], &yy0));
317: PetscCall(VecGetArrayRead(yy[1], &yy1));
318: PetscCall(VecGetArrayRead(yy[2], &yy2));
319: switch (j_rem = j & 0x3) {
320: case 3:
321: x2 = x[2];
322: sum0 += x2 * yy0[2];
323: sum1 += x2 * yy1[2];
324: sum2 += x2 * yy2[2]; /* fall through */
325: case 2:
326: x1 = x[1];
327: sum0 += x1 * yy0[1];
328: sum1 += x1 * yy1[1];
329: sum2 += x1 * yy2[1]; /* fall through */
330: case 1:
331: x0 = x[0];
332: sum0 += x0 * yy0[0];
333: sum1 += x0 * yy1[0];
334: sum2 += x0 * yy2[0]; /* fall through */
335: case 0:
336: x += j_rem;
337: yy0 += j_rem;
338: yy1 += j_rem;
339: yy2 += j_rem;
340: j -= j_rem;
341: break;
342: }
343: while (j > 0) {
344: x0 = x[0];
345: x1 = x[1];
346: x2 = x[2];
347: x3 = x[3];
348: x += 4;
350: sum0 += x0 * yy0[0] + x1 * yy0[1] + x2 * yy0[2] + x3 * yy0[3];
351: yy0 += 4;
352: sum1 += x0 * yy1[0] + x1 * yy1[1] + x2 * yy1[2] + x3 * yy1[3];
353: yy1 += 4;
354: sum2 += x0 * yy2[0] + x1 * yy2[1] + x2 * yy2[2] + x3 * yy2[3];
355: yy2 += 4;
356: j -= 4;
357: }
358: z[0] = sum0;
359: z[1] = sum1;
360: z[2] = sum2;
361: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
362: PetscCall(VecRestoreArrayRead(yy[1], &yy1));
363: PetscCall(VecRestoreArrayRead(yy[2], &yy2));
364: break;
365: case 2:
366: PetscCall(VecGetArrayRead(yy[0], &yy0));
367: PetscCall(VecGetArrayRead(yy[1], &yy1));
368: switch (j_rem = j & 0x3) {
369: case 3:
370: x2 = x[2];
371: sum0 += x2 * yy0[2];
372: sum1 += x2 * yy1[2]; /* fall through */
373: case 2:
374: x1 = x[1];
375: sum0 += x1 * yy0[1];
376: sum1 += x1 * yy1[1]; /* fall through */
377: case 1:
378: x0 = x[0];
379: sum0 += x0 * yy0[0];
380: sum1 += x0 * yy1[0]; /* fall through */
381: case 0:
382: x += j_rem;
383: yy0 += j_rem;
384: yy1 += j_rem;
385: j -= j_rem;
386: break;
387: }
388: while (j > 0) {
389: x0 = x[0];
390: x1 = x[1];
391: x2 = x[2];
392: x3 = x[3];
393: x += 4;
395: sum0 += x0 * yy0[0] + x1 * yy0[1] + x2 * yy0[2] + x3 * yy0[3];
396: yy0 += 4;
397: sum1 += x0 * yy1[0] + x1 * yy1[1] + x2 * yy1[2] + x3 * yy1[3];
398: yy1 += 4;
399: j -= 4;
400: }
401: z[0] = sum0;
402: z[1] = sum1;
404: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
405: PetscCall(VecRestoreArrayRead(yy[1], &yy1));
406: break;
407: case 1:
408: PetscCall(VecGetArrayRead(yy[0], &yy0));
409: switch (j_rem = j & 0x3) {
410: case 3:
411: x2 = x[2];
412: sum0 += x2 * yy0[2]; /* fall through */
413: case 2:
414: x1 = x[1];
415: sum0 += x1 * yy0[1]; /* fall through */
416: case 1:
417: x0 = x[0];
418: sum0 += x0 * yy0[0]; /* fall through */
419: case 0:
420: x += j_rem;
421: yy0 += j_rem;
422: j -= j_rem;
423: break;
424: }
425: while (j > 0) {
426: sum0 += x[0] * yy0[0] + x[1] * yy0[1] + x[2] * yy0[2] + x[3] * yy0[3];
427: yy0 += 4;
428: j -= 4;
429: x += 4;
430: }
431: z[0] = sum0;
433: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
434: break;
435: case 0:
436: break;
437: }
438: z += nv_rem;
439: i -= nv_rem;
440: yy += nv_rem;
442: while (i > 0) {
443: sum0 = 0.;
444: sum1 = 0.;
445: sum2 = 0.;
446: sum3 = 0.;
447: PetscCall(VecGetArrayRead(yy[0], &yy0));
448: PetscCall(VecGetArrayRead(yy[1], &yy1));
449: PetscCall(VecGetArrayRead(yy[2], &yy2));
450: PetscCall(VecGetArrayRead(yy[3], &yy3));
451: x = xbase;
453: j = n;
454: switch (j_rem = j & 0x3) {
455: case 3:
456: x2 = x[2];
457: sum0 += x2 * yy0[2];
458: sum1 += x2 * yy1[2];
459: sum2 += x2 * yy2[2];
460: sum3 += x2 * yy3[2]; /* fall through */
461: case 2:
462: x1 = x[1];
463: sum0 += x1 * yy0[1];
464: sum1 += x1 * yy1[1];
465: sum2 += x1 * yy2[1];
466: sum3 += x1 * yy3[1]; /* fall through */
467: case 1:
468: x0 = x[0];
469: sum0 += x0 * yy0[0];
470: sum1 += x0 * yy1[0];
471: sum2 += x0 * yy2[0];
472: sum3 += x0 * yy3[0]; /* fall through */
473: case 0:
474: x += j_rem;
475: yy0 += j_rem;
476: yy1 += j_rem;
477: yy2 += j_rem;
478: yy3 += j_rem;
479: j -= j_rem;
480: break;
481: }
482: while (j > 0) {
483: x0 = x[0];
484: x1 = x[1];
485: x2 = x[2];
486: x3 = x[3];
487: x += 4;
489: sum0 += x0 * yy0[0] + x1 * yy0[1] + x2 * yy0[2] + x3 * yy0[3];
490: yy0 += 4;
491: sum1 += x0 * yy1[0] + x1 * yy1[1] + x2 * yy1[2] + x3 * yy1[3];
492: yy1 += 4;
493: sum2 += x0 * yy2[0] + x1 * yy2[1] + x2 * yy2[2] + x3 * yy2[3];
494: yy2 += 4;
495: sum3 += x0 * yy3[0] + x1 * yy3[1] + x2 * yy3[2] + x3 * yy3[3];
496: yy3 += 4;
497: j -= 4;
498: }
499: z[0] = sum0;
500: z[1] = sum1;
501: z[2] = sum2;
502: z[3] = sum3;
503: z += 4;
504: i -= 4;
505: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
506: PetscCall(VecRestoreArrayRead(yy[1], &yy1));
507: PetscCall(VecRestoreArrayRead(yy[2], &yy2));
508: PetscCall(VecRestoreArrayRead(yy[3], &yy3));
509: yy += 4;
510: }
511: PetscCall(VecRestoreArrayRead(xin, &xbase));
512: PetscCall(PetscLogFlops(PetscMax(nv * (2.0 * n - 1), 0.0)));
513: PetscFunctionReturn(PETSC_SUCCESS);
514: }
516: static PetscErrorCode VecMultiDot_Seq_GEMV(PetscBool conjugate, Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
517: {
518: PetscInt i, j, nfail;
519: const PetscScalar *xarray, *yarray, *yfirst, *ynext;
520: PetscBool stop = PETSC_FALSE;
521: const char *trans = conjugate ? "C" : "T";
522: PetscInt64 lda = 0;
523: PetscBLASInt n, m;
525: PetscFunctionBegin;
526: if (xin->map->n == 0) { // otherwise BLAS complains illegal values (0) for lda
527: PetscCall(PetscArrayzero(z, nv));
528: PetscFunctionReturn(PETSC_SUCCESS);
529: }
531: // caller guarantees xin->map->n <= PETSC_BLAS_INT_MAX
532: PetscCall(PetscBLASIntCast(xin->map->n, &n));
533: PetscCall(VecGetArrayRead(xin, &xarray));
534: i = nfail = 0;
535: while (i < nv) {
536: stop = PETSC_FALSE;
537: PetscCall(VecGetArrayRead(yin[i], &yfirst));
538: yarray = yfirst;
539: for (j = i + 1; j < nv; j++) {
540: PetscCall(VecGetArrayRead(yin[j], &ynext));
541: if (j == i + 1) {
542: lda = ynext - yarray;
543: if (lda < 0 || lda > PETSC_BLAS_INT_MAX || lda - n > 64) stop = PETSC_TRUE;
544: } else if (lda * (j - i) != ynext - yarray) { // not in the same stride? if so, stop here
545: stop = PETSC_TRUE;
546: }
547: PetscCall(VecRestoreArrayRead(yin[j], &ynext));
548: if (stop) break;
549: }
550: PetscCall(VecRestoreArrayRead(yin[i], &yfirst));
552: // we found m vectors yin[i..j)
553: m = j - i;
554: if (m > 1) {
555: PetscBLASInt ione = 1, lda2 = (PetscBLASInt)lda; // the cast is safe since we've screened out those lda > PETSC_BLAS_INT_MAX above
556: PetscScalar one = 1, zero = 0;
558: PetscCallBLAS("BLASgemv", BLASgemv_(trans, &n, &m, &one, yarray, &lda2, xarray, &ione, &zero, z + i, &ione));
559: PetscCall(PetscLogFlops(PetscMax(m * (2.0 * n - 1), 0.0)));
560: } else {
561: if (nfail == 0) {
562: if (conjugate) PetscCall(VecDot_Seq(xin, yin[i], z + i));
563: else PetscCall(VecTDot_Seq(xin, yin[i], z + i));
564: nfail++;
565: } else break;
566: }
567: i = j;
568: }
569: PetscCall(VecRestoreArrayRead(xin, &xarray));
570: if (i < nv) { // finish the remaining if any
571: if (conjugate) PetscCall(VecMDot_Seq(xin, nv - i, yin + i, z + i));
572: else PetscCall(VecMTDot_Seq(xin, nv - i, yin + i, z + i));
573: }
574: PetscFunctionReturn(PETSC_SUCCESS);
575: }
577: PetscErrorCode VecMDot_Seq_GEMV(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
578: {
579: PetscFunctionBegin;
580: if (xin->map->n > PETSC_BLAS_INT_MAX) {
581: PetscCall(VecMDot_Seq(xin, nv, yin, z));
582: } else {
583: PetscCall(VecMultiDot_Seq_GEMV(PETSC_TRUE, xin, nv, yin, z));
584: }
585: PetscFunctionReturn(PETSC_SUCCESS);
586: }
588: PetscErrorCode VecMTDot_Seq_GEMV(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
589: {
590: PetscFunctionBegin;
591: if (xin->map->n > PETSC_BLAS_INT_MAX) {
592: PetscCall(VecMTDot_Seq(xin, nv, yin, z));
593: } else {
594: PetscCall(VecMultiDot_Seq_GEMV(PETSC_FALSE, xin, nv, yin, z));
595: }
596: PetscFunctionReturn(PETSC_SUCCESS);
597: }
599: static PetscErrorCode VecMinMax_Seq(Vec xin, PetscInt *idx, PetscReal *z, PetscReal minmax, int (*const cmp)(PetscReal, PetscReal))
600: {
601: const PetscInt n = xin->map->n;
602: PetscInt j = -1;
604: PetscFunctionBegin;
605: if (n) {
606: const PetscScalar *xx;
608: PetscCall(VecGetArrayRead(xin, &xx));
609: minmax = PetscRealPart(xx[(j = 0)]);
610: for (PetscInt i = 1; i < n; ++i) {
611: const PetscReal tmp = PetscRealPart(xx[i]);
613: if (cmp(tmp, minmax)) {
614: j = i;
615: minmax = tmp;
616: }
617: }
618: PetscCall(VecRestoreArrayRead(xin, &xx));
619: }
620: *z = minmax;
621: if (idx) *idx = j;
622: PetscFunctionReturn(PETSC_SUCCESS);
623: }
625: static int VecMax_Seq_GT(PetscReal l, PetscReal r)
626: {
627: return l > r;
628: }
630: PetscErrorCode VecMax_Seq(Vec xin, PetscInt *idx, PetscReal *z)
631: {
632: PetscFunctionBegin;
633: PetscCall(VecMinMax_Seq(xin, idx, z, PETSC_MIN_REAL, VecMax_Seq_GT));
634: PetscFunctionReturn(PETSC_SUCCESS);
635: }
637: static int VecMin_Seq_LT(PetscReal l, PetscReal r)
638: {
639: return l < r;
640: }
642: PetscErrorCode VecMin_Seq(Vec xin, PetscInt *idx, PetscReal *z)
643: {
644: PetscFunctionBegin;
645: PetscCall(VecMinMax_Seq(xin, idx, z, PETSC_MAX_REAL, VecMin_Seq_LT));
646: PetscFunctionReturn(PETSC_SUCCESS);
647: }
649: PetscErrorCode VecSet_Seq(Vec xin, PetscScalar alpha)
650: {
651: const PetscInt n = xin->map->n;
652: PetscScalar *xx;
654: PetscFunctionBegin;
655: PetscCall(VecGetArrayWrite(xin, &xx));
656: if (alpha == (PetscScalar)0.0) {
657: PetscCall(PetscArrayzero(xx, n));
658: } else {
659: for (PetscInt i = 0; i < n; i++) xx[i] = alpha;
660: }
661: PetscCall(VecRestoreArrayWrite(xin, &xx));
662: PetscFunctionReturn(PETSC_SUCCESS);
663: }
665: PetscErrorCode VecMAXPY_Seq(Vec xin, PetscInt nv, const PetscScalar *alpha, Vec *y)
666: {
667: const PetscInt j_rem = nv & 0x3, n = xin->map->n;
668: const PetscScalar *yptr[4];
669: PetscScalar *xx;
670: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
671: #pragma disjoint(*xx, **yptr, *aptr)
672: #endif
674: PetscFunctionBegin;
675: PetscCall(PetscLogFlops(nv * 2.0 * n));
676: PetscCall(VecGetArray(xin, &xx));
677: for (PetscInt i = 0; i < j_rem; ++i) PetscCall(VecGetArrayRead(y[i], yptr + i));
678: switch (j_rem) {
679: case 3:
680: PetscKernelAXPY3(xx, alpha[0], alpha[1], alpha[2], yptr[0], yptr[1], yptr[2], n);
681: break;
682: case 2:
683: PetscKernelAXPY2(xx, alpha[0], alpha[1], yptr[0], yptr[1], n);
684: break;
685: case 1:
686: PetscKernelAXPY(xx, alpha[0], yptr[0], n);
687: default:
688: break;
689: }
690: for (PetscInt i = 0; i < j_rem; ++i) PetscCall(VecRestoreArrayRead(y[i], yptr + i));
691: alpha += j_rem;
692: y += j_rem;
693: for (PetscInt j = j_rem, inc = 4; j < nv; j += inc, alpha += inc, y += inc) {
694: for (PetscInt i = 0; i < inc; ++i) PetscCall(VecGetArrayRead(y[i], yptr + i));
695: PetscKernelAXPY4(xx, alpha[0], alpha[1], alpha[2], alpha[3], yptr[0], yptr[1], yptr[2], yptr[3], n);
696: for (PetscInt i = 0; i < inc; ++i) PetscCall(VecRestoreArrayRead(y[i], yptr + i));
697: }
698: PetscCall(VecRestoreArray(xin, &xx));
699: PetscFunctionReturn(PETSC_SUCCESS);
700: }
702: /* y = y + sum alpha[i] x[i] */
703: PetscErrorCode VecMAXPY_Seq_GEMV(Vec yin, PetscInt nv, const PetscScalar alpha[], Vec xin[])
704: {
705: PetscInt i, j, nfail;
706: PetscScalar *yarray;
707: const PetscScalar *xfirst, *xnext, *xarray;
708: PetscBool stop = PETSC_FALSE;
709: PetscInt64 lda = 0;
710: PetscBLASInt n, m;
712: PetscFunctionBegin;
713: if (yin->map->n == 0 || yin->map->n > PETSC_BLAS_INT_MAX) {
714: PetscCall(VecMAXPY_Seq(yin, nv, alpha, xin));
715: PetscFunctionReturn(PETSC_SUCCESS);
716: }
718: PetscCall(PetscBLASIntCast(yin->map->n, &n));
719: PetscCall(VecGetArray(yin, &yarray));
720: i = nfail = 0;
721: while (i < nv) {
722: stop = PETSC_FALSE;
723: PetscCall(VecGetArrayRead(xin[i], &xfirst));
724: xarray = xfirst;
725: for (j = i + 1; j < nv; j++) {
726: PetscCall(VecGetArrayRead(xin[j], &xnext));
727: if (j == i + 1) {
728: lda = xnext - xarray;
729: if (lda < 0 || lda > PETSC_BLAS_INT_MAX || lda - n > 64) stop = PETSC_TRUE;
730: } else if (lda * (j - i) != xnext - xarray) { // not in the same stride? if so, stop here
731: stop = PETSC_TRUE;
732: }
733: PetscCall(VecRestoreArrayRead(xin[j], &xnext));
734: if (stop) break;
735: }
736: PetscCall(VecRestoreArrayRead(xin[i], &xfirst));
738: m = j - i;
739: if (m > 1) {
740: PetscBLASInt incx = 1, incy = 1, lda2 = (PetscBLASInt)lda; // the cast is safe since we've screened out those lda > PETSC_BLAS_INT_MAX above
741: PetscScalar one = 1;
742: PetscCallBLAS("BLASgemv", BLASgemv_("N", &n, &m, &one, xarray, &lda2, alpha + i, &incx, &one, yarray, &incy));
743: PetscCall(PetscLogFlops(m * 2.0 * n));
744: } else {
745: // we only allow falling back on VecAXPY once
746: if (nfail++ == 0) PetscCall(VecAXPY_Seq(yin, alpha[i], xin[i]));
747: else break; // break the while loop
748: }
749: i = j;
750: }
751: PetscCall(VecRestoreArray(yin, &yarray));
753: // finish the remaining if any
754: if (i < nv) PetscCall(VecMAXPY_Seq(yin, nv - i, alpha + i, xin + i));
755: PetscFunctionReturn(PETSC_SUCCESS);
756: }
758: #include <../src/vec/vec/impls/seq/ftn-kernels/faypx.h>
760: PetscErrorCode VecAYPX_Seq(Vec yin, PetscScalar alpha, Vec xin)
761: {
762: PetscFunctionBegin;
763: if (alpha == (PetscScalar)0.0) {
764: PetscCall(VecCopy(xin, yin));
765: } else if (alpha == (PetscScalar)1.0) {
766: PetscCall(VecAXPY_Seq(yin, alpha, xin));
767: } else {
768: const PetscInt n = yin->map->n;
769: const PetscScalar *xx;
770: PetscScalar *yy;
772: PetscCall(VecGetArrayRead(xin, &xx));
773: PetscCall(VecGetArray(yin, &yy));
774: if (alpha == (PetscScalar)-1.0) {
775: for (PetscInt i = 0; i < n; ++i) yy[i] = xx[i] - yy[i];
776: PetscCall(PetscLogFlops(n));
777: } else {
778: #if defined(PETSC_USE_FORTRAN_KERNEL_AYPX)
779: fortranaypx_(&n, &alpha, xx, yy);
780: #else
781: for (PetscInt i = 0; i < n; ++i) yy[i] = xx[i] + alpha * yy[i];
782: #endif
783: PetscCall(PetscLogFlops(2 * n));
784: }
785: PetscCall(VecRestoreArrayRead(xin, &xx));
786: PetscCall(VecRestoreArray(yin, &yy));
787: }
788: PetscFunctionReturn(PETSC_SUCCESS);
789: }
791: #include <../src/vec/vec/impls/seq/ftn-kernels/fwaxpy.h>
792: /*
793: IBM ESSL contains a routine dzaxpy() that is our WAXPY() but it appears
794: to be slower than a regular C loop. Hence,we do not include it.
795: void ?zaxpy(int*,PetscScalar*,PetscScalar*,int*,PetscScalar*,int*,PetscScalar*,int*);
796: */
798: PetscErrorCode VecWAXPY_Seq(Vec win, PetscScalar alpha, Vec xin, Vec yin)
799: {
800: const PetscInt n = win->map->n;
801: const PetscScalar *yy, *xx;
802: PetscScalar *ww;
804: PetscFunctionBegin;
805: PetscCall(VecGetArrayRead(xin, &xx));
806: PetscCall(VecGetArrayRead(yin, &yy));
807: PetscCall(VecGetArray(win, &ww));
808: if (alpha == (PetscScalar)1.0) {
809: PetscCall(PetscLogFlops(n));
810: /* could call BLAS axpy after call to memcopy, but may be slower */
811: for (PetscInt i = 0; i < n; i++) ww[i] = yy[i] + xx[i];
812: } else if (alpha == (PetscScalar)-1.0) {
813: PetscCall(PetscLogFlops(n));
814: for (PetscInt i = 0; i < n; i++) ww[i] = yy[i] - xx[i];
815: } else if (alpha == (PetscScalar)0.0) {
816: PetscCall(PetscArraycpy(ww, yy, n));
817: } else {
818: PetscCall(PetscLogFlops(2.0 * n));
819: #if defined(PETSC_USE_FORTRAN_KERNEL_WAXPY)
820: fortranwaxpy_(&n, &alpha, xx, yy, ww);
821: #else
822: for (PetscInt i = 0; i < n; i++) ww[i] = yy[i] + alpha * xx[i];
823: #endif
824: }
825: PetscCall(VecRestoreArrayRead(xin, &xx));
826: PetscCall(VecRestoreArrayRead(yin, &yy));
827: PetscCall(VecRestoreArray(win, &ww));
828: PetscFunctionReturn(PETSC_SUCCESS);
829: }
831: PetscErrorCode VecMaxPointwiseDivide_Seq(Vec xin, Vec yin, PetscReal *max)
832: {
833: const PetscInt n = xin->map->n;
834: const PetscScalar *xx, *yy;
835: PetscReal m = 0.0;
837: PetscFunctionBegin;
838: PetscCall(VecGetArrayRead(xin, &xx));
839: PetscCall(VecGetArrayRead(yin, &yy));
840: for (PetscInt i = 0; i < n; ++i) {
841: const PetscReal v = PetscAbsScalar(yy[i] == (PetscScalar)0.0 ? xx[i] : xx[i] / yy[i]);
843: // use a separate value to not re-evaluate side-effects
844: m = PetscMax(v, m);
845: }
846: PetscCall(VecRestoreArrayRead(xin, &xx));
847: PetscCall(VecRestoreArrayRead(yin, &yy));
848: PetscCall(PetscLogFlops(n));
849: *max = m;
850: PetscFunctionReturn(PETSC_SUCCESS);
851: }
853: PetscErrorCode VecPlaceArray_Seq(Vec vin, const PetscScalar *a)
854: {
855: Vec_Seq *v = (Vec_Seq *)vin->data;
857: PetscFunctionBegin;
858: PetscCheck(!v->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "VecPlaceArray() was already called on this vector, without a call to VecResetArray()");
859: v->unplacedarray = v->array; /* save previous array so reset can bring it back */
860: v->array = (PetscScalar *)a;
861: PetscFunctionReturn(PETSC_SUCCESS);
862: }
864: PetscErrorCode VecReplaceArray_Seq(Vec vin, const PetscScalar *a)
865: {
866: Vec_Seq *v = (Vec_Seq *)vin->data;
868: PetscFunctionBegin;
869: PetscCall(PetscFree(v->array_allocated));
870: v->array_allocated = v->array = (PetscScalar *)a;
871: PetscFunctionReturn(PETSC_SUCCESS);
872: }