Actual source code: mpiaijAssemble.cu
petsc-3.7.3 2016-08-01
#define PETSC_SKIP_COMPLEX
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusp/cuspmatimpl.h>

#include <thrust/reduce.h>
#include <thrust/inner_product.h>

#include <cusp/array1d.h>
#include <cusp/print.h>
#include <cusp/coo_matrix.h>

#include <cusp/io/matrix_market.h>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>
#include <thrust/partition.h>
#include <thrust/remove.h>
// this example illustrates how to make repeated access to a range of values
// examples:
//   repeated_range([0, 1, 2, 3], 1) -> [0, 1, 2, 3]
//   repeated_range([0, 1, 2, 3], 2) -> [0, 0, 1, 1, 2, 2, 3, 3]
//   repeated_range([0, 1, 2, 3], 3) -> [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
//   ...

template <typename Iterator>
class repeated_range
{
public:
  typedef typename thrust::iterator_difference<Iterator>::type difference_type;

  struct repeat_functor : public thrust::unary_function<difference_type,difference_type>
  {
    difference_type repeats;

    repeat_functor(difference_type repeats) : repeats(repeats) {}

    __host__ __device__
    difference_type operator()(const difference_type &i) const
    {
      return i / repeats;
    }
  };

  typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
  typedef typename thrust::transform_iterator<repeat_functor, CountingIterator> TransformIterator;
  typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

  // type of the repeated_range iterator
  typedef PermutationIterator iterator;

  // construct repeated_range for the range [first,last)
  repeated_range(Iterator first, Iterator last, difference_type repeats)
    : first(first), last(last), repeats(repeats) {}

  iterator begin(void) const
  {
    return PermutationIterator(first, TransformIterator(CountingIterator(0), repeat_functor(repeats)));
  }

  iterator end(void) const
  {
    return begin() + repeats * (last - first);
  }

protected:
  difference_type repeats;
  Iterator        first;
  Iterator        last;
};
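
// Usage sketch for repeated_range (a hypothetical helper function, not called
// by the assembly code below): each element of [0, 1, 2, 3] is visited twice.
static void repeated_range_demo(void)
{
  typedef cusp::array1d<int, cusp::device_memory> IntArray;
  IntArray data(thrust::counting_iterator<int>(0), thrust::counting_iterator<int>(4)); // data = [0, 1, 2, 3]
  repeated_range<IntArray::iterator> twice(data.begin(), data.end(), 2);
  IntArray expanded(twice.begin(), twice.end()); // expanded = [0, 0, 1, 1, 2, 2, 3, 3]
  cusp::print(expanded);
}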
// this example illustrates how to repeat blocks in a range multiple times
// examples (arguments are repeats, then tile_size; tile_size defaults to the range length):
//   tiled_range([0, 1, 2, 3], 2)    -> [0, 1, 2, 3, 0, 1, 2, 3]
//   tiled_range([0, 1, 2, 3], 2, 4) -> [0, 1, 2, 3, 0, 1, 2, 3]
//   tiled_range([0, 1, 2, 3], 2, 2) -> [0, 1, 0, 1, 2, 3, 2, 3]
//   tiled_range([0, 1, 2, 3], 3, 2) -> [0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3]
//   ...
template <typename Iterator>
class tiled_range
{
public:
  typedef typename thrust::iterator_difference<Iterator>::type difference_type;

  struct tile_functor : public thrust::unary_function<difference_type,difference_type>
  {
    difference_type repeats;
    difference_type tile_size;

    tile_functor(difference_type repeats, difference_type tile_size)
      : tile_size(tile_size), repeats(repeats) {}

    __host__ __device__
    difference_type operator()(const difference_type &i) const
    {
      return tile_size * (i / (tile_size * repeats)) + i % tile_size;
    }
  };

  typedef typename thrust::counting_iterator<difference_type>                 CountingIterator;
  typedef typename thrust::transform_iterator<tile_functor, CountingIterator> TransformIterator;
  typedef typename thrust::permutation_iterator<Iterator,TransformIterator>   PermutationIterator;

  // type of the tiled_range iterator
  typedef PermutationIterator iterator;
  // construct tiled_range for the range [first,last)
  tiled_range(Iterator first, Iterator last, difference_type repeats)
    : first(first), last(last), repeats(repeats), tile_size(last - first) {}

  tiled_range(Iterator first, Iterator last, difference_type repeats, difference_type tile_size)
    : first(first), last(last), repeats(repeats), tile_size(tile_size)
  {
    // ASSERT((last - first) % tile_size == 0)
  }

  iterator begin(void) const
  {
    return PermutationIterator(first, TransformIterator(CountingIterator(0), tile_functor(repeats, tile_size)));
  }

  iterator end(void) const
  {
    return begin() + repeats * (last - first);
  }

protected:
  difference_type repeats;
  difference_type tile_size;
  Iterator        first;
  Iterator        last;
};
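
// Usage sketch for tiled_range (a hypothetical helper function, not called by
// the assembly code below): repeat each tile of size 2 from [0, 1, 2, 3] twice.
static void tiled_range_demo(void)
{
  typedef cusp::array1d<int, cusp::device_memory> IntArray;
  IntArray data(thrust::counting_iterator<int>(0), thrust::counting_iterator<int>(4)); // data = [0, 1, 2, 3]
  tiled_range<IntArray::iterator> tiles(data.begin(), data.end(), 2 /* repeats */, 2 /* tile_size */);
  IntArray tiled(tiles.begin(), tiles.end()); // tiled = [0, 1, 0, 1, 2, 3, 2, 3]
  cusp::print(tiled);
}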
typedef cusp::device_memory memSpace;
typedef int                 IndexType;
typedef PetscScalar         ValueType;
typedef cusp::array1d<IndexType, memSpace>          IndexArray;
typedef cusp::array1d<ValueType, memSpace>          ValueArray;
typedef cusp::array1d<IndexType, cusp::host_memory> IndexHostArray;
typedef IndexArray::iterator IndexArrayIterator;
typedef ValueArray::iterator ValueArrayIterator;
struct is_diag
{
  IndexType first, last;

  is_diag(IndexType first, IndexType last) : first(first), last(last) {}

  template <typename Tuple>
  __host__ __device__
  bool operator()(Tuple t)
  {
    // Check that both the row and the column lie in the local diagonal block
    IndexType row = thrust::get<0>(t);
    IndexType col = thrust::get<1>(t);
    return (row >= first) && (row < last) && (col >= first) && (col < last);
  }
};
struct is_nonlocal
{
  IndexType first, last;

  is_nonlocal(IndexType first, IndexType last) : first(first), last(last) {}

  template <typename Tuple>
  __host__ __device__
  bool operator()(Tuple t)
  {
    // Check that the row is owned by another process
    IndexType row = thrust::get<0>(t);
    return (row < first) || (row >= last);
  }
};
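
// Usage sketch for the predicates (hypothetical, not called below): with the
// local ownership range [0, 2), partition_copy routes (0,0) and (1,1) to the
// diagonal output and (0,3), (2,0) to the nondiagonal output, exactly as the
// assembly routine does with its (rowInd, colInd, elemMats) stream.
static void split_demo(void)
{
  IndexArray rows(4), cols(4);
  ValueArray vals(4, 1.0);
  rows[0] = 0; cols[0] = 0; // diagonal
  rows[1] = 0; cols[1] = 3; // off-diagonal (local row, nonlocal column)
  rows[2] = 1; cols[2] = 1; // diagonal
  rows[3] = 2; cols[3] = 0; // nonlocal (row owned by another process)
  IndexArray diagRows(4, -1), diagCols(4, -1), restRows(4, -1), restCols(4, -1);
  ValueArray diagVals(4), restVals(4);
  thrust::partition_copy(thrust::make_zip_iterator(thrust::make_tuple(rows.begin(), cols.begin(), vals.begin())),
                         thrust::make_zip_iterator(thrust::make_tuple(rows.end(),   cols.end(),   vals.end())),
                         thrust::make_zip_iterator(thrust::make_tuple(diagRows.begin(), diagCols.begin(), diagVals.begin())),
                         thrust::make_zip_iterator(thrust::make_tuple(restRows.begin(), restCols.begin(), restVals.begin())),
                         is_diag(0, 2));
}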
/*@C
  MatSetValuesBatch_MPIAIJCUSP - Set multiple blocks of values into a matrix

  Collective on Mat

  Input Parameters:
+ J        - the assembled Mat object
. Ne       - the number of blocks (elements)
. Nl       - the block size (number of dof per element)
. elemRows - List of block row indices, in bunches of length Nl
- elemMats - List of block values, in bunches of Nl*Nl

  Level: advanced

.seealso: MatSetValues()
@*/
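
// Caller-side sketch (hypothetical values, error checking omitted): two 2x2
// element matrices sharing row 1 are summed into J through the generic
// MatSetValuesBatch() entry point, which dispatches to the routine below for
// MATMPIAIJCUSP matrices.
static PetscErrorCode batchAssemblySketch(Mat J)
{
  PetscInt    elemRows[4] = {0, 1, 1, 2};            /* element 0 touches rows {0,1}, element 1 rows {1,2} */
  PetscScalar elemMats[8] = { 1.0, -1.0, -1.0, 1.0,  /* element 0, row-major 2x2 block */
                              1.0, -1.0, -1.0, 1.0}; /* element 1, row-major 2x2 block */
  return MatSetValuesBatch(J, 2, 2, elemRows, elemMats); /* Ne = 2 elements, Nl = 2 dof each */
}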
PetscErrorCode MatSetValuesBatch_MPIAIJCUSP(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats)
{
  // Assumptions:
  //   1) Each elemMat is square, of size Nl x Nl
  //
  // This means that any nonlocal entry (i,j) where i is owned by another process is matched to
  //   a) an offdiagonal entry (j,i) if j is diagonal, or
  //   b) another nonlocal entry (j,i) if j is offdiagonal
  //
  // No - numSendEntries: the number of on-process diagonal+offdiagonal entries
  // numRecvEntries:      the number of off-process diagonal+offdiagonal entries
  //
  // Glossary:
  //   diagonal:    (i,j) such that i,j in [firstRow, lastRow)
  //   offdiagonal: (i,j) such that i in [firstRow, lastRow), and j not in [firstRow, lastRow)
  //   nonlocal:    (i,j) such that i not in [firstRow, lastRow)
  //   nondiagonal: (i,j) such that i not in [firstRow, lastRow), or j not in [firstRow, lastRow)
  //   on-process:  entries provided by elemMats
  //   off-process: entries received from other processes
  MPI_Comm       comm;
  Mat_MPIAIJ     *j = (Mat_MPIAIJ*) J->data;
  size_t         N  = Ne * Nl;     // Length of elemRows (dimension of unassembled space)
  size_t         No = Ne * Nl*Nl;  // Length of elemMats (total number of values)
  PetscInt       Nr;               // Size of J (dimension of assembled space)
  PetscInt       firstRow, lastRow, firstCol;
  const PetscInt *rowRanges;
  PetscInt       numNonlocalRows;  // Number of rows in elemRows not owned by this process
  PetscInt       numSendEntries;   // Number of (i,j,v) entries sent to other processes
  PetscInt       numRecvEntries;   // Number of (i,j,v) entries received from other processes
  PetscInt       Nc;
  PetscMPIInt    numProcs, rank;

  // copy elemRows and elemMats to device
  IndexArray d_elemRows(elemRows, elemRows + N);
  ValueArray d_elemMats(elemMats, elemMats + No);
  PetscObjectGetComm((PetscObject)J,&comm);
  MPI_Comm_size(comm, &numProcs);
  MPI_Comm_rank(comm, &rank);
  // get matrix information
  MatGetLocalSize(J, &Nr, NULL);
  MatGetOwnershipRange(J, &firstRow, &lastRow);
  MatGetOwnershipRanges(J, &rowRanges);
  MatGetOwnershipRangeColumn(J, &firstCol, NULL);
  PetscInfo3(J, "Assembling matrix of size %d (rows %d -- %d)\n", Nr, firstRow, lastRow);

  // repeat each elemRows entry Nl times to generate the row indices
  PetscInfo(J, "Making row indices\n");
  repeated_range<IndexArrayIterator> rowInd(d_elemRows.begin(), d_elemRows.end(), Nl);

  // tile each length-Nl block of elemRows Nl times to generate the column indices
  PetscInfo(J, "Making column indices\n");
  tiled_range<IndexArrayIterator> colInd(d_elemRows.begin(), d_elemRows.end(), Nl, Nl);
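
  // Worked example: for Nl = 2 and elemRows = [r0, r1, s0, s1],
  //   rowInd repeats each entry Nl times:         [r0, r0, r1, r1, s0, s0, s1, s1]
  //   colInd tiles each length-Nl block Nl times: [r0, r1, r0, r1, s0, s1, s0, s1]
  // so (rowInd[k], colInd[k]) pairs traverse each Nl x Nl elemMat in row-major
  // order, matching the layout of elemMats.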
  // Find number of nonlocal rows, convert nonlocal rows to procs, and send sizes of off-proc entries (could send diag and offdiag sizes)
  // TODO: Ask Nathan how to do this on GPU
  PetscLogEventBegin(MAT_SetValuesBatchI,0,0,0,0);
  PetscMPIInt *procSendSizes, *procRecvSizes;

  PetscCalloc2(numProcs, &procSendSizes, numProcs, &procRecvSizes);

  numNonlocalRows = 0;
  for (size_t i = 0; i < N; ++i) {
    const PetscInt row = elemRows[i];

    if ((row < firstRow) || (row >= lastRow)) {
      numNonlocalRows++;
      for (IndexType p = 0; p < numProcs; ++p) {
        if ((row >= rowRanges[p]) && (row < rowRanges[p+1])) {
          procSendSizes[p] += Nl;
          break;
        }
      }
    }
  }
  numSendEntries = numNonlocalRows*Nl;

  PetscInfo2(J, "Nonlocal rows %d total entries %d\n", numNonlocalRows, No);
  MPI_Alltoall(procSendSizes, 1, MPIU_INT, procRecvSizes, 1, MPIU_INT, comm);

  numRecvEntries = 0;
  for (PetscInt p = 0; p < numProcs; ++p) numRecvEntries += procRecvSizes[p];
  PetscInfo2(j->A, "Send entries %d Recv Entries %d\n", numSendEntries, numRecvEntries);
  PetscLogEventEnd(MAT_SetValuesBatchI,0,0,0,0);
  // Allocate storage for "fat" COO representation of matrix
  PetscLogEventBegin(MAT_SetValuesBatchII,0,0,0,0);
  PetscInfo2(j->A, "Making COO matrices, diag entries %d, nondiag entries %d\n", No-numSendEntries+numRecvEntries, numSendEntries*2);
  cusp::coo_matrix<IndexType,ValueType, memSpace> diagCOO(Nr, Nr, No-numSendEntries+numRecvEntries); // ALLOC: This is oversized because I also count offdiagonal entries
  IndexArray nondiagonalRows(numSendEntries+numSendEntries); // ALLOC: This is oversized because numSendEntries > on-process offdiagonal entries
  IndexArray nondiagonalCols(numSendEntries+numSendEntries); // ALLOC: This is oversized because numSendEntries > on-process offdiagonal entries
  ValueArray nondiagonalVals(numSendEntries+numSendEntries); // ALLOC: This is oversized because numSendEntries > on-process offdiagonal entries
  IndexArray nonlocalRows(numSendEntries);
  IndexArray nonlocalCols(numSendEntries);
  ValueArray nonlocalVals(numSendEntries);
  // partition on-process entries into diagonal and off-diagonal+nonlocal
  PetscInfo(J, "Splitting on-process entries into diagonal and off-diagonal+nonlocal\n");
  thrust::fill(diagCOO.row_indices.begin(), diagCOO.row_indices.end(), -1);
  thrust::fill(nondiagonalRows.begin(), nondiagonalRows.end(), -1);
  thrust::partition_copy(thrust::make_zip_iterator(thrust::make_tuple(rowInd.begin(), colInd.begin(), d_elemMats.begin())),
                         thrust::make_zip_iterator(thrust::make_tuple(rowInd.end(), colInd.end(), d_elemMats.end())),
                         thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin(), diagCOO.column_indices.begin(), diagCOO.values.begin())),
                         thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.begin(), nondiagonalCols.begin(), nondiagonalVals.begin())),
                         is_diag(firstRow, lastRow));
  // Current size without off-proc entries
  PetscInt diagonalSize    = (diagCOO.row_indices.end() - diagCOO.row_indices.begin()) - thrust::count(diagCOO.row_indices.begin(), diagCOO.row_indices.end(), -1);
  PetscInt nondiagonalSize = No - diagonalSize;
  PetscInfo2(j->A, "Diagonal size %d Nondiagonal size %d\n", diagonalSize, nondiagonalSize);
  ///cusp::print(diagCOO);
  ///cusp::print(nondiagonalRows);
  // partition on-process entries again into off-diagonal and nonlocal
  PetscInfo(J, "Splitting on-process entries into off-diagonal and nonlocal\n");
  thrust::stable_partition(thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.begin(), nondiagonalCols.begin(), nondiagonalVals.begin())),
                           thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.end(), nondiagonalCols.end(), nondiagonalVals.end())),
                           is_nonlocal(firstRow, lastRow));
  PetscInt nonlocalSize    = numSendEntries;
  PetscInt offdiagonalSize = nondiagonalSize - nonlocalSize;
  PetscInfo2(j->A, "Nonlocal size %d Offdiagonal size %d\n", nonlocalSize, offdiagonalSize);
  PetscLogEventEnd(MAT_SetValuesBatchII,0,0,0,0);
  ///cusp::print(nondiagonalRows);
  // send off-proc entries (pack this up later)
  PetscLogEventBegin(MAT_SetValuesBatchIII,0,0,0,0);
  PetscMPIInt *procSendDispls, *procRecvDispls;
  PetscInt    *sendRows, *recvRows;
  PetscInt    *sendCols, *recvCols;
  PetscScalar *sendVals, *recvVals;

  PetscMalloc2(numProcs, &procSendDispls, numProcs, &procRecvDispls);
  PetscMalloc3(numSendEntries, &sendRows, numSendEntries, &sendCols, numSendEntries, &sendVals);
  PetscMalloc3(numRecvEntries, &recvRows, numRecvEntries, &recvCols, numRecvEntries, &recvVals);

  procSendDispls[0] = procRecvDispls[0] = 0;
  for (PetscInt p = 1; p < numProcs; ++p) {
    procSendDispls[p] = procSendDispls[p-1] + procSendSizes[p-1];
    procRecvDispls[p] = procRecvDispls[p-1] + procRecvSizes[p-1];
  }
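
  // The displacements are exclusive prefix sums of the counts: for example,
  // procSendSizes = [2, 0, 3] yields procSendDispls = [0, 2, 2], so the
  // entries bound for process p start at offset procSendDispls[p] in the
  // packed send arrays below.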
  thrust::copy(thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.begin(), nondiagonalCols.begin(), nondiagonalVals.begin())),
               thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.begin(), nondiagonalCols.begin(), nondiagonalVals.begin()))+nonlocalSize,
               thrust::make_zip_iterator(thrust::make_tuple(sendRows, sendCols, sendVals)));
  // could pack into a struct and unpack
  MPI_Alltoallv(sendRows, procSendSizes, procSendDispls, MPIU_INT,    recvRows, procRecvSizes, procRecvDispls, MPIU_INT,    comm);
  MPI_Alltoallv(sendCols, procSendSizes, procSendDispls, MPIU_INT,    recvCols, procRecvSizes, procRecvDispls, MPIU_INT,    comm);
  MPI_Alltoallv(sendVals, procSendSizes, procSendDispls, MPIU_SCALAR, recvVals, procRecvSizes, procRecvDispls, MPIU_SCALAR, comm);
  PetscFree2(procSendSizes, procRecvSizes);
  PetscFree2(procSendDispls, procRecvDispls);
  PetscFree3(sendRows, sendCols, sendVals);
  PetscLogEventEnd(MAT_SetValuesBatchIII,0,0,0,0);
  PetscLogEventBegin(MAT_SetValuesBatchIV,0,0,0,0);
  // Create off-diagonal matrix
  cusp::coo_matrix<IndexType,ValueType, memSpace> offdiagCOO(Nr, Nr, offdiagonalSize+numRecvEntries); // ALLOC: This is oversized because we also count diagonal entries in numRecvEntries
  // partition again into diagonal and off-diagonal
  IndexArray d_recvRows(recvRows, recvRows+numRecvEntries);
  IndexArray d_recvCols(recvCols, recvCols+numRecvEntries);
  ValueArray d_recvVals(recvVals, recvVals+numRecvEntries);
  thrust::copy(thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.end(), nondiagonalCols.end(), nondiagonalVals.end()))-offdiagonalSize,
               thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.end(), nondiagonalCols.end(), nondiagonalVals.end())),
               thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin(), offdiagCOO.column_indices.begin(), offdiagCOO.values.begin())));
  thrust::fill(diagCOO.row_indices.begin()+diagonalSize,       diagCOO.row_indices.end(),    -1);
  thrust::fill(offdiagCOO.row_indices.begin()+offdiagonalSize, offdiagCOO.row_indices.end(), -1);
  thrust::partition_copy(thrust::make_zip_iterator(thrust::make_tuple(d_recvRows.begin(), d_recvCols.begin(), d_recvVals.begin())),
                         thrust::make_zip_iterator(thrust::make_tuple(d_recvRows.end(), d_recvCols.end(), d_recvVals.end())),
                         thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin()+diagonalSize, diagCOO.column_indices.begin()+diagonalSize, diagCOO.values.begin()+diagonalSize)),
                         thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin()+offdiagonalSize, offdiagCOO.column_indices.begin()+offdiagonalSize, offdiagCOO.values.begin()+offdiagonalSize)),
                         is_diag(firstRow, lastRow));
  PetscFree3(recvRows, recvCols, recvVals);

  diagonalSize    = (diagCOO.row_indices.end()    - diagCOO.row_indices.begin())    - thrust::count(diagCOO.row_indices.begin(),    diagCOO.row_indices.end(),    -1);
  offdiagonalSize = (offdiagCOO.row_indices.end() - offdiagCOO.row_indices.begin()) - thrust::count(offdiagCOO.row_indices.begin(), offdiagCOO.row_indices.end(), -1);

  // sort COO format by (i,j), this is the most costly step
  PetscInfo(J, "Sorting rows and columns\n");
  diagCOO.sort_by_row_and_column();
  offdiagCOO.sort_by_row_and_column();
  PetscInt diagonalOffset    = (diagCOO.row_indices.end()    - diagCOO.row_indices.begin())    - diagonalSize;
  PetscInt offdiagonalOffset = (offdiagCOO.row_indices.end() - offdiagCOO.row_indices.begin()) - offdiagonalSize;
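
  // The fill value -1 is smaller than any valid row index, so after sorting
  // the unused slots of the oversized arrays collect at the front; the
  // offsets computed above skip past them to the first real entry.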
  // print the "fat" COO representation
  if (PetscLogPrintInfo) {
#if !defined(PETSC_USE_COMPLEX)
    cusp::print(diagCOO);
    cusp::print(offdiagCOO);
#endif
  }
  // Figure out the number of unique off-diagonal columns
  Nc = thrust::inner_product(offdiagCOO.column_indices.begin()+offdiagonalOffset,
                             offdiagCOO.column_indices.end() - 1,
                             offdiagCOO.column_indices.begin()+offdiagonalOffset + 1,
                             size_t(1), thrust::plus<size_t>(), thrust::not_equal_to<IndexType>());
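
  // The inner product counts the adjacent pairs that differ in the sorted
  // column list, e.g. columns [1, 1, 4, 4, 4, 7] contain two changes and
  // hence Nc = 3 unique off-diagonal columns.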
  // compute number of unique (i,j) entries
  // this counts the number of changes as we move along the (i,j) list
  PetscInfo(J, "Computing number of unique entries\n");
  size_t num_diag_entries = thrust::inner_product
                              (thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin(), diagCOO.column_indices.begin())) + diagonalOffset,
                               thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.end(),   diagCOO.column_indices.end())) - 1,
                               thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin(), diagCOO.column_indices.begin())) + diagonalOffset + 1,
                               size_t(1),
                               thrust::plus<size_t>(),
                               thrust::not_equal_to< thrust::tuple<IndexType,IndexType> >());
  size_t num_offdiag_entries = thrust::inner_product
                                 (thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin(), offdiagCOO.column_indices.begin())) + offdiagonalOffset,
                                  thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.end(),   offdiagCOO.column_indices.end())) - 1,
                                  thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin(), offdiagCOO.column_indices.begin())) + offdiagonalOffset + 1,
                                  size_t(1),
                                  thrust::plus<size_t>(),
                                  thrust::not_equal_to< thrust::tuple<IndexType,IndexType> >());
  // allocate COO storage for final matrix
  PetscInfo(J, "Allocating compressed matrices\n");
  cusp::coo_matrix<IndexType, ValueType, memSpace> A(Nr, Nr, num_diag_entries);
  cusp::coo_matrix<IndexType, ValueType, memSpace> B(Nr, Nc, num_offdiag_entries);

  // sum values with the same (i,j) index
  // XXX thrust::reduce_by_key is unoptimized right now, so we provide a SpMV-based one in cusp::detail
  //     the Cusp one is 2x faster, but still not optimal
  //     This could possibly be done in-place
  PetscInfo(J, "Compressing matrices\n");
  thrust::reduce_by_key(thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin(), diagCOO.column_indices.begin())) + diagonalOffset,
                        thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.end(),   diagCOO.column_indices.end())),
                        diagCOO.values.begin() + diagonalOffset,
                        thrust::make_zip_iterator(thrust::make_tuple(A.row_indices.begin(), A.column_indices.begin())),
                        A.values.begin(),
                        thrust::equal_to< thrust::tuple<IndexType,IndexType> >(),
                        thrust::plus<ValueType>());
  thrust::reduce_by_key(thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin(), offdiagCOO.column_indices.begin())) + offdiagonalOffset,
                        thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.end(),   offdiagCOO.column_indices.end())),
                        offdiagCOO.values.begin() + offdiagonalOffset,
                        thrust::make_zip_iterator(thrust::make_tuple(B.row_indices.begin(), B.column_indices.begin())),
                        B.values.begin(),
                        thrust::equal_to< thrust::tuple<IndexType,IndexType> >(),
                        thrust::plus<ValueType>());
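
  // reduce_by_key sums each run of equal (i,j) keys in the sorted COO lists,
  // e.g. (0,0,1.0), (0,0,2.0), (0,1,3.0) compresses to (0,0,3.0), (0,1,3.0);
  // this is the ADD_VALUES accumulation the usual MatSetValues() path performs.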
  // Convert row and column numbers
  if (firstRow) {
    thrust::transform(A.row_indices.begin(), A.row_indices.end(), thrust::make_constant_iterator(firstRow), A.row_indices.begin(), thrust::minus<IndexType>());
    thrust::transform(B.row_indices.begin(), B.row_indices.end(), thrust::make_constant_iterator(firstRow), B.row_indices.begin(), thrust::minus<IndexType>());
  }
  if (firstCol) {
    thrust::transform(A.column_indices.begin(), A.column_indices.end(), thrust::make_constant_iterator(firstCol), A.column_indices.begin(), thrust::minus<IndexType>());
  }

  // print the final matrix
  if (PetscLogPrintInfo) {
#if !defined(PETSC_USE_COMPLEX)
    cusp::print(A);
    cusp::print(B);
#endif
  }
  PetscInfo(J, "Converting to PETSc matrix\n");
  MatSetType(J, MATMPIAIJCUSP);
  CUSPMATRIX *Agpu = new CUSPMATRIX;
  CUSPMATRIX *Bgpu = new CUSPMATRIX;
  cusp::convert(A, *Agpu);
  cusp::convert(B, *Bgpu);
  if (PetscLogPrintInfo) {
#if !defined(PETSC_USE_COMPLEX)
    cusp::print(*Agpu);
    cusp::print(*Bgpu);
#endif
  }
  {
    PetscInfo(J, "Copying to CPU matrix\n");
    MatCUSPCopyFromGPU(j->A, Agpu);
    MatCUSPCopyFromGPU(j->B, Bgpu);
  }
  PetscLogEventEnd(MAT_SetValuesBatchIV,0,0,0,0);
  return(0);
}