petsc-3.6.1 2015-08-06
  1: #define PETSC_SKIP_COMPLEX

  3: #include <petscconf.h>
  4: PETSC_CUDA_EXTERN_C_BEGIN
  5: #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
  6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  7: #include <petscbt.h>
  8: #include <../src/vec/vec/impls/dvecimpl.h>
  9: #include <petsc/private/vecimpl.h>
 10: PETSC_CUDA_EXTERN_C_END
 11: #undef VecType
 12: #include <../src/mat/impls/aij/seq/seqcusp/cuspmatimpl.h>

 14: #include <thrust/reduce.h>
 15: #include <thrust/inner_product.h>

 17: #include <cusp/array1d.h>
 18: #include <cusp/print.h>
 19: #include <cusp/coo_matrix.h>

 21: #include <cusp/io/matrix_market.h>

 23: #include <thrust/iterator/counting_iterator.h>
 24: #include <thrust/iterator/transform_iterator.h>
 25: #include <thrust/iterator/permutation_iterator.h>
 26: #include <thrust/functional.h>
 27: #include <thrust/partition.h>
 28: #include <thrust/remove.h>

 30: // this example illustrates how to make repeated access to a range of values
 31: // examples:
 32: //   repeated_range([0, 1, 2, 3], 1) -> [0, 1, 2, 3]
 33: //   repeated_range([0, 1, 2, 3], 2) -> [0, 0, 1, 1, 2, 2, 3, 3]
 34: //   repeated_range([0, 1, 2, 3], 3) -> [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
 35: //   ...

 37: template <typename Iterator>
 38: class repeated_range
 39: {
 40: public:

 42:   typedef typename thrust::iterator_difference<Iterator>::type difference_type;

 44:   struct repeat_functor : public thrust::unary_function<difference_type,difference_type>
 45:   {
 46:     difference_type repeats;

 48:     repeat_functor(difference_type repeats) : repeats(repeats) {}

 50:     __host__ __device__
 51:     difference_type operator()(const difference_type &i) const
 52:     {
 53:       return i / repeats;
 54:     }
 55:   };

 57:   typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
 58:   typedef typename thrust::transform_iterator<repeat_functor, CountingIterator> TransformIterator;
 59:   typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

 61:   // type of the repeated_range iterator
 62:   typedef PermutationIterator iterator;

 64:   // construct repeated_range for the range [first,last)
 65:   repeated_range(Iterator first, Iterator last, difference_type repeats)
 66:     : first(first), last(last), repeats(repeats) {}

 68:   iterator begin(void) const
 69:   {
 70:     return PermutationIterator(first, TransformIterator(CountingIterator(0), repeat_functor(repeats)));
 71:   }

 73:   iterator end(void) const
 74:   {
 75:     return begin() + repeats * (last - first);
 76:   }

 78: protected:
 79:   difference_type repeats;
 80:   Iterator        first;
 81:   Iterator        last;

 83: };

 85: // this example illustrates how to repeat blocks in a range multiple times
 86: // examples:
 87: //   tiled_range([0, 1, 2, 3], 2)    -> [0, 1, 2, 3, 0, 1, 2, 3]
 88: //   tiled_range([0, 1, 2, 3], 4, 2) -> [0, 1, 2, 3, 0, 1, 2, 3]
 89: //   tiled_range([0, 1, 2, 3], 2, 2) -> [0, 1, 0, 1, 2, 3, 2, 3]
 90: //   tiled_range([0, 1, 2, 3], 2, 3) -> [0, 1, 0, 1 0, 1, 2, 3, 2, 3, 2, 3]
 91: //   ...

 93: template <typename Iterator>
 94: class tiled_range
 95: {
 96: public:

 98:   typedef typename thrust::iterator_difference<Iterator>::type difference_type;

100:   struct tile_functor : public thrust::unary_function<difference_type,difference_type>
101:   {
102:     difference_type repeats;
103:     difference_type tile_size;

105:     tile_functor(difference_type repeats, difference_type tile_size)
106:       : tile_size(tile_size), repeats(repeats) {}

108:     __host__ __device__
109:     difference_type operator() (const difference_type &i) const
110:     {
111:       return tile_size * (i / (tile_size * repeats)) + i % tile_size;
112:     }
113:   };

115:   typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
116:   typedef typename thrust::transform_iterator<tile_functor, CountingIterator>   TransformIterator;
117:   typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

119:   // type of the tiled_range iterator
120:   typedef PermutationIterator iterator;

122:   // construct repeated_range for the range [first,last)
123:   tiled_range(Iterator first, Iterator last, difference_type repeats)
124:     : first(first), last(last), repeats(repeats), tile_size(last - first) {}

126:   tiled_range(Iterator first, Iterator last, difference_type repeats, difference_type tile_size)
127:     : first(first), last(last), repeats(repeats), tile_size(tile_size)
128:   {
129:     // ASSERT((last - first) % tile_size == 0)
130:   }

132:   iterator begin(void) const
133:   {
134:     return PermutationIterator(first, TransformIterator(CountingIterator(0), tile_functor(repeats, tile_size)));
135:   }

137:   iterator end(void) const
138:   {
139:     return begin() + repeats * (last - first);
140:   }

142: protected:
143:   difference_type repeats;
144:   difference_type tile_size;
145:   Iterator        first;
146:   Iterator        last;
147: };

149: typedef cusp::device_memory memSpace;
150: typedef int IndexType;
151: typedef PetscScalar ValueType;
152: typedef cusp::array1d<IndexType, memSpace> IndexArray;
153: typedef cusp::array1d<ValueType, memSpace> ValueArray;
154: typedef cusp::array1d<IndexType, cusp::host_memory> IndexHostArray;
155: typedef IndexArray::iterator IndexArrayIterator;
156: typedef ValueArray::iterator ValueArrayIterator;

158: struct is_diag
159: {
160:   IndexType first, last;

162:   is_diag(IndexType first, IndexType last) : first(first), last(last) {}

164:   template <typename Tuple>
165:   __host__ __device__
166:   bool operator()(Tuple t)
167:   {
168:     // Check column
169:     IndexType row = thrust::get<0>(t);
170:     IndexType col = thrust::get<1>(t);
171:     return (row >= first) && (row < last) && (col >= first) && (col < last);
172:   }
173: };

175: struct is_nonlocal
176: {
177:   IndexType first, last;

179:   is_nonlocal(IndexType first, IndexType last) : first(first), last(last) {}

181:   template <typename Tuple>
182:   __host__ __device__
183:   bool operator() (Tuple t)
184:   {
185:     // Check column
186:     IndexType row = thrust::get<0>(t);
187:     return (row < first) || (row >= last);
188:   }
189: };

191: /*@C
192:   MatMPIAIJSetValuesBatch - Set multiple blocks of values into a matrix

194:   Not collective

196:   Input Parameters:
197: + J  - the assembled Mat object
198: . Ne -  the number of blocks (elements)
199: . Nl -  the block size (number of dof per element)
200: . elemRows - List of block row indices, in bunches of length Nl
201: - elemMats - List of block values, in bunches of Nl*Nl

203:   Level: advanced

205: .seealso MatSetValues()
206: @*/
209: PetscErrorCode MatSetValuesBatch_MPIAIJCUSP(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats)
210: {
211:   // Assumptions:
212:   //   1) Each elemMat is square, of size Nl x Nl
213:   //
214:   //      This means that any nonlocal entry (i,j) where i is owned by another process is matched to
215:   //        a) an offdiagonal entry (j,i) if j is diagonal, or
216:   //        b) another nonlocal entry (j,i) if j is offdiagonal
217:   //
218:   //      No - numSendEntries: The number of on-process  diagonal+offdiagonal entries
219:   //      numRecvEntries:      The number of off-process diagonal+offdiagonal entries
220:   //
221:   //  Glossary:
222:   //     diagonal: (i,j) such that i,j in [firstRow, lastRow)
223:   //  offdiagonal: (i,j) such that i in [firstRow, lastRow), and j not in [firstRow, lastRow)
224:   //     nonlocal: (i,j) such that i not in [firstRow, lastRow)
225:   //  nondiagonal: (i,j) such that i not in [firstRow, lastRow), or j not in [firstRow, lastRow)
226:   //   on-process: entries provided by elemMats
227:   //  off-process: entries received from other processes
228:   MPI_Comm       comm;
229:   Mat_MPIAIJ     *j   = (Mat_MPIAIJ*) J->data;
230:   size_t         N    = Ne * Nl;     // Length of elemRows (dimension of unassembled space)
231:   size_t         No   = Ne * Nl*Nl;  // Length of elemMats (total number of values)
232:   PetscInt       Nr;                 // Size of J          (dimension of assembled space)
233:   PetscInt       firstRow, lastRow, firstCol;
234:   const PetscInt *rowRanges;
235:   PetscInt       numNonlocalRows;    // Number of rows in elemRows not owned by this process
236:   PetscInt       numSendEntries;     // Number of (i,j,v) entries sent to other processes
237:   PetscInt       numRecvEntries;     // Number of (i,j,v) entries received from other processes
238:   PetscInt       Nc;
239:   PetscMPIInt    numProcs, rank;

242:   // copy elemRows and elemMat to device
243:   IndexArray d_elemRows(elemRows, elemRows + N);
244:   ValueArray d_elemMats(elemMats, elemMats + No);

247:   PetscObjectGetComm((PetscObject)J,&comm);
248:   MPI_Comm_size(comm, &numProcs);
249:   MPI_Comm_rank(comm, &rank);
250:   // get matrix information
251:   MatGetLocalSize(J, &Nr, NULL);
252:   MatGetOwnershipRange(J, &firstRow, &lastRow);
253:   MatGetOwnershipRanges(J, &rowRanges);
254:   MatGetOwnershipRangeColumn(J, &firstCol, NULL);
255:   PetscInfo3(J, "Assembling matrix of size %d (rows %d -- %d)\n", Nr, firstRow, lastRow);

257:   // repeat elemRows entries Nl times
258:   PetscInfo(J, "Making row indices\n");
259:   repeated_range<IndexArrayIterator> rowInd(d_elemRows.begin(), d_elemRows.end(), Nl);

261:   // tile rows of elemRows Nl times
262:   PetscInfo(J, "Making column indices\n");
263:   tiled_range<IndexArrayIterator> colInd(d_elemRows.begin(), d_elemRows.end(), Nl, Nl);

265:   // Find number of nonlocal rows, convert nonlocal rows to procs, and send sizes of off-proc entries (could send diag and offdiag sizes)
266:   // TODO: Ask Nathan how to do this on GPU
267:   PetscLogEventBegin(MAT_SetValuesBatchI,0,0,0,0);
268:   PetscMPIInt *procSendSizes, *procRecvSizes;

270:   PetscCalloc2(numProcs, &procSendSizes, numProcs, &procRecvSizes);

272:   numNonlocalRows = 0;
273:   for (size_t i = 0; i < N; ++i) {
274:     const PetscInt row = elemRows[i];

276:     if ((row < firstRow) || (row >= lastRow)) {
277:       numNonlocalRows++;
278:       for (IndexType p = 0; p < numProcs; ++p) {
279:         if ((row >= rowRanges[p]) && (row < rowRanges[p+1])) {
280:           procSendSizes[p] += Nl;
281:           break;
282:         }
283:       }
284:     }
285:   }
286:   numSendEntries = numNonlocalRows*Nl;

288:   PetscInfo2(J, "Nonlocal rows %d total entries %d\n", numNonlocalRows, No);
289:   MPI_Alltoall(procSendSizes, 1, MPIU_INT, procRecvSizes, 1, MPIU_INT, comm);

291:   numRecvEntries = 0;
292:   for (PetscInt p = 0; p < numProcs; ++p) numRecvEntries += procRecvSizes[p];
293:   PetscInfo2(j->A, "Send entries %d Recv Entries %d\n", numSendEntries, numRecvEntries);
294:   PetscLogEventEnd(MAT_SetValuesBatchI,0,0,0,0);
295:   // Allocate storage for "fat" COO representation of matrix
296:   PetscLogEventBegin(MAT_SetValuesBatchII,0,0,0,0);
297:   PetscInfo2(j->A, "Making COO matrices, diag entries %d, nondiag entries %d\n", No-numSendEntries+numRecvEntries, numSendEntries*2);
298:   cusp::coo_matrix<IndexType,ValueType, memSpace> diagCOO(Nr, Nr, No-numSendEntries+numRecvEntries); // ALLOC: This is oversized because I also count offdiagonal entries
299:   IndexArray nondiagonalRows(numSendEntries+numSendEntries); // ALLOC: This is oversized because numSendEntries > on-process offdiagonal entries
300:   IndexArray nondiagonalCols(numSendEntries+numSendEntries); // ALLOC: This is oversized because numSendEntries > on-process offdiagonal entries
301:   ValueArray nondiagonalVals(numSendEntries+numSendEntries); // ALLOC: This is oversized because numSendEntries > on-process offdiagonal entries
302:   IndexArray nonlocalRows(numSendEntries);
303:   IndexArray nonlocalCols(numSendEntries);
304:   ValueArray nonlocalVals(numSendEntries);
305:   // partition on-process entries into diagonal and off-diagonal+nonlocal
306:   PetscInfo(J, "Splitting on-process entries into diagonal and off-diagonal+nonlocal\n");
307:   thrust::fill(diagCOO.row_indices.begin(), diagCOO.row_indices.end(), -1);
308:   thrust::fill(nondiagonalRows.begin(),     nondiagonalRows.end(),     -1);
309:   thrust::partition_copy(thrust::make_zip_iterator(thrust::make_tuple(rowInd.begin(), colInd.begin(), d_elemMats.begin())),
310:                          thrust::make_zip_iterator(thrust::make_tuple(rowInd.end(),   colInd.end(),   d_elemMats.end())),
311:                          thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin(),    diagCOO.column_indices.begin(), diagCOO.values.begin())),
312:                          thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.begin(),        nondiagonalCols.begin(),        nondiagonalVals.begin())),
313:                          is_diag(firstRow, lastRow));
314:   // Current size without off-proc entries
315:   PetscInt diagonalSize    = (diagCOO.row_indices.end() - diagCOO.row_indices.begin()) - thrust::count(diagCOO.row_indices.begin(), diagCOO.row_indices.end(), -1);
316:   PetscInt nondiagonalSize = No - diagonalSize;
317:   PetscInfo2(j->A, "Diagonal size %d Nondiagonal size %d\n", diagonalSize, nondiagonalSize);
318:   ///cusp::print(diagCOO);
319:   ///cusp::print(nondiagonalRows);
320:   // partition on-process entries again into off-diagonal and nonlocal
321:   PetscInfo(J, "Splitting on-process entries into off-diagonal and nonlocal\n");
322:   thrust::stable_partition(thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.begin(), nondiagonalCols.begin(), nondiagonalVals.begin())),
323:                            thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.end(),   nondiagonalCols.end(),   nondiagonalVals.end())),
324:                            is_nonlocal(firstRow, lastRow));
325:   PetscInt nonlocalSize    = numSendEntries;
326:   PetscInt offdiagonalSize = nondiagonalSize - nonlocalSize;
327:   PetscInfo2(j->A, "Nonlocal size %d Offdiagonal size %d\n", nonlocalSize, offdiagonalSize);
328:   PetscLogEventEnd(MAT_SetValuesBatchII,0,0,0,0);
329:   ///cusp::print(nondiagonalRows);
330:   // send off-proc entries (pack this up later)
331:   PetscLogEventBegin(MAT_SetValuesBatchIII,0,0,0,0);
332:   PetscMPIInt *procSendDispls, *procRecvDispls;
333:   PetscInt    *sendRows, *recvRows;
334:   PetscInt    *sendCols, *recvCols;
335:   PetscScalar *sendVals, *recvVals;

337:   PetscMalloc2(numProcs, &procSendDispls, numProcs, &procRecvDispls);
338:   PetscMalloc3(numSendEntries, &sendRows, numSendEntries, &sendCols, numSendEntries, &sendVals);
339:   PetscMalloc3(numRecvEntries, &recvRows, numRecvEntries, &recvCols, numRecvEntries, &recvVals);

341:   procSendDispls[0] = procRecvDispls[0] = 0;
342:   for (PetscInt p = 1; p < numProcs; ++p) {
343:     procSendDispls[p] = procSendDispls[p-1] + procSendSizes[p-1];
344:     procRecvDispls[p] = procRecvDispls[p-1] + procRecvSizes[p-1];
345:   }
346:   thrust::copy(thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.begin(), nondiagonalCols.begin(), nondiagonalVals.begin())),
347:                thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.begin(), nondiagonalCols.begin(), nondiagonalVals.begin()))+nonlocalSize,
348:                thrust::make_zip_iterator(thrust::make_tuple(sendRows,                sendCols,                sendVals)));
349:   //   could pack into a struct and unpack
350:   MPI_Alltoallv(sendRows, procSendSizes, procSendDispls, MPIU_INT,    recvRows, procRecvSizes, procRecvDispls, MPIU_INT,    comm);
351:   MPI_Alltoallv(sendCols, procSendSizes, procSendDispls, MPIU_INT,    recvCols, procRecvSizes, procRecvDispls, MPIU_INT,    comm);
352:   MPI_Alltoallv(sendVals, procSendSizes, procSendDispls, MPIU_SCALAR, recvVals, procRecvSizes, procRecvDispls, MPIU_SCALAR, comm);
353:   PetscFree2(procSendSizes, procRecvSizes);
354:   PetscFree2(procSendDispls, procRecvDispls);
355:   PetscFree3(sendRows, sendCols, sendVals);
356:   PetscLogEventEnd(MAT_SetValuesBatchIII,0,0,0,0);

358:   PetscLogEventBegin(MAT_SetValuesBatchIV,0,0,0,0);
359:   // Create off-diagonal matrix
360:   cusp::coo_matrix<IndexType,ValueType, memSpace> offdiagCOO(Nr, Nr, offdiagonalSize+numRecvEntries); // ALLOC: This is oversizes because we count diagonal entries in numRecvEntries
361:   // partition again into diagonal and off-diagonal
362:   IndexArray d_recvRows(recvRows, recvRows+numRecvEntries);
363:   IndexArray d_recvCols(recvCols, recvCols+numRecvEntries);
364:   ValueArray d_recvVals(recvVals, recvVals+numRecvEntries);
365:   thrust::copy(thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.end(),          nondiagonalCols.end(),             nondiagonalVals.end()))-offdiagonalSize,
366:                thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.end(),          nondiagonalCols.end(),             nondiagonalVals.end())),
367:                thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin(), offdiagCOO.column_indices.begin(), offdiagCOO.values.begin())));
368:   thrust::fill(diagCOO.row_indices.begin()+diagonalSize,       diagCOO.row_indices.end(),    -1);
369:   thrust::fill(offdiagCOO.row_indices.begin()+offdiagonalSize, offdiagCOO.row_indices.end(), -1);
370:   thrust::partition_copy(thrust::make_zip_iterator(thrust::make_tuple(d_recvRows.begin(), d_recvCols.begin(), d_recvVals.begin())),
371:                          thrust::make_zip_iterator(thrust::make_tuple(d_recvRows.end(),   d_recvCols.end(),   d_recvVals.end())),
372:                          thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin()+diagonalSize, diagCOO.column_indices.begin()+diagonalSize, diagCOO.values.begin()+diagonalSize)),
373:                          thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin()+offdiagonalSize, offdiagCOO.column_indices.begin()+offdiagonalSize, offdiagCOO.values.begin()+offdiagonalSize)),
374:                          is_diag(firstRow, lastRow));

376:   PetscFree3(recvRows, recvCols, recvVals);

378:   diagonalSize    = (diagCOO.row_indices.end()    - diagCOO.row_indices.begin())    - thrust::count(diagCOO.row_indices.begin(),    diagCOO.row_indices.end(),    -1);
379:   offdiagonalSize = (offdiagCOO.row_indices.end() - offdiagCOO.row_indices.begin()) - thrust::count(offdiagCOO.row_indices.begin(), offdiagCOO.row_indices.end(), -1);

381:   // sort COO format by (i,j), this is the most costly step
382:   PetscInfo(J, "Sorting rows and columns\n");
383:   diagCOO.sort_by_row_and_column();
384:   offdiagCOO.sort_by_row_and_column();
385:   PetscInt diagonalOffset    = (diagCOO.row_indices.end()    - diagCOO.row_indices.begin())    - diagonalSize;
386:   PetscInt offdiagonalOffset = (offdiagCOO.row_indices.end() - offdiagCOO.row_indices.begin()) - offdiagonalSize;

388:   // print the "fat" COO representation
389:   if (PetscLogPrintInfo) {
390: #if !defined(PETSC_USE_COMPLEX)
391:     cusp::print(diagCOO);
392:     cusp::print(offdiagCOO);
393: #endif
394:   }

396:   // Figure out the number of unique off-diagonal columns
397:   Nc = thrust::inner_product(offdiagCOO.column_indices.begin()+offdiagonalOffset,
398:                              offdiagCOO.column_indices.end()   - 1,
399:                              offdiagCOO.column_indices.begin()+offdiagonalOffset + 1,
400:                              size_t(1), thrust::plus<size_t>(), thrust::not_equal_to<IndexType>());

402:   // compute number of unique (i,j) entries
403:   //   this counts the number of changes as we move along the (i,j) list
404:   PetscInfo(J, "Computing number of unique entries\n");
405:   size_t num_diag_entries = thrust::inner_product
406:                               (thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin(), diagCOO.column_indices.begin())) + diagonalOffset,
407:                               thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.end(),   diagCOO.column_indices.end())) - 1,
408:                               thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin(), diagCOO.column_indices.begin())) + diagonalOffset + 1,
409:                               size_t(1),
410:                               thrust::plus<size_t>(),
411:                               thrust::not_equal_to< thrust::tuple<IndexType,IndexType> >());
412:   size_t num_offdiag_entries = thrust::inner_product
413:                                  (thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin(), offdiagCOO.column_indices.begin())) + offdiagonalOffset,
414:                                  thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.end(),   offdiagCOO.column_indices.end())) - 1,
415:                                  thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin(), offdiagCOO.column_indices.begin())) + offdiagonalOffset + 1,
416:                                  size_t(1),
417:                                  thrust::plus<size_t>(),
418:                                  thrust::not_equal_to< thrust::tuple<IndexType,IndexType> >());

420:   // allocate COO storage for final matrix
421:   PetscInfo(J, "Allocating compressed matrices\n");
422:   cusp::coo_matrix<IndexType, ValueType, memSpace> A(Nr, Nr, num_diag_entries);
423:   cusp::coo_matrix<IndexType, ValueType, memSpace> B(Nr, Nc, num_offdiag_entries);

425:   // sum values with the same (i,j) index
426:   // XXX thrust::reduce_by_key is unoptimized right now, so we provide a SpMV-based one in cusp::detail
427:   //     the Cusp one is 2x faster, but still not optimal
428:   // This could possibly be done in-place
429:   PetscInfo(J, "Compressing matrices\n");
430:   thrust::reduce_by_key(thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin(), diagCOO.column_indices.begin())) + diagonalOffset,
431:                         thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.end(),   diagCOO.column_indices.end())),
432:                         diagCOO.values.begin() + diagonalOffset,
433:                         thrust::make_zip_iterator(thrust::make_tuple(A.row_indices.begin(), A.column_indices.begin())),
434:                         A.values.begin(),
435:                         thrust::equal_to< thrust::tuple<IndexType,IndexType> >(),
436:                         thrust::plus<ValueType>());

438:   thrust::reduce_by_key(thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin(), offdiagCOO.column_indices.begin())) + offdiagonalOffset,
439:                         thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.end(),   offdiagCOO.column_indices.end())),
440:                         offdiagCOO.values.begin() + offdiagonalOffset,
441:                         thrust::make_zip_iterator(thrust::make_tuple(B.row_indices.begin(), B.column_indices.begin())),
442:                         B.values.begin(),
443:                         thrust::equal_to< thrust::tuple<IndexType,IndexType> >(),
444:                         thrust::plus<ValueType>());

446:   // Convert row and column numbers
447:   if (firstRow) {
448:     thrust::transform(A.row_indices.begin(), A.row_indices.end(), thrust::make_constant_iterator(firstRow), A.row_indices.begin(), thrust::minus<IndexType>());
449:     thrust::transform(B.row_indices.begin(), B.row_indices.end(), thrust::make_constant_iterator(firstRow), B.row_indices.begin(), thrust::minus<IndexType>());
450:   }
451:   if (firstCol) {
452:     thrust::transform(A.column_indices.begin(), A.column_indices.end(), thrust::make_constant_iterator(firstCol), A.column_indices.begin(), thrust::minus<IndexType>());
453:   }

455:   // print the final matrix
456:   if (PetscLogPrintInfo) {
457: #if !defined(PETSC_USE_COMPLEX)
458:     cusp::print(A);
459:     cusp::print(B);
460: #endif
461:   }

463:   PetscInfo(J, "Converting to PETSc matrix\n");
464:   MatSetType(J, MATMPIAIJCUSP);
465:   CUSPMATRIX *Agpu = new CUSPMATRIX;
466:   CUSPMATRIX *Bgpu = new CUSPMATRIX;
467:   cusp::convert(A, *Agpu);
468:   cusp::convert(B, *Bgpu);
469:   if (PetscLogPrintInfo) {
470: #if !defined(PETSC_USE_COMPLEX)
471:     cusp::print(*Agpu);
472:     cusp::print(*Bgpu);
473: #endif
474:   }
475:   {
476:     PetscInfo(J, "Copying to CPU matrix");
477:     MatCUSPCopyFromGPU(j->A, Agpu);
478:     MatCUSPCopyFromGPU(j->B, Bgpu);
479:   }
480:   PetscLogEventEnd(MAT_SetValuesBatchIV,0,0,0,0);
481:   return(0);
482: }