Commit 4068cfb
update
Former-commit-id: 0e11339ff84656d4a9b0cee7b206bb7184a13086
dumerrill committed Mar 18, 2015
1 parent 8842512
Showing 3 changed files with 87 additions and 49 deletions.
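This commit replaces the single HAS_BETA_ZERO flag with two independent compile-time flags, HAS_ALPHA and HAS_BETA, so the SpMV agent and its dispatch layer can specialize for the four combinations of alpha == 1 / alpha != 1 and beta == 0 / beta != 0 in y = alpha*A*x + beta*y. For orientation, the following is a minimal host-side sketch of the CSR SpMV semantics those specializations compute; it is illustrative only and not part of the commit (the function and parameter names are assumptions).

// Reference semantics: y = alpha * A * x + beta * y for a CSR matrix (host-side sketch).
// row_offsets has num_rows + 1 entries; column_indices and values have num_nonzeros entries.
template <typename ValueT, typename OffsetT>
void ReferenceCsrmv(
    OffsetT        num_rows,
    const OffsetT* row_offsets,
    const OffsetT* column_indices,
    const ValueT*  values,
    const ValueT*  vector_x,
    ValueT         alpha,
    ValueT         beta,
    ValueT*        vector_y)
{
    for (OffsetT row = 0; row < num_rows; ++row)
    {
        ValueT dot = ValueT(0);
        for (OffsetT nz = row_offsets[row]; nz < row_offsets[row + 1]; ++nz)
            dot += values[nz] * vector_x[column_indices[nz]];

        // When beta == 0 the output is set (y is never read); when alpha == 1 the
        // scaling multiply can be skipped.  The new HAS_ALPHA / HAS_BETA flags let
        // the device kernels make those decisions at compile time.
        vector_y[row] = (beta == ValueT(0)) ? alpha * dot
                                            : alpha * dot + beta * vector_y[row];
    }
}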
46 changes: 30 additions & 16 deletions cub/agent/agent_spmv.cuh
@@ -117,7 +117,8 @@ template <
typename ValueT, ///< Matrix and vector value type
typename OffsetT, ///< Signed integer type for sequence offsets
typename CoordinateT, ///< Merge path coordinate type
bool HAS_BETA_ZERO, ///< Whether the input parameter \p beta is zero (and vector Y is set rather than updated)
bool HAS_ALPHA, ///< Whether the input parameter \p alpha is not 1 (the dot products must be scaled)
bool HAS_BETA, ///< Whether the input parameter \p beta is not 0 (vector Y is updated rather than set)
int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability
struct AgentSpmv
{
@@ -260,17 +261,17 @@ struct AgentSpmv
{
if (row_indices[ITEM] < tile_num_rows)
{
if (HAS_BETA_ZERO)
{
// Set the output vector element
d_vector_y_out[tile_start_coord.x + thread_segment[ITEM].offset] = thread_segment[ITEM].value;
}
else
if (HAS_BETA)
{
// Update the output vector element
ValueT addend = beta * d_vector_y_in[tile_start_coord.x + thread_segment[ITEM].offset];
d_vector_y_out[tile_start_coord.x + thread_segment[ITEM].offset] = thread_segment[ITEM].value + addend;
}
else
{
// Set the output vector element
d_vector_y_out[tile_start_coord.x + thread_segment[ITEM].offset] = thread_segment[ITEM].value;
}
}
}
}
@@ -295,24 +296,27 @@ struct AgentSpmv
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
if (row_indices[ITEM] < tile_num_rows)
{
s_tile_nonzeros[thread_segment[ITEM].offset] = thread_segment[ITEM].value;
}
}

__syncthreads();

// Scatter row dot products from smem to gmem
for (int item_idx = threadIdx.x; item_idx < tile_num_rows; item_idx += BLOCK_THREADS)
{
if (HAS_BETA_ZERO)
{
// Set the output vector element
d_vector_y_out[tile_start_coord.x + item_idx] = s_tile_nonzeros[item_idx];
}
if (HAS_BETA)
{
// Update the output vector element
ValueT addend = d_vector_y_in[tile_start_coord.x + item_idx] * beta;
d_vector_y_out[tile_start_coord.x + item_idx] = s_tile_nonzeros[item_idx] + addend;
}
else
{
// Set the output vector element
d_vector_y_out[tile_start_coord.x + item_idx] = s_tile_nonzeros[item_idx];
}
}
}

@@ -363,7 +367,7 @@ struct AgentSpmv
OffsetT column_index = d_matrix_column_indices[tile_start_coord.y + item];
ValueT matrix_value = d_matrix_values[tile_start_coord.y + item];
ValueT vector_value = d_vector_x[column_index];
nonzeros[ITEM] = matrix_value * vector_value;
}

__syncthreads(); // Perf-sync
Expand All @@ -373,6 +377,9 @@ struct AgentSpmv
tile_aggregate.offset = 0;
tile_aggregate.value = tile_sum;

if (HAS_ALPHA)
tile_aggregate.value *= alpha;

// Return the tile's running carry-out
return tile_aggregate;
}
@@ -433,7 +440,10 @@ struct AgentSpmv
OffsetT column_index = d_matrix_column_indices[nonzero_idx];
ValueT matrix_value = d_matrix_values[nonzero_idx];
ValueT vector_value = d_vector_x[column_index];
ValueT nonzero = alpha * matrix_value * vector_value;
ValueT nonzero = matrix_value * vector_value;

if (HAS_ALPHA)
nonzero *= alpha;

bool accumulate = (tile_nonzero_indices[thread_current_coord.y] < s_tile_row_end_offsets[thread_current_coord.x]);
if (accumulate)
@@ -492,8 +502,12 @@ struct AgentSpmv
OffsetT column_index = d_matrix_column_indices[tile_start_coord.y + item];
ValueT matrix_value = d_matrix_values[tile_start_coord.y + item];
ValueT vector_value = d_vector_x[column_index];

s_tile_nonzeros[item] = alpha * matrix_value * vector_value;
ValueT nonzero = matrix_value * vector_value;

if (HAS_ALPHA)
nonzero *= alpha;

s_tile_nonzeros[item] = nonzero;
}

__syncthreads(); // Perf-sync
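Inside the agent the two flags are template parameters, so each instantiation drops the work it does not need: the multiply by alpha vanishes when HAS_ALPHA is false, and the load of d_vector_y_in vanishes when HAS_BETA is false. Below is a reduced sketch of that pattern with hypothetical names, outside the context of the real agent.

// Sketch of compile-time specialization on HAS_ALPHA / HAS_BETA (hypothetical helper).
// Branches on the boolean template parameters are resolved by the compiler, so each
// of the four instantiations contains only the instructions it actually needs.
template <bool HAS_ALPHA, bool HAS_BETA, typename ValueT, typename OffsetT>
__device__ __forceinline__ void StoreRowResult(
    ValueT        dot,              // dot product accumulated for this row
    ValueT        alpha,
    ValueT        beta,
    const ValueT* d_vector_y_in,    // only read when HAS_BETA is true
    ValueT*       d_vector_y_out,
    OffsetT       row)
{
    if (HAS_ALPHA)
        dot *= alpha;               // emitted only when alpha != 1

    if (HAS_BETA)
        d_vector_y_out[row] = dot + beta * d_vector_y_in[row];   // update y
    else
        d_vector_y_out[row] = dot;                               // set y (no load of y)
}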
70 changes: 44 additions & 26 deletions cub/device/dispatch/dispatch_spmv.cuh
@@ -122,7 +122,8 @@ template <
typename ValueT, ///< Matrix and vector value type
typename OffsetT, ///< Signed integer type for sequence offsets
typename CoordinateT, ///< Merge path coordinate type
bool HAS_BETA_ZERO> ///< Whether the input parameter Beta is zero (and vector Y is set rather than updated)
bool HAS_ALPHA, ///< Whether the input parameter Alpha is not 1 (the dot products must be scaled)
bool HAS_BETA> ///< Whether the input parameter Beta is not 0 (vector Y is updated rather than set)
__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS))
__global__ void DeviceSpmvKernel(
SpmvParams<ValueT, OffsetT> spmv_params, ///< [in] SpMV input parameter bundle
@@ -136,7 +137,8 @@ __global__ void DeviceSpmvKernel(
ValueT,
OffsetT,
CoordinateT,
HAS_BETA_ZERO>
HAS_ALPHA,
HAS_BETA>
AgentSpmvT;

// Shared memory for AgentSpmv
@@ -567,33 +569,49 @@ struct DispatchSpmv
// Dispatch
if (spmv_params.beta == 0.0)
{
// Dispatch y = alpha*A*x
if (CubDebug(error = Dispatch(
d_temp_storage,
temp_storage_bytes,
spmv_params,
stream,
debug_synchronous,
DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, true>,
DeviceReduceByKeyKernel<PtxReduceByKeyPolicy, OffsetT*, OffsetT*, ValueT*, ValueT*, OffsetT*, ScanTileStateT, cub::Equality, cub::Sum, OffsetT, true>,
spmv_config,
reduce_by_key_config))) break;
if (spmv_params.alpha == 1.0)
{
// Dispatch y = A*x
if (CubDebug(error = Dispatch(
d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, false, false>,
DeviceReduceByKeyKernel<PtxReduceByKeyPolicy, OffsetT*, OffsetT*, ValueT*, ValueT*, OffsetT*, ScanTileStateT, cub::Equality, cub::Sum, OffsetT, true>,
spmv_config, reduce_by_key_config))) break;
}
else
{
// Dispatch y = alpha*A*x
if (CubDebug(error = Dispatch(
d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, true, false>,
DeviceReduceByKeyKernel<PtxReduceByKeyPolicy, OffsetT*, OffsetT*, ValueT*, ValueT*, OffsetT*, ScanTileStateT, cub::Equality, cub::Sum, OffsetT, true>,
spmv_config, reduce_by_key_config))) break;
}
}
else
{
// Dispatch y = alpha*A*x + beta*y
if (CubDebug(error = Dispatch(
d_temp_storage,
temp_storage_bytes,
spmv_params,
stream,
debug_synchronous,
DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, false>,
DeviceReduceByKeyKernel<PtxReduceByKeyPolicy, OffsetT*, OffsetT*, ValueT*, ValueT*, OffsetT*, ScanTileStateT, cub::Equality, cub::Sum, OffsetT, true>,
spmv_config,
reduce_by_key_config))) break;
if (spmv_params.alpha == 1.0)
{
// Dispatch y = A*x + beta*y
if (CubDebug(error = Dispatch(
d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, false, true>,
DeviceReduceByKeyKernel<PtxReduceByKeyPolicy, OffsetT*, OffsetT*, ValueT*, ValueT*, OffsetT*, ScanTileStateT, cub::Equality, cub::Sum, OffsetT, true>,
spmv_config, reduce_by_key_config))) break;
}
else
{
// Dispatch y = alpha*A*x + beta*y
if (CubDebug(error = Dispatch(
d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
DeviceSpmvSearchKernel<PtxSpmvPolicyT, ScanTileStateT, OffsetT, CoordinateT, SpmvParamsT>,
DeviceSpmvKernel<PtxSpmvPolicyT, ValueT, OffsetT, CoordinateT, true, true>,
DeviceReduceByKeyKernel<PtxReduceByKeyPolicy, OffsetT*, OffsetT*, ValueT*, ValueT*, OffsetT*, ScanTileStateT, cub::Equality, cub::Sum, OffsetT, true>,
spmv_config, reduce_by_key_config))) break;
}
}

}
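On the host side, DispatchSpmv inspects the runtime values of alpha and beta and selects one of the four kernel instantiations. Stripped of the temp-storage, stream, and kernel-configuration arguments, the selection logic added here is equivalent to the sketch below (KernelLauncher is a hypothetical stand-in for the Dispatch call).

#include <cstdio>

// Hypothetical stand-in for launching DeviceSpmvKernel<..., HAS_ALPHA, HAS_BETA>.
template <bool HAS_ALPHA, bool HAS_BETA>
void KernelLauncher()
{
    printf("HAS_ALPHA=%d, HAS_BETA=%d\n", int(HAS_ALPHA), int(HAS_BETA));
}

// Map runtime alpha/beta onto the four compile-time specializations.
void SelectSpecialization(double alpha, double beta)
{
    if (beta == 0.0)
    {
        if (alpha == 1.0) KernelLauncher<false, false>();   // y = A*x
        else              KernelLauncher<true,  false>();   // y = alpha*A*x
    }
    else
    {
        if (alpha == 1.0) KernelLauncher<false, true>();    // y = A*x + beta*y
        else              KernelLauncher<true,  true>();    // y = alpha*A*x + beta*y
    }
}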
20 changes: 13 additions & 7 deletions experimental/spmv_compare.cu
@@ -118,7 +118,8 @@ float CusparseSpmv(
cusparseCreateMatDescr(&desc);

// Reset input/output vector y
CubDebugExit(cudaMemcpy(d_vector_y, vector_y_in, sizeof(float) * num_rows, cudaMemcpyHostToDevice));
if (beta != 0)
CubDebugExit(cudaMemcpy(d_vector_y, vector_y_in, sizeof(float) * num_rows, cudaMemcpyHostToDevice));

// Warmup
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
@@ -134,7 +135,8 @@ float CusparseSpmv(
for(int it = 0; it < timing_iterations; ++it)
{
// Reset input/output vector y
CubDebugExit(cudaMemcpy(d_vector_y, vector_y_in, sizeof(float) * num_rows, cudaMemcpyHostToDevice));
if (beta != 0)
CubDebugExit(cudaMemcpy(d_vector_y, vector_y_in, sizeof(float) * num_rows, cudaMemcpyHostToDevice));

gpu_timer.Start();

@@ -177,7 +179,8 @@ float CusparseSpmv(
cusparseCreateMatDescr(&desc);

// Reset input/output vector y
CubDebugExit(cudaMemcpy(d_vector_y, vector_y_in, sizeof(double) * num_rows, cudaMemcpyHostToDevice));
if (beta != 0)
CubDebugExit(cudaMemcpy(d_vector_y, vector_y_in, sizeof(double) * num_rows, cudaMemcpyHostToDevice));

// Warmup
AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
@@ -193,7 +196,8 @@ float CusparseSpmv(
for(int it = 0; it < timing_iterations; ++it)
{
// Reset input/output vector y
CubDebugExit(cudaMemcpy(d_vector_y, vector_y_in, sizeof(double) * num_rows, cudaMemcpyHostToDevice));
if (beta != 0)
CubDebugExit(cudaMemcpy(d_vector_y, vector_y_in, sizeof(double) * num_rows, cudaMemcpyHostToDevice));

gpu_timer.Start();

@@ -248,7 +252,8 @@ float CubSpmv(
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

// Reset input/output vector y
CubDebugExit(cudaMemcpy(d_vector_y, vector_y_in, sizeof(ValueT) * num_rows, cudaMemcpyHostToDevice));
if (beta != 0)
CubDebugExit(cudaMemcpy(d_vector_y, vector_y_in, sizeof(ValueT) * num_rows, cudaMemcpyHostToDevice));

// Warmup
CubDebugExit(DeviceSpmv::CsrMV(
@@ -265,7 +270,8 @@ float CubSpmv(
for(int it = 0; it < timing_iterations; ++it)
{
// Reset input/output vector y
CubDebugExit(cudaMemcpy(d_vector_y, vector_y_in, sizeof(ValueT) * num_rows, cudaMemcpyHostToDevice));
if (beta != 0)
CubDebugExit(cudaMemcpy(d_vector_y, vector_y_in, sizeof(ValueT) * num_rows, cudaMemcpyHostToDevice));

gpu_timer.Start();

@@ -351,7 +357,7 @@ void RunTests(
for (int col = 0; col < csr_matrix.num_cols; ++col)
vector_x[col] = 1.0;

for (int row = 0; row < csr_matrix.num_cols; ++row)
for (int row = 0; row < csr_matrix.num_rows; ++row)
vector_y_in[row] = 1.0;

// Compute reference answer
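The benchmark harness in spmv_compare.cu now restores d_vector_y before each timed iteration only when beta != 0, since the kernels never read y when beta is 0, and the initialization loop for vector_y_in is corrected to run over rows rather than columns. The guard reduces to the following sketch (the helper and its parameters are assumptions, not code from the commit).

#include <cuda_runtime.h>

// Sketch: restore the device copy of y only when the kernel will read it (beta != 0).
void ResetVectorY(
    float*       d_vector_y,    // device output vector
    const float* vector_y_in,   // host copy of the original y values
    int          num_rows,
    float        beta)
{
    // When beta == 0 the SpMV specializations only write y, so re-uploading its
    // previous contents before every timed iteration would be wasted bandwidth.
    if (beta != 0)
        cudaMemcpy(d_vector_y, vector_y_in, sizeof(float) * num_rows,
                   cudaMemcpyHostToDevice);
}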
