Skip to content

Commit

Permalink
Better fix for Issue NVIDIA#71 (KeyValuePair won't work if Key has non…
Browse files Browse the repository at this point in the history
…-trivial

ctor)

https://github.com/NVlabs/cub/issues/71
Former-commit-id: 627a5947cf3e34822b9f743c1f751279eed84230
  • Loading branch information
dumerrill committed Nov 21, 2016
1 parent 06b1f33 commit 7b34462
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 10 deletions.
2 changes: 1 addition & 1 deletion cub/agent/single_pass_scan_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ struct BlockScanRunningPrefixOp
/// Status codes for a tile, as declared in single_pass_scan_operators.cuh.
/// Enumerators without an explicit value take the previous value plus one.
// NOTE(review): this enum is shown as a rendered diff ("1 addition & 1 deletion");
// the two identical SCAN_TILE_INVALID lines below are presumably the removed and
// added versions of one line (whitespace-only change) - only one exists in the header.
enum ScanTileStatus
{
SCAN_TILE_OOB, // Out-of-bounds (e.g., padding); first enumerator, so its value is 0
SCAN_TILE_INVALID = 99, // Not yet processed
SCAN_TILE_INVALID = 99, // Not yet processed
SCAN_TILE_PARTIAL, // Tile aggregate is available (implicitly 100)
SCAN_TILE_INCLUSIVE, // Inclusive tile prefix is available (implicitly 101)
};
Expand Down
91 changes: 86 additions & 5 deletions cub/util_type.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -649,23 +649,104 @@ struct Uninitialized
/**
* \brief A key identifier paired with a corresponding value
*
* \tparam _Key    Key data type (re-exported as the member typedef \p Key)
* \tparam _Value  Value data type (re-exported as the member typedef \p Value)
*
* When \p _WIN32 is defined and \p _WIN64 is not, the template takes two
* additional defaulted boolean parameters: \p KeyIsLT is true when
* sizeof(_Key) < sizeof(_Value), and \p ValIsLT is true when
* sizeof(_Value) < sizeof(_Key).
*
* NOTE(review): this block is a rendered diff (the hunk reports 5 deletions), so
* removed and added lines appear back to back. Presumably the single-line
* template header, the two uncommented typedefs and the two AlignBytes members
* are the removed lines - confirm against the repository before compiling.
*
* NOTE(review): operator != is not const-qualified, so it cannot be invoked on a
* const KeyValuePair.
*/
// (presumably the removed, pre-commit template header)
template <typename _Key, typename _Value>
// (presumably the added template header)
template <
typename _Key,
typename _Value
#if defined(_WIN32) && !defined(_WIN64)
// Extra parameters exist only in this configuration; both default from the sizeof comparison
, bool KeyIsLT = sizeof(_Key) < sizeof(_Value),
bool ValIsLT = sizeof(_Value) < sizeof(_Key)
#endif // #if defined(_WIN32) && !defined(_WIN64)
>
struct KeyValuePair
{
// (presumably the removed typedefs)
typedef _Key Key;
typedef _Value Value;
typedef _Key Key; ///< Key data type
typedef _Value Value; ///< Value data type

// (presumably the removed members, stored through AlignBytes<T>::Type rather than as Key/Value)
typename AlignBytes<Key>::Type key;
typename AlignBytes<Value>::Type value;
Key key; ///< Item key
Value value; ///< Item value

/// Default constructor: empty body, no member initializers
__host__ __device__ __forceinline__
KeyValuePair() {}

/// Constructor: copies the given key and value into the corresponding members
__host__ __device__ __forceinline__
KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}

/// Inequality operator: true when either the values or the keys differ (value is compared first)
__host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
{
return (value != b.value) || (key != b.key);
}
};

#if defined(_WIN32) && !defined(_WIN64)

// Need explicit padding to normalize value alignment and overall structure size
// because the Win32 host compiler (VC++) may disagree with CUDA device C++ compilers
// (EDG) on the member alignment and size of types passed as template parameters
// through kernel functions

/**
 * \brief Smaller key specialization of KeyValuePair
 *
 * Partial specialization for the <true, false> setting of the two trailing
 * boolean template parameters. The value member is declared before the key,
 * followed by sizeof(Value) - sizeof(Key) bytes of explicit padding.
 *
 * \tparam K  Key data type
 * \tparam V  Value data type
 */
template <typename K, typename V>
struct KeyValuePair<K, V, true, false>
{
    typedef K Key;      ///< Key data type
    typedef V Value;    ///< Value data type

    /// Explicit padding: one char per byte by which Value is larger than Key
    typedef char Pad[sizeof(Value) - sizeof(Key)];

    Value   value;      ///< Item value (declared first in this specialization)
    Key     key;        ///< Item key
    Pad     pad;        ///< Padding bytes; never initialized or read by this struct

    /// Default constructor: empty body, no member initializers
    __host__ __device__ __forceinline__
    KeyValuePair() {}

    /// Constructor: copies the given key and value into the corresponding members.
    /// Initializers are listed in declaration order (value, then key), which is the
    /// order in which they actually run.
    __host__ __device__ __forceinline__
    KeyValuePair(Key const& key, Value const& value) : value(value), key(key) {}

    /// Inequality operator: true when either the values or the keys differ.
    /// const-qualified so that const pairs can be compared as well.
    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) const
    {
        return (value != b.value) || (key != b.key);
    }
};


/**
 * \brief Smaller value specialization of KeyValuePair
 *
 * Partial specialization for the <false, true> setting of the two trailing
 * boolean template parameters. The key member is declared before the value,
 * followed by sizeof(Key) - sizeof(Value) bytes of explicit padding.
 *
 * \tparam K  Key data type
 * \tparam V  Value data type
 */
template <typename K, typename V>
struct KeyValuePair<K, V, false, true>
{
    typedef K Key;      ///< Key data type
    typedef V Value;    ///< Value data type

    /// Explicit padding: one char per byte by which Key is larger than Value
    typedef char Pad[sizeof(Key) - sizeof(Value)];

    Key     key;        ///< Item key
    Value   value;      ///< Item value
    Pad     pad;        ///< Padding bytes; never initialized or read by this struct

    /// Default constructor: empty body, no member initializers
    __host__ __device__ __forceinline__
    KeyValuePair() {}

    /// Constructor: copies the given key and value into the corresponding members
    __host__ __device__ __forceinline__
    KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}

    /// Inequality operator: true when either the values or the keys differ.
    /// const-qualified so that const pairs can be compared as well.
    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) const
    {
        return (value != b.value) || (key != b.key);
    }
};

#endif // #if defined(_WIN32) && !defined(_WIN64)


#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document


Expand Down
8 changes: 4 additions & 4 deletions test/test_device_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -751,10 +751,10 @@ struct Solution<cub::ArgMin, InputValueT, OutputValueT>
{
for (int i = 0; i < num_segments; ++i)
{
OutputT aggregate = {1, Traits<InputValueT>::Max()}; // replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent
OutputT aggregate(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent
for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
{
OutputT item = {j - h_segment_offsets[i], OutputValueT(h_in[j])};
OutputT item(j - h_segment_offsets[i], OutputValueT(h_in[j]));
aggregate = reduction_op(aggregate, item);
}
h_reference[i] = aggregate;
Expand All @@ -775,10 +775,10 @@ struct Solution<cub::ArgMax, InputValueT, OutputValueT>
{
for (int i = 0; i < num_segments; ++i)
{
OutputT aggregate = {1, Traits<InputValueT>::Lowest()}; // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
OutputT aggregate(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
{
OutputT item = {j - h_segment_offsets[i], OutputValueT(h_in[j])};
OutputT item(j - h_segment_offsets[i], OutputValueT(h_in[j]));
aggregate = reduction_op(aggregate, item);
}
h_reference[i] = aggregate;
Expand Down

0 comments on commit 7b34462

Please sign in to comment.