Better fix for Issue NVIDIA#71 (KeyValuePair won't work if Key has non…

…-trivial ctor) https://github.com/NVlabs/cub/issues/71 Former-commit-id: 627a5947cf3e34822b9f743c1f751279eed84230
GaryShen2008 · Nov 21, 2016 · 7b34462 · 7b34462
1 parent 06b1f33
commit 7b34462
Show file tree

Hide file tree

Showing 3 changed files with 91 additions and 10 deletions.
diff --git a/cub/agent/single_pass_scan_operators.cuh b/cub/agent/single_pass_scan_operators.cuh
@@ -105,7 +105,7 @@ struct BlockScanRunningPrefixOp
 enum ScanTileStatus
 {
     SCAN_TILE_OOB,          // Out-of-bounds (e.g., padding)
-    SCAN_TILE_INVALID = 99,      // Not yet processed
+    SCAN_TILE_INVALID = 99, // Not yet processed
     SCAN_TILE_PARTIAL,      // Tile aggregate is available
     SCAN_TILE_INCLUSIVE,    // Inclusive tile prefix is available
 };

diff --git a/cub/util_type.cuh b/cub/util_type.cuh
@@ -649,23 +649,104 @@ struct Uninitialized
 /**
  * \brief A key identifier paired with a corresponding value
  */
-template <typename _Key, typename _Value>
+template <
+    typename    _Key,
+    typename    _Value
+#if defined(_WIN32) && !defined(_WIN64)
+    , bool KeyIsLT = sizeof(_Key) < sizeof(_Value),
+      bool ValIsLT = sizeof(_Value) < sizeof(_Key)
+#endif // #if defined(_WIN32) && !defined(_WIN64)
+    >
 struct KeyValuePair
 {
-    typedef _Key    Key;
-    typedef _Value  Value;
+    typedef _Key    Key;                ///< Key data type
+    typedef _Value  Value;              ///< Value data type
 
-    typename AlignBytes<Key>::Type     key;
-    typename AlignBytes<Value>::Type   value;
+    Key     key;                        ///< Item key
+    Value   value;                      ///< Item value
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    KeyValuePair() {}
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
+
+    /// Inequality operator
+    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
+    {
+        return (value != b.value) || (key != b.key);
+    }
+};
+
+#if defined(_WIN32) && !defined(_WIN64)
+
+// Need explicit padding to normalize value alignment and overall structure size
+// because the Win32 host compiler (VC++) may disagree with CUDA device C++ compilers
+// (EDG) on the member alignment and size of types passed as template parameters
+// through kernel functions
+
+/// Smaller key specialization
+template <typename K, typename V>
+struct KeyValuePair<K, V, true, false>
+{
+    typedef K Key;
+    typedef V Value;
+
+    typedef char Pad[sizeof(Value) - sizeof(Key)];
+
+    Value   value;
+    Key     key;
+    Pad     pad;
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    KeyValuePair() {}
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
 
     /// Inequality operator
     __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
     {
         return (value != b.value) || (key != b.key);
     }
+};
+
+
+/// Smaller value specialization
+template <typename K, typename V>
+struct KeyValuePair<K, V, false, true>
+{
+    typedef K Key;
+    typedef V Value;
+
+    typedef char Pad[sizeof(Key) - sizeof(Value)];
+
+    Key     key;
+    Value   value;
+    Pad     pad;
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    KeyValuePair() {}
 
+    /// Constructor
+    __host__ __device__ __forceinline__
+    KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
+
+    /// Inequality operator
+    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
+    {
+        return (value != b.value) || (key != b.key);
+    }
 };
 
+#endif // #if defined(_WIN32) && !defined(_WIN64)
+
+
 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
 
 

diff --git a/test/test_device_reduce.cu b/test/test_device_reduce.cu
@@ -751,10 +751,10 @@ struct Solution<cub::ArgMin, InputValueT, OutputValueT>
     {
         for (int i = 0; i < num_segments; ++i)
         {
-            OutputT aggregate = {1, Traits<InputValueT>::Max()}; // replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent
+            OutputT aggregate(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent
             for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
             {
-                OutputT item = {j - h_segment_offsets[i], OutputValueT(h_in[j])};
+                OutputT item(j - h_segment_offsets[i], OutputValueT(h_in[j]));
                 aggregate = reduction_op(aggregate, item);
             }
             h_reference[i] = aggregate;
@@ -775,10 +775,10 @@ struct Solution<cub::ArgMax, InputValueT, OutputValueT>
     {
         for (int i = 0; i < num_segments; ++i)
         {
-            OutputT aggregate = {1, Traits<InputValueT>::Lowest()}; // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
+            OutputT aggregate(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
             for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
             {
-                OutputT item = {j - h_segment_offsets[i], OutputValueT(h_in[j])};
+                OutputT item(j - h_segment_offsets[i], OutputValueT(h_in[j]));
                 aggregate = reduction_op(aggregate, item);
             }
             h_reference[i] = aggregate;