Spaces:

ma-xu
/

LIVE

Runtime error

LIVE / thrust /cub /warp /warp_reduce.cuh

Xu Ma

update

1c3c0d9 over 3 years ago

25 kB

	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/**
	* \file
	* The cub::WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp.
	*/

	#pragma once

	#include "../config.cuh"
	#include "specializations/warp_reduce_shfl.cuh"
	#include "specializations/warp_reduce_smem.cuh"
	#include "../thread/thread_operators.cuh"
	#include "../util_type.cuh"

	/// Optional outer namespace(s)
	CUB_NS_PREFIX

	/// CUB namespace
	namespace cub {


	/**
	* \addtogroup WarpModule
	* @{
	*/

	/**
	* \brief The WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. ![](warp_reduce_logo.png)
	*
	* \tparam T The reduction input/output element type
	* \tparam LOGICAL_WARP_THREADS <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20).
	* \tparam PTX_ARCH <b>[optional]</b> \ptxversion
	*
	* \par Overview
	* - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
	* uses a binary combining operator to compute a single aggregate from a list of input elements.
	* - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads)
	* - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS
	*
	* \par Performance Considerations
	* - Uses special instructions when applicable (e.g., warp \p SHFL instructions)
	* - Uses synchronization-free communication between warp lanes when applicable
	* - Incurs zero bank conflicts for most types
	* - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
	* - Summation (<b><em>vs.</em></b> generic reduction)
	* - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
	*
	* \par Simple Examples
	* \warpcollective{WarpReduce}
	* \par
	* The code snippet below illustrates four concurrent warp sum reductions within a block of
	* 128 threads (one per each of the 32-thread warps).
	* \par
	* \code
	* #include <cub/cub.cuh>
	*
	* __global__ void ExampleKernel(...)
	* {
	* // Specialize WarpReduce for type int
	* typedef cub::WarpReduce<int> WarpReduce;
	*
	* // Allocate WarpReduce shared memory for 4 warps
	* __shared__ typename WarpReduce::TempStorage temp_storage[4];
	*
	* // Obtain one input item per thread
	* int thread_data = ...
	*
	* // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
	* int warp_id = threadIdx.x / 32;
	* int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
	*
	* \endcode
	* \par
	* Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
	* The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520,
	* \p 2544, and \p 3568, respectively (and is undefined in other threads).
	*
	* \par
	* The code snippet below illustrates a single warp sum reduction within a block of
	* 128 threads.
	* \par
	* \code
	* #include <cub/cub.cuh>
	*
	* __global__ void ExampleKernel(...)
	* {
	* // Specialize WarpReduce for type int
	* typedef cub::WarpReduce<int> WarpReduce;
	*
	* // Allocate WarpReduce shared memory for one warp
	* __shared__ typename WarpReduce::TempStorage temp_storage;
	* ...
	*
	* // Only the first warp performs a reduction
	* if (threadIdx.x < 32)
	* {
	* // Obtain one input item per thread
	* int thread_data = ...
	*
	* // Return the warp-wide sum to lane0
	* int aggregate = WarpReduce(temp_storage).Sum(thread_data);
	*
	* \endcode
	* \par
	* Suppose the set of input \p thread_data across the warp of threads is <tt>{0, 1, 2, 3, ..., 31}</tt>.
	* The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads).
	*
	*/
	template <
	typename T,
	int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS,
	int PTX_ARCH = CUB_PTX_ARCH>
	class WarpReduce
	{
	private:

	/******************************************************************************
	* Constants and type definitions
	******************************************************************************/

	enum
	{
	/// Whether the logical warp size and the PTX warp size coincide
	IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),

	/// Whether the logical warp size is a power-of-two
	IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
	};

	public:

	#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document

	/// Internal specialization. Use SHFL-based reduction if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)
	typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
	WarpReduceShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
	WarpReduceSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpReduce;

	#endif // DOXYGEN_SHOULD_SKIP_THIS


	private:

	/// Shared memory storage layout type for WarpReduce
	typedef typename InternalWarpReduce::TempStorage _TempStorage;


	/******************************************************************************
	* Thread fields
	******************************************************************************/

	/// Shared storage reference
	_TempStorage &temp_storage;


	/******************************************************************************
	* Utility methods
	******************************************************************************/

	public:

	/// \smemstorage{WarpReduce}
	struct TempStorage : Uninitialized<_TempStorage> {};


	/****************************************************************//
	* \name Collective constructors
	*********************************************************************/
	//@{


	/**
	* \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
	*/
	__device__ __forceinline__ WarpReduce(
	TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
	:
	temp_storage(temp_storage.Alias())
	{}


	//@} end member group
	/****************************************************************//
	* \name Summation reductions
	*********************************************************************/
	//@{


	/**
	* \brief Computes a warp-wide sum in the calling warp. The output is valid in warp <em>lane</em><sub>0</sub>.
	*
	* \smemreuse
	*
	* \par Snippet
	* The code snippet below illustrates four concurrent warp sum reductions within a block of
	* 128 threads (one per each of the 32-thread warps).
	* \par
	* \code
	* #include <cub/cub.cuh>
	*
	* __global__ void ExampleKernel(...)
	* {
	* // Specialize WarpReduce for type int
	* typedef cub::WarpReduce<int> WarpReduce;
	*
	* // Allocate WarpReduce shared memory for 4 warps
	* __shared__ typename WarpReduce::TempStorage temp_storage[4];
	*
	* // Obtain one input item per thread
	* int thread_data = ...
	*
	* // Return the warp-wide sums to each lane0
	* int warp_id = threadIdx.x / 32;
	* int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
	*
	* \endcode
	* \par
	* Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
	* The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520,
	* \p 2544, and \p 3568, respectively (and is undefined in other threads).
	*
	*/
	__device__ __forceinline__ T Sum(
	T input) ///< [in] Calling thread's input
	{
	return InternalWarpReduce(temp_storage).template Reduce<true>(input, LOGICAL_WARP_THREADS, cub::Sum());
	}

	/**
	* \brief Computes a partially-full warp-wide sum in the calling warp. The output is valid in warp <em>lane</em><sub>0</sub>.
	*
	* All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined.
	*
	* \smemreuse
	*
	* \par Snippet
	* The code snippet below illustrates a sum reduction within a single, partially-full
	* block of 32 threads (one warp).
	* \par
	* \code
	* #include <cub/cub.cuh>
	*
	* __global__ void ExampleKernel(int *d_data, int valid_items)
	* {
	* // Specialize WarpReduce for type int
	* typedef cub::WarpReduce<int> WarpReduce;
	*
	* // Allocate WarpReduce shared memory for one warp
	* __shared__ typename WarpReduce::TempStorage temp_storage;
	*
	* // Obtain one input item per thread if in range
	* int thread_data;
	* if (threadIdx.x < valid_items)
	* thread_data = d_data[threadIdx.x];
	*
	* // Return the warp-wide sums to each lane0
	* int aggregate = WarpReduce(temp_storage).Sum(
	* thread_data, valid_items);
	*
	* \endcode
	* \par
	* Suppose the input \p d_data is <tt>{0, 1, 2, 3, 4, ...</tt> and \p valid_items
	* is \p 4. The corresponding output \p aggregate in thread0 is \p 6 (and is
	* undefined in other threads).
	*
	*/
	__device__ __forceinline__ T Sum(
	T input, ///< [in] Calling thread's input
	int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
	{
	// Determine if we don't need bounds checking
	return InternalWarpReduce(temp_storage).template Reduce<false>(input, valid_items, cub::Sum());
	}


	/**
	* \brief Computes a segmented sum in the calling warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
	*
	* \smemreuse
	*
	* \par Snippet
	* The code snippet below illustrates a head-segmented warp sum
	* reduction within a block of 32 threads (one warp).
	* \par
	* \code
	* #include <cub/cub.cuh>
	*
	* __global__ void ExampleKernel(...)
	* {
	* // Specialize WarpReduce for type int
	* typedef cub::WarpReduce<int> WarpReduce;
	*
	* // Allocate WarpReduce shared memory for one warp
	* __shared__ typename WarpReduce::TempStorage temp_storage;
	*
	* // Obtain one input item and flag per thread
	* int thread_data = ...
	* int head_flag = ...
	*
	* // Return the warp-wide sums to each lane0
	* int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(
	* thread_data, head_flag);
	*
	* \endcode
	* \par
	* Suppose the set of input \p thread_data and \p head_flag across the block of threads
	* is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0</tt>,
	* respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
	* \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
	*
	* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
	*
	*/
	template <
	typename FlagT>
	__device__ __forceinline__ T HeadSegmentedSum(
	T input, ///< [in] Calling thread's input
	FlagT head_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment
	{
	return HeadSegmentedReduce(input, head_flag, cub::Sum());
	}


	/**
	* \brief Computes a segmented sum in the calling warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
	*
	* \smemreuse
	*
	* \par Snippet
	* The code snippet below illustrates a tail-segmented warp sum
	* reduction within a block of 32 threads (one warp).
	* \par
	* \code
	* #include <cub/cub.cuh>
	*
	* __global__ void ExampleKernel(...)
	* {
	* // Specialize WarpReduce for type int
	* typedef cub::WarpReduce<int> WarpReduce;
	*
	* // Allocate WarpReduce shared memory for one warp
	* __shared__ typename WarpReduce::TempStorage temp_storage;
	*
	* // Obtain one input item and flag per thread
	* int thread_data = ...
	* int tail_flag = ...
	*
	* // Return the warp-wide sums to each lane0
	* int aggregate = WarpReduce(temp_storage).TailSegmentedSum(
	* thread_data, tail_flag);
	*
	* \endcode
	* \par
	* Suppose the set of input \p thread_data and \p tail_flag across the block of threads
	* is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1</tt>,
	* respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
	* \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
	*
	* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
	*/
	template <
	typename FlagT>
	__device__ __forceinline__ T TailSegmentedSum(
	T input, ///< [in] Calling thread's input
	FlagT tail_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment
	{
	return TailSegmentedReduce(input, tail_flag, cub::Sum());
	}



	//@} end member group
	/****************************************************************//
	* \name Generic reductions
	*********************************************************************/
	//@{

	/**
	* \brief Computes a warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp <em>lane</em><sub>0</sub>.
	*
	* Supports non-commutative reduction operators
	*
	* \smemreuse
	*
	* \par Snippet
	* The code snippet below illustrates four concurrent warp max reductions within a block of
	* 128 threads (one per each of the 32-thread warps).
	* \par
	* \code
	* #include <cub/cub.cuh>
	*
	* __global__ void ExampleKernel(...)
	* {
	* // Specialize WarpReduce for type int
	* typedef cub::WarpReduce<int> WarpReduce;
	*
	* // Allocate WarpReduce shared memory for 4 warps
	* __shared__ typename WarpReduce::TempStorage temp_storage[4];
	*
	* // Obtain one input item per thread
	* int thread_data = ...
	*
	* // Return the warp-wide reductions to each lane0
	* int warp_id = threadIdx.x / 32;
	* int aggregate = WarpReduce(temp_storage[warp_id]).Reduce(
	* thread_data, cub::Max());
	*
	* \endcode
	* \par
	* Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
	* The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 31, \p 63,
	* \p 95, and \p 127, respectively (and is undefined in other threads).
	*
	* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
	*/
	template <typename ReductionOp>
	__device__ __forceinline__ T Reduce(
	T input, ///< [in] Calling thread's input
	ReductionOp reduction_op) ///< [in] Binary reduction operator
	{
	return InternalWarpReduce(temp_storage).template Reduce<true>(input, LOGICAL_WARP_THREADS, reduction_op);
	}

	/**
	* \brief Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp <em>lane</em><sub>0</sub>.
	*
	* All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined.
	*
	* Supports non-commutative reduction operators
	*
	* \smemreuse
	*
	* \par Snippet
	* The code snippet below illustrates a max reduction within a single, partially-full
	* block of 32 threads (one warp).
	* \par
	* \code
	* #include <cub/cub.cuh>
	*
	* __global__ void ExampleKernel(int *d_data, int valid_items)
	* {
	* // Specialize WarpReduce for type int
	* typedef cub::WarpReduce<int> WarpReduce;
	*
	* // Allocate WarpReduce shared memory for one warp
	* __shared__ typename WarpReduce::TempStorage temp_storage;
	*
	* // Obtain one input item per thread if in range
	* int thread_data;
	* if (threadIdx.x < valid_items)
	* thread_data = d_data[threadIdx.x];
	*
	* // Return the warp-wide reductions to each lane0
	* int aggregate = WarpReduce(temp_storage).Reduce(
	* thread_data, cub::Max(), valid_items);
	*
	* \endcode
	* \par
	* Suppose the input \p d_data is <tt>{0, 1, 2, 3, 4, ...</tt> and \p valid_items
	* is \p 4. The corresponding output \p aggregate in thread0 is \p 3 (and is
	* undefined in other threads).
	*
	* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
	*/
	template <typename ReductionOp>
	__device__ __forceinline__ T Reduce(
	T input, ///< [in] Calling thread's input
	ReductionOp reduction_op, ///< [in] Binary reduction operator
	int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
	{
	return InternalWarpReduce(temp_storage).template Reduce<false>(input, valid_items, reduction_op);
	}


	/**
	* \brief Computes a segmented reduction in the calling warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
	*
	* Supports non-commutative reduction operators
	*
	* \smemreuse
	*
	* \par Snippet
	* The code snippet below illustrates a head-segmented warp max
	* reduction within a block of 32 threads (one warp).
	* \par
	* \code
	* #include <cub/cub.cuh>
	*
	* __global__ void ExampleKernel(...)
	* {
	* // Specialize WarpReduce for type int
	* typedef cub::WarpReduce<int> WarpReduce;
	*
	* // Allocate WarpReduce shared memory for one warp
	* __shared__ typename WarpReduce::TempStorage temp_storage;
	*
	* // Obtain one input item and flag per thread
	* int thread_data = ...
	* int head_flag = ...
	*
	* // Return the warp-wide reductions to each lane0
	* int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce(
	* thread_data, head_flag, cub::Max());
	*
	* \endcode
	* \par
	* Suppose the set of input \p thread_data and \p head_flag across the block of threads
	* is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0</tt>,
	* respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
	* \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
	*
	* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
	*/
	template <
	typename ReductionOp,
	typename FlagT>
	__device__ __forceinline__ T HeadSegmentedReduce(
	T input, ///< [in] Calling thread's input
	FlagT head_flag, ///< [in] Head flag denoting whether or not \p input is the start of a new segment
	ReductionOp reduction_op) ///< [in] Reduction operator
	{
	return InternalWarpReduce(temp_storage).template SegmentedReduce<true>(input, head_flag, reduction_op);
	}


	/**
	* \brief Computes a segmented reduction in the calling warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
	*
	* Supports non-commutative reduction operators
	*
	* \smemreuse
	*
	* \par Snippet
	* The code snippet below illustrates a tail-segmented warp max
	* reduction within a block of 32 threads (one warp).
	* \par
	* \code
	* #include <cub/cub.cuh>
	*
	* __global__ void ExampleKernel(...)
	* {
	* // Specialize WarpReduce for type int
	* typedef cub::WarpReduce<int> WarpReduce;
	*
	* // Allocate WarpReduce shared memory for one warp
	* __shared__ typename WarpReduce::TempStorage temp_storage;
	*
	* // Obtain one input item and flag per thread
	* int thread_data = ...
	* int tail_flag = ...
	*
	* // Return the warp-wide reductions to each lane0
	* int aggregate = WarpReduce(temp_storage).TailSegmentedReduce(
	* thread_data, tail_flag, cub::Max());
	*
	* \endcode
	* \par
	* Suppose the set of input \p thread_data and \p tail_flag across the block of threads
	* is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1</tt>,
	* respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
	* \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
	*
	* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
	*/
	template <
	typename ReductionOp,
	typename FlagT>
	__device__ __forceinline__ T TailSegmentedReduce(
	T input, ///< [in] Calling thread's input
	FlagT tail_flag, ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
	ReductionOp reduction_op) ///< [in] Reduction operator
	{
	return InternalWarpReduce(temp_storage).template SegmentedReduce<false>(input, tail_flag, reduction_op);
	}



	//@} end member group
	};

	/** @} */ // end group WarpModule

	} // CUB namespace
	CUB_NS_POSTFIX // Optional outer namespace(s)