Spaces:

ma-xu
/

LIVE

Runtime error

LIVE / thrust /cub /util_ptx.cuh

Xu Ma

update

1c3c0d9 over 3 years ago

21.8 kB

	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/**
	* \file
	* PTX intrinsics
	*/


	#pragma once

	#include "util_type.cuh"
	#include "util_arch.cuh"
	#include "util_namespace.cuh"
	#include "util_debug.cuh"


	/// Optional outer namespace(s)
	CUB_NS_PREFIX

	/// CUB namespace
	namespace cub {


	/**
	* \addtogroup UtilPtx
	* @{
	*/


	/******************************************************************************
	* PTX helper macros
	******************************************************************************/

	#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document

	/**
	* Register modifier for pointer-types (for inlining PTX assembly)
	*/
	#if defined(_WIN64) \|\| defined(__LP64__)
	#define __CUB_LP64__ 1
	// 64-bit register modifier for inlined asm
	#define _CUB_ASM_PTR_ "l"
	#define _CUB_ASM_PTR_SIZE_ "u64"
	#else
	#define __CUB_LP64__ 0
	// 32-bit register modifier for inlined asm
	#define _CUB_ASM_PTR_ "r"
	#define _CUB_ASM_PTR_SIZE_ "u32"
	#endif

	#endif // DOXYGEN_SHOULD_SKIP_THIS


	/******************************************************************************
	* Inlined PTX intrinsics
	******************************************************************************/

	/**
	* \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend.
	*/
	__device__ __forceinline__ unsigned int SHR_ADD(
	unsigned int x,
	unsigned int shift,
	unsigned int addend)
	{
	unsigned int ret;
	asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
	"=r"(ret) : "r"(x), "r"(shift), "r"(addend));
	return ret;
	}


	/**
	* \brief Shift-left then add. Returns (\p x << \p shift) + \p addend.
	*/
	__device__ __forceinline__ unsigned int SHL_ADD(
	unsigned int x,
	unsigned int shift,
	unsigned int addend)
	{
	unsigned int ret;
	asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
	"=r"(ret) : "r"(x), "r"(shift), "r"(addend));
	return ret;
	}

	#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document

	/**
	* Bitfield-extract.
	*/
	template <typename UnsignedBits, int BYTE_LEN>
	__device__ __forceinline__ unsigned int BFE(
	UnsignedBits source,
	unsigned int bit_start,
	unsigned int num_bits,
	Int2Type<BYTE_LEN> /byte_len/)
	{
	unsigned int bits;
	asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
	return bits;
	}


	/**
	* Bitfield-extract for 64-bit types.
	*/
	template <typename UnsignedBits>
	__device__ __forceinline__ unsigned int BFE(
	UnsignedBits source,
	unsigned int bit_start,
	unsigned int num_bits,
	Int2Type<8> /byte_len/)
	{
	const unsigned long long MASK = (1ull << num_bits) - 1;
	return (source >> bit_start) & MASK;
	}

	#endif // DOXYGEN_SHOULD_SKIP_THIS

	/**
	* \brief Bitfield-extract. Extracts \p num_bits from \p source starting at bit-offset \p bit_start. The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type.
	*/
	template <typename UnsignedBits>
	__device__ __forceinline__ unsigned int BFE(
	UnsignedBits source,
	unsigned int bit_start,
	unsigned int num_bits)
	{
	return BFE(source, bit_start, num_bits, Int2Type<sizeof(UnsignedBits)>());
	}


	/**
	* \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start.
	*/
	__device__ __forceinline__ void BFI(
	unsigned int &ret,
	unsigned int x,
	unsigned int y,
	unsigned int bit_start,
	unsigned int num_bits)
	{
	asm ("bfi.b32 %0, %1, %2, %3, %4;" :
	"=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
	}


	/**
	* \brief Three-operand add. Returns \p x + \p y + \p z.
	*/
	__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
	{
	asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
	return x;
	}


	/**
	* \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later.
	*
	* \par
	* The bytes in the two source registers \p a and \p b are numbered from 0 to 7:
	* {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes
	* {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within
	* the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0}
	*
	* \par Snippet
	* The code snippet below illustrates byte-permute.
	* \par
	* \code
	* #include <cub/cub.cuh>
	*
	* __global__ void ExampleKernel(...)
	* {
	* int a = 0x03020100;
	* int b = 0x07060504;
	* int index = 0x00007531;
	*
	* int selected = PRMT(a, b, index); // 0x07050301
	*
	* \endcode
	*
	*/
	__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
	{
	int ret;
	asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
	return ret;
	}

	#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document

	/**
	* Sync-threads barrier.
	*/
	__device__ __forceinline__ void BAR(int count)
	{
	asm volatile("bar.sync 1, %0;" : : "r"(count));
	}

	/**
	* CTA barrier
	*/
	__device__ __forceinline__ void CTA_SYNC()
	{
	__syncthreads();
	}


	/**
	* CTA barrier with predicate
	*/
	__device__ __forceinline__ int CTA_SYNC_AND(int p)
	{
	return __syncthreads_and(p);
	}


	/**
	* Warp barrier
	*/
	__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
	{
	#ifdef CUB_USE_COOPERATIVE_GROUPS
	__syncwarp(member_mask);
	#endif
	}


	/**
	* Warp any
	*/
	__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
	{
	#ifdef CUB_USE_COOPERATIVE_GROUPS
	return __any_sync(member_mask, predicate);
	#else
	return ::__any(predicate);
	#endif
	}


	/**
	* Warp any
	*/
	__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
	{
	#ifdef CUB_USE_COOPERATIVE_GROUPS
	return __all_sync(member_mask, predicate);
	#else
	return ::__all(predicate);
	#endif
	}


	/**
	* Warp ballot
	*/
	__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
	{
	#ifdef CUB_USE_COOPERATIVE_GROUPS
	return __ballot_sync(member_mask, predicate);
	#else
	return __ballot(predicate);
	#endif
	}

	/**
	* Warp synchronous shfl_up
	*/
	__device__ __forceinline__
	unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
	{
	#ifdef CUB_USE_COOPERATIVE_GROUPS
	asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
	: "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask));
	#else
	asm volatile("shfl.up.b32 %0, %1, %2, %3;"
	: "=r"(word) : "r"(word), "r"(src_offset), "r"(flags));
	#endif
	return word;
	}

	/**
	* Warp synchronous shfl_down
	*/
	__device__ __forceinline__
	unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
	{
	#ifdef CUB_USE_COOPERATIVE_GROUPS
	asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
	: "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask));
	#else
	asm volatile("shfl.down.b32 %0, %1, %2, %3;"
	: "=r"(word) : "r"(word), "r"(src_offset), "r"(flags));
	#endif
	return word;
	}

	/**
	* Warp synchronous shfl_idx
	*/
	__device__ __forceinline__
	unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask)
	{
	#ifdef CUB_USE_COOPERATIVE_GROUPS
	asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
	: "=r"(word) : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask));
	#else
	asm volatile("shfl.idx.b32 %0, %1, %2, %3;"
	: "=r"(word) : "r"(word), "r"(src_lane), "r"(flags));
	#endif
	return word;
	}

	/**
	* Floating point multiply. (Mantissa LSB rounds towards zero.)
	*/
	__device__ __forceinline__ float FMUL_RZ(float a, float b)
	{
	float d;
	asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
	return d;
	}


	/**
	* Floating point multiply-add. (Mantissa LSB rounds towards zero.)
	*/
	__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
	{
	float d;
	asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c));
	return d;
	}

	#endif // DOXYGEN_SHOULD_SKIP_THIS

	/**
	* \brief Terminates the calling thread
	*/
	__device__ __forceinline__ void ThreadExit() {
	asm volatile("exit;");
	}


	/**
	* \brief Abort execution and generate an interrupt to the host CPU
	*/
	__device__ __forceinline__ void ThreadTrap() {
	asm volatile("trap;");
	}


	/**
	* \brief Returns the row-major linear thread identifier for a multidimensional thread block
	*/
	__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
	{
	return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) +
	((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) +
	threadIdx.x;
	}


	/**
	* \brief Returns the warp lane ID of the calling thread
	*/
	__device__ __forceinline__ unsigned int LaneId()
	{
	unsigned int ret;
	asm ("mov.u32 %0, %%laneid;" : "=r"(ret) );
	return ret;
	}


	/**
	* \brief Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block.
	*/
	__device__ __forceinline__ unsigned int WarpId()
	{
	unsigned int ret;
	asm ("mov.u32 %0, %%warpid;" : "=r"(ret) );
	return ret;
	}

	/**
	* \brief Returns the warp lane mask of all lanes less than the calling thread
	*/
	__device__ __forceinline__ unsigned int LaneMaskLt()
	{
	unsigned int ret;
	asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) );
	return ret;
	}

	/**
	* \brief Returns the warp lane mask of all lanes less than or equal to the calling thread
	*/
	__device__ __forceinline__ unsigned int LaneMaskLe()
	{
	unsigned int ret;
	asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) );
	return ret;
	}

	/**
	* \brief Returns the warp lane mask of all lanes greater than the calling thread
	*/
	__device__ __forceinline__ unsigned int LaneMaskGt()
	{
	unsigned int ret;
	asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) );
	return ret;
	}

	/**
	* \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread
	*/
	__device__ __forceinline__ unsigned int LaneMaskGe()
	{
	unsigned int ret;
	asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) );
	return ret;
	}

	/** @} */ // end group UtilPtx




	/**
	* \brief Shuffle-up for any data type. Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>-<tt>src_offset</tt></sub>. For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png)
	* \ingroup WarpModule
	*
	* \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32.
	* \tparam T <b>[inferred]</b> The input/output element type
	*
	* \par
	* - Available only for SM3.0 or newer
	*
	* \par Snippet
	* The code snippet below illustrates each thread obtaining a \p double value from the
	* predecessor of its predecessor.
	* \par
	* \code
	* #include <cub/cub.cuh> // or equivalently <cub/util_ptx.cuh>
	*
	* __global__ void ExampleKernel(...)
	* {
	* // Obtain one input item per thread
	* double thread_data = ...
	*
	* // Obtain item from two ranks below
	* double peer_data = ShuffleUp<32>(thread_data, 2, 0, 0xffffffff);
	*
	* \endcode
	* \par
	* Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
	* The corresponding output \p peer_data will be <tt>{1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}</tt>.
	*
	*/
	template <
	int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp
	typename T>
	__device__ __forceinline__ T ShuffleUp(
	T input, ///< [in] The value to broadcast
	int src_offset, ///< [in] The relative down-offset of the peer to read from
	int first_thread, ///< [in] Index of first lane in logical warp (typically 0)
	unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes
	{
	/// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
	enum {
	SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8
	};

	typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

	const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

	T output;
	ShuffleWord output_alias = reinterpret_cast<ShuffleWord >(&output);
	ShuffleWord input_alias = reinterpret_cast<ShuffleWord >(&input);

	unsigned int shuffle_word;
	shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_thread \| SHFL_C, member_mask);
	output_alias[0] = shuffle_word;

	#pragma unroll
	for (int WORD = 1; WORD < WORDS; ++WORD)
	{
	shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_thread \| SHFL_C, member_mask);
	output_alias[WORD] = shuffle_word;
	}

	return output;
	}


	/**
	* \brief Shuffle-down for any data type. Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>+<tt>src_offset</tt></sub>. For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread. ![](shfl_down_logo.png)
	* \ingroup WarpModule
	*
	* \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32.
	* \tparam T <b>[inferred]</b> The input/output element type
	*
	* \par
	* - Available only for SM3.0 or newer
	*
	* \par Snippet
	* The code snippet below illustrates each thread obtaining a \p double value from the
	* successor of its successor.
	* \par
	* \code
	* #include <cub/cub.cuh> // or equivalently <cub/util_ptx.cuh>
	*
	* __global__ void ExampleKernel(...)
	* {
	* // Obtain one input item per thread
	* double thread_data = ...
	*
	* // Obtain item from two ranks below
	* double peer_data = ShuffleDown<32>(thread_data, 2, 31, 0xffffffff);
	*
	* \endcode
	* \par
	* Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
	* The corresponding output \p peer_data will be <tt>{3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}</tt>.
	*
	*/
	template <
	int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp
	typename T>
	__device__ __forceinline__ T ShuffleDown(
	T input, ///< [in] The value to broadcast
	int src_offset, ///< [in] The relative up-offset of the peer to read from
	int last_thread, ///< [in] Index of last thread in logical warp (typically 31 for a 32-thread warp)
	unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes
	{
	/// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
	enum {
	SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8
	};

	typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

	const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

	T output;
	ShuffleWord output_alias = reinterpret_cast<ShuffleWord >(&output);
	ShuffleWord input_alias = reinterpret_cast<ShuffleWord >(&input);

	unsigned int shuffle_word;
	shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_thread \| SHFL_C, member_mask);
	output_alias[0] = shuffle_word;

	#pragma unroll
	for (int WORD = 1; WORD < WORDS; ++WORD)
	{
	shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_thread \| SHFL_C, member_mask);
	output_alias[WORD] = shuffle_word;
	}

	return output;
	}


	/**
	* \brief Shuffle-broadcast for any data type. Each <em>warp-lane<sub>i</sub></em> obtains the value \p input
	* contributed by <em>warp-lane</em><sub><tt>src_lane</tt></sub>. For \p src_lane < 0 or \p src_lane >= WARP_THREADS,
	* then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png)
	*
	* \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32.
	* \tparam T <b>[inferred]</b> The input/output element type
	*
	* \ingroup WarpModule
	*
	* \par
	* - Available only for SM3.0 or newer
	*
	* \par Snippet
	* The code snippet below illustrates each thread obtaining a \p double value from <em>warp-lane</em><sub>0</sub>.
	*
	* \par
	* \code
	* #include <cub/cub.cuh> // or equivalently <cub/util_ptx.cuh>
	*
	* __global__ void ExampleKernel(...)
	* {
	* // Obtain one input item per thread
	* double thread_data = ...
	*
	* // Obtain item from thread 0
	* double peer_data = ShuffleIndex<32>(thread_data, 0, 0xffffffff);
	*
	* \endcode
	* \par
	* Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
	* The corresponding output \p peer_data will be <tt>{1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}</tt>.
	*
	*/
	template <
	int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp
	typename T>
	__device__ __forceinline__ T ShuffleIndex(
	T input, ///< [in] The value to broadcast
	int src_lane, ///< [in] Which warp lane is to do the broadcasting
	unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes
	{
	/// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
	enum {
	SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) \| (LOGICAL_WARP_THREADS - 1)
	};

	typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

	const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

	T output;
	ShuffleWord output_alias = reinterpret_cast<ShuffleWord >(&output);
	ShuffleWord input_alias = reinterpret_cast<ShuffleWord >(&input);

	unsigned int shuffle_word;
	shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0],
	src_lane,
	SHFL_C,
	member_mask);

	output_alias[0] = shuffle_word;

	#pragma unroll
	for (int WORD = 1; WORD < WORDS; ++WORD)
	{
	shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD],
	src_lane,
	SHFL_C,
	member_mask);

	output_alias[WORD] = shuffle_word;
	}

	return output;
	}



	/**
	* Compute a 32b mask of threads having the same least-significant
	* LABEL_BITS of \p label as the calling thread.
	*/
	template <int LABEL_BITS>
	inline __device__ unsigned int MatchAny(unsigned int label)
	{
	unsigned int retval;

	// Extract masks of common threads for each bit
	#pragma unroll
	for (int BIT = 0; BIT < LABEL_BITS; ++BIT)
	{
	unsigned int mask;
	unsigned int current_bit = 1 << BIT;
	asm ("{\n"
	" .reg .pred p;\n"
	" and.b32 %0, %1, %2;"
	" setp.eq.u32 p, %0, %2;\n"
	#ifdef CUB_USE_COOPERATIVE_GROUPS
	" vote.ballot.sync.b32 %0, p, 0xffffffff;\n"
	#else
	" vote.ballot.b32 %0, p;\n"
	#endif
	" @!p not.b32 %0, %0;\n"
	"}\n" : "=r"(mask) : "r"(label), "r"(current_bit));

	// Remove peers who differ
	retval = (BIT == 0) ? mask : retval & mask;
	}

	return retval;

	// // VOLTA match
	// unsigned int retval;
	// asm ("{\n"
	// " match.any.sync.b32 %0, %1, 0xffffffff;\n"
	// "}\n" : "=r"(retval) : "r"(label));
	// return retval;

	}


















	} // CUB namespace
	CUB_NS_POSTFIX // Optional outer namespace(s)