#include <stdexcept>

#include <cuda_runtime.h>

#include "cutlass/relatively_equal.h"

namespace kernel {

/// Device-side kernel: scans two blocks of memory and clears *equal on the
/// first mismatching element.
template <typename Element>
__global__ void BlockCompareEqual(
  int *equal,
  Element const *ptr_A,
  Element const *ptr_B,
  size_t capacity) {

  size_t idx = threadIdx.x + blockDim.x * blockIdx.x;

  // Grid-stride loop: each thread advances by the total number of threads in
  // the grid until all `capacity` elements have been visited.
  for (; idx < capacity; idx += gridDim.x * blockDim.x) {

    if (ptr_A[idx] != ptr_B[idx]) {
      *equal = 0;
      return;
    }
  }
}
/// Device-side kernel: clears *equal on the first pair of elements that fail
/// the relative-equality test for the given epsilon and nonzero floor.
template <typename Element>
__global__ void BlockCompareRelativelyEqual(
  int *equal,
  Element const *ptr_A,
  Element const *ptr_B,
  size_t capacity,
  Element epsilon,
  Element nonzero_floor) {

  size_t idx = threadIdx.x + blockDim.x * blockIdx.x;

  for (; idx < capacity; idx += gridDim.x * blockDim.x) {

    Element a = ptr_A[idx];
    Element b = ptr_B[idx];

    if (!relatively_equal(a, b, epsilon, nonzero_floor)) {
      *equal = 0;
      return;
    }
  }
}

} // namespace kernel
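For intuition, the following is a minimal sketch of a relative-equality rule consistent with the signature above. It is an illustrative assumption, not the exact definition supplied by the header providing relatively_equal(): magnitudes below nonzero_floor fall back to an absolute tolerance, while larger values are compared relative to their combined magnitude.

// Illustrative sketch only; the helper name is hypothetical.
template <typename T>
bool relatively_equal_sketch(T a, T b, T epsilon, T nonzero_floor) {
  T abs_A = (a < T(0) ? -a : a);
  T abs_B = (b < T(0) ? -b : b);
  T diff = (a < b ? b - a : a - b);

  if (a == b) {
    return true;                           // exact match, including both zero
  }
  if (abs_A < nonzero_floor && abs_B < nonzero_floor) {
    return diff < epsilon * nonzero_floor; // absolute tolerance near zero
  }
  return diff < epsilon * (abs_A + abs_B); // relative tolerance otherwise
}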
/// Performs an element-wise equality check between two blocks of device
/// memory, returning true if every element of ptr_A equals its counterpart
/// in ptr_B.
template <typename Element>
bool BlockCompareEqual(
  Element const *ptr_A,
  Element const *ptr_B,
  size_t capacity,
  int grid_size = 0,
  int block_size = 0) {

  int equal_flag = 1;
  int *device_equal_flag = nullptr;

  if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) {
    throw std::runtime_error("Failed to allocate device flag.");
  }

  if (cudaMemcpy(
        device_equal_flag,
        &equal_flag,
        sizeof(int),
        cudaMemcpyHostToDevice) != cudaSuccess) {

    throw std::runtime_error("Failed to copy equality flag to device.");
  }

  if (!grid_size || !block_size) {

    // If either launch parameter is zero, query the CUDA occupancy API for a
    // launch configuration.
    cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
      &grid_size,
      &block_size,
      reinterpret_cast<void const *>(kernel::BlockCompareEqual<Element>));

    if (result != cudaSuccess) {
      throw std::runtime_error("Failed to query occupancy.");
    }

    // Limit the block size; this increases the number of items processed per
    // thread and reduces the impact of initialization overhead.
    block_size = (block_size < 128 ? block_size : 128);
  }

  dim3 grid(grid_size, 1, 1);
  dim3 block(block_size, 1, 1);

  kernel::BlockCompareEqual<Element><<< grid, block >>>(device_equal_flag, ptr_A, ptr_B, capacity);

  if (cudaMemcpy(
        &equal_flag,
        device_equal_flag,
        sizeof(int),
        cudaMemcpyDeviceToHost) != cudaSuccess) {

    cudaFree(device_equal_flag);

    throw std::runtime_error("Failed to copy equality flag from device.");
  }

  cudaFree(device_equal_flag);

  return equal_flag;
}
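A usage sketch for this host-side entry point follows. The helper name, buffer sizes, and zero-initialized contents are hypothetical; error handling is elided for brevity.

// Hypothetical driver: allocates two identical device buffers and checks that
// BlockCompareEqual reports them equal.
bool TestBlockCompareEqual() {
  size_t const capacity = 1 << 20;
  float *ptr_A = nullptr;
  float *ptr_B = nullptr;

  cudaMalloc((void **)&ptr_A, capacity * sizeof(float));
  cudaMalloc((void **)&ptr_B, capacity * sizeof(float));
  cudaMemset(ptr_A, 0, capacity * sizeof(float));
  cudaMemset(ptr_B, 0, capacity * sizeof(float));

  // Zero grid/block sizes defer the launch configuration to the occupancy query.
  bool equal = BlockCompareEqual<float>(ptr_A, ptr_B, capacity);

  cudaFree(ptr_A);
  cudaFree(ptr_B);
  return equal;
}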
/// Performs an element-wise relative-equality check between two blocks of
/// device memory, returning true if every pair of elements is relatively
/// equal within the given epsilon and nonzero floor.
template <typename Element>
bool BlockCompareRelativelyEqual(
  Element const *ptr_A,
  Element const *ptr_B,
  size_t capacity,
  Element epsilon,
  Element nonzero_floor,
  int grid_size = 0,
  int block_size = 0) {

  int equal_flag = 1;
  int *device_equal_flag = nullptr;

  if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) {
    throw std::runtime_error("Failed to allocate device flag.");
  }

  if (cudaMemcpy(
        device_equal_flag,
        &equal_flag,
        sizeof(int),
        cudaMemcpyHostToDevice) != cudaSuccess) {

    throw std::runtime_error("Failed to copy equality flag to device.");
  }

  if (!grid_size || !block_size) {

    // If either launch parameter is zero, query the CUDA occupancy API for a
    // launch configuration.
    cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
      &grid_size,
      &block_size,
      reinterpret_cast<void const *>(kernel::BlockCompareRelativelyEqual<Element>));

    if (result != cudaSuccess) {
      throw std::runtime_error("Failed to query occupancy.");
    }

    // Limit the block size; this increases the number of items processed per
    // thread and reduces the impact of initialization overhead.
    block_size = (block_size < 128 ? block_size : 128);
  }

  dim3 grid(grid_size, 1, 1);
  dim3 block(block_size, 1, 1);

  kernel::BlockCompareRelativelyEqual<Element><<< grid, block >>>(
    device_equal_flag,
    ptr_A,
    ptr_B,
    capacity,
    epsilon,
    nonzero_floor);

  if (cudaMemcpy(
        &equal_flag,
        device_equal_flag,
        sizeof(int),
        cudaMemcpyDeviceToHost) != cudaSuccess) {

    cudaFree(device_equal_flag);

    throw std::runtime_error("Failed to copy equality flag from device.");
  }

  cudaFree(device_equal_flag);

  return equal_flag;
}
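A corresponding sketch for the relative comparison. The helper name VerifyApproximately and the tolerance values are illustrative assumptions chosen for single-precision data, not values prescribed by the header.

// Hypothetical helper: returns true if `test` matches `ref` to within a
// relative tolerance of 0.1%, treating magnitudes below 1e-6 as zero.
bool VerifyApproximately(float const *ref, float const *test, size_t capacity) {
  float epsilon = 0.001f;
  float nonzero_floor = 1e-6f;
  return BlockCompareRelativelyEqual<float>(ref, test, capacity, epsilon, nonzero_floor);
}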