Spaces:

aelitta
/

BioMistral_gradio

Runtime error

App Files Files Community

BioMistral_gradio / llama-cpp-python /vendor /llama.cpp /ggml-cuda /common.cuh

aelitta

Upload folder using huggingface_hub

4bdb245 verified over 1 year ago

raw

history blame contribute delete

22.1 kB

	#pragma once

	#include "ggml.h"
	#include "ggml-cuda.h"

	#include <memory>

	#if defined(GGML_USE_HIPBLAS)
	#define GGML_COMMON_DECL_HIP
	#define GGML_COMMON_IMPL_HIP
	#else
	#define GGML_COMMON_DECL_CUDA
	#define GGML_COMMON_IMPL_CUDA
	#endif
	#include "ggml-common.h"

	#include <cstdio>
	#include <array>
	#include <cassert>
	#include <cfloat>
	#include <string>

	#if defined(GGML_USE_HIPBLAS)
	#include <hip/hip_runtime.h>
	#include <hipblas/hipblas.h>
	#include <hip/hip_fp16.h>
	#ifdef __HIP_PLATFORM_AMD__
	// for rocblas_initialize()
	#include "rocblas/rocblas.h"
	#endif // __HIP_PLATFORM_AMD__
	#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
	#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
	#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
	#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
	#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
	#define CUBLAS_OP_N HIPBLAS_OP_N
	#define CUBLAS_OP_T HIPBLAS_OP_T
	#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
	#define CUBLAS_TF32_TENSOR_OP_MATH 0
	#define CUDA_R_16F HIPBLAS_R_16F
	#define CUDA_R_32F HIPBLAS_R_32F
	#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
	#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
	#define cublasCreate hipblasCreate
	#define cublasDestroy hipblasDestroy
	#define cublasGemmEx hipblasGemmEx
	#define cublasGemmBatchedEx hipblasGemmBatchedEx
	#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
	#define cublasHandle_t hipblasHandle_t
	#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
	#define cublasSetStream hipblasSetStream
	#define cublasSgemm hipblasSgemm
	#define cublasStatus_t hipblasStatus_t
	#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
	#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
	#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
	#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
	#define cudaDeviceProp hipDeviceProp_t
	#define cudaDeviceSynchronize hipDeviceSynchronize
	#define cudaError_t hipError_t
	#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
	#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
	#define cudaEventCreateWithFlags hipEventCreateWithFlags
	#define cudaEventDisableTiming hipEventDisableTiming
	#define cudaEventRecord hipEventRecord
	#define cudaEventSynchronize hipEventSynchronize
	#define cudaEvent_t hipEvent_t
	#define cudaEventDestroy hipEventDestroy
	#define cudaFree hipFree
	#define cudaFreeHost hipHostFree
	#define cudaGetDevice hipGetDevice
	#define cudaGetDeviceCount hipGetDeviceCount
	#define cudaGetDeviceProperties hipGetDeviceProperties
	#define cudaGetErrorString hipGetErrorString
	#define cudaGetLastError hipGetLastError
	#define cudaHostRegister hipHostRegister
	#define cudaHostRegisterPortable hipHostRegisterPortable
	#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
	#define cudaHostUnregister hipHostUnregister
	#define cudaLaunchHostFunc hipLaunchHostFunc
	#ifdef GGML_HIP_UMA
	#define cudaMalloc hipMallocManaged
	#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
	#else
	#define cudaMalloc hipMalloc
	#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
	#endif
	#define cudaMemcpy hipMemcpy
	#define cudaMemcpyAsync hipMemcpyAsync
	#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
	#define cudaMemcpy2DAsync hipMemcpy2DAsync
	#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
	#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
	#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
	#define cudaMemcpyKind hipMemcpyKind
	#define cudaMemset hipMemset
	#define cudaMemsetAsync hipMemsetAsync
	#define cudaMemGetInfo hipMemGetInfo
	#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
	#define cudaSetDevice hipSetDevice
	#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
	#define cudaStreamDestroy hipStreamDestroy
	#define cudaStreamFireAndForget hipStreamFireAndForget
	#define cudaStreamNonBlocking hipStreamNonBlocking
	#define cudaStreamPerThread hipStreamPerThread
	#define cudaStreamSynchronize hipStreamSynchronize
	#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
	#define cudaStream_t hipStream_t
	#define cudaSuccess hipSuccess
	#define __trap abort
	#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
	#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
	#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
	#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
	#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
	#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
	#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
	#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
	#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
	#else
	#include <cuda_runtime.h>
	#include <cuda.h>
	#include <cublas_v2.h>
	#include <cuda_fp16.h>

	#if CUDART_VERSION < 11020
	#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
	#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
	#define CUBLAS_COMPUTE_16F CUDA_R_16F
	#define CUBLAS_COMPUTE_32F CUDA_R_32F
	#define cublasComputeType_t cudaDataType_t
	#endif // CUDART_VERSION < 11020

	#endif // defined(GGML_USE_HIPBLAS)

	#define STRINGIZE_IMPL(...) #__VA_ARGS__
	#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

	#define WARP_SIZE 32
	#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
	#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

	#define CC_PASCAL 600
	#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
	#define CC_VOLTA 700
	#define CC_AMPERE 800
	#define CC_OFFSET_AMD 1000000
	#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
	#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
	#define CC_RDNA3 (CC_OFFSET_AMD + 1100)

	// Define this if you want to always fall back to MMQ kernels and not use cuBLAS for matrix multiplication.
	// On modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores, which are very performant
	// for large computational tasks. The drawback is that this requires some extra VRAM:
	// - 7B quantized model: +100-200 MB
	// - 13B quantized model: +200-400 MB
	//
	//#define GGML_CUDA_FORCE_MMQ

	// TODO: improve this to be correct for more hardware
	// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
	#if !defined(GGML_CUDA_FORCE_MMQ)
	#define CUDA_USE_TENSOR_CORES
	#endif

	#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
	#define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available

	#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

	#if defined(_MSC_VER)
	#pragma warning(disable: 4244 4267) // possible loss of data
	#endif

	#define GGML_CUDA_MAX_STREAMS 8

	// Error handler called by CUDA_CHECK_GEN when a checked call does not return its success value.
	//   stmt - text of the failing expression
	//   func - name of the enclosing function
	//   file - source file of the call site
	//   line - source line of the call site
	//   msg  - error description produced by the matching error-string function
	// Declared [[noreturn]]: control never comes back to the caller. Only the declaration is in this header.
	[[noreturn]]
	void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);

	// Generic status check: evaluates `err` exactly once, compares it with `success`, and on mismatch calls
	// ggml_cuda_error with the stringified expression, the call site, and error_fn(result) as the message.
	// The do { ... } while (0) wrapper makes the macro behave as a single statement.
	// (No comments are placed inside the macro body: a // comment before a line continuation would swallow the next line.)
	#define CUDA_CHECK_GEN(err, success, error_fn) \
	do { \
	auto err_ = (err); \
	if (err_ != (success)) { \
	ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
	} \
	} while (0)

	// Check for CUDA runtime calls: success is cudaSuccess, the message comes from cudaGetErrorString.
	#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)

	// Maps a cuBLAS status code to a printable name.
	// With CUDA 12.0 or newer the library's own cublasGetStatusString is used; otherwise the
	// known status codes are matched by hand and anything else yields "unknown error".
	static const char * cublas_get_error_str(const cublasStatus_t err) {
	#if CUDART_VERSION >= 12000
	    return cublasGetStatusString(err);
	#else
	    if (err == CUBLAS_STATUS_SUCCESS)          { return "CUBLAS_STATUS_SUCCESS"; }
	    if (err == CUBLAS_STATUS_NOT_INITIALIZED)  { return "CUBLAS_STATUS_NOT_INITIALIZED"; }
	    if (err == CUBLAS_STATUS_ALLOC_FAILED)     { return "CUBLAS_STATUS_ALLOC_FAILED"; }
	    if (err == CUBLAS_STATUS_INVALID_VALUE)    { return "CUBLAS_STATUS_INVALID_VALUE"; }
	    if (err == CUBLAS_STATUS_ARCH_MISMATCH)    { return "CUBLAS_STATUS_ARCH_MISMATCH"; }
	    if (err == CUBLAS_STATUS_MAPPING_ERROR)    { return "CUBLAS_STATUS_MAPPING_ERROR"; }
	    if (err == CUBLAS_STATUS_EXECUTION_FAILED) { return "CUBLAS_STATUS_EXECUTION_FAILED"; }
	    if (err == CUBLAS_STATUS_INTERNAL_ERROR)   { return "CUBLAS_STATUS_INTERNAL_ERROR"; }
	    if (err == CUBLAS_STATUS_NOT_SUPPORTED)    { return "CUBLAS_STATUS_NOT_SUPPORTED"; }
	    return "unknown error";
	#endif // CUDART_VERSION >= 12000
	}

	#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)

	#if !defined(GGML_USE_HIPBLAS)
	// Translates a CUDA driver API result code into a printable string.
	// cuGetErrorString reports failure and stores NULL in the output pointer when it does not
	// recognize the code, so the result is checked and a fixed fallback string is returned instead
	// of handing a null (or uninitialized) pointer to the error reporter.
	static const char * cu_get_error_str(CUresult err) {
	    const char * err_str = nullptr;
	    if (cuGetErrorString(err, &err_str) != CUDA_SUCCESS || err_str == nullptr) {
	        return "unknown error";
	    }
	    return err_str;
	}
	// Check for CUDA driver API calls: success is CUDA_SUCCESS, the message comes from cu_get_error_str.
	#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
	#endif

	// Optimizer hint: with CUDART_VERSION >= 11100 the condition is forwarded to __builtin_assume,
	// otherwise the macro expands to nothing (the argument is then not evaluated at all).
	// An undefined CUDART_VERSION evaluates as 0 in #if, which selects the empty variant.
	#if CUDART_VERSION >= 11100
	#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
	#else
	#define GGML_CUDA_ASSUME(x)
	#endif // CUDART_VERSION >= 11100

	// Scalar and 2-component value types used for dequantized data:
	// half / half2 when GGML_CUDA_F16 is defined, float / float2 otherwise.
	#ifdef GGML_CUDA_F16
	typedef half dfloat; // dequantize float
	typedef half2 dfloat2;
	#else
	typedef float dfloat; // dequantize float
	typedef float2 dfloat2;
	#endif //GGML_CUDA_F16

	// Device-side failure path for code that was not compiled for the running architecture.
	// Prints a diagnostic with the call site (file_name, line, function_name) and the architecture value,
	// then calls __trap(). Declared [[noreturn]].
	//   arch      - architecture value the calling code was compiled for
	//   arch_list - list of architectures to print; only used in the non-AMD message
	[[noreturn]]
	static __device__ void no_device_code(
	const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {

	#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
	// AMD HIP build: the message has no architecture list, so arch_list is explicitly marked unused.
	printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
	file_name, line, function_name, arch);
	GGML_UNUSED(arch_list);
	#else
	printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
	file_name, line, function_name, arch, arch_list);
	#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
	__trap();

	GGML_UNUSED(no_device_code); // suppress unused function warning
	}

	// Marker for code paths that have no implementation for the current architecture.
	// When __CUDA_ARCH__ is defined it expands to a call of no_device_code with the call site, the value of
	// __CUDA_ARCH__ and the stringified __CUDA_ARCH_LIST__. Otherwise it expands to nothing: the trailing
	// text on the second #define is a comment and is not part of the macro.
	// NOTE(review): for HIP builds this header defines __CUDA_ARCH__ only further down, after this point -
	// confirm which variant is selected there.
	#ifdef __CUDA_ARCH__
	#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
	#else
	#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
	#endif // __CUDA_ARCH__

	// Sums x across a 32-lane warp with XOR shuffles at lane distances 16, 8, 4, 2, 1.
	// Uses the full participation mask 0xffffffff, so all 32 lanes are expected to call it together.
	// Returns the accumulated value held by the calling lane after the last exchange.
	static __device__ __forceinline__ float warp_reduce_sum(float x) {
	#pragma unroll
	    for (int lane_dist = 16; lane_dist > 0; lane_dist >>= 1) {
	        const float partner = __shfl_xor_sync(0xffffffff, x, lane_dist, 32);
	        x = x + partner;
	    }
	    return x;
	}

	// Component-wise variant of the warp sum: reduces a.x and a.y independently across a 32-lane warp
	// using XOR shuffles at lane distances 16, 8, 4, 2, 1 with the full mask 0xffffffff.
	static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
	#pragma unroll
	    for (int lane_dist = 16; lane_dist > 0; lane_dist >>= 1) {
	        // Both exchanges read the values from before this step; x and y do not depend on each other.
	        const float partner_x = __shfl_xor_sync(0xffffffff, a.x, lane_dist, 32);
	        const float partner_y = __shfl_xor_sync(0xffffffff, a.y, lane_dist, 32);
	        a.x += partner_x;
	        a.y += partner_y;
	    }
	    return a;
	}

	// half2 warp sum: adds both halves with __hadd2 across a 32-lane warp (XOR shuffles, full mask).
	// Only implemented for non-AMD builds with __CUDA_ARCH__ >= CC_PASCAL; every other configuration
	// marks the argument unused and expands NO_DEVICE_CODE.
	static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
	#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
	#pragma unroll
	    for (int lane_dist = 16; lane_dist > 0; lane_dist >>= 1) {
	        const half2 partner = __shfl_xor_sync(0xffffffff, a, lane_dist, 32);
	        a = __hadd2(a, partner);
	    }
	    return a;
	#else
	    GGML_UNUSED(a);
	    NO_DEVICE_CODE;
	#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
	}

	// Maximum of x across a 32-lane warp: combines with fmaxf over XOR shuffles at lane distances
	// 16, 8, 4, 2, 1, using the full participation mask 0xffffffff.
	static __device__ __forceinline__ float warp_reduce_max(float x) {
	#pragma unroll
	    for (int lane_dist = 16; lane_dist > 0; lane_dist >>= 1) {
	        const float partner = __shfl_xor_sync(0xffffffff, x, lane_dist, 32);
	        x = fmaxf(x, partner);
	    }
	    return x;
	}

	// Returns the larger of two half values.
	// Non-AMD builds: uses the __hmax intrinsic when CUDART_VERSION >= CUDART_HMAX, otherwise compares
	// the operands after conversion to float and returns the original half operand.
	// AMD HIP builds: no implementation; the arguments are marked unused and NO_DEVICE_CODE is expanded.
	static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
	#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))

	#if CUDART_VERSION >= CUDART_HMAX
	return __hmax(a, b);
	#else
	return __half2float(a) > __half2float(b) ? a : b;
	#endif // CUDART_VERSION >= CUDART_HMAX

	#else
	GGML_UNUSED(a);
	GGML_UNUSED(b);
	NO_DEVICE_CODE;
	#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
	}
	// Returns the element-wise maximum of two half2 values.
	// Non-AMD builds: uses the __hmax2 intrinsic when CUDART_VERSION >= CUDART_HMAX, otherwise selects
	// the low and high halves separately by comparing them as floats.
	// AMD HIP builds: no implementation; the arguments are marked unused and NO_DEVICE_CODE is expanded.
	static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
	#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))

	#if CUDART_VERSION >= CUDART_HMAX
	return __hmax2(a, b);
	#else
	half2 ret;
	// The components of ret are written through a half reference.
	reinterpret_cast<half&>(ret.x) = __low2float(a) > __low2float(b) ? __low2half(a) : __low2half(b);
	reinterpret_cast<half&>(ret.y) = __high2float(a) > __high2float(b) ? __high2half(a) : __high2half(b);
	return ret;
	#endif // CUDART_VERSION >= CUDART_HMAX

	#else
	GGML_UNUSED(a);
	GGML_UNUSED(b);
	NO_DEVICE_CODE;
	#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
	}

	// half2 warp maximum: combines both halves with ggml_cuda_hmax2 across a 32-lane warp
	// (XOR shuffles at lane distances 16, 8, 4, 2, 1, full mask).
	// Only implemented for non-AMD builds with __CUDA_ARCH__ >= CC_PASCAL; every other configuration
	// marks the argument unused and expands NO_DEVICE_CODE.
	static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
	#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
	#pragma unroll
	    for (int lane_dist = 16; lane_dist > 0; lane_dist >>= 1) {
	        const half2 partner = __shfl_xor_sync(0xffffffff, x, lane_dist, 32);
	        x = ggml_cuda_hmax2(x, partner);
	    }
	    return x;
	#else
	    GGML_UNUSED(x);
	    NO_DEVICE_CODE;
	#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
	}

	#if CUDART_VERSION < CUDART_HMASK
	// Replacement for __hgt2_mask, defined only when CUDART_VERSION is below CUDART_HMASK (CUDA 12.0).
	// Compares the two halves of a and b as floats and returns a 32-bit mask:
	// bits 0-15 are all ones if low(a) > low(b), bits 16-31 are all ones if high(a) > high(b),
	// and the respective 16 bits are zero otherwise.
	static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
	    // Each comparison yields 0 or 1; multiplying by the 16-bit pattern turns it into a half mask.
	    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
	    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
	    return mask_low | mask_high;
	}
	#endif // CUDART_VERSION < CUDART_HMASK

	#if defined(GGML_USE_HIPBLAS)
	#define __CUDA_ARCH__ 1300

	// RDNA3 is defined when compiling for one of the gfx11xx targets listed below.
	#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
	    defined(__gfx1150__) || defined(__gfx1151__)
	#define RDNA3
	#endif

	// RDNA2 is defined when compiling for one of the gfx103x targets listed below.
	#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
	    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
	#define RDNA2
	#endif

	#ifndef __has_builtin
	#define __has_builtin(x) 0
	#endif

	typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
	typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
	// HIP stand-in for __vsubss4: treats a and b as four packed signed bytes and subtracts them
	// lane by lane with signed saturation, returning the four results packed into an int.
	// Uses __builtin_elementwise_sub_sat when the compiler provides it; otherwise each lane is
	// computed in 16 bits and clamped to the int8_t range by hand.
	static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
	    const int8x4_t lhs = reinterpret_cast<const int8x4_t&>(a);
	    const int8x4_t rhs = reinterpret_cast<const int8x4_t&>(b);
	#if __has_builtin(__builtin_elementwise_sub_sat)
	    const int8x4_t packed = __builtin_elementwise_sub_sat(lhs, rhs);
	    return reinterpret_cast<const int &>(packed);
	#else
	    const int16_t lane_min = std::numeric_limits<int8_t>::min();
	    const int16_t lane_max = std::numeric_limits<int8_t>::max();
	    int8x4_t packed;
	#pragma unroll
	    for (int lane = 0; lane < 4; lane++) {
	        // The 16-bit intermediate holds the full difference of two 8-bit values before clamping.
	        const int16_t diff = lhs[lane] - rhs[lane];
	        packed[lane] = diff > lane_max ? lane_max : (diff < lane_min ? lane_min : diff);
	    }
	    return reinterpret_cast<int &>(packed);
	#endif // __has_builtin(__builtin_elementwise_sub_sat)
	}

	// HIP stand-in for __vsub4: per-byte subtraction of two packed 4x8-bit values.
	// Implemented by forwarding to __vsubss4, i.e. the result saturates to the int8_t range.
	// NOTE(review): CUDA documents __vsub4 as wrapping rather than saturating - confirm that callers
	// never depend on wrap-around before relying on this for new code.
	static __device__ __forceinline__ int __vsub4(const int a, const int b) {
	return __vsubss4(a, b);
	}

	// HIP stand-in for __vcmpeq4: compares a and b byte by byte and returns a packed mask in which
	// every byte is 0xff where the corresponding bytes are equal and 0x00 where they differ.
	static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
	    const uint8x4_t & lhs = reinterpret_cast<const uint8x4_t&>(a);
	    const uint8x4_t & rhs = reinterpret_cast<const uint8x4_t&>(b);
	    unsigned int result;
	    // View of the result as four bytes; all four lanes are written below before result is read.
	    uint8x4_t & result_bytes = reinterpret_cast<uint8x4_t&>(result);
	#pragma unroll
	    for (int lane = 0; lane < 4; ++lane) {
	        if (lhs[lane] == rhs[lane]) {
	            result_bytes[lane] = 0xff;
	        } else {
	            result_bytes[lane] = 0x00;
	        }
	    }
	    return result;
	}

	// HIP stand-in for __dp4a: treats a and b as four packed signed bytes, computes their dot
	// product and returns it added to c.
	// The implementation is chosen per target:
	//   gfx906 / gfx908 / gfx90a / gfx1030 - __builtin_amdgcn_sdot4
	//   RDNA3                              - __builtin_amdgcn_sudot4 with both operands flagged signed
	//   gfx1010 / gfx900                   - inline assembly (byte-selected 24-bit multiplies + 3-way adds)
	//   anything else                      - plain C++ fallback
	static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
	#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
	    c = __builtin_amdgcn_sdot4(a, b, c, false);
	#elif defined(RDNA3)
	    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
	#elif defined(__gfx1010__) || defined(__gfx900__)
	    int tmp1;
	    int tmp2;
	    // Bytes 0 and 1 are multiplied and accumulated into c first, then bytes 2 and 3.
	    asm("\n \
	        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
	        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
	        v_add3_u32 %0, %1, %2, %0 \n \
	        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
	        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
	        v_add3_u32 %0, %1, %2, %0 \n \
	        "
	        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
	        : "v"(a), "v"(b)
	    );
	#else
	    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
	    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
	    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
	#endif
	    return c;
	}
	#endif // defined(GGML_USE_HIPBLAS)

	// Preprocessor condition for FP16 arithmetic support, meant to be used as `#if FP16_AVAILABLE`:
	// on AMD HIP builds it holds for RDNA1/RDNA2/RDNA3 targets, otherwise it requires __CUDA_ARCH__ >= CC_PASCAL.
	#define FP16_AVAILABLE defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
	    defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL

	// Preprocessor condition for FP16 matrix-multiply-accumulate support:
	// never holds on AMD HIP builds, otherwise it requires __CUDA_ARCH__ >= CC_VOLTA.
	#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA

	// TODO: move to ggml-common.h
	// 16-entry table of signed 8-bit values declared with __device__.
	// Presumably the value codebook for the IQ4_NL quantization format (inferred from the name - TODO confirm).
	static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

	// Pointer to a dequantization routine that writes a pair of values into v.
	// Presumably vx is the quantized source data, ib a block index and iqs an index within the block
	// (inferred from the parameter names - TODO confirm against the implementations).
	typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);


	//////////////////////

	// Snapshot of the devices visible to the backend, with one cuda_device_info entry per device slot.
	struct ggml_cuda_device_info {
	// presumably the number of valid entries in `devices` - TODO confirm against the code that fills it
	int device_count;

	// Properties recorded for a single device.
	struct cuda_device_info {
	int cc; // compute capability
	int nsm; // number of streaming multiprocessors
	size_t smpb; // max. shared memory per block
	bool vmm; // virtual memory support
	size_t vmm_granularity; // granularity of virtual memory
	size_t total_vram; // presumably total device memory in bytes - TODO confirm
	};

	// Fixed-size table with GGML_CUDA_MAX_DEVICES slots, value-initialized.
	cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};

	// One float per device slot, zero-initialized here; how the values are computed is not visible in this header.
	std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
	};

	// Returns a const reference to the device information table. Only the declaration is in this header.
	const ggml_cuda_device_info & ggml_cuda_info();

	// Device selection helpers; defined elsewhere.
	// NOTE(review): presumably thin wrappers around cudaSetDevice / cudaGetDevice - confirm in the implementation.
	void ggml_cuda_set_device(int device);
	int ggml_cuda_get_device();

	// Abstract interface of a memory pool; concrete pools implement alloc and free.
	struct ggml_cuda_pool {
	// Virtual destructor so that pools can be destroyed through a base pointer.
	virtual ~ggml_cuda_pool() = default;

	// Requests `size` bytes; the implementation reports the size it actually provided via *actual_size.
	virtual void * alloc(size_t size, size_t * actual_size) = 0;
	// Returns a block to the pool. ggml_cuda_pool_alloc passes the actual_size reported by alloc as `size`.
	virtual void free(void * ptr, size_t size) = 0;
	};

	// Scoped owner of a single allocation taken from a ggml_cuda_pool.
	// The buffer is handed back to the pool in the destructor. The type is neither copyable nor
	// movable, so an allocation always has exactly one owner.
	template<typename T>
	struct ggml_cuda_pool_alloc {
	    ggml_cuda_pool * pool = nullptr; // pool the buffer came from (not owned)
	    T * ptr = nullptr;               // current buffer, nullptr while nothing is allocated
	    size_t actual_size = 0;          // size reported by the pool for the current buffer

	    ggml_cuda_pool_alloc() = default;

	    // Binds the object to a pool without allocating yet.
	    explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {
	    }

	    // Binds the object to a pool and immediately allocates `size` elements.
	    ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
	        alloc(size);
	    }

	    // Gives the buffer back to the pool, if one was allocated.
	    ~ggml_cuda_pool_alloc() {
	        if (ptr != nullptr) {
	            pool->free(ptr, actual_size);
	        }
	    }

	    // size is in number of elements
	    // Requires a bound pool and no existing allocation (both asserted). Returns the new buffer.
	    T * alloc(size_t size) {
	        GGML_ASSERT(pool != nullptr);
	        GGML_ASSERT(ptr == nullptr);
	        // The pool works in bytes, so the element count is scaled by sizeof(T).
	        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
	        return ptr;
	    }

	    // Binds (or rebinds) the pool and then allocates `size` elements from it.
	    T * alloc(ggml_cuda_pool & pool, size_t size) {
	        this->pool = &pool;
	        return alloc(size);
	    }

	    // Returns the current buffer, or nullptr if nothing has been allocated.
	    T * get() {
	        return ptr;
	    }

	    ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
	    ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
	    ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
	    ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;
	};


	// backend interface

	// Per-tensor GPU bookkeeping: one data pointer per device slot and one event per (device, stream) pair.
	// The members have no initializers here, so they must be set by whoever creates the struct.
	struct ggml_tensor_extra_gpu {
	void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
	cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
	};

	// Per-backend state for one primary device: streams, cuBLAS handles and memory pools.
	// All three are created on first use and indexed by device, so a single context can also
	// hold resources for the other devices.
	struct ggml_backend_cuda_context {
	    int device;                      // primary device of this context
	    std::string name;                // GGML_CUDA_NAME followed by the device index
	    cudaEvent_t copy_event = nullptr;

	    cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
	    cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

	    explicit ggml_backend_cuda_context(int device) :
	        device(device),
	        name(GGML_CUDA_NAME + std::to_string(device)) {
	    }

	    // Destroys the copy event (if any), then for each device its streams followed by its cuBLAS handle.
	    // Entries that were never created are still nullptr and are skipped.
	    ~ggml_backend_cuda_context() {
	        if (copy_event != nullptr) {
	            CUDA_CHECK(cudaEventDestroy(copy_event));
	        }
	        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
	            for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
	                if (streams[i][j] == nullptr) {
	                    continue;
	                }
	                CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
	            }
	            if (cublas_handles[i] == nullptr) {
	                continue;
	            }
	            CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
	        }
	    }

	    // Returns the requested stream of the given device, creating it as a non-blocking stream
	    // (after switching to that device) on first use.
	    cudaStream_t stream(int device, int stream) {
	        if (streams[device][stream] != nullptr) {
	            return streams[device][stream];
	        }
	        ggml_cuda_set_device(device);
	        CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
	        return streams[device][stream];
	    }

	    // Stream 0 of the primary device.
	    cudaStream_t stream() {
	        return stream(device, 0);
	    }

	    // Returns the cuBLAS handle of the given device. On first use the handle is created
	    // (after switching to that device) and its math mode is set to CUBLAS_TF32_TENSOR_OP_MATH.
	    cublasHandle_t cublas_handle(int device) {
	        if (cublas_handles[device] != nullptr) {
	            return cublas_handles[device];
	        }
	        ggml_cuda_set_device(device);
	        CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));
	        CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));
	        return cublas_handles[device];
	    }

	    // cuBLAS handle of the primary device.
	    cublasHandle_t cublas_handle() {
	        return cublas_handle(device);
	    }

	    // pool
	    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];

	    // Factory for a device's pool; defined outside this header.
	    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);

	    // Returns the memory pool of the given device, creating it on first use.
	    ggml_cuda_pool & pool(int device) {
	        std::unique_ptr<ggml_cuda_pool> & slot = pools[device];
	        if (slot == nullptr) {
	            slot = new_pool_for_device(device);
	        }
	        return *slot;
	    }

	    // Memory pool of the primary device.
	    ggml_cuda_pool & pool() {
	        return pool(device);
	    }
	};