[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
#if CUDART_VERSION >= 12000
static const char * cublas_get_error_str(const cublasStatus_t err) {
    return cublasGetStatusString(err);
}
#else
static const char * cublas_get_error_str(const cublasStatus_t err) {
    switch (err) {
        case CUBLAS_STATUS_SUCCESS:          return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED:  return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED:     return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE:    return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH:    return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR:    return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR:   return "CUBLAS_STATUS_INTERNAL_ERROR";
        case CUBLAS_STATUS_NOT_SUPPORTED:    return "CUBLAS_STATUS_NOT_SUPPORTED";
        default: return "unknown error";
    }
}
#endif // CUDART_VERSION >= 12000
static const char * cu_get_error_str(CUresult err) {
    const char * err_str;
    cuGetErrorString(err, &err_str);
    return err_str;
}
#ifdef GGML_CUDA_F16
typedef half  dfloat; // dequantize float
typedef half2 dfloat2;
#else
typedef float  dfloat; // dequantize float
typedef float2 dfloat2;
#endif // GGML_CUDA_F16
static constexpr bool fast_fp16_available(const int cc) {
    return cc >= CC_PASCAL && cc != 610;
}

static constexpr bool fp16_mma_available(const int cc) {
    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
}

static constexpr bool int8_mma_available(const int cc) {
    return cc < CC_OFFSET_AMD && cc >= CC_TURING;
}
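// Illustrative host-side use (hypothetical snippet, not part of the original
// header): since these helpers are constexpr they can gate kernel selection
// from a device's compute capability, e.g.:
//
//   const int  cc      = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
//   const bool use_mma = fp16_mma_available(cc); // tensor-core path vs. fallback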
[[noreturn]]
static __device__ void no_device_code(
    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
           file_name, line, function_name, arch);
    GGML_UNUSED(arch_list);
#else
    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    __trap();

    GGML_UNUSED(no_device_code); // suppress unused function warning
}
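// Note: the NO_DEVICE_CODE macro used below is defined alongside this helper;
// in device code it expands to a no_device_code() call with the current
// file/line/function and target architecture, so kernel variants compiled for
// an unsupported arch abort with a clear message instead of miscomputing.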
static __device__ __forceinline__ int warp_reduce_sum(int x) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
    // single-instruction warp reduction, available on Ampere and newer
    return __reduce_add_sync(0xffffffff, x);
#else
    // generic butterfly reduction via warp shuffles
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
}
static __device__ __forceinline__ float warp_reduce_sum(float x) {
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
}

static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
    for (int mask = 16; mask > 0; mask >>= 1) {
        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
    }
    return a;
}
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#ifdef FP16_AVAILABLE

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    for (int mask = 16; mask > 0; mask >>= 1) {
        const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
        reinterpret_cast<half&>(a.x) +=  __low2half(a_other);
        reinterpret_cast<half&>(a.y) += __high2half(a_other);
    }
    return a;
#else
    for (int mask = 16; mask > 0; mask >>= 1) {
        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
    }
    return a;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

#else
    NO_DEVICE_CODE;
    return a;
#endif // FP16_AVAILABLE
}
static __device__ __forceinline__ float warp_reduce_max(float x) {
    for (int mask = 16; mask > 0; mask >>= 1) {
        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
    }
    return x;
}
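// Usage sketch (illustrative only, not part of this header): one warp per row,
// each lane accumulates a strided partial sum, then warp_reduce_sum() combines
// the 32 partials with __shfl_xor_sync so every lane holds the row total.
// The kernel name and launch configuration below are hypothetical.
//
//   static __global__ void row_sum_example(const float * x, float * dst, const int ncols) {
//       const int row = blockIdx.x;
//       float sum = 0.0f;
//       for (int col = threadIdx.x; col < ncols; col += 32) {
//           sum += x[row*ncols + col];
//       }
//       sum = warp_reduce_sum(sum);
//       if (threadIdx.x == 0) {
//           dst[row] = sum;
//       }
//   }
//
//   // launched as: row_sum_example<<<nrows, 32, 0, stream>>>(x, dst, ncols);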
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#ifdef FP16_AVAILABLE

#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
    return __float2half(fmaxf(__half2float(a), __half2float(b)));
#else
    return __hmax(a, b);
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX

#else
    NO_DEVICE_CODE;
    GGML_UNUSED(b);
    return a;
#endif // FP16_AVAILABLE
}
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))

#if CUDART_VERSION >= CUDART_HMAX
    return __hmax2(a, b);
#else
    half2 ret;
    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
    return ret;
#endif // CUDART_VERSION >= CUDART_HMAX

#else
    GGML_UNUSED(a);
    GGML_UNUSED(b);
    NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
}
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
    for (int mask = 16; mask > 0; mask >>= 1) {
        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
    }
    return x;
#else
    GGML_UNUSED(x);
    NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}
static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
    return mask_low | mask_high;
}
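// Worked example (illustrative): for a = {1.0h, -2.0h} and b = {0.0h, 0.0h} the
// low lanes compare 1.0 > 0.0 (true) and the high lanes -2.0 > 0.0 (false), so
// __hgt2_mask returns 0x0000FFFF | 0x00000000 == 0x0000FFFF.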
// dot product of 4 packed int8 values with int32 accumulate: returns c + sum_i a8[i]*b8[i]
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
    c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3)
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
    int tmp1;
    int tmp2;
    asm("\n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        "
        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
        : "v"(a), "v"(b)
    );
#else
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
#endif
    return c;

#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

#if __CUDA_ARCH__ >= MIN_CC_DP4A
    return __dp4a(a, b, c);
#else // __CUDA_ARCH__ >= MIN_CC_DP4A
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A

#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}
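// Worked example (illustrative): a = 0x04030201 packs the int8 lanes {1, 2, 3, 4}
// and b = 0x01010101 packs {1, 1, 1, 1}, so on every code path
// ggml_cuda_dp4a(a, b, 10) == 10 + 1*1 + 2*1 + 3*1 + 4*1 == 20.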
// TODO: move to ggml-common.h
static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);

static __device__ __forceinline__ float get_alibi_slope(
    const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1
) {
    if (max_bias <= 0.0f) {
        return 1.0f;
    }
    const float base = h < n_head_log2 ? m0 : m1;
    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

    return powf(base, exph);
}
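// Host-side sketch (illustrative, not part of the original header) of how the
// get_alibi_slope() parameters are typically derived by the callers before the
// kernel launch:
//
//   const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
//   const float    m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
//   const float    m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
//
// Head h then uses the slope get_alibi_slope(max_bias, h, n_head_log2, m0, m1).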
template <ggml_type type>
struct ggml_cuda_type_traits;

template<>
struct ggml_cuda_type_traits<GGML_TYPE_F16> {
    static constexpr int qk = 1;
    static constexpr int qr = 1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> {
    static constexpr int qk = QK4_0;
    static constexpr int qr = QR4_0;
    static constexpr int qi = QI4_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_1> {
    static constexpr int qk = QK4_1;
    static constexpr int qr = QR4_1;
    static constexpr int qi = QI4_1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_0> {
    static constexpr int qk = QK5_0;
    static constexpr int qr = QR5_0;
    static constexpr int qi = QI5_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
    static constexpr int qk = QK5_1;
    static constexpr int qr = QR5_1;
    static constexpr int qi = QI5_1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
    static constexpr int qk = QK8_0;
    static constexpr int qr = QR8_0;
    static constexpr int qi = QI8_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_K;
    static constexpr int qi = QI2_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q3_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_K;
    static constexpr int qi = QI3_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR4_K;
    static constexpr int qi = QI4_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR5_K;
    static constexpr int qi = QI5_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q6_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR6_K;
    static constexpr int qi = QI6_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XXS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_XXS;
    static constexpr int qi = QI2_XXS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_XS;
    static constexpr int qi = QI2_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_S;
    static constexpr int qi = QI2_S;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_XXS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_XXS;
    static constexpr int qi = QI3_XXS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR1_S;
    static constexpr int qi = QI1_S;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR1_M;
    static constexpr int qi = QI1_M;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
    static constexpr int qk = QK4_NL;
    static constexpr int qr = QR4_NL;
    static constexpr int qi = QI4_NL;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR4_XS;
    static constexpr int qi = QI4_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_S;
    static constexpr int qi = QI3_S;
};
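// Illustrative use of the traits (hypothetical helper, not part of the original
// header): templated code can read the per-type constants at compile time
// instead of switching on ggml_type at runtime.
//
//   template <ggml_type type>
//   static constexpr int blocks_per_row(const int ncols) {
//       return ncols / ggml_cuda_type_traits<type>::qk; // qk = weights per quant block
//   }
//
//   // e.g. blocks_per_row<GGML_TYPE_Q8_0>(4096) == 4096 / QK8_0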
//////////////////////

struct ggml_cuda_device_info {
    int device_count;

    struct cuda_device_info {
        int     cc;                 // compute capability
        int     nsm;                // number of streaming multiprocessors
        size_t  smpb;               // max. shared memory per block
        size_t  smpbo;              // max. shared memory per block (with opt-in)
        bool    vmm;                // virtual memory support
        size_t  vmm_granularity;    // granularity of virtual memory
        size_t  total_vram;
    };

    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};

    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
};

const ggml_cuda_device_info & ggml_cuda_info();

void ggml_cuda_set_device(int device);
int  ggml_cuda_get_device();
struct ggml_cuda_pool {
    virtual ~ggml_cuda_pool() = default;

    virtual void * alloc(size_t size, size_t * actual_size) = 0;
    virtual void free(void * ptr, size_t size) = 0;
};

template<typename T>
struct ggml_cuda_pool_alloc {
    ggml_cuda_pool * pool = nullptr;
    T * ptr = nullptr;
    size_t actual_size = 0;

    ggml_cuda_pool_alloc() = default;

    explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {
    }

    ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
        alloc(size);
    }

    ~ggml_cuda_pool_alloc() {
        if (ptr != nullptr) {
            pool->free(ptr, actual_size);
        }
    }

    // size is in number of elements
    T * alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
        return ptr;
    }

    T * alloc(ggml_cuda_pool & pool, size_t size) {
        this->pool = &pool;
        return alloc(size);
    }

    T * get() {
        return ptr;
    }

    ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
    ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;
};
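// Usage sketch (hypothetical code, not part of the original header): the RAII
// wrapper returns its buffer to the pool when it goes out of scope, so no
// explicit free is needed even on early return. The kernel name is made up.
//
//   void scale_rows(ggml_backend_cuda_context & ctx, const float * x, int64_t n) {
//       ggml_cuda_pool_alloc<float> tmp(ctx.pool(), n); // n floats from the pool
//       some_kernel<<<1, 32, 0, ctx.stream()>>>(tmp.get(), x, n);
//       // ... use tmp.get() ...
//   } // tmp is handed back to the pool here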
// backend interface

struct ggml_tensor_extra_gpu {
    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
};

struct ggml_graph_node_properties {
    void * node_address;
    ggml_op node_op;
    int64_t ne[GGML_MAX_DIMS];
    size_t nb[GGML_MAX_DIMS];
    void * src_address[GGML_MAX_SRC];
    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
};

struct ggml_cuda_graph {
    ~ggml_cuda_graph() {
        if (instance != nullptr) {
            CUDA_CHECK(cudaGraphExecDestroy(instance));
        }
        if (graph != nullptr) {
            CUDA_CHECK(cudaGraphDestroy(graph));
        }
    }

    cudaGraph_t graph = nullptr;
    cudaGraphExec_t instance = nullptr;
    size_t num_nodes = 0;
    std::vector<cudaGraphNode_t> nodes;
    std::vector<cudaKernelNodeParams> params;
    bool disable_due_to_gpu_arch = false;
    bool disable_due_to_too_many_updates = false;
    bool disable_due_to_failed_graph_capture = false;
    int number_consecutive_updates = 0;
    std::vector<ggml_graph_node_properties> ggml_graph_properties;
    std::vector<char **> updated_kernel_arg;
};
struct ggml_backend_cuda_context {
    int device;
    std::string name;
    cudaEvent_t copy_event = nullptr;

    cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
    cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

    std::unique_ptr<ggml_cuda_graph> cuda_graph;

    explicit ggml_backend_cuda_context(int device) :
        device(device),
        name(GGML_CUDA_NAME + std::to_string(device)) {
    }

    ~ggml_backend_cuda_context() {
        if (copy_event != nullptr) {
            CUDA_CHECK(cudaEventDestroy(copy_event));
        }
        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
            for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
                if (streams[i][j] != nullptr) {
                    CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
                }
            }
            if (cublas_handles[i] != nullptr) {
                CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
            }
        }
    }

    cudaStream_t stream(int device, int stream) {
        if (streams[device][stream] == nullptr) {
            ggml_cuda_set_device(device);
            CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
        }
        return streams[device][stream];
    }

    cudaStream_t stream() {
        return stream(device, 0);
    }

    cublasHandle_t cublas_handle(int device) {
        if (cublas_handles[device] == nullptr) {
            ggml_cuda_set_device(device);
            CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));
            CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));
        }
        return cublas_handles[device];
    }

    cublasHandle_t cublas_handle() {
        return cublas_handle(device);
    }

    // pool
    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];

    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);

    ggml_cuda_pool & pool(int device) {
        if (pools[device] == nullptr) {
            pools[device] = new_pool_for_device(device);
        }
        return *pools[device];
    }

    ggml_cuda_pool & pool() {
        return pool(device);
    }
};
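// Usage sketch (hypothetical, not part of the original header): streams, cuBLAS
// handles and memory pools are created lazily on first use, so constructing the
// context is cheap and per-device resources only exist for devices actually used.
//
//   ggml_backend_cuda_context ctx(/*device =*/ 0);
//   cudaStream_t     s = ctx.stream();        // creates streams[0][0] on demand
//   cublasHandle_t   h = ctx.cublas_handle(); // creates + configures the handle
//   ggml_cuda_pool & p = ctx.pool();          // creates the pool for device 0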