""" Functional API for BitLinear operations. This module provides the core functional implementations that will be called by the nn.Module wrappers. These functions implement the mathematical operations described in the BitNet and ternary neural network papers. """ import torch import torch.nn.functional as F from typing import Optional, Tuple def bitlinear_python( x: torch.Tensor, W: torch.Tensor, gamma: torch.Tensor, bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: """ Pure PyTorch reference implementation of BitLinear forward pass. This implements the core BitLinear computation: output = x @ W^T * gamma + bias where W is a ternary weight matrix ({-1, 0, +1}), and gamma is a per-output scaling factor that compensates for the quantization. Args: x: Input tensor of shape [..., in_features] W: Ternary weight matrix of shape [out_features, in_features] with values in {-1, 0, +1} gamma: Scaling factors of shape [out_features] or [1, out_features] bias: Optional bias tensor of shape [out_features] Returns: Output tensor of shape [..., out_features] Notes: - This is the reference implementation for correctness - CUDA kernels will optimize the ternary matrix multiplication - Gamma scaling is applied per output channel """ # Matrix multiplication: [..., in_features] @ [in_features, out_features] # W is [out_features, in_features], so we transpose it output = torch.matmul(x, W.t()) # Shape: [..., out_features] # Apply per-channel scaling with gamma # Ensure gamma broadcasts correctly: reshape to [1, out_features] if needed if gamma.dim() == 1: # Reshape gamma from [out_features] to [1, out_features] for broadcasting output = output * gamma.unsqueeze(0) else: # gamma is already 2D, use as is output = output * gamma # Add bias if provided if bias is not None: output = output + bias return output def greedy_ternary_decomposition( W: torch.Tensor, k: int, ) -> Tuple[torch.Tensor, torch.Tensor]: """ Greedy ternary decomposition of a weight matrix. Decomposes a dense weight matrix W into a sum of k ternary matrices: W ≈ sum_{i=1}^k gamma_i * W_i^ternary This follows the greedy residual minimization approach: 1. Quantize W to ternary → W_1, compute gamma_1 2. Compute residual R_1 = W - gamma_1 * W_1 3. Quantize R_1 to ternary → W_2, compute gamma_2 4. 
def greedy_ternary_decomposition(
    W: torch.Tensor,
    k: int,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Greedy ternary decomposition of a weight matrix.

    Decomposes a dense weight matrix W into a sum of k ternary matrices:

        W ≈ sum_{i=1}^k gamma_i * W_i^ternary

    This follows the greedy residual minimization approach:
        1. Quantize W to ternary → W_1, compute gamma_1
        2. Compute residual R_1 = W - gamma_1 * W_1
        3. Quantize R_1 to ternary → W_2, compute gamma_2
        4. Repeat for k iterations

    Args:
        W: Dense weight matrix of shape [out_features, in_features]
        k: Number of ternary components (typically 2-4 for BitNet)

    Returns:
        W_ternary: Stacked ternary matrices of shape [k, out_features, in_features]
        gammas: Scaling factors of shape [k, out_features]

    Notes:
        - Each iteration reduces the residual error
        - Larger k provides a better approximation but more computation
        - This is used in MultiTernaryLinear for improved expressiveness

    References:
        - BitNet paper: "BitNet: Scaling 1-bit Transformers for Large Language Models"
        - JMLR paper: https://jmlr.org/papers/volume26/24-2050/24-2050.pdf
    """
    from .quantization import weight_to_ternary

    # Initialize the residual with the original weight matrix.
    residual = W.clone()

    # Lists to store ternary components and their scaling factors.
    ternary_weights = []
    gammas = []

    # Greedy residual quantization loop.
    for i in range(k):
        # Quantize the current residual to ternary with per-channel scaling.
        W_t, gamma = weight_to_ternary(residual, per_channel=True)

        # Store this component.
        ternary_weights.append(W_t)
        gammas.append(gamma)

        # Compute the residual for the next iteration:
        #     residual = residual - gamma * W_t
        # Expand gamma for broadcasting: [out_features] -> [out_features, 1].
        residual = residual - (gamma.unsqueeze(1) * W_t)

    # Stack all components.
    W_ternary = torch.stack(ternary_weights, dim=0)  # [k, out_features, in_features]
    gammas_stacked = torch.stack(gammas, dim=0)      # [k, out_features]

    return W_ternary, gammas_stacked


def multi_ternary_linear_python(
    x: torch.Tensor,
    W_ternary: torch.Tensor,
    gammas: torch.Tensor,
    bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """
    Forward pass for a multi-component ternary linear layer.

    Computes the sum of k ternary linear transformations:

        output = sum_{i=1}^k (x @ W_i^T * gamma_i) + bias

    Args:
        x: Input tensor of shape [..., in_features]
        W_ternary: Stacked ternary weights of shape [k, out_features, in_features]
        gammas: Scaling factors of shape [k, out_features]
        bias: Optional bias tensor of shape [out_features]

    Returns:
        Output tensor of shape [..., out_features]
    """
    k = W_ternary.size(0)  # Number of ternary components

    # Initialize the accumulator with zeros of the output shape [..., out_features].
    output_shape = list(x.shape[:-1]) + [W_ternary.size(1)]
    output = torch.zeros(output_shape, dtype=x.dtype, device=x.device)

    # Sum contributions from all k ternary components.
    for i in range(k):
        # i-th ternary weight matrix and its scaling factor.
        W_i = W_ternary[i]   # [out_features, in_features]
        gamma_i = gammas[i]  # [out_features]

        # Compute x @ W_i^T * gamma_i and accumulate.
        component_output = bitlinear_python(x, W_i, gamma_i, bias=None)
        output = output + component_output

    # Add the bias once at the end.
    if bias is not None:
        output = output + bias

    return output
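
# Illustrative sketch of how the decomposition and the multi-component forward
# pass fit together (assuming `.quantization.weight_to_ternary` is importable
# and behaves as described above). It prints the relative reconstruction error
# for the chosen k and checks that applying the k ternary components to an
# input matches a dense matmul with the reconstructed matrix
#     W_hat = sum_i gamma_i * W_i.
# The shapes below are arbitrary example values.
def _example_greedy_decomposition(k: int = 3) -> None:
    torch.manual_seed(0)
    out_features, in_features = 8, 16

    W = torch.randn(out_features, in_features)
    x = torch.randn(4, in_features)

    W_ternary, gammas = greedy_ternary_decomposition(W, k)

    # Reconstruct the dense approximation: sum_i gamma_i[:, None] * W_i.
    W_hat = (gammas.unsqueeze(-1) * W_ternary).sum(dim=0)
    rel_err = (W - W_hat).norm().item() / W.norm().item()
    print(f"relative reconstruction error (k={k}): {rel_err:.4f}")

    # The multi-component forward pass is equivalent to a dense matmul with W_hat.
    out = multi_ternary_linear_python(x, W_ternary, gammas)
    assert torch.allclose(out, x @ W_hat.t(), atol=1e-5)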
def activation_quant(x: torch.Tensor, bits: int = 8) -> torch.Tensor:
    """
    Quantize activations for BitLinear.

    BitNet uses activation quantization in addition to weight quantization.
    This function implements per-token absmax quantization for activations.

    Args:
        x: Input activations of shape [..., features]
        bits: Number of bits for quantization (default: 8)

    Returns:
        Quantized activations (as float, not int)

    Notes:
        - Uses absmax scaling per token
        - Returns a float tensor for compatibility with autograd
        - Simulates quantization effects without actual INT8 storage
    """
    # Quantization levels for a symmetric signed range.
    Q_max = 2 ** (bits - 1) - 1  # e.g., 127 for 8-bit
    Q_min = -Q_max               # e.g., -127 for 8-bit

    # Absmax scale per token (last dimension), keeping dims for broadcasting.
    scale = torch.max(torch.abs(x), dim=-1, keepdim=True)[0]

    # Avoid division by zero.
    scale = torch.clamp(scale, min=1e-5)

    # Normalize to the [-1, 1] range.
    x_normalized = x / scale

    # Scale to the quantization range, round, and clamp.
    x_quant_int = torch.clamp(
        torch.round(x_normalized * Q_max),
        min=Q_min,
        max=Q_max,
    )

    # Scale back to the original range (simulated dequantization).
    x_quant = (x_quant_int / Q_max) * scale

    return x_quant
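
# Minimal usage sketch for `activation_quant` (an illustration, not part of the
# training pipeline): it fake-quantizes a random activation tensor, checks the
# elementwise rounding error against the per-token bound scale / (2 * Q_max),
# and feeds the result into `bitlinear_python`, mirroring the BitLinear pattern
# of quantizing activations before the ternary matmul. The straight-through
# estimator mentioned in the comment is an assumption about how training code
# would typically use this function.
def _example_activation_quant() -> None:
    torch.manual_seed(0)
    x = torch.randn(4, 16)

    x_q = activation_quant(x, bits=8)

    # Elementwise error is bounded by scale / (2 * Q_max) with Q_max = 127.
    scale = x.abs().max(dim=-1, keepdim=True)[0].clamp(min=1e-5)
    assert (x - x_q).abs().le(scale / (2 * 127) + 1e-6).all()

    # In training code this would typically be paired with a straight-through
    # estimator, e.g.:  x_q = x + (activation_quant(x) - x).detach()
    W = torch.randint(-1, 2, (8, 16)).float()
    gamma = torch.ones(8)
    out = bitlinear_python(x_q, W, gamma)
    print("output shape:", tuple(out.shape))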