from huggingface_hub import PyTorchModelHubMixin

# ... (rest of your model code)

import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# --- Hyperparameters (You can adjust these later) ---
# For a "Tiny" LLM, we keep the size very small.
n_embed = 64   # C: Embedding dimension (size of the vector representing a character)
n_head = 4     # H: Number of attention heads
n_layer = 4    # Number of repeating Transformer blocks
dropout = 0.1  # Dropout rate


# --- 1. Causal Self-Attention (The "Attention is All You Need" Component) ---
class CausalSelfAttention(nn.Module):
    """A multi-head masked self-attention module."""

    def __init__(self, n_embed, n_head, block_size, dropout):
        super().__init__()
        self.n_embed = n_embed
        self.n_head = n_head
        self.head_size = n_embed // n_head

        # Combined projection for Q, K, and V (more efficient than three separate layers)
        self.c_attn = nn.Linear(n_embed, 3 * n_embed, bias=False)
        # Output projection
        self.c_proj = nn.Linear(n_embed, n_embed, bias=False)

        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)

        # Causal mask (tril = lower triangular matrix).
        # This mask prevents a token from attending to future tokens (autoregressive).
        self.register_buffer(
            'tril',
            torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size)
        )

    def forward(self, x):
        B, T, C = x.shape  # Batch size, Sequence length (Time), Embedding dimension (Channel)

        # 1. Compute Q, K, V in one pass and split; q, k, v are each (B, T, C)
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embed, dim=2)

        # 2. Reshape for multi-head attention: (B, T, C) -> (B, H, T, Head_size).
        #    Each head processes a smaller chunk of the dimension C.
        k = k.view(B, T, self.n_head, self.head_size).transpose(1, 2)
        q = q.view(B, T, self.n_head, self.head_size).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.head_size).transpose(1, 2)

        # 3. Scaled dot-product attention: (B, H, T, T)
        #    wei = (q @ k.transpose(-2, -1)) / sqrt(Head_size)
        wei = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(self.head_size))

        # 4. Apply the causal mask: set attention scores to -inf for future tokens (where tril == 0)
        wei = wei.masked_fill(self.tril[:, :, :T, :T] == 0, float('-inf'))

        # 5. Softmax and dropout
        wei = F.softmax(wei, dim=-1)
        wei = self.attn_dropout(wei)

        # 6. Compute the weighted sum of values: (B, H, T, Head_size)
        out = wei @ v

        # 7. Re-assemble heads: (B, H, T, Head_size) -> (B, T, C)
        out = out.transpose(1, 2).contiguous().view(B, T, C)

        # 8. Final linear projection
        out = self.resid_dropout(self.c_proj(out))
        return out


# --- 2. Feed Forward Network (FFN) ---
class FeedForward(nn.Module):
    """A two-layer MLP for processing attention output."""

    def __init__(self, n_embed, dropout):
        super().__init__()
        self.net = nn.Sequential(
            # Standard ratio: hidden width is 4x the embedding size
            nn.Linear(n_embed, 4 * n_embed),
            nn.GELU(),  # Modern activation function (smoother than ReLU)
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
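
# (Optional) Illustrative shape check for the two sub-layers above.
# This is a minimal sketch, not part of the model: the block_size of 32 and the
# random batch below are arbitrary demo values chosen just for this example.
def _demo_sublayer_shapes():
    B, T = 2, 32                                    # demo batch size and sequence length
    attn = CausalSelfAttention(n_embed, n_head, block_size=T, dropout=dropout)
    ffn = FeedForward(n_embed, dropout)
    x = torch.randn(B, T, n_embed)                  # (B, T, C)
    print(attn(x).shape)                            # torch.Size([2, 32, 64]) -- shape preserved
    print(ffn(x).shape)                             # torch.Size([2, 32, 64]) -- shape preserved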

# --- 3. Transformer Block (The Repeating Unit) ---
class TransformerBlock(nn.Module):
    """A standard Transformer decoder block with attention and FFN."""

    def __init__(self, n_embed, n_head, block_size, dropout):
        super().__init__()
        # LayerNorm applied BEFORE each sub-layer (Pre-Norm style)
        self.ln_1 = nn.LayerNorm(n_embed)
        self.attn = CausalSelfAttention(n_embed, n_head, block_size, dropout)
        self.ln_2 = nn.LayerNorm(n_embed)
        self.ffn = FeedForward(n_embed, dropout)

    def forward(self, x):
        # 1. Attention with residual connection and LayerNorm
        x = x + self.attn(self.ln_1(x))
        # 2. FFN with residual connection and LayerNorm
        x = x + self.ffn(self.ln_2(x))
        return x


# --- 4. The Final TinyLLM Model ---
class TinyLLM(nn.Module, PyTorchModelHubMixin):
    """The complete decoder-only Transformer model."""

    def __init__(self, vocab_size, n_embed, n_head, n_layer, block_size, dropout):
        super().__init__()
        self.block_size = block_size

        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        # Positional embedding: a learned table of position vectors
        self.position_embedding_table = nn.Embedding(block_size, n_embed)

        # Stack of Transformer blocks
        self.blocks = nn.Sequential(*[
            TransformerBlock(n_embed, n_head, block_size, dropout)
            for _ in range(n_layer)
        ])
        self.ln_f = nn.LayerNorm(n_embed)  # Final LayerNorm

        # Linear layer to map the embedding vector back to the vocabulary space
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        # idx is the input tensor X of shape (B, T)
        B, T = idx.shape

        # 1. Token and positional embeddings
        # Token embedding: (B, T, C)
        tok_emb = self.token_embedding_table(idx)
        # Position embedding: (T, C), broadcast to (B, T, C) when added
        pos = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(pos)

        # 2. Combine (add) the embeddings
        x = tok_emb + pos_emb  # (B, T, C)

        # 3. Pass through the Transformer blocks
        x = self.blocks(x)  # (B, T, C)

        # 4. Final LayerNorm and linear head
        x = self.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        loss = None
        if targets is not None:
            # Reshape for CrossEntropyLoss: (B*T, vocab_size) and (B*T)
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            # Compute the negative log-likelihood loss
            loss = F.cross_entropy(logits, targets)

        return logits, loss
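
# --- 5. (Optional) Quick Sanity Check and Sampling Sketch ---
# A minimal, illustrative usage example, not part of the model definition above.
# vocab_size = 65 and block_size = 128 are assumed demo values (e.g. a small
# character-level vocabulary); swap in the values from your own tokenizer/config.
if __name__ == "__main__":
    vocab_size, block_size = 65, 128
    model = TinyLLM(vocab_size, n_embed, n_head, n_layer, block_size, dropout)

    # Forward pass with random token ids and random targets.
    # Note: when targets are given, the returned logits are flattened to (B*T, vocab_size).
    idx = torch.randint(0, vocab_size, (4, block_size))      # (B, T)
    targets = torch.randint(0, vocab_size, (4, block_size))  # (B, T)
    logits, loss = model(idx, targets)
    print(logits.shape, loss.item())  # torch.Size([512, 65]); loss near ln(65) ~ 4.17 at init

    # Greedy autoregressive sampling from a single start token (id 0)
    model.eval()
    out = torch.zeros((1, 1), dtype=torch.long)
    with torch.no_grad():
        for _ in range(50):
            idx_cond = out[:, -block_size:]                  # crop to the context window
            logits, _ = model(idx_cond)                      # (1, T, vocab_size)
            next_id = logits[:, -1, :].argmax(dim=-1, keepdim=True)
            out = torch.cat([out, next_id], dim=1)
    print(out.shape)  # torch.Size([1, 51])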