import logging
from typing import Optional

from transformers import pipeline, AutoTokenizer

logger = logging.getLogger(__name__)

# Global summarizer instance for better performance
_summarizer = None
_tokenizer = None


def get_summarizer(model_name: str = "facebook/bart-large-cnn"):
"""Get or create summarizer instance with caching"""
global _summarizer, _tokenizer
if _summarizer is None:
try:
_summarizer = pipeline(
"summarization",
model=model_name,
tokenizer=model_name
)
_tokenizer = AutoTokenizer.from_pretrained(model_name)
logger.info(f"Summarizer model {model_name} loaded successfully")
except Exception as e:
logger.error(f"Failed to load summarizer: {e}")
raise
return _summarizer, _tokenizer
def summarize_text(
    text: str,
    model_name: str = "facebook/bart-large-cnn",
    max_length: int = 500,
    min_length: int = 200,
    compression_ratio: Optional[float] = None,
) -> str:
    """
    Summarize text using transformer models with enhanced error handling.
    """
    try:
        summarizer, tokenizer = get_summarizer(model_name)

        # If the text is too short, return it as is
        if len(text.split()) < 30:
            return text

        # Calculate appropriate lengths
        word_count = len(text.split())
        if compression_ratio:
            max_length = min(max_length, int(word_count * compression_ratio))
            min_length = min(min_length, max_length // 2)
        else:
            # Adaptive length calculation based on input size
            if word_count < 100:
                max_length = min(100, word_count - 10)
                min_length = max(30, max_length // 2)
            elif word_count < 500:
                max_length = min(150, word_count // 3)
                min_length = max(50, max_length // 2)
            else:
                max_length = min(max_length, word_count // 4)
                min_length = min(min_length, max_length // 3)

        # Ensure min_length < max_length
        min_length = min(min_length, max_length - 1)

        # Tokenize to check length and truncate if the input exceeds the model limit
        tokens = tokenizer.encode(text)
        if len(tokens) > tokenizer.model_max_length:
            tokens = tokens[:tokenizer.model_max_length - 100]
            text = tokenizer.decode(tokens, skip_special_tokens=True)

        logger.info(f"Summarizing text: {word_count} words -> max_length {max_length}")
        summary = summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
            clean_up_tokenization_spaces=True,
        )
        result = summary[0]['summary_text'].strip()
        if not result or len(result.split()) < 3:
            raise ValueError("Generated summary is too short or empty")
        return result
    except Exception as e:
        logger.error(f"Summarization error: {e}")
        # Enhanced fallback: extract key sentences instead of failing outright
        return extract_key_sentences(text, max(1, min(3, max_length // 50)))


def extract_key_sentences(text: str, num_sentences: int = 3) -> str:
"""
Fallback method to extract key sentences when summarization fails
"""
sentences = text.split('.')
meaningful_sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
if not meaningful_sentences:
return text[:500] + "..." if len(text) > 500 else text
# Simple heuristic: take first, middle, and last sentences
if len(meaningful_sentences) <= num_sentences:
return '. '.join(meaningful_sentences) + '.'
key_indices = [0] # First sentence
# Add a middle sentence
if len(meaningful_sentences) > 2:
key_indices.append(len(meaningful_sentences) // 2)
# Add last sentence
key_indices.append(len(meaningful_sentences) - 1)
key_sentences = [meaningful_sentences[i] for i in key_indices[:num_sentences]]
return '. '.join(key_sentences) + '.' |
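

# Minimal usage sketch, for illustration only: it assumes the
# facebook/bart-large-cnn weights can be downloaded in this environment and
# that the sample paragraph below is representative input. If the model cannot
# be loaded, summarize_text falls back to extract_key_sentences, so a plain
# string is still printed.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sample_text = (
        "Transformer-based summarization models condense long passages into "
        "shorter ones while trying to preserve the key information in the "
        "original text. This demo feeds a short, article-like paragraph "
        "through summarize_text and prints the result. Because summarize_text "
        "falls back to extract_key_sentences when the model cannot be loaded, "
        "a usable string is returned either way."
    )
    print(summarize_text(sample_text, max_length=60, min_length=20))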