import logging
from typing import List, Callable

logger = logging.getLogger(__name__)
def chunk_text(text: str, chunk_size: int = 1500, overlap: int = 200) -> List[str]:
    """
    Split text into overlapping chunks for processing long documents.
    """
    chunks = []
    start = 0
    text_length = len(text)

    # If the text fits in a single chunk, return it as-is
    if text_length <= chunk_size:
        return [text]

    while start < text_length:
        end = min(start + chunk_size, text_length)

        # Try to break at a sentence boundary
        if end < text_length:
            # Find the last sentence end within the chunk
            sentence_end = max(
                text.rfind('. ', start, end),
                text.rfind('? ', start, end),
                text.rfind('! ', start, end)
            )
            # Only use it if it falls in the last 30% of the chunk,
            # so chunks don't become too short
            if sentence_end > start + chunk_size * 0.7:
                end = sentence_end + 1

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Step forward, keeping `overlap` characters of context
        start = end - overlap if end - overlap > start else end

        # Safety net against an infinite loop
        if start >= text_length:
            break

    return chunks
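# Usage sketch for chunk_text (a hedged example; `long_article` and `process`
# are hypothetical placeholders, not part of this module):
#
#   pieces = chunk_text(long_article, chunk_size=1000, overlap=100)
#   for piece in pieces:
#       process(piece)  # each piece is at most 1000 characters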
def chunked_summarize(
    text: str,
    summarize_func: Callable[[str], str],
    max_chunk_size: int = 1500,
    overlap: int = 200
) -> str:
    """
    Summarize long text by processing it in chunks and combining the results.
    """
    if len(text) <= max_chunk_size:
        return summarize_func(text)

    text_chunks = chunk_text(text, chunk_size=max_chunk_size, overlap=overlap)
    logger.info(f"Processing {len(text_chunks)} chunks...")

    partial_summaries = []
    for i, chunk in enumerate(text_chunks):
        logger.info(f"Summarizing chunk {i + 1}/{len(text_chunks)}...")
        try:
            summary = summarize_func(chunk)
            # Keep only non-trivial summaries
            if summary and len(summary.strip()) > 10:
                partial_summaries.append(summary)
        except Exception as e:
            logger.warning(f"Failed to summarize chunk {i + 1}: {e}")
            # Fall back to a truncated slice of the original chunk
            partial_summaries.append(chunk[:200] + "...")

    if not partial_summaries:
        return "Unable to generate summary from the document."

    combined_summary_input = " ".join(partial_summaries)

    # Run a final summarization pass if the combined text is still long
    if len(combined_summary_input) > max_chunk_size:
        logger.info("Final summarization of combined chunks...")
        try:
            return summarize_func(combined_summary_input)
        except Exception as e:
            logger.error(f"Final summarization failed: {e}")
            # Fall back to the concatenated partial summaries
            return combined_summary_input

    return combined_summary_input
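# Usage sketch for chunked_summarize. Any callable mapping str -> str works as
# summarize_func; the Hugging Face pipeline below is one plausible choice, not
# something this module imports or requires (assumption: `transformers` is
# installed and `document_text` holds the document):
#
#   from transformers import pipeline
#   summarizer = pipeline("summarization")
#   result = chunked_summarize(
#       document_text,
#       summarize_func=lambda t: summarizer(t)[0]["summary_text"],
#   )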
def estimate_reading_time(text: str, words_per_minute: int = 200) -> int:
    """
    Estimate reading time in minutes.
    """
    word_count = len(text.split())
    return max(1, round(word_count / words_per_minute))
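# A minimal end-to-end sketch, runnable as a script. The first-sentence
# "summarizer" is a hypothetical stand-in for a real model so the demo has no
# external dependencies; the sample text and sizes are illustrative only.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    def first_sentence(text: str) -> str:
        # Stand-in summarizer: return everything up to the first period
        period = text.find('. ')
        return text[:period + 1] if period != -1 else text[:100]

    sample = (
        "Chunked summarization splits a long document into overlapping "
        "windows. Each window is summarized independently. The partial "
        "summaries are then joined and, if still long, summarized again. "
    ) * 40

    print(chunked_summarize(sample, first_sentence, max_chunk_size=500, overlap=50))
    print(f"Estimated reading time: {estimate_reading_time(sample)} min")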