import logging
from typing import Callable, List

logger = logging.getLogger(__name__)


def chunk_text(text: str, chunk_size: int = 1500, overlap: int = 200) -> List[str]:
    """Split text into overlapping chunks for processing long documents."""
    chunks = []
    start = 0
    text_length = len(text)

    # If the text is shorter than chunk_size, return it as a single chunk
    if text_length <= chunk_size:
        return [text]

    while start < text_length:
        end = min(start + chunk_size, text_length)

        # Try to break at a sentence boundary instead of mid-sentence
        if end < text_length:
            sentence_end = max(
                text.rfind('. ', start, end),
                text.rfind('? ', start, end),
                text.rfind('! ', start, end),
            )
            # Only accept the boundary if it falls in the last 30% of the
            # window, so chunks stay reasonably full
            if sentence_end > start + chunk_size * 0.7:
                end = sentence_end + 1

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Stop once the end of the text has been consumed; stepping back by
        # `overlap` here would re-emit the tail as a duplicate chunk
        if end >= text_length:
            break
        start = end - overlap if end - overlap > start else end

    return chunks
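

# A minimal usage sketch (added for illustration; not part of the original
# module). The sample text is hypothetical. Exact chunk boundaries depend on
# where '. ', '? ', or '! ' last occur inside each window, but every chunk
# stays at or under chunk_size, and neighbouring chunks share roughly
# `overlap` characters of context.
def _demo_chunk_text() -> None:
    doc = "This is a short sentence. " * 300  # roughly 7,800 characters
    parts = chunk_text(doc, chunk_size=1500, overlap=200)
    assert all(len(p) <= 1500 for p in parts)
    logger.info(f"Split {len(doc):,} characters into {len(parts)} chunks")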
def chunked_summarize(
    text: str,
    summarize_func: Callable[[str], str],
    max_chunk_size: int = 1500,
    overlap: int = 200,
) -> str:
    """Summarize long text by summarizing chunks and combining the results."""
    # Short texts can be summarized in a single pass
    if len(text) <= max_chunk_size:
        return summarize_func(text)

    text_chunks = chunk_text(text, chunk_size=max_chunk_size, overlap=overlap)
    logger.info(f"Processing {len(text_chunks)} chunks...")

    partial_summaries = []
    for i, chunk in enumerate(text_chunks):
        logger.info(f"Summarizing chunk {i + 1}/{len(text_chunks)}...")
        try:
            summary = summarize_func(chunk)
            # Keep only non-trivial summaries
            if summary and len(summary.strip()) > 10:
                partial_summaries.append(summary)
        except Exception as e:
            logger.warning(f"Failed to summarize chunk {i + 1}: {e}")
            # Fall back to the start of the original chunk
            partial_summaries.append(chunk[:200] + "...")

    if not partial_summaries:
        return "Unable to generate summary from the document."

    combined_summary_input = " ".join(partial_summaries)

    # Run a final pass if the combined summaries are still too long
    if len(combined_summary_input) > max_chunk_size:
        logger.info("Final summarization of combined chunks...")
        try:
            return summarize_func(combined_summary_input)
        except Exception as e:
            logger.error(f"Final summarization failed: {e}")
            # Fall back to the concatenated partial summaries
            return combined_summary_input

    return combined_summary_input
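

# A hedged example of plugging in a summarizer (added for illustration).
# chunked_summarize only needs a callable mapping text -> summary; the
# truncating function below is a hypothetical stand-in for a real model
# call (e.g. an LLM API or a transformers pipeline).
def _demo_chunked_summarize() -> None:
    def naive_summarizer(t: str) -> str:
        return t[:120]  # stand-in "summary": the first 120 characters

    doc = "Background material that repeats. " * 300  # well over one chunk
    result = chunked_summarize(doc, naive_summarizer, max_chunk_size=1500)
    logger.info(f"Combined summary is {len(result)} characters long")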
def estimate_reading_time(text: str, words_per_minute: int = 200) -> int:
    """Estimate reading time in minutes (minimum of one minute)."""
    word_count = len(text.split())
    return max(1, round(word_count / words_per_minute))
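

# Minimal smoke test (a sketch added for illustration; the sample string and
# the _demo_* helpers above are hypothetical, not part of the original module).
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sample = "One more plain sentence. " * 250  # ~6,250 chars, ~1,000 words
    print(f"Chunks: {len(chunk_text(sample))}")
    print(f"Estimated reading time: {estimate_reading_time(sample)} min")
    _demo_chunk_text()
    _demo_chunked_summarize()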