Spaces:
Running
Running
# extract_news.py
# This script extracts full content from news articles using the newspaper3k library.
| import logging | |
| import pandas as pd | |
| from newspaper import Article | |
def extract_full_content(url, min_length=100):
    """Download and parse a single article and return its text and title.

    Args:
        url: Address of the article to fetch.
        min_length: Minimum number of characters the stripped article text
            must contain for the article to be accepted.

    Returns:
        A dict with the keys ``"url"``, ``"text"`` and ``"title"``, or
        ``None`` when the URL is missing/blank, when downloading or parsing
        fails, or when the extracted text is shorter than ``min_length``.
    """
    # Reject missing or blank URLs up front rather than attempting a download
    # that cannot succeed.
    if not url or (isinstance(url, str) and not url.strip()):
        logging.error(
            "Failed to extract content from %s: empty or missing URL", url
        )
        return None

    try:
        article = Article(url)
        article.download()
        article.parse()

        # Treat a missing value like an empty string so stripping never fails.
        text = (article.text or "").strip()
        # A title that is missing *or* only whitespace falls back to "Untitled".
        title = (article.title or "").strip() or "Untitled"

        if len(text) < min_length:
            logging.warning("Extracted content is too short from %s.", url)
            return None
        return {"url": url, "text": text, "title": title}
    except Exception as e:
        # Deliberate best-effort boundary: one bad article must not abort a
        # whole batch, so the failure is logged and reported as None.
        logging.error("Failed to extract content from %s: %s", url, e)
        return None
def extract_news_articles(urls, min_length=100):
    """Extract every article in ``urls`` that yields usable text.

    Args:
        urls: Iterable of article URLs. ``None`` is treated as "no URLs" and
            a single URL given as a plain string is treated as a one-item
            collection.
        min_length: Minimum text length, forwarded to
            ``extract_full_content`` for each URL.

    Returns:
        A list of the dicts returned by ``extract_full_content`` for the URLs
        that succeeded, each with an added ``"original_url"`` key holding the
        URL it was requested with. URLs that fail are skipped.
    """
    if urls is None:
        return []
    if isinstance(urls, str):
        # A bare string would otherwise be iterated character by character,
        # and every "URL" would fail.
        urls = [urls]

    extracted_articles = []
    for url in urls:
        article = extract_full_content(url, min_length=min_length)
        # Skip failures (None) and any result without text.
        if article and article.get("text"):
            article["original_url"] = url
            extracted_articles.append(article)
    return extracted_articles
def create_dataframe(articles):
    """Return a pandas DataFrame built from the given article records."""
    frame = pd.DataFrame(data=articles)
    return frame
def save_to_csv(df, filename):
    """Write ``df`` to ``filename`` as CSV without the index column."""
    include_index = False
    df.to_csv(filename, index=include_index)