Spaces:

harao-ml
/

QuickPulse

Running

App Files Files Community

QuickPulse / gather_news.py

harao-ml

Update gather_news.py

869d1ed verified 5 months ago

raw

history blame contribute delete

4.87 kB

	# gather_news.py
	# News Source Integration
	# This script fetches the latest articles from the configured news sources (currently NewsAPI),
	# collects metadata such as title, URL, source, author and publish date, and extracts each article's full content.

	import requests
	import os
	from extract_news import extract_news_articles, create_dataframe, save_to_csv

	# requests applies no timeout by default; without one a stalled connection blocks the caller forever.
	_NEWSAPI_TIMEOUT_SECONDS = 10


	def _collect_article_metadata(articles):
	    """Index NewsAPI article records by URL.

	    A JSON ``null`` arrives as a key that is present with value ``None``, so
	    ``dict.get(key, default)`` would hand back ``None`` instead of the default.
	    Every field therefore falls back with ``or``.

	    Args:
	        articles: Iterable of article dicts taken from the "articles" list of
	            the API response.

	    Returns:
	        A ``(meta_by_url, urls)`` tuple: the metadata dict keyed by article URL,
	        and the URLs in response order (duplicates are kept).
	    """
	    meta_by_url = {}
	    urls = []
	    for article in articles:
	        # "#" is the placeholder the pipeline has always used for a missing URL.
	        article_url = article.get("url") or "#"
	        meta_by_url[article_url] = {
	            "url": article_url,
	            "title": article.get("title") or "",
	            # "source" itself may be null, not only its "name".
	            "source": (article.get("source") or {}).get("name") or "",
	            "author": article.get("author") or "Unknown",
	            "publishedAt": article.get("publishedAt") or "Unknown",
	        }
	        urls.append(article_url)
	    return meta_by_url, urls


	def _merge_with_metadata(extracted_articles, meta_by_url):
	    """Combine extracted article content with the metadata reported by NewsAPI.

	    Args:
	        extracted_articles: Dicts returned by ``extract_news_articles``; this
	            function reads their "original_url", "url", "title" and "text" keys.
	        meta_by_url: Metadata dict built by ``_collect_article_metadata``.

	    Returns:
	        A list of dicts with the keys "url", "title", "source", "author",
	        "publishedAt" and "text".
	    """
	    merged_articles = []
	    for art in extracted_articles:
	        extracted_title = art.get("title")
	        meta = meta_by_url.get(art.get("original_url"))
	        if not meta:
	            # The extractor reported a URL we never requested (or none at all):
	            # fall back to neutral metadata rather than dropping the article.
	            meta = {
	                "title": extracted_title or "Untitled",
	                "source": "",
	                "author": "Unknown",
	                "publishedAt": "Unknown",
	            }
	        # Prefer the extracted title unless it is empty or the "Untitled" placeholder.
	        has_real_title = bool(extracted_title) and extracted_title != "Untitled"
	        merged_articles.append({
	            "url": art.get("url"),
	            "title": extracted_title if has_real_title else meta["title"],
	            "source": meta["source"],
	            "author": meta["author"],
	            "publishedAt": meta["publishedAt"],
	            "text": art.get("text", ""),
	        })
	    return merged_articles


	def _fetch_newsapi(endpoint, label, query_params, min_length):
	    """Query one NewsAPI endpoint and return its articles with extracted text.

	    Every failure mode (missing key, network error, non-200 status, invalid
	    JSON, empty result) is reported with ``print`` and yields an empty list, so
	    callers only ever have to deal with a list.

	    Args:
	        endpoint: Full URL of the NewsAPI endpoint.
	        label: Human-readable endpoint name used in the printed messages.
	        query_params: Endpoint-specific query parameters; "apiKey" and
	            "language" are added here.
	        min_length: Forwarded unchanged to ``extract_news_articles``
	            (presumably the minimum extracted text length - confirm there).

	    Returns:
	        A list of merged article dicts (see ``_merge_with_metadata``), or ``[]``.
	    """
	    api_key = os.environ.get("api_key")
	    if not api_key:
	        print(f"Error: Failed to fetch news from {label}. Environment variable 'api_key' is not set.")
	        return []

	    params = {"apiKey": api_key, "language": "en"}
	    params.update(query_params)

	    try:
	        response = requests.get(endpoint, params=params, timeout=_NEWSAPI_TIMEOUT_SECONDS)
	    except requests.RequestException as exc:
	        # Print only the exception type: the message can embed the request URL,
	        # and the API key travels in its query string.
	        print(f"Error: Failed to fetch news from {label}. Request failed: {type(exc).__name__}")
	        return []

	    if response.status_code != 200:
	        print(f"Error: Failed to fetch news from {label}. Status code: {response.status_code}")
	        return []

	    try:
	        payload = response.json()
	    except ValueError:
	        print(f"Error: Failed to fetch news from {label}. Response was not valid JSON.")
	        return []

	    articles = payload.get("articles") if isinstance(payload, dict) else None
	    if not articles:
	        print(f"No articles found in {label}.")
	        return []

	    meta_by_url, urls = _collect_article_metadata(articles)
	    print(f"Fetched {len(urls)} article URLs from {label}.")

	    extracted_articles = extract_news_articles(urls, min_length=min_length)
	    merged_articles = _merge_with_metadata(extracted_articles, meta_by_url)
	    print(f"Usable articles after extraction ({label}): {len(merged_articles)}")
	    return merged_articles


	def fetch_newsapi_top_headlines(min_length=100, max_articles=25):
	    """Fetch English top headlines from NewsAPI and extract their full text.

	    Args:
	        min_length: Forwarded to ``extract_news_articles`` as ``min_length``.
	        max_articles: Sent to NewsAPI as ``pageSize``.

	    Returns:
	        A list of dicts with the keys "url", "title", "source", "author",
	        "publishedAt" and "text"; an empty list on any failure.
	    """
	    return _fetch_newsapi(
	        "https://newsapi.org/v2/top-headlines",
	        "NewsAPI Top Headlines",
	        {"pageSize": max_articles},
	        min_length,
	    )


	def fetch_newsapi_everything(topic, min_length=100, max_articles=50):
	    """Search NewsAPI's "everything" endpoint for a topic, newest first.

	    Args:
	        topic: Search query, sent to NewsAPI as ``q``.
	        min_length: Forwarded to ``extract_news_articles`` as ``min_length``.
	        max_articles: Sent to NewsAPI as ``pageSize``.

	    Returns:
	        A list of dicts with the keys "url", "title", "source", "author",
	        "publishedAt" and "text"; an empty list on any failure.
	    """
	    return _fetch_newsapi(
	        "https://newsapi.org/v2/everything",
	        "NewsAPI Everything",
	        {"q": topic, "pageSize": max_articles, "sortBy": "publishedAt"},
	        min_length,
	    )

	def fetch_articles(topic=None, min_length=100, max_articles=25):
	    """Fetch articles for a topic, or the top headlines when no topic is given.

	    Args:
	        topic: Optional search query. ``None``, an empty string or a
	            whitespace-only string selects the top-headlines feed.
	        min_length: Passed through to the selected fetcher.
	        max_articles: Passed through to the selected fetcher.

	    Returns:
	        Whatever the selected fetcher returns.
	    """
	    has_topic = bool(topic) and bool(topic.strip())
	    if not has_topic:
	        return fetch_newsapi_top_headlines(min_length=min_length, max_articles=max_articles)
	    # The topic is forwarded exactly as received (not stripped).
	    return fetch_newsapi_everything(topic, min_length=min_length, max_articles=max_articles)