Spaces:
Sleeping
Sleeping
File size: 12,292 Bytes
9679fcd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 |
"""
Comprehensive Wikipedia Ireland Data Extractor
Extracts ALL Ireland-related Wikipedia articles with full content, metadata, and links.
"""
import wikipediaapi
import time
import json
import re
from typing import List, Dict, Set
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
class IrelandWikipediaExtractor:
"""Extract comprehensive Ireland-related Wikipedia content"""
def __init__(self, output_dir="dataset/wikipedia_ireland"):
self.wiki = wikipediaapi.Wikipedia(
user_agent='IrelandKG/1.0 (educational research project)',
language='en',
extract_format=wikipediaapi.ExtractFormat.WIKI,
timeout=60 # Increase timeout to 60 seconds
)
self.output_dir = output_dir
self.ireland_categories = [
"Category:Ireland",
"Category:History of Ireland",
"Category:Geography of Ireland",
"Category:Culture of Ireland",
"Category:Politics of Ireland",
"Category:Economy of Ireland",
"Category:Education in Ireland",
"Category:Irish people",
"Category:Irish language",
"Category:Counties of Ireland",
"Category:Cities and towns in Ireland",
"Category:Buildings and structures in Ireland",
"Category:Sport in Ireland",
"Category:Irish literature",
"Category:Irish music",
"Category:Irish mythology",
"Category:Religion in Ireland",
"Category:Transport in Ireland",
"Category:Science and technology in Ireland",
"Category:Environment of Ireland",
"Category:Northern Ireland",
"Category:Republic of Ireland"
]
def get_category_members(self, category_name: str, depth: int = 2, retries: int = 3) -> Set[str]:
"""Recursively get all pages in a category and its subcategories"""
print(f"[INFO] Fetching category: {category_name} (depth={depth})")
pages = set()
for attempt in range(retries):
try:
cat = self.wiki.page(category_name)
if not cat.exists():
print(f"[WARNING] Category not found: {category_name}")
return pages
break
except Exception as e:
if attempt < retries - 1:
wait_time = (attempt + 1) * 5 # Exponential backoff: 5s, 10s, 15s
print(f"[RETRY] Attempt {attempt + 1} failed: {str(e)[:100]}")
print(f"[RETRY] Waiting {wait_time}s before retry...")
time.sleep(wait_time)
else:
print(f"[ERROR] Failed after {retries} attempts: {e}")
print(f"[ERROR] Skipping category: {category_name}")
return pages
# Add all pages in this category
for page_title in cat.categorymembers.keys():
member = cat.categorymembers[page_title]
if member.ns == wikipediaapi.Namespace.MAIN: # Article namespace
pages.add(page_title)
elif member.ns == wikipediaapi.Namespace.CATEGORY and depth > 0:
# Recursively get subcategory members with rate limiting
time.sleep(1) # Wait 1 second between subcategory requests
subcategory_pages = self.get_category_members(page_title, depth - 1)
pages.update(subcategory_pages)
return pages
def get_all_ireland_pages(self) -> List[str]:
"""Get ALL Ireland-related Wikipedia page titles"""
print("[INFO] Collecting all Ireland-related Wikipedia pages...")
all_pages = set()
# Get pages from all Ireland categories
for idx, category in enumerate(self.ireland_categories, 1):
print(f"[INFO] Processing category {idx}/{len(self.ireland_categories)}: {category}")
pages = self.get_category_members(category, depth=2)
all_pages.update(pages)
print(f"[INFO] Found {len(pages)} pages. Total unique: {len(all_pages)}")
time.sleep(2) # Increased rate limiting to 2 seconds
# Add core Ireland articles that might be missed
core_pages = [
"Ireland",
"Republic of Ireland",
"Northern Ireland",
"Dublin",
"Belfast",
"Irish language",
"History of Ireland",
"Politics of Ireland",
"Economy of Ireland"
]
all_pages.update(core_pages)
print(f"[SUCCESS] Total unique pages found: {len(all_pages)}")
return sorted(list(all_pages))
def extract_article_content(self, page_title: str, retries: int = 3) -> Dict:
"""Extract full article content with metadata"""
for attempt in range(retries):
try:
page = self.wiki.page(page_title)
if not page.exists():
return None
break
except Exception as e:
if attempt < retries - 1:
time.sleep(2)
continue
else:
print(f"[ERROR] Failed to fetch {page_title}: {e}")
return None
try:
# Extract links to other Wikipedia articles
links = [link for link in page.links.keys() if not link.startswith("Category:")]
# Extract categories
categories = [cat for cat in page.categories.keys()]
# Extract sections
sections = self._extract_sections(page)
return {
"title": page.title,
"url": page.fullurl,
"summary": page.summary[:1000] if page.summary else "",
"full_text": page.text,
"text_length": len(page.text),
"links": links[:100], # Limit to avoid huge files
"categories": categories,
"sections": sections,
"backlinks_count": 0, # Will populate later if needed
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
}
except Exception as e:
print(f"[ERROR] Failed to extract {page_title}: {e}")
return None
def _extract_sections(self, page) -> List[Dict]:
"""Extract section structure from Wikipedia page"""
sections = []
def traverse_sections(section_list, level=1):
for section in section_list:
sections.append({
"title": section.title,
"level": level,
"text_length": len(section.text)
})
if hasattr(section, 'sections'):
traverse_sections(section.sections, level + 1)
if hasattr(page, 'sections'):
traverse_sections(page.sections)
return sections
def extract_all_articles(self, page_titles: List[str], max_workers: int = 5, checkpoint_every: int = 100):
"""Extract all articles in parallel with checkpointing"""
import os
checkpoint_file = f"{self.output_dir}/checkpoint_articles.json"
progress_file = f"{self.output_dir}/extraction_progress.json"
# Load existing articles if checkpoint exists
articles = []
extracted_titles = set()
start_index = 0
if os.path.exists(checkpoint_file):
print(f"[RESUME] Found checkpoint file, loading...")
with open(checkpoint_file, 'r', encoding='utf-8') as f:
articles = json.load(f)
extracted_titles = {a['title'] for a in articles}
start_index = len(articles)
print(f"[RESUME] Resuming from {start_index}/{len(page_titles)} articles")
# Filter out already extracted articles
remaining_titles = [t for t in page_titles if t not in extracted_titles]
if not remaining_titles:
print(f"[INFO] All {len(page_titles)} articles already extracted!")
return articles
print(f"[INFO] Extracting {len(remaining_titles)} remaining articles...")
print(f"[INFO] Using {max_workers} parallel workers")
print(f"[INFO] Checkpointing every {checkpoint_every} articles")
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = {executor.submit(self.extract_article_content, title): title
for title in remaining_titles}
with tqdm(total=len(remaining_titles), desc="Extracting articles", initial=0) as pbar:
batch_count = 0
for future in as_completed(futures):
result = future.result()
if result:
articles.append(result)
batch_count += 1
# Checkpoint every N articles
if batch_count % checkpoint_every == 0:
with open(checkpoint_file, 'w', encoding='utf-8') as f:
json.dump(articles, f, ensure_ascii=False, indent=2)
with open(progress_file, 'w') as f:
json.dump({
'total': len(page_titles),
'completed': len(articles),
'remaining': len(page_titles) - len(articles)
}, f)
print(f"\n[CHECKPOINT] Saved progress: {len(articles)}/{len(page_titles)} articles")
pbar.update(1)
# Final save
with open(checkpoint_file, 'w', encoding='utf-8') as f:
json.dump(articles, f, ensure_ascii=False, indent=2)
print(f"[SUCCESS] Extracted {len(articles)} total articles")
return articles
def save_articles(self, articles: List[Dict], filename: str = "ireland_articles.json"):
"""Save articles to JSON file"""
import os
os.makedirs(self.output_dir, exist_ok=True)
output_path = f"{self.output_dir}/{filename}"
# Remove checkpoint file after final save
checkpoint_file = f"{self.output_dir}/checkpoint_articles.json"
if os.path.exists(checkpoint_file):
os.remove(checkpoint_file)
print(f"[CLEANUP] Removed checkpoint file")
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(articles, f, ensure_ascii=False, indent=2)
print(f"[SUCCESS] Saved {len(articles)} articles to {output_path}")
# Save statistics
stats = {
"total_articles": len(articles),
"total_text_length": sum(a["text_length"] for a in articles),
"avg_text_length": sum(a["text_length"] for a in articles) / len(articles),
"total_links": sum(len(a.get("links", [])) for a in articles),
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
}
stats_path = f"{self.output_dir}/extraction_stats.json"
with open(stats_path, 'w') as f:
json.dump(stats, f, indent=2)
print(f"[INFO] Statistics saved to {stats_path}")
return output_path
def run_full_extraction(self):
"""Run complete extraction pipeline"""
print("=" * 80)
print("IRELAND WIKIPEDIA COMPREHENSIVE EXTRACTION")
print("=" * 80)
# Step 1: Get all page titles
page_titles = self.get_all_ireland_pages()
# Save page titles
import os
os.makedirs(self.output_dir, exist_ok=True)
with open(f"{self.output_dir}/page_titles.json", 'w') as f:
json.dump(page_titles, f, indent=2)
# Step 2: Extract all articles
articles = self.extract_all_articles(page_titles)
# Step 3: Save articles
output_path = self.save_articles(articles)
print("=" * 80)
print("EXTRACTION COMPLETE!")
print(f"Output: {output_path}")
print("=" * 80)
return articles
if __name__ == "__main__":
extractor = IrelandWikipediaExtractor()
extractor.run_full_extraction()
|