Spaces:
Runtime error
Runtime error
| """ | |
| Intelligent Benchmark & Exam Scraper | |
| Scrapes the web to find domain-specific questions, scenarios, and test content. | |
| Automatically builds comprehensive benchmarks for any use case. | |
| """ | |
import json
import re
import time
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib.parse import parse_qs, quote, quote_plus, urljoin, urlparse

import requests
from bs4 import BeautifulSoup
class IntelligentBenchmarkScraper:
    """
    Scrapes web sources to build domain-specific benchmarks and exams.

    Features:
    - Web search for relevant content (DuckDuckGo HTML endpoint, no API key)
    - Scraping of the pages returned by the search (or of fallback sources)
    - Regex-based question extraction with a minimum-length filter
    - Optional LLM (OpenAI / Anthropic) or template-based question generation
    - Benchmark formatting and JSON export
    """

    # Search endpoint; also the base used to resolve relative result links.
    _SEARCH_BASE_URL = "https://html.duckduckgo.com/html/"

    # "Question? Answer sentence." starting at the beginning of a line.
    _QA_PATTERN = re.compile(r'(?:^|\n)([^.!?]*\?)\s*([^.!?]+[.!?])')
    # Numbered ("1." / "1)") or bulleted questions at the beginning of a line.
    _NUMBERED_PATTERN = re.compile(
        r'(?:^|\n)\s*(?:\d+[\.\)]\s*|[•\-\*]\s*)([^.!?]*\?)'
    )
    # Questions/answers at or below this length are treated as noise.
    _MIN_TEXT_LENGTH = 20

    # Punctuation stripped from candidate terms in template generation.
    _TERM_STRIP_CHARS = '.,;:!?()[]{}"\''

    _QUESTION_TEMPLATES = (
        "What is {term}?",
        "Explain the concept of {term}.",
        "How does {term} work in the context of {domain}?",
        "What are the key aspects of {term}?",
    )

    # (trigger substrings, extra sources); only the first matching group is used.
    _DOMAIN_FALLBACKS = (
        (
            ('financial', 'finance'),
            (
                {
                    'title': "Investopedia: Financial Certification Exams",
                    'url': "https://www.investopedia.com/",
                    'snippet': "Financial education and exam prep",
                },
                {
                    'title': "CFP Board Practice Questions",
                    'url': "https://www.cfp.net/",
                    'snippet': "CFP certification resources",
                },
            ),
        ),
        (
            ('medical', 'health'),
            (
                {
                    'title': "NCBI Medical Resources",
                    'url': "https://www.ncbi.nlm.nih.gov/",
                    'snippet': "Medical knowledge base",
                },
                {
                    'title': "MedlinePlus Health Topics",
                    'url': "https://medlineplus.gov/",
                    'snippet': "Consumer health information",
                },
            ),
        ),
        (
            ('legal', 'law'),
            (
                {
                    'title': "Cornell Legal Information Institute",
                    'url': "https://www.law.cornell.edu/",
                    'snippet': "Free legal resources and case law",
                },
            ),
        ),
    )

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize scraper.

        Args:
            api_key: OpenAI/Anthropic key for question generation from
                scraped content. Without it, template questions are used.
        """
        self.api_key = api_key
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    # ------------------------------------------------------------------
    # Search
    # ------------------------------------------------------------------

    @classmethod
    def _resolve_result_url(cls, href: str) -> str:
        """
        Convert a search-result href into a directly fetchable URL.

        Scheme-relative and relative links are resolved against the search
        endpoint, and DuckDuckGo redirect links (``/l/?uddg=<target>``) are
        unwrapped to their target. Anything else is returned resolved but
        otherwise unchanged.
        """
        if not href:
            return href
        absolute = urljoin(cls._SEARCH_BASE_URL, href)
        parsed = urlparse(absolute)
        host = parsed.netloc.lower()
        is_ddg_host = host == 'duckduckgo.com' or host.endswith('.duckduckgo.com')
        if is_ddg_host and parsed.path.startswith('/l/'):
            # parse_qs already percent-decodes the embedded target URL.
            targets = parse_qs(parsed.query).get('uddg')
            if targets:
                return targets[0]
        return absolute

    def search_web(self, query: str, num_results: int = 10) -> List[Dict]:
        """
        Search the web for relevant content using DuckDuckGo (no API key needed).

        Args:
            query: Search query
            num_results: Maximum number of results to return

        Returns:
            List of search results with 'title', 'url' and 'snippet' keys.
            If the request fails, or nothing could be parsed from the
            response, the generic fallback sources for the query are
            returned instead.
        """
        results: List[Dict] = []
        try:
            search_url = f"{self._SEARCH_BASE_URL}?q={quote_plus(query)}"
            response = self.session.get(search_url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            for result_div in soup.find_all('div', class_='result'):
                if len(results) >= num_results:
                    break
                title_elem = result_div.find('a', class_='result__a')
                snippet_elem = result_div.find('a', class_='result__snippet')
                if not (title_elem and snippet_elem):
                    continue
                href = title_elem.get('href')
                if not href:
                    continue
                results.append({
                    'title': title_elem.get_text(strip=True),
                    # Result links are redirect wrappers; unwrap them so the
                    # URL can be fetched by scrape_content().
                    'url': self._resolve_result_url(href),
                    'snippet': snippet_elem.get_text(strip=True),
                })
        except Exception as e:
            # Best-effort search: any failure degrades to the fallback list.
            print(f"Search error: {e}")
            return self._get_fallback_sources(query)

        if not results and num_results > 0:
            # A 200 response with no parseable results (layout change,
            # bot-check page) would otherwise leave the pipeline empty.
            return self._get_fallback_sources(query)
        return results

    def _get_fallback_sources(self, query: str) -> List[Dict]:
        """
        Provide fallback educational sources when search fails.

        Always returns a Wikipedia entry derived from the query, followed by
        the sources of the first domain group (finance, medical, legal)
        whose trigger word occurs in the lower-cased query.
        """
        domain_keywords = query.lower()
        # Quote the topic so characters such as '?' or '#' stay in the path.
        wiki_topic = quote(query.replace(' ', '_'))
        sources = [{
            'title': f"Wikipedia: {query}",
            'url': f"https://en.wikipedia.org/wiki/{wiki_topic}",
            'snippet': f"Comprehensive overview of {query}",
        }]

        for triggers, extras in self._DOMAIN_FALLBACKS:
            if any(trigger in domain_keywords for trigger in triggers):
                # Copy so callers may mutate results without touching the table.
                sources.extend(dict(extra) for extra in extras)
                break
        return sources

    # ------------------------------------------------------------------
    # Scraping
    # ------------------------------------------------------------------

    @staticmethod
    def _clean_text(raw_text: str, max_chars: int = 10000) -> str:
        """
        Normalize scraped text: collapse whitespace inside each line, drop
        empty lines, keep line breaks, and truncate to ``max_chars``.

        Line breaks are kept on purpose: the question-extraction patterns
        anchor on line starts and would match almost nothing on one line.
        """
        collapsed = (' '.join(line.split()) for line in raw_text.splitlines())
        return '\n'.join(line for line in collapsed if line)[:max_chars]

    def scrape_content(self, url: str, max_chars: int = 10000) -> str:
        """
        Scrape text content from a URL.

        Args:
            url: URL to scrape
            max_chars: Maximum number of characters kept per page

        Returns:
            Extracted text content (one cleaned line per source line), or an
            empty string if the page could not be fetched or parsed.
        """
        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove non-content elements before extracting text.
            for element in soup(['script', 'style', 'header', 'footer', 'nav']):
                element.decompose()

            return self._clean_text(soup.get_text(), max_chars)
        except Exception as e:
            # Best-effort scraping: a failing page is skipped by the caller.
            print(f"Scraping error for {url}: {e}")
            return ""

    # ------------------------------------------------------------------
    # Question extraction
    # ------------------------------------------------------------------

    def extract_questions_from_text(self, text: str, max_questions: int = 20) -> List[Dict]:
        """
        Extract questions from text using pattern matching.

        Two patterns are applied, each contributing up to half of
        ``max_questions`` (at least one): questions followed by an answer
        sentence, and numbered/bulleted questions without an answer. The
        length filter is applied before the quota, so short noise matches
        do not use up slots.

        Args:
            text: Text content to analyze
            max_questions: Maximum questions to extract

        Returns:
            List of question dicts with 'question', 'answer' and 'type' keys
        """
        if not text or max_questions <= 0:
            return []

        per_pattern = max(1, max_questions // 2)
        min_len = self._MIN_TEXT_LENGTH
        questions: List[Dict] = []

        # Pattern 1: questions with answers, e.g. "What is X? Y is..."
        with_answer = 0
        for raw_question, raw_answer in self._QA_PATTERN.findall(text):
            if with_answer >= per_pattern:
                break
            question = raw_question.strip()
            answer = raw_answer.strip()
            if len(question) > min_len and len(answer) > min_len:
                questions.append({
                    'question': question,
                    'answer': answer,
                    'type': 'extracted',
                })
                with_answer += 1

        # Pattern 2: numbered/bulleted questions (no answer available)
        without_answer = 0
        for raw_question in self._NUMBERED_PATTERN.findall(text):
            if without_answer >= per_pattern:
                break
            question = raw_question.strip()
            if len(question) > min_len:
                questions.append({
                    'question': question,
                    'answer': "",
                    'type': 'extracted_no_answer',
                })
                without_answer += 1

        return questions[:max_questions]

    # ------------------------------------------------------------------
    # Question generation
    # ------------------------------------------------------------------

    @staticmethod
    def _detect_provider(api_key: Optional[str]) -> Optional[str]:
        """
        Map an API key to 'anthropic', 'openai' or None by its prefix.

        'sk-ant-' must be tested first because it also starts with 'sk-'.
        """
        if not api_key:
            return None
        if api_key.startswith('sk-ant-'):
            return 'anthropic'
        if api_key.startswith('sk-'):
            return 'openai'
        return None

    @staticmethod
    def _build_generation_prompt(content: str, domain: str, num_questions: int) -> str:
        """Build the prompt sent to the LLM (content is capped at 3000 chars)."""
        return (
            f"Based on the following content about {domain}, "
            f"generate {num_questions} test questions with answers.\n"
            "Content:\n"
            f"{content[:3000]}\n"
            "Format each question as JSON:\n"
            '{"question": "...", "answer": "...", '
            '"difficulty": "beginner|intermediate|advanced"}\n'
            "Return a JSON array of questions."
        )

    @staticmethod
    def _parse_question_array(reply: Optional[str]) -> List[Dict]:
        """
        Extract the JSON array embedded in an LLM reply.

        Returns only the dict items that carry a non-empty 'question', or an
        empty list when no array is found. Raises json.JSONDecodeError if
        the bracketed span is not valid JSON.
        """
        if not reply:
            return []
        array_match = re.search(r'\[.*\]', reply, re.DOTALL)
        if not array_match:
            return []
        parsed = json.loads(array_match.group())
        if not isinstance(parsed, list):
            return []
        return [item for item in parsed if isinstance(item, dict) and item.get('question')]

    def generate_questions_from_content(self, content: str, domain: str, num_questions: int = 10) -> List[Dict]:
        """
        Generate questions from content using LLM.

        The provider is chosen from the API key prefix. If there is no
        usable key, the call fails, or the reply holds no valid questions,
        template questions are generated instead.

        Args:
            content: Source content
            domain: Domain/topic
            num_questions: Number of questions to generate

        Returns:
            List of generated questions
        """
        provider = self._detect_provider(self.api_key)
        if provider is None:
            return self._generate_template_questions(content, domain, num_questions)

        try:
            prompt = self._build_generation_prompt(content, domain, num_questions)
            if provider == 'anthropic':
                from anthropic import Anthropic
                client = Anthropic(api_key=self.api_key)
                response = client.messages.create(
                    model="claude-3-5-sonnet-20241022",
                    max_tokens=2000,
                    messages=[
                        {"role": "user", "content": prompt}
                    ]
                )
                reply = response.content[0].text
            else:
                from openai import OpenAI
                client = OpenAI(api_key=self.api_key)
                response = client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": "You are an expert test creator."},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.7
                )
                reply = response.choices[0].message.content

            questions = self._parse_question_array(reply)
            if questions:
                return questions
        except Exception as e:
            # Missing SDK, network/API failure or malformed JSON: fall back.
            print(f"LLM generation error: {e}")

        return self._generate_template_questions(content, domain, num_questions)

    def _generate_template_questions(self, content: str, domain: str, num_questions: int) -> List[Dict]:
        """
        Generate basic questions using templates when no API is available.

        Candidate terms are the words of ``content`` longer than five
        characters after stripping surrounding punctuation, de-duplicated in
        order of first appearance so the output is the same on every run.
        """
        stripped = (word.strip(self._TERM_STRIP_CHARS) for word in content.split())
        # dict.fromkeys de-duplicates while preserving first-seen order.
        terms = list(dict.fromkeys(word for word in stripped if len(word) > 5))

        templates = self._QUESTION_TEMPLATES
        return [
            {
                'question': templates[index % len(templates)].format(term=term, domain=domain),
                'answer': f"This question requires domain expertise in {domain} regarding {term}.",
                'difficulty': 'intermediate',
                'type': 'template_generated',
            }
            for index, term in enumerate(terms[:max(num_questions, 0)])
        ]

    # ------------------------------------------------------------------
    # Benchmark assembly
    # ------------------------------------------------------------------

    def build_benchmark(
        self,
        domain: str,
        num_questions: int = 50,
        use_llm: bool = True,
        max_sources: int = 10,
    ) -> Dict:
        """
        Build a comprehensive benchmark for a domain.

        Args:
            domain: Domain/topic (e.g., "financial planning", "medical diagnostics")
            num_questions: Target number of questions
            use_llm: Whether to generate additional questions when extraction
                yields fewer than ``num_questions``
            max_sources: Maximum number of distinct source URLs to scrape

        Returns:
            Benchmark dict with metadata, the scraped source URLs and at
            most ``num_questions`` questions
        """
        print(f"Building benchmark for: {domain}")
        print(f"Target questions: {num_questions}")
        all_questions: List[Dict] = []

        # Step 1: Search for relevant content
        print("\n[1/4] Searching web for content...")
        search_queries = [
            f"{domain} practice questions",
            f"{domain} exam questions",
            f"{domain} test scenarios",
            f"{domain} certification study guide",
        ]
        all_sources: List[Dict] = []
        seen_urls = set()
        for query in search_queries:
            for source in self.search_web(query, num_results=5):
                # Different queries often return the same page; scrape it once.
                if source['url'] in seen_urls:
                    continue
                seen_urls.add(source['url'])
                all_sources.append(source)
            time.sleep(1)  # Rate limiting
        print(f"Found {len(all_sources)} sources")

        # Step 2: Scrape content from sources
        print("\n[2/4] Scraping content from sources...")
        to_scrape = all_sources[:max_sources]
        scraped_content = []
        for position, source in enumerate(to_scrape, 1):
            print(f"  Scraping {position}/{len(to_scrape)}: {source['title'][:50]}...")
            content = self.scrape_content(source['url'])
            if content:
                scraped_content.append({
                    'url': source['url'],
                    'title': source['title'],
                    'content': content,
                })
            time.sleep(1)  # Be polite
        print(f"Successfully scraped {len(scraped_content)} pages")

        # Step 3: Extract existing questions
        print("\n[3/4] Extracting questions from content...")
        for item in scraped_content:
            extracted = self.extract_questions_from_text(item['content'])
            for question in extracted:
                question['source'] = item['url']
                question['source_title'] = item['title']
            all_questions.extend(extracted)
        print(f"Extracted {len(all_questions)} questions from sources")

        # Step 4: Generate additional questions if needed
        if use_llm and len(all_questions) < num_questions:
            print("\n[4/4] Generating additional questions using LLM...")
            remaining = num_questions - len(all_questions)
            # Use the longest scraped page as generation context.
            best_content = (
                max(scraped_content, key=lambda page: len(page['content']))['content']
                if scraped_content else ""
            )
            if best_content:
                generated = self.generate_questions_from_content(
                    best_content,
                    domain,
                    num_questions=remaining,
                )
                all_questions.extend(generated)
                print(f"Generated {len(generated)} additional questions")

        # Truncate first so the reported counts match the stored questions.
        final_questions = all_questions[:num_questions]
        benchmark = {
            'name': f"{domain.title()} Benchmark",
            'domain': domain,
            'description': (
                f"Automatically generated benchmark for {domain} "
                f"with {len(final_questions)} questions"
            ),
            'created_at': time.strftime('%Y-%m-%d %H:%M:%S'),
            'num_questions': len(final_questions),
            'sources': [page['url'] for page in scraped_content],
            'questions': final_questions,
        }
        print(f"\n[OK] Benchmark created with {len(benchmark['questions'])} questions")
        return benchmark

    def save_benchmark(self, benchmark: Dict, filepath: str):
        """Save benchmark to a UTF-8 JSON file, creating parent directories."""
        Path(filepath).parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(benchmark, f, indent=2, ensure_ascii=False)
        print(f"Saved benchmark to: {filepath}")
def create_scraped_benchmark(
    domain: str,
    num_questions: int = 50,
    api_key: Optional[str] = None,
) -> Tuple[str, Dict]:
    """
    Helper function to create a benchmark from web scraping and save it.

    LLM/template generation of extra questions is enabled only when an
    ``api_key`` is given.

    Args:
        domain: Domain/topic
        num_questions: Number of questions
        api_key: Optional API key for LLM generation

    Returns:
        Tuple ``(filepath, benchmark)``: the path of the JSON file written
        under ``benchmarks/`` and the benchmark dict itself.
    """
    scraper = IntelligentBenchmarkScraper(api_key=api_key)
    benchmark = scraper.build_benchmark(
        domain=domain,
        num_questions=num_questions,
        use_llm=bool(api_key),
    )

    # Reduce the domain to a safe file-name slug: separators such as '/' or
    # '..' must not move the output outside the benchmarks directory.
    filename = re.sub(r'[^\w\-]+', '_', domain.lower()).strip('_') or 'benchmark'
    filepath = f"benchmarks/{filename}_benchmark.json"
    scraper.save_benchmark(benchmark, filepath)
    return filepath, benchmark
if __name__ == "__main__":
    # Manual smoke test: build a small benchmark for the domain given as the
    # first command-line argument (default "financial planning") without
    # question generation, then show the first few questions.
    import sys

    cli_args = sys.argv[1:]
    domain = cli_args[0] if cli_args else "financial planning"

    scraper = IntelligentBenchmarkScraper()
    benchmark = scraper.build_benchmark(domain, num_questions=20, use_llm=False)

    print("\nSample questions:")
    preview = benchmark['questions'][:3]
    for number, entry in enumerate(preview, start=1):
        print(f"\n{number}. {entry['question']}")
        answer_text = entry.get('answer')
        if not answer_text:
            continue
        print(f" A: {answer_text[:100]}...")