"""CLI entry point for EduMatch Data Management.""" |

from __future__ import annotations

import sys
from pathlib import Path

# Make the project root importable when this module is run directly as a
# script rather than as an installed package.
project_root = Path(__file__).parent.parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import typer
from loguru import logger
from rich.console import Console
from rich.table import Table

from tools import api_client, data_manager
from tools.config import get_settings
from tools.pinecone_processor import process_and_save

settings = get_settings()

# Logging: replace loguru's default handler with a concise colored stderr
# sink plus a rotating file sink configured from settings.
logger.remove()
logger.add(
    sys.stderr,
    format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>",
)
logger.add(
    settings.log_file,
    rotation=settings.log_rotation,
    retention=settings.log_retention,
    format="{time} | {level} | {message}",
)

app = typer.Typer(help="Common Core MCP CLI - Manage educational standards data")
console = Console()


@app.command()
def jurisdictions(
    search: str = typer.Option(
        None,
        "--search",
        "-s",
        help="Filter by jurisdiction name (case-insensitive partial match)",
    ),
    # "type_" avoids shadowing the built-in type(); the CLI flag is still --type.
    type_: str = typer.Option(
        None,
        "--type",
        "-t",
        help="Filter by jurisdiction type: school, organization, state, or nation",
    ),
    force: bool = typer.Option(
        False, "--force", "-f", help="Force refresh from API, ignoring local cache"
    ),
):
    """
    List all available jurisdictions (states/organizations).

    By default, jurisdictions are loaded from the local cache
    (data/raw/jurisdictions.json) to avoid repeated API calls. Use --force to
    fetch fresh data from the API and update the cache. The cache is created
    automatically on first use.

    Filters can be combined: use --search to filter by name and --type to
    filter by type.
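
    Example invocations (the "cli.py" entry-point name is illustrative;
    substitute however this module is actually launched):

        python cli.py jurisdictions
        python cli.py jurisdictions --search "ohio" --type state
        python cli.py jurisdictions --force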
    """
    try:
        if force:
            console.print("[yellow]Forcing refresh from API...[/yellow]")

        if type_:
            valid_types = {"school", "organization", "state", "nation"}
            if type_.lower() not in valid_types:
                console.print(
                    f"[red]Error: Invalid type '{type_}'. Must be one of: {', '.join(sorted(valid_types))}[/red]"
                )
                raise typer.Exit(code=1)

        results = api_client.get_jurisdictions(
            search_term=search, type_filter=type_, force_refresh=force
        )

        table = Table("ID", "Title", "Type", title="Jurisdictions")
        for j in results:
            table.add_row(j.id, j.title, j.type)

        console.print(table)
        console.print(f"\n[green]Found {len(results)} jurisdictions[/green]")

        if not force:
            console.print("[dim]Tip: Use --force to refresh from API[/dim]")

    except typer.Exit:
        # Re-raise explicit exits so they are not reported as unexpected errors.
        raise
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        logger.exception("Failed to fetch jurisdictions")
        raise typer.Exit(code=1)


@app.command()
def jurisdiction_details(
    jurisdiction_id: str = typer.Argument(..., help="Jurisdiction ID"),
    force: bool = typer.Option(
        False, "--force", "-f", help="Force refresh from API, ignoring local cache"
    ),
):
    """
    Download and display jurisdiction metadata, including standard set references.

    By default, jurisdiction metadata is loaded from the local cache
    (data/raw/jurisdictions/{id}/data.json) to avoid repeated API calls. Use
    --force to fetch fresh data from the API and update the cache. The cache
    is created automatically on first use.

    Note: This command downloads metadata about standard sets (IDs, titles,
    subjects) but NOT the full standard set content. Use the 'download-sets'
    command to get full standard set data.
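
    Example invocations (the "cli.py" entry-point name is illustrative; the
    jurisdiction ID shown is a placeholder):

        python cli.py jurisdiction-details <JURISDICTION_ID>
        python cli.py jurisdiction-details <JURISDICTION_ID> --force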
    """
    try:
        if force:
            console.print("[yellow]Forcing refresh from API...[/yellow]")

        jurisdiction_data = api_client.get_jurisdiction_details(
            jurisdiction_id, force_refresh=force
        )

        console.print(f"\n[bold]Jurisdiction:[/bold] {jurisdiction_data.title}")
        console.print(f"[bold]Type:[/bold] {jurisdiction_data.type}")
        console.print(f"[bold]ID:[/bold] {jurisdiction_data.id}")

        standard_sets = jurisdiction_data.standardSets
        if standard_sets:
            table = Table(
                "Set ID", "Subject", "Title", "Grade Levels", title="Standard Sets"
            )
            for s in standard_sets:
                grade_levels = ", ".join(s.educationLevels)
                table.add_row(
                    s.id,
                    s.subject,
                    s.title,
                    grade_levels or "N/A",
                )

            console.print("\n")
            console.print(table)
            console.print(f"\n[green]Found {len(standard_sets)} standard sets[/green]")
        else:
            console.print("\n[yellow]No standard sets found[/yellow]")

        if not force:
            console.print("[dim]Tip: Use --force to refresh from API[/dim]")

    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        logger.exception("Failed to fetch jurisdiction details")
        raise typer.Exit(code=1)


@app.command("download-sets")
def download_sets(
    set_id: str = typer.Argument(None, help="Standard set ID (if downloading by ID)"),
    jurisdiction: str = typer.Option(
        None,
        "--jurisdiction",
        "-j",
        help="Jurisdiction ID (if downloading by jurisdiction)",
    ),
    force: bool = typer.Option(
        False, "--force", "-f", help="Force refresh from API, ignoring local cache"
    ),
    yes: bool = typer.Option(
        False,
        "--yes",
        "-y",
        help="Skip confirmation prompt when downloading by jurisdiction",
    ),
    dry_run: bool = typer.Option(
        False,
        "--dry-run",
        help="Show what would be downloaded without actually downloading",
    ),
    education_levels: str = typer.Option(
        None,
        "--education-levels",
        help="Comma-separated grade levels (e.g., '03,04,05')",
    ),
    publication_status: str = typer.Option(
        None,
        "--publication-status",
        help="Publication status filter (e.g., 'Published', 'Deprecated')",
    ),
    valid_year: str = typer.Option(
        None, "--valid-year", help="Valid year filter (e.g., '2012')"
    ),
    title: str = typer.Option(
        None, "--title", help="Partial title match (case-insensitive)"
    ),
    subject: str = typer.Option(
        None, "--subject", help="Partial subject match (case-insensitive)"
    ),
):
    """
    Download standard sets either by ID or by jurisdiction with filtering.

    When downloading by jurisdiction, filters can be applied; all filters
    combine with AND logic. A confirmation prompt lists every standard set
    that will be downloaded.

    Use --dry-run to preview what would be downloaded without actually
    downloading anything.
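
    Example invocations (the "cli.py" entry-point name is illustrative; IDs
    are placeholders):

        python cli.py download-sets <SET_ID>
        python cli.py download-sets --jurisdiction <JURISDICTION_ID> --dry-run
        python cli.py download-sets --jurisdiction <JURISDICTION_ID> --education-levels 03,04 --subject math --yes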
    """
    try:
        if not set_id and not jurisdiction:
            console.print(
                "[red]Error: Must provide either set_id or --jurisdiction[/red]"
            )
            raise typer.Exit(code=1)

        if set_id and jurisdiction:
            console.print(
                "[red]Error: Cannot specify both set_id and --jurisdiction[/red]"
            )
            raise typer.Exit(code=1)

        if set_id:
            if dry_run:
                console.print(
                    f"[yellow][DRY RUN] Would download standard set: {set_id}[/yellow]"
                )
                cache_path = Path("data/raw/standardSets") / set_id / "data.json"
                console.print(f" Would cache to: {cache_path}")
                return

            with console.status(f"[bold blue]Downloading standard set {set_id}..."):
                api_client.download_standard_set(set_id, force_refresh=force)

            cache_path = Path("data/raw/standardSets") / set_id / "data.json"
            console.print("[green]✓ Successfully downloaded standard set[/green]")
            console.print(f" Cached to: {cache_path}")

            try:
                with console.status(f"[bold blue]Processing standard set {set_id}..."):
                    processed_path = process_and_save(set_id)
                console.print("[green]✓ Successfully processed standard set[/green]")
                console.print(f" Processed to: {processed_path}")
            except FileNotFoundError:
                console.print(
                    "[yellow]Warning: data.json not found, skipping processing[/yellow]"
                )
            except Exception as e:
                console.print(
                    f"[yellow]Warning: Failed to process standard set: {e}[/yellow]"
                )
                logger.exception(f"Failed to process standard set {set_id}")

            return

        if jurisdiction:
            education_levels_list = None
            if education_levels:
                education_levels_list = [
                    level.strip() for level in education_levels.split(",")
                ]

            jurisdiction_data = api_client.get_jurisdiction_details(
                jurisdiction, force_refresh=False
            )
            all_sets = jurisdiction_data.standardSets

            # Reuse the API client's private filter helper so CLI filtering
            # stays consistent with the client's own behavior.
            from tools.api_client import _filter_standard_set

            filtered_sets = [
                s
                for s in all_sets
                if _filter_standard_set(
                    s,
                    education_levels=education_levels_list,
                    publication_status=publication_status,
                    valid_year=valid_year,
                    title_search=title,
                    subject_search=subject,
                )
            ]

            if not filtered_sets:
                console.print(
                    "[yellow]No standard sets match the provided filters.[/yellow]"
                )
                return

            if dry_run:
                console.print(
                    f"\n[yellow][DRY RUN] Standard sets that would be downloaded ({len(filtered_sets)}):[/yellow]"
                )
            else:
                console.print(
                    f"\n[bold]Standard sets to download ({len(filtered_sets)}):[/bold]"
                )

            table = Table(
                "Set ID",
                "Subject",
                "Title",
                "Grade Levels",
                "Status",
                "Year",
                "Downloaded",
                title="Standard Sets",
            )
            for s in filtered_sets:
                display_id = s.id[:20] + "..." if len(s.id) > 20 else s.id

                set_data_path = settings.standard_sets_dir / s.id / "data.json"
                is_downloaded = set_data_path.exists()
                downloaded_status = (
                    "[green]✓[/green]" if is_downloaded else "[yellow]✗[/yellow]"
                )
                table.add_row(
                    display_id,
                    s.subject,
                    s.title[:40],
                    ", ".join(s.educationLevels),
                    s.document.publicationStatus or "N/A",
                    s.document.valid or "N/A",
                    downloaded_status,
                )
            console.print(table)

            if dry_run:
                console.print(
                    f"\n[yellow][DRY RUN] Would download {len(filtered_sets)} standard set(s)[/yellow]"
                )
                console.print(
                    "[dim]Run without --dry-run to actually download these standard sets.[/dim]"
                )
                return
            if not yes:
                if not typer.confirm(
                    f"\nDownload {len(filtered_sets)} standard set(s)?"
                ):
                    console.print("[yellow]Download cancelled.[/yellow]")
                    return

            console.print(
                f"\n[bold blue]Downloading {len(filtered_sets)} standard set(s)...[/bold blue]"
            )
            downloaded = 0
            failed = 0

            for i, standard_set in enumerate(filtered_sets, 1):
                set_id = standard_set.id
                try:
                    with console.status(
                        f"[bold blue][{i}/{len(filtered_sets)}] Downloading {set_id[:20]}..."
                    ):
                        api_client.download_standard_set(set_id, force_refresh=force)
                    downloaded += 1

                    try:
                        with console.status(
                            f"[bold blue][{i}/{len(filtered_sets)}] Processing {set_id[:20]}..."
                        ):
                            process_and_save(set_id)
                    except FileNotFoundError:
                        console.print(
                            f"[yellow]Warning: Skipping processing for {set_id[:20]}... (data.json not found)[/yellow]"
                        )
                    except Exception as e:
                        console.print(
                            f"[yellow]Warning: Failed to process {set_id[:20]}...: {e}[/yellow]"
                        )
                        logger.exception(f"Failed to process standard set {set_id}")

                except Exception as e:
                    console.print(f"[red]✗ Failed to download {set_id}: {e}[/red]")
                    logger.exception(f"Failed to download standard set {set_id}")
                    failed += 1

            console.print(
                f"\n[green]✓ Successfully downloaded {downloaded} standard set(s)[/green]"
            )
            if failed > 0:
                console.print(
                    f"[red]✗ Failed to download {failed} standard set(s)[/red]"
                )

    except typer.Exit:
        # Re-raise explicit exits so they are not reported as unexpected errors.
        raise
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        logger.exception("Failed to download standard sets")
        raise typer.Exit(code=1)


@app.command("list")
def list_datasets():
    """List all downloaded standard sets and their processing status."""
    try:
        datasets = data_manager.list_downloaded_standard_sets()

        if not datasets:
            console.print("[yellow]No standard sets downloaded yet.[/yellow]")
            console.print("[dim]Use 'download-sets' to download standard sets.[/dim]")
            return

        # A set counts as processed once its processed.json exists on disk.
        for d in datasets:
            set_dir = settings.standard_sets_dir / d.set_id
            processed_file = set_dir / "processed.json"
            d.processed = processed_file.exists()

        processed_count = sum(1 for d in datasets if d.processed)
        unprocessed_count = len(datasets) - processed_count

        table = Table(
            "Set ID",
            "Jurisdiction",
            "Subject",
            "Title",
            "Grades",
            "Status",
            "Processed",
            title="Downloaded Standard Sets",
        )
        for d in datasets:
            display_id = d.set_id[:25] + "..." if len(d.set_id) > 25 else d.set_id

            table.add_row(
                display_id,
                d.jurisdiction,
                d.subject[:30],
                d.title[:30],
                ", ".join(d.education_levels),
                d.publication_status,
                "[green]✓[/green]" if d.processed else "[yellow]✗[/yellow]",
            )

        console.print(table)
        console.print("\n[bold]Summary:[/bold]")
        console.print(f" Total: {len(datasets)} standard sets")
        console.print(f" Processed: [green]{processed_count}[/green]")
        console.print(f" Unprocessed: [yellow]{unprocessed_count}[/yellow]")

    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        logger.exception("Failed to list datasets")
        raise typer.Exit(code=1)


@app.command("pinecone-init")
def pinecone_init():
    """
    Initialize the Pinecone index.

    Checks whether the configured index exists and creates it if not.
    Uses integrated embeddings with the llama-text-embed-v2 model.
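
    This command is idempotent: re-running it against an existing index only
    reports its stats. Example (entry-point name illustrative):

        python cli.py pinecone-init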
    """
    try:
        # Imported lazily so the Pinecone dependency is only required for
        # the pinecone-* commands.
        from src.pinecone_client import PineconeClient

        console.print("[bold]Initializing Pinecone...[/bold]")

        try:
            client = PineconeClient()
        except ValueError as e:
            console.print(f"[red]Error: {e}[/red]")
            raise typer.Exit(code=1)

        console.print(f" Index name: [cyan]{client.index_name}[/cyan]")
        console.print(f" Namespace: [cyan]{client.namespace}[/cyan]")

        with console.status("[bold blue]Checking index status..."):
            created = client.ensure_index_exists()

        if created:
            console.print(
                f"\n[green]Successfully created index '{client.index_name}'[/green]"
            )
            console.print("[dim]Index configuration:[/dim]")
            console.print(" Cloud: aws")
            console.print(" Region: us-east-1")
            console.print(" Embedding model: llama-text-embed-v2")
            console.print(" Field map: text -> content")
        else:
            console.print(
                f"\n[green]Index '{client.index_name}' already exists[/green]"
            )

        with console.status("[bold blue]Fetching index stats..."):
            stats = client.get_index_stats()

        console.print("\n[bold]Index Statistics:[/bold]")
        console.print(f" Total vectors: [cyan]{stats['total_vector_count']}[/cyan]")

        namespaces = stats.get("namespaces", {})
        if namespaces:
            console.print(f" Namespaces: [cyan]{len(namespaces)}[/cyan]")
            table = Table("Namespace", "Vector Count", title="Namespace Details")
            for ns_name, ns_info in namespaces.items():
                vector_count = getattr(ns_info, "vector_count", 0)
                table.add_row(ns_name or "(default)", str(vector_count))
            console.print(table)
        else:
            console.print(" Namespaces: [yellow]None (empty index)[/yellow]")

    except typer.Exit:
        # Re-raise explicit exits so they are not reported as unexpected errors.
        raise
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        logger.exception("Failed to initialize Pinecone")
        raise typer.Exit(code=1)


@app.command("pinecone-upload")
def pinecone_upload(
    set_id: str = typer.Option(
        None, "--set-id", help="Upload a specific standard set by ID"
    ),
    # "all_" avoids shadowing the built-in all(); the CLI flag is still --all.
    all_: bool = typer.Option(
        False, "--all", help="Upload all downloaded standard sets with processed.json"
    ),
    force: bool = typer.Option(
        False,
        "--force",
        help="Re-upload even if .pinecone_uploaded marker exists",
    ),
    dry_run: bool = typer.Option(
        False,
        "--dry-run",
        help="Show what would be uploaded without actually uploading",
    ),
    # 96 records per batch matches Pinecone's documented per-request limit
    # for upserts with integrated embeddings.
    batch_size: int = typer.Option(
        96, "--batch-size", help="Number of records per batch (default: 96)"
    ),
):
    """
    Upload processed standard sets to Pinecone.

    Use --set-id to upload a specific set, or --all to upload all sets with
    processed.json. If neither is provided, you'll be prompted to confirm
    uploading all sets.
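
    Example invocations (the "cli.py" entry-point name is illustrative; the
    set ID is a placeholder):

        python cli.py pinecone-upload --all --dry-run
        python cli.py pinecone-upload --set-id <SET_ID> --force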
    """
    try:
        # Imported lazily so the Pinecone dependency is only required for
        # the pinecone-* commands.
        import json

        from src.pinecone_client import PineconeClient
        from tools.pinecone_models import ProcessedStandardSet

        try:
            client = PineconeClient()
        except ValueError as e:
            console.print(f"[red]Error: {e}[/red]")
            raise typer.Exit(code=1)

        try:
            client.validate_index()
        except ValueError as e:
            console.print(f"[red]Error: {e}[/red]")
            raise typer.Exit(code=1)

        standard_sets_dir = settings.standard_sets_dir
        if not standard_sets_dir.exists():
            console.print("[yellow]No standard sets directory found.[/yellow]")
            console.print(
                "[dim]Use 'download-sets' to download standard sets first.[/dim]"
            )
            return

        # Collect every set directory that has a processed.json, recording
        # whether it is already marked as uploaded (skipped unless --force).
        sets_to_upload = []
        for set_dir in standard_sets_dir.iterdir():
            if not set_dir.is_dir():
                continue

            processed_file = set_dir / "processed.json"
            if not processed_file.exists():
                continue

            set_id_from_dir = set_dir.name
            already_uploaded = not force and PineconeClient.is_uploaded(set_dir)
            sets_to_upload.append((set_id_from_dir, set_dir, already_uploaded))

        if not sets_to_upload:
            console.print(
                "[yellow]No standard sets with processed.json found.[/yellow]"
            )
            console.print(
                "[dim]Use 'download-sets' to download and process standard sets first.[/dim]"
            )
            return

        if set_id:
            sets_to_upload = [
                (sid, sdir, skipped)
                for sid, sdir, skipped in sets_to_upload
                if sid == set_id
            ]
            if not sets_to_upload:
                console.print(
                    f"[yellow]Standard set '{set_id}' not found or has no processed.json.[/yellow]"
                )
                return

        if not set_id and not all_:
            console.print(
                f"\n[bold]Found {len(sets_to_upload)} standard set(s) with processed.json:[/bold]"
            )
            table = Table("Set ID", "Status", title="Standard Sets")
            for sid, sdir, skipped in sets_to_upload:
                status = (
                    "[yellow]Already uploaded[/yellow]"
                    if skipped
                    else "[green]Ready[/green]"
                )
                table.add_row(sid, status)
            console.print(table)

            if not typer.confirm(
                f"\nUpload {len(sets_to_upload)} standard set(s) to Pinecone?"
            ):
                console.print("[yellow]Upload cancelled.[/yellow]")
                return

        if dry_run or not all_:
            console.print(
                f"\n[bold]Standard sets to upload ({len(sets_to_upload)}):[/bold]"
            )
            table = Table("Set ID", "Records", "Status", title="Upload Preview")
            for sid, sdir, skipped in sets_to_upload:
                if skipped:
                    table.add_row(
                        sid, "N/A", "[yellow]Skipped (already uploaded)[/yellow]"
                    )
                    continue

                try:
                    with open(sdir / "processed.json", encoding="utf-8") as f:
                        processed_data = json.load(f)
                    record_count = len(processed_data.get("records", []))
                    status = (
                        "[green]Ready[/green]"
                        if not dry_run
                        else "[yellow]Would upload[/yellow]"
                    )
                    table.add_row(sid, str(record_count), status)
                except Exception as e:
                    table.add_row(sid, "Error", f"[red]Failed to read: {e}[/red]")
            console.print(table)

        if dry_run:
            console.print(
                f"\n[yellow][DRY RUN] Would upload {len([s for s in sets_to_upload if not s[2]])} standard set(s)[/yellow]"
            )
            console.print("[dim]Run without --dry-run to actually upload.[/dim]")
            return

        uploaded_count = 0
        failed_count = 0
        skipped_count = 0

        for i, (sid, sdir, already_uploaded) in enumerate(sets_to_upload, 1):
            if already_uploaded:
                skipped_count += 1
                continue

            try:
                with open(sdir / "processed.json", encoding="utf-8") as f:
                    processed_data = json.load(f)

                processed_set = ProcessedStandardSet(**processed_data)
                records = processed_set.records

                if not records:
                    console.print(f"[yellow]Skipping {sid} (no records)[/yellow]")
                    skipped_count += 1
                    continue

                with console.status(
                    f"[bold blue][{i}/{len(sets_to_upload)}] Uploading {sid} ({len(records)} records)"
                ):
                    client.batch_upsert(records, batch_size=batch_size)

                # Drop a marker file so unchanged sets are skipped next time.
                PineconeClient.mark_uploaded(sdir)
                uploaded_count += 1
                console.print(
                    f"[green]✓ [{i}/{len(sets_to_upload)}] Uploaded {sid} ({len(records)} records)[/green]"
                )

            except FileNotFoundError:
                console.print(
                    f"[red]✗ [{i}/{len(sets_to_upload)}] Failed: {sid} (processed.json not found)[/red]"
                )
                logger.exception(f"Failed to upload standard set {sid}")
                failed_count += 1
            except Exception as e:
                console.print(
                    f"[red]✗ [{i}/{len(sets_to_upload)}] Failed: {sid} ({e})[/red]"
                )
                logger.exception(f"Failed to upload standard set {sid}")
                failed_count += 1

        console.print("\n[bold]Upload Summary:[/bold]")
        console.print(f" Uploaded: [green]{uploaded_count}[/green]")
        if skipped_count > 0:
            console.print(f" Skipped: [yellow]{skipped_count}[/yellow]")
        if failed_count > 0:
            console.print(f" Failed: [red]{failed_count}[/red]")

    except typer.Exit:
        # Re-raise explicit exits so they are not reported as unexpected errors.
        raise
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        logger.exception("Failed to upload to Pinecone")
        raise typer.Exit(code=1)


if __name__ == "__main__":
    app()