"""CLI entry point for EduMatch Data Management.""" |

from __future__ import annotations

import sys
from pathlib import Path

# Make the project root importable when this module is run directly as a
# script rather than as an installed package.
project_root = Path(__file__).parent.parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import typer
from loguru import logger
from rich.console import Console
from rich.table import Table

from tools import api_client, data_manager
from tools.config import get_settings
from tools.pinecone_processor import process_and_save

settings = get_settings()

# Logging: replace loguru's default handler with a concise colored stderr
# sink plus a rotating file sink configured from settings.
logger.remove()
logger.add(
    sys.stderr,
    format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>",
)
logger.add(
    settings.log_file,
    rotation=settings.log_rotation,
    retention=settings.log_retention,
    format="{time} | {level} | {message}",
)

app = typer.Typer(help="Common Core MCP CLI - Manage educational standards data")
console = Console()


@app.command()
def jurisdictions(
    search: str = typer.Option(
        None,
        "--search",
        "-s",
        help="Filter by jurisdiction name (case-insensitive partial match)",
    ),
    # "type_" avoids shadowing the built-in type(); the CLI flag is still --type.
    type_: str = typer.Option(
        None,
        "--type",
        "-t",
        help="Filter by jurisdiction type: school, organization, state, or nation",
    ),
    force: bool = typer.Option(
        False, "--force", "-f", help="Force refresh from API, ignoring local cache"
    ),
):
    """
    List all available jurisdictions (states/organizations).

    By default, jurisdictions are loaded from the local cache
    (data/raw/jurisdictions.json) to avoid repeated API calls. Use --force to
    fetch fresh data from the API and update the cache. The cache is created
    automatically on first use.

    Filters can be combined: use --search to filter by name and --type to
    filter by type.
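
    Example invocations (the "cli.py" entry-point name is illustrative;
    substitute however this module is actually launched):

        python cli.py jurisdictions
        python cli.py jurisdictions --search "ohio" --type state
        python cli.py jurisdictions --force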
    """
    try:
        if force:
            console.print("[yellow]Forcing refresh from API...[/yellow]")

        if type_:
            valid_types = {"school", "organization", "state", "nation"}
            if type_.lower() not in valid_types:
                console.print(
                    f"[red]Error: Invalid type '{type_}'. Must be one of: {', '.join(sorted(valid_types))}[/red]"
                )
                raise typer.Exit(code=1)

        results = api_client.get_jurisdictions(
            search_term=search, type_filter=type_, force_refresh=force
        )

        table = Table("ID", "Title", "Type", title="Jurisdictions")
        for j in results:
            table.add_row(j.id, j.title, j.type)

        console.print(table)
        console.print(f"\n[green]Found {len(results)} jurisdictions[/green]")

        if not force:
            console.print("[dim]Tip: Use --force to refresh from API[/dim]")

    except typer.Exit:
        # Re-raise explicit exits so they are not reported as unexpected errors.
        raise
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        logger.exception("Failed to fetch jurisdictions")
        raise typer.Exit(code=1)


@app.command()
def jurisdiction_details(
    jurisdiction_id: str = typer.Argument(..., help="Jurisdiction ID"),
    force: bool = typer.Option(
        False, "--force", "-f", help="Force refresh from API, ignoring local cache"
    ),
):
    """
    Download and display jurisdiction metadata, including standard set references.

    By default, jurisdiction metadata is loaded from the local cache
    (data/raw/jurisdictions/{id}/data.json) to avoid repeated API calls. Use
    --force to fetch fresh data from the API and update the cache. The cache
    is created automatically on first use.

    Note: This command downloads metadata about standard sets (IDs, titles,
    subjects) but NOT the full standard set content. Use the 'download-sets'
    command to get full standard set data.
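
    Example invocations (the "cli.py" entry-point name is illustrative; the
    jurisdiction ID shown is a placeholder):

        python cli.py jurisdiction-details <JURISDICTION_ID>
        python cli.py jurisdiction-details <JURISDICTION_ID> --force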
    """
    try:
        if force:
            console.print("[yellow]Forcing refresh from API...[/yellow]")

        jurisdiction_data = api_client.get_jurisdiction_details(
            jurisdiction_id, force_refresh=force
        )

        console.print(f"\n[bold]Jurisdiction:[/bold] {jurisdiction_data.title}")
        console.print(f"[bold]Type:[/bold] {jurisdiction_data.type}")
        console.print(f"[bold]ID:[/bold] {jurisdiction_data.id}")

        standard_sets = jurisdiction_data.standardSets
        if standard_sets:
            table = Table(
                "Set ID", "Subject", "Title", "Grade Levels", title="Standard Sets"
            )
            for s in standard_sets:
                grade_levels = ", ".join(s.educationLevels)
                table.add_row(
                    s.id,
                    s.subject,
                    s.title,
                    grade_levels or "N/A",
                )

            console.print("\n")
            console.print(table)
            console.print(f"\n[green]Found {len(standard_sets)} standard sets[/green]")
        else:
            console.print("\n[yellow]No standard sets found[/yellow]")

        if not force:
            console.print("[dim]Tip: Use --force to refresh from API[/dim]")

    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        logger.exception("Failed to fetch jurisdiction details")
        raise typer.Exit(code=1)


@app.command("download-sets")
def download_sets(
    set_id: str = typer.Argument(None, help="Standard set ID (if downloading by ID)"),
    jurisdiction: str = typer.Option(
        None,
        "--jurisdiction",
        "-j",
        help="Jurisdiction ID (if downloading by jurisdiction)",
    ),
    force: bool = typer.Option(
        False, "--force", "-f", help="Force refresh from API, ignoring local cache"
    ),
    yes: bool = typer.Option(
        False,
        "--yes",
        "-y",
        help="Skip confirmation prompt when downloading by jurisdiction",
    ),
    dry_run: bool = typer.Option(
        False,
        "--dry-run",
        help="Show what would be downloaded without actually downloading",
    ),
    education_levels: str = typer.Option(
        None,
        "--education-levels",
        help="Comma-separated grade levels (e.g., '03,04,05')",
    ),
    publication_status: str = typer.Option(
        None,
        "--publication-status",
        help="Publication status filter (e.g., 'Published', 'Deprecated')",
    ),
    valid_year: str = typer.Option(
        None, "--valid-year", help="Valid year filter (e.g., '2012')"
    ),
    title: str = typer.Option(
        None, "--title", help="Partial title match (case-insensitive)"
    ),
    subject: str = typer.Option(
        None, "--subject", help="Partial subject match (case-insensitive)"
    ),
):
    """
    Download standard sets either by ID or by jurisdiction with filtering.

    When downloading by jurisdiction, filters can be applied; all filters
    combine with AND logic. A confirmation prompt lists every standard set
    that will be downloaded.

    Use --dry-run to preview what would be downloaded without actually
    downloading anything.
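
    Example invocations (the "cli.py" entry-point name is illustrative; IDs
    are placeholders):

        python cli.py download-sets <SET_ID>
        python cli.py download-sets --jurisdiction <JURISDICTION_ID> --dry-run
        python cli.py download-sets --jurisdiction <JURISDICTION_ID> --education-levels 03,04 --subject math --yes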
    """
    try:
        if not set_id and not jurisdiction:
            console.print(
                "[red]Error: Must provide either set_id or --jurisdiction[/red]"
            )
            raise typer.Exit(code=1)

        if set_id and jurisdiction:
            console.print(
                "[red]Error: Cannot specify both set_id and --jurisdiction[/red]"
            )
            raise typer.Exit(code=1)

        if set_id:
            if dry_run:
                console.print(
                    f"[yellow][DRY RUN] Would download standard set: {set_id}[/yellow]"
                )
                cache_path = Path("data/raw/standardSets") / set_id / "data.json"
                console.print(f" Would cache to: {cache_path}")
                return

            with console.status(f"[bold blue]Downloading standard set {set_id}..."):
                api_client.download_standard_set(set_id, force_refresh=force)

            cache_path = Path("data/raw/standardSets") / set_id / "data.json"
            console.print("[green]✓ Successfully downloaded standard set[/green]")
            console.print(f" Cached to: {cache_path}")

            try:
                with console.status(f"[bold blue]Processing standard set {set_id}..."):
                    processed_path = process_and_save(set_id)
                console.print("[green]✓ Successfully processed standard set[/green]")
                console.print(f" Processed to: {processed_path}")
            except FileNotFoundError:
                console.print(
                    "[yellow]Warning: data.json not found, skipping processing[/yellow]"
                )
            except Exception as e:
                console.print(
                    f"[yellow]Warning: Failed to process standard set: {e}[/yellow]"
                )
                logger.exception(f"Failed to process standard set {set_id}")

            return

        if jurisdiction:
            education_levels_list = None
            if education_levels:
                education_levels_list = [
                    level.strip() for level in education_levels.split(",")
                ]

            jurisdiction_data = api_client.get_jurisdiction_details(
                jurisdiction, force_refresh=False
            )
            all_sets = jurisdiction_data.standardSets

            # Reuse the API client's private filter helper so CLI filtering
            # stays consistent with the client's own behavior.
            from tools.api_client import _filter_standard_set

            filtered_sets = [
                s
                for s in all_sets
                if _filter_standard_set(
                    s,
                    education_levels=education_levels_list,
                    publication_status=publication_status,
                    valid_year=valid_year,
                    title_search=title,
                    subject_search=subject,
                )
            ]

            if not filtered_sets:
                console.print(
                    "[yellow]No standard sets match the provided filters.[/yellow]"
                )
                return

            if dry_run:
                console.print(
                    f"\n[yellow][DRY RUN] Standard sets that would be downloaded ({len(filtered_sets)}):[/yellow]"
                )
            else:
                console.print(
                    f"\n[bold]Standard sets to download ({len(filtered_sets)}):[/bold]"
                )

            table = Table(
                "Set ID",
                "Subject",
                "Title",
                "Grade Levels",
                "Status",
                "Year",
                "Downloaded",
                title="Standard Sets",
            )
            for s in filtered_sets:
                display_id = s.id[:20] + "..." if len(s.id) > 20 else s.id

                set_data_path = settings.standard_sets_dir / s.id / "data.json"
                is_downloaded = set_data_path.exists()
                downloaded_status = (
                    "[green]✓[/green]" if is_downloaded else "[yellow]✗[/yellow]"
                )
                table.add_row(
                    display_id,
                    s.subject,
                    s.title[:40],
                    ", ".join(s.educationLevels),
                    s.document.publicationStatus or "N/A",
                    s.document.valid or "N/A",
                    downloaded_status,
                )
            console.print(table)

            if dry_run:
                console.print(
                    f"\n[yellow][DRY RUN] Would download {len(filtered_sets)} standard set(s)[/yellow]"
                )
                console.print(
                    "[dim]Run without --dry-run to actually download these standard sets.[/dim]"
                )
                return
            if not yes:
                if not typer.confirm(
                    f"\nDownload {len(filtered_sets)} standard set(s)?"
                ):
                    console.print("[yellow]Download cancelled.[/yellow]")
                    return

            console.print(
                f"\n[bold blue]Downloading {len(filtered_sets)} standard set(s)...[/bold blue]"
            )
            downloaded = 0
            failed = 0

            for i, standard_set in enumerate(filtered_sets, 1):
                set_id = standard_set.id
                try:
                    with console.status(
                        f"[bold blue][{i}/{len(filtered_sets)}] Downloading {set_id[:20]}..."
                    ):
                        api_client.download_standard_set(set_id, force_refresh=force)
                    downloaded += 1

                    try:
                        with console.status(
                            f"[bold blue][{i}/{len(filtered_sets)}] Processing {set_id[:20]}..."
                        ):
                            process_and_save(set_id)
                    except FileNotFoundError:
                        console.print(
                            f"[yellow]Warning: Skipping processing for {set_id[:20]}... (data.json not found)[/yellow]"
                        )
                    except Exception as e:
                        console.print(
                            f"[yellow]Warning: Failed to process {set_id[:20]}...: {e}[/yellow]"
                        )
                        logger.exception(f"Failed to process standard set {set_id}")

                except Exception as e:
                    console.print(f"[red]✗ Failed to download {set_id}: {e}[/red]")
                    logger.exception(f"Failed to download standard set {set_id}")
                    failed += 1

            console.print(
                f"\n[green]✓ Successfully downloaded {downloaded} standard set(s)[/green]"
            )
            if failed > 0:
                console.print(
                    f"[red]✗ Failed to download {failed} standard set(s)[/red]"
                )

    except typer.Exit:
        # Re-raise explicit exits so they are not reported as unexpected errors.
        raise
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        logger.exception("Failed to download standard sets")
        raise typer.Exit(code=1)


@app.command("list")
def list_datasets():
    """List all downloaded standard sets and their processing status."""
    try:
        datasets = data_manager.list_downloaded_standard_sets()

        if not datasets:
            console.print("[yellow]No standard sets downloaded yet.[/yellow]")
            console.print("[dim]Use 'download-sets' to download standard sets.[/dim]")
            return

        # A set counts as processed once its processed.json exists on disk.
        for d in datasets:
            set_dir = settings.standard_sets_dir / d.set_id
            processed_file = set_dir / "processed.json"
            d.processed = processed_file.exists()

        processed_count = sum(1 for d in datasets if d.processed)
        unprocessed_count = len(datasets) - processed_count

        table = Table(
            "Set ID",
            "Jurisdiction",
            "Subject",
            "Title",
            "Grades",
            "Status",
            "Processed",
            title="Downloaded Standard Sets",
        )
        for d in datasets:
            display_id = d.set_id[:25] + "..." if len(d.set_id) > 25 else d.set_id

            table.add_row(
                display_id,
                d.jurisdiction,
                d.subject[:30],
                d.title[:30],
                ", ".join(d.education_levels),
                d.publication_status,
                "[green]✓[/green]" if d.processed else "[yellow]✗[/yellow]",
            )

        console.print(table)
        console.print("\n[bold]Summary:[/bold]")
        console.print(f" Total: {len(datasets)} standard sets")
        console.print(f" Processed: [green]{processed_count}[/green]")
        console.print(f" Unprocessed: [yellow]{unprocessed_count}[/yellow]")

    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        logger.exception("Failed to list datasets")
        raise typer.Exit(code=1)


@app.command("pinecone-init")
def pinecone_init():
    """
    Initialize the Pinecone index.

    Checks whether the configured index exists and creates it if not.
    Uses integrated embeddings with the llama-text-embed-v2 model.
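
    This command is idempotent: re-running it against an existing index only
    reports its stats. Example (entry-point name illustrative):

        python cli.py pinecone-init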
    """
    try:
        # Imported lazily so the Pinecone dependency is only required for
        # the pinecone-* commands.
        from src.pinecone_client import PineconeClient

        console.print("[bold]Initializing Pinecone...[/bold]")

        try:
            client = PineconeClient()
        except ValueError as e:
            console.print(f"[red]Error: {e}[/red]")
            raise typer.Exit(code=1)

        console.print(f" Index name: [cyan]{client.index_name}[/cyan]")
        console.print(f" Namespace: [cyan]{client.namespace}[/cyan]")

        with console.status("[bold blue]Checking index status..."):
            created = client.ensure_index_exists()

        if created:
            console.print(
                f"\n[green]Successfully created index '{client.index_name}'[/green]"
            )
            console.print("[dim]Index configuration:[/dim]")
            console.print(" Cloud: aws")
            console.print(" Region: us-east-1")
            console.print(" Embedding model: llama-text-embed-v2")
            console.print(" Field map: text -> content")
        else:
            console.print(
                f"\n[green]Index '{client.index_name}' already exists[/green]"
            )

        with console.status("[bold blue]Fetching index stats..."):
            stats = client.get_index_stats()

        console.print("\n[bold]Index Statistics:[/bold]")
        console.print(f" Total vectors: [cyan]{stats['total_vector_count']}[/cyan]")

        namespaces = stats.get("namespaces", {})
        if namespaces:
            console.print(f" Namespaces: [cyan]{len(namespaces)}[/cyan]")
            table = Table("Namespace", "Vector Count", title="Namespace Details")
            for ns_name, ns_info in namespaces.items():
                vector_count = getattr(ns_info, "vector_count", 0)
                table.add_row(ns_name or "(default)", str(vector_count))
            console.print(table)
        else:
            console.print(" Namespaces: [yellow]None (empty index)[/yellow]")

    except typer.Exit:
        # Re-raise explicit exits so they are not reported as unexpected errors.
        raise
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        logger.exception("Failed to initialize Pinecone")
        raise typer.Exit(code=1)


@app.command("pinecone-upload")
def pinecone_upload(
    set_id: str = typer.Option(
        None, "--set-id", help="Upload a specific standard set by ID"
    ),
    # "all_" avoids shadowing the built-in all(); the CLI flag is still --all.
    all_: bool = typer.Option(
        False, "--all", help="Upload all downloaded standard sets with processed.json"
    ),
    force: bool = typer.Option(
        False,
        "--force",
        help="Re-upload even if .pinecone_uploaded marker exists",
    ),
    dry_run: bool = typer.Option(
        False,
        "--dry-run",
        help="Show what would be uploaded without actually uploading",
    ),
    # 96 records per batch matches Pinecone's documented per-request limit
    # for upserts with integrated embeddings.
    batch_size: int = typer.Option(
        96, "--batch-size", help="Number of records per batch (default: 96)"
    ),
):
    """
    Upload processed standard sets to Pinecone.

    Use --set-id to upload a specific set, or --all to upload all sets with
    processed.json. If neither is provided, you'll be prompted to confirm
    uploading all sets.
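
    Example invocations (the "cli.py" entry-point name is illustrative; the
    set ID is a placeholder):

        python cli.py pinecone-upload --all --dry-run
        python cli.py pinecone-upload --set-id <SET_ID> --force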
    """
    try:
        # Imported lazily so the Pinecone dependency is only required for
        # the pinecone-* commands.
        import json

        from src.pinecone_client import PineconeClient
        from tools.pinecone_models import ProcessedStandardSet

        try:
            client = PineconeClient()
        except ValueError as e:
            console.print(f"[red]Error: {e}[/red]")
            raise typer.Exit(code=1)

        try:
            client.validate_index()
        except ValueError as e:
            console.print(f"[red]Error: {e}[/red]")
            raise typer.Exit(code=1)

        standard_sets_dir = settings.standard_sets_dir
        if not standard_sets_dir.exists():
            console.print("[yellow]No standard sets directory found.[/yellow]")
            console.print(
                "[dim]Use 'download-sets' to download standard sets first.[/dim]"
            )
            return

        # Collect every set directory that has a processed.json, recording
        # whether it is already marked as uploaded (skipped unless --force).
        sets_to_upload = []
        for set_dir in standard_sets_dir.iterdir():
            if not set_dir.is_dir():
                continue

            processed_file = set_dir / "processed.json"
            if not processed_file.exists():
                continue

            set_id_from_dir = set_dir.name
            already_uploaded = not force and PineconeClient.is_uploaded(set_dir)
            sets_to_upload.append((set_id_from_dir, set_dir, already_uploaded))

        if not sets_to_upload:
            console.print(
                "[yellow]No standard sets with processed.json found.[/yellow]"
            )
            console.print(
                "[dim]Use 'download-sets' to download and process standard sets first.[/dim]"
            )
            return

        if set_id:
            sets_to_upload = [
                (sid, sdir, skipped)
                for sid, sdir, skipped in sets_to_upload
                if sid == set_id
            ]
            if not sets_to_upload:
                console.print(
                    f"[yellow]Standard set '{set_id}' not found or has no processed.json.[/yellow]"
                )
                return

        if not set_id and not all_:
            console.print(
                f"\n[bold]Found {len(sets_to_upload)} standard set(s) with processed.json:[/bold]"
            )
            table = Table("Set ID", "Status", title="Standard Sets")
            for sid, sdir, skipped in sets_to_upload:
                status = (
                    "[yellow]Already uploaded[/yellow]"
                    if skipped
                    else "[green]Ready[/green]"
                )
                table.add_row(sid, status)
            console.print(table)

            if not typer.confirm(
                f"\nUpload {len(sets_to_upload)} standard set(s) to Pinecone?"
            ):
                console.print("[yellow]Upload cancelled.[/yellow]")
                return

        if dry_run or not all_:
            console.print(
                f"\n[bold]Standard sets to upload ({len(sets_to_upload)}):[/bold]"
            )
            table = Table("Set ID", "Records", "Status", title="Upload Preview")
            for sid, sdir, skipped in sets_to_upload:
                if skipped:
                    table.add_row(
                        sid, "N/A", "[yellow]Skipped (already uploaded)[/yellow]"
                    )
                    continue

                try:
                    with open(sdir / "processed.json", encoding="utf-8") as f:
                        processed_data = json.load(f)
                    record_count = len(processed_data.get("records", []))
                    status = (
                        "[green]Ready[/green]"
                        if not dry_run
                        else "[yellow]Would upload[/yellow]"
                    )
                    table.add_row(sid, str(record_count), status)
                except Exception as e:
                    table.add_row(sid, "Error", f"[red]Failed to read: {e}[/red]")
            console.print(table)

        if dry_run:
            console.print(
                f"\n[yellow][DRY RUN] Would upload {len([s for s in sets_to_upload if not s[2]])} standard set(s)[/yellow]"
            )
            console.print("[dim]Run without --dry-run to actually upload.[/dim]")
            return

        uploaded_count = 0
        failed_count = 0
        skipped_count = 0

        for i, (sid, sdir, already_uploaded) in enumerate(sets_to_upload, 1):
            if already_uploaded:
                skipped_count += 1
                continue

            try:
                with open(sdir / "processed.json", encoding="utf-8") as f:
                    processed_data = json.load(f)

                processed_set = ProcessedStandardSet(**processed_data)
                records = processed_set.records

                if not records:
                    console.print(f"[yellow]Skipping {sid} (no records)[/yellow]")
                    skipped_count += 1
                    continue

                with console.status(
                    f"[bold blue][{i}/{len(sets_to_upload)}] Uploading {sid} ({len(records)} records)"
                ):
                    client.batch_upsert(records, batch_size=batch_size)

                # Drop a marker file so unchanged sets are skipped next time.
                PineconeClient.mark_uploaded(sdir)
                uploaded_count += 1
                console.print(
                    f"[green]✓ [{i}/{len(sets_to_upload)}] Uploaded {sid} ({len(records)} records)[/green]"
                )

            except FileNotFoundError:
                console.print(
                    f"[red]✗ [{i}/{len(sets_to_upload)}] Failed: {sid} (processed.json not found)[/red]"
                )
                logger.exception(f"Failed to upload standard set {sid}")
                failed_count += 1
            except Exception as e:
                console.print(
                    f"[red]✗ [{i}/{len(sets_to_upload)}] Failed: {sid} ({e})[/red]"
                )
                logger.exception(f"Failed to upload standard set {sid}")
                failed_count += 1

        console.print("\n[bold]Upload Summary:[/bold]")
        console.print(f" Uploaded: [green]{uploaded_count}[/green]")
        if skipped_count > 0:
            console.print(f" Skipped: [yellow]{skipped_count}[/yellow]")
        if failed_count > 0:
            console.print(f" Failed: [red]{failed_count}[/red]")

    except typer.Exit:
        # Re-raise explicit exits so they are not reported as unexpected errors.
        raise
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        logger.exception("Failed to upload to Pinecone")
        raise typer.Exit(code=1)


if __name__ == "__main__":
    app()