Spaces:

MCP-1st-Birthday
/

vawlrathh

Running on Zero

App Files Files Community

clduab11 committed 12 days ago

Commit

052ca51

verified ·

1 Parent(s): 2c8c63b

Add Collection Analysis & Untapped.gg Support

Browse files

## 📦 Collection Analysis Feature

This PR introduces comprehensive support for analyzing MTG collections, including:

- **Untapped.gg Support**: Native parsing of Untapped.gg CSV exports (`Id #`, `Name`, `Set`, `Color`, `Rarity`, `Quantity`).
- **Collection Analysis**: Immediate analysis of uploaded collections, providing:
  - Rarity distribution
  - Color breakdown
  - Set distribution
  - Completion score
- **UI Updates**: Enhanced Collection Upload tab with analysis results.

This feature allows users to understand their collection composition and identify gaps, complementing the existing deck analysis tools.

Files changed (4) hide show

app.py +0 -0
src/models/deck.py +139 -125
src/services/collection_analyzer.py +83 -0
src/utils/csv_parser.py +339 -274

app.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

src/models/deck.py CHANGED Viewed

@@ -1,125 +1,139 @@
-"""Deck and card data models."""
-from datetime import datetime
-from typing import List, Dict, Optional
-from dataclasses import dataclass, field
-from pydantic import BaseModel, Field
-class Card(BaseModel):
-    """Represents a single Magic: The Gathering card."""
-    name: str
-    quantity: int
-    card_type: str  # Creature, Instant, Sorcery, Enchantment, Artifact, Land, Planeswalker
-    mana_cost: str  # e.g., "2UU", "GG", "1"
-    cmc: float = 0.0  # Converted mana cost
-    colors: List[str] = Field(default_factory=list)  # W, U, B, R, G
-    rarity: Optional[str] = None  # Common, Uncommon, Rare, Mythic
-    set_code: Optional[str] = None
-class Deck(BaseModel):
-    """Represents a complete MTG Arena deck."""
-    name: str
-    format: str = "Standard"  # Standard, Historic, Explorer, etc.
-    mainboard: List[Card]
-    sideboard: List[Card] = Field(default_factory=list)
-    commander: Optional[Card] = None
-class ManaCurve(BaseModel):
-    """Mana curve analysis for a deck."""
-    distribution: Dict[int, int]  # CMC -> count
-    average_cmc: float
-    median_cmc: float
-    curve_score: float  # 0-100 rating
-class CardSynergy(BaseModel):
-    """Represents synergy between two cards."""
-    card1: str
-    card2: str
-    synergy_type: str  # combo, support, anti-synergy
-    strength: float  # 0-100
-    explanation: str
-class MetaMatchup(BaseModel):
-    """Meta matchup analysis."""
-    archetype: str
-    win_rate: float
-    favorable: bool
-    key_cards: List[str]
-    sideboard_suggestions: List[str]
-class DeckAnalysis(BaseModel):
-    """Complete deck analysis result."""
-    deck_name: str
-    mana_curve: ManaCurve
-    color_distribution: Dict[str, int]
-    card_types: Dict[str, int]
-    synergies: List[CardSynergy]
-    meta_matchups: List[MetaMatchup]
-    strengths: List[str]
-    weaknesses: List[str]
-    overall_score: float  # 0-100
-class DeckSuggestion(BaseModel):
-    """Deck optimization suggestion."""
-    type: str  # add, remove, replace
-    card_name: str
-    quantity: int = 1
-    reason: str
-    impact_score: float  # 0-100
-    replacement_for: Optional[str] = None
-class OptimizedDeck(BaseModel):
-    """Optimized deck with suggestions."""
-    original_deck: Deck
-    suggestions: List[DeckSuggestion]
-    predicted_win_rate: float
-    confidence: float
-@dataclass
-class Collection:
-    """Represents a card collection (not a deck)."""
-    id: Optional[int] = None
-    name: str = "Imported Collection"
-    cards: List[Card] = field(default_factory=list)
-    total_cards: int = 0
-    unique_cards: int = 0
-    created_at: Optional[datetime] = None
-    def __post_init__(self):
-        if not self.total_cards:
-            self.total_cards = sum(card.quantity for card in self.cards)
-        if not self.unique_cards:
-            self.unique_cards = len(self.cards)
-@dataclass
-class CollectionProcessingResult:
-    """Result of processing a collection CSV."""
-    collection_id: Optional[int] = None
-    total_cards: int = 0
-    unique_cards: int = 0
-    total_quantity: int = 0
-    chunks_processed: int = 0
-    chunks_failed: int = 0
-    failed_rows: List[int] = field(default_factory=list)
-    processing_time_seconds: float = 0.0
-    status: str = "pending"  # 'complete', 'partial', 'failed'

+"""Deck and card data models."""
+from datetime import datetime
+from typing import List, Dict, Optional
+from dataclasses import dataclass, field
+from pydantic import BaseModel, Field
+class Card(BaseModel):
+    """Represents a single Magic: The Gathering card.
+    ``name``, ``quantity``, ``card_type`` and ``mana_cost`` have no default
+    and must be supplied; every other field is optional.
+    """
+    name: str
+    quantity: int  # Number of copies of this card
+    card_type: str  # Creature, Instant, Sorcery, Enchantment, Artifact, Land, Planeswalker
+    mana_cost: str  # e.g., "2UU", "GG", "1" (required: None is not an accepted value)
+    cmc: float = 0.0  # Converted mana cost
+    colors: List[str] = Field(default_factory=list)  # W, U, B, R, G; fresh empty list per instance
+    rarity: Optional[str] = None  # Common, Uncommon, Rare, Mythic
+    set_code: Optional[str] = None  # Set identifier; None when not known
+class Deck(BaseModel):
+    """Represents a complete MTG Arena deck.
+    Only ``name`` and ``mainboard`` are required.
+    """
+    name: str
+    format: str = "Standard"  # Standard, Historic, Explorer, etc.
+    mainboard: List[Card]  # Required; no default
+    sideboard: List[Card] = Field(default_factory=list)  # Empty unless supplied
+    commander: Optional[Card] = None  # None when no commander is set
+class ManaCurve(BaseModel):
+    """Mana curve analysis for a deck. All four fields are required."""
+    distribution: Dict[int, int]  # CMC -> count
+    average_cmc: float  # Mean converted mana cost
+    median_cmc: float  # Median converted mana cost
+    curve_score: float  # 0-100 rating
+class CardSynergy(BaseModel):
+    """Represents synergy between two cards. All fields are required."""
+    card1: str  # First card of the pair (presumably its name - confirm with producer)
+    card2: str  # Second card of the pair
+    synergy_type: str  # combo, support, anti-synergy
+    strength: float  # 0-100
+    explanation: str  # Free-text description of the interaction
+class MetaMatchup(BaseModel):
+    """Meta matchup analysis. All fields are required."""
+    archetype: str  # Opposing archetype this entry describes
+    win_rate: float  # NOTE(review): scale (fraction vs. percent) is not fixed here - confirm
+    favorable: bool  # Whether the matchup is considered favorable
+    key_cards: List[str]  # Cards that matter in this matchup
+    sideboard_suggestions: List[str]  # Suggested sideboard cards for this matchup
+class DeckAnalysis(BaseModel):
+    """Complete deck analysis result. All fields are required."""
+    deck_name: str
+    mana_curve: ManaCurve  # Nested mana curve statistics
+    color_distribution: Dict[str, int]  # Color -> count
+    card_types: Dict[str, int]  # Card type -> count
+    synergies: List[CardSynergy]
+    meta_matchups: List[MetaMatchup]
+    strengths: List[str]  # Free-text strengths
+    weaknesses: List[str]  # Free-text weaknesses
+    overall_score: float  # 0-100
+class DeckSuggestion(BaseModel):
+    """Deck optimization suggestion.
+    ``type``, ``card_name``, ``reason`` and ``impact_score`` are required.
+    """
+    type: str  # add, remove, replace
+    card_name: str  # Card the suggestion is about
+    quantity: int = 1  # Number of copies concerned; defaults to a single copy
+    reason: str  # Free-text justification
+    impact_score: float  # 0-100
+    replacement_for: Optional[str] = None  # Presumably the card being swapped out for "replace" - confirm
+class OptimizedDeck(BaseModel):
+    """Optimized deck with suggestions. All fields are required."""
+    original_deck: Deck  # The deck the suggestions apply to
+    suggestions: List[DeckSuggestion]
+    predicted_win_rate: float  # NOTE(review): scale (fraction vs. percent) is not fixed here - confirm
+    confidence: float  # NOTE(review): range is not fixed here - confirm
+@dataclass
+class Collection:
+    """Represents a card collection (not a deck).
+    ``total_cards`` and ``unique_cards`` are derived from ``cards`` right
+    after construction whenever they are left at a falsy value (default 0).
+    """
+    id: Optional[int] = None  # None until an identifier is assigned
+    name: str = "Imported Collection"
+    cards: List[Card] = field(default_factory=list)  # Fresh empty list per instance
+    total_cards: int = 0  # Sum of card quantities when auto-filled
+    unique_cards: int = 0  # Number of entries in ``cards`` when auto-filled
+    created_at: Optional[datetime] = None
+    def __post_init__(self):
+        # Fill the counters only when the caller left them falsy; an explicit
+        # 0 cannot be told apart from "not provided" and is recomputed too.
+        if not self.total_cards:
+            self.total_cards = sum(card.quantity for card in self.cards)
+        if not self.unique_cards:
+            self.unique_cards = len(self.cards)
+@dataclass
+class CollectionProcessingResult:
+    """Result of processing a collection CSV.
+    Every field has a default, so an empty result can be created and filled
+    in as processing progresses.
+    """
+    collection_id: Optional[int] = None  # None until an identifier is assigned
+    total_cards: int = 0
+    unique_cards: int = 0
+    total_quantity: int = 0  # NOTE(review): relation to total_cards is not defined here - confirm
+    chunks_processed: int = 0
+    chunks_failed: int = 0
+    failed_rows: List[int] = field(default_factory=list)  # Row indices that could not be processed
+    processing_time_seconds: float = 0.0
+    status: str = "pending"  # 'complete', 'partial', 'failed'
+@dataclass
+class CollectionAnalysis:
+    """Analysis result for a card collection.
+    ``total_cards`` and ``unique_cards`` are required; the remaining fields
+    default to empty/zero values.
+    """
+    total_cards: int
+    unique_cards: int
+    total_value: float = 0.0  # Estimated value
+    rarity_distribution: Dict[str, int] = field(default_factory=dict)  # Rarity label -> count
+    color_distribution: Dict[str, int] = field(default_factory=dict)  # Color label -> count
+    set_distribution: Dict[str, int] = field(default_factory=dict)  # Set code -> count
+    top_cards: List[Card] = field(default_factory=list)  # Highlighted cards, if any
+    completion_score: float = 0.0  # 0-100 score based on staple coverage

src/services/collection_analyzer.py ADDED Viewed

	@@ -0,0 +1,83 @@

+"""Service for analyzing card collections."""
+import logging
+from collections import Counter
+from typing import List, Dict
+from ..models.deck import Card, Collection, CollectionAnalysis
+logger = logging.getLogger(__name__)
+class CollectionAnalyzer:
+    """Builds summary statistics for a card collection."""
+    async def analyze_collection(self, collection: Collection) -> CollectionAnalysis:
+        """
+        Summarise the rarity, color and set make-up of a collection.
+        Args:
+            collection: The collection to analyze
+        Returns:
+            CollectionAnalysis holding the totals, per-category copy counts,
+            up to 20 mythic/rare "top" cards and a placeholder completion
+            score. A missing or empty collection yields an all-zero result.
+        """
+        owned = collection.cards if collection else None
+        if not owned:
+            return CollectionAnalysis(total_cards=0, unique_cards=0)
+        by_rarity = Counter()
+        by_color = Counter()
+        by_set = Counter()
+        # Simplified "top cards": no staples list yet, so only mythics and rares are collected
+        highlights = {'mythic': [], 'rare': []}
+        copies_total = 0
+        for entry in owned:
+            copies = entry.quantity
+            copies_total += copies
+            # Rarity is tallied under its raw label; the highlight lookup ignores case
+            by_rarity[entry.rarity or 'Unknown'] += copies
+            if entry.rarity:
+                bucket = highlights.get(entry.rarity.lower())
+                if bucket is not None:
+                    bucket.append(entry)
+            # Colors: a card counts once per color; colorless cards are split into lands and the rest
+            if entry.colors:
+                for symbol in entry.colors:
+                    by_color[symbol] += copies
+            elif entry.card_type and 'Land' in entry.card_type:
+                by_color['Lands'] += copies
+            else:
+                by_color['Colorless'] += copies
+            # Sets: cards without a set code are left out of the distribution
+            if entry.set_code:
+                by_set[entry.set_code] += copies
+        # Mythics are listed first, so the stable sort keeps them ahead of rares on equal quantity
+        ranked = highlights['mythic'] + highlights['rare']
+        ranked.sort(key=lambda c: c.quantity, reverse=True)
+        distinct = len(owned)
+        # Placeholder score against an arbitrary 2000-card baseline, capped at 100
+        score = min(100.0, (distinct / 2000) * 100)
+        return CollectionAnalysis(
+            total_cards=copies_total,
+            unique_cards=distinct,
+            total_value=0.0,  # Would need price service integration
+            rarity_distribution=dict(by_rarity),
+            color_distribution=dict(by_color),
+            set_distribution=dict(by_set),
+            top_cards=ranked[:20],
+            completion_score=round(score, 1)
+        )

src/utils/csv_parser.py CHANGED Viewed

@@ -1,274 +1,339 @@
-"""CSV parser for MTG Arena deck exports."""
-import gc
-import re
-import logging
-from typing import List, Tuple, Generator
-from io import StringIO
-import pandas as pd
-from ..models.deck import Card, Deck
-from .mana_calculator import calculate_cmc, parse_mana_cost, extract_colors
-logger = logging.getLogger(__name__)
-def parse_deck_string(deck_string: str) -> Deck:
-    """
-    Parse MTG Arena deck format string.
-    Format examples:
-        4 Lightning Bolt (M11) 146
-        2 Counterspell (MH2) 267
-        20 Island (ZNR) 381
-    """
-    lines = deck_string.strip().split('\n')
-    mainboard = []
-    sideboard = []
-    current_section = mainboard
-    for line in lines:
-        line = line.strip()
-        if not line:
-            continue
-        # Check for sideboard marker
-        if line.lower() in ['sideboard', 'sideboard:']:
-            current_section = sideboard
-            continue
-        # Parse card line: "4 Card Name (SET) 123"
-        # Split by parentheses to avoid ReDoS vulnerability
-        paren_match = re.search(r'\(([A-Z0-9]+)\)\s+(\d+)$', line)
-        if paren_match:
-            # Extract quantity and name from beginning
-            prefix = line[:paren_match.start()].strip()
-            # Use simple split to avoid ReDoS
-            parts = prefix.split(None, 1)  # Split on first whitespace
-            if len(parts) == 2 and parts[0].isdigit():
-                quantity = int(parts[0])
-                card_name = parts[1].strip()
-                set_code = paren_match.group(1)
-                # Determine card type and mana cost (simplified - would need card database)
-                card_type = determine_card_type(card_name)
-                # WARNING: Text format parsing limitation - mana cost is not available
-                # This will result in CMC=0 and no colors for all non-land cards,
-                # which significantly affects deck analysis accuracy.
-                # Use CSV format for accurate mana curve and color analysis.
-                mana_cost = ""  # Would need card database lookup
-                card = Card(
-                    name=card_name,
-                    quantity=quantity,
-                    card_type=card_type,
-                    mana_cost=mana_cost,
-                    cmc=calculate_cmc(mana_cost),
-                    colors=extract_colors(mana_cost),
-                    set_code=set_code
-                )
-                current_section.append(card)
-    return Deck(
-        name="Imported Deck",
-        mainboard=mainboard,
-        sideboard=sideboard
-    )
-def parse_arena_csv(csv_content: str) -> Deck:
-    """
-    Parse CSV export from Steam MTG Arena.
-    Expected CSV format:
-        Quantity,Name,Set,Collector Number,Type,Mana Cost,CMC,Colors,Rarity
-    """
-    # Read CSV
-    df = pd.read_csv(StringIO(csv_content))
-    # Normalize column names
-    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
-    mainboard = []
-    sideboard = []
-    for _, row in df.iterrows():
-        quantity = int(row.get('quantity', 1))
-        name = str(row.get('name', ''))
-        set_code = str(row.get('set', '')) if 'set' in row else None
-        card_type = str(row.get('type', 'Unknown'))
-        mana_cost = str(row.get('mana_cost', ''))
-        # Calculate CMC if not provided
-        if 'cmc' in row:
-            cmc = float(row['cmc'])
-        else:
-            cmc = calculate_cmc(mana_cost)
-        # Extract colors if not provided
-        if 'colors' in row and pd.notna(row['colors']):
-            colors = [c.strip() for c in str(row['colors']).split(',')]
-        else:
-            colors = extract_colors(mana_cost)
-        rarity = str(row.get('rarity', '')) if 'rarity' in row else None
-        is_sideboard = row.get('sideboard', False) if 'sideboard' in row else False
-        card = Card(
-            name=name,
-            quantity=quantity,
-            card_type=card_type,
-            mana_cost=mana_cost,
-            cmc=cmc,
-            colors=colors,
-            rarity=rarity,
-            set_code=set_code
-        )
-        if is_sideboard:
-            sideboard.append(card)
-        else:
-            mainboard.append(card)
-    return Deck(
-        name="CSV Import",
-        mainboard=mainboard,
-        sideboard=sideboard
-    )
-def determine_card_type(card_name: str) -> str:
-    """Determine card type based on name (simplified heuristic)."""
-    # This is a simplified version - in production would use card database
-    name_lower = card_name.lower()
-    if any(land in name_lower for land in ['island', 'mountain', 'forest', 'plains', 'swamp']):
-        return "Land"
-    # Default to Unknown - should be looked up from card database
-    return "Unknown"
-def count_csv_rows(filepath: str) -> int:
-    """
-    Efficiently count rows in CSV without loading into memory.
-    Args:
-        filepath: Path to the CSV file
-    Returns:
-        Number of data rows (excluding header)
-    """
-    with open(filepath, 'r', encoding='utf-8') as f:
-        # Count lines, subtract 1 for header
-        return sum(1 for _ in f) - 1
-def _parse_card_row(row: pd.Series) -> Card:
-    """
-    Parse a single CSV row into a Card object.
-    Args:
-        row: pandas Series with normalized column names
-    Returns:
-        Card object
-    Raises:
-        ValueError: If required fields are missing or invalid
-    """
-    # Handle quantity - default to 1 if missing
-    quantity = int(row.get('quantity', 1) or 1)
-    # Get name - required field
-    name = str(row.get('name', '')).strip()
-    if not name:
-        raise ValueError("Card name is required")
-    # Parse CMC - handle various formats
-    cmc_val = row.get('cmc', 0)
-    try:
-        cmc = int(float(cmc_val)) if pd.notna(cmc_val) else 0
-    except (ValueError, TypeError):
-        cmc = 0
-    # Parse colors - handle string or list
-    colors_raw = row.get('colors', '')
-    if pd.isna(colors_raw):
-        colors = []
-    elif isinstance(colors_raw, str):
-        colors = [c.strip() for c in colors_raw.split(',') if c.strip()]
-    else:
-        colors = list(colors_raw) if colors_raw else []
-    return Card(
-        name=name,
-        quantity=quantity,
-        card_type=str(row.get('type', '')).strip() or None,
-        mana_cost=str(row.get('mana_cost', '')).strip() or None,
-        cmc=cmc,
-        colors=colors,
-        rarity=str(row.get('rarity', '')).strip() or None,
-        set_code=str(row.get('set', '')).strip() or None
-    )
-def parse_arena_csv_chunked(
-    filepath: str,
-    chunk_size: int = 5000
-) -> Generator[Tuple[int, List[Card], List[int]], None, None]:
-    """
-    Parse large CSV file in chunks using pandas chunked reader.
-    This is memory-efficient for large collection CSVs (up to 70K+ rows).
-    Args:
-        filepath: Path to the CSV file
-        chunk_size: Number of rows per chunk (default 5000)
-    Yields:
-        Tuple of (chunk_index, cards_list, failed_row_indices)
-    Example:
-        for chunk_idx, cards, failed in parse_arena_csv_chunked("collection.csv"):
-            all_cards.extend(cards)
-            all_failed.extend(failed)
-    """
-    chunk_iter = pd.read_csv(
-        filepath,
-        chunksize=chunk_size,
-        dtype={
-            'Quantity': 'Int64',  # Nullable integer
-            'Name': 'string',
-            'CMC': 'Float64',     # Nullable float
-        },
-        on_bad_lines='warn',
-        encoding='utf-8'
-    )
-    for chunk_idx, chunk_df in enumerate(chunk_iter):
-        # Normalize column names (lowercase, underscores)
-        chunk_df.columns = (
-            chunk_df.columns.str.strip()
-            .str.lower()
-            .str.replace(' ', '_')
-            .str.replace('-', '_')
-        )
-        cards: List[Card] = []
-        failed_rows: List[int] = []
-        for row_idx, row in chunk_df.iterrows():
-            global_row_idx = chunk_idx * chunk_size + row_idx
-            try:
-                card = _parse_card_row(row)
-                cards.append(card)
-            except Exception as e:
-                failed_rows.append(global_row_idx)
-                logger.warning(f"Failed to parse row {global_row_idx}: {e}")
-        yield chunk_idx, cards, failed_rows
-        # Explicit memory cleanup after each chunk
-        del chunk_df
-        gc.collect()

+"""CSV parser for MTG Arena deck exports."""
+import gc
+import re
+import logging
+from typing import List, Tuple, Generator
+from io import StringIO
+import pandas as pd
+from ..models.deck import Card, Deck
+from .mana_calculator import calculate_cmc, parse_mana_cost, extract_colors
+logger = logging.getLogger(__name__)
+def parse_deck_string(deck_string: str) -> Deck:
+    """
+    Build a Deck from MTG Arena's plain-text deck export.
+    Each card line has the shape "<quantity> <name> (<SET>) <number>":
+        4 Lightning Bolt (M11) 146
+        2 Counterspell (MH2) 267
+        20 Island (ZNR) 381
+    A line reading "Sideboard" or "Sideboard:" (any case) sends every card
+    after it to the sideboard. Lines that do not fit the card shape are
+    skipped without an error.
+    """
+    main_cards = []
+    side_cards = []
+    target = main_cards
+    for raw_line in deck_string.strip().split('\n'):
+        entry = raw_line.strip()
+        if not entry:
+            continue
+        if entry.lower() in ['sideboard', 'sideboard:']:
+            target = side_cards
+            continue
+        # Anchor on the trailing "(SET) number" only; the quantity/name part is
+        # split by hand, which keeps the regex small (no ReDoS exposure)
+        tail = re.search(r'\(([A-Z0-9]+)\)\s+(\d+)$', entry)
+        if not tail:
+            continue
+        head = entry[:tail.start()].strip().split(None, 1)
+        if len(head) != 2 or not head[0].isdigit():
+            continue
+        count_text, title = head
+        title = title.strip()
+        # WARNING: the text format carries no mana cost, so every card parsed
+        # here gets CMC 0 and no colors. Use the CSV format when accurate
+        # mana curve and color analysis is needed.
+        cost = ""  # Would need card database lookup
+        target.append(
+            Card(
+                name=title,
+                quantity=int(count_text),
+                card_type=determine_card_type(title),
+                mana_cost=cost,
+                cmc=calculate_cmc(cost),
+                colors=extract_colors(cost),
+                set_code=tail.group(1)
+            )
+        )
+    return Deck(
+        name="Imported Deck",
+        mainboard=main_cards,
+        sideboard=side_cards
+    )
+def parse_arena_csv(csv_content: str) -> Deck:
+    """
+    Parse CSV export from Steam MTG Arena.
+    Expected CSV format:
+        Quantity,Name,Set,Collector Number,Type,Mana Cost,CMC,Colors,Rarity
+    An optional Sideboard column sends a row to the sideboard when its cell
+    is true-like (True, "true", "yes", "y", "1", "sideboard").
+    Blank cells count as missing: quantity -> 1, type -> "Unknown",
+    mana cost -> "", set/rarity -> None; CMC and colors then fall back to
+    values derived from the mana cost. Rows without a name are skipped
+    with a warning.
+    Args:
+        csv_content: Complete CSV text, header row included
+    Returns:
+        Deck named "CSV Import"
+    Raises:
+        ValueError: If a non-blank Quantity or CMC cell is not numeric
+    """
+    # Read CSV
+    df = pd.read_csv(StringIO(csv_content))
+    # Normalize column names
+    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
+    def is_blank(value) -> bool:
+        # pandas reports empty cells as NaN/NA; a missing column comes back as None
+        return value is None or bool(pd.isna(value))
+    def text(row, key, default):
+        # str(NaN) would produce the literal "nan", so blanks are replaced up front
+        value = row.get(key)
+        if is_blank(value):
+            return default
+        return str(value).strip() or default
+    mainboard = []
+    sideboard = []
+    for row_number, row in df.iterrows():
+        name = text(row, 'name', '')
+        if not name:
+            logger.warning("Skipping CSV row %s: card name is missing", row_number)
+            continue
+        quantity_raw = row.get('quantity')
+        quantity = 1 if is_blank(quantity_raw) else int(quantity_raw)
+        mana_cost = text(row, 'mana_cost', '')
+        # Calculate CMC if not provided (a blank cell counts as not provided)
+        cmc_raw = row.get('cmc')
+        cmc = calculate_cmc(mana_cost) if is_blank(cmc_raw) else float(cmc_raw)
+        # Extract colors if not provided
+        colors_text = text(row, 'colors', '')
+        if colors_text:
+            colors = [c.strip() for c in colors_text.split(',') if c.strip()]
+        else:
+            colors = extract_colors(mana_cost)
+        # A blank cell (NaN) and the text "False" are both truthy in Python,
+        # so the sideboard flag is interpreted explicitly
+        side_raw = row.get('sideboard')
+        if isinstance(side_raw, str):
+            is_sideboard = side_raw.strip().lower() in ('true', 'yes', 'y', '1', 'sideboard')
+        else:
+            is_sideboard = not is_blank(side_raw) and bool(side_raw)
+        card = Card(
+            name=name,
+            quantity=quantity,
+            card_type=text(row, 'type', 'Unknown'),
+            mana_cost=mana_cost,
+            cmc=cmc,
+            colors=colors,
+            rarity=text(row, 'rarity', None),
+            set_code=text(row, 'set', None)
+        )
+        if is_sideboard:
+            sideboard.append(card)
+        else:
+            mainboard.append(card)
+    return Deck(
+        name="CSV Import",
+        mainboard=mainboard,
+        sideboard=sideboard
+    )
+def determine_card_type(card_name: str) -> str:
+    """Guess a card's type from its name alone (simplified heuristic).
+    Returns "Land" when the lower-cased name contains a basic land word,
+    otherwise "Unknown" (a real lookup would need a card database).
+    """
+    lowered = card_name.lower()
+    # Substring match: "Snow-Covered Island" qualifies, and so does any
+    # non-land card whose name happens to contain one of these words
+    for basic in ('island', 'mountain', 'forest', 'plains', 'swamp'):
+        if basic in lowered:
+            return "Land"
+    return "Unknown"
+def count_csv_rows(filepath: str) -> int:
+    """
+    Count the data records in a CSV file without loading it into memory.
+    Records are counted with the csv module, so a quoted field that spans
+    several physical lines counts once, and blank lines are ignored. The
+    first non-blank record is taken to be the header.
+    Args:
+        filepath: Path to the CSV file
+    Returns:
+        Number of data rows (excluding header); 0 for an empty or
+        header-only file
+    """
+    import csv  # Local import keeps this function self-contained
+    # newline='' lets the csv module handle line breaks inside quoted fields
+    with open(filepath, 'r', encoding='utf-8', newline='') as f:
+        records = sum(1 for record in csv.reader(f) if record)
+    # Never report a negative count for an empty file
+    return max(records - 1, 0)
+def _parse_card_row(row: pd.Series) -> Card:
+    """
+    Parse a single CSV row into a Card object.
+    Blank cells (None, NaN, pd.NA) are treated as missing: quantity -> 1,
+    type -> "Unknown", mana cost -> "", CMC -> 0, colors -> [],
+    rarity/set -> None.
+    Args:
+        row: pandas Series with normalized column names
+    Returns:
+        Card object
+    Raises:
+        ValueError: If the card name is missing or the quantity is not an
+            integer
+    """
+    def is_blank(value) -> bool:
+        return value is None or bool(pd.isna(value))
+    def text(key: str) -> str:
+        # str(NaN) is "nan" and str(pd.NA) is "<NA>"; blanks must become ""
+        # before stringifying or they turn into bogus card data
+        value = row.get(key)
+        if isinstance(value, str):
+            return value.strip()
+        if is_blank(value):
+            return ''
+        return str(value).strip()
+    # Handle quantity - default to 1 if missing, blank or zero.
+    # The blank check comes first because bool(pd.NA) raises TypeError.
+    quantity_raw = row.get('quantity')
+    if is_blank(quantity_raw) or not quantity_raw:
+        quantity = 1
+    else:
+        quantity = int(quantity_raw)
+    # Get name - required field
+    name = text('name')
+    if not name:
+        raise ValueError("Card name is required")
+    # Parse CMC - handle various formats; anything unusable becomes 0
+    cmc_raw = row.get('cmc')
+    try:
+        cmc = 0 if is_blank(cmc_raw) else int(float(cmc_raw))
+    except (ValueError, TypeError, OverflowError):
+        cmc = 0
+    # Parse colors - handle string or list
+    colors_raw = row.get('colors', '')
+    if isinstance(colors_raw, str):
+        label = colors_raw.strip()
+        # Check if it's an Untapped.gg color name
+        if label.lower() in ('gold', 'white', 'blue', 'black', 'red', 'green', 'colorless'):
+            colors = _parse_untapped_colors(label)
+        else:
+            # Standard Arena format (W, U, B, etc.)
+            colors = [c.strip() for c in label.split(',') if c.strip()]
+    elif isinstance(colors_raw, (list, tuple, set)):
+        colors = list(colors_raw)
+    else:
+        # None / NaN / pd.NA or any other non-text scalar
+        colors = []
+    return Card(
+        name=name,
+        quantity=quantity,
+        card_type=text('type') or "Unknown",
+        # Card.mana_cost is a required str, so a missing cost is passed as an
+        # empty string; None would be rejected by model validation
+        mana_cost=text('mana_cost'),
+        cmc=cmc,
+        colors=colors,
+        rarity=text('rarity') or None,
+        set_code=text('set') or None
+    )
+def parse_arena_csv_chunked(
+    filepath: str,
+    chunk_size: int = 5000
+) -> Generator[Tuple[int, List[Card], List[int]], None, None]:
+    """
+    Parse large CSV file in chunks using pandas chunked reader.
+    This is memory-efficient for large collection CSVs (up to 70K+ rows).
+    Untapped.gg exports (recognised by their "Id #" and "Color" columns) are
+    mapped onto the internal column names before rows are parsed.
+    Args:
+        filepath: Path to the CSV file
+        chunk_size: Number of rows per chunk (default 5000)
+    Yields:
+        Tuple of (chunk_index, cards_list, failed_row_indices); the failed
+        indices are 0-based positions among the rows pandas parsed, counted
+        over the whole file rather than per chunk
+    Example:
+        for chunk_idx, cards, failed in parse_arena_csv_chunked("collection.csv"):
+            all_cards.extend(cards)
+            all_failed.extend(failed)
+    """
+    chunk_iter = pd.read_csv(
+        filepath,
+        chunksize=chunk_size,
+        dtype={
+            'Quantity': 'Int64',  # Nullable integer
+            'Name': 'string',
+            'CMC': 'Float64',     # Nullable float
+        },
+        on_bad_lines='warn',
+        encoding='utf-8'
+    )
+    for chunk_idx, chunk_df in enumerate(chunk_iter):
+        # Normalize column names (lowercase, underscores)
+        chunk_df.columns = (
+            chunk_df.columns.str.strip()
+            .str.lower()
+            .str.replace(' ', '_')
+            .str.replace('-', '_')
+            .str.replace('#', '')  # "Id #" -> "id_#" -> "id_"
+        )
+        # Apply Untapped.gg normalization if needed
+        if 'id_' in chunk_df.columns and 'color' in chunk_df.columns:
+            chunk_df = _normalize_untapped_columns(chunk_df)
+        cards: List[Card] = []
+        failed_rows: List[int] = []
+        for row_idx, row in chunk_df.iterrows():
+            # The chunked reader keeps numbering rows across chunks, so the
+            # index label already is the file-wide row position; adding
+            # chunk_idx * chunk_size again would double the offset
+            global_row_idx = int(row_idx)
+            try:
+                card = _parse_card_row(row)
+                cards.append(card)
+            except Exception as e:
+                # Best effort: one bad row must not abort the whole import
+                failed_rows.append(global_row_idx)
+                logger.warning("Failed to parse row %s: %s", global_row_idx, e)
+        yield chunk_idx, cards, failed_rows
+        # Explicit memory cleanup after each chunk
+        del chunk_df
+        gc.collect()
+def _normalize_untapped_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Normalize Untapped.gg column names to standard internal format.
+    Untapped.gg format: Id #, Name, Set, Color, Rarity, Quantity
+    Target format: collector_number, name, set, colors, rarity, quantity
+    Column names are expected to be lower-cased and underscored already.
+    "Id #" is accepted both as "id_" (after '#' has been stripped) and as
+    "id_#".
+    Args:
+        df: DataFrame whose columns should be renamed
+    Returns:
+        New DataFrame with the renamed columns; the input is not modified
+    """
+    # Only the columns whose names differ need an entry; name, set, rarity
+    # and quantity already match the internal names
+    column_map = {
+        'id_': 'collector_number',
+        'id_#': 'collector_number',
+        'color': 'colors',
+    }
+    # rename() ignores keys that are not present in the frame
+    return df.rename(columns=column_map)
+def _parse_untapped_colors(color_str: str) -> List[str]:
+    """Translate an Untapped.gg color label (e.g., 'Gold', 'Blue', 'Red') into WUBRG codes.
+    Returns a one-element list for the five single colors and an empty list
+    for everything else: missing or non-text input, 'colorless', unknown
+    labels, and 'gold'. Gold means multicolor, but the exact colors cannot
+    be determined without card data, so no colors are reported for it.
+    """
+    if not isinstance(color_str, str):
+        # Covers None / NaN / pd.NA as well as any other non-text value
+        return []
+    single_color_codes = {
+        'white': 'W',
+        'blue': 'U',
+        'black': 'B',
+        'red': 'R',
+        'green': 'G',
+    }
+    code = single_color_codes.get(color_str.lower().strip())
+    return [code] if code else []