Spaces:

MCP-1st-Birthday
/

vawlrathh

Running on Zero

App Files Files Community

Add Collection Analysis & Untapped.gg Support

#52

by clduab11 - opened 13 days ago

base: refs/heads/main

←

from: refs/pr/52

Discussion Files changed

+2206

-1998

Files changed (4) hide show

app.py +0 -0
src/models/deck.py +139 -125
src/services/collection_analyzer.py +83 -0
src/utils/csv_parser.py +339 -274

app.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

src/models/deck.py CHANGED Viewed

@@ -1,125 +1,139 @@
-"""Deck and card data models."""
-from datetime import datetime
-from typing import List, Dict, Optional
-from dataclasses import dataclass, field
-from pydantic import BaseModel, Field
-class Card(BaseModel):
-    """Represents a single Magic: The Gathering card."""
-    name: str
-    quantity: int
-    card_type: str  # Creature, Instant, Sorcery, Enchantment, Artifact, Land, Planeswalker
-    mana_cost: str  # e.g., "2UU", "GG", "1"
-    cmc: float = 0.0  # Converted mana cost
-    colors: List[str] = Field(default_factory=list)  # W, U, B, R, G
-    rarity: Optional[str] = None  # Common, Uncommon, Rare, Mythic
-    set_code: Optional[str] = None
-class Deck(BaseModel):
-    """Represents a complete MTG Arena deck."""
-    name: str
-    format: str = "Standard"  # Standard, Historic, Explorer, etc.
-    mainboard: List[Card]
-    sideboard: List[Card] = Field(default_factory=list)
-    commander: Optional[Card] = None
-class ManaCurve(BaseModel):
-    """Mana curve analysis for a deck."""
-    distribution: Dict[int, int]  # CMC -> count
-    average_cmc: float
-    median_cmc: float
-    curve_score: float  # 0-100 rating
-class CardSynergy(BaseModel):
-    """Represents synergy between two cards."""
-    card1: str
-    card2: str
-    synergy_type: str  # combo, support, anti-synergy
-    strength: float  # 0-100
-    explanation: str
-class MetaMatchup(BaseModel):
-    """Meta matchup analysis."""
-    archetype: str
-    win_rate: float
-    favorable: bool
-    key_cards: List[str]
-    sideboard_suggestions: List[str]
-class DeckAnalysis(BaseModel):
-    """Complete deck analysis result."""
-    deck_name: str
-    mana_curve: ManaCurve
-    color_distribution: Dict[str, int]
-    card_types: Dict[str, int]
-    synergies: List[CardSynergy]
-    meta_matchups: List[MetaMatchup]
-    strengths: List[str]
-    weaknesses: List[str]
-    overall_score: float  # 0-100
-class DeckSuggestion(BaseModel):
-    """Deck optimization suggestion."""
-    type: str  # add, remove, replace
-    card_name: str
-    quantity: int = 1
-    reason: str
-    impact_score: float  # 0-100
-    replacement_for: Optional[str] = None
-class OptimizedDeck(BaseModel):
-    """Optimized deck with suggestions."""
-    original_deck: Deck
-    suggestions: List[DeckSuggestion]
-    predicted_win_rate: float
-    confidence: float
-@dataclass
-class Collection:
-    """Represents a card collection (not a deck)."""
-    id: Optional[int] = None
-    name: str = "Imported Collection"
-    cards: List[Card] = field(default_factory=list)
-    total_cards: int = 0
-    unique_cards: int = 0
-    created_at: Optional[datetime] = None
-    def __post_init__(self):
-        if not self.total_cards:
-            self.total_cards = sum(card.quantity for card in self.cards)
-        if not self.unique_cards:
-            self.unique_cards = len(self.cards)
-@dataclass
-class CollectionProcessingResult:
-    """Result of processing a collection CSV."""
-    collection_id: Optional[int] = None
-    total_cards: int = 0
-    unique_cards: int = 0
-    total_quantity: int = 0
-    chunks_processed: int = 0
-    chunks_failed: int = 0
-    failed_rows: List[int] = field(default_factory=list)
-    processing_time_seconds: float = 0.0
-    status: str = "pending"  # 'complete', 'partial', 'failed'

+"""Deck and card data models."""
+from datetime import datetime
+from typing import List, Dict, Optional
+from dataclasses import dataclass, field
+from pydantic import BaseModel, Field
class Card(BaseModel):
    """Represents a single Magic: The Gathering card.

    ``name``, ``quantity``, ``card_type`` and ``mana_cost`` declare no default
    and must be supplied; the remaining fields fall back to the defaults below.
    """
    name: str
    quantity: int  # Number of copies of this card
    card_type: str  # Creature, Instant, Sorcery, Enchantment, Artifact, Land, Planeswalker
    mana_cost: str  # e.g., "2UU", "GG", "1" (declared as str, not Optional)
    cmc: float = 0.0  # Converted mana cost
    colors: List[str] = Field(default_factory=list)  # W, U, B, R, G
    rarity: Optional[str] = None  # Common, Uncommon, Rare, Mythic
    set_code: Optional[str] = None  # Set abbreviation, e.g. "M11"
class Deck(BaseModel):
    """Represents a complete MTG Arena deck.

    Only ``name`` and ``mainboard`` are required; the sideboard defaults to an
    empty list and the commander slot to ``None``.
    """
    name: str
    format: str = "Standard"  # Standard, Historic, Explorer, etc.
    mainboard: List[Card]
    sideboard: List[Card] = Field(default_factory=list)  # Fresh list per instance
    commander: Optional[Card] = None
class ManaCurve(BaseModel):
    """Mana curve analysis for a deck.

    Every field is required; none declares a default.
    """
    distribution: Dict[int, int]  # CMC -> count
    average_cmc: float
    median_cmc: float
    curve_score: float  # 0-100 rating
class CardSynergy(BaseModel):
    """Represents synergy between two cards.

    Every field is required; none declares a default.
    """
    card1: str  # Stored as plain strings, not Card objects
    card2: str
    synergy_type: str  # combo, support, anti-synergy
    strength: float  # 0-100
    explanation: str
class MetaMatchup(BaseModel):
    """Meta matchup analysis.

    Every field is required; none declares a default.
    """
    archetype: str
    win_rate: float  # NOTE(review): scale (0-1 vs 0-100) is not stated here - confirm
    favorable: bool
    key_cards: List[str]
    sideboard_suggestions: List[str]
class DeckAnalysis(BaseModel):
    """Complete deck analysis result.

    Aggregates a ``ManaCurve``, per-key count maps, ``CardSynergy`` and
    ``MetaMatchup`` lists, and free-text findings. Every field is required.
    """
    deck_name: str
    mana_curve: ManaCurve
    color_distribution: Dict[str, int]
    card_types: Dict[str, int]
    synergies: List[CardSynergy]
    meta_matchups: List[MetaMatchup]
    strengths: List[str]
    weaknesses: List[str]
    overall_score: float  # 0-100
class DeckSuggestion(BaseModel):
    """Deck optimization suggestion.

    ``quantity`` defaults to 1 and ``replacement_for`` to ``None``; the other
    fields are required.
    """
    type: str  # add, remove, replace
    card_name: str
    quantity: int = 1
    reason: str
    impact_score: float  # 0-100
    replacement_for: Optional[str] = None  # presumably set for "replace" suggestions - confirm
class OptimizedDeck(BaseModel):
    """Optimized deck with suggestions.

    Pairs the untouched input ``Deck`` with a list of ``DeckSuggestion``
    entries. Every field is required.
    """
    original_deck: Deck
    suggestions: List[DeckSuggestion]
    predicted_win_rate: float  # NOTE(review): scale is not stated here - confirm
    confidence: float  # NOTE(review): scale is not stated here - confirm
@dataclass
class Collection:
    """A set of owned cards, as opposed to a playable deck.

    ``total_cards`` and ``unique_cards`` may be supplied by the caller. When
    either is left falsy (the default ``0``) it is derived from ``cards``
    right after construction.
    """
    id: Optional[int] = None
    name: str = "Imported Collection"
    cards: List[Card] = field(default_factory=list)
    total_cards: int = 0
    unique_cards: int = 0
    created_at: Optional[datetime] = None

    def __post_init__(self):
        """Fill in whichever count was not provided."""
        # `or` keeps a caller-supplied non-zero value and recomputes otherwise.
        self.total_cards = self.total_cards or sum(
            entry.quantity for entry in self.cards
        )
        self.unique_cards = self.unique_cards or len(self.cards)
@dataclass
class CollectionProcessingResult:
    """Result of processing a collection CSV.

    Every field has a default, so an instance can be created empty and filled
    in while processing runs.
    """
    collection_id: Optional[int] = None
    total_cards: int = 0
    unique_cards: int = 0
    total_quantity: int = 0
    chunks_processed: int = 0
    chunks_failed: int = 0
    failed_rows: List[int] = field(default_factory=list)  # Fresh list per instance
    processing_time_seconds: float = 0.0
    # Starts as "pending", which is not among the values listed below.
    status: str = "pending"  # 'complete', 'partial', 'failed'
@dataclass
class CollectionAnalysis:
    """Analysis result for a card collection.

    ``total_cards`` and ``unique_cards`` are required; every other field has a
    default, so an empty analysis needs only those two counts.
    """
    total_cards: int
    unique_cards: int
    total_value: float = 0.0  # Estimated value
    # Each distribution maps a key (rarity / color / set code) to a count.
    rarity_distribution: Dict[str, int] = field(default_factory=dict)
    color_distribution: Dict[str, int] = field(default_factory=dict)
    set_distribution: Dict[str, int] = field(default_factory=dict)
    top_cards: List[Card] = field(default_factory=list)
    completion_score: float = 0.0  # 0-100 score based on staple coverage

src/services/collection_analyzer.py ADDED Viewed

	@@ -0,0 +1,83 @@

+"""Service for analyzing card collections."""
+import logging
+from collections import Counter
+from typing import List, Dict
+from ..models.deck import Card, Collection, CollectionAnalysis
+logger = logging.getLogger(__name__)
class CollectionAnalyzer:
    """Computes aggregate statistics for a card collection."""

    async def analyze_collection(self, collection: Collection) -> CollectionAnalysis:
        """
        Summarize a collection by rarity, color and set.

        Args:
            collection: The collection to analyze

        Returns:
            CollectionAnalysis with copy-weighted distributions, up to 20
            "top" cards and a placeholder completion score. A missing or
            empty collection yields an analysis with zero counts.
        """
        if not collection or not collection.cards:
            return CollectionAnalysis(total_cards=0, unique_cards=0)

        owned = collection.cards
        by_rarity = Counter()
        by_color = Counter()
        by_set = Counter()
        # Candidates for the "top cards" list; a real implementation would
        # compare against a staples list instead.
        mythic_cards = []
        rare_cards = []

        for entry in owned:
            copies = entry.quantity

            # Rarity: counted under the label as given, matched case-insensitively.
            rarity = entry.rarity
            if not rarity:
                by_rarity['Unknown'] += copies
            else:
                by_rarity[rarity] += copies
                rarity_key = rarity.lower()
                if rarity_key == 'mythic':
                    mythic_cards.append(entry)
                elif rarity_key == 'rare':
                    rare_cards.append(entry)

            # Colors: a multicolor card counts once per color it has.
            if entry.colors:
                for symbol in entry.colors:
                    by_color[symbol] += copies
            elif entry.card_type and 'Land' in entry.card_type:
                by_color['Lands'] += copies
            else:
                by_color['Colorless'] += copies

            # Sets
            if entry.set_code:
                by_set[entry.set_code] += copies

        # Mythics are listed before rares; the stable sort keeps that order
        # among cards with equal quantity.
        ranked = mythic_cards + rare_cards
        ranked.sort(key=lambda c: c.quantity, reverse=True)

        distinct = len(owned)
        # Placeholder: share of an arbitrary 2000-card baseline, capped at 100.
        score = min(100.0, (distinct / 2000) * 100)

        return CollectionAnalysis(
            total_cards=sum(c.quantity for c in owned),
            unique_cards=distinct,
            total_value=0.0,  # Would need price service integration
            rarity_distribution=dict(by_rarity),
            color_distribution=dict(by_color),
            set_distribution=dict(by_set),
            top_cards=ranked[:20],
            completion_score=round(score, 1)
        )

src/utils/csv_parser.py CHANGED Viewed

@@ -1,274 +1,339 @@
-"""CSV parser for MTG Arena deck exports."""
-import gc
-import re
-import logging
-from typing import List, Tuple, Generator
-from io import StringIO
-import pandas as pd
-from ..models.deck import Card, Deck
-from .mana_calculator import calculate_cmc, parse_mana_cost, extract_colors
-logger = logging.getLogger(__name__)
-def parse_deck_string(deck_string: str) -> Deck:
-    """
-    Parse MTG Arena deck format string.
-    Format examples:
-        4 Lightning Bolt (M11) 146
-        2 Counterspell (MH2) 267
-        20 Island (ZNR) 381
-    """
-    lines = deck_string.strip().split('\n')
-    mainboard = []
-    sideboard = []
-    current_section = mainboard
-    for line in lines:
-        line = line.strip()
-        if not line:
-            continue
-        # Check for sideboard marker
-        if line.lower() in ['sideboard', 'sideboard:']:
-            current_section = sideboard
-            continue
-        # Parse card line: "4 Card Name (SET) 123"
-        # Split by parentheses to avoid ReDoS vulnerability
-        paren_match = re.search(r'\(([A-Z0-9]+)\)\s+(\d+)$', line)
-        if paren_match:
-            # Extract quantity and name from beginning
-            prefix = line[:paren_match.start()].strip()
-            # Use simple split to avoid ReDoS
-            parts = prefix.split(None, 1)  # Split on first whitespace
-            if len(parts) == 2 and parts[0].isdigit():
-                quantity = int(parts[0])
-                card_name = parts[1].strip()
-                set_code = paren_match.group(1)
-                # Determine card type and mana cost (simplified - would need card database)
-                card_type = determine_card_type(card_name)
-                # WARNING: Text format parsing limitation - mana cost is not available
-                # This will result in CMC=0 and no colors for all non-land cards,
-                # which significantly affects deck analysis accuracy.
-                # Use CSV format for accurate mana curve and color analysis.
-                mana_cost = ""  # Would need card database lookup
-                card = Card(
-                    name=card_name,
-                    quantity=quantity,
-                    card_type=card_type,
-                    mana_cost=mana_cost,
-                    cmc=calculate_cmc(mana_cost),
-                    colors=extract_colors(mana_cost),
-                    set_code=set_code
-                )
-                current_section.append(card)
-    return Deck(
-        name="Imported Deck",
-        mainboard=mainboard,
-        sideboard=sideboard
-    )
-def parse_arena_csv(csv_content: str) -> Deck:
-    """
-    Parse CSV export from Steam MTG Arena.
-    Expected CSV format:
-        Quantity,Name,Set,Collector Number,Type,Mana Cost,CMC,Colors,Rarity
-    """
-    # Read CSV
-    df = pd.read_csv(StringIO(csv_content))
-    # Normalize column names
-    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
-    mainboard = []
-    sideboard = []
-    for _, row in df.iterrows():
-        quantity = int(row.get('quantity', 1))
-        name = str(row.get('name', ''))
-        set_code = str(row.get('set', '')) if 'set' in row else None
-        card_type = str(row.get('type', 'Unknown'))
-        mana_cost = str(row.get('mana_cost', ''))
-        # Calculate CMC if not provided
-        if 'cmc' in row:
-            cmc = float(row['cmc'])
-        else:
-            cmc = calculate_cmc(mana_cost)
-        # Extract colors if not provided
-        if 'colors' in row and pd.notna(row['colors']):
-            colors = [c.strip() for c in str(row['colors']).split(',')]
-        else:
-            colors = extract_colors(mana_cost)
-        rarity = str(row.get('rarity', '')) if 'rarity' in row else None
-        is_sideboard = row.get('sideboard', False) if 'sideboard' in row else False
-        card = Card(
-            name=name,
-            quantity=quantity,
-            card_type=card_type,
-            mana_cost=mana_cost,
-            cmc=cmc,
-            colors=colors,
-            rarity=rarity,
-            set_code=set_code
-        )
-        if is_sideboard:
-            sideboard.append(card)
-        else:
-            mainboard.append(card)
-    return Deck(
-        name="CSV Import",
-        mainboard=mainboard,
-        sideboard=sideboard
-    )
-def determine_card_type(card_name: str) -> str:
-    """Determine card type based on name (simplified heuristic)."""
-    # This is a simplified version - in production would use card database
-    name_lower = card_name.lower()
-    if any(land in name_lower for land in ['island', 'mountain', 'forest', 'plains', 'swamp']):
-        return "Land"
-    # Default to Unknown - should be looked up from card database
-    return "Unknown"
-def count_csv_rows(filepath: str) -> int:
-    """
-    Efficiently count rows in CSV without loading into memory.
-    Args:
-        filepath: Path to the CSV file
-    Returns:
-        Number of data rows (excluding header)
-    """
-    with open(filepath, 'r', encoding='utf-8') as f:
-        # Count lines, subtract 1 for header
-        return sum(1 for _ in f) - 1
-def _parse_card_row(row: pd.Series) -> Card:
-    """
-    Parse a single CSV row into a Card object.
-    Args:
-        row: pandas Series with normalized column names
-    Returns:
-        Card object
-    Raises:
-        ValueError: If required fields are missing or invalid
-    """
-    # Handle quantity - default to 1 if missing
-    quantity = int(row.get('quantity', 1) or 1)
-    # Get name - required field
-    name = str(row.get('name', '')).strip()
-    if not name:
-        raise ValueError("Card name is required")
-    # Parse CMC - handle various formats
-    cmc_val = row.get('cmc', 0)
-    try:
-        cmc = int(float(cmc_val)) if pd.notna(cmc_val) else 0
-    except (ValueError, TypeError):
-        cmc = 0
-    # Parse colors - handle string or list
-    colors_raw = row.get('colors', '')
-    if pd.isna(colors_raw):
-        colors = []
-    elif isinstance(colors_raw, str):
-        colors = [c.strip() for c in colors_raw.split(',') if c.strip()]
-    else:
-        colors = list(colors_raw) if colors_raw else []
-    return Card(
-        name=name,
-        quantity=quantity,
-        card_type=str(row.get('type', '')).strip() or None,
-        mana_cost=str(row.get('mana_cost', '')).strip() or None,
-        cmc=cmc,
-        colors=colors,
-        rarity=str(row.get('rarity', '')).strip() or None,
-        set_code=str(row.get('set', '')).strip() or None
-    )
-def parse_arena_csv_chunked(
-    filepath: str,
-    chunk_size: int = 5000
-) -> Generator[Tuple[int, List[Card], List[int]], None, None]:
-    """
-    Parse large CSV file in chunks using pandas chunked reader.
-    This is memory-efficient for large collection CSVs (up to 70K+ rows).
-    Args:
-        filepath: Path to the CSV file
-        chunk_size: Number of rows per chunk (default 5000)
-    Yields:
-        Tuple of (chunk_index, cards_list, failed_row_indices)
-    Example:
-        for chunk_idx, cards, failed in parse_arena_csv_chunked("collection.csv"):
-            all_cards.extend(cards)
-            all_failed.extend(failed)
-    """
-    chunk_iter = pd.read_csv(
-        filepath,
-        chunksize=chunk_size,
-        dtype={
-            'Quantity': 'Int64',  # Nullable integer
-            'Name': 'string',
-            'CMC': 'Float64',     # Nullable float
-        },
-        on_bad_lines='warn',
-        encoding='utf-8'
-    )
-    for chunk_idx, chunk_df in enumerate(chunk_iter):
-        # Normalize column names (lowercase, underscores)
-        chunk_df.columns = (
-            chunk_df.columns.str.strip()
-            .str.lower()
-            .str.replace(' ', '_')
-            .str.replace('-', '_')
-        )
-        cards: List[Card] = []
-        failed_rows: List[int] = []
-        for row_idx, row in chunk_df.iterrows():
-            global_row_idx = chunk_idx * chunk_size + row_idx
-            try:
-                card = _parse_card_row(row)
-                cards.append(card)
-            except Exception as e:
-                failed_rows.append(global_row_idx)
-                logger.warning(f"Failed to parse row {global_row_idx}: {e}")
-        yield chunk_idx, cards, failed_rows
-        # Explicit memory cleanup after each chunk
-        del chunk_df
-        gc.collect()

+"""CSV parser for MTG Arena deck exports."""
+import gc
+import re
+import logging
+from typing import List, Tuple, Generator
+from io import StringIO
+import pandas as pd
+from ..models.deck import Card, Deck
+from .mana_calculator import calculate_cmc, parse_mana_cost, extract_colors
+logger = logging.getLogger(__name__)
def parse_deck_string(deck_string: str) -> Deck:
    """
    Build a Deck from MTG Arena's plain-text deck export.

    Each card line looks like ``<qty> <name> (<SET>) <collector number>``:
        4 Lightning Bolt (M11) 146
        2 Counterspell (MH2) 267
        20 Island (ZNR) 381

    A line reading ``sideboard`` or ``sideboard:`` (any case) sends all
    following cards to the sideboard. Blank lines and lines that do not match
    the card shape are skipped without error.
    """
    boards = {'main': [], 'side': []}
    active = 'main'

    for raw in deck_string.strip().split('\n'):
        entry = raw.strip()
        if not entry:
            continue

        if entry.lower() in ['sideboard', 'sideboard:']:
            active = 'side'
            continue

        # Anchor on the trailing "(SET) 123" and split the rest by hand; this
        # avoids a backtracking-prone pattern for the free-form card name.
        tail = re.search(r'\(([A-Z0-9]+)\)\s+(\d+)$', entry)
        if not tail:
            continue

        head = entry[:tail.start()].strip().split(None, 1)
        if len(head) != 2 or not head[0].isdigit():
            continue

        count_text, title = head
        title = title.strip()
        kind = determine_card_type(title)

        # WARNING: the text format carries no mana cost, so every card gets
        # CMC 0 and no colors. Use the CSV format for accurate mana curve and
        # color analysis.
        mana_cost = ""  # Would need card database lookup

        boards[active].append(Card(
            name=title,
            quantity=int(count_text),
            card_type=kind,
            mana_cost=mana_cost,
            cmc=calculate_cmc(mana_cost),
            colors=extract_colors(mana_cost),
            set_code=tail.group(1)
        ))

    return Deck(
        name="Imported Deck",
        mainboard=boards['main'],
        sideboard=boards['side']
    )
def parse_arena_csv(csv_content: str) -> Deck:
    """
    Parse CSV export from Steam MTG Arena.

    Expected CSV format:
        Quantity,Name,Set,Collector Number,Type,Mana Cost,CMC,Colors,Rarity

    Column names are matched after trimming, lowercasing and replacing spaces
    with underscores. An optional ``sideboard`` column routes a row to the
    sideboard.

    Blank cells are treated the same as a missing column: quantity falls back
    to 1, type to "Unknown", mana cost to "", set and rarity to ``None``, and
    CMC / colors are derived from the mana cost. Rows without a card name are
    skipped with a warning.

    Args:
        csv_content: The full CSV text, header row included

    Returns:
        Deck named "CSV Import"

    Raises:
        ValueError: If a non-blank quantity or CMC cell is not numeric
    """
    df = pd.read_csv(StringIO(csv_content))
    # Normalize column names
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

    def cell(row, key, default=None):
        """Return row[key], or *default* when the column is absent or the cell is blank."""
        value = row.get(key, default)
        # pandas reports blank cells as NaN; str(NaN) would otherwise leak the
        # literal text "nan" into the card.
        return default if pd.isna(value) else value

    mainboard = []
    sideboard = []
    for row_idx, row in df.iterrows():
        name = str(cell(row, 'name', '')).strip()
        if not name:
            logger.warning(f"Skipping row {row_idx}: card name is missing")
            continue

        quantity = int(cell(row, 'quantity', 1))
        card_type = str(cell(row, 'type', 'Unknown'))
        mana_cost = str(cell(row, 'mana_cost', ''))

        set_raw = cell(row, 'set')
        set_code = None if set_raw is None else str(set_raw)
        rarity_raw = cell(row, 'rarity')
        rarity = None if rarity_raw is None else str(rarity_raw)

        # Use the CMC column when filled in, otherwise derive it
        cmc_raw = cell(row, 'cmc')
        cmc = calculate_cmc(mana_cost) if cmc_raw is None else float(cmc_raw)

        # Use the colors column when filled in, otherwise derive it
        colors_raw = cell(row, 'colors')
        if colors_raw is None:
            colors = extract_colors(mana_cost)
        else:
            colors = [c.strip() for c in str(colors_raw).split(',') if c.strip()]

        # A text cell such as "False" is truthy as a string, so interpret
        # text explicitly instead of relying on truthiness.
        side_raw = cell(row, 'sideboard', False)
        if isinstance(side_raw, str):
            is_sideboard = side_raw.strip().lower() in ('true', '1', 'yes', 'y')
        else:
            is_sideboard = bool(side_raw)

        card = Card(
            name=name,
            quantity=quantity,
            card_type=card_type,
            mana_cost=mana_cost,
            cmc=cmc,
            colors=colors,
            rarity=rarity,
            set_code=set_code
        )
        if is_sideboard:
            sideboard.append(card)
        else:
            mainboard.append(card)

    return Deck(
        name="CSV Import",
        mainboard=mainboard,
        sideboard=sideboard
    )
def determine_card_type(card_name: str) -> str:
    """Guess a card's type from its name alone.

    Returns "Land" when the lowercased name contains a basic land word
    (island, mountain, forest, plains, swamp) anywhere in it, otherwise
    "Unknown". This is a stand-in for a real card database lookup.
    """
    lowered = card_name.lower()
    for basic in ('island', 'mountain', 'forest', 'plains', 'swamp'):
        if basic in lowered:
            return "Land"
    # No hint in the name - should be looked up from card database
    return "Unknown"
def count_csv_rows(filepath: str) -> int:
    """
    Count data rows in a CSV by streaming it line by line.

    Blank lines are ignored, matching pandas' default of skipping them, and
    the result never goes below zero (an empty file used to yield -1).

    This counts physical lines, so a quoted field that contains a newline is
    counted once per line it spans.

    Args:
        filepath: Path to the CSV file

    Returns:
        Number of non-blank lines after the header; 0 for an empty or
        header-only file
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        non_blank = sum(1 for line in f if line.strip())
    # Subtract the header, but do not report a negative count.
    return max(0, non_blank - 1)
def _parse_card_row(row: pd.Series) -> Card:
    """
    Parse a single CSV row into a Card object.

    Blank cells (NaN / ``pd.NA``) are treated like absent columns rather than
    being stringified, so a missing name is rejected and missing type, rarity
    and set do not turn into the text "nan".

    Args:
        row: pandas Series with normalized column names

    Returns:
        Card object. ``card_type`` falls back to "Unknown" and ``mana_cost``
        to "" (the Card model declares both as plain ``str``); ``rarity`` and
        ``set_code`` fall back to ``None``.

    Raises:
        ValueError: If the card name is missing or the quantity is not numeric
    """
    def text(key: str) -> str:
        """Return the stripped text in *key*, or '' for an absent or blank cell."""
        value = row.get(key)
        if value is None or pd.isna(value):
            return ''
        return str(value).strip()

    # Quantity defaults to 1 when missing. The check is explicit because
    # `pd.NA or 1` raises and `NaN or 1` evaluates to NaN.
    qty_raw = row.get('quantity')
    if qty_raw is None or pd.isna(qty_raw):
        quantity = 1
    else:
        quantity = int(qty_raw) or 1  # a zero quantity is also bumped to 1

    # Name is the only required field
    name = text('name')
    if not name:
        raise ValueError("Card name is required")

    # CMC - kept as float to match Card.cmc; anything unparseable becomes 0
    cmc_raw = row.get('cmc', 0)
    try:
        cmc = float(cmc_raw) if pd.notna(cmc_raw) else 0.0
    except (ValueError, TypeError):
        cmc = 0.0

    # Colors - a single Untapped.gg color name, a comma-separated list of
    # symbols, or an already-split sequence
    untapped_names = ('gold', 'white', 'blue', 'black', 'red', 'green', 'colorless')
    colors_raw = row.get('colors')
    if isinstance(colors_raw, str):
        label = colors_raw.strip()
        if label.lower() in untapped_names:
            colors = _parse_untapped_colors(label)
        else:
            # Standard Arena format (W, U, B, etc.)
            colors = [c.strip() for c in label.split(',') if c.strip()]
    elif isinstance(colors_raw, (list, tuple, set)):
        colors = list(colors_raw)
    else:
        # Absent column, blank cell, or an unexpected scalar
        colors = []

    return Card(
        name=name,
        quantity=quantity,
        card_type=text('type') or "Unknown",
        # Must stay a str: None fails Card validation, which made every row
        # without a mana cost (all Untapped.gg rows) unparseable.
        mana_cost=text('mana_cost'),
        cmc=cmc,
        colors=colors,
        rarity=text('rarity') or None,
        set_code=text('set') or None
    )
def parse_arena_csv_chunked(
    filepath: str,
    chunk_size: int = 5000
) -> Generator[Tuple[int, List[Card], List[int]], None, None]:
    """
    Parse large CSV file in chunks using pandas chunked reader.

    This is memory-efficient for large collection CSVs (up to 70K+ rows).
    Both the Arena column layout and the Untapped.gg layout (detected by the
    presence of "Id #" and "Color" columns) are accepted.

    Args:
        filepath: Path to the CSV file
        chunk_size: Number of rows per chunk (default 5000)

    Yields:
        Tuple of (chunk_index, cards_list, failed_row_indices). Failed row
        indices are 0-based positions among the data rows of the whole file,
        not positions within the chunk.

    Example:
        for chunk_idx, cards, failed in parse_arena_csv_chunked("collection.csv"):
            all_cards.extend(cards)
            all_failed.extend(failed)
    """
    # The context manager closes the file even when the consumer stops
    # iterating before the last chunk.
    with pd.read_csv(
        filepath,
        chunksize=chunk_size,
        dtype={
            'Quantity': 'Int64',  # Nullable integer
            'Name': 'string',
            'CMC': 'Float64',     # Nullable float
        },
        on_bad_lines='warn',
        encoding='utf-8'
    ) as chunk_iter:
        for chunk_idx, chunk_df in enumerate(chunk_iter):
            # Normalize column names (lowercase, underscores)
            chunk_df.columns = (
                chunk_df.columns.str.strip()
                .str.lower()
                .str.replace(' ', '_', regex=False)
                .str.replace('-', '_', regex=False)
                .str.replace('#', '', regex=False)  # "Id #" -> "id_"
            )
            # Apply Untapped.gg normalization if needed
            if 'id_' in chunk_df.columns and 'color' in chunk_df.columns:
                chunk_df = _normalize_untapped_columns(chunk_df)

            cards: List[Card] = []
            failed_rows: List[int] = []
            # Number rows by position. The chunk's own index labels already
            # continue from the previous chunk, so adding them to
            # chunk_idx * chunk_size would double-count from chunk 1 onwards.
            first_row = chunk_idx * chunk_size
            for offset, (_, row) in enumerate(chunk_df.iterrows()):
                global_row_idx = first_row + offset
                try:
                    cards.append(_parse_card_row(row))
                except Exception as e:
                    # Per-row boundary: record the failure and keep going
                    failed_rows.append(global_row_idx)
                    logger.warning("Failed to parse row %s: %s", global_row_idx, e)

            yield chunk_idx, cards, failed_rows

            # Explicit memory cleanup after each chunk
            del chunk_df
            gc.collect()
+def _normalize_untapped_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Normalize Untapped.gg column names to standard internal format.
+    Untapped.gg format: Id #, Name, Set, Color, Rarity, Quantity
+    Target format: quantity, name, set, colors, rarity
+    """
+    # Map Untapped columns to internal names
+    column_map = {
+        'id_#': 'collector_number',
+        'name': 'name',
+        'set': 'set',
+        'color': 'colors',
+        'rarity': 'rarity',
+        'quantity': 'quantity'
+    }
+    # Rename columns that exist
+    df = df.rename(columns=lambda x: column_map.get(x, x))
+    return df
+def _parse_untapped_colors(color_str: str) -> List[str]:
+    """Parse Untapped.gg color string (e.g., 'Gold', 'Blue', 'Red')."""
+    if pd.isna(color_str) or not isinstance(color_str, str):
+        return []
+    color_str = color_str.lower().strip()
+    if color_str == 'gold':
+        # Gold implies multicolor, but we don't know exact colors without lookup.
+        # For analysis, we might treat it as a special category or try to infer.
+        # Returning empty list or specific marker might be safer.
+        # For now, let's leave it empty as we can't map to WUBRG accurately without card data.
+        return []
+    # Map standard names to WUBRG
+    color_map = {
+        'white': 'W',
+        'blue': 'U',
+        'black': 'B',
+        'red': 'R',
+        'green': 'G',
+        'colorless': ''
+    }
+    if color_str in color_map:
+        val = color_map[color_str]
+        return [val] if val else []
+    return []