Add Collection Analysis & Untapped.gg Support

#52
by clduab11 - opened
app.py CHANGED
The diff for this file is too large to render. See raw diff
 
src/models/deck.py CHANGED
@@ -1,125 +1,139 @@
1
- """Deck and card data models."""
2
-
3
- from datetime import datetime
4
- from typing import List, Dict, Optional
5
- from dataclasses import dataclass, field
6
- from pydantic import BaseModel, Field
7
-
8
-
9
- class Card(BaseModel):
10
- """Represents a single Magic: The Gathering card."""
11
-
12
- name: str
13
- quantity: int
14
- card_type: str # Creature, Instant, Sorcery, Enchantment, Artifact, Land, Planeswalker
15
- mana_cost: str # e.g., "2UU", "GG", "1"
16
- cmc: float = 0.0 # Converted mana cost
17
- colors: List[str] = Field(default_factory=list) # W, U, B, R, G
18
- rarity: Optional[str] = None # Common, Uncommon, Rare, Mythic
19
- set_code: Optional[str] = None
20
-
21
-
22
- class Deck(BaseModel):
23
- """Represents a complete MTG Arena deck."""
24
-
25
- name: str
26
- format: str = "Standard" # Standard, Historic, Explorer, etc.
27
- mainboard: List[Card]
28
- sideboard: List[Card] = Field(default_factory=list)
29
- commander: Optional[Card] = None
30
-
31
-
32
- class ManaCurve(BaseModel):
33
- """Mana curve analysis for a deck."""
34
-
35
- distribution: Dict[int, int] # CMC -> count
36
- average_cmc: float
37
- median_cmc: float
38
- curve_score: float # 0-100 rating
39
-
40
-
41
- class CardSynergy(BaseModel):
42
- """Represents synergy between two cards."""
43
-
44
- card1: str
45
- card2: str
46
- synergy_type: str # combo, support, anti-synergy
47
- strength: float # 0-100
48
- explanation: str
49
-
50
-
51
- class MetaMatchup(BaseModel):
52
- """Meta matchup analysis."""
53
-
54
- archetype: str
55
- win_rate: float
56
- favorable: bool
57
- key_cards: List[str]
58
- sideboard_suggestions: List[str]
59
-
60
-
61
- class DeckAnalysis(BaseModel):
62
- """Complete deck analysis result."""
63
-
64
- deck_name: str
65
- mana_curve: ManaCurve
66
- color_distribution: Dict[str, int]
67
- card_types: Dict[str, int]
68
- synergies: List[CardSynergy]
69
- meta_matchups: List[MetaMatchup]
70
- strengths: List[str]
71
- weaknesses: List[str]
72
- overall_score: float # 0-100
73
-
74
-
75
- class DeckSuggestion(BaseModel):
76
- """Deck optimization suggestion."""
77
-
78
- type: str # add, remove, replace
79
- card_name: str
80
- quantity: int = 1
81
- reason: str
82
- impact_score: float # 0-100
83
- replacement_for: Optional[str] = None
84
-
85
-
86
- class OptimizedDeck(BaseModel):
87
- """Optimized deck with suggestions."""
88
-
89
- original_deck: Deck
90
- suggestions: List[DeckSuggestion]
91
- predicted_win_rate: float
92
- confidence: float
93
-
94
-
95
- @dataclass
96
- class Collection:
97
- """Represents a card collection (not a deck)."""
98
-
99
- id: Optional[int] = None
100
- name: str = "Imported Collection"
101
- cards: List[Card] = field(default_factory=list)
102
- total_cards: int = 0
103
- unique_cards: int = 0
104
- created_at: Optional[datetime] = None
105
-
106
- def __post_init__(self):
107
- if not self.total_cards:
108
- self.total_cards = sum(card.quantity for card in self.cards)
109
- if not self.unique_cards:
110
- self.unique_cards = len(self.cards)
111
-
112
-
113
- @dataclass
114
- class CollectionProcessingResult:
115
- """Result of processing a collection CSV."""
116
-
117
- collection_id: Optional[int] = None
118
- total_cards: int = 0
119
- unique_cards: int = 0
120
- total_quantity: int = 0
121
- chunks_processed: int = 0
122
- chunks_failed: int = 0
123
- failed_rows: List[int] = field(default_factory=list)
124
- processing_time_seconds: float = 0.0
125
- status: str = "pending" # 'complete', 'partial', 'failed'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deck and card data models."""
2
+
3
+ from datetime import datetime
4
+ from typing import List, Dict, Optional
5
+ from dataclasses import dataclass, field
6
+ from pydantic import BaseModel, Field
7
+
8
+
9
+ class Card(BaseModel):
10
+ """Represents a single Magic: The Gathering card."""
11
+
12
+ name: str
13
+ quantity: int
14
+ card_type: str # Creature, Instant, Sorcery, Enchantment, Artifact, Land, Planeswalker
15
+ mana_cost: str # e.g., "2UU", "GG", "1"
16
+ cmc: float = 0.0 # Converted mana cost
17
+ colors: List[str] = Field(default_factory=list) # W, U, B, R, G
18
+ rarity: Optional[str] = None # Common, Uncommon, Rare, Mythic
19
+ set_code: Optional[str] = None
20
+
21
+
22
+ class Deck(BaseModel):
23
+ """Represents a complete MTG Arena deck."""
24
+
25
+ name: str
26
+ format: str = "Standard" # Standard, Historic, Explorer, etc.
27
+ mainboard: List[Card]
28
+ sideboard: List[Card] = Field(default_factory=list)
29
+ commander: Optional[Card] = None
30
+
31
+
32
+ class ManaCurve(BaseModel):
33
+ """Mana curve analysis for a deck."""
34
+
35
+ distribution: Dict[int, int] # CMC -> count
36
+ average_cmc: float
37
+ median_cmc: float
38
+ curve_score: float # 0-100 rating
39
+
40
+
41
+ class CardSynergy(BaseModel):
42
+ """Represents synergy between two cards."""
43
+
44
+ card1: str
45
+ card2: str
46
+ synergy_type: str # combo, support, anti-synergy
47
+ strength: float # 0-100
48
+ explanation: str
49
+
50
+
51
+ class MetaMatchup(BaseModel):
52
+ """Meta matchup analysis."""
53
+
54
+ archetype: str
55
+ win_rate: float
56
+ favorable: bool
57
+ key_cards: List[str]
58
+ sideboard_suggestions: List[str]
59
+
60
+
61
+ class DeckAnalysis(BaseModel):
62
+ """Complete deck analysis result."""
63
+
64
+ deck_name: str
65
+ mana_curve: ManaCurve
66
+ color_distribution: Dict[str, int]
67
+ card_types: Dict[str, int]
68
+ synergies: List[CardSynergy]
69
+ meta_matchups: List[MetaMatchup]
70
+ strengths: List[str]
71
+ weaknesses: List[str]
72
+ overall_score: float # 0-100
73
+
74
+
75
+ class DeckSuggestion(BaseModel):
76
+ """Deck optimization suggestion."""
77
+
78
+ type: str # add, remove, replace
79
+ card_name: str
80
+ quantity: int = 1
81
+ reason: str
82
+ impact_score: float # 0-100
83
+ replacement_for: Optional[str] = None
84
+
85
+
86
+ class OptimizedDeck(BaseModel):
87
+ """Optimized deck with suggestions."""
88
+
89
+ original_deck: Deck
90
+ suggestions: List[DeckSuggestion]
91
+ predicted_win_rate: float
92
+ confidence: float
93
+
94
+
95
@dataclass
class Collection:
    """A set of owned cards (as opposed to a playable deck).

    ``total_cards`` and ``unique_cards`` may be passed in by the caller; any
    of the two left falsy (e.g. the default 0) is derived from ``cards``
    when the instance is created.
    """

    id: Optional[int] = None
    name: str = "Imported Collection"
    cards: List[Card] = field(default_factory=list)
    total_cards: int = 0   # sum of ``quantity`` over ``cards`` when derived
    unique_cards: int = 0  # number of entries in ``cards`` when derived
    created_at: Optional[datetime] = None

    def __post_init__(self):
        # Keep caller-supplied counts; fill in only the ones that are falsy.
        self.total_cards = self.total_cards or sum(
            entry.quantity for entry in self.cards
        )
        self.unique_cards = self.unique_cards or len(self.cards)
111
+
112
+
113
@dataclass
class CollectionProcessingResult:
    """Outcome record for one collection CSV import run.

    Every field has a default, so an instance can be created up front and
    filled in as processing progresses.
    """

    collection_id: Optional[int] = None
    total_cards: int = 0
    unique_cards: int = 0
    total_quantity: int = 0
    chunks_processed: int = 0
    chunks_failed: int = 0
    failed_rows: List[int] = field(default_factory=list)
    processing_time_seconds: float = 0.0
    status: str = "pending"  # 'complete', 'partial', 'failed'
126
+
127
+
128
@dataclass
class CollectionAnalysis:
    """Summary statistics computed for a card collection.

    Only the two card counts are required; the distributions and the
    top-card list default to empty containers.
    """

    total_cards: int
    unique_cards: int
    total_value: float = 0.0  # Estimated value
    rarity_distribution: Dict[str, int] = field(default_factory=dict)
    color_distribution: Dict[str, int] = field(default_factory=dict)
    set_distribution: Dict[str, int] = field(default_factory=dict)
    top_cards: List[Card] = field(default_factory=list)
    completion_score: float = 0.0  # 0-100 score based on staple coverage
src/services/collection_analyzer.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Service for analyzing card collections."""
2
+
3
+ import logging
4
+ from collections import Counter
5
+ from typing import List, Dict
6
+
7
+ from ..models.deck import Card, Collection, CollectionAnalysis
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class CollectionAnalyzer:
    """Builds summary statistics for a card collection."""

    async def analyze_collection(self, collection: Collection) -> CollectionAnalysis:
        """
        Summarise a collection: sizes, rarity/color/set breakdowns, top cards.

        Args:
            collection: The collection to analyze

        Returns:
            CollectionAnalysis built from ``collection.cards``; an analysis
            with zero counts when the collection is missing or has no cards
        """
        if not (collection and collection.cards):
            return CollectionAnalysis(total_cards=0, unique_cards=0)

        owned = collection.cards

        by_rarity = Counter()
        by_color = Counter()
        by_set = Counter()

        # Candidates for "top cards" (simplified: rarity only).
        # In a real app, we'd check against a "staples" list
        mythic_cards = []
        rare_cards = []

        for entry in owned:
            copies = entry.quantity

            # Rarity: tallied under the raw label, matched case-insensitively
            # when picking top-card candidates.
            if not entry.rarity:
                by_rarity['Unknown'] += copies
            else:
                by_rarity[entry.rarity] += copies
                rarity_key = entry.rarity.lower()
                if rarity_key == 'mythic':
                    mythic_cards.append(entry)
                elif rarity_key == 'rare':
                    rare_cards.append(entry)

            # Colors: a multicolor card counts once per color; cards with no
            # colors are split into lands and everything else.
            if entry.colors:
                for symbol in entry.colors:
                    by_color[symbol] += copies
            elif entry.card_type and 'Land' in entry.card_type:
                by_color['Lands'] += copies
            else:
                by_color['Colorless'] += copies

            # Sets
            if entry.set_code:
                by_set[entry.set_code] += copies

        # Mythics first, then rares; the sort is stable, so ties on quantity
        # keep that order. Only the 20 largest stacks are reported.
        candidates = mythic_cards + rare_cards
        candidates.sort(key=lambda c: c.quantity, reverse=True)
        top_cards = candidates[:20]

        unique_cards = len(owned)

        # Placeholder completion score against an arbitrary 2000-card baseline.
        # Real logic would compare against meta decks
        completion_score = min(100.0, (unique_cards / 2000) * 100)

        return CollectionAnalysis(
            total_cards=sum(c.quantity for c in owned),
            unique_cards=unique_cards,
            total_value=0.0,  # Would need price service integration
            rarity_distribution=dict(by_rarity),
            color_distribution=dict(by_color),
            set_distribution=dict(by_set),
            top_cards=top_cards,
            completion_score=round(completion_score, 1)
        )
src/utils/csv_parser.py CHANGED
@@ -1,274 +1,339 @@
1
- """CSV parser for MTG Arena deck exports."""
2
-
3
- import gc
4
- import re
5
- import logging
6
- from typing import List, Tuple, Generator
7
- from io import StringIO
8
- import pandas as pd
9
-
10
- from ..models.deck import Card, Deck
11
- from .mana_calculator import calculate_cmc, parse_mana_cost, extract_colors
12
-
13
- logger = logging.getLogger(__name__)
14
-
15
-
16
- def parse_deck_string(deck_string: str) -> Deck:
17
- """
18
- Parse MTG Arena deck format string.
19
-
20
- Format examples:
21
- 4 Lightning Bolt (M11) 146
22
- 2 Counterspell (MH2) 267
23
- 20 Island (ZNR) 381
24
- """
25
- lines = deck_string.strip().split('\n')
26
- mainboard = []
27
- sideboard = []
28
- current_section = mainboard
29
-
30
- for line in lines:
31
- line = line.strip()
32
- if not line:
33
- continue
34
-
35
- # Check for sideboard marker
36
- if line.lower() in ['sideboard', 'sideboard:']:
37
- current_section = sideboard
38
- continue
39
-
40
- # Parse card line: "4 Card Name (SET) 123"
41
- # Split by parentheses to avoid ReDoS vulnerability
42
- paren_match = re.search(r'\(([A-Z0-9]+)\)\s+(\d+)$', line)
43
- if paren_match:
44
- # Extract quantity and name from beginning
45
- prefix = line[:paren_match.start()].strip()
46
- # Use simple split to avoid ReDoS
47
- parts = prefix.split(None, 1) # Split on first whitespace
48
- if len(parts) == 2 and parts[0].isdigit():
49
- quantity = int(parts[0])
50
- card_name = parts[1].strip()
51
- set_code = paren_match.group(1)
52
-
53
- # Determine card type and mana cost (simplified - would need card database)
54
- card_type = determine_card_type(card_name)
55
- # WARNING: Text format parsing limitation - mana cost is not available
56
- # This will result in CMC=0 and no colors for all non-land cards,
57
- # which significantly affects deck analysis accuracy.
58
- # Use CSV format for accurate mana curve and color analysis.
59
- mana_cost = "" # Would need card database lookup
60
-
61
- card = Card(
62
- name=card_name,
63
- quantity=quantity,
64
- card_type=card_type,
65
- mana_cost=mana_cost,
66
- cmc=calculate_cmc(mana_cost),
67
- colors=extract_colors(mana_cost),
68
- set_code=set_code
69
- )
70
- current_section.append(card)
71
-
72
- return Deck(
73
- name="Imported Deck",
74
- mainboard=mainboard,
75
- sideboard=sideboard
76
- )
77
-
78
-
79
- def parse_arena_csv(csv_content: str) -> Deck:
80
- """
81
- Parse CSV export from Steam MTG Arena.
82
-
83
- Expected CSV format:
84
- Quantity,Name,Set,Collector Number,Type,Mana Cost,CMC,Colors,Rarity
85
- """
86
- # Read CSV
87
- df = pd.read_csv(StringIO(csv_content))
88
-
89
- # Normalize column names
90
- df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
91
-
92
- mainboard = []
93
- sideboard = []
94
-
95
- for _, row in df.iterrows():
96
- quantity = int(row.get('quantity', 1))
97
- name = str(row.get('name', ''))
98
- set_code = str(row.get('set', '')) if 'set' in row else None
99
- card_type = str(row.get('type', 'Unknown'))
100
- mana_cost = str(row.get('mana_cost', ''))
101
-
102
- # Calculate CMC if not provided
103
- if 'cmc' in row:
104
- cmc = float(row['cmc'])
105
- else:
106
- cmc = calculate_cmc(mana_cost)
107
-
108
- # Extract colors if not provided
109
- if 'colors' in row and pd.notna(row['colors']):
110
- colors = [c.strip() for c in str(row['colors']).split(',')]
111
- else:
112
- colors = extract_colors(mana_cost)
113
-
114
- rarity = str(row.get('rarity', '')) if 'rarity' in row else None
115
- is_sideboard = row.get('sideboard', False) if 'sideboard' in row else False
116
-
117
- card = Card(
118
- name=name,
119
- quantity=quantity,
120
- card_type=card_type,
121
- mana_cost=mana_cost,
122
- cmc=cmc,
123
- colors=colors,
124
- rarity=rarity,
125
- set_code=set_code
126
- )
127
-
128
- if is_sideboard:
129
- sideboard.append(card)
130
- else:
131
- mainboard.append(card)
132
-
133
- return Deck(
134
- name="CSV Import",
135
- mainboard=mainboard,
136
- sideboard=sideboard
137
- )
138
-
139
-
140
- def determine_card_type(card_name: str) -> str:
141
- """Determine card type based on name (simplified heuristic)."""
142
- # This is a simplified version - in production would use card database
143
- name_lower = card_name.lower()
144
-
145
- if any(land in name_lower for land in ['island', 'mountain', 'forest', 'plains', 'swamp']):
146
- return "Land"
147
-
148
- # Default to Unknown - should be looked up from card database
149
- return "Unknown"
150
-
151
-
152
- def count_csv_rows(filepath: str) -> int:
153
- """
154
- Efficiently count rows in CSV without loading into memory.
155
-
156
- Args:
157
- filepath: Path to the CSV file
158
-
159
- Returns:
160
- Number of data rows (excluding header)
161
- """
162
- with open(filepath, 'r', encoding='utf-8') as f:
163
- # Count lines, subtract 1 for header
164
- return sum(1 for _ in f) - 1
165
-
166
-
167
- def _parse_card_row(row: pd.Series) -> Card:
168
- """
169
- Parse a single CSV row into a Card object.
170
-
171
- Args:
172
- row: pandas Series with normalized column names
173
-
174
- Returns:
175
- Card object
176
-
177
- Raises:
178
- ValueError: If required fields are missing or invalid
179
- """
180
- # Handle quantity - default to 1 if missing
181
- quantity = int(row.get('quantity', 1) or 1)
182
-
183
- # Get name - required field
184
- name = str(row.get('name', '')).strip()
185
- if not name:
186
- raise ValueError("Card name is required")
187
-
188
- # Parse CMC - handle various formats
189
- cmc_val = row.get('cmc', 0)
190
- try:
191
- cmc = int(float(cmc_val)) if pd.notna(cmc_val) else 0
192
- except (ValueError, TypeError):
193
- cmc = 0
194
-
195
- # Parse colors - handle string or list
196
- colors_raw = row.get('colors', '')
197
- if pd.isna(colors_raw):
198
- colors = []
199
- elif isinstance(colors_raw, str):
200
- colors = [c.strip() for c in colors_raw.split(',') if c.strip()]
201
- else:
202
- colors = list(colors_raw) if colors_raw else []
203
-
204
- return Card(
205
- name=name,
206
- quantity=quantity,
207
- card_type=str(row.get('type', '')).strip() or None,
208
- mana_cost=str(row.get('mana_cost', '')).strip() or None,
209
- cmc=cmc,
210
- colors=colors,
211
- rarity=str(row.get('rarity', '')).strip() or None,
212
- set_code=str(row.get('set', '')).strip() or None
213
- )
214
-
215
-
216
- def parse_arena_csv_chunked(
217
- filepath: str,
218
- chunk_size: int = 5000
219
- ) -> Generator[Tuple[int, List[Card], List[int]], None, None]:
220
- """
221
- Parse large CSV file in chunks using pandas chunked reader.
222
-
223
- This is memory-efficient for large collection CSVs (up to 70K+ rows).
224
-
225
- Args:
226
- filepath: Path to the CSV file
227
- chunk_size: Number of rows per chunk (default 5000)
228
-
229
- Yields:
230
- Tuple of (chunk_index, cards_list, failed_row_indices)
231
-
232
- Example:
233
- for chunk_idx, cards, failed in parse_arena_csv_chunked("collection.csv"):
234
- all_cards.extend(cards)
235
- all_failed.extend(failed)
236
- """
237
- chunk_iter = pd.read_csv(
238
- filepath,
239
- chunksize=chunk_size,
240
- dtype={
241
- 'Quantity': 'Int64', # Nullable integer
242
- 'Name': 'string',
243
- 'CMC': 'Float64', # Nullable float
244
- },
245
- on_bad_lines='warn',
246
- encoding='utf-8'
247
- )
248
-
249
- for chunk_idx, chunk_df in enumerate(chunk_iter):
250
- # Normalize column names (lowercase, underscores)
251
- chunk_df.columns = (
252
- chunk_df.columns.str.strip()
253
- .str.lower()
254
- .str.replace(' ', '_')
255
- .str.replace('-', '_')
256
- )
257
-
258
- cards: List[Card] = []
259
- failed_rows: List[int] = []
260
-
261
- for row_idx, row in chunk_df.iterrows():
262
- global_row_idx = chunk_idx * chunk_size + row_idx
263
- try:
264
- card = _parse_card_row(row)
265
- cards.append(card)
266
- except Exception as e:
267
- failed_rows.append(global_row_idx)
268
- logger.warning(f"Failed to parse row {global_row_idx}: {e}")
269
-
270
- yield chunk_idx, cards, failed_rows
271
-
272
- # Explicit memory cleanup after each chunk
273
- del chunk_df
274
- gc.collect()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CSV parser for MTG Arena deck exports."""
2
+
3
+ import gc
4
+ import re
5
+ import logging
6
+ from typing import List, Tuple, Generator
7
+ from io import StringIO
8
+ import pandas as pd
9
+
10
+ from ..models.deck import Card, Deck
11
+ from .mana_calculator import calculate_cmc, parse_mana_cost, extract_colors
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def parse_deck_string(deck_string: str) -> Deck:
    """
    Build a Deck from MTG Arena's plain-text deck export.

    Format examples:
        4 Lightning Bolt (M11) 146
        2 Counterspell (MH2) 267
        20 Island (ZNR) 381

    A line reading "Sideboard" or "Sideboard:" (any case) switches all
    following cards to the sideboard. Lines that do not end in
    "(SET) <number>" or do not start with a numeric quantity are ignored.
    """
    main_cards = []
    side_cards = []
    target = main_cards

    for raw_line in deck_string.strip().split('\n'):
        entry = raw_line.strip()
        if not entry:
            continue

        # Section marker: everything after it belongs to the sideboard
        if entry.lower() in ['sideboard', 'sideboard:']:
            target = side_cards
            continue

        # Card line: "4 Card Name (SET) 123"
        # Anchor on the trailing "(SET) 123" only, to avoid ReDoS vulnerability
        tail = re.search(r'\(([A-Z0-9]+)\)\s+(\d+)$', entry)
        if not tail:
            continue

        # What precedes the set marker is "<quantity> <card name>";
        # a plain split on the first whitespace run avoids ReDoS
        head = entry[:tail.start()].strip().split(None, 1)
        if len(head) != 2 or not head[0].isdigit():
            continue

        count_text, title = head
        card_name = title.strip()

        # WARNING: Text format parsing limitation - mana cost is not available
        # This will result in CMC=0 and no colors for all non-land cards,
        # which significantly affects deck analysis accuracy.
        # Use CSV format for accurate mana curve and color analysis.
        mana_cost = ""  # Would need card database lookup

        target.append(
            Card(
                name=card_name,
                quantity=int(count_text),
                # Simplified guess - would need card database
                card_type=determine_card_type(card_name),
                mana_cost=mana_cost,
                cmc=calculate_cmc(mana_cost),
                colors=extract_colors(mana_cost),
                set_code=tail.group(1)
            )
        )

    return Deck(
        name="Imported Deck",
        mainboard=main_cards,
        sideboard=side_cards
    )
77
+
78
+
79
def parse_arena_csv(csv_content: str) -> Deck:
    """
    Parse CSV export from Steam MTG Arena.

    Expected CSV format:
        Quantity,Name,Set,Collector Number,Type,Mana Cost,CMC,Colors,Rarity

    An optional "Sideboard" column sends a card to the sideboard when it holds
    a true value (True, a non-zero number, or "true"/"1"/"yes"/"y").

    Empty cells are read by pandas as NaN. They fall back to the same defaults
    as a missing column instead of becoming the text "nan", a NaN CMC, a
    crash on the quantity, or an accidental sideboard assignment.

    Args:
        csv_content: Full CSV text, including the header row

    Returns:
        Deck named "CSV Import" with mainboard and sideboard filled in

    Raises:
        ValueError: If a non-empty quantity or CMC cell is not numeric
    """
    def is_missing(value) -> bool:
        # Column absent (None from row.get) or empty cell (NaN / pd.NA)
        return value is None or pd.isna(value)

    def text_or(value, default):
        return default if is_missing(value) else str(value)

    def as_flag(value) -> bool:
        if is_missing(value):
            return False
        if isinstance(value, str):
            # bool("false") is True, so text flags must be compared explicitly
            return value.strip().lower() in ('true', '1', 'yes', 'y')
        return bool(value)

    # Read CSV
    df = pd.read_csv(StringIO(csv_content))

    # Normalize column names
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

    mainboard = []
    sideboard = []

    for _, row in df.iterrows():
        quantity_raw = row.get('quantity', 1)
        quantity = 1 if is_missing(quantity_raw) else int(quantity_raw)
        name = text_or(row.get('name'), '')
        set_code = text_or(row.get('set'), None)
        card_type = text_or(row.get('type'), 'Unknown')
        mana_cost = text_or(row.get('mana_cost'), '')

        # Calculate CMC if not provided
        cmc_raw = row.get('cmc')
        cmc = calculate_cmc(mana_cost) if is_missing(cmc_raw) else float(cmc_raw)

        # Extract colors if not provided
        colors_raw = row.get('colors')
        if is_missing(colors_raw):
            colors = extract_colors(mana_cost)
        else:
            colors = [c.strip() for c in str(colors_raw).split(',') if c.strip()]

        rarity = text_or(row.get('rarity'), None)

        card = Card(
            name=name,
            quantity=quantity,
            card_type=card_type,
            mana_cost=mana_cost,
            cmc=cmc,
            colors=colors,
            rarity=rarity,
            set_code=set_code
        )

        if as_flag(row.get('sideboard', False)):
            sideboard.append(card)
        else:
            mainboard.append(card)

    return Deck(
        name="CSV Import",
        mainboard=mainboard,
        sideboard=sideboard
    )
138
+
139
+
140
def determine_card_type(card_name: str) -> str:
    """Guess a card's type from its name alone (simplified heuristic).

    Returns "Land" when the lower-cased name contains one of the five basic
    land names, otherwise "Unknown".
    """
    # Placeholder logic - in production this would be a card database lookup
    basic_land_words = ('island', 'mountain', 'forest', 'plains', 'swamp')
    lowered = card_name.lower()

    for word in basic_land_words:
        if word in lowered:
            return "Land"

    # Anything else should be looked up from a card database
    return "Unknown"
150
+
151
+
152
def count_csv_rows(filepath: str) -> int:
    """
    Efficiently count rows in CSV without loading into memory.

    Blank lines are not counted (pandas skips them by default when reading
    the same file), and the result never drops below zero, so an empty file
    yields 0 rather than -1.

    Note: this counts physical lines; a quoted field containing a newline
    would be counted as more than one row.

    Args:
        filepath: Path to the CSV file

    Returns:
        Number of data rows (excluding header)
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        # Stream the file line by line; only a running count is kept
        non_blank_lines = sum(1 for line in f if line.strip())

    # Subtract 1 for the header, clamping for empty / header-less files
    return max(0, non_blank_lines - 1)
165
+
166
+
167
# Single-word color names used by Untapped.gg exports (compared lower-cased)
_UNTAPPED_COLOR_NAMES = frozenset(
    ['gold', 'white', 'blue', 'black', 'red', 'green', 'colorless']
)


def _cell_text(value) -> str:
    """Return a CSV cell as stripped text; None, NaN and pd.NA become ''."""
    if value is None:
        return ''
    if not isinstance(value, str) and pd.isna(value):
        return ''
    return str(value).strip()


def _parse_card_row(row: pd.Series) -> Card:
    """
    Parse a single CSV row into a Card object.

    Missing cells (absent column, NaN or pd.NA) are treated as empty rather
    than being turned into the text "nan" / "<NA>":
      - quantity defaults to 1
      - type defaults to "Unknown"
      - mana cost defaults to "" (Card.mana_cost is a required string, so
        None would be rejected and the whole row lost)
      - rarity and set default to None
      - cmc defaults to 0

    Args:
        row: pandas Series with normalized column names

    Returns:
        Card object

    Raises:
        ValueError: If required fields are missing or invalid
    """
    # Handle quantity - default to 1 if missing.
    # The missing check must come first: `pd.NA or 1` raises and NaN is truthy.
    quantity_raw = row.get('quantity', 1)
    if quantity_raw is None or pd.isna(quantity_raw):
        quantity = 1
    else:
        quantity = int(quantity_raw or 1)

    # Get name - required field
    name = _cell_text(row.get('name', ''))
    if not name:
        raise ValueError("Card name is required")

    # Parse CMC - handle various formats
    cmc_val = row.get('cmc', 0)
    try:
        cmc = int(float(cmc_val)) if pd.notna(cmc_val) else 0
    except (ValueError, TypeError):
        cmc = 0

    # Parse colors - handle string or list
    colors_raw = row.get('colors', '')
    if isinstance(colors_raw, str):
        if colors_raw.strip().lower() in _UNTAPPED_COLOR_NAMES:
            # Untapped.gg color name ("Blue", "Gold", ...)
            colors = _parse_untapped_colors(colors_raw)
        else:
            # Standard Arena format (W, U, B, etc.)
            colors = [c.strip() for c in colors_raw.split(',') if c.strip()]
    elif isinstance(colors_raw, (list, tuple, set, frozenset)):
        # Checked before pd.isna, which is element-wise on sequences
        colors = list(colors_raw)
    elif colors_raw is None or pd.isna(colors_raw):
        colors = []
    else:
        colors = list(colors_raw) if colors_raw else []

    return Card(
        name=name,
        quantity=quantity,
        card_type=_cell_text(row.get('type', '')) or "Unknown",
        mana_cost=_cell_text(row.get('mana_cost', '')),
        cmc=cmc,
        colors=colors,
        rarity=_cell_text(row.get('rarity', '')) or None,
        set_code=_cell_text(row.get('set', '')) or None
    )
219
+
220
+
221
def parse_arena_csv_chunked(
    filepath: str,
    chunk_size: int = 5000
) -> Generator[Tuple[int, List[Card], List[int]], None, None]:
    """
    Parse large CSV file in chunks using pandas chunked reader.

    This is memory-efficient for large collection CSVs (up to 70K+ rows).

    Column names are normalized per chunk (stripped, lower-cased, spaces and
    hyphens to underscores, '#' removed). Chunks that carry both an "id_" and
    a "color" column are treated as Untapped.gg exports and renamed to the
    internal column names before rows are parsed.

    Args:
        filepath: Path to the CSV file
        chunk_size: Number of rows per chunk (default 5000)

    Yields:
        Tuple of (chunk_index, cards_list, failed_row_indices), where each
        failed index is the 0-based position of the row among all data rows
        read so far (not relative to the chunk)

    Example:
        for chunk_idx, cards, failed in parse_arena_csv_chunked("collection.csv"):
            all_cards.extend(cards)
            all_failed.extend(failed)
    """
    chunk_iter = pd.read_csv(
        filepath,
        chunksize=chunk_size,
        dtype={
            'Quantity': 'Int64',  # Nullable integer
            'Name': 'string',
            'CMC': 'Float64',  # Nullable float
        },
        on_bad_lines='warn',
        encoding='utf-8'
    )

    # Data rows consumed by earlier chunks. The position is tracked here
    # rather than derived from the DataFrame index: that index already
    # continues across chunks, so adding chunk_idx * chunk_size to it
    # counted the offset twice from the second chunk on.
    rows_seen = 0

    for chunk_idx, chunk_df in enumerate(chunk_iter):
        # Normalize column names (lowercase, underscores)
        chunk_df.columns = (
            chunk_df.columns.str.strip()
            .str.lower()
            .str.replace(' ', '_')
            .str.replace('-', '_')
            .str.replace('#', '')  # Handle "Id #" -> "id_"
        )

        # Apply Untapped.gg normalization if needed
        if 'id_' in chunk_df.columns and 'color' in chunk_df.columns:
            chunk_df = _normalize_untapped_columns(chunk_df)

        cards: List[Card] = []
        failed_rows: List[int] = []

        for offset, (_, row) in enumerate(chunk_df.iterrows()):
            global_row_idx = rows_seen + offset
            try:
                cards.append(_parse_card_row(row))
            except Exception as e:
                # Best effort: record the bad row and keep going
                failed_rows.append(global_row_idx)
                logger.warning("Failed to parse row %s: %s", global_row_idx, e)

        rows_seen += len(chunk_df)

        yield chunk_idx, cards, failed_rows

        # Explicit memory cleanup after each chunk
        del chunk_df
        gc.collect()
285
+
286
+
287
def _normalize_untapped_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize Untapped.gg column names to standard internal format.

    Untapped.gg format: Id #, Name, Set, Color, Rarity, Quantity
    Target format: collector_number, name, set, colors, rarity, quantity

    Expects column names that are already lower-cased with spaces replaced
    by underscores. The "Id #" header is accepted both with the '#' stripped
    ("id_", which is what the chunked parser passes in) and with it kept
    ("id_#").

    Args:
        df: DataFrame with normalized Untapped.gg column names

    Returns:
        A renamed copy of ``df``; columns not listed in the map are kept as-is
    """
    # Only the columns whose names differ need an entry; name, set, rarity
    # and quantity already match the internal names.
    column_map = {
        'id_': 'collector_number',
        'id_#': 'collector_number',
        'color': 'colors',
    }

    # rename() ignores map keys that are not present in the frame
    return df.rename(columns=column_map)
308
+
309
+
310
def _parse_untapped_colors(color_str: str) -> List[str]:
    """Translate an Untapped.gg color name (e.g. 'Gold', 'Blue', 'Red') to WUBRG.

    Returns a one-element list with the matching color letter, or an empty
    list for missing values, non-strings, 'colorless', 'gold' and any name
    that is not recognised.
    """
    if pd.isna(color_str) or not isinstance(color_str, str):
        return []

    # Standard single-color names -> WUBRG letter.
    # 'gold' is deliberately absent: it implies multicolor, but the exact
    # colors can't be mapped to WUBRG accurately without card data, so it
    # yields no colors - the same as 'colorless' and unknown names.
    wubrg_by_name = {
        'white': 'W',
        'blue': 'U',
        'black': 'B',
        'red': 'R',
        'green': 'G',
    }

    symbol = wubrg_by_name.get(color_str.lower().strip())
    return [symbol] if symbol else []
339
+