""" HuggingFace Dataset Loader Downloads and converts HuggingFace datasets into financial advisor training format. Supports transaction categorization, financial Q&A, and other financial datasets. """ from datasets import load_dataset from typing import List, Dict, Any, Optional import random import os class HuggingFaceDatasetLoader: """Load and convert HuggingFace datasets for financial advisor training""" def __init__(self): # Popular financial datasets on HuggingFace (publicly accessible) self.known_datasets = { # Core Financial Q&A Datasets "financial-alpaca": { "path": "gbharti/finance-alpaca", "type": "qa", "description": "Financial Q&A dataset based on Alpaca format (52K examples)", "category": "General Finance" }, "fingpt-finred": { "path": "FinGPT/fingpt-finred", "type": "qa", "description": "Financial relation extraction dataset", "category": "Financial Analysis" }, "finance-qa-10k": { "path": "virattt/financial-qa-10K", "type": "qa", "description": "Financial Q&A from 10-K filings", "category": "SEC Filings" }, # Financial News & Sentiment "financial-phrasebank": { "path": "financial_phrasebank", "type": "qa", "description": "Financial news sentiment analysis (4.8K sentences)", "category": "Sentiment Analysis" }, "fin-sentiment": { "path": "zeroshot/twitter-financial-news-sentiment", "type": "qa", "description": "Financial news sentiment from Twitter (11K examples)", "category": "Sentiment Analysis" }, # Investment & Trading "stock-market-qa": { "path": "virattt/financial-qa-10K", "type": "qa", "description": "Stock market Q&A from 10-K filings", "category": "Investments" }, "sec-edgar-filings": { "path": "JanosAudron/financial-reports-sec", "type": "qa", "description": "SEC EDGAR financial reports", "category": "SEC Filings" }, # Banking & Risk "credit-card-fraud": { "path": "nelsoncode/credit-card-fraud", "type": "transaction", "description": "Credit card fraud detection dataset", "category": "Fraud Detection" }, # Economics & Policy "econ-qa": { "path": "ChanceFocus/econ-qa", "type": "qa", "description": "Economics Q&A dataset", "category": "Economics" }, # Instruction Following "finance-instructions": { "path": "rombodawg/MegaCodeTraining", "type": "qa", "description": "Financial instruction following dataset", "category": "Instruction Following" }, # Multi-Domain Financial "fin-llama": { "path": "bavest/fin-llama-dataset", "type": "qa", "description": "Multi-domain financial dataset for LLaMA", "category": "General Finance" }, "finance-chat": { "path": "sujet-ai/Sujet-Finance-Instruct-177k", "type": "qa", "description": "Finance chat instructions (177K examples)", "category": "General Finance" }, # Specialized Financial Topics "accounting-qa": { "path": "0-hero/OIG-small-chip2", "type": "qa", "description": "Accounting and bookkeeping Q&A", "category": "Accounting" }, "tax-qa": { "path": "Locutusque/Tax-assistant", "type": "qa", "description": "Tax-related questions and answers", "category": "Tax & Legal" }, # Financial Education "fin-education": { "path": "FinGPT/fingpt-fineval", "type": "qa", "description": "Financial education and evaluation dataset", "category": "Education" }, # Real Estate & Mortgages "real-estate-qa": { "path": "0-hero/OIG-small-chip2", "type": "qa", "description": "Real estate and mortgage Q&A", "category": "Real Estate" }, # Insurance "insurance-qa": { "path": "0-hero/OIG-small-chip2", "type": "qa", "description": "Insurance-related questions and answers", "category": "Insurance" }, # Cryptocurrency & DeFi "crypto-qa": { "path": "Locutusque/hercules-v5.0", "type": "qa", "description": "Cryptocurrency and DeFi Q&A", "category": "Cryptocurrency" } } def get_preset_datasets(self) -> Dict[str, Dict[str, str]]: """ Get dictionary of preset datasets Returns the known_datasets dictionary """ return self.known_datasets def load_dataset_by_name(self, dataset_name: str, split: str = "train", max_examples: Optional[int] = None): """ Load a known dataset by name Args: dataset_name: Short name from known_datasets split: Dataset split (train/test/validation) max_examples: Maximum number of examples to load Returns: List of examples in Q&A format """ if dataset_name not in self.known_datasets: raise ValueError(f"Unknown dataset: {dataset_name}. Choose from: {list(self.known_datasets.keys())}") dataset_info = self.known_datasets[dataset_name] return self.load_dataset_by_path( dataset_info["path"], dataset_type=dataset_info["type"], split=split, max_examples=max_examples ) def load_dataset_by_path(self, dataset_path: str, dataset_type: str = "auto", split: str = "train", max_examples: Optional[int] = None): """ Load a dataset from HuggingFace by path Args: dataset_path: Full path like "gbharti/finance-alpaca" dataset_type: Type of dataset (transaction/qa/auto) split: Dataset split max_examples: Maximum examples to load Returns: List of examples in Q&A format """ print(f"Loading dataset: {dataset_path} (split: {split})...") try: # Get HuggingFace token from environment if available hf_token = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN") # Load from HuggingFace with authentication try: if hf_token: dataset = load_dataset(dataset_path, split=split, token=hf_token) else: dataset = load_dataset(dataset_path, split=split) except Exception as auth_error: error_msg = str(auth_error) if "gated" in error_msg.lower() or "authenticated" in error_msg.lower(): raise Exception( f"Dataset '{dataset_path}' requires authentication.\n\n" f"This is a GATED dataset that requires special access.\n\n" f"To use this dataset:\n" f"1. Go to https://huggingface.co/datasets/{dataset_path}\n" f"2. Click 'Access repository' and accept terms\n" f"3. Make sure your HuggingFace token is set in Settings tab\n\n" f"Or try one of the publicly accessible datasets instead:\n" f"- gbharti/finance-alpaca (52K financial Q&A)\n" f"- FinGPT/fingpt-finred (Financial relations)\n" f"- virattt/financial-qa-10K (10-K filings Q&A)" ) else: raise # Limit examples if requested if max_examples and len(dataset) > max_examples: # Sample randomly for diversity indices = random.sample(range(len(dataset)), max_examples) dataset = dataset.select(indices) print(f"Loaded {len(dataset)} examples") # Auto-detect type if needed if dataset_type == "auto": dataset_type = self._detect_dataset_type(dataset[0]) print(f"Auto-detected type: {dataset_type}") # Convert to Q&A format if dataset_type == "transaction": converted = self._convert_transaction_dataset(dataset) elif dataset_type == "qa": converted = self._convert_qa_dataset(dataset) else: raise ValueError(f"Unsupported dataset type: {dataset_type}") print(f"Converted {len(converted)} examples to Q&A format") return converted except Exception as e: raise Exception(f"{str(e)}") def _detect_dataset_type(self, example: Dict[str, Any]) -> str: """Auto-detect dataset type from first example""" keys = set(example.keys()) # Check for transaction data if "transaction" in keys or "category" in keys or "amount" in keys: return "transaction" # Check for Q&A data if ("question" in keys and "answer" in keys) or \ ("instruction" in keys and "output" in keys) or \ ("input" in keys and "output" in keys): return "qa" return "unknown" def _convert_transaction_dataset(self, dataset) -> List[Dict[str, str]]: """ Convert transaction categorization dataset to Q&A format Creates questions like: Q: "Categorize this transaction: $50.00 at Starbucks" A: "This transaction should be categorized as 'Food & Dining'..." """ converted = [] for item in dataset: # Extract fields (adapt to actual dataset structure) if "transaction" in item and "category" in item: transaction_text = item["transaction"] category = item["category"] # Get amount if available amount = item.get("amount", "") merchant = item.get("merchant", "") # Create Q&A pair qa_pair = self._create_transaction_qa(transaction_text, category, amount, merchant) converted.append(qa_pair) # Handle alternate structures elif "text" in item and "label" in item: text = item["text"] label = item["label"] qa_pair = self._create_transaction_qa(text, label, "", "") converted.append(qa_pair) return converted def _create_transaction_qa(self, transaction: str, category: str, amount: str, merchant: str) -> Dict[str, str]: """Create a Q&A pair from transaction data""" # Build transaction description transaction_desc = transaction if amount and merchant: transaction_desc = f"{amount} at {merchant}" elif amount: transaction_desc = f"{amount} - {transaction}" elif merchant: transaction_desc = f"{merchant} - {transaction}" # Create question (vary the format) question_templates = [ f"What category should this transaction be in: {transaction_desc}?", f"How would you categorize this transaction: {transaction_desc}?", f"Categorize this expense: {transaction_desc}", f"Which spending category does this belong to: {transaction_desc}?", f"Help me categorize: {transaction_desc}" ] question = random.choice(question_templates) # Create detailed answer answer = self._generate_transaction_answer(transaction_desc, category) return { "instruction": question, "input": "", "output": answer } def _generate_transaction_answer(self, transaction: str, category: str) -> str: """Generate a detailed answer for transaction categorization""" # Common category explanations category_explanations = { "Food & Dining": "restaurants, groceries, coffee shops, and food delivery services", "Shopping": "retail purchases, online shopping, clothing, and general merchandise", "Transportation": "gas, public transit, ride-sharing services, parking, and vehicle maintenance", "Bills & Utilities": "electricity, water, internet, phone bills, and subscriptions", "Entertainment": "movies, concerts, streaming services, hobbies, and recreational activities", "Health & Fitness": "gym memberships, medical expenses, pharmacy purchases, and wellness services", "Travel": "flights, hotels, vacation expenses, and travel-related costs", "Personal Care": "haircuts, spa services, cosmetics, and personal grooming", "Education": "tuition, books, courses, and educational materials", "Gifts & Donations": "charitable contributions, gifts, and donations", "Home": "rent, mortgage, furniture, home improvement, and household supplies", "Insurance": "health insurance, car insurance, life insurance, and other policies", "Fees & Charges": "bank fees, ATM fees, service charges, and late fees", "Income": "salary, wages, refunds, and other income sources", "Investments": "stock purchases, retirement contributions, and investment transactions" } # Get explanation or use generic explanation = category_explanations.get( category, f"expenses related to {category.lower()}" ) # Generate answer answer = f"This transaction should be categorized as '{category}'. " answer += f"This category typically includes {explanation}. " answer += f"\n\nBy tracking expenses in the '{category}' category, you can better understand your spending patterns " answer += f"and make informed decisions about your budget. " # Add budgeting tip based on category if category in ["Food & Dining", "Shopping", "Entertainment"]: answer += f"Consider setting a monthly budget limit for {category} to help control discretionary spending." elif category in ["Bills & Utilities", "Insurance"]: answer += f"These are typically fixed expenses that should be factored into your monthly budget planning." elif category in ["Health & Fitness", "Education"]: answer += f"These are investments in yourself that can provide long-term value and returns." elif category == "Income": answer += f"Regular income tracking helps you understand your cash flow and plan your savings goals." return answer def _convert_qa_dataset(self, dataset) -> List[Dict[str, str]]: """ Convert Q&A dataset to standard format Handles various Q&A formats from HuggingFace """ converted = [] for item in dataset: qa_pair = {} # Try different field name combinations if "instruction" in item and "output" in item: qa_pair = { "instruction": item["instruction"], "input": item.get("input", ""), "output": item["output"] } elif "question" in item and "answer" in item: qa_pair = { "instruction": item["question"], "input": item.get("context", ""), "output": item["answer"] } elif "prompt" in item and "response" in item: qa_pair = { "instruction": item["prompt"], "input": "", "output": item["response"] } elif "text" in item: # Try to parse conversational format text = item["text"] if "Human:" in text and "Assistant:" in text: parts = text.split("Assistant:") if len(parts) >= 2: question = parts[0].replace("Human:", "").strip() answer = parts[1].strip() qa_pair = { "instruction": question, "input": "", "output": answer } if qa_pair: converted.append(qa_pair) return converted def list_available_datasets(self) -> List[Dict[str, str]]: """List all known financial datasets""" datasets = [] for name, info in self.known_datasets.items(): datasets.append({ "name": name, "path": info["path"], "type": info["type"], "description": info["description"] }) return datasets def preview_dataset(self, dataset_path: str, num_examples: int = 3) -> str: """ Preview a dataset before loading Args: dataset_path: HuggingFace dataset path num_examples: Number of examples to show Returns: Formatted preview string """ try: # Get HuggingFace token from environment if available hf_token = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN") # Load small sample with authentication try: if hf_token: dataset = load_dataset(dataset_path, split="train", streaming=False, token=hf_token) else: dataset = load_dataset(dataset_path, split="train", streaming=False) except Exception as auth_error: error_msg = str(auth_error) if "gated" in error_msg.lower() or "authenticated" in error_msg.lower(): return ( f"⚠️ Dataset '{dataset_path}' requires authentication.\n\n" f"This is a GATED dataset. To preview:\n" f"1. Visit: https://huggingface.co/datasets/{dataset_path}\n" f"2. Click 'Access repository' and accept terms\n" f"3. Set your HuggingFace token in Settings tab\n\n" f"Try these publicly accessible datasets instead:\n" f"- gbharti/finance-alpaca\n" f"- FinGPT/fingpt-finred\n" f"- virattt/financial-qa-10K" ) else: return f"Error: {auth_error}" # Get first N examples sample_size = min(num_examples, len(dataset)) samples = dataset.select(range(sample_size)) preview = f"Dataset: {dataset_path}\n" preview += f"Total examples: {len(dataset)}\n" preview += f"Fields: {list(samples[0].keys())}\n\n" preview += "Sample examples:\n" preview += "=" * 60 + "\n\n" for i, example in enumerate(samples, 1): preview += f"Example {i}:\n" for key, value in example.items(): value_str = str(value)[:100] preview += f" {key}: {value_str}\n" preview += "\n" return preview except Exception as e: return f"Error previewing dataset: {e}" def get_dataset_info(self, dataset_path: str) -> Dict[str, Any]: """Get metadata about a dataset""" try: from datasets import get_dataset_config_names, get_dataset_split_names configs = get_dataset_config_names(dataset_path) splits = get_dataset_split_names(dataset_path) return { "path": dataset_path, "configs": configs, "splits": splits, "status": "available" } except Exception as e: return { "path": dataset_path, "error": str(e), "status": "error" }