Spaces:

Bmccloud22
/

LaunchLLM

Runtime error

File size: 21,064 Bytes

ec8f374

"""
HuggingFace Dataset Loader

Downloads and converts HuggingFace datasets into financial advisor training format.
Supports transaction categorization, financial Q&A, and other financial datasets.
"""

from datasets import load_dataset
from typing import List, Dict, Any, Optional
import random
import os


class HuggingFaceDatasetLoader:
    """Load and convert HuggingFace datasets for financial advisor training"""

    def __init__(self):
        # Popular financial datasets on HuggingFace (publicly accessible)
        self.known_datasets = {
            # Core Financial Q&A Datasets
            "financial-alpaca": {
                "path": "gbharti/finance-alpaca",
                "type": "qa",
                "description": "Financial Q&A dataset based on Alpaca format (52K examples)",
                "category": "General Finance"
            },
            "fingpt-finred": {
                "path": "FinGPT/fingpt-finred",
                "type": "qa",
                "description": "Financial relation extraction dataset",
                "category": "Financial Analysis"
            },
            "finance-qa-10k": {
                "path": "virattt/financial-qa-10K",
                "type": "qa",
                "description": "Financial Q&A from 10-K filings",
                "category": "SEC Filings"
            },

            # Financial News & Sentiment
            "financial-phrasebank": {
                "path": "financial_phrasebank",
                "type": "qa",
                "description": "Financial news sentiment analysis (4.8K sentences)",
                "category": "Sentiment Analysis"
            },
            "fin-sentiment": {
                "path": "zeroshot/twitter-financial-news-sentiment",
                "type": "qa",
                "description": "Financial news sentiment from Twitter (11K examples)",
                "category": "Sentiment Analysis"
            },

            # Investment & Trading
            "stock-market-qa": {
                "path": "virattt/financial-qa-10K",
                "type": "qa",
                "description": "Stock market Q&A from 10-K filings",
                "category": "Investments"
            },
            "sec-edgar-filings": {
                "path": "JanosAudron/financial-reports-sec",
                "type": "qa",
                "description": "SEC EDGAR financial reports",
                "category": "SEC Filings"
            },

            # Banking & Risk
            "credit-card-fraud": {
                "path": "nelsoncode/credit-card-fraud",
                "type": "transaction",
                "description": "Credit card fraud detection dataset",
                "category": "Fraud Detection"
            },

            # Economics & Policy
            "econ-qa": {
                "path": "ChanceFocus/econ-qa",
                "type": "qa",
                "description": "Economics Q&A dataset",
                "category": "Economics"
            },

            # Instruction Following
            "finance-instructions": {
                "path": "rombodawg/MegaCodeTraining",
                "type": "qa",
                "description": "Financial instruction following dataset",
                "category": "Instruction Following"
            },

            # Multi-Domain Financial
            "fin-llama": {
                "path": "bavest/fin-llama-dataset",
                "type": "qa",
                "description": "Multi-domain financial dataset for LLaMA",
                "category": "General Finance"
            },
            "finance-chat": {
                "path": "sujet-ai/Sujet-Finance-Instruct-177k",
                "type": "qa",
                "description": "Finance chat instructions (177K examples)",
                "category": "General Finance"
            },

            # Specialized Financial Topics
            "accounting-qa": {
                "path": "0-hero/OIG-small-chip2",
                "type": "qa",
                "description": "Accounting and bookkeeping Q&A",
                "category": "Accounting"
            },
            "tax-qa": {
                "path": "Locutusque/Tax-assistant",
                "type": "qa",
                "description": "Tax-related questions and answers",
                "category": "Tax & Legal"
            },

            # Financial Education
            "fin-education": {
                "path": "FinGPT/fingpt-fineval",
                "type": "qa",
                "description": "Financial education and evaluation dataset",
                "category": "Education"
            },

            # Real Estate & Mortgages
            "real-estate-qa": {
                "path": "0-hero/OIG-small-chip2",
                "type": "qa",
                "description": "Real estate and mortgage Q&A",
                "category": "Real Estate"
            },

            # Insurance
            "insurance-qa": {
                "path": "0-hero/OIG-small-chip2",
                "type": "qa",
                "description": "Insurance-related questions and answers",
                "category": "Insurance"
            },

            # Cryptocurrency & DeFi
            "crypto-qa": {
                "path": "Locutusque/hercules-v5.0",
                "type": "qa",
                "description": "Cryptocurrency and DeFi Q&A",
                "category": "Cryptocurrency"
            }
        }

    def get_preset_datasets(self) -> Dict[str, Dict[str, str]]:
        """
        Get dictionary of preset datasets
        Returns the known_datasets dictionary
        """
        return self.known_datasets

    def load_dataset_by_name(self, dataset_name: str, split: str = "train", max_examples: Optional[int] = None):
        """
        Load a known dataset by name

        Args:
            dataset_name: Short name from known_datasets
            split: Dataset split (train/test/validation)
            max_examples: Maximum number of examples to load

        Returns:
            List of examples in Q&A format
        """
        if dataset_name not in self.known_datasets:
            raise ValueError(f"Unknown dataset: {dataset_name}. Choose from: {list(self.known_datasets.keys())}")

        dataset_info = self.known_datasets[dataset_name]
        return self.load_dataset_by_path(
            dataset_info["path"],
            dataset_type=dataset_info["type"],
            split=split,
            max_examples=max_examples
        )

    def load_dataset_by_path(self, dataset_path: str, dataset_type: str = "auto",
                            split: str = "train", max_examples: Optional[int] = None):
        """
        Load a dataset from HuggingFace by path and convert it to Q&A format

        Args:
            dataset_path: Full path like "gbharti/finance-alpaca"
            dataset_type: Type of dataset (transaction/qa/auto). With "auto"
                the type is guessed from the first example.
            split: Dataset split
            max_examples: Maximum examples to load. When the split holds more
                rows than this, a random sample of that size is taken; a
                falsy value (None or 0) means "no limit".

        Returns:
            List of examples in Q&A format. An empty split loaded with
            dataset_type="auto" yields an empty list.

        Raises:
            Exception: On any failure (download error, gated dataset,
                unsupported dataset type, ...). The message is the text of
                the underlying error, which is attached as ``__cause__``.
        """
        print(f"Loading dataset: {dataset_path} (split: {split})...")

        try:
            # Get HuggingFace token from environment if available
            hf_token = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN")

            # Only pass `token` when one is configured, exactly as before
            load_kwargs = {"split": split}
            if hf_token:
                load_kwargs["token"] = hf_token

            try:
                dataset = load_dataset(dataset_path, **load_kwargs)
            except Exception as auth_error:
                error_msg = str(auth_error).lower()
                if "gated" in error_msg or "authenticated" in error_msg:
                    # Replace the library error with actionable instructions,
                    # keeping the original error reachable through __cause__.
                    raise Exception(
                        f"Dataset '{dataset_path}' requires authentication.\n\n"
                        f"This is a GATED dataset that requires special access.\n\n"
                        f"To use this dataset:\n"
                        f"1. Go to https://huggingface.co/datasets/{dataset_path}\n"
                        f"2. Click 'Access repository' and accept terms\n"
                        f"3. Make sure your HuggingFace token is set in Settings tab\n\n"
                        f"Or try one of the publicly accessible datasets instead:\n"
                        f"- gbharti/finance-alpaca (52K financial Q&A)\n"
                        f"- FinGPT/fingpt-finred (Financial relations)\n"
                        f"- virattt/financial-qa-10K (10-K filings Q&A)"
                    ) from auth_error
                raise

            # Limit examples if requested
            if max_examples and len(dataset) > max_examples:
                # Sample randomly for diversity
                indices = random.sample(range(len(dataset)), max_examples)
                dataset = dataset.select(indices)

            print(f"Loaded {len(dataset)} examples")

            # Auto-detect type if needed
            if dataset_type == "auto":
                if len(dataset) == 0:
                    # There is no first example to inspect; indexing
                    # dataset[0] here used to fail with an IndexError.
                    print("Dataset is empty; nothing to convert")
                    return []
                dataset_type = self._detect_dataset_type(dataset[0])
                print(f"Auto-detected type: {dataset_type}")

            # Convert to Q&A format
            if dataset_type == "transaction":
                converted = self._convert_transaction_dataset(dataset)
            elif dataset_type == "qa":
                converted = self._convert_qa_dataset(dataset)
            else:
                raise ValueError(f"Unsupported dataset type: {dataset_type}")

            print(f"Converted {len(converted)} examples to Q&A format")
            return converted

        except Exception as e:
            # Callers receive a plain Exception carrying the same message as
            # before; `from e` keeps the original type and traceback attached.
            raise Exception(str(e)) from e

    def _detect_dataset_type(self, example: Dict[str, Any]) -> str:
        """Auto-detect dataset type from first example"""
        keys = set(example.keys())

        # Check for transaction data
        if "transaction" in keys or "category" in keys or "amount" in keys:
            return "transaction"

        # Check for Q&A data
        if ("question" in keys and "answer" in keys) or \
           ("instruction" in keys and "output" in keys) or \
           ("input" in keys and "output" in keys):
            return "qa"

        return "unknown"

    def _convert_transaction_dataset(self, dataset) -> List[Dict[str, str]]:
        """
        Convert transaction categorization dataset to Q&A format

        Creates questions like:
        Q: "Categorize this transaction: $50.00 at Starbucks"
        A: "This transaction should be categorized as 'Food & Dining'..."
        """
        converted = []

        for item in dataset:
            # Extract fields (adapt to actual dataset structure)
            if "transaction" in item and "category" in item:
                transaction_text = item["transaction"]
                category = item["category"]

                # Get amount if available
                amount = item.get("amount", "")
                merchant = item.get("merchant", "")

                # Create Q&A pair
                qa_pair = self._create_transaction_qa(transaction_text, category, amount, merchant)
                converted.append(qa_pair)

            # Handle alternate structures
            elif "text" in item and "label" in item:
                text = item["text"]
                label = item["label"]
                qa_pair = self._create_transaction_qa(text, label, "", "")
                converted.append(qa_pair)

        return converted

    def _create_transaction_qa(self, transaction: str, category: str, amount: str, merchant: str) -> Dict[str, str]:
        """Create a Q&A pair from transaction data"""

        # Build transaction description
        transaction_desc = transaction

        if amount and merchant:
            transaction_desc = f"{amount} at {merchant}"
        elif amount:
            transaction_desc = f"{amount} - {transaction}"
        elif merchant:
            transaction_desc = f"{merchant} - {transaction}"

        # Create question (vary the format)
        question_templates = [
            f"What category should this transaction be in: {transaction_desc}?",
            f"How would you categorize this transaction: {transaction_desc}?",
            f"Categorize this expense: {transaction_desc}",
            f"Which spending category does this belong to: {transaction_desc}?",
            f"Help me categorize: {transaction_desc}"
        ]

        question = random.choice(question_templates)

        # Create detailed answer
        answer = self._generate_transaction_answer(transaction_desc, category)

        return {
            "instruction": question,
            "input": "",
            "output": answer
        }

    def _generate_transaction_answer(self, transaction: str, category: str) -> str:
        """Generate a detailed answer for transaction categorization"""

        # Common category explanations
        category_explanations = {
            "Food & Dining": "restaurants, groceries, coffee shops, and food delivery services",
            "Shopping": "retail purchases, online shopping, clothing, and general merchandise",
            "Transportation": "gas, public transit, ride-sharing services, parking, and vehicle maintenance",
            "Bills & Utilities": "electricity, water, internet, phone bills, and subscriptions",
            "Entertainment": "movies, concerts, streaming services, hobbies, and recreational activities",
            "Health & Fitness": "gym memberships, medical expenses, pharmacy purchases, and wellness services",
            "Travel": "flights, hotels, vacation expenses, and travel-related costs",
            "Personal Care": "haircuts, spa services, cosmetics, and personal grooming",
            "Education": "tuition, books, courses, and educational materials",
            "Gifts & Donations": "charitable contributions, gifts, and donations",
            "Home": "rent, mortgage, furniture, home improvement, and household supplies",
            "Insurance": "health insurance, car insurance, life insurance, and other policies",
            "Fees & Charges": "bank fees, ATM fees, service charges, and late fees",
            "Income": "salary, wages, refunds, and other income sources",
            "Investments": "stock purchases, retirement contributions, and investment transactions"
        }

        # Get explanation or use generic
        explanation = category_explanations.get(
            category,
            f"expenses related to {category.lower()}"
        )

        # Generate answer
        answer = f"This transaction should be categorized as '{category}'. "
        answer += f"This category typically includes {explanation}. "
        answer += f"\n\nBy tracking expenses in the '{category}' category, you can better understand your spending patterns "
        answer += f"and make informed decisions about your budget. "

        # Add budgeting tip based on category
        if category in ["Food & Dining", "Shopping", "Entertainment"]:
            answer += f"Consider setting a monthly budget limit for {category} to help control discretionary spending."
        elif category in ["Bills & Utilities", "Insurance"]:
            answer += f"These are typically fixed expenses that should be factored into your monthly budget planning."
        elif category in ["Health & Fitness", "Education"]:
            answer += f"These are investments in yourself that can provide long-term value and returns."
        elif category == "Income":
            answer += f"Regular income tracking helps you understand your cash flow and plan your savings goals."

        return answer

    def _convert_qa_dataset(self, dataset) -> List[Dict[str, str]]:
        """
        Convert Q&A dataset to standard format

        Handles various Q&A formats from HuggingFace
        """
        converted = []

        for item in dataset:
            qa_pair = {}

            # Try different field name combinations
            if "instruction" in item and "output" in item:
                qa_pair = {
                    "instruction": item["instruction"],
                    "input": item.get("input", ""),
                    "output": item["output"]
                }

            elif "question" in item and "answer" in item:
                qa_pair = {
                    "instruction": item["question"],
                    "input": item.get("context", ""),
                    "output": item["answer"]
                }

            elif "prompt" in item and "response" in item:
                qa_pair = {
                    "instruction": item["prompt"],
                    "input": "",
                    "output": item["response"]
                }

            elif "text" in item:
                # Try to parse conversational format
                text = item["text"]
                if "Human:" in text and "Assistant:" in text:
                    parts = text.split("Assistant:")
                    if len(parts) >= 2:
                        question = parts[0].replace("Human:", "").strip()
                        answer = parts[1].strip()
                        qa_pair = {
                            "instruction": question,
                            "input": "",
                            "output": answer
                        }

            if qa_pair:
                converted.append(qa_pair)

        return converted

    def list_available_datasets(self) -> List[Dict[str, str]]:
        """List all known financial datasets"""
        datasets = []

        for name, info in self.known_datasets.items():
            datasets.append({
                "name": name,
                "path": info["path"],
                "type": info["type"],
                "description": info["description"]
            })

        return datasets

    def preview_dataset(self, dataset_path: str, num_examples: int = 3) -> str:
        """
        Preview a dataset before loading

        Loads the "train" split (non-streaming) and renders its size, its
        field names and the first few examples, with each value cut to 100
        characters. Failures are reported in the returned text instead of
        being raised.

        Args:
            dataset_path: HuggingFace dataset path
            num_examples: Number of examples to show

        Returns:
            Formatted preview string, or an error/help message
        """
        try:
            # Use a HuggingFace token from the environment when one is set
            hf_token = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN")

            load_kwargs = {"split": "train", "streaming": False}
            if hf_token:
                load_kwargs["token"] = hf_token

            try:
                dataset = load_dataset(dataset_path, **load_kwargs)
            except Exception as auth_error:
                lowered = str(auth_error).lower()
                if not ("gated" in lowered or "authenticated" in lowered):
                    return f"Error: {auth_error}"
                # Access problem: explain how to unlock the dataset
                return (
                    f"⚠️ Dataset '{dataset_path}' requires authentication.\n\n"
                    f"This is a GATED dataset. To preview:\n"
                    f"1. Visit: https://huggingface.co/datasets/{dataset_path}\n"
                    f"2. Click 'Access repository' and accept terms\n"
                    f"3. Set your HuggingFace token in Settings tab\n\n"
                    f"Try these publicly accessible datasets instead:\n"
                    f"- gbharti/finance-alpaca\n"
                    f"- FinGPT/fingpt-finred\n"
                    f"- virattt/financial-qa-10K"
                )

            # Take the first N examples (fewer if the split is smaller)
            shown = dataset.select(range(min(num_examples, len(dataset))))

            chunks = [
                f"Dataset: {dataset_path}\n",
                f"Total examples: {len(dataset)}\n",
                f"Fields: {list(shown[0].keys())}\n\n",
                "Sample examples:\n",
                "=" * 60 + "\n\n",
            ]

            for position, example in enumerate(shown, 1):
                chunks.append(f"Example {position}:\n")
                for field, value in example.items():
                    chunks.append(f"  {field}: {str(value)[:100]}\n")
                chunks.append("\n")

            return "".join(chunks)

        except Exception as e:
            return f"Error previewing dataset: {e}"

    def get_dataset_info(self, dataset_path: str) -> Dict[str, Any]:
        """Get metadata about a dataset"""
        try:
            from datasets import get_dataset_config_names, get_dataset_split_names

            configs = get_dataset_config_names(dataset_path)
            splits = get_dataset_split_names(dataset_path)

            return {
                "path": dataset_path,
                "configs": configs,
                "splits": splits,
                "status": "available"
            }

        except Exception as e:
            return {
                "path": dataset_path,
                "error": str(e),
                "status": "error"
            }