SpencerCPurdy committed on
Commit
ff01b81
·
verified ·
1 Parent(s): 29b2b91

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -24
app.py CHANGED
@@ -66,7 +66,7 @@ if torch.cuda.is_available():
66
  # Core libraries
67
  import pandas as pd
68
  import numpy as np
69
- from sklearn.datasets import fetch_20newsgroups
70
  from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
71
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
72
  from sklearn.preprocessing import LabelEncoder, StandardScaler
@@ -185,7 +185,7 @@ class NewsGroupsDataLoader:
185
  Loads and preprocesses the 20 Newsgroups dataset.
186
 
187
  Dataset Information:
188
- - Source: 20 Newsgroups dataset (publicly available via scikit-learn)
189
  - License: Public domain
190
  - Size: ~18,000 newsgroup posts across 20 categories
191
  - Task: Multi-class text classification
@@ -208,26 +208,21 @@ class NewsGroupsDataLoader:
208
  Returns:
209
  Tuple of (train_df, val_df, test_df)
210
  """
211
- logger.info("Loading 20 Newsgroups dataset...")
212
-
213
- # Load training data
214
- train_data = fetch_20newsgroups(
215
- subset='train',
216
- remove=('headers', 'footers', 'quotes'),
217
- random_state=self.config.random_seed
218
- )
219
-
220
- # Load test data
221
- test_data = fetch_20newsgroups(
222
- subset='test',
223
- remove=('headers', 'footers', 'quotes'),
224
- random_state=self.config.random_seed
225
- )
226
-
227
  # Combine for proper splitting
228
- all_texts = list(train_data.data) + list(test_data.data)
229
- all_labels = list(train_data.target) + list(test_data.target)
230
- self.categories = train_data.target_names
 
 
231
 
232
  logger.info(f"Total documents: {len(all_texts)}")
233
  logger.info(f"Number of categories: {len(self.categories)}")
@@ -1326,7 +1321,7 @@ def create_gradio_interface(system: MultiAgentSystem, training_results: Dict):
1326
 
1327
  gr.Markdown("""
1328
  ### Model Limitations and Failure Cases
1329
-
1330
  **Known Limitations:**
1331
  1. **Domain Specificity**: Trained on newsgroup data, may not generalize well to
1332
  significantly different domains (e.g., legal documents, medical reports)
@@ -1393,7 +1388,7 @@ def create_gradio_interface(system: MultiAgentSystem, training_results: Dict):
1393
 
1394
  **Dataset:**
1395
  - 20 Newsgroups dataset
1396
- - Publicly available via scikit-learn
1397
  - Approximately 18,000 newsgroup posts
1398
  - 20 categories covering diverse topics
1399
  - No personal or sensitive information
@@ -1421,7 +1416,7 @@ def create_gradio_interface(system: MultiAgentSystem, training_results: Dict):
1421
  **Acknowledgments:**
1422
  - 20 Newsgroups dataset creators
1423
  - scikit-learn team
1424
- - Hugging Face for sentence-transformers
1425
  - Open source ML community
1426
  """)
1427
 
 
66
  # Core libraries
67
  import pandas as pd
68
  import numpy as np
69
+ from datasets import load_dataset
70
  from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
71
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
72
  from sklearn.preprocessing import LabelEncoder, StandardScaler
 
185
  Loads and preprocesses the 20 Newsgroups dataset.
186
 
187
  Dataset Information:
188
+ - Source: 20 Newsgroups dataset (publicly available via Hugging Face)
189
  - License: Public domain
190
  - Size: ~18,000 newsgroup posts across 20 categories
191
  - Task: Multi-class text classification
 
208
  Returns:
209
  Tuple of (train_df, val_df, test_df)
210
  """
211
+ logger.info("Loading 20 Newsgroups dataset from Hugging Face...")
212
+
213
+ # Load dataset from Hugging Face
214
+ dataset = load_dataset("SetFit/20_newsgroups")
215
+
216
+ # Extract train and test data
217
+ train_data = dataset['train']
218
+ test_data = dataset['test']
219
+
 
 
 
 
 
 
 
220
  # Combine for proper splitting
221
+ all_texts = train_data['text'] + test_data['text']
222
+ all_labels = train_data['label'] + test_data['label']
223
+
224
+ # Get category names from dataset features
225
+ self.categories = train_data.features['label'].names
226
 
227
  logger.info(f"Total documents: {len(all_texts)}")
228
  logger.info(f"Number of categories: {len(self.categories)}")
 
1321
 
1322
  gr.Markdown("""
1323
  ### Model Limitations and Failure Cases
1324
+
1325
  **Known Limitations:**
1326
  1. **Domain Specificity**: Trained on newsgroup data, may not generalize well to
1327
  significantly different domains (e.g., legal documents, medical reports)
 
1388
 
1389
  **Dataset:**
1390
  - 20 Newsgroups dataset
1391
+ - Publicly available via Hugging Face
1392
  - Approximately 18,000 newsgroup posts
1393
  - 20 categories covering diverse topics
1394
  - No personal or sensitive information
 
1416
  **Acknowledgments:**
1417
  - 20 Newsgroups dataset creators
1418
  - scikit-learn team
1419
+ - Hugging Face for sentence-transformers and dataset hosting
1420
  - Open source ML community
1421
  """)
1422