small fix for historical data loading
data.py (CHANGED)
@@ -6,6 +6,7 @@ import threading
 import traceback
 import json
 import re
+import random
 from typing import List, Tuple, Optional, Dict
 
 # NOTE: if caching is an issue, try adding `use_listings_cache=False`
@@ -60,6 +61,11 @@ KEYS_TO_KEEP = [
 # HELPER FUNCTIONS
 # ============================================================================
 
+def generate_fake_dates(num_days: int = 7) -> List[str]:
+    """Generate fake dates for the last N days."""
+    today = datetime.now()
+    return [(today - timedelta(days=i)).strftime("%Y-%m-%d") for i in range(num_days)]
+
 def parse_json_field(value) -> dict:
     """Safely parse a JSON field that might be a string or dict."""
     if value is None or pd.isna(value):
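A note on ordering: `generate_fake_dates` yields the newest date first, which is what the rest of the file expects (`available_dates[0]` is treated as the most recent day, `available_dates[-1]` as the oldest). A standalone copy of the helper, runnable on its own, with illustrative output:

    from datetime import datetime, timedelta
    from typing import List

    def generate_fake_dates(num_days: int = 7) -> List[str]:
        """Generate fake dates for the last N days, newest first."""
        today = datetime.now()
        return [(today - timedelta(days=i)).strftime("%Y-%m-%d") for i in range(num_days)]

    print(generate_fake_dates(3))  # e.g. ['2025-07-03', '2025-07-02', '2025-07-01']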
@@ -100,6 +106,8 @@ def log_dataframe_link(link: str) -> str:
     Adds the link to the dataset in the logs, modifies it to get a clickable link and then returns the date of the
     report.
     """
+    if link.startswith("sample_"):
+        return "9999-99-99"
     logger.info(f"Reading df located at {link}")
     # Make sure the link starts with an http address
     if link.startswith("hf://"):
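The `"9999-99-99"` sentinel returned for sample links is not a parseable date; it only works as long as callers compare report dates as ISO strings, which sort lexicographically. A quick illustration of that assumption:

    # String comparison puts the sentinel after any real ISO date
    dates = ["2025-07-01", "9999-99-99", "2025-06-30"]
    print(max(dates))                   # '9999-99-99'
    print(sorted(dates, reverse=True))  # sentinel first when sorting newest-first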
@@ -175,6 +183,7 @@ def get_available_dates() -> List[str]:
         return common_dates[:30]  # Limit to last 30 days
 
     # No real dates available - log warning and return empty list
+    # This will allow the system to fall back to sample data properly
    logger.warning("No common dates found between AMD and NVIDIA datasets")
    return []
 
@@ -243,11 +252,15 @@ def get_data_for_date(target_date: str) -> tuple[pd.DataFrame, str]:
 
    except Exception as e:
        logger.error(f"Error getting data for date {target_date}: {e}")
+        # Return empty dataframe instead of sample data for historical functionality
        return pd.DataFrame(), target_date
 
 
-def get_historical_data(start_date: str, end_date: str) -> pd.DataFrame:
+def get_historical_data(start_date: str, end_date: str, sample_data: bool = False) -> pd.DataFrame:
     """Get historical data for a date range."""
+    if sample_data:
+        return get_fake_historical_data(start_date, end_date)
+
     try:
         start_dt = datetime.strptime(start_date, "%Y-%m-%d")
         end_dt = datetime.strptime(end_date, "%Y-%m-%d")
@@ -271,7 +284,7 @@ def get_historical_data(start_date: str, end_date: str) -> pd.DataFrame:
 
    except Exception as e:
        logger.error(f"Error getting historical data: {e}")
-        return
+        return get_fake_historical_data(start_date, end_date)
 
 
 def get_distant_data() -> tuple[pd.DataFrame, str]:
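`get_historical_data` now returns fake data on two paths: when the caller opts in with `sample_data=True`, and when the real load raises. A minimal sketch of that control flow; `load_real` and `make_fake` are hypothetical stand-ins for the real fetch and `get_fake_historical_data`:

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    def load_real(start: str, end: str) -> str:
        raise ConnectionError("dataset unreachable")  # simulate the offline case

    def make_fake(start: str, end: str) -> str:
        return f"synthetic rows for {start}..{end}"

    def get_historical(start: str, end: str, sample_data: bool = False) -> str:
        if sample_data:  # explicit opt-in
            return make_fake(start, end)
        try:
            return load_real(start, end)
        except Exception as e:  # implicit fallback on any loading error
            logger.error(f"Error getting historical data: {e}")
            return make_fake(start, end)

    print(get_historical("2025-07-01", "2025-07-07"))  # synthetic rows for ...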
@@ -303,6 +316,65 @@ def get_distant_data() -> tuple[pd.DataFrame, str]:
     return filtered_joined, latest_update_msg
 
 
+def get_sample_data() -> tuple[pd.DataFrame, str]:
+    # Retrieve sample dataframes
+    df_amd, _ = read_one_dataframe("sample_amd.json", "amd")
+    df_nvidia, _ = read_one_dataframe("sample_nvidia.json", "nvidia")
+    # Join both dataframes
+    joined = df_amd.join(df_nvidia, rsuffix="_nvidia", lsuffix="_amd", how="outer")
+    joined = joined[KEYS_TO_KEEP]
+    joined.index = joined.index.str.replace("^models_", "", regex=True)
+    # Filter out all but important models
+    important_models_lower = [model.lower() for model in IMPORTANT_MODELS]
+    filtered_joined = joined[joined.index.str.lower().isin(important_models_lower)]
+    # Prefix all model names with "sample_"
+    filtered_joined.index = "sample_" + filtered_joined.index
+    return filtered_joined, "sample data was loaded"
+
+
+def get_fake_historical_data(start_date: str, end_date: str) -> pd.DataFrame:
+    """Generate fake historical data for a date range when real data loading fails."""
+    try:
+        start_dt = datetime.strptime(start_date, "%Y-%m-%d")
+        end_dt = datetime.strptime(end_date, "%Y-%m-%d")
+        sample_df, _ = get_sample_data()
+        historical_data = []
+
+        # Generate data for each date
+        current_dt = start_dt
+        while current_dt <= end_dt:
+            date_df = sample_df.copy()
+            date_df['date'] = current_dt.strftime("%Y-%m-%d")
+
+            # Add random variations to make it realistic
+            for idx in date_df.index:
+                # Vary success/skipped counts (±20%)
+                for col in ['success_amd', 'success_nvidia', 'skipped_amd', 'skipped_nvidia']:
+                    if col in date_df.columns and pd.notna(date_df.loc[idx, col]):
+                        val = date_df.loc[idx, col]
+                        if val > 0:
+                            date_df.loc[idx, col] = max(0, int(val * random.uniform(0.8, 1.2)))
+
+                # Vary failure counts more dramatically (±50-100%)
+                for col in ['failed_multi_no_amd', 'failed_multi_no_nvidia', 'failed_single_no_amd', 'failed_single_no_nvidia']:
+                    if col in date_df.columns and pd.notna(date_df.loc[idx, col]):
+                        val = date_df.loc[idx, col]
+                        date_df.loc[idx, col] = max(0, int(val * random.uniform(0.5, 2.0)))
+
+            historical_data.append(date_df)
+            current_dt += timedelta(days=1)
+
+        if not historical_data:
+            return pd.DataFrame()
+
+        combined_df = pd.concat(historical_data, ignore_index=False)
+        logger.info(f"Generated fake historical data: {len(combined_df)} records from {start_date} to {end_date}")
+        return combined_df
+
+    except Exception as e:
+        logger.error(f"Error generating fake historical data: {e}")
+        return pd.DataFrame()
+
 def find_failure_first_seen(historical_df: pd.DataFrame, model_name: str, test_name: str, device: str, gpu_type: str) -> Optional[str]:
     """Find the first date when a specific test failure appeared in historical data."""
     if historical_df is None or historical_df.empty:
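One detail worth calling out in `get_fake_historical_data`: `pd.concat(..., ignore_index=False)` keeps the model names as the index, so each model appears once per day and the added `date` column is what distinguishes the stacked copies. A toy reproduction with invented numbers:

    import pandas as pd

    day1 = pd.DataFrame({"success_amd": [100], "date": ["2025-07-01"]}, index=["sample_bert"])
    day2 = pd.DataFrame({"success_amd": [93], "date": ["2025-07-02"]}, index=["sample_bert"])

    # Keeps the duplicate 'sample_bert' index entries instead of renumbering them
    combined = pd.concat([day1, day2], ignore_index=False)
    print(combined)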
@@ -438,25 +510,48 @@ class CIResults:
         self.available_dates = []
         self.historical_df = pd.DataFrame()
         self.all_historical_data = pd.DataFrame()  # Store all historical data at startup
+        self.sample_data = False
 
     def load_data(self) -> None:
         """Load data from the data source."""
-
-
-
+        # Try loading the distant data, and fall back on sample data for local tinkering
+        try:
+            logger.info("Loading distant data...")
+            new_df, latest_update_msg = get_distant_data()
+            self.latest_update_msg = latest_update_msg
+            self.sample_data = False
+        except Exception as e:
+            error_msg = [
+                "Loading data failed:",
+                "-" * 120,
+                traceback.format_exc(),
+                "-" * 120,
+                "Falling back on sample data."
+            ]
+            logger.error("\n".join(error_msg))
+            self.sample_data = True
+            new_df, latest_update_msg = get_sample_data()
+            self.latest_update_msg = latest_update_msg
 
-        #
+        # Try to get available dates
         try:
-            self.
-
-
-
+            if not self.sample_data:
+                self.available_dates = get_available_dates()
+                logger.info(f"Available dates: {len(self.available_dates)} dates")
+                if self.available_dates:
+                    logger.info(f"Date range: {self.available_dates[-1]} to {self.available_dates[0]}")
+                else:
+                    logger.warning("No available dates found")
+                    self.available_dates = []
             else:
-
-                self.available_dates =
+                # Generate fake dates for sample data historical functionality
+                self.available_dates = generate_fake_dates()
         except Exception as e:
             logger.warning(f"Failed to get available dates: {e}")
-            self.
+            if self.sample_data:
+                self.available_dates = generate_fake_dates()
+            else:
+                self.available_dates = []
 
         # Update attributes
         self.df = new_df
@@ -465,6 +560,13 @@ class CIResults:
         # Load all historical data at startup
         self.load_all_historical_data()
 
+        # Update historical_df with latest available dates after reload
+        if self.available_dates:
+            start_date_val = self.available_dates[-1]  # Last date (oldest)
+            end_date_val = self.available_dates[0]  # First date (newest)
+            self.load_historical_data(start_date_val, end_date_val)
+            logger.info(f"Updated historical_df with {len(self.historical_df)} records")
+
         # Log and return distant load status
         logger.info(f"Data loaded successfully: {len(self.available_models)} models")
         logger.info(f"Models: {self.available_models[:5]}{'...' if len(self.available_models) > 5 else ''}")
@@ -483,7 +585,7 @@ class CIResults:
         logger.info(json.dumps(msg, indent=4))
 
     def load_all_historical_data(self) -> None:
-        """Load all available historical data
+        """Load all available historical data. Replaces existing data to ensure latest dates are included."""
         try:
             if not self.available_dates:
                 logger.warning("No available dates found, skipping historical data load")
@@ -492,8 +594,12 @@ class CIResults:
 
             logger.info(f"Loading all historical data for {len(self.available_dates)} dates...")
             start_date, end_date = self.available_dates[-1], self.available_dates[0]
-
+            logger.info(f"Date range: {start_date} to {end_date}")
+            self.all_historical_data = get_historical_data(start_date, end_date, self.sample_data)
             logger.info(f"All historical data loaded: {len(self.all_historical_data)} records")
+            if not self.all_historical_data.empty:
+                unique_dates = sorted(self.all_historical_data['date'].unique())
+                logger.info(f"Loaded dates: {unique_dates[0]} to {unique_dates[-1]} ({len(unique_dates)} unique dates)")
         except Exception as e:
             logger.error(f"Error loading all historical data: {e}")
             self.all_historical_data = pd.DataFrame()
@@ -544,3 +650,4 @@ class CIResults:
         timer.daemon = True
         timer.start()
         logger.info("Data auto-reload scheduled every 15 minutes")
+
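Taken together, the change should let the dashboard come up with no network access: `load_data` falls back to `get_sample_data`, dates come from `generate_fake_dates`, and history comes from `get_fake_historical_data`. A minimal smoke test, assuming `data.py` ships alongside the `sample_amd.json` / `sample_nvidia.json` files and exposes `CIResults` as in the diff:

    from data import CIResults

    ci = CIResults()
    ci.load_data()
    print(ci.sample_data)               # True when the distant load failed
    print(len(ci.available_dates))      # 7 fake dates in that case
    print(len(ci.all_historical_data))  # fake per-day records across those dates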