Spaces:
Sleeping
Sleeping
YanBoChen
WIP: Remove obsolete files and implement cloud data loading for customization and retrieval systems
d603ef9
| """Customization System Cloud Configuration""" | |
| import os | |
| from pathlib import Path | |
| from huggingface_hub import hf_hub_download | |
| import logging | |
| logger = logging.getLogger(__name__) | |
class CustomizationCloudLoader:
    """Locate customization "processing" files in the cloud or on local disk.

    In cloud mode every file is fetched with ``hf_hub_download`` from the
    ``dataset_repo`` dataset repository, under the
    ``customization_data/processing/`` prefix. In local mode paths are
    resolved against a ``customization/processing`` folder found two levels
    above the directory that contains this module.
    """

    def __init__(self):
        self.dataset_repo = "ybchen928/oncall-guide-ai-models"
        # Cloud mode is the default. Only the value "true" enables it; the
        # comparison ignores case and surrounding whitespace so that a stray
        # space in the environment setting does not silently select local mode.
        self.use_cloud = os.getenv('USE_CLOUD_DATA', 'true').strip().lower() == 'true'

    def _local_processing_dir(self) -> Path:
        """Return the local-development ``customization/processing`` folder."""
        return Path(__file__).parent.parent.parent / "customization" / "processing"

    def get_processing_file_path(self, relative_path: str) -> str:
        """Get processing file path for Customization Pipeline.

        Args:
            relative_path: Path of the file relative to the processing
                folder, e.g. ``"indices/chunk_mappings.json"``.

        Returns:
            In cloud mode, the value returned by ``hf_hub_download`` for the
            requested file. In local mode, the path inside the local
            processing folder (the file is not checked for existence).

        Raises:
            Whatever ``hf_hub_download`` raises when the download fails
            (cloud mode only).
        """
        if self.use_cloud:
            return hf_hub_download(
                repo_id=self.dataset_repo,
                filename=f"customization_data/processing/{relative_path}",
                repo_type="dataset",
            )
        # Local development mode - correct path to processing folder
        return str(self._local_processing_dir() / relative_path)

    def preload_all_processing_files(self) -> tuple:
        """Preload all processing files and return directory paths.

        Returns:
            A ``(embeddings_dir, indices_dir)`` pair of strings.

        Raises:
            Whatever ``hf_hub_download`` raises if one of the two anchor
            files used to derive the directories cannot be fetched. Failures
            for any other file are logged as warnings and skipped.
        """
        if not self.use_cloud:
            # Local development mode
            base_path = self._local_processing_dir()
            return str(base_path / "embeddings"), str(base_path / "indices")

        embeddings_anchor = "embeddings/document_index.json"
        indices_anchor = "indices/chunk_mappings.json"
        files_to_download = (
            embeddings_anchor,
            "embeddings/tag_embeddings.json",
            "embeddings/document_tag_mapping.json",
            "embeddings/chunk_embeddings.json",
            indices_anchor,
            "indices/tag_mappings.json",
            "indices/annoy_metadata.json",
            "indices/chunk_embeddings.ann",
            "indices/tag_embeddings.ann",
            "mapping.json",
        )

        # Download each file so they are all cached, remembering where each
        # one landed so the anchors do not have to be requested a second time.
        downloaded = {}
        for file_path in files_to_download:
            try:
                downloaded[file_path] = self.get_processing_file_path(file_path)
            except Exception as e:
                # Best effort: one missing file must not abort the preload.
                logger.warning("Failed to download %s: %s", file_path, e)
            else:
                logger.info("Downloaded: %s", file_path)

        def anchor_path(name: str) -> str:
            # An anchor that failed above is requested once more so that the
            # underlying download error reaches the caller instead of being
            # masked by a lookup failure.
            found = downloaded.get(name)
            if found is None:
                found = self.get_processing_file_path(name)
            return found

        # The directories are taken from the anchors' parent folders; this
        # relies on the downloader keeping the repository's folder layout.
        embeddings_dir = Path(anchor_path(embeddings_anchor)).parent
        indices_dir = Path(anchor_path(indices_anchor)).parent
        return str(embeddings_dir), str(indices_dir)
# Shared module-level loader, constructed once when this module is imported.
customization_loader = CustomizationCloudLoader()