Spaces:
Running
on
Zero
Running
on
Zero
"""
Test suite for PDF-based dataset ingestion.

Covers datasets that provide PDF fields instead of text content, including
fallback handling when PDF extraction is unavailable.
"""
| import pytest | |
| import json | |
| import sys | |
| from pathlib import Path | |
| from unittest.mock import Mock, patch, MagicMock | |
| from typing import Dict, List, Any | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from warbler_cda.utils.transformers import ( | |
| NovelsTransformer, | |
| PortugueseEducationTransformer, | |
| EnterpriseTransformer, | |
| ArxivTransformer, | |
| PromptReportTransformer, | |
| ManualsTransformer, | |
| ) | |
class TestPDFExtraction:
    """Smoke checks on a default-constructed ``NovelsTransformer``.

    NOTE(review): despite the class and method names, nothing in these tests
    exercises PDF extraction itself; each one only inspects a freshly built
    transformer object.
    """

    def test_pdf_support_detection(self):
        """The transformer builds without arguments and exposes ``transform``."""
        novels = NovelsTransformer()

        assert novels is not None
        assert hasattr(novels, "transform")

    def test_pdf_extraction_method_exists(self):
        """``transform`` is present on the transformer and is callable."""
        novels = NovelsTransformer()

        assert hasattr(novels, "transform")
        assert callable(novels.transform)

    def test_placeholder_creation_method_exists(self):
        """The transformer can be constructed with no arguments."""
        novels = NovelsTransformer()

        assert novels is not None
        # Every Python object has ``__class__``; the check is kept only to
        # mirror the existing assertion set.
        assert hasattr(novels, "__class__")
class TestNovelDatasetWithPDF:
    """Novel dataset ingestion for records with and without a text field."""

    # Patch target: the ``load_dataset`` name as looked up by the novels module.
    _LOADER_TARGET = "warbler_cda.utils.transformers.novels.load_dataset"

    def _transform_single(self, record):
        """Run ``NovelsTransformer.transform`` over a mocked one-record dataset.

        The transformer is built before the patch is entered, and
        ``load_dataset`` is replaced by a mock whose return value iterates
        over ``[record]``.
        """
        novels = NovelsTransformer()
        fake_dataset = MagicMock()
        fake_dataset.__iter__.return_value = [record]
        with patch(self._LOADER_TARGET, return_value=fake_dataset):
            return novels.transform()

    def test_novel_transform_handles_missing_fields(self):
        """A record with only ``pdf`` and ``title`` still yields a narrative doc."""
        produced = self._transform_single(
            {"pdf": b"fake_pdf_bytes", "title": "Test Novel"}
        )

        assert len(produced) > 0
        first = produced[0]
        assert "content" in first
        assert "metadata" in first
        assert first["metadata"]["realm_type"] == "narrative"

    def test_novel_with_text_field(self):
        """A record with a real ``text`` field yields content and metadata."""
        record = {
            "text": "Once upon a time there was a kingdom far away. " * 50,
            "title": "Story of the Kingdom",
        }
        produced = self._transform_single(record)

        assert len(produced) > 0
        first = produced[0]
        assert "content" in first
        assert "metadata" in first

    def test_novel_transformer_output_format(self):
        """Every produced document carries the expected keys and metadata values."""
        record = {"text": "Novel content here. " * 100, "title": "Test Novel"}
        expected_metadata = {
            "pack": "warbler-pack-novels",
            "realm_type": "narrative",
            "license": "MIT",
        }
        produced = self._transform_single(record)

        assert len(produced) > 0
        for item in produced:
            for key in ("content_id", "content", "metadata"):
                assert key in item
            meta = item["metadata"]
            # Presence is asserted before the value so a missing key fails as
            # an assertion rather than a KeyError.
            for key, value in expected_metadata.items():
                assert key in meta
                assert meta[key] == value
class TestPortugueseEducationWithPDF:
    """Portuguese education ingestion for PDF-only and text-bearing records."""

    # Patch target: ``load_dataset`` as looked up by the portuguese_education module.
    _LOADER_TARGET = "warbler_cda.utils.transformers.portuguese_education.load_dataset"

    def _transform_single(self, record):
        """Run ``PortugueseEducationTransformer.transform`` over one mocked record.

        The transformer is built before the patch is entered, and
        ``load_dataset`` is replaced by a mock whose return value iterates
        over ``[record]``.
        """
        education = PortugueseEducationTransformer()
        fake_dataset = MagicMock()
        fake_dataset.__iter__.return_value = [record]
        with patch(self._LOADER_TARGET, return_value=fake_dataset):
            return education.transform()

    def test_portuguese_handles_pdf_field(self):
        """A record with only ``pdf`` and ``title`` yields an educational doc."""
        produced = self._transform_single(
            {"pdf": b"pdf_content_bytes", "title": "Introdução à Programação"}
        )

        assert len(produced) > 0
        first = produced[0]
        assert "content" in first
        assert "metadata" in first
        assert first["metadata"]["realm_type"] == "educational"

    def test_portuguese_with_text_field(self):
        """A record with a ``content`` text field yields content and metadata."""
        record = {
            "content": "A programação é a arte de instruir o computador.",
            "title": "Introdução à Programação",
            "language": "pt",
        }
        produced = self._transform_single(record)

        assert len(produced) > 0
        first = produced[0]
        assert "content" in first
        assert "metadata" in first
class TestEnterpriseDatasetFallback:
    """Enterprise dataset ingestion, including the load-failure path."""

    # Patch target: ``load_dataset`` as looked up by the enterprise module.
    _LOADER_TARGET = "warbler_cda.utils.transformers.enterprise.load_dataset"

    def test_enterprise_load_error_handling(self):
        """``transform`` returns a list even when ``load_dataset`` raises."""
        enterprise = EnterpriseTransformer()
        failure = RuntimeError("Dataset generation failed")

        with patch(self._LOADER_TARGET, side_effect=failure):
            outcome = enterprise.transform()

        assert isinstance(outcome, list)

    def test_enterprise_with_messages(self):
        """A record holding a ``messages`` conversation yields a doc with content."""
        enterprise = EnterpriseTransformer()
        conversation = [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": "How do I deploy this?"},
            {"role": "assistant", "content": "Here are the steps..."},
        ]
        fake_dataset = MagicMock()
        fake_dataset.__iter__.return_value = [{"messages": conversation}]

        with patch(self._LOADER_TARGET, return_value=fake_dataset):
            produced = enterprise.transform()

        assert len(produced) > 0
        first = produced[0]
        assert "content" in first
class TestDatasetIntegration:
    """Integration-level sanity checks for dataset ingestion."""

    def test_all_datasets_without_actual_api_calls(self):
        """Every imported transformer builds with no arguments and exposes ``transform``.

        Only construction and attribute lookup happen here; ``transform`` is
        never invoked, so no dataset is loaded.
        """
        transformer_classes = (
            ArxivTransformer,
            PromptReportTransformer,
            NovelsTransformer,
            ManualsTransformer,
            PortugueseEducationTransformer,
            # Imported by this module alongside the others, so it belongs in
            # the "all transformers" sweep as well.
            EnterpriseTransformer,
        )

        for transformer_class in transformer_classes:
            instance = transformer_class()
            class_name = transformer_class.__name__
            assert hasattr(instance, "transform"), (
                f"{class_name} has no 'transform' attribute"
            )
            assert callable(instance.transform), (
                f"{class_name}.transform is not callable"
            )

    def test_documents_have_required_fields(self):
        """A reference document contains every required top-level and metadata key.

        NOTE(review): the document is a hand-written literal, so this pins the
        expected schema rather than checking any transformer's output.
        """
        test_doc = {
            "content_id": "test/1",
            "content": "Test content for validation",
            "metadata": {
                "pack": "warbler-pack-test",
                "source_dataset": "test",
                "realm_type": "test",
                "realm_label": "test",
                "lifecycle_stage": "emergence",
                "activity_level": 0.7,
                "license": "MIT",
            },
        }
        required_fields = ["content_id", "content", "metadata"]
        required_metadata = [
            "pack",
            "source_dataset",
            "realm_type",
            "realm_label",
            "lifecycle_stage",
            "activity_level",
            "license",
        ]

        # Collect all absentees first so a failure lists every missing key.
        missing_fields = [name for name in required_fields if name not in test_doc]
        assert not missing_fields, f"missing top-level fields: {missing_fields}"

        missing_metadata = [
            name for name in required_metadata if name not in test_doc["metadata"]
        ]
        assert not missing_metadata, f"missing metadata fields: {missing_metadata}"