File size: 4,049 Bytes
0ccf2f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""
Comprehensive tests for HuggingFace Warbler pack ingestion utilities.

Tests the HF dataset ingestion pipeline that transforms datasets into
Warbler pack format for NPC intelligence training via the magma layer.
"""

import tempfile
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
import pytest
import click.testing

from warbler_cda.utils.hf_warbler_ingest import cli


class TestHuggingFaceWarblerIngestCLI:
    """Exercise the ``ingest`` subcommand of the HF ingestion CLI.

    Every test drives ``cli`` through ``click.testing.CliRunner``. Tests that
    patch ``click.echo`` / ``click.secho`` replace the console output calls
    with mocks, so those tests only check the process exit code.
    """

    @staticmethod
    def _describe(result):
        """Summarise a CliRunner result for use in assertion failure messages."""
        return (
            f"exit_code={result.exit_code}, "
            f"exception={result.exception!r}, "
            f"output={result.output!r}"
        )

    # Unconditional skip: a constant-true ``skipif`` is just ``skip``.
    @pytest.mark.skip(reason="MANUAL RUN ONLY - downloads actual datasets")
    @patch('warbler_cda.utils.hf_warbler_ingest.click.echo')
    def test_ingest_with_disabled_chunking(self, mock_echo):
        """Test ingestion with chunking disabled. MANUAL RUN ONLY - downloads actual datasets."""
        runner = click.testing.CliRunner()
        result = runner.invoke(cli, [
            'ingest',
            '--datasets', 'arxiv',
            '--max-docs-per-chunk', '0',  # Disable chunking
        ])

        # CLI handles gracefully even with transformer failures
        assert result.exit_code == 0, self._describe(result)

    @pytest.mark.skip(reason="MANUAL RUN ONLY - downloads actual datasets")
    @patch('warbler_cda.utils.hf_warbler_ingest.click.echo')
    @patch('warbler_cda.utils.hf_warbler_ingest.click.secho')
    def test_ingest_all_datasets(self, mock_secho, mock_echo):
        """Test ingesting all datasets with --datasets all. MANUAL RUN ONLY - downloads actual datasets."""
        runner = click.testing.CliRunner()
        result = runner.invoke(cli, ['ingest', '--datasets', 'all'])

        # Should complete without crashing
        assert result.exit_code == 0, self._describe(result)

    def test_dataset_validation(self):
        """Test that an unknown ``--datasets`` value is rejected with a usage error."""
        runner = click.testing.CliRunner()

        # Invalid dataset should show error
        result = runner.invoke(cli, ['ingest', '--datasets', 'invalid_dataset'])
        assert result.exit_code != 0, self._describe(result)
        assert "Invalid value" in result.output, self._describe(result)

    # NOTE(review): this test and test_pack_prefix_customization run a real
    # ``ingest --datasets arxiv`` and are not skipped, unlike the tests marked
    # "MANUAL RUN ONLY" above -- confirm they do not need network access.
    @patch('warbler_cda.utils.hf_warbler_ingest.click.echo')
    def test_pdf_page_limit_handling(self, mock_echo):
        """Test that ``--max-pdf-pages`` is accepted and the command exits cleanly."""
        runner = click.testing.CliRunner()
        result = runner.invoke(cli, [
            'ingest',
            '--datasets', 'arxiv',
            '--max-pdf-pages', '50',
        ])

        # Should complete gracefully
        assert result.exit_code == 0, self._describe(result)

    @patch('warbler_cda.utils.hf_warbler_ingest.click.echo')
    @patch('warbler_cda.utils.hf_warbler_ingest.click.secho')
    def test_pack_prefix_customization(self, mock_secho, mock_echo):
        """Test that a custom ``--pack-prefix`` is accepted and the command exits cleanly."""
        runner = click.testing.CliRunner()
        result = runner.invoke(cli, [
            'ingest',
            '--datasets', 'arxiv',
            '--pack-prefix', 'my-custom-prefix',
        ])

        assert result.exit_code == 0, self._describe(result)


class TestCLIParameterValidation:
    """Check the CLI help output; none of these tests run an ingestion."""

    def test_help_command(self):
        """Test that the top-level ``--help`` exits cleanly and shows the tool title."""
        runner = click.testing.CliRunner()
        result = runner.invoke(cli, ['--help'])
        assert result.exit_code == 0
        assert "HuggingFace Warbler Pack Ingestion Tool" in result.output

    def test_ingest_help(self):
        """Test that ``ingest --help`` exits cleanly and shows the subcommand summary."""
        runner = click.testing.CliRunner()
        result = runner.invoke(cli, ['ingest', '--help'])
        assert result.exit_code == 0
        assert "Ingest HF datasets into Warbler packs" in result.output

    def test_datasets_parameter_defaults_to_arxiv(self):
        """Test that ``--datasets`` is listed as an option in the ingest help.

        NOTE(review): despite the test name, the default value itself is not
        asserted. Click only prints defaults in help text when the option
        enables ``show_default``, which cannot be confirmed from this file --
        verify against the command definition before tightening this check.
        """
        runner = click.testing.CliRunner()
        # Request the help text rather than running the command, so nothing
        # is actually ingested.
        result = runner.invoke(cli, ['ingest', '--help'])
        assert result.exit_code == 0
        # The option must at least be advertised to the user.
        assert "--datasets" in result.output


if __name__ == "__main__":
    # pytest.main returns the run's exit code; propagate it so that running
    # this file directly reports test failures through the process status.
    raise SystemExit(pytest.main([__file__, "-v"]))