File size: 13,161 Bytes
016b413
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
"""Unit tests for Phase 7: Judge integration in iterative research flow."""

from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from src.orchestrator.research_flow import IterativeResearchFlow
from src.utils.models import (
    AgentSelectionPlan,
    AgentTask,
    AssessmentDetails,
    JudgeAssessment,
    KnowledgeGapOutput,
    ToolAgentOutput,
)


@pytest.fixture
def mock_judge_handler():
    """Create a mock judge handler."""
    judge = MagicMock()
    judge.assess = AsyncMock()
    return judge


def create_judge_assessment(
    sufficient: bool,
    confidence: float,
    recommendation: str,
    reasoning: str,
) -> JudgeAssessment:
    """Helper to create a valid JudgeAssessment."""
    return JudgeAssessment(
        details=AssessmentDetails(
            mechanism_score=5,
            mechanism_reasoning="Test mechanism reasoning that is long enough",
            clinical_evidence_score=5,
            clinical_reasoning="Test clinical reasoning that is long enough",
            drug_candidates=[],
            key_findings=[],
        ),
        sufficient=sufficient,
        confidence=confidence,
        recommendation=recommendation,
        reasoning=reasoning,
    )


@pytest.fixture
def mock_agents():
    """Create mock agents for the flow."""
    return {
        "knowledge_gap": AsyncMock(),
        "tool_selector": AsyncMock(),
        "thinking": AsyncMock(),
        "writer": AsyncMock(),
    }


@pytest.fixture
def flow_with_judge(mock_agents, mock_judge_handler):
    """Create an IterativeResearchFlow with mocked agents and judge."""
    with (
        patch("src.orchestrator.research_flow.create_knowledge_gap_agent") as mock_kg,
        patch("src.orchestrator.research_flow.create_tool_selector_agent") as mock_ts,
        patch("src.orchestrator.research_flow.create_thinking_agent") as mock_thinking,
        patch("src.orchestrator.research_flow.create_writer_agent") as mock_writer,
        patch("src.orchestrator.research_flow.create_judge_handler") as mock_judge_factory,
        patch("src.orchestrator.research_flow.execute_tool_tasks") as mock_execute,
        patch("src.orchestrator.research_flow.get_workflow_state") as mock_state,
    ):
        mock_kg.return_value = mock_agents["knowledge_gap"]
        mock_ts.return_value = mock_agents["tool_selector"]
        mock_thinking.return_value = mock_agents["thinking"]
        mock_writer.return_value = mock_agents["writer"]
        mock_judge_factory.return_value = mock_judge_handler
        mock_execute.return_value = {
            "task_1": ToolAgentOutput(output="Finding 1", sources=["url1"]),
        }

        # Mock workflow state
        mock_state_obj = MagicMock()
        mock_state_obj.evidence = []
        mock_state_obj.add_evidence = MagicMock(return_value=1)
        mock_state.return_value = mock_state_obj

        return IterativeResearchFlow(max_iterations=2, max_time_minutes=5)


@pytest.mark.unit
@pytest.mark.asyncio
class TestJudgeIntegration:
    """Tests for judge integration in iterative research flow."""

    async def test_judge_called_after_tool_execution(
        self, flow_with_judge, mock_agents, mock_judge_handler
    ):
        """Judge should be called after tool execution."""
        # Mock knowledge gap agent to return incomplete
        mock_agents["knowledge_gap"].evaluate = AsyncMock(
            return_value=KnowledgeGapOutput(
                research_complete=False,
                outstanding_gaps=["Need more info"],
            )
        )

        # Mock thinking agent
        mock_agents["thinking"].generate_observations = AsyncMock(return_value="Initial thoughts")

        # Mock tool selector
        mock_agents["tool_selector"].select_tools = AsyncMock(
            return_value=AgentSelectionPlan(
                tasks=[
                    AgentTask(
                        gap="Need more info",
                        agent="WebSearchAgent",
                        query="test query",
                    )
                ]
            )
        )

        # Mock judge to return sufficient
        mock_judge_handler.assess = AsyncMock(
            return_value=create_judge_assessment(
                sufficient=True,
                confidence=0.9,
                recommendation="synthesize",
                reasoning="Evidence is sufficient to provide a comprehensive answer.",
            )
        )

        # Mock writer
        mock_agents["writer"].write_report = AsyncMock(
            return_value="# Final Report\n\nContent here."
        )

        result = await flow_with_judge.run("Test query")

        # Verify judge was called
        assert mock_judge_handler.assess.called
        assert isinstance(result, str)
        assert "Final Report" in result

    async def test_loop_completes_when_judge_says_sufficient(
        self, flow_with_judge, mock_agents, mock_judge_handler
    ):
        """Loop should complete when judge says evidence is sufficient."""
        # Mock knowledge gap to return incomplete
        mock_agents["knowledge_gap"].evaluate = AsyncMock(
            return_value=KnowledgeGapOutput(
                research_complete=False,
                outstanding_gaps=["Need more info"],
            )
        )

        mock_agents["thinking"].generate_observations = AsyncMock(return_value="Thoughts")

        mock_agents["tool_selector"].select_tools = AsyncMock(
            return_value=AgentSelectionPlan(
                tasks=[
                    AgentTask(
                        gap="Need more info",
                        agent="WebSearchAgent",
                        query="test",
                    )
                ]
            )
        )

        # Judge says sufficient
        mock_judge_handler.assess = AsyncMock(
            return_value=create_judge_assessment(
                sufficient=True,
                confidence=0.95,
                recommendation="synthesize",
                reasoning="Enough evidence has been collected to synthesize a comprehensive answer.",
            )
        )

        mock_agents["writer"].write_report = AsyncMock(return_value="# Report\n\nDone.")

        result = await flow_with_judge.run("Test query")

        # Should complete after judge says sufficient
        assert flow_with_judge.should_continue is False
        assert mock_judge_handler.assess.called
        assert isinstance(result, str)

    async def test_loop_continues_when_judge_says_insufficient(
        self, flow_with_judge, mock_agents, mock_judge_handler
    ):
        """Loop should continue when judge says evidence is insufficient."""
        call_count = {"kg": 0, "judge": 0}

        def mock_kg_evaluate(*args, **kwargs):
            call_count["kg"] += 1
            if call_count["kg"] == 1:
                return KnowledgeGapOutput(
                    research_complete=False,
                    outstanding_gaps=["Need more info"],
                )
            # Second call: complete
            return KnowledgeGapOutput(
                research_complete=True,
                outstanding_gaps=[],
            )

        def mock_judge_assess(*args, **kwargs):
            call_count["judge"] += 1
            # First call: insufficient
            if call_count["judge"] == 1:
                return create_judge_assessment(
                    sufficient=False,
                    confidence=0.5,
                    recommendation="continue",
                    reasoning="Need more evidence to provide a comprehensive answer.",
                )
            # Second call: sufficient (but won't be reached due to max_iterations)
            return create_judge_assessment(
                sufficient=True,
                confidence=0.9,
                recommendation="synthesize",
                reasoning="Enough evidence has now been collected to proceed.",
            )

        mock_agents["knowledge_gap"].evaluate = AsyncMock(side_effect=mock_kg_evaluate)
        mock_agents["thinking"].generate_observations = AsyncMock(return_value="Thoughts")
        mock_agents["tool_selector"].select_tools = AsyncMock(
            return_value=AgentSelectionPlan(
                tasks=[
                    AgentTask(
                        gap="Need more info",
                        agent="WebSearchAgent",
                        query="test",
                    )
                ]
            )
        )
        mock_judge_handler.assess = AsyncMock(side_effect=mock_judge_assess)
        mock_agents["writer"].write_report = AsyncMock(return_value="# Report\n\nDone.")

        result = await flow_with_judge.run("Test query")

        # Judge should be called
        assert mock_judge_handler.assess.called
        # Should eventually complete
        assert isinstance(result, str)

    async def test_judge_receives_evidence_from_state(
        self, flow_with_judge, mock_agents, mock_judge_handler
    ):
        """Judge should receive evidence from workflow state."""
        from src.utils.models import Citation, Evidence

        # Create mock evidence in state
        mock_evidence = [
            Evidence(
                content="Test evidence content",
                citation=Citation(
                    source="rag",  # Use valid SourceName
                    title="Test Title",
                    url="https://example.com",
                    date="2024-01-01",
                    authors=[],
                ),
                relevance=0.8,
            )
        ]

        # Mock state to return evidence
        from unittest.mock import patch

        with patch("src.orchestrator.research_flow.get_workflow_state") as mock_state:
            mock_state_obj = MagicMock()
            mock_state_obj.evidence = mock_evidence
            mock_state_obj.add_evidence = MagicMock(return_value=1)
            mock_state.return_value = mock_state_obj

            mock_agents["knowledge_gap"].evaluate = AsyncMock(
                return_value=KnowledgeGapOutput(
                    research_complete=False,
                    outstanding_gaps=["Need info"],
                )
            )
            mock_agents["thinking"].generate_observations = AsyncMock(return_value="Thoughts")
            mock_agents["tool_selector"].select_tools = AsyncMock(
                return_value=AgentSelectionPlan(
                    tasks=[
                        AgentTask(
                            gap="Need info",
                            agent="WebSearchAgent",
                            query="test",
                        )
                    ]
                )
            )
            mock_judge_handler.assess = AsyncMock(
                return_value=create_judge_assessment(
                    sufficient=True,
                    confidence=0.9,
                    recommendation="synthesize",
                    reasoning="Good evidence has been collected to answer the query.",
                )
            )
            mock_agents["writer"].write_report = AsyncMock(return_value="# Report\n\nDone.")

            result = await flow_with_judge.run("Test query")

            # Verify judge was called with evidence
            assert mock_judge_handler.assess.called
            call_args = mock_judge_handler.assess.call_args
            assert call_args[0][0] == "Test query"  # query
            assert len(call_args[0][1]) >= 0  # evidence list
            assert isinstance(result, str)

    async def test_token_tracking_for_judge_call(
        self, flow_with_judge, mock_agents, mock_judge_handler
    ):
        """Token tracking should work for judge calls."""
        mock_agents["knowledge_gap"].evaluate = AsyncMock(
            return_value=KnowledgeGapOutput(
                research_complete=False,
                outstanding_gaps=["Need info"],
            )
        )
        mock_agents["thinking"].generate_observations = AsyncMock(return_value="Thoughts")
        mock_agents["tool_selector"].select_tools = AsyncMock(
            return_value=AgentSelectionPlan(
                tasks=[
                    AgentTask(
                        gap="Need info",
                        agent="WebSearchAgent",
                        query="test",
                    )
                ]
            )
        )
        mock_judge_handler.assess = AsyncMock(
            return_value=create_judge_assessment(
                sufficient=True,
                confidence=0.9,
                recommendation="synthesize",
                reasoning="Evidence is sufficient to provide a comprehensive answer.",
            )
        )
        mock_agents["writer"].write_report = AsyncMock(return_value="# Report\n\nDone.")

        await flow_with_judge.run("Test query")

        # Check that tokens were tracked for the iteration
        iteration_tokens = flow_with_judge.budget_tracker.get_iteration_tokens(
            flow_with_judge.loop_id, 1
        )
        # Should have tracked tokens (may be 0 if estimation is off, but method should work)
        assert isinstance(iteration_tokens, int)
        assert iteration_tokens >= 0