AUXteam committed on
Commit fc10d08 · verified · 1 Parent(s): 7a38201

Upload folder using huggingface_hub

.github/workflows/check_file_size.yml ADDED
@@ -0,0 +1,14 @@
1
+ name: Check file size
2
+ on:
3
+ pull_request:
4
+ branches: [main]
5
+ workflow_dispatch:
6
+
7
+ jobs:
8
+ check-size:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - name: Check large files
12
+ uses: ActionsDesk/lfs-warning@v2.0
13
+ with:
14
+ filesizelimit: 10485760 # 10MB
.github/workflows/sync_to_hf.yml ADDED
@@ -0,0 +1,18 @@
1
+ name: Sync to Hugging Face hub
2
+ on:
3
+ push:
4
+ branches: [main]
5
+ workflow_dispatch:
6
+
7
+ jobs:
8
+ sync-to-hub:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v3
12
+ with:
13
+ fetch-depth: 0
14
+ lfs: true
15
+ - name: Push to hub
16
+ env:
17
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
18
+ run: git push --force https://AUXteam:${HF_TOKEN}@huggingface.co/spaces/AUXteam/Critical_Code_Agent main
.gitignore ADDED
@@ -0,0 +1,174 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ .venv_jax
128
+ env/
129
+ venv/
130
+ ENV/
131
+ env.bak/
132
+ venv.bak/
133
+
134
+ # Spyder project settings
135
+ .spyderproject
136
+ .spyproject
137
+
138
+ # Rope project settings
139
+ .ropeproject
140
+
141
+ # mkdocs documentation
142
+ /site
143
+
144
+ # mypy
145
+ .mypy_cache/
146
+ .dmypy.json
147
+ dmypy.json
148
+
149
+ # Pyre type checker
150
+ .pyre/
151
+
152
+ # pytype static type analyzer
153
+ .pytype/
154
+
155
+ # Cython debug symbols
156
+ cython_debug/
157
+
158
+ # PyCharm
159
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
160
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
161
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
162
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
163
+ .idea/
164
+ .aider*
165
+ *.DS_Store
166
+
167
+ # Misc folders
168
+ data/
169
+ *ckpt.pt
170
+ *.zip
171
+ ICLR2022-OpenReviewData/
172
+ templates/*/run_0/
173
+ templates/*/*.png
174
+ results/*
.hfignore ADDED
@@ -0,0 +1,16 @@
1
+ ai_scientist/
2
+ data/
3
+ docs/
4
+ example_papers/
5
+ review_ai_scientist/
6
+ review_iclr_bench/
7
+ templates/
8
+ .git/
9
+ NPEET/
10
+ __pycache__/
11
+ *.pyc
12
+ *.pyo
13
+ *.pyd
14
+ .pytest_cache/
15
+ .coverage
16
+ htmlcov/
CriticalThinking/PROJECT_STRUCTURE.md ADDED
@@ -0,0 +1,55 @@
1
+ # CriticalThinking Project Structure
2
+
3
+ ## 1. Project Description
4
+ **CriticalThinking** is an autonomous agent system that performs deep architectural analysis of software projects. Inspired by `deep-thinking-agent` and `AI-Scientist`, it moves beyond simple linting to identify high-level design weaknesses, suggest structural improvements, and recommend state-of-the-art replacements (e.g., Hugging Face models) for custom implementations.
5
+
6
+ ### In-app Integrations
7
+ - **Git-based Indexer**: Automatically clones and indexes codebases.
8
+ - **Deep-Thinking Orchestrator**: Uses iterative planning and reasoning (Planner -> Retriever -> Analyzer).
9
+ - **Hypothesis Generator**: Generates "Improvement Hypotheses" and validates them against the code context.
10
+ - **Hugging Face Hub**: Searches for replacement components.
11
+ - **Web Researcher**: Uses specialized MCP servers (harvesthealth/github-mcp-server) and Gradio clients to search GitHub and Hugging Face Spaces for community-driven solutions.
12
+
13
+ ### Proposed FastAPI Setup
14
+ - **App structure**:
15
+ - `main.py`: App entry point.
16
+ - `api/router.py`: API routes.
17
+ - `services/`: Business logic and agent orchestration.
18
+ - `tests/`: Automated tests.
19
+ - **Dependency injection**: Services are initialized per-task with appropriate configuration (LLM model, API keys).
20
+
21
+ ## 2. Tasks and Tests
22
+ ### Backend & Infrastructure
23
+ - **Task: Project Scaffolding**
24
+ - *Test*: `tests/test_main.py` -> Verifies health check.
25
+ - **Task: Codebase Indexing Service**
26
+ - *Test*: `tests/test_indexer.py` -> Verifies cloning, chunking, and search.
27
+
28
+ ### Agent Logic
29
+ - **Task: Multi-Agent Orchestration**
30
+ - *Test*: `tests/test_orchestrator.py` -> Verifies planning and analysis loop.
31
+ - **Task: Improvement & Replacement Logic**
32
+ - *Test*: `tests/test_improvements.py` -> Verifies roadmap generation and HF matching.
33
+
34
+ ### API & End-to-End
35
+ - **Task: API Exposure & Background Jobs**
36
+ - *Test*: `tests/test_api.py` -> Verifies the full /analyze -> /report flow.
37
+
38
+ ## 3. Functionality Expectations
39
+ ### User Perspective
40
+ - Submit a repository URL.
41
+ - Receive a "Critical Thinking Report".
42
+ - View "Critical Weaknesses" and an "Improvement Roadmap".
43
+ - See "Suggested Replacements" (libraries/models) for custom code.
44
+
45
+ ### Technical Perspective
46
+ - **Iterative Reasoning**: The agent doesn't just look at code once; it plans its investigation and refines its findings.
47
+ - **Schema-Aware RAG**: Uses structural context to find relevant code snippets.
48
+ - **External Knowledge**: Connects to Hugging Face Hub for modernization suggestions.
49
+
50
+ ## 4. API Endpoints
51
+ - **POST /analyze**
52
+ - Request: `{"repo_url": "string", "project_description": "string"}`
53
+ - Response: `{"task_id": "uuid"}`
54
+ - **GET /report/{task_id}**
55
+ - Response: `{"status": "completed", "report": {"weaknesses": [], "improvements": []}}`
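
*Editor's sketch:* a minimal client for the two endpoints described above. It assumes the service is reachable on localhost port 7860 (the port exposed by the Dockerfile later in this commit); the repository URL is a placeholder.

```python
import time
import requests

BASE_URL = "http://localhost:7860"  # assumption: adjust to wherever the app is served

# Kick off an analysis task
resp = requests.post(
    f"{BASE_URL}/analyze",
    json={"repo_url": "https://github.com/user/repo", "project_description": "Example project"},
)
resp.raise_for_status()
task_id = resp.json()["task_id"]

# Poll for the report until the background task finishes
while True:
    report = requests.get(f"{BASE_URL}/report/{task_id}").json()
    if report["status"] in ("completed", "failed"):
        break
    time.sleep(2)

print(report)
```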
CriticalThinking/app/api/router.py ADDED
@@ -0,0 +1,80 @@
1
+ from fastapi import APIRouter, BackgroundTasks, HTTPException
2
+ from pydantic import BaseModel
3
+ from typing import List, Dict, Any, Optional
4
+ import uuid
5
+ import os
6
+ from app.services.indexer import CodeIndexer
7
+ from app.services.agent_orchestrator import AgentOrchestrator
8
+ from app.services.improvement_agent import ImprovementAgent
9
+ from app.services.hf_matcher import HFMatcher
10
+ from app.services.web_researcher import WebResearcher
11
+
12
+ router = APIRouter()
13
+
14
+ # In-memory task store
15
+ tasks: Dict[str, Any] = {}
16
+
17
+ class AnalyzeRequest(BaseModel):
18
+ repo_url: str
19
+ project_description: Optional[str] = "Generic software project"
20
+
21
+ class AnalyzeResponse(BaseModel):
22
+ task_id: str
23
+
24
+ def run_analysis_task(task_id: str, repo_url: str, project_description: str):  # plain def: FastAPI runs it in a worker thread, so the blocking git/OpenAI calls below do not stall the event loop
25
+ tasks[task_id]["status"] = "processing"
26
+ try:
27
+ api_key = os.getenv("OPENAI_API_KEY", "dummy")
28
+ # Initialize services
29
+ indexer = CodeIndexer(qdrant_url=":memory:", openai_api_key=api_key)
30
+ orchestrator = AgentOrchestrator(indexer=indexer, openai_api_key=api_key)
31
+ improver = ImprovementAgent(openai_api_key=api_key)
32
+ matcher = HFMatcher()
33
+ web_researcher = WebResearcher()
34
+
35
+ # 1. Index
36
+ indexer.index_repository(repo_url)
37
+
38
+ # 2. Analyze
39
+ analysis_results = orchestrator.run_analysis(project_description)
40
+ weaknesses = analysis_results.get("weaknesses", [])
41
+
42
+ # 3. Improvements
43
+ improvements_results = improver.generate_improvements(weaknesses)
44
+ improvements = improvements_results.get("improvements", [])
45
+
46
+ # 4. Replacement matching and Web Research
47
+ for imp in improvements:
48
+ query = imp.get("replacement_search_query")
49
+ if query:
50
+ # Direct HF search
51
+ replacements = matcher.find_replacements(query)
52
+ imp["suggested_replacements"] = replacements
53
+
54
+ # Web research for GitHub and HF Spaces
55
+ imp["github_research"] = web_researcher.research_github(query)
56
+ imp["hf_spaces_research"] = web_researcher.research_hf_spaces(query)
57
+
58
+ # 5. Store report
59
+ tasks[task_id]["status"] = "completed"
60
+ tasks[task_id]["report"] = {
61
+ "project": project_description,
62
+ "weaknesses": weaknesses,
63
+ "improvements": improvements
64
+ }
65
+ except Exception as e:
66
+ tasks[task_id]["status"] = "failed"
67
+ tasks[task_id]["error"] = str(e)
68
+
69
+ @router.post("/analyze", response_model=AnalyzeResponse)
70
+ async def analyze(request: AnalyzeRequest, background_tasks: BackgroundTasks):
71
+ task_id = str(uuid.uuid4())
72
+ tasks[task_id] = {"status": "pending", "report": None}
73
+ background_tasks.add_task(run_analysis_task, task_id, request.repo_url, request.project_description)
74
+ return AnalyzeResponse(task_id=task_id)
75
+
76
+ @router.get("/report/{task_id}")
77
+ async def get_report(task_id: str):
78
+ if task_id not in tasks:
79
+ raise HTTPException(status_code=404, detail="Task not found")
80
+ return tasks[task_id]
CriticalThinking/app/main.py ADDED
@@ -0,0 +1,10 @@
1
+ from fastapi import FastAPI
2
+ from app.api.router import router
3
+
4
+ app = FastAPI(title="CriticalThinking API")
5
+
6
+ app.include_router(router)
7
+
8
+ @app.get("/health")
9
+ async def health():
10
+ return {"status": "healthy"}
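
*Editor's sketch:* `uvicorn` is already listed in `CriticalThinking/requirements.txt`, so the app can be served locally with a small launcher. This helper is hypothetical (not part of the commit) and assumes it is run from the `CriticalThinking/` directory so the `app` package is importable; port 7860 is chosen only to match the Dockerfile.

```python
# run_dev.py -- hypothetical local launcher, not part of the commit
import uvicorn

if __name__ == "__main__":
    # Serve app.main:app with auto-reload for development.
    uvicorn.run("app.main:app", host="0.0.0.0", port=7860, reload=True)
```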
CriticalThinking/app/services/agent_orchestrator.py ADDED
@@ -0,0 +1,100 @@
1
+ import json
2
+ from typing import List, Dict, Any
3
+ from openai import OpenAI
4
+ import os
5
+
6
+ class BaseAgent:
7
+ def __init__(self, model: str = "gpt-4o", openai_api_key: str = None):
8
+ self.client = OpenAI(api_key=openai_api_key)
9
+ self.model = model
10
+
11
+ def _get_response(self, system_prompt: str, user_prompt: str, response_format=None) -> Any:
12
+ # For testing without an API key (or with a dummy key), fall back to a mock response
13
+ api_key = self.client.api_key or os.getenv("OPENAI_API_KEY")
14
+ if not api_key or api_key == "dummy":
15
+ return self._mock_response(system_prompt)
16
+
17
+ args = {
18
+ "model": self.model,
19
+ "messages": [
20
+ {"role": "system", "content": system_prompt},
21
+ {"role": "user", "content": user_prompt}
22
+ ],
23
+ "temperature": 0.2
24
+ }
25
+ if response_format:
26
+ args["response_format"] = response_format
27
+
28
+ response = self.client.chat.completions.create(**args)
29
+ content = response.choices[0].message.content
30
+
31
+ if response_format and response_format.get("type") == "json_object":
32
+ return json.loads(content)
33
+ return content
34
+
35
+ def _mock_response(self, system_prompt: str) -> Any:
36
+ if "planner" in system_prompt.lower():
37
+ return {
38
+ "steps": [
39
+ {"index": 0, "sub_question": "Analyze project structure", "tool_type": "doc_search"},
40
+ {"index": 1, "sub_question": "Identify core logic flaws", "tool_type": "doc_search"}
41
+ ],
42
+ "reasoning": "Standard analysis flow"
43
+ }
44
+ elif "weakness" in system_prompt.lower():
45
+ return {
46
+ "summary": "The code has several architectural issues.",
47
+ "weaknesses": ["Manual memory management in Python", "Lack of unit tests"],
48
+ "severity": "high"
49
+ }
50
+ return "Mocked response"
51
+
52
+ class Planner(BaseAgent):
53
+ def plan(self, project_overview: str) -> Dict[str, Any]:
54
+ system_prompt = """You are an expert query planner for a deep-thinking codebase analysis system.
55
+ Your task is to decompose complex codebase investigations into sequential execution plans.
56
+ Guidelines:
57
+ - Create 2-5 steps that build on each other.
58
+ - Each step should have a clear sub-question targeting a specific architectural or logic component.
59
+ - Specify tool_type: doc_search (for code retrieval)."""
60
+ user_prompt = f"Decompose the following project overview into a sequential execution plan:\n\nProject Overview: {project_overview}\n\nRespond with valid JSON in this EXACT format:\n{{\n 'steps': [\n {{\n 'index': 0,\n 'sub_question': 'What specific architectural component needs analysis?',\n 'tool_type': 'doc_search',\n 'expected_outputs': ['finding 1', 'finding 2']\n }}\n ],\n 'reasoning': 'Explain why this plan will effectively find weaknesses.'\n}}"
61
+ return self._get_response(system_prompt, user_prompt, response_format={"type": "json_object"})
62
+
63
+ class WeaknessAnalyzer(BaseAgent):
64
+ def analyze(self, code_context: str) -> Dict[str, Any]:
65
+ system_prompt = """You are an AI senior engineer reviewing a project for critical weaknesses.
66
+ Be critical and cautious. Focus on:
67
+ - Architectural flaws (circular dependencies, lack of modularity).
68
+ - Security risks.
69
+ - Performance bottlenecks.
70
+ - Redundant custom logic that could be replaced by standard libraries or models."""
71
+ user_prompt = f"Analyze the following code snippets for weaknesses:\n\n{code_context}\n\nRespond in JSON format with fields: 'summary', 'weaknesses' (list of strings), 'severity' (high/medium/low)."
72
+ return self._get_response(system_prompt, user_prompt, response_format={"type": "json_object"})
73
+
74
+ class AgentOrchestrator:
75
+ def __init__(self, indexer: Any, openai_api_key: str = None):
76
+ self.indexer = indexer
77
+ self.planner = Planner(openai_api_key=openai_api_key)
78
+ self.analyzer = WeaknessAnalyzer(openai_api_key=openai_api_key)
79
+
80
+ def run_analysis(self, project_overview: str) -> Dict[str, Any]:
81
+ # 1. Plan
82
+ plan = self.planner.plan(project_overview)
83
+
84
+ all_weaknesses = []
85
+ # 2. Execute steps
86
+ for step in plan.get("steps", []):
87
+ sub_q = step.get("sub_question")
88
+ # Search codebase
89
+ results = self.indexer.search(sub_q, limit=3)
90
+ context = "\n---\n".join([r.get("text", "") for r in results])
91
+
92
+ # Analyze
93
+ analysis = self.analyzer.analyze(context)
94
+ all_weaknesses.extend(analysis.get("weaknesses", []))
95
+
96
+ return {
97
+ "plan": plan,
98
+ "weaknesses": list(set(all_weaknesses)),
99
+ "status": "completed"
100
+ }
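
*Editor's sketch:* to illustrate the plan → retrieve → analyze loop above, here is an offline usage example. With a dummy API key, `_get_response` falls back to `_mock_response`, so no OpenAI calls are made; `StubIndexer` is a hypothetical stand-in for `CodeIndexer`.

```python
from app.services.agent_orchestrator import AgentOrchestrator

class StubIndexer:
    """Hypothetical stand-in for CodeIndexer; returns canned search hits."""
    def search(self, query, limit=3):
        return [{"text": "def process(data):\n    return data"}]

orchestrator = AgentOrchestrator(indexer=StubIndexer(), openai_api_key="dummy")
result = orchestrator.run_analysis("A small data-processing service")

print(result["plan"]["reasoning"])   # mock planner output
print(result["weaknesses"])          # deduplicated weaknesses from the analyzer
```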
CriticalThinking/app/services/hf_matcher.py ADDED
@@ -0,0 +1,27 @@
1
+ from huggingface_hub import HfApi
2
+ from typing import List, Dict, Any
3
+
4
+ class HFMatcher:
5
+ def __init__(self):
6
+ self.api = HfApi()
7
+
8
+ def find_replacements(self, description: str, limit: int = 3) -> List[Dict[str, Any]]:
9
+ try:
10
+ models = self.api.list_models(
11
+ search=description,
12
+ sort="downloads",
13
+ direction=-1,
14
+ limit=limit
15
+ )
16
+ results = []
17
+ for model in models:
18
+ results.append({
19
+ "id": model.id,
20
+ "downloads": model.downloads,
21
+ "likes": model.likes,
22
+ "url": f"https://huggingface.co/{model.id}"
23
+ })
24
+ return results
25
+ except Exception as e:
26
+ print(f"HF search failed: {e}")
27
+ return []
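
*Editor's sketch:* a brief usage example for the matcher. `find_replacements` issues a live query against the Hugging Face Hub, so it needs network access; no token is required for public model search.

```python
from app.services.hf_matcher import HFMatcher

matcher = HFMatcher()
for model in matcher.find_replacements("sentiment analysis", limit=3):
    # Each entry carries the model id, download/like counts, and a Hub URL.
    print(model["id"], model["downloads"], model["url"])
```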
CriticalThinking/app/services/improvement_agent.py ADDED
@@ -0,0 +1,29 @@
1
+ from typing import List, Dict, Any
2
+ from app.services.agent_orchestrator import BaseAgent
3
+
4
+ class ImprovementAgent(BaseAgent):
5
+ def generate_improvements(self, weaknesses: List[str]) -> Dict[str, Any]:
6
+ system_prompt = """You are an AI research scientist and senior architect.
7
+ Your goal is to generate impactful and creative ideas for improving a codebase.
8
+ Consider:
9
+ - Refactoring for better scalability.
10
+ - Replacing custom implementations with state-of-the-art Hugging Face models or open-source projects.
11
+ - Improving performance and maintainability."""
12
+ user_prompt = f"Given these weaknesses:\n{weaknesses}\n\nPropose a next-step improvement roadmap. Respond in JSON with format:\n{{\n 'improvements': [\n {{\n 'weakness': 'the identified weakness',\n 'proposal': 'detailed improvement plan',\n 'replacement_search_query': 'query for Hugging Face or GitHub',\n 'interestingness': 1-10,\n 'feasibility': 1-10\n }}\n ]\n}}"
13
+ return self._get_response(system_prompt, user_prompt, response_format={"type": "json_object"})
14
+
15
+ def _mock_response(self, system_prompt: str) -> Any:
16
+ return {
17
+ "improvements": [
18
+ {
19
+ "weakness": "Manual memory management",
20
+ "proposal": "Use a managed library",
21
+ "replacement_search_query": "memory management library"
22
+ },
23
+ {
24
+ "weakness": "Lack of sentiment analysis accuracy",
25
+ "proposal": "Use a pre-trained transformer model",
26
+ "replacement_search_query": "sentiment analysis"
27
+ }
28
+ ]
29
+ }
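
*Editor's sketch:* an offline usage example. With a dummy key the agent returns its `_mock_response` payload, which has the same shape the router later enriches with replacement and web-research results.

```python
from app.services.improvement_agent import ImprovementAgent

agent = ImprovementAgent(openai_api_key="dummy")
roadmap = agent.generate_improvements(["No caching layer", "Hand-rolled sentiment scoring"])

for item in roadmap["improvements"]:
    # Each improvement carries a search query used downstream for HF/GitHub matching.
    print(item["weakness"], "->", item["replacement_search_query"])
```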
CriticalThinking/app/services/indexer.py ADDED
@@ -0,0 +1,120 @@
1
+ import os
2
+ import shutil
3
+ import tempfile
4
+ from typing import List, Dict, Any
5
+ import uuid
6
+ from qdrant_client import QdrantClient
7
+ from qdrant_client.http import models
8
+ from openai import OpenAI
9
+
10
+ class CodeIndexer:
11
+ def __init__(self, qdrant_url: str = ":memory:", openai_api_key: str = None):
12
+ self.qdrant = QdrantClient(qdrant_url)
13
+ self.openai = OpenAI(api_key=openai_api_key)
14
+ self.collection_name = "codebase"
15
+ self._ensure_collection()
16
+
17
+ def _ensure_collection(self):
18
+ collections = self.qdrant.get_collections().collections
19
+ exists = any(c.name == self.collection_name for c in collections)
20
+ if not exists:
21
+ self.qdrant.create_collection(
22
+ collection_name=self.collection_name,
23
+ vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
24
+ )
25
+
26
+ def index_repository(self, repo_url: str):
27
+ import subprocess
28
+ temp_dir = tempfile.mkdtemp()
29
+ try:
30
+ print(f"Cloning {repo_url} into {temp_dir}...")
31
+ if repo_url.startswith("local://"):
32
+ local_path = repo_url.replace("local://", "")
33
+ shutil.copytree(local_path, temp_dir, dirs_exist_ok=True)
34
+ else:
35
+ result = subprocess.run(["git", "clone", "--depth", "1", repo_url, temp_dir], capture_output=True, text=True)
36
+ if result.returncode != 0:
37
+ raise Exception(f"Git clone failed: {result.stderr}")
38
+
39
+ self._index_directory(temp_dir)
40
+ finally:
41
+ shutil.rmtree(temp_dir)
42
+
43
+ def _index_directory(self, root_dir: str):
44
+ points = []
45
+ for root, dirs, files in os.walk(root_dir):
46
+ if ".git" in root:
47
+ continue
48
+ for file in files:
49
+ if file.endswith((".py", ".go", ".js", ".ts", ".md")):
50
+ file_path = os.path.join(root, file)
51
+ relative_path = os.path.relpath(file_path, root_dir)
52
+ with open(file_path, "r", errors="ignore") as f:
53
+ content = f.read()
54
+
55
+ chunks = self._chunk_code(content)
56
+ for i, chunk in enumerate(chunks):
57
+ embedding = self._get_embedding(chunk)
58
+ points.append(models.PointStruct(
59
+ id=str(uuid.uuid4()),
60
+ vector=embedding,
61
+ payload={
62
+ "path": relative_path,
63
+ "chunk_index": i,
64
+ "text": chunk
65
+ }
66
+ ))
67
+
68
+ if points:
69
+ self.qdrant.upsert(
70
+ collection_name=self.collection_name,
71
+ points=points
72
+ )
73
+
74
+ def _chunk_code(self, content: str, max_chars: int = 1500) -> List[str]:
75
+ # Simple chunking by lines for now, ensuring we don't break in the middle of a line
76
+ chunks = []
77
+ lines = content.split("\n")
78
+ current_chunk = []
79
+ current_length = 0
80
+ for line in lines:
81
+ if current_length + len(line) > max_chars and current_chunk:
82
+ chunks.append("\n".join(current_chunk))
83
+ current_chunk = []
84
+ current_length = 0
85
+ current_chunk.append(line)
86
+ current_length += len(line) + 1
87
+ if current_chunk:
88
+ chunks.append("\n".join(current_chunk))
89
+ return chunks
90
+
91
+ def _get_embedding(self, text: str) -> List[float]:
92
+ # Mock embedding if API key is missing or dummy for tests
93
+ api_key = self.openai.api_key or os.getenv("OPENAI_API_KEY")
94
+ if not api_key or api_key == "dummy":
95
+ return [0.0] * 1536
96
+
97
+ response = self.openai.embeddings.create(
98
+ input=text,
99
+ model="text-embedding-3-small"
100
+ )
101
+ return response.data[0].embedding
102
+
103
+ def search(self, query: str, limit: int = 5) -> List[Dict[str, Any]]:
104
+ query_vector = self._get_embedding(query)
105
+ try:
106
+ # Try the modern query_points API
107
+ response = self.qdrant.query_points(
108
+ collection_name=self.collection_name,
109
+ query=query_vector,
110
+ limit=limit
111
+ )
112
+ return [hit.payload for hit in response.points]
113
+ except AttributeError:
114
+ # Fallback for older versions if search exists
115
+ hits = self.qdrant.search(
116
+ collection_name=self.collection_name,
117
+ query_vector=query_vector,
118
+ limit=limit
119
+ )
120
+ return [hit.payload for hit in hits]
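
*Editor's sketch:* a usage example for the indexer, using the `local://` shortcut handled by `index_repository` and an in-memory Qdrant collection. With a dummy key, `_get_embedding` returns zero vectors, so documents are stored but similarity ranking is not meaningful; set a real `OPENAI_API_KEY` for semantic search.

```python
from app.services.indexer import CodeIndexer

# In-memory vector store; the dummy key triggers the zero-vector embedding fallback.
indexer = CodeIndexer(qdrant_url=":memory:", openai_api_key="dummy")

# "local://." indexes the current working directory instead of cloning a remote repo.
indexer.index_repository("local://.")

for hit in indexer.search("database connection handling", limit=3):
    print(hit["path"], hit["chunk_index"])
```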
CriticalThinking/app/services/web_researcher.py ADDED
@@ -0,0 +1,115 @@
1
+ from gradio_client import Client
2
+ import os
3
+ from typing import List, Dict, Any
4
+
5
+ import requests
6
+ import uuid
7
+
8
+ class WebResearcher:
9
+ def __init__(self):
10
+ self.web_search_space = "victor/websearch"
11
+ self.hf_search_space = "John6666/testwarm"
12
+ self.github_mcp_url = "https://harvesthealth-github-mcp-server.hf.space/"
13
+ self._web_client = None
14
+ self._hf_client = None
15
+
16
+ @property
17
+ def web_client(self):
18
+ if self._web_client is None:
19
+ try:
20
+ self._web_client = Client(self.web_search_space)
21
+ except Exception as e:
22
+ print(f"Failed to connect to Gradio Client {self.web_search_space}: {e}")
23
+ return self._web_client
24
+
25
+ @property
26
+ def hf_client(self):
27
+ if self._hf_client is None:
28
+ try:
29
+ self._hf_client = Client(self.hf_search_space)
30
+ except Exception as e:
31
+ print(f"Failed to connect to Gradio Client {self.hf_search_space}: {e}")
32
+ return self._hf_client
33
+
34
+ def search_web(self, query: str, search_type: str = "search", num_results: int = 4) -> str:
35
+ if self.web_client is None:
36
+ return "Web search unavailable."
37
+ try:
38
+ return self.web_client.predict(
39
+ query=query,
40
+ search_type=search_type,
41
+ num_results=num_results,
42
+ api_name="/search_web"
43
+ )
44
+ except Exception as e:
45
+ return f"Web search failed: {e}"
46
+
47
+ def search_hf(self, query: str, repo_types: List[str] = ["model", "space"], limit: int = 5) -> str:
48
+ if self.hf_client is None:
49
+ return "HF search unavailable."
50
+ try:
51
+ result = self.hf_client.predict(
52
+ repo_types=repo_types,
53
+ sort="trending_score",
54
+ sort_method="descending order",
55
+ filter_str="",
56
+ search_str=query,
57
+ author="",
58
+ tags="",
59
+ infer="all",
60
+ gated="all",
61
+ appr=["auto", "manual"],
62
+ size_categories=[],
63
+ limit=limit,
64
+ hardware=[],
65
+ stage=[],
66
+ fetch_detail=["Space Runtime"],
67
+ show_labels=["Type", "ID", "Likes", "DLs"],
68
+ api_name="/search"
69
+ )
70
+ # result[0] is a Dict with headers and data
71
+ if isinstance(result, tuple) and len(result) > 0:
72
+ data = result[0].get("data", [])
73
+ return f"Found HF components: {data}"
74
+ return str(result)
75
+ except Exception as e:
76
+ return f"HF search failed: {e}"
77
+
78
+ def research_github(self, topic: str) -> str:
79
+ # Try specialized GitHub MCP search first
80
+ try:
81
+ mcp_result = self.search_github_mcp(topic)
82
+ if "failed" not in mcp_result.lower() and "unavailable" not in mcp_result.lower():
83
+ return mcp_result
84
+ except Exception as e:
85
+ print(f"GitHub MCP search failed, falling back to web search: {e}")
86
+
87
+ # Fallback to web search
88
+ query = f"site:github.com {topic} repository"
89
+ return self.search_web(query)
90
+
91
+ def search_github_mcp(self, query: str) -> str:
92
+ payload = {
93
+ "jsonrpc": "2.0",
94
+ "id": str(uuid.uuid4()),
95
+ "method": "tools/call",
96
+ "params": {
97
+ "name": "search_repositories",
98
+ "arguments": {
99
+ "query": query
100
+ }
101
+ }
102
+ }
103
+ try:
104
+ response = requests.post(self.github_mcp_url, json=payload, timeout=30)
105
+ response.raise_for_status()
106
+ result = response.json()
107
+ if "result" in result:
108
+ return str(result["result"])
109
+ return str(result)
110
+ except Exception as e:
111
+ return f"GitHub MCP search failed: {e}"
112
+
113
+ def research_hf_spaces(self, topic: str) -> str:
114
+ # Use deep HF search for better results
115
+ return self.search_hf(topic, repo_types=["space"])
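
*Editor's sketch:* a usage example for the researcher. These calls reach out to the public Hugging Face Spaces and the MCP endpoint hard-coded above, so they need network access and degrade to error strings when those services are unreachable.

```python
from app.services.web_researcher import WebResearcher

researcher = WebResearcher()

# GitHub via the MCP server, falling back to a site:github.com web search.
print(researcher.research_github("code embedding model"))

# Trending Spaces matching the query, via the configured HF search Space.
print(researcher.research_hf_spaces("code review assistant"))
```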
CriticalThinking/requirements.txt ADDED
@@ -0,0 +1,12 @@
1
+ fastapi
2
+ uvicorn
3
+ openai
4
+ pydantic
5
+ pydantic-settings
6
+ qdrant-client
7
+ huggingface_hub
8
+ python-multipart
9
+ pytest
10
+ httpx
11
+ gradio_client
12
+ requests
CriticalThinking/tests/test_api.py ADDED
@@ -0,0 +1,34 @@
1
+ from fastapi.testclient import TestClient
2
+ from app.main import app
3
+ import time
4
+ from unittest.mock import patch, MagicMock
5
+
6
+ client = TestClient(app)
7
+
8
+ @patch("app.api.router.WebResearcher")
9
+ def test_analyze_flow(mock_web_researcher_class):
10
+ # Mock WebResearcher
11
+ mock_web_researcher = MagicMock()
12
+ mock_web_researcher_class.return_value = mock_web_researcher
13
+ mock_web_researcher.research_github.return_value = "Mocked GitHub results"
14
+ mock_web_researcher.research_hf_spaces.return_value = "Mocked HF Spaces results"
15
+
16
+ # Submit analysis
17
+ response = client.post("/analyze", json={"repo_url": "local://.", "project_description": "Test Project"})
18
+ assert response.status_code == 200
19
+ task_id = response.json()["task_id"]
20
+ assert task_id
21
+
22
+ # Wait a bit for background task
23
+ time.sleep(1)
24
+
25
+ response = client.get(f"/report/{task_id}")
26
+ assert response.status_code == 200
27
+ data = response.json()
28
+ print(f"Task status: {data['status']}")
29
+ if data['status'] == 'failed':
30
+ print(f"Error: {data.get('error')}")
31
+
32
+ assert data["status"] == "completed"
33
+ assert "github_research" in data["report"]["improvements"][0]
34
+ assert "hf_spaces_research" in data["report"]["improvements"][0]
CriticalThinking/tests/test_improvements.py ADDED
@@ -0,0 +1,24 @@
1
+ from app.services.hf_matcher import HFMatcher
2
+ from app.services.improvement_agent import ImprovementAgent
3
+ from unittest.mock import MagicMock
4
+
5
+ def test_hf_matcher():
6
+ matcher = HFMatcher()
7
+ # Mocking HFApi.list_models
8
+ matcher.api.list_models = MagicMock()
9
+ mock_model = MagicMock()
10
+ mock_model.id = "test/model"
11
+ mock_model.downloads = 100
12
+ mock_model.likes = 10
13
+ matcher.api.list_models.return_value = [mock_model]
14
+
15
+ results = matcher.find_replacements("sentiment analysis")
16
+ assert len(results) == 1
17
+ assert results[0]["id"] == "test/model"
18
+
19
+ def test_improvement_agent():
20
+ agent = ImprovementAgent(openai_api_key="dummy")
21
+ result = agent.generate_improvements(["Weakness 1"])
22
+ assert "improvements" in result
23
+ assert len(result["improvements"]) > 0
24
+ assert "replacement_search_query" in result["improvements"][0]
CriticalThinking/tests/test_indexer.py ADDED
@@ -0,0 +1,24 @@
1
+ import pytest
2
+ from app.services.indexer import CodeIndexer
3
+ import os
4
+
5
+ def test_indexer_basic():
6
+ # Use in-memory Qdrant and dummy API key
7
+ indexer = CodeIndexer(qdrant_url=":memory:", openai_api_key="dummy-key")
8
+
9
+ # Create a dummy repo
10
+ repo_content = "def add(a, b):\n return a + b\n\n# This is a comment\ndef sub(a, b):\n return a - b\n"
11
+
12
+ # Test _chunk_code
13
+ chunks = indexer._chunk_code(repo_content, max_chars=40)
14
+ assert len(chunks) > 1
15
+
16
+ # Test _index_directory (will use mock embedding because of dummy key and our logic)
17
+ # We need to make sure _get_embedding handles the dummy key
18
+ # Actually, our logic in _get_embedding checks for API key existence.
19
+ # Let's override _get_embedding for the test to be safe.
20
+ indexer._get_embedding = lambda x: [0.1] * 1536
21
+
22
+ indexer._index_directory("app") # Index some local files
23
+ results = indexer.search("health", limit=1)
24
+ assert isinstance(results, list)
CriticalThinking/tests/test_main.py ADDED
@@ -0,0 +1,9 @@
1
+ from fastapi.testclient import TestClient
2
+ from app.main import app
3
+
4
+ client = TestClient(app)
5
+
6
+ def test_health():
7
+ response = client.get("/health")
8
+ assert response.status_code == 200
9
+ assert response.json() == {"status": "healthy"}
CriticalThinking/tests/test_orchestrator.py ADDED
@@ -0,0 +1,18 @@
1
+ import pytest
2
+ from app.services.agent_orchestrator import AgentOrchestrator
3
+ from unittest.mock import MagicMock
4
+
5
+ def test_orchestrator_flow():
6
+ mock_indexer = MagicMock()
7
+ mock_indexer.search.return_value = [{"text": "print('hello')"}]
8
+
9
+ orchestrator = AgentOrchestrator(indexer=mock_indexer, openai_api_key="dummy")
10
+
11
+ # Run analysis (will use mock responses from BaseAgent._mock_response)
12
+ result = orchestrator.run_analysis("A simple python script")
13
+
14
+ assert "plan" in result
15
+ assert "weaknesses" in result
16
+ assert result["status"] == "completed"
17
+ assert len(result["weaknesses"]) > 0
18
+ assert mock_indexer.search.called
CriticalThinking/tests/test_web_researcher.py ADDED
@@ -0,0 +1,40 @@
1
+ import pytest
2
+ from app.services.web_researcher import WebResearcher
3
+ from unittest.mock import MagicMock, patch
4
+
5
+ @patch("requests.post")
6
+ def test_web_researcher_github_mcp(mock_post):
7
+ researcher = WebResearcher()
8
+ mock_response = MagicMock()
9
+ mock_response.json.return_value = {"result": "MCP GitHub Result"}
10
+ mock_response.status_code = 200
11
+ mock_post.return_value = mock_response
12
+
13
+ result = researcher.research_github("sentiment analysis")
14
+ assert "MCP GitHub Result" in result
15
+ mock_post.assert_called_once()
16
+
17
+ def test_web_researcher_github_fallback():
18
+ researcher = WebResearcher()
19
+ # Mock search_github_mcp to fail
20
+ researcher.search_github_mcp = MagicMock(return_value="GitHub MCP search failed")
21
+
22
+ with patch("app.services.web_researcher.Client") as mock_client_class:
23
+ mock_instance = MagicMock()
24
+ mock_client_class.return_value = mock_instance
25
+ mock_instance.predict.return_value = "Fallback Web Result"
26
+
27
+ result = researcher.research_github("sentiment analysis")
28
+ assert "Fallback Web Result" in result
29
+
30
+ def test_web_researcher_hf():
31
+ researcher = WebResearcher()
32
+ with patch("app.services.web_researcher.Client") as mock_client_class:
33
+ mock_instance = MagicMock()
34
+ mock_client_class.return_value = mock_instance
35
+ mock_instance.predict.return_value = ({"data": [["space", "example/space", 10, 100]]}, [])
36
+
37
+ result = researcher.research_hf_spaces("sentiment analysis")
38
+ assert "Found HF components" in result
39
+ # Check that it tried to connect to John6666/testwarm
40
+ mock_client_class.assert_any_call("John6666/testwarm")
Dockerfile ADDED
@@ -0,0 +1,42 @@
1
+ # Use an official Python runtime as a parent image
2
+ FROM python:3.11-slim-bullseye
3
+
4
+ # Set environment variables
5
+ ENV PYTHONUNBUFFERED=1 \
6
+ PYTHONDONTWRITEBYTECODE=1 \
7
+ DEBIAN_FRONTEND=noninteractive \
8
+ PYTHONPATH=.:CriticalThinking
9
+
10
+ # Install system dependencies
11
+ RUN apt-get update && apt-get install -y --no-install-recommends \
12
+ build-essential \
13
+ git \
14
+ wget \
15
+ && rm -rf /var/lib/apt/lists/*
16
+
17
+ # Set working directory
18
+ WORKDIR /app
19
+
20
+ # Install uv and requirements
21
+ COPY CriticalThinking/requirements.txt .
22
+ RUN pip install --no-cache-dir uv && \
23
+ uv pip install --system --no-cache-dir -r requirements.txt && \
24
+ uv pip install --system --no-cache-dir gradio uvicorn
25
+
26
+ # Create a non-root user and switch to it
27
+ # Hugging Face Spaces use a user with UID 1000
28
+ RUN useradd -m -u 1000 user
29
+ USER user
30
+ ENV PATH="/home/user/.local/bin:$PATH"
31
+ WORKDIR /home/user/app
32
+
33
+ # Copy the rest of the application
34
+ # Use --chown=user to ensure the user has permissions
35
+ COPY --chown=user . .
36
+
37
+ # Expose the port
38
+ EXPOSE 7860
39
+
40
+ # Command to run the application
41
+ # We use uvicorn to run the hf_app:app
42
+ CMD ["uvicorn", "hf_app:app", "--host", "0.0.0.0", "--port", "7860"]
LICENSE ADDED
@@ -0,0 +1,53 @@
1
+ The AI Scientist Source Code License
2
+ Version 1.0, December 2025
3
+
4
+ This license is based on the Responsible AI Source Code License v1.1 (http://licenses.ai/).
5
+
6
+ TERMS AND CONDITIONS
7
+
8
+ The AI Scientist Source Code License (“License”) governs the use of the accompanying software. If you access or use the software, you accept the License. If you do not accept the License, do not access or use the software.
9
+
10
+ 1. Definitions.
11
+ (i) "License" means the terms and conditions for use, reproduction, and distribution as defined by Sections one (1) through eight (8) of this document.
12
+ (ii) "Licensor" means Sakana AI, the copyright owner or legal entity authorized by the copyright owner that is granting the License.
13
+ (iii) "You" (or "Your") means an individual or legal entity exercising permissions granted by this License.
14
+ (iv) The terms “reproduce”, “reproduction”, “derivative works”, and “distribution” have the same meaning here as under U.S. Copyright Law.
15
+ (v) “Contribution” means the original software, additions to the original software, modifications to the original software, or derivative works of the original software.
16
+ (vi) "Contributor" means any person or Licensor who provides a Contribution.
17
+
18
+ 2. Grant of Rights.
19
+ Subject to this License, each Contributor grants You a non-exclusive, worldwide, royalty-free copyright license to reproduce its Contribution, prepare derivative works of its Contribution, and distribute its Contribution or any derivative works of its Contribution that You create.
20
+
21
+ 3. Restrictions.
22
+ 3.1. If You distribute any portion of the Contribution, You must include a complete copy of this License with the distribution; and
23
+ 3.2. You agree that the Contribution, or any derivative work of the Contribution, will not be used by You or any third party subject to Your control, to:
24
+
25
+ a. Surveillance
26
+ i. Detect or infer any legally protected class or aspect of any person, as defined by U.S. Federal Law; and
27
+ ii. Detect or infer aspects and/or features of an identity any person, such as name, family name, address, gender, sexual orientation, race, religion, age, location (at any geographical level), skin color, society or political affiliations, employment status and/or employment history, and health and medical conditions.
28
+
29
+ b. Computer Generated Media
30
+ i. Synthesize and/or modify audio-realistic and/or video-realistic representations of people and events, without including a caption, watermark, and/or metadata file indicating that the representations were generated using the Contribution.
31
+
32
+ c. Health Care
33
+ i. Predict the likelihood that any person will request to file an insurance claim;
34
+ ii. Diagnose a medical condition without human oversight.
35
+
36
+ d. Criminal
37
+ i. Predict the likelihood that a crime will be committed by any person or predict the likelihood of any person being a criminal based on facial attributes or personal data.
38
+
39
+ e. Scientific Manuscripts and Academic Integrity (The "AI Scientist" Clause)
40
+ i. Generate or disseminate scientific manuscripts, research papers, or technical reports without expressly and intelligibly disclaiming, in a prominent manner (e.g., in the abstract, or a dedicated 'Disclosure' or 'Methods' section), that the content was machine-generated or produced using The AI Scientist.
41
+
42
+ 3.3. Restrictions referenced in Section 3.2 MUST be included as an enforceable provision by You in any type of legal agreement governing the use and/or distribution of the Work or any Derivative Works.
43
+
44
+ 4. Termination
45
+ Upon the occurrence of any of the restricted uses listed above in “3. Restrictions”, Licensor shall have the right to terminate this License Agreement and require You to immediately return or destroy all copies of the Contribution. Termination of this License Agreement shall be in addition to and not in lieu of any other remedies available to Licensor.
46
+
47
+ 5. Disclaimer of Warranty.
48
+ Unless required by applicable law, Licensor provides any Contribution on an "As-Is" basis, without WARRANTIES OR CONDITIONS OF ANY KIND.
49
+
50
+ 6. Limitation of Liability.
51
+ In no event shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages arising as a result of this License.
52
+
53
+ END OF TERMS AND CONDITIONS
README.md CHANGED
@@ -1,10 +1,412 @@
1
  ---
2
  title: Critical Code Agent
3
- emoji: 🐨
4
- colorFrom: blue
5
- colorTo: green
6
  sdk: docker
7
  pinned: false
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
  title: Critical Code Agent
3
+ emoji: 🦀
4
+ colorFrom: red
5
+ colorTo: blue
6
  sdk: docker
7
  pinned: false
8
+ app_port: 7860
9
  ---
10
 
11
+ <h1 align="center">
12
+ <a href="https://github.com/SakanaAI/AI-Scientist/blob/main/docs/logo_2.png">
13
+ <img src="docs/logo_2.png" width="215" /></a><br>
14
+ <b>The AI Scientist: Towards Fully Automated</b><br>
15
+ <b>Open-Ended Scientific Discovery 🧑‍🔬</b><br>
16
+ </h1>
17
+
18
+ <p align="center">
19
+ 📚 <a href="https://arxiv.org/abs/2408.06292">[Paper]</a> |
20
+ 📝 <a href="https://sakana.ai/ai-scientist/">[Blog Post]</a> |
21
+ 📂 <a href="https://drive.google.com/drive/folders/1G7A0wTqfXVa-cpexjk0oaXakaSJwffEt">[Drive Folder]</a>
22
+ </p>
23
+
24
+ One of the grand challenges of artificial intelligence is developing agents capable of conducting scientific research and discovering new knowledge. While frontier models have already been used to aid human scientists—for example, for brainstorming ideas or writing code—they still require extensive manual supervision or are heavily constrained to specific tasks.
25
+
26
+ We're excited to introduce **The AI Scientist**, the first comprehensive system for fully automatic scientific discovery, enabling Foundation Models such as Large Language Models (LLMs) to perform research independently.
27
+
28
+ We provide all runs and data from our paper [here](https://drive.google.com/drive/folders/1G7A0wTqfXVa-cpexjk0oaXakaSJwffEt?usp=sharing), where we run each base model on each template for approximately 50 ideas. We *highly* recommend reading through some of the [Claude papers](https://drive.google.com/drive/folders/1Mmpz6M1FK4q8e-SewgZcUzdeD0Q2zC39?usp=sharing) to get a sense of the system's strengths and weaknesses. Here are some example papers generated by **The AI Scientist** 📝:
29
+
30
+ 1. [DualScale Diffusion: Adaptive Feature Balancing for Low-Dimensional Generative Models](https://github.com/SakanaAI/AI-Scientist/blob/main/example_papers/adaptive_dual_scale_denoising.pdf)
31
+ 2. [Multi-scale Grid Noise Adaptation: Enhancing Diffusion Models For Low-dimensional Data](https://github.com/SakanaAI/AI-Scientist/blob/main/example_papers/grid_based_noise_adaptation.pdf)
32
+ 3. [GAN-Enhanced Diffusion: Boosting Sample Quality and Diversity](https://github.com/SakanaAI/AI-Scientist/blob/main/example_papers/gan_diffusion.pdf)
33
+ 4. [DualDiff: Enhancing Mode Capture in Low-dimensional Diffusion Models via Dual-expert Denoising](https://github.com/SakanaAI/AI-Scientist/tree/main/example_papers/dual_expert_denoiser.pdf)
34
+ 5. [StyleFusion: Adaptive Multi-style Generation in Character-Level Language Models](https://github.com/SakanaAI/AI-Scientist/blob/main/example_papers/multi_style_adapter.pdf)
35
+ 6. [Adaptive Learning Rates for Transformers via Q-Learning](https://github.com/SakanaAI/AI-Scientist/tree/main/example_papers/rl_lr_adaptation.pdf)
36
+ 7. [Unlocking Grokking: A Comparative Study of Weight Initialization Strategies in Transformer Models](https://github.com/SakanaAI/AI-Scientist/tree/main/example_papers/weight_initialization_grokking.pdf)
37
+ 8. [Grokking Accelerated: Layer-wise Learning Rates for Transformer Generalization](https://github.com/SakanaAI/AI-Scientist/tree/main/example_papers/layerwise_lr_grokking.pdf)
38
+ 9. [Grokking Through Compression: Unveiling Sudden Generalization via Minimal Description Length](https://github.com/SakanaAI/AI-Scientist/tree/main/example_papers/mdl_grokking_correlation.pdf)
39
+ 10. [Accelerating Mathematical Insight: Boosting Grokking Through Strategic Data Augmentation](https://github.com/SakanaAI/AI-Scientist/tree/main/example_papers/data_augmentation_grokking.pdf)
40
+
41
+ > **Note:**
42
+ > **Caution!** This codebase will execute LLM-written code. There are various risks and challenges associated with this autonomy, including the use of potentially dangerous packages, web access, and potential spawning of processes. Use at your own discretion. Please make sure to [containerize](#containerization) and restrict web access appropriately.
43
+
44
+ <p align="center">
45
+ <a href="https://github.com/SakanaAI/AI-Scientist/blob/main/example_papers/adaptive_dual_scale_denoising/adaptive_dual_scale_denoising.pdf"><img src="https://github.com/SakanaAI/AI-Scientist/blob/main/docs/anim-ai-scientist.gif" alt="Adaptive Dual Scale Denoising" width="80%" />
46
+ </a></p>
47
+
48
+ ## Table of Contents
49
+
50
+ 1. [Introduction](#introduction)
51
+ 2. [Requirements](#requirements)
52
+ - [Installation](#installation)
53
+ - [Supported Models and API Keys](#supported-models-and-api-keys)
54
+ 3. [Setting Up the Templates](#setting-up-the-templates)
55
+ - [NanoGPT Template](#nanogpt-template)
56
+ - [2D Diffusion Template](#2d-diffusion-template)
57
+ - [Grokking Template](#grokking-template)
58
+ 4. [Run AI Scientist Paper Generation Experiments](#run-ai-scientist-paper-generation-experiments)
59
+ 5. [Getting an LLM-Generated Paper Review](#getting-an-llm-generated-paper-review)
60
+ 6. [Making Your Own Template](#making-your-own-template)
61
+ - [Community-Contributed Templates](#community-contributed-templates)
62
+ 7. [Template Resources](#template-resources)
63
+ 8. [Citing The AI Scientist](#citing-the-ai-scientist)
64
+ 9. [Frequently Asked Questions](#frequently-asked-questions)
65
+ 10. [Containerization](#containerization)
66
+
67
+ ## Introduction
68
+
69
+ We provide three templates, which were used in our paper, covering the following domains: **NanoGPT**, **2D Diffusion**, and **Grokking**. These templates enable The AI Scientist to generate ideas and conduct experiments in these areas. We accept contributions of new templates from the community, but please note that they are not maintained by us. All other templates beyond the three provided are community contributions.
70
+
71
+ ## Requirements
72
+
73
+ This code is designed to run on Linux with NVIDIA GPUs using CUDA and PyTorch. Support for other GPU architectures may be possible by following the [PyTorch guidelines](https://pytorch.org/get-started/locally/). The current templates would likely take an infeasible amount of time on CPU-only machines. Running on other operating systems may require significant adjustments.
74
+
75
+ ### Installation
76
+
77
+ ```bash
78
+ conda create -n ai_scientist python=3.11
79
+ conda activate ai_scientist
80
+ # Install pdflatex
81
+ sudo apt-get install texlive-full
82
+
83
+ # Install PyPI requirements
84
+ pip install -r requirements.txt
85
+ ```
86
+
87
+ **Note:** Installing `texlive-full` can take a long time. You may need to [hold Enter](https://askubuntu.com/questions/956006/pregenerating-context-markiv-format-this-may-take-some-time-takes-forever) during the installation.
88
+
89
+ ### Supported Models and API Keys
90
+
91
+ We support a wide variety of models, including open-weight and API-only models. In general, we recommend using only frontier models above the capability of the original GPT-4. To see a full list of supported models, see [here](https://github.com/SakanaAI/AI-Scientist/blob/main/ai_scientist/llm.py).
92
+
93
+ #### OpenAI API (GPT-4o, GPT-4o-mini, o1 models)
94
+
95
+ By default, this uses the `OPENAI_API_KEY` environment variable.
96
+
97
+ #### Anthropic API (Claude Sonnet 3.5)
98
+
99
+ By default, this uses the `ANTHROPIC_API_KEY` environment variable.
100
+
101
+ ##### Claude Models via Bedrock
102
+
103
+ For Claude models provided by [Amazon Bedrock](https://aws.amazon.com/bedrock/), please install these additional packages:
104
+
105
+ ```bash
106
+ pip install anthropic[bedrock]
107
+ ```
108
+
109
+ Next, specify a set of valid [AWS Credentials](https://docs.aws.amazon.com/cli/v1/userguide/cli-configure-envvars.html) and the target [AWS Region](https://docs.aws.amazon.com/bedrock/latest/userguide/bedrock-regions.html):
110
+
111
+ Set the environment variables: `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_REGION_NAME`.
112
+
113
+ ##### Claude Models via Vertex AI
114
+
115
+ For Claude models provided by [Vertex AI Model Garden](https://cloud.google.com/model-garden?hl=en), please install these additional packages:
116
+
117
+ ```bash
118
+ pip install google-cloud-aiplatform
119
+ pip install anthropic[vertex]
120
+ ```
121
+
122
+ Next, set up valid authentication for a [Google Cloud project](https://cloud.google.com/vertex-ai/docs/authentication), for example by providing the region and project ID:
123
+
124
+ ```bash
125
+ export CLOUD_ML_REGION="REGION" # for Model Garden call
126
+ export ANTHROPIC_VERTEX_PROJECT_ID="PROJECT_ID" # for Model Garden call
127
+ export VERTEXAI_LOCATION="REGION" # for Aider/LiteLLM call
128
+ export VERTEXAI_PROJECT="PROJECT_ID" # for Aider/LiteLLM call
129
+ ```
130
+
131
+ #### DeepSeek API (deepseek-chat, deepseek-reasoner)
132
+ By default, this uses the `DEEPSEEK_API_KEY` environment variable.
133
+
134
+ #### OpenRouter API (Llama3.1)
135
+
136
+ By default, this uses the `OPENROUTER_API_KEY` environment variable.
137
+
138
+ #### Google Gemini
139
+ We support Google Gemini models (e.g., "gemini-1.5-flash", "gemini-1.5-pro") via the [google-generativeai](https://pypi.org/project/google-generativeai) Python library. By default, it uses the environment variable:
140
+
141
+ ```bash
142
+ export GEMINI_API_KEY="YOUR GEMINI API KEY"
143
+ ```
144
+
145
+ #### Semantic Scholar API (Literature Search)
146
+
147
+ Our code can also optionally use a Semantic Scholar API Key (`S2_API_KEY`) for higher throughput [if you have one](https://www.semanticscholar.org/product/api), though it should work without it in principle. If you have problems with Semantic Scholar, you can skip the literature search and citation phases of paper generation.
148
+
149
+ Be sure to provide the key for the model used for your runs, e.g.:
150
+
151
+ ```bash
152
+ export OPENAI_API_KEY="YOUR KEY HERE"
153
+ export S2_API_KEY="YOUR KEY HERE"
154
+ ```
155
+
156
+ #### OpenAlex API (Literature Search Alternative)
157
+
158
+ The OpenAlex API can be used as an alternative if you do not have a Semantic Scholar API key.
159
+ OpenAlex does not require an API key.
160
+
161
+ ```bash
162
+ pip install pyalex
163
+ export OPENALEX_MAIL_ADDRESS="YOUR EMAIL ADDRESS"
164
+ ```
165
+
166
+ And specify `--engine openalex` when you execute the AI Scientist code.
167
+
168
+ Note that this option is experimental and intended for those who do not have a Semantic Scholar API key.
169
+
170
+ ## Setting Up the Templates
171
+
172
+ This section provides instructions for setting up each of the three templates used in our paper. Before running The AI Scientist experiments, please ensure you have completed the setup steps for the templates you are interested in.
173
+
174
+ ### NanoGPT Template
175
+
176
+ **Description:** This template investigates transformer-based autoregressive next-token prediction tasks.
177
+
178
+ **Setup Steps:**
179
+
180
+ 1. **Prepare the data:**
181
+
182
+ ```bash
183
+ python data/enwik8/prepare.py
184
+ python data/shakespeare_char/prepare.py
185
+ python data/text8/prepare.py
186
+ ```
187
+
188
+ 2. **Create baseline runs (machine dependent):**
189
+
190
+ ```bash
191
+ # Set up NanoGPT baseline run
192
+ # NOTE: YOU MUST FIRST RUN THE PREPARE SCRIPTS ABOVE!
193
+ cd templates/nanoGPT
194
+ python experiment.py --out_dir run_0
195
+ python plot.py
196
+ ```
197
+
198
+ ### 2D Diffusion Template
199
+
200
+ **Description:** This template studies improving the performance of diffusion generative models on low-dimensional datasets.
201
+
202
+ **Setup Steps:**
203
+
204
+ 1. **Install dependencies:**
205
+
206
+ ```bash
207
+ # Set up 2D Diffusion
208
+ git clone https://github.com/gregversteeg/NPEET.git
209
+ cd NPEET
210
+ pip install .
211
+ pip install scikit-learn
212
+ ```
213
+
214
+ 2. **Create baseline runs:**
215
+
216
+ ```bash
217
+ # Set up 2D Diffusion baseline run
218
+ cd templates/2d_diffusion
219
+ python experiment.py --out_dir run_0
220
+ python plot.py
221
+ ```
222
+
223
+ ### Grokking Template
224
+
225
+ **Description:** This template investigates questions about generalization and learning speed in deep neural networks.
226
+
227
+ **Setup Steps:**
228
+
229
+ 1. **Install dependencies:**
230
+
231
+ ```bash
232
+ # Set up Grokking
233
+ pip install einops
234
+ ```
235
+
236
+ 2. **Create baseline runs:**
237
+
238
+ ```bash
239
+ # Set up Grokking baseline run
240
+ cd templates/grokking
241
+ python experiment.py --out_dir run_0
242
+ python plot.py
243
+ ```
244
+
245
+ ## Run AI Scientist Paper Generation Experiments
246
+
247
+ **Note:** Please ensure the setup steps above are completed before running these experiments.
248
+
249
+ ```bash
250
+ conda activate ai_scientist
251
+ # Run the paper generation.
252
+ python launch_scientist.py --model "gpt-4o-2024-05-13" --experiment nanoGPT_lite --num-ideas 2
253
+ python launch_scientist.py --model "claude-3-5-sonnet-20241022" --experiment nanoGPT_lite --num-ideas 2
254
+ ```
255
+
256
+ If you have more than one GPU, use the `--parallel` option to parallelize ideas across multiple GPUs.
257
+
258
+ ## Getting an LLM-Generated Paper Review
259
+
260
+ ```python
261
+ import openai
262
+ from ai_scientist.perform_review import load_paper, perform_review
263
+
264
+ client = openai.OpenAI()
265
+ model = "gpt-4o-2024-05-13"
266
+
267
+ # Load paper from PDF file (raw text)
268
+ paper_txt = load_paper("report.pdf")
269
+
270
+ # Get the review dictionary
271
+ review = perform_review(
272
+ paper_txt,
273
+ model,
274
+ client,
275
+ num_reflections=5,
276
+ num_fs_examples=1,
277
+ num_reviews_ensemble=5,
278
+ temperature=0.1,
279
+ )
280
+
281
+ # Inspect review results
282
+ review["Overall"] # Overall score (1-10)
283
+ review["Decision"] # 'Accept' or 'Reject'
284
+ review["Weaknesses"] # List of weaknesses (strings)
285
+ ```
286
+
287
+ To run batch analysis:
288
+
289
+ ```bash
290
+ cd review_iclr_bench
291
+ python iclr_analysis.py --num_reviews 500 --batch_size 100 --num_fs_examples 1 --num_reflections 5 --temperature 0.1 --num_reviews_ensemble 5
292
+ ```
293
+
294
+ ## Making Your Own Template
295
+
296
+ If there is an area of study you would like **The AI Scientist** to explore, it is straightforward to create your own templates. In general, follow the structure of the existing templates, which consist of:
297
+
298
+ - `experiment.py` — This is the main script where the core content is. It takes an argument `--out_dir`, which specifies where it should create the folder and save the relevant information from the run.
299
+ - `plot.py` — This script takes the information from the `run` folders and creates plots. The code should be clear and easy to edit.
300
+ - `prompt.json` — Put information about your template here.
301
+ - `seed_ideas.json` — Place example ideas here. You can also try to generate ideas without any examples and then pick the best one or two to put here.
302
+ - `latex/template.tex` — We recommend using our LaTeX folder, but be sure to replace the pre-loaded citations with ones you expect to be more relevant.
303
+
304
+ The key to making new templates work is matching the base filenames and output JSONs to the existing format; everything else is free to change.
305
+ You should also ensure that the `template.tex` file is updated to use the correct citation style / base plots for your template.
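+
+ As a rough, hedged sketch of the expected interface (the metric names and values below are placeholders; see the existing templates for the authoritative structure), a new `experiment.py` needs to accept `--out_dir` and write a `final_info.json` whose entries expose a `"means"` field, since `launch_scientist.py` reads `run_0/final_info.json` and extracts `v["means"]` for each metric:
+
+ ```python
+ # Minimal skeleton for a new template's experiment.py; metrics are placeholders.
+ import argparse
+ import json
+ import os
+
+
+ def run_experiment():
+     # Replace with your actual experiment; return metric name -> mean value.
+     return {"eval_loss": 0.123, "train_time_seconds": 42.0}
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--out_dir", type=str, required=True)
+     args = parser.parse_args()
+
+     os.makedirs(args.out_dir, exist_ok=True)
+     results = run_experiment()
+     # launch_scientist.py expects each entry to carry a "means" field.
+     final_info = {k: {"means": v} for k, v in results.items()}
+     with open(os.path.join(args.out_dir, "final_info.json"), "w") as f:
+         json.dump(final_info, f, indent=4)
+ ```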
306
+
307
+ ### Community-Contributed Templates
308
+
309
+ We welcome community contributions in the form of new templates. While these are not maintained by us, we are delighted to highlight your templates to others. Below, we list community-contributed templates along with links to their pull requests (PRs):
310
+
311
+ - Infectious Disease Modeling (`seir`) - [PR #137](https://github.com/SakanaAI/AI-Scientist/pull/137)
312
+ - Image Classification with MobileNetV3 (`mobilenetV3`) - [PR #141](https://github.com/SakanaAI/AI-Scientist/pull/141)
313
+ - Sketch RNN (`sketch_rnn`) - [PR #143](https://github.com/SakanaAI/AI-Scientist/pull/143)
314
+ - AI in Quantum Chemistry (`MACE`) - [PR #157](https://github.com/SakanaAI/AI-Scientist/pull/157)
315
+ - Earthquake Prediction (`earthquake-prediction`) - [PR #167](https://github.com/SakanaAI/AI-Scientist/pull/167)
316
+ - Tensorial Radiance Fields (`tensorf`) - [PR #175](https://github.com/SakanaAI/AI-Scientist/pull/175)
317
+ - Large Language Model Steering / Probes (`probes`) - [PR #215](https://github.com/SakanaAI/AI-Scientist/pull/215)
318
+
319
+ *This section is reserved for community contributions. Please submit a pull request to add your template to the list! Describe the template in the PR description, and also show examples of the generated papers.*
320
+
321
+ ## Template Resources
322
+
323
+ We provide three templates, which heavily use code from other repositories, credited below:
324
+
325
+ - **NanoGPT Template** uses code from [NanoGPT](https://github.com/karpathy/nanoGPT) and this [PR](https://github.com/karpathy/nanoGPT/pull/254).
326
+ - **2D Diffusion Template** uses code from [tiny-diffusion](https://github.com/tanelp/tiny-diffusion), [ema-pytorch](https://github.com/lucidrains/ema-pytorch), and [Datasaur](https://www.research.autodesk.com/publications/same-stats-different-graphs/).
327
+ - **Grokking Template** uses code from [Sea-Snell/grokking](https://github.com/Sea-Snell/grokking) and [danielmamay/grokking](https://github.com/danielmamay/grokking).
328
+
329
+ We would like to thank the developers of the open-source models and packages for their contributions and for making their work available.
330
+
331
+ ## Citing The AI Scientist
332
+
333
+ If you use **The AI Scientist** in your research, please cite it as follows:
334
+
335
+ ```
336
+ @article{lu2024aiscientist,
337
+ title={The {AI} {S}cientist: Towards Fully Automated Open-Ended Scientific Discovery},
338
+ author={Lu, Chris and Lu, Cong and Lange, Robert Tjarko and Foerster, Jakob and Clune, Jeff and Ha, David},
339
+ journal={arXiv preprint arXiv:2408.06292},
340
+ year={2024}
341
+ }
342
+ ```
343
+
344
+ ## Frequently Asked Questions
345
+
346
+ We recommend reading our paper first for any questions you have on The AI Scientist.
347
+
348
+ **Why am I missing files when running The AI Scientist?**
349
+
350
+ Ensure you have completed all the setup and preparation steps before running the main experiment script.
351
+
352
+ **Why has a PDF or a review not been generated?**
353
+
354
+ The rate at which The AI Scientist successfully completes an idea depends on the template, the base foundation model, and the complexity of the idea; we advise referring to our main paper for details. The highest success rates are observed with Claude Sonnet 3.5. Reviews are best done with GPT-4o; all other models have issues with positivity bias or fail to conform to the required output format.
355
+
356
+ **What is the cost of each idea generated?**
357
+
358
+ Typically less than $15 per paper with Claude Sonnet 3.5. We recommend DeepSeek Coder V2 for a much more cost-effective approach. A good place to look for new models is the [Aider leaderboard](https://aider.chat/docs/leaderboards/).
359
+
360
+ **How do I change the base conference format associated with the write-ups?**
361
+
362
+ Change the base `template.tex` files contained within each template.
363
+
364
+ **How do I run The AI Scientist for different subject fields?**
365
+
366
+ Please refer to the instructions for different templates. In the current iteration, The AI Scientist is restricted to ideas that can be expressed in code. However, lifting this restriction would represent exciting future work! :)
367
+
368
+ **How do I add support for a new foundation model?**
369
+
370
+ You may modify `ai_scientist/llm.py` to add support for a new foundation model. We do not advise using any model that is significantly weaker than GPT-4 level for **The AI Scientist**.
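+
+ The structure of `ai_scientist/llm.py` is not reproduced in this document, but the launch scripts import `AVAILABLE_LLMS` and `create_client(model)` from it, so adding a model roughly amounts to registering its name and teaching `create_client` how to construct a client for it. A purely illustrative sketch under that assumption (the model name `my-new-model` and the endpoint URL are hypothetical):
+
+ ```python
+ # Purely illustrative sketch; the real ai_scientist/llm.py may be organized differently.
+ import openai
+
+ AVAILABLE_LLMS = [
+     "gpt-4o-2024-05-13",
+     "claude-3-5-sonnet-20240620",
+     "my-new-model",  # hypothetical new entry
+ ]
+
+
+ def create_client(model):
+     if model == "my-new-model":
+         # Hypothetical OpenAI-compatible endpoint serving the new model.
+         client = openai.OpenAI(base_url="https://example.com/v1")
+         return client, model
+     if model.startswith("gpt-"):
+         return openai.OpenAI(), model
+     raise ValueError(f"Model {model} not supported.")
+ ```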
371
+
372
+ **Why do I need to run the baseline runs myself?**
373
+
374
+ These appear as `run_0` and should be executed on each machine you run **The AI Scientist** on, so that run-time comparisons are accurate despite hardware differences.
375
+
376
+ **What if I have problems accessing the Semantic Scholar API?**
377
+
378
+ We use the Semantic Scholar API to check ideas for novelty and to collect citations for the paper write-up. If you don't have an API key, or the API is slow to access, you may be able to skip these phases (for example, `launch_scientist.py` provides a `--skip-novelty-check` flag).
379
+
380
+ ## Containerization
381
+
382
+ We include a [community-contributed](https://github.com/SakanaAI/AI-Scientist/pull/21) Docker image that may assist with your containerization efforts in `experimental/Dockerfile`.
383
+
384
+ You can use this image as follows:
385
+
386
+ ```bash
387
+ # Endpoint Script
388
+ docker run -e OPENAI_API_KEY=$OPENAI_API_KEY -v `pwd`/templates:/app/AI-Scientist/templates <AI_SCIENTIST_IMAGE> \
389
+ --model gpt-4o-2024-05-13 \
390
+ --experiment 2d_diffusion \
391
+ --num-ideas 2
392
+ ```
393
+
394
+ ```bash
395
+ # Interactive
396
+ docker run -it -e OPENAI_API_KEY=$OPENAI_API_KEY \
397
+ --entrypoint /bin/bash \
398
+ <AI_SCIENTIST_IMAGE>
399
+ ```
400
+
401
+ ## ⚖️ License & Responsible Use
402
+
403
+ This project is licensed under **The AI Scientist Source Code License** (a derivative of the Responsible AI License).
404
+
405
+ **Mandatory Disclosure:** By using this code, you are legally bound to clearly and prominently disclose the use of AI in any resulting scientific manuscripts or papers.
406
+
407
+ We recommend the following attribution in your paper's Abstract or Methods section:
408
+ > "This manuscript was autonomously generated using [The AI Scientist](https://github.com/SakanaAI/AI-Scientist)."
409
+
410
+ ## Star History
411
+
412
+ [![Star History Chart](https://api.star-history.com/svg?repos=SakanaAI/AI-Scientist&type=Date)](https://star-history.com/#SakanaAI/AI-Scientist&Date)
experimental/Dockerfile ADDED
@@ -0,0 +1,89 @@
1
+ # Use Python 3.11 as the base image
2
+ FROM python:3.11-bullseye
3
+
4
+ # Avoid prompts from apt
5
+ ENV DEBIAN_FRONTEND=noninteractive
6
+
7
+ # Set working directory
8
+ WORKDIR /app
9
+
10
+ # Install system dependencies including texlive-full
11
+ RUN apt-get update && apt-get install -y --no-install-recommends \
12
+ wget=1.21-1+deb11u1 \
13
+ git=1:2.30.2-1+deb11u2 \
14
+ build-essential=12.9 \
15
+ libssl-dev=1.1.1w-0+deb11u1 \
16
+ zlib1g-dev=1:1.2.11.dfsg-2+deb11u2 \
17
+ libbz2-dev=1.0.8-4 \
18
+ libreadline-dev=8.1-1 \
19
+ libsqlite3-dev=3.34.1-3 \
20
+ libncursesw5-dev=6.2+20201114-2+deb11u2 \
21
+ xz-utils=5.2.5-2.1~deb11u1 \
22
+ tk-dev=8.6.11+1 \
23
+ libxml2-dev=2.9.10+dfsg-6.7+deb11u4 \
24
+ libxmlsec1-dev=1.2.31-1 \
25
+ libffi-dev=3.3-6 \
26
+ liblzma-dev=5.2.5-2.1~deb11u1 \
27
+ texlive-full=2020.20210202-3 \
28
+ && rm -rf /var/lib/apt/lists/*
29
+
30
+ # Upgrade pip
31
+ RUN pip install --no-cache-dir --upgrade pip==24.2
32
+
33
+ # Install Python packages
34
+ RUN pip install --no-cache-dir \
35
+ anthropic==0.34.0 \
36
+ aider-chat==0.50.1 \
37
+ backoff==2.2.1 \
38
+ openai==1.40.6 \
39
+ matplotlib==3.9.2 \
40
+ pypdf==4.3.1 \
41
+ pymupdf4llm==0.0.10 \
42
+ torch==2.4.0 \
43
+ numpy==1.26.4 \
44
+ transformers==4.44.0 \
45
+ datasets==2.21.0 \
46
+ tiktoken==0.7.0 \
47
+ wandb==0.17.7 \
48
+ tqdm==4.66.5 \
49
+ scikit-learn==1.5.1 \
50
+ einops==0.8.0
51
+
52
+ # Clone and install NPEET with a specific commit
53
+ RUN git clone https://github.com/gregversteeg/NPEET.git
54
+ WORKDIR /app/NPEET
55
+ RUN git checkout 8b0d9485423f74e5eb199324cf362765596538d3 \
56
+ && pip install .
57
+
58
+ # Clone the AI-Scientist repository
59
+ WORKDIR /app
60
+ RUN git clone https://github.com/SakanaAI/AI-Scientist.git
61
+
62
+ # Set working directory to AI-Scientist
63
+ WORKDIR /app/AI-Scientist
64
+
65
+ # Prepare NanoGPT data
66
+ RUN python data/enwik8/prepare.py && \
67
+ python data/shakespeare_char/prepare.py && \
68
+ python data/text8/prepare.py
69
+
70
+ # Set up baseline runs
71
+ RUN for dir in templates/*/; do \
72
+ if [ -f "${dir}experiment.py" ]; then \
73
+ cd "${dir}" || continue; \
74
+ python experiment.py --out_dir run_0 && \
75
+ python plot.py; \
76
+ cd /app/AI-Scientist || exit; \
77
+ fi \
78
+ done
79
+
80
+ # Create entrypoint script
81
+ RUN printf '#!/bin/bash\n\
82
+ python launch_scientist.py "$@"\n' > /app/entrypoint.sh && \
83
+ chmod +x /app/entrypoint.sh
84
+
85
+ # Set the entrypoint
86
+ ENTRYPOINT ["/app/entrypoint.sh"]
87
+
88
+ # Set the default command to an empty array
89
+ CMD []
experimental/launch_oe_scientist.py ADDED
@@ -0,0 +1,394 @@
1
+ import argparse
2
+ import json
3
+ import multiprocessing
4
+ import openai
5
+ import os
6
+ import os.path as osp
7
+ import shutil
8
+ import sys
9
+ import time
10
+ import torch
11
+ from aider.coders import Coder
12
+ from aider.io import InputOutput
13
+ from aider.models import Model
14
+ from datetime import datetime
15
+
16
+ from ai_scientist.generate_ideas import generate_next_idea, check_idea_novelty
17
+ from ai_scientist.llm import create_client, AVAILABLE_LLMS
18
+ from ai_scientist.perform_experiments import perform_experiments
19
+ from ai_scientist.perform_review import perform_review, load_paper, perform_improvement
20
+ from ai_scientist.perform_writeup import perform_writeup, generate_latex
21
+
22
+ NUM_REFLECTIONS = 3
23
+
24
+
25
+ def print_time():
26
+ print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
27
+
28
+
29
+ def parse_arguments():
30
+ parser = argparse.ArgumentParser(description="Run AI scientist experiments")
31
+ # add type of experiment (nanoGPT, Boston, etc.)
32
+ parser.add_argument(
33
+ "--experiment",
34
+ type=str,
35
+ default="nanoGPT",
36
+ help="Experiment to run AI Scientist on.",
37
+ )
38
+ parser.add_argument(
39
+ "--model",
40
+ type=str,
41
+ default="claude-3-5-sonnet-20240620",
42
+ choices=AVAILABLE_LLMS,
43
+ help="Model to use for AI Scientist.",
44
+ )
45
+ parser.add_argument(
46
+ "--writeup",
47
+ type=str,
48
+ default="latex",
49
+ choices=["latex"],
50
+ help="What format to use for writeup",
51
+ )
52
+ parser.add_argument(
53
+ "--parallel",
54
+ type=int,
55
+ default=0,
56
+ help="Number of parallel processes to run. 0 for sequential execution.",
57
+ )
58
+ parser.add_argument(
59
+ "--improvement",
60
+ action="store_true",
61
+ help="Improve based on reviews.",
62
+ )
63
+ parser.add_argument(
64
+ "--gpus",
65
+ type=str,
66
+ default=None,
67
+ help="Comma-separated list of GPU IDs to use (e.g., '0,1,2'). If not specified, all available GPUs will be used.",
68
+ )
69
+ parser.add_argument(
70
+ "--num-ideas",
71
+ type=int,
72
+ default=50,
73
+ help="Number of ideas to generate",
74
+ )
75
+ return parser.parse_args()
76
+
77
+
78
+ def get_available_gpus(gpu_ids=None):
79
+ if gpu_ids is not None:
80
+ return [int(gpu_id) for gpu_id in gpu_ids.split(",")]
81
+ return list(range(torch.cuda.device_count()))
82
+
83
+
84
+ def worker(
85
+ queue,
86
+ base_dir,
87
+ results_dir,
88
+ model,
89
+ client,
90
+ client_model,
91
+ writeup,
92
+ improvement,
93
+ gpu_id,
94
+ idea_archive,
95
+ lock,
96
+ ):
97
+ os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
98
+ print(f"Worker {gpu_id} started.")
99
+ while True:
100
+ _ = queue.get()
101
+ with lock:
102
+ idea_archive = generate_next_idea(
103
+ base_dir,
104
+ client=client,
105
+ model=client_model,
106
+ prev_idea_archive=idea_archive,
107
+ num_reflections=NUM_REFLECTIONS,
108
+ )
109
+ idea_archive = check_idea_novelty(
110
+ idea_archive,
111
+ base_dir=base_dir,
112
+ client=client,
113
+ model=client_model,
114
+ )
115
+ idea = idea_archive[-1]
116
+ if _ is None:
117
+ break
118
+ success, score, _ = do_idea(
119
+ base_dir,
120
+ results_dir,
121
+ idea,
122
+ model,
123
+ client,
124
+ client_model,
125
+ writeup,
126
+ improvement,
127
+ log_file=True,
128
+ )
129
+ print(f"Completed idea: {idea['Name']}, Success: {success}, Score: {score}")
130
+ with lock:
131
+ for x in idea_archive:
132
+ if x["Name"] == idea["Name"] and x["Title"] == idea["Title"]:
133
+ x["Score"] = score
134
+ break
135
+ print(f"Worker {gpu_id} finished.")
136
+
137
+
138
+ def do_idea(
139
+ base_dir,
140
+ results_dir,
141
+ idea,
142
+ model,
143
+ client,
144
+ client_model,
145
+ writeup,
146
+ improvement,
147
+ log_file=False,
148
+ ):
149
+ ## CREATE PROJECT FOLDER
150
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
151
+ idea_name = f"{timestamp}_{idea['Name']}"
152
+ folder_name = osp.join(results_dir, idea_name)
153
+ assert not osp.exists(folder_name), f"Folder {folder_name} already exists."
154
+ destination_dir = folder_name
155
+ shutil.copytree(base_dir, destination_dir, dirs_exist_ok=True)
156
+ with open(osp.join(base_dir, "run_0", "final_info.json"), "r") as f:
157
+ baseline_results = json.load(f)
158
+ baseline_results = {k: v["means"] for k, v in baseline_results.items()}
159
+ exp_file = osp.join(folder_name, "experiment.py")
160
+ vis_file = osp.join(folder_name, "plot.py")
161
+ notes = osp.join(folder_name, "notes.txt")
162
+ with open(notes, "w") as f:
163
+ f.write(f"# Title: {idea['Title']}\n")
164
+ f.write(f"# Experiment description: {idea['Experiment']}\n")
165
+ f.write(f"## Run 0: Baseline\n")
166
+ f.write(f"Results: {baseline_results}\n")
167
+ f.write(f"Description: Baseline results.\n")
168
+ if log_file:
169
+ original_stdout = sys.stdout
170
+ original_stderr = sys.stderr
171
+ log_path = osp.join(folder_name, "log.txt")
172
+ log = open(log_path, "a")
173
+ sys.stdout = log
174
+ sys.stderr = log
175
+ try:
176
+ print_time()
177
+ print(f"*Starting idea: {idea_name}*")
178
+ ## PERFORM EXPERIMENTS
179
+ fnames = [exp_file, vis_file, notes]
180
+ io = InputOutput(
181
+ yes=True, chat_history_file=f"{folder_name}/{idea_name}_aider.txt"
182
+ )
183
+ if model == "deepseek-coder-v2-0724":
184
+ main_model = Model("deepseek/deepseek-coder")
185
+ elif model == "llama3.1-405b":
186
+ main_model = Model("openrouter/meta-llama/llama-3.1-405b-instruct")
187
+ else:
188
+ main_model = Model(model)
189
+ coder = Coder.create(
190
+ main_model=main_model,
191
+ fnames=fnames,
192
+ io=io,
193
+ stream=False,
194
+ use_git=False,
195
+ edit_format="diff",
196
+ )
197
+
198
+ print_time()
199
+ print(f"*Starting Experiments*")
200
+ try:
201
+ success = perform_experiments(idea, folder_name, coder, baseline_results)
202
+ except Exception as e:
203
+ print(f"Error during experiments: {e}")
204
+ print(f"Experiments failed for idea {idea_name}")
205
+ return False, 0, idea
206
+
207
+ if not success:
208
+ print(f"Experiments failed for idea {idea_name}")
209
+ return False, 0, idea
210
+
211
+ print_time()
212
+ print(f"*Starting Writeup*")
213
+ ## PERFORM WRITEUP
214
+ if writeup == "latex":
215
+ writeup_file = osp.join(folder_name, "latex", "template.tex")
216
+ fnames = [exp_file, writeup_file, notes]
217
+ if model == "deepseek-coder-v2-0724":
218
+ main_model = Model("deepseek/deepseek-coder")
219
+ elif model == "llama3.1-405b":
220
+ main_model = Model("openrouter/meta-llama/llama-3.1-405b-instruct")
221
+ else:
222
+ main_model = Model(model)
223
+ coder = Coder.create(
224
+ main_model=main_model,
225
+ fnames=fnames,
226
+ io=io,
227
+ stream=False,
228
+ use_git=False,
229
+ edit_format="diff",
230
+ )
231
+ try:
232
+ perform_writeup(idea, folder_name, coder, client, client_model)
233
+ except Exception as e:
234
+ print(f"Failed to perform writeup: {e}")
235
+ return False, 0, idea
236
+ print("Done writeup")
237
+ else:
238
+ raise ValueError(f"Writeup format {writeup} not supported.")
239
+
240
+ print_time()
241
+ print(f"*Starting Review*")
242
+ ## REVIEW PAPER
243
+ if writeup == "latex":
244
+ try:
245
+ paper_text = load_paper(f"{folder_name}/{idea['Name']}.pdf")
246
+ review = perform_review(
247
+ paper_text,
248
+ model="gpt-4o-2024-05-13",
249
+ client=openai.OpenAI(),
250
+ num_reflections=5,
251
+ num_fs_examples=1,
252
+ num_reviews_ensemble=5,
253
+ temperature=0.1,
254
+ )
255
+ review_score = review["Overall"]
256
+ # Store the review in separate review.txt file
257
+ with open(osp.join(folder_name, "review.txt"), "w") as f:
258
+ f.write(json.dumps(review))
259
+ except Exception as e:
260
+ print(f"Failed to perform review: {e}")
261
+ return False, 0, idea
262
+
263
+ ## IMPROVE WRITEUP
264
+ if writeup == "latex" and improvement:
265
+ print_time()
266
+ print(f"*Starting Improvement*")
267
+ try:
268
+ perform_improvement(review, coder)
269
+ generate_latex(
270
+ coder, folder_name, f"{folder_name}/{idea['Name']}_improved.pdf"
271
+ )
272
+ paper_text = load_paper(f"{folder_name}/{idea['Name']}_improved.pdf")
273
+ review = perform_review(
274
+ paper_text,
275
+ model="gpt-4o-2024-05-13",
276
+ client=openai.OpenAI(),
277
+ num_reflections=5,
278
+ num_fs_examples=1,
279
+ num_reviews_ensemble=5,
280
+ temperature=0.1,
281
+ )
282
+ review_score = review["Overall"]
283
+ # Store the review in separate review.txt file
284
+ with open(osp.join(folder_name, "review_improved.txt"), "w") as f:
285
+ f.write(json.dumps(review))
286
+ except Exception as e:
287
+ print(f"Failed to perform improvement: {e}")
288
+ return False, 0, idea
289
+ return True, review_score, idea
290
+ except Exception as e:
291
+ print(f"Failed to evaluate idea {idea_name}: {str(e)}")
292
+ return False, 0, idea
293
+ finally:
294
+ print("FINISHED IDEA")
295
+ if log_file:
296
+ sys.stdout = original_stdout
297
+ sys.stderr = original_stderr
298
+ log.close()
299
+
300
+
301
+ if __name__ == "__main__":
302
+ args = parse_arguments()
303
+
304
+ # Check available GPUs and adjust parallel processes if necessary
305
+ available_gpus = get_available_gpus(args.gpus)
306
+ if args.parallel > len(available_gpus):
307
+ print(
308
+ f"Warning: Requested {args.parallel} parallel processes, but only {len(available_gpus)} GPUs available. Adjusting to {len(available_gpus)}."
309
+ )
310
+ args.parallel = len(available_gpus)
311
+
312
+ print(f"Using GPUs: {available_gpus}")
313
+
314
+ # Create client
315
+ client, client_model = create_client(args.model)
316
+
317
+ base_dir = osp.join("templates", args.experiment)
318
+ results_dir = osp.join("results", args.experiment)
319
+ idea_archive = []
320
+
321
+ if args.parallel > 0:
322
+ print(f"Running {args.parallel} parallel processes")
323
+ queue = multiprocessing.Queue()
324
+ lock = multiprocessing.Lock()
325
+ for _ in range(args.num_ideas):
326
+ queue.put(_)
327
+
328
+ processes = []
329
+ for i in range(args.parallel):
330
+ gpu_id = available_gpus[i % len(available_gpus)]
331
+ p = multiprocessing.Process(
332
+ target=worker,
333
+ args=(
334
+ queue,
335
+ base_dir,
336
+ results_dir,
337
+ args.model,
338
+ client,
339
+ client_model,
340
+ args.writeup,
341
+ args.improvement,
342
+ gpu_id,
343
+ idea_archive,
344
+ lock,
345
+ ),
346
+ )
347
+ p.start()
348
+ time.sleep(150)
349
+ processes.append(p)
350
+
351
+ # Signal workers to exit
352
+ for _ in range(args.parallel):
353
+ queue.put(None)
354
+
355
+ for p in processes:
356
+ p.join()
357
+
358
+ print("All parallel processes completed.")
359
+ else:
360
+ for _ in range(args.num_ideas):
361
+ idea_archive = generate_next_idea(
362
+ base_dir,
363
+ client=client,
364
+ model=client_model,
365
+ prev_idea_archive=idea_archive,
366
+ num_reflections=NUM_REFLECTIONS,
367
+ )
368
+ idea_archive = check_idea_novelty(
369
+ idea_archive,
370
+ base_dir=base_dir,
371
+ client=client,
372
+ model=client_model,
373
+ )
374
+ idea = idea_archive[-1]
375
+ print(f"Processing idea: {idea['Name']}")
376
+ try:
377
+ success, score, _ = do_idea(
378
+ base_dir,
379
+ results_dir,
380
+ idea,
381
+ args.model,
382
+ client,
383
+ client_model,
384
+ args.writeup,
385
+ args.improvement,
386
+ )
387
+ print(
388
+ f"Completed idea: {idea['Name']}, Success: {success}, Score: {score}"
389
+ )
390
+ idea["Score"] = score
391
+ except Exception as e:
392
+ print(f"Failed to evaluate idea {idea['Name']}: {str(e)}")
393
+
394
+ print("All ideas evaluated.")
hf_app.py ADDED
@@ -0,0 +1,37 @@
1
+ import gradio as gr
2
+ from fastapi import FastAPI
3
+ from CriticalThinking.app.main import app as fastapi_app
4
+
5
+ # The FastAPI app is already initialized in CriticalThinking.app.main
6
+ # We can just mount it or use it as the main app.
7
+ # Here we will mount Gradio onto the existing FastAPI app.
8
+
9
+ def analyze_interface(repo_url, project_description):
10
+ # This is a placeholder for the Gradio UI to interact with the API
11
+ # In a real scenario, we might want to use the background task or just call the service.
12
+ return f"Analysis request for {repo_url} received. Please use the API endpoints to monitor progress."
13
+
14
+ with gr.Blocks(title="Critical Code Agent") as demo:
15
+ gr.Markdown("# 🦀 Critical Code Agent")
16
+ gr.Markdown("Autonomous agent system for deep architectural analysis and software weakness identification.")
17
+
18
+ with gr.Row():
19
+ repo_url = gr.Textbox(label="Repository URL", placeholder="https://github.com/username/repo")
20
+ project_desc = gr.Textbox(label="Project Description", placeholder="Brief description of the project")
21
+
22
+ analyze_btn = gr.Button("Analyze Repository", variant="primary")
23
+ output = gr.Textbox(label="Status")
24
+
25
+ analyze_btn.click(analyze_interface, inputs=[repo_url, project_desc], outputs=output)
26
+
27
+ gr.Markdown("### API Endpoints")
28
+ gr.Markdown("- `POST /analyze`: Submit a repository for analysis")
29
+ gr.Markdown("- `GET /report/{task_id}`: Retrieve analysis report")
30
+ gr.Markdown("- `GET /health`: Check service health")
31
+
32
+ # Mount Gradio to the FastAPI app
33
+ app = gr.mount_gradio_app(fastapi_app, demo, path="/")
34
+
35
+ if __name__ == "__main__":
36
+ import uvicorn
37
+ uvicorn.run(app, host="0.0.0.0", port=7860)
launch_scientist.py ADDED
@@ -0,0 +1,420 @@
1
+ import argparse
2
+ import json
3
+ import multiprocessing
4
+ import openai
5
+ import os
6
+ import os.path as osp
7
+ import shutil
8
+ import sys
9
+ import time
10
+ import torch
11
+ from aider.coders import Coder
12
+ from aider.io import InputOutput
13
+ from aider.models import Model
14
+ from datetime import datetime
15
+
16
+ from ai_scientist.generate_ideas import generate_ideas, check_idea_novelty
17
+ from ai_scientist.llm import create_client, AVAILABLE_LLMS
18
+ from ai_scientist.perform_experiments import perform_experiments
19
+ from ai_scientist.perform_review import perform_review, load_paper, perform_improvement
20
+ from ai_scientist.perform_writeup import perform_writeup, generate_latex
21
+
22
+ NUM_REFLECTIONS = 3
23
+
24
+
25
+ def print_time():
26
+ print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
27
+
28
+
29
+ def parse_arguments():
30
+ parser = argparse.ArgumentParser(description="Run AI scientist experiments")
31
+ parser.add_argument(
32
+ "--skip-idea-generation",
33
+ action="store_true",
34
+ help="Skip idea generation and load existing ideas",
35
+ )
36
+ parser.add_argument(
37
+ "--skip-novelty-check",
38
+ action="store_true",
39
+ help="Skip novelty check and use existing ideas",
40
+ )
41
+ # add type of experiment (nanoGPT, Boston, etc.)
42
+ parser.add_argument(
43
+ "--experiment",
44
+ type=str,
45
+ default="nanoGPT",
46
+ help="Experiment to run AI Scientist on.",
47
+ )
48
+ parser.add_argument(
49
+ "--model",
50
+ type=str,
51
+ default="claude-3-5-sonnet-20240620",
52
+ choices=AVAILABLE_LLMS,
53
+ help="Model to use for AI Scientist.",
54
+ )
55
+ parser.add_argument(
56
+ "--writeup",
57
+ type=str,
58
+ default="latex",
59
+ choices=["latex"],
60
+ help="What format to use for writeup",
61
+ )
62
+ parser.add_argument(
63
+ "--parallel",
64
+ type=int,
65
+ default=0,
66
+ help="Number of parallel processes to run. 0 for sequential execution.",
67
+ )
68
+ parser.add_argument(
69
+ "--improvement",
70
+ action="store_true",
71
+ help="Improve based on reviews.",
72
+ )
73
+ parser.add_argument(
74
+ "--gpus",
75
+ type=str,
76
+ default=None,
77
+ help="Comma-separated list of GPU IDs to use (e.g., '0,1,2'). If not specified, all available GPUs will be used.",
78
+ )
79
+ parser.add_argument(
80
+ "--num-ideas",
81
+ type=int,
82
+ default=50,
83
+ help="Number of ideas to generate",
84
+ )
85
+ parser.add_argument(
86
+ "--engine",
87
+ type=str,
88
+ default="semanticscholar",
89
+ choices=["semanticscholar", "openalex"],
90
+ help="Scholar engine to use.",
91
+ )
92
+ return parser.parse_args()
93
+
94
+
95
+ def get_available_gpus(gpu_ids=None):
96
+ if gpu_ids is not None:
97
+ return [int(gpu_id) for gpu_id in gpu_ids.split(",")]
98
+ return list(range(torch.cuda.device_count()))
99
+
100
+
101
+ def check_latex_dependencies():
102
+ """
103
+ Check if required LaTeX dependencies are installed on the system.
104
+ Returns True if all dependencies are found, False otherwise.
105
+ """
106
+ import shutil
107
+ import sys
108
+
109
+ required_dependencies = ['pdflatex', 'chktex']
110
+ missing_deps = []
111
+
112
+ for dep in required_dependencies:
113
+ if shutil.which(dep) is None:
114
+ missing_deps.append(dep)
115
+
116
+ if missing_deps:
117
+ print("Error: Required LaTeX dependencies not found:", file=sys.stderr)
118
+ return False
119
+
120
+ return True
121
+
122
+ def worker(
123
+ queue,
124
+ base_dir,
125
+ results_dir,
126
+ model,
127
+ client,
128
+ client_model,
129
+ writeup,
130
+ improvement,
131
+ gpu_id,
132
+ ):
133
+ os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
134
+ print(f"Worker {gpu_id} started.")
135
+ while True:
136
+ idea = queue.get()
137
+ if idea is None:
138
+ break
139
+ success = do_idea(
140
+ base_dir,
141
+ results_dir,
142
+ idea,
143
+ model,
144
+ client,
145
+ client_model,
146
+ writeup,
147
+ improvement,
148
+ log_file=True,
149
+ )
150
+ print(f"Completed idea: {idea['Name']}, Success: {success}")
151
+ print(f"Worker {gpu_id} finished.")
152
+
153
+
154
+ def do_idea(
155
+ base_dir,
156
+ results_dir,
157
+ idea,
158
+ model,
159
+ client,
160
+ client_model,
161
+ writeup,
162
+ improvement,
163
+ log_file=False,
164
+ ):
165
+ ## CREATE PROJECT FOLDER
166
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
167
+ idea_name = f"{timestamp}_{idea['Name']}"
168
+ folder_name = osp.join(results_dir, idea_name)
169
+ assert not osp.exists(folder_name), f"Folder {folder_name} already exists."
170
+ destination_dir = folder_name
171
+ shutil.copytree(base_dir, destination_dir, dirs_exist_ok=True)
172
+ with open(osp.join(base_dir, "run_0", "final_info.json"), "r") as f:
173
+ baseline_results = json.load(f)
174
+ # Check if baseline_results is a dictionary before extracting means
175
+ if isinstance(baseline_results, dict):
176
+ baseline_results = {k: v["means"] for k, v in baseline_results.items()}
177
+ exp_file = osp.join(folder_name, "experiment.py")
178
+ vis_file = osp.join(folder_name, "plot.py")
179
+ notes = osp.join(folder_name, "notes.txt")
180
+ with open(notes, "w") as f:
181
+ f.write(f"# Title: {idea['Title']}\n")
182
+ f.write(f"# Experiment description: {idea['Experiment']}\n")
183
+ f.write(f"## Run 0: Baseline\n")
184
+ f.write(f"Results: {baseline_results}\n")
185
+ f.write(f"Description: Baseline results.\n")
186
+ if log_file:
187
+ original_stdout = sys.stdout
188
+ original_stderr = sys.stderr
189
+ log_path = osp.join(folder_name, "log.txt")
190
+ log = open(log_path, "a")
191
+ sys.stdout = log
192
+ sys.stderr = log
193
+ try:
194
+ print_time()
195
+ print(f"*Starting idea: {idea_name}*")
196
+ ## PERFORM EXPERIMENTS
197
+ fnames = [exp_file, vis_file, notes]
198
+ io = InputOutput(
199
+ yes=True, chat_history_file=f"{folder_name}/{idea_name}_aider.txt"
200
+ )
201
+ if model == "deepseek-coder-v2-0724":
202
+ main_model = Model("deepseek/deepseek-coder")
203
+ elif model == "deepseek-reasoner":
204
+ main_model = Model("deepseek/deepseek-reasoner")
205
+ elif model == "llama3.1-405b":
206
+ main_model = Model("openrouter/meta-llama/llama-3.1-405b-instruct")
207
+ else:
208
+ main_model = Model(model)
209
+ coder = Coder.create(
210
+ main_model=main_model,
211
+ fnames=fnames,
212
+ io=io,
213
+ stream=False,
214
+ use_git=False,
215
+ edit_format="diff",
216
+ )
217
+
218
+ print_time()
219
+ print(f"*Starting Experiments*")
220
+ try:
221
+ success = perform_experiments(idea, folder_name, coder, baseline_results)
222
+ except Exception as e:
223
+ print(f"Error during experiments: {e}")
224
+ print(f"Experiments failed for idea {idea_name}")
225
+ return False
226
+
227
+ if not success:
228
+ print(f"Experiments failed for idea {idea_name}")
229
+ return False
230
+
231
+ print_time()
232
+ print(f"*Starting Writeup*")
233
+ ## PERFORM WRITEUP
234
+ if writeup == "latex":
235
+ writeup_file = osp.join(folder_name, "latex", "template.tex")
236
+ fnames = [exp_file, writeup_file, notes]
237
+ if model == "deepseek-coder-v2-0724":
238
+ main_model = Model("deepseek/deepseek-coder")
239
+ elif model == "deepseek-reasoner":
240
+ main_model = Model("deepseek/deepseek-reasoner")
241
+ elif model == "llama3.1-405b":
242
+ main_model = Model("openrouter/meta-llama/llama-3.1-405b-instruct")
243
+ else:
244
+ main_model = Model(model)
245
+ coder = Coder.create(
246
+ main_model=main_model,
247
+ fnames=fnames,
248
+ io=io,
249
+ stream=False,
250
+ use_git=False,
251
+ edit_format="diff",
252
+ )
253
+ try:
254
+ perform_writeup(idea, folder_name, coder, client, client_model, engine=args.engine)
255
+ except Exception as e:
256
+ print(f"Failed to perform writeup: {e}")
257
+ return False
258
+ print("Done writeup")
259
+ else:
260
+ raise ValueError(f"Writeup format {writeup} not supported.")
261
+
262
+ print_time()
263
+ print(f"*Starting Review*")
264
+ ## REVIEW PAPER
265
+ if writeup == "latex":
266
+ try:
267
+ paper_text = load_paper(f"{folder_name}/{idea['Name']}.pdf")
268
+ review = perform_review(
269
+ paper_text,
270
+ model="gpt-4o-2024-05-13",
271
+ client=openai.OpenAI(),
272
+ num_reflections=5,
273
+ num_fs_examples=1,
274
+ num_reviews_ensemble=5,
275
+ temperature=0.1,
276
+ )
277
+ # Store the review in separate review.txt file
278
+ with open(osp.join(folder_name, "review.txt"), "w") as f:
279
+ f.write(json.dumps(review, indent=4))
280
+ except Exception as e:
281
+ print(f"Failed to perform review: {e}")
282
+ return False
283
+
284
+ ## IMPROVE WRITEUP
285
+ if writeup == "latex" and improvement:
286
+ print_time()
287
+ print(f"*Starting Improvement*")
288
+ try:
289
+ perform_improvement(review, coder)
290
+ generate_latex(
291
+ coder, folder_name, f"{folder_name}/{idea['Name']}_improved.pdf"
292
+ )
293
+ paper_text = load_paper(f"{folder_name}/{idea['Name']}_improved.pdf")
294
+ review = perform_review(
295
+ paper_text,
296
+ model="gpt-4o-2024-05-13",
297
+ client=openai.OpenAI(),
298
+ num_reflections=5,
299
+ num_fs_examples=1,
300
+ num_reviews_ensemble=5,
301
+ temperature=0.1,
302
+ )
303
+ # Store the review in separate review.txt file
304
+ with open(osp.join(folder_name, "review_improved.txt"), "w") as f:
305
+ f.write(json.dumps(review))
306
+ except Exception as e:
307
+ print(f"Failed to perform improvement: {e}")
308
+ return False
309
+ return True
310
+ except Exception as e:
311
+ print(f"Failed to evaluate idea {idea_name}: {str(e)}")
312
+ return False
313
+ finally:
314
+ print("FINISHED IDEA")
315
+ if log_file:
316
+ sys.stdout = original_stdout
317
+ sys.stderr = original_stderr
318
+ log.close()
319
+
320
+
321
+ if __name__ == "__main__":
322
+ args = parse_arguments()
323
+
324
+ # Check available GPUs and adjust parallel processes if necessary
325
+ available_gpus = get_available_gpus(args.gpus)
326
+ if args.parallel > len(available_gpus):
327
+ print(
328
+ f"Warning: Requested {args.parallel} parallel processes, but only {len(available_gpus)} GPUs available. Adjusting to {len(available_gpus)}."
329
+ )
330
+ args.parallel = len(available_gpus)
331
+
332
+ print(f"Using GPUs: {available_gpus}")
333
+
334
+ # Check LaTeX dependencies before proceeding
335
+ if args.writeup == "latex" and not check_latex_dependencies():
336
+ sys.exit(1)
337
+
338
+ # Create client
339
+ client, client_model = create_client(args.model)
340
+
341
+ base_dir = osp.join("templates", args.experiment)
342
+ results_dir = osp.join("results", args.experiment)
343
+ ideas = generate_ideas(
344
+ base_dir,
345
+ client=client,
346
+ model=client_model,
347
+ skip_generation=args.skip_idea_generation,
348
+ max_num_generations=args.num_ideas,
349
+ num_reflections=NUM_REFLECTIONS,
350
+ )
351
+ if not args.skip_novelty_check:
352
+ ideas = check_idea_novelty(
353
+ ideas,
354
+ base_dir=base_dir,
355
+ client=client,
356
+ model=client_model,
357
+ engine=args.engine,
358
+ )
359
+
360
+ with open(osp.join(base_dir, "ideas.json"), "w") as f:
361
+ json.dump(ideas, f, indent=4)
362
+
363
+ novel_ideas = [idea for idea in ideas if idea["novel"]]
364
+ # novel_ideas = list(reversed(novel_ideas))
365
+
366
+ if args.parallel > 0:
367
+ print(f"Running {args.parallel} parallel processes")
368
+ queue = multiprocessing.Queue()
369
+ for idea in novel_ideas:
370
+ queue.put(idea)
371
+
372
+ processes = []
373
+ for i in range(args.parallel):
374
+ gpu_id = available_gpus[i % len(available_gpus)]
375
+ p = multiprocessing.Process(
376
+ target=worker,
377
+ args=(
378
+ queue,
379
+ base_dir,
380
+ results_dir,
381
+ args.model,
382
+ client,
383
+ client_model,
384
+ args.writeup,
385
+ args.improvement,
386
+ gpu_id,
387
+ ),
388
+ )
389
+ p.start()
390
+ time.sleep(150)
391
+ processes.append(p)
392
+
393
+ # Signal workers to exit
394
+ for _ in range(args.parallel):
395
+ queue.put(None)
396
+
397
+ for p in processes:
398
+ p.join()
399
+
400
+ print("All parallel processes completed.")
401
+ else:
402
+ for idea in novel_ideas:
403
+ print(f"Processing idea: {idea['Name']}")
404
+ try:
405
+ success = do_idea(
406
+ base_dir,
407
+ results_dir,
408
+ idea,
409
+ args.model,
410
+ client,
411
+ client_model,
412
+ args.writeup,
413
+ args.improvement,
414
+ )
415
+ print(f"Completed idea: {idea['Name']}, Success: {success}")
416
+ except Exception as e:
417
+ print(f"Failed to evaluate idea {idea['Name']}: {str(e)}")
418
+ import traceback
419
+ print(traceback.format_exc())
420
+ print("All ideas evaluated.")
requirements.txt ADDED
@@ -0,0 +1,18 @@
1
+ # LLM APIs
2
+ anthropic
3
+ aider-chat
4
+ backoff
5
+ openai
6
+ google-generativeai
7
+ # Viz
8
+ matplotlib
9
+ pypdf
10
+ pymupdf4llm
11
+ # Common Requirements
12
+ torch
13
+ numpy
14
+ transformers
15
+ datasets
16
+ tiktoken
17
+ wandb
18
+ tqdm