AUXteam committed on
Commit fc10d08 · verified · 1 Parent(s): 7a38201

Upload folder using huggingface_hub

.github/workflows/check_file_size.yml ADDED
@@ -0,0 +1,14 @@
1
+ name: Check file size
2
+ on:
3
+ pull_request:
4
+ branches: [main]
5
+ workflow_dispatch:
6
+
7
+ jobs:
8
+ check-size:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - name: Check large files
12
+ uses: ActionsDesk/lfs-warning@v2.0
13
+ with:
14
+ filesizelimit: 10485760 # 10MB
.github/workflows/sync_to_hf.yml ADDED
@@ -0,0 +1,18 @@
1
+ name: Sync to Hugging Face hub
2
+ on:
3
+ push:
4
+ branches: [main]
5
+ workflow_dispatch:
6
+
7
+ jobs:
8
+ sync-to-hub:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v3
12
+ with:
13
+ fetch-depth: 0
14
+ lfs: true
15
+ - name: Push to hub
16
+ env:
17
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
18
+ run: git push --force https://AUXteam:${HF_TOKEN}@huggingface.co/spaces/AUXteam/Critical_Code_Agent main
.gitignore ADDED
@@ -0,0 +1,174 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ .venv_jax
128
+ env/
129
+ venv/
130
+ ENV/
131
+ env.bak/
132
+ venv.bak/
133
+
134
+ # Spyder project settings
135
+ .spyderproject
136
+ .spyproject
137
+
138
+ # Rope project settings
139
+ .ropeproject
140
+
141
+ # mkdocs documentation
142
+ /site
143
+
144
+ # mypy
145
+ .mypy_cache/
146
+ .dmypy.json
147
+ dmypy.json
148
+
149
+ # Pyre type checker
150
+ .pyre/
151
+
152
+ # pytype static type analyzer
153
+ .pytype/
154
+
155
+ # Cython debug symbols
156
+ cython_debug/
157
+
158
+ # PyCharm
159
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
160
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
161
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
162
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
163
+ .idea/
164
+ .aider*
165
+ *.DS_Store
166
+
167
+ # Misc folders
168
+ data/
169
+ *ckpt.pt
170
+ *.zip
171
+ ICLR2022-OpenReviewData/
172
+ templates/*/run_0/
173
+ templates/*/*.png
174
+ results/*
.hfignore ADDED
@@ -0,0 +1,16 @@
1
+ ai_scientist/
2
+ data/
3
+ docs/
4
+ example_papers/
5
+ review_ai_scientist/
6
+ review_iclr_bench/
7
+ templates/
8
+ .git/
9
+ NPEET/
10
+ __pycache__/
11
+ *.pyc
12
+ *.pyo
13
+ *.pyd
14
+ .pytest_cache/
15
+ .coverage
16
+ htmlcov/
CriticalThinking/PROJECT_STRUCTURE.md ADDED
@@ -0,0 +1,55 @@
1
+ # CriticalThinking Project Structure
2
+
3
+ ## 1. Project Description
4
+ **CriticalThinking** is an autonomous agent system that performs deep architectural analysis of software projects. Inspired by `deep-thinking-agent` and `AI-Scientist`, it moves beyond simple linting to identify high-level design weaknesses, suggest structural improvements, and recommend state-of-the-art replacements (e.g., Hugging Face models) for custom implementations.
5
+
6
+ ### In-app Integrations
7
+ - **Git-based Indexer**: Automatically clones and indexes codebases.
8
+ - **Deep-Thinking Orchestrator**: Uses iterative planning and reasoning (Planner -> Retriever -> Analyzer).
9
+ - **Hypothesis Generator**: Generates "Improvement Hypotheses" and validates them against the code context.
10
+ - **Hugging Face Hub**: Searches for replacement components.
11
+ - **Web Researcher**: Uses specialized MCP servers (harvesthealth/github-mcp-server) and Gradio clients to search GitHub and Hugging Face Spaces for community-driven solutions.
12
+
13
+ ### Proposed FastAPI Setup
14
+ - **App structure**:
15
+ - `main.py`: App entry point.
16
+ - `api/router.py`: API routes.
17
+ - `services/`: Business logic and agent orchestration.
18
+ - `tests/`: Automated tests.
19
+ - **Dependency injection**: Services are initialized per-task with appropriate configuration (LLM model, API keys).
20
+
21
+ ## 2. Tasks and Tests
22
+ ### Backend & Infrastructure
23
+ - **Task: Project Scaffolding**
24
+ - *Test*: `tests/test_main.py` -> Verifies health check.
25
+ - **Task: Codebase Indexing Service**
26
+ - *Test*: `tests/test_indexer.py` -> Verifies cloning, chunking, and search.
27
+
28
+ ### Agent Logic
29
+ - **Task: Multi-Agent Orchestration**
30
+ - *Test*: `tests/test_orchestrator.py` -> Verifies planning and analysis loop.
31
+ - **Task: Improvement & Replacement Logic**
32
+ - *Test*: `tests/test_improvements.py` -> Verifies roadmap generation and HF matching.
33
+
34
+ ### API & End-to-End
35
+ - **Task: API Exposure & Background Jobs**
36
+ - *Test*: `tests/test_api.py` -> Verifies the full /analyze -> /report flow.
37
+
38
+ ## 3. Functionality Expectations
39
+ ### User Perspective
40
+ - Submit a repository URL.
41
+ - Receive a "Critical Thinking Report".
42
+ - View "Critical Weaknesses" and an "Improvement Roadmap".
43
+ - See "Suggested Replacements" (libraries/models) for custom code.
44
+
45
+ ### Technical Perspective
46
+ - **Iterative Reasoning**: The agent doesn't just look at code once; it plans its investigation and refines its findings.
47
+ - **Schema-Aware RAG**: Uses structural context to find relevant code snippets.
48
+ - **External Knowledge**: Connects to Hugging Face Hub for modernization suggestions.
49
+
50
+ ## 4. API Endpoints
51
+ - **POST /analyze**
52
+ - Request: `{"repo_url": "string", "project_description": "string"}`
53
+ - Response: `{"task_id": "uuid"}`
54
+ - **GET /report/{task_id}**
55
+ - Response: `{"status": "completed", "report": {"weaknesses": [], "improvements": []}}`
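
*Editor's sketch:* a minimal client for the two endpoints described above. It assumes the service is reachable on localhost port 7860 (the port exposed by the Dockerfile later in this commit); the repository URL is a placeholder.

```python
import time
import requests

BASE_URL = "http://localhost:7860"  # assumption: adjust to wherever the app is served

# Kick off an analysis task
resp = requests.post(
    f"{BASE_URL}/analyze",
    json={"repo_url": "https://github.com/user/repo", "project_description": "Example project"},
)
resp.raise_for_status()
task_id = resp.json()["task_id"]

# Poll for the report until the background task finishes
while True:
    report = requests.get(f"{BASE_URL}/report/{task_id}").json()
    if report["status"] in ("completed", "failed"):
        break
    time.sleep(2)

print(report)
```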
CriticalThinking/app/api/router.py ADDED
@@ -0,0 +1,80 @@
1
+ from fastapi import APIRouter, BackgroundTasks, HTTPException
2
+ from pydantic import BaseModel
3
+ from typing import List, Dict, Any, Optional
4
+ import uuid
5
+ import os
6
+ from app.services.indexer import CodeIndexer
7
+ from app.services.agent_orchestrator import AgentOrchestrator
8
+ from app.services.improvement_agent import ImprovementAgent
9
+ from app.services.hf_matcher import HFMatcher
10
+ from app.services.web_researcher import WebResearcher
11
+
12
+ router = APIRouter()
13
+
14
+ # In-memory task store
15
+ tasks: Dict[str, Any] = {}
16
+
17
+ class AnalyzeRequest(BaseModel):
18
+ repo_url: str
19
+ project_description: Optional[str] = "Generic software project"
20
+
21
+ class AnalyzeResponse(BaseModel):
22
+ task_id: str
23
+
24
+ def run_analysis_task(task_id: str, repo_url: str, project_description: str):  # plain def: FastAPI runs it in a worker thread, so the blocking git/OpenAI calls below do not stall the event loop
25
+ tasks[task_id]["status"] = "processing"
26
+ try:
27
+ api_key = os.getenv("OPENAI_API_KEY", "dummy")
28
+ # Initialize services
29
+ indexer = CodeIndexer(qdrant_url=":memory:", openai_api_key=api_key)
30
+ orchestrator = AgentOrchestrator(indexer=indexer, openai_api_key=api_key)
31
+ improver = ImprovementAgent(openai_api_key=api_key)
32
+ matcher = HFMatcher()
33
+ web_researcher = WebResearcher()
34
+
35
+ # 1. Index
36
+ indexer.index_repository(repo_url)
37
+
38
+ # 2. Analyze
39
+ analysis_results = orchestrator.run_analysis(project_description)
40
+ weaknesses = analysis_results.get("weaknesses", [])
41
+
42
+ # 3. Improvements
43
+ improvements_results = improver.generate_improvements(weaknesses)
44
+ improvements = improvements_results.get("improvements", [])
45
+
46
+ # 4. Replacement matching and Web Research
47
+ for imp in improvements:
48
+ query = imp.get("replacement_search_query")
49
+ if query:
50
+ # Direct HF search
51
+ replacements = matcher.find_replacements(query)
52
+ imp["suggested_replacements"] = replacements
53
+
54
+ # Web research for GitHub and HF Spaces
55
+ imp["github_research"] = web_researcher.research_github(query)
56
+ imp["hf_spaces_research"] = web_researcher.research_hf_spaces(query)
57
+
58
+ # 5. Store report
59
+ tasks[task_id]["status"] = "completed"
60
+ tasks[task_id]["report"] = {
61
+ "project": project_description,
62
+ "weaknesses": weaknesses,
63
+ "improvements": improvements
64
+ }
65
+ except Exception as e:
66
+ tasks[task_id]["status"] = "failed"
67
+ tasks[task_id]["error"] = str(e)
68
+
69
+ @router.post("/analyze", response_model=AnalyzeResponse)
70
+ async def analyze(request: AnalyzeRequest, background_tasks: BackgroundTasks):
71
+ task_id = str(uuid.uuid4())
72
+ tasks[task_id] = {"status": "pending", "report": None}
73
+ background_tasks.add_task(run_analysis_task, task_id, request.repo_url, request.project_description)
74
+ return AnalyzeResponse(task_id=task_id)
75
+
76
+ @router.get("/report/{task_id}")
77
+ async def get_report(task_id: str):
78
+ if task_id not in tasks:
79
+ raise HTTPException(status_code=404, detail="Task not found")
80
+ return tasks[task_id]
CriticalThinking/app/main.py ADDED
@@ -0,0 +1,10 @@
1
+ from fastapi import FastAPI
2
+ from app.api.router import router
3
+
4
+ app = FastAPI(title="CriticalThinking API")
5
+
6
+ app.include_router(router)
7
+
8
+ @app.get("/health")
9
+ async def health():
10
+ return {"status": "healthy"}
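
*Editor's sketch:* `uvicorn` is already listed in `CriticalThinking/requirements.txt`, so the app can be served locally with a small launcher. This helper is hypothetical (not part of the commit) and assumes it is run from the `CriticalThinking/` directory so the `app` package is importable; port 7860 is chosen only to match the Dockerfile.

```python
# run_dev.py -- hypothetical local launcher, not part of the commit
import uvicorn

if __name__ == "__main__":
    # Serve app.main:app with auto-reload for development.
    uvicorn.run("app.main:app", host="0.0.0.0", port=7860, reload=True)
```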
CriticalThinking/app/services/agent_orchestrator.py ADDED
@@ -0,0 +1,100 @@
1
+ import json
2
+ from typing import List, Dict, Any
3
+ from openai import OpenAI
4
+ import os
5
+
6
+ class BaseAgent:
7
+ def __init__(self, model: str = "gpt-4o", openai_api_key: str = None):
8
+ self.client = OpenAI(api_key=openai_api_key)
9
+ self.model = model
10
+
11
+ def _get_response(self, system_prompt: str, user_prompt: str, response_format=None) -> Any:
12
+ # For testing without an API key (or with a dummy key), fall back to a mock response
13
+ api_key = self.client.api_key or os.getenv("OPENAI_API_KEY")
14
+ if not api_key or api_key == "dummy":
15
+ return self._mock_response(system_prompt)
16
+
17
+ args = {
18
+ "model": self.model,
19
+ "messages": [
20
+ {"role": "system", "content": system_prompt},
21
+ {"role": "user", "content": user_prompt}
22
+ ],
23
+ "temperature": 0.2
24
+ }
25
+ if response_format:
26
+ args["response_format"] = response_format
27
+
28
+ response = self.client.chat.completions.create(**args)
29
+ content = response.choices[0].message.content
30
+
31
+ if response_format and response_format.get("type") == "json_object":
32
+ return json.loads(content)
33
+ return content
34
+
35
+ def _mock_response(self, system_prompt: str) -> Any:
36
+ if "planner" in system_prompt.lower():
37
+ return {
38
+ "steps": [
39
+ {"index": 0, "sub_question": "Analyze project structure", "tool_type": "doc_search"},
40
+ {"index": 1, "sub_question": "Identify core logic flaws", "tool_type": "doc_search"}
41
+ ],
42
+ "reasoning": "Standard analysis flow"
43
+ }
44
+ elif "weakness" in system_prompt.lower():
45
+ return {
46
+ "summary": "The code has several architectural issues.",
47
+ "weaknesses": ["Manual memory management in Python", "Lack of unit tests"],
48
+ "severity": "high"
49
+ }
50
+ return "Mocked response"
51
+
52
+ class Planner(BaseAgent):
53
+ def plan(self, project_overview: str) -> Dict[str, Any]:
54
+ system_prompt = """You are an expert query planner for a deep-thinking codebase analysis system.
55
+ Your task is to decompose complex codebase investigations into sequential execution plans.
56
+ Guidelines:
57
+ - Create 2-5 steps that build on each other.
58
+ - Each step should have a clear sub-question targeting a specific architectural or logic component.
59
+ - Specify tool_type: doc_search (for code retrieval)."""
60
+ user_prompt = f"Decompose the following project overview into a sequential execution plan:\n\nProject Overview: {project_overview}\n\nRespond with valid JSON in this EXACT format:\n{{\n 'steps': [\n {{\n 'index': 0,\n 'sub_question': 'What specific architectural component needs analysis?',\n 'tool_type': 'doc_search',\n 'expected_outputs': ['finding 1', 'finding 2']\n }}\n ],\n 'reasoning': 'Explain why this plan will effectively find weaknesses.'\n}}"
61
+ return self._get_response(system_prompt, user_prompt, response_format={"type": "json_object"})
62
+
63
+ class WeaknessAnalyzer(BaseAgent):
64
+ def analyze(self, code_context: str) -> Dict[str, Any]:
65
+ system_prompt = """You are an AI senior engineer reviewing a project for critical weaknesses.
66
+ Be critical and cautious. Focus on:
67
+ - Architectural flaws (circular dependencies, lack of modularity).
68
+ - Security risks.
69
+ - Performance bottlenecks.
70
+ - Redundant custom logic that could be replaced by standard libraries or models."""
71
+ user_prompt = f"Analyze the following code snippets for weaknesses:\n\n{code_context}\n\nRespond in JSON format with fields: 'summary', 'weaknesses' (list of strings), 'severity' (high/medium/low)."
72
+ return self._get_response(system_prompt, user_prompt, response_format={"type": "json_object"})
73
+
74
+ class AgentOrchestrator:
75
+ def __init__(self, indexer: Any, openai_api_key: str = None):
76
+ self.indexer = indexer
77
+ self.planner = Planner(openai_api_key=openai_api_key)
78
+ self.analyzer = WeaknessAnalyzer(openai_api_key=openai_api_key)
79
+
80
+ def run_analysis(self, project_overview: str) -> Dict[str, Any]:
81
+ # 1. Plan
82
+ plan = self.planner.plan(project_overview)
83
+
84
+ all_weaknesses = []
85
+ # 2. Execute steps
86
+ for step in plan.get("steps", []):
87
+ sub_q = step.get("sub_question")
88
+ # Search codebase
89
+ results = self.indexer.search(sub_q, limit=3)
90
+ context = "\n---\n".join([r.get("text", "") for r in results])
91
+
92
+ # Analyze
93
+ analysis = self.analyzer.analyze(context)
94
+ all_weaknesses.extend(analysis.get("weaknesses", []))
95
+
96
+ return {
97
+ "plan": plan,
98
+ "weaknesses": list(set(all_weaknesses)),
99
+ "status": "completed"
100
+ }
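
*Editor's sketch:* to illustrate the plan → retrieve → analyze loop above, here is an offline usage example. With a dummy API key, `_get_response` falls back to `_mock_response`, so no OpenAI calls are made; `StubIndexer` is a hypothetical stand-in for `CodeIndexer`.

```python
from app.services.agent_orchestrator import AgentOrchestrator

class StubIndexer:
    """Hypothetical stand-in for CodeIndexer; returns canned search hits."""
    def search(self, query, limit=3):
        return [{"text": "def process(data):\n    return data"}]

orchestrator = AgentOrchestrator(indexer=StubIndexer(), openai_api_key="dummy")
result = orchestrator.run_analysis("A small data-processing service")

print(result["plan"]["reasoning"])   # mock planner output
print(result["weaknesses"])          # deduplicated weaknesses from the analyzer
```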
CriticalThinking/app/services/hf_matcher.py ADDED
@@ -0,0 +1,27 @@
1
+ from huggingface_hub import HfApi
2
+ from typing import List, Dict, Any
3
+
4
+ class HFMatcher:
5
+ def __init__(self):
6
+ self.api = HfApi()
7
+
8
+ def find_replacements(self, description: str, limit: int = 3) -> List[Dict[str, Any]]:
9
+ try:
10
+ models = self.api.list_models(
11
+ search=description,
12
+ sort="downloads",
13
+ direction=-1,
14
+ limit=limit
15
+ )
16
+ results = []
17
+ for model in models:
18
+ results.append({
19
+ "id": model.id,
20
+ "downloads": model.downloads,
21
+ "likes": model.likes,
22
+ "url": f"https://huggingface.co/{model.id}"
23
+ })
24
+ return results
25
+ except Exception as e:
26
+ print(f"HF search failed: {e}")
27
+ return []
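
*Editor's sketch:* a brief usage example for the matcher. `find_replacements` issues a live query against the Hugging Face Hub, so it needs network access; no token is required for public model search.

```python
from app.services.hf_matcher import HFMatcher

matcher = HFMatcher()
for model in matcher.find_replacements("sentiment analysis", limit=3):
    # Each entry carries the model id, download/like counts, and a Hub URL.
    print(model["id"], model["downloads"], model["url"])
```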
CriticalThinking/app/services/improvement_agent.py ADDED
@@ -0,0 +1,29 @@
1
+ from typing import List, Dict, Any
2
+ from app.services.agent_orchestrator import BaseAgent
3
+
4
+ class ImprovementAgent(BaseAgent):
5
+ def generate_improvements(self, weaknesses: List[str]) -> Dict[str, Any]:
6
+ system_prompt = """You are an AI research scientist and senior architect.
7
+ Your goal is to generate impactful and creative ideas for improving a codebase.
8
+ Consider:
9
+ - Refactoring for better scalability.
10
+ - Replacing custom implementations with state-of-the-art Hugging Face models or open-source projects.
11
+ - Improving performance and maintainability."""
12
+ user_prompt = f"Given these weaknesses:\n{weaknesses}\n\nPropose a next-step improvement roadmap. Respond in JSON with format:\n{{\n 'improvements': [\n {{\n 'weakness': 'the identified weakness',\n 'proposal': 'detailed improvement plan',\n 'replacement_search_query': 'query for Hugging Face or GitHub',\n 'interestingness': 1-10,\n 'feasibility': 1-10\n }}\n ]\n}}"
13
+ return self._get_response(system_prompt, user_prompt, response_format={"type": "json_object"})
14
+
15
+ def _mock_response(self, system_prompt: str) -> Any:
16
+ return {
17
+ "improvements": [
18
+ {
19
+ "weakness": "Manual memory management",
20
+ "proposal": "Use a managed library",
21
+ "replacement_search_query": "memory management library"
22
+ },
23
+ {
24
+ "weakness": "Lack of sentiment analysis accuracy",
25
+ "proposal": "Use a pre-trained transformer model",
26
+ "replacement_search_query": "sentiment analysis"
27
+ }
28
+ ]
29
+ }
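
*Editor's sketch:* an offline usage example. With a dummy key the agent returns its `_mock_response` payload, which has the same shape the router later enriches with replacement and web-research results.

```python
from app.services.improvement_agent import ImprovementAgent

agent = ImprovementAgent(openai_api_key="dummy")
roadmap = agent.generate_improvements(["No caching layer", "Hand-rolled sentiment scoring"])

for item in roadmap["improvements"]:
    # Each improvement carries a search query used downstream for HF/GitHub matching.
    print(item["weakness"], "->", item["replacement_search_query"])
```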
CriticalThinking/app/services/indexer.py ADDED
@@ -0,0 +1,120 @@
1
+ import os
2
+ import shutil
3
+ import tempfile
4
+ from typing import List, Dict, Any
5
+ import uuid
6
+ from qdrant_client import QdrantClient
7
+ from qdrant_client.http import models
8
+ from openai import OpenAI
9
+
10
+ class CodeIndexer:
11
+ def __init__(self, qdrant_url: str = ":memory:", openai_api_key: str = None):
12
+ self.qdrant = QdrantClient(qdrant_url)
13
+ self.openai = OpenAI(api_key=openai_api_key)
14
+ self.collection_name = "codebase"
15
+ self._ensure_collection()
16
+
17
+ def _ensure_collection(self):
18
+ collections = self.qdrant.get_collections().collections
19
+ exists = any(c.name == self.collection_name for c in collections)
20
+ if not exists:
21
+ self.qdrant.create_collection(
22
+ collection_name=self.collection_name,
23
+ vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
24
+ )
25
+
26
+ def index_repository(self, repo_url: str):
27
+ import subprocess
28
+ temp_dir = tempfile.mkdtemp()
29
+ try:
30
+ print(f"Cloning {repo_url} into {temp_dir}...")
31
+ if repo_url.startswith("local://"):
32
+ local_path = repo_url.replace("local://", "")
33
+ shutil.copytree(local_path, temp_dir, dirs_exist_ok=True)
34
+ else:
35
+ result = subprocess.run(["git", "clone", "--depth", "1", repo_url, temp_dir], capture_output=True, text=True)
36
+ if result.returncode != 0:
37
+ raise Exception(f"Git clone failed: {result.stderr}")
38
+
39
+ self._index_directory(temp_dir)
40
+ finally:
41
+ shutil.rmtree(temp_dir)
42
+
43
+ def _index_directory(self, root_dir: str):
44
+ points = []
45
+ for root, dirs, files in os.walk(root_dir):
46
+ if ".git" in root:
47
+ continue
48
+ for file in files:
49
+ if file.endswith((".py", ".go", ".js", ".ts", ".md")):
50
+ file_path = os.path.join(root, file)
51
+ relative_path = os.path.relpath(file_path, root_dir)
52
+ with open(file_path, "r", errors="ignore") as f:
53
+ content = f.read()
54
+
55
+ chunks = self._chunk_code(content)
56
+ for i, chunk in enumerate(chunks):
57
+ embedding = self._get_embedding(chunk)
58
+ points.append(models.PointStruct(
59
+ id=str(uuid.uuid4()),
60
+ vector=embedding,
61
+ payload={
62
+ "path": relative_path,
63
+ "chunk_index": i,
64
+ "text": chunk
65
+ }
66
+ ))
67
+
68
+ if points:
69
+ self.qdrant.upsert(
70
+ collection_name=self.collection_name,
71
+ points=points
72
+ )
73
+
74
+ def _chunk_code(self, content: str, max_chars: int = 1500) -> List[str]:
75
+ # Simple chunking by lines for now, ensuring we don't break in the middle of a line
76
+ chunks = []
77
+ lines = content.split("\n")
78
+ current_chunk = []
79
+ current_length = 0
80
+ for line in lines:
81
+ if current_length + len(line) > max_chars and current_chunk:
82
+ chunks.append("\n".join(current_chunk))
83
+ current_chunk = []
84
+ current_length = 0
85
+ current_chunk.append(line)
86
+ current_length += len(line) + 1
87
+ if current_chunk:
88
+ chunks.append("\n".join(current_chunk))
89
+ return chunks
90
+
91
+ def _get_embedding(self, text: str) -> List[float]:
92
+ # Mock embedding if API key is missing or dummy for tests
93
+ api_key = self.openai.api_key or os.getenv("OPENAI_API_KEY")
94
+ if not api_key or api_key == "dummy":
95
+ return [0.0] * 1536
96
+
97
+ response = self.openai.embeddings.create(
98
+ input=text,
99
+ model="text-embedding-3-small"
100
+ )
101
+ return response.data[0].embedding
102
+
103
+ def search(self, query: str, limit: int = 5) -> List[Dict[str, Any]]:
104
+ query_vector = self._get_embedding(query)
105
+ try:
106
+ # Try the modern query_points API
107
+ response = self.qdrant.query_points(
108
+ collection_name=self.collection_name,
109
+ query=query_vector,
110
+ limit=limit
111
+ )
112
+ return [hit.payload for hit in response.points]
113
+ except AttributeError:
114
+ # Fallback for older versions if search exists
115
+ hits = self.qdrant.search(
116
+ collection_name=self.collection_name,
117
+ query_vector=query_vector,
118
+ limit=limit
119
+ )
120
+ return [hit.payload for hit in hits]
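
*Editor's sketch:* a usage example for the indexer, using the `local://` shortcut handled by `index_repository` and an in-memory Qdrant collection. With a dummy key, `_get_embedding` returns zero vectors, so documents are stored but similarity ranking is not meaningful; set a real `OPENAI_API_KEY` for semantic search.

```python
from app.services.indexer import CodeIndexer

# In-memory vector store; the dummy key triggers the zero-vector embedding fallback.
indexer = CodeIndexer(qdrant_url=":memory:", openai_api_key="dummy")

# "local://." indexes the current working directory instead of cloning a remote repo.
indexer.index_repository("local://.")

for hit in indexer.search("database connection handling", limit=3):
    print(hit["path"], hit["chunk_index"])
```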
CriticalThinking/app/services/web_researcher.py ADDED
@@ -0,0 +1,115 @@
1
+ from gradio_client import Client
2
+ import os
3
+ from typing import List, Dict, Any
4
+
5
+ import requests
6
+ import uuid
7
+
8
+ class WebResearcher:
9
+ def __init__(self):
10
+ self.web_search_space = "victor/websearch"
11
+ self.hf_search_space = "John6666/testwarm"
12
+ self.github_mcp_url = "https://harvesthealth-github-mcp-server.hf.space/"
13
+ self._web_client = None
14
+ self._hf_client = None
15
+
16
+ @property
17
+ def web_client(self):
18
+ if self._web_client is None:
19
+ try:
20
+ self._web_client = Client(self.web_search_space)
21
+ except Exception as e:
22
+ print(f"Failed to connect to Gradio Client {self.web_search_space}: {e}")
23
+ return self._web_client
24
+
25
+ @property
26
+ def hf_client(self):
27
+ if self._hf_client is None:
28
+ try:
29
+ self._hf_client = Client(self.hf_search_space)
30
+ except Exception as e:
31
+ print(f"Failed to connect to Gradio Client {self.hf_search_space}: {e}")
32
+ return self._hf_client
33
+
34
+ def search_web(self, query: str, search_type: str = "search", num_results: int = 4) -> str:
35
+ if self.web_client is None:
36
+ return "Web search unavailable."
37
+ try:
38
+ return self.web_client.predict(
39
+ query=query,
40
+ search_type=search_type,
41
+ num_results=num_results,
42
+ api_name="/search_web"
43
+ )
44
+ except Exception as e:
45
+ return f"Web search failed: {e}"
46
+
47
+ def search_hf(self, query: str, repo_types: List[str] = ["model", "space"], limit: int = 5) -> str:
48
+ if self.hf_client is None:
49
+ return "HF search unavailable."
50
+ try:
51
+ result = self.hf_client.predict(
52
+ repo_types=repo_types,
53
+ sort="trending_score",
54
+ sort_method="descending order",
55
+ filter_str="",
56
+ search_str=query,
57
+ author="",
58
+ tags="",
59
+ infer="all",
60
+ gated="all",
61
+ appr=["auto", "manual"],
62
+ size_categories=[],
63
+ limit=limit,
64
+ hardware=[],
65
+ stage=[],
66
+ fetch_detail=["Space Runtime"],
67
+ show_labels=["Type", "ID", "Likes", "DLs"],
68
+ api_name="/search"
69
+ )
70
+ # result[0] is a Dict with headers and data
71
+ if isinstance(result, tuple) and len(result) > 0:
72
+ data = result[0].get("data", [])
73
+ return f"Found HF components: {data}"
74
+ return str(result)
75
+ except Exception as e:
76
+ return f"HF search failed: {e}"
77
+
78
+ def research_github(self, topic: str) -> str:
79
+ # Try specialized GitHub MCP search first
80
+ try:
81
+ mcp_result = self.search_github_mcp(topic)
82
+ if "failed" not in mcp_result.lower() and "unavailable" not in mcp_result.lower():
83
+ return mcp_result
84
+ except Exception as e:
85
+ print(f"GitHub MCP search failed, falling back to web search: {e}")
86
+
87
+ # Fallback to web search
88
+ query = f"site:github.com {topic} repository"
89
+ return self.search_web(query)
90
+
91
+ def search_github_mcp(self, query: str) -> str:
92
+ payload = {
93
+ "jsonrpc": "2.0",
94
+ "id": str(uuid.uuid4()),
95
+ "method": "tools/call",
96
+ "params": {
97
+ "name": "search_repositories",
98
+ "arguments": {
99
+ "query": query
100
+ }
101
+ }
102
+ }
103
+ try:
104
+ response = requests.post(self.github_mcp_url, json=payload, timeout=30)
105
+ response.raise_for_status()
106
+ result = response.json()
107
+ if "result" in result:
108
+ return str(result["result"])
109
+ return str(result)
110
+ except Exception as e:
111
+ return f"GitHub MCP search failed: {e}"
112
+
113
+ def research_hf_spaces(self, topic: str) -> str:
114
+ # Use deep HF search for better results
115
+ return self.search_hf(topic, repo_types=["space"])
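
*Editor's sketch:* a usage example for the researcher. These calls reach out to the public Hugging Face Spaces and the MCP endpoint hard-coded above, so they need network access and degrade to error strings when those services are unreachable.

```python
from app.services.web_researcher import WebResearcher

researcher = WebResearcher()

# GitHub via the MCP server, falling back to a site:github.com web search.
print(researcher.research_github("code embedding model"))

# Trending Spaces matching the query, via the configured HF search Space.
print(researcher.research_hf_spaces("code review assistant"))
```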
CriticalThinking/requirements.txt ADDED
@@ -0,0 +1,12 @@
1
+ fastapi
2
+ uvicorn
3
+ openai
4
+ pydantic
5
+ pydantic-settings
6
+ qdrant-client
7
+ huggingface_hub
8
+ python-multipart
9
+ pytest
10
+ httpx
11
+ gradio_client
12
+ requests
CriticalThinking/tests/test_api.py ADDED
@@ -0,0 +1,34 @@
1
+ from fastapi.testclient import TestClient
2
+ from app.main import app
3
+ import time
4
+ from unittest.mock import patch, MagicMock
5
+
6
+ client = TestClient(app)
7
+
8
+ @patch("app.api.router.WebResearcher")
9
+ def test_analyze_flow(mock_web_researcher_class):
10
+ # Mock WebResearcher
11
+ mock_web_researcher = MagicMock()
12
+ mock_web_researcher_class.return_value = mock_web_researcher
13
+ mock_web_researcher.research_github.return_value = "Mocked GitHub results"
14
+ mock_web_researcher.research_hf_spaces.return_value = "Mocked HF Spaces results"
15
+
16
+ # Submit analysis
17
+ response = client.post("/analyze", json={"repo_url": "local://.", "project_description": "Test Project"})
18
+ assert response.status_code == 200
19
+ task_id = response.json()["task_id"]
20
+ assert task_id
21
+
22
+ # Wait a bit for background task
23
+ time.sleep(1)
24
+
25
+ response = client.get(f"/report/{task_id}")
26
+ assert response.status_code == 200
27
+ data = response.json()
28
+ print(f"Task status: {data['status']}")
29
+ if data['status'] == 'failed':
30
+ print(f"Error: {data.get('error')}")
31
+
32
+ assert data["status"] == "completed"
33
+ assert "github_research" in data["report"]["improvements"][0]
34
+ assert "hf_spaces_research" in data["report"]["improvements"][0]
CriticalThinking/tests/test_improvements.py ADDED
@@ -0,0 +1,24 @@
1
+ from app.services.hf_matcher import HFMatcher
2
+ from app.services.improvement_agent import ImprovementAgent
3
+ from unittest.mock import MagicMock
4
+
5
+ def test_hf_matcher():
6
+ matcher = HFMatcher()
7
+ # Mocking HFApi.list_models
8
+ matcher.api.list_models = MagicMock()
9
+ mock_model = MagicMock()
10
+ mock_model.id = "test/model"
11
+ mock_model.downloads = 100
12
+ mock_model.likes = 10
13
+ matcher.api.list_models.return_value = [mock_model]
14
+
15
+ results = matcher.find_replacements("sentiment analysis")
16
+ assert len(results) == 1
17
+ assert results[0]["id"] == "test/model"
18
+
19
+ def test_improvement_agent():
20
+ agent = ImprovementAgent(openai_api_key="dummy")
21
+ result = agent.generate_improvements(["Weakness 1"])
22
+ assert "improvements" in result
23
+ assert len(result["improvements"]) > 0
24
+ assert "replacement_search_query" in result["improvements"][0]
CriticalThinking/tests/test_indexer.py ADDED
@@ -0,0 +1,24 @@
1
+ import pytest
2
+ from app.services.indexer import CodeIndexer
3
+ import os
4
+
5
+ def test_indexer_basic():
6
+ # Use in-memory Qdrant and dummy API key
7
+ indexer = CodeIndexer(qdrant_url=":memory:", openai_api_key="dummy-key")
8
+
9
+ # Create a dummy repo
10
+ repo_content = "def add(a, b):\n return a + b\n\n# This is a comment\ndef sub(a, b):\n return a - b\n"
11
+
12
+ # Test _chunk_code
13
+ chunks = indexer._chunk_code(repo_content, max_chars=40)
14
+ assert len(chunks) > 1
15
+
16
+ # Test _index_directory (will use mock embedding because of dummy key and our logic)
17
+ # We need to make sure _get_embedding handles the dummy key
18
+ # Actually, our logic in _get_embedding checks for API key existence.
19
+ # Let's override _get_embedding for the test to be safe.
20
+ indexer._get_embedding = lambda x: [0.1] * 1536
21
+
22
+ indexer._index_directory("app") # Index some local files
23
+ results = indexer.search("health", limit=1)
24
+ assert isinstance(results, list)
CriticalThinking/tests/test_main.py ADDED
@@ -0,0 +1,9 @@
1
+ from fastapi.testclient import TestClient
2
+ from app.main import app
3
+
4
+ client = TestClient(app)
5
+
6
+ def test_health():
7
+ response = client.get("/health")
8
+ assert response.status_code == 200
9
+ assert response.json() == {"status": "healthy"}
CriticalThinking/tests/test_orchestrator.py ADDED
@@ -0,0 +1,18 @@
1
+ import pytest
2
+ from app.services.agent_orchestrator import AgentOrchestrator
3
+ from unittest.mock import MagicMock
4
+
5
+ def test_orchestrator_flow():
6
+ mock_indexer = MagicMock()
7
+ mock_indexer.search.return_value = [{"text": "print('hello')"}]
8
+
9
+ orchestrator = AgentOrchestrator(indexer=mock_indexer, openai_api_key="dummy")
10
+
11
+ # Run analysis (will use mock responses from BaseAgent._mock_response)
12
+ result = orchestrator.run_analysis("A simple python script")
13
+
14
+ assert "plan" in result
15
+ assert "weaknesses" in result
16
+ assert result["status"] == "completed"
17
+ assert len(result["weaknesses"]) > 0
18
+ assert mock_indexer.search.called
CriticalThinking/tests/test_web_researcher.py ADDED
@@ -0,0 +1,40 @@
1
+ import pytest
2
+ from app.services.web_researcher import WebResearcher
3
+ from unittest.mock import MagicMock, patch
4
+
5
+ @patch("requests.post")
6
+ def test_web_researcher_github_mcp(mock_post):
7
+ researcher = WebResearcher()
8
+ mock_response = MagicMock()
9
+ mock_response.json.return_value = {"result": "MCP GitHub Result"}
10
+ mock_response.status_code = 200
11
+ mock_post.return_value = mock_response
12
+
13
+ result = researcher.research_github("sentiment analysis")
14
+ assert "MCP GitHub Result" in result
15
+ mock_post.assert_called_once()
16
+
17
+ def test_web_researcher_github_fallback():
18
+ researcher = WebResearcher()
19
+ # Mock search_github_mcp to fail
20
+ researcher.search_github_mcp = MagicMock(return_value="GitHub MCP search failed")
21
+
22
+ with patch("app.services.web_researcher.Client") as mock_client_class:
23
+ mock_instance = MagicMock()
24
+ mock_client_class.return_value = mock_instance
25
+ mock_instance.predict.return_value = "Fallback Web Result"
26
+
27
+ result = researcher.research_github("sentiment analysis")
28
+ assert "Fallback Web Result" in result
29
+
30
+ def test_web_researcher_hf():
31
+ researcher = WebResearcher()
32
+ with patch("app.services.web_researcher.Client") as mock_client_class:
33
+ mock_instance = MagicMock()
34
+ mock_client_class.return_value = mock_instance
35
+ mock_instance.predict.return_value = ({"data": [["space", "example/space", 10, 100]]}, [])
36
+
37
+ result = researcher.research_hf_spaces("sentiment analysis")
38
+ assert "Found HF components" in result
39
+ # Check that it tried to connect to John6666/testwarm
40
+ mock_client_class.assert_any_call("John6666/testwarm")
Dockerfile ADDED
@@ -0,0 +1,42 @@
1
+ # Use an official Python runtime as a parent image
2
+ FROM python:3.11-slim-bullseye
3
+
4
+ # Set environment variables
5
+ ENV PYTHONUNBUFFERED=1 \
6
+ PYTHONDONTWRITEBYTECODE=1 \
7
+ DEBIAN_FRONTEND=noninteractive \
8
+ PYTHONPATH=.:CriticalThinking
9
+
10
+ # Install system dependencies
11
+ RUN apt-get update && apt-get install -y --no-install-recommends \
12
+ build-essential \
13
+ git \
14
+ wget \
15
+ && rm -rf /var/lib/apt/lists/*
16
+
17
+ # Set working directory
18
+ WORKDIR /app
19
+
20
+ # Install uv and requirements
21
+ COPY CriticalThinking/requirements.txt .
22
+ RUN pip install --no-cache-dir uv && \
23
+ uv pip install --system --no-cache-dir -r requirements.txt && \
24
+ uv pip install --system --no-cache-dir gradio uvicorn
25
+
26
+ # Create a non-root user and switch to it
27
+ # Hugging Face Spaces use a user with UID 1000
28
+ RUN useradd -m -u 1000 user
29
+ USER user
30
+ ENV PATH="/home/user/.local/bin:$PATH"
31
+ WORKDIR /home/user/app
32
+
33
+ # Copy the rest of the application
34
+ # Use --chown=user to ensure the user has permissions
35
+ COPY --chown=user . .
36
+
37
+ # Expose the port
38
+ EXPOSE 7860
39
+
40
+ # Command to run the application
41
+ # We use uvicorn to run the hf_app:app
42
+ CMD ["uvicorn", "hf_app:app", "--host", "0.0.0.0", "--port", "7860"]
LICENSE ADDED
@@ -0,0 +1,53 @@
1
+ The AI Scientist Source Code License
2
+ Version 1.0, December 2025
3
+
4
+ This license is based on the Responsible AI Source Code License v1.1 (http://licenses.ai/).
5
+
6
+ TERMS AND CONDITIONS
7
+
8
+ The AI Scientist Source Code License (“License”) governs the use of the accompanying software. If you access or use the software, you accept the License. If you do not accept the License, do not access or use the software.
9
+
10
+ 1. Definitions.
11
+ (i) "License" means the terms and conditions for use, reproduction, and distribution as defined by Sections one (1) through eight (8) of this document.
12
+ (ii) "Licensor" means Sakana AI, the copyright owner or legal entity authorized by the copyright owner that is granting the License.
13
+ (iii) "You" (or "Your") means an individual or legal entity exercising permissions granted by this License.
14
+ (iv) The terms “reproduce”, “reproduction”, “derivative works”, and “distribution” have the same meaning here as under U.S. Copyright Law.
15
+ (v) “Contribution” means the original software, additions to the original software, modifications to the original software, or derivative works of the original software.
16
+ (vi) "Contributor" means any person or Licensor who provides a Contribution.
17
+
18
+ 2. Grant of Rights.
19
+ Subject to this License, each Contributor grants You a non-exclusive, worldwide, royalty-free copyright license to reproduce its Contribution, prepare derivative works of its Contribution, and distribute its Contribution or any derivative works of its Contribution that You create.
20
+
21
+ 3. Restrictions.
22
+ 3.1. If You distribute any portion of the Contribution, You must include a complete copy of this License with the distribution; and
23
+ 3.2. You agree that the Contribution, or any derivative work of the Contribution, will not be used by You or any third party subject to Your control, to:
24
+
25
+ a. Surveillance
26
+ i. Detect or infer any legally protected class or aspect of any person, as defined by U.S. Federal Law; and
27
+ ii. Detect or infer aspects and/or features of an identity any person, such as name, family name, address, gender, sexual orientation, race, religion, age, location (at any geographical level), skin color, society or political affiliations, employment status and/or employment history, and health and medical conditions.
28
+
29
+ b. Computer Generated Media
30
+ i. Synthesize and/or modify audio-realistic and/or video-realistic representations of people and events, without including a caption, watermark, and/or metadata file indicating that the representations were generated using the Contribution.
31
+
32
+ c. Health Care
33
+ i. Predict the likelihood that any person will request to file an insurance claim;
34
+ ii. Diagnose a medical condition without human oversight.
35
+
36
+ d. Criminal
37
+ i. Predict the likelihood that a crime will be committed by any person or predict the likelihood of any person being a criminal based on facial attributes or personal data.
38
+
39
+ e. Scientific Manuscripts and Academic Integrity (The "AI Scientist" Clause)
40
+ i. Generate or disseminate scientific manuscripts, research papers, or technical reports without expressly and intelligibly disclaiming, in a prominent manner (e.g., in the abstract, or a dedicated 'Disclosure' or 'Methods' section), that the content was machine-generated or produced using The AI Scientist.
41
+
42
+ 3.3. Restrictions referenced in Section 3.2 MUST be included as an enforceable provision by You in any type of legal agreement governing the use and/or distribution of the Work or any Derivative Works.
43
+
44
+ 4. Termination
45
+ Upon the occurrence of any of the restricted uses listed above in “3. Restrictions”, Licensor shall have the right to terminate this License Agreement and require You to immediately return or destroy all copies of the Contribution. Termination of this License Agreement shall be in addition to and not in lieu of any other remedies available to Licensor.
46
+
47
+ 5. Disclaimer of Warranty.
48
+ Unless required by applicable law, Licensor provides any Contribution on an "As-Is" basis, without WARRANTIES OR CONDITIONS OF ANY KIND.
49
+
50
+ 6. Limitation of Liability.
51
+ In no event shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages arising as a result of this License.
52
+
53
+ END OF TERMS AND CONDITIONS
README.md CHANGED
@@ -1,10 +1,412 @@
1
  ---
2
  title: Critical Code Agent
3
- emoji: 🐨
4
- colorFrom: blue
5
- colorTo: green
6
  sdk: docker
7
  pinned: false
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
  title: Critical Code Agent
3
+ emoji: 🦀
4
+ colorFrom: red
5
+ colorTo: blue
6
  sdk: docker
7
  pinned: false
8
+ app_port: 7860
9
  ---
10
 
11
+ <h1 align="center">
12
+ <a href="https://github.com/SakanaAI/AI-Scientist/blob/main/docs/logo_2.png">
13
+ <img src="docs/logo_2.png" width="215" /></a><br>
14
+ <b>The AI Scientist: Towards Fully Automated</b><br>
15
+ <b>Open-Ended Scientific Discovery 🧑‍🔬</b><br>
16
+ </h1>
17
+
18
+ <p align="center">
19
+ 📚 <a href="https://arxiv.org/abs/2408.06292">[Paper]</a> |
20
+ 📝 <a href="https://sakana.ai/ai-scientist/">[Blog Post]</a> |
21
+ 📂 <a href="https://drive.google.com/drive/folders/1G7A0wTqfXVa-cpexjk0oaXakaSJwffEt">[Drive Folder]</a>
22
+ </p>
23
+
24
+ One of the grand challenges of artificial intelligence is developing agents capable of conducting scientific research and discovering new knowledge. While frontier models have already been used to aid human scientists—for example, for brainstorming ideas or writing code—they still require extensive manual supervision or are heavily constrained to specific tasks.
25
+
26
+ We're excited to introduce **The AI Scientist**, the first comprehensive system for fully automatic scientific discovery, enabling Foundation Models such as Large Language Models (LLMs) to perform research independently.
27
+
28
+ We provide all runs and data from our paper [here](https://drive.google.com/drive/folders/1G7A0wTqfXVa-cpexjk0oaXakaSJwffEt?usp=sharing), where we run each base model on each template for approximately 50 ideas. We *highly* recommend reading through some of the [Claude papers](https://drive.google.com/drive/folders/1Mmpz6M1FK4q8e-SewgZcUzdeD0Q2zC39?usp=sharing) to get a sense of the system's strengths and weaknesses. Here are some example papers generated by **The AI Scientist** 📝:
29
+
30
+ 1. [DualScale Diffusion: Adaptive Feature Balancing for Low-Dimensional Generative Models](https://github.com/SakanaAI/AI-Scientist/blob/main/example_papers/adaptive_dual_scale_denoising.pdf)
31
+ 2. [Multi-scale Grid Noise Adaptation: Enhancing Diffusion Models For Low-dimensional Data](https://github.com/SakanaAI/AI-Scientist/blob/main/example_papers/grid_based_noise_adaptation.pdf)
32
+ 3. [GAN-Enhanced Diffusion: Boosting Sample Quality and Diversity](https://github.com/SakanaAI/AI-Scientist/blob/main/example_papers/gan_diffusion.pdf)
33
+ 4. [DualDiff: Enhancing Mode Capture in Low-dimensional Diffusion Models via Dual-expert Denoising](https://github.com/SakanaAI/AI-Scientist/tree/main/example_papers/dual_expert_denoiser.pdf)
34
+ 5. [StyleFusion: Adaptive Multi-style Generation in Character-Level Language Models](https://github.com/SakanaAI/AI-Scientist/blob/main/example_papers/multi_style_adapter.pdf)
35
+ 6. [Adaptive Learning Rates for Transformers via Q-Learning](https://github.com/SakanaAI/AI-Scientist/tree/main/example_papers/rl_lr_adaptation.pdf)
36
+ 7. [Unlocking Grokking: A Comparative Study of Weight Initialization Strategies in Transformer Models](https://github.com/SakanaAI/AI-Scientist/tree/main/example_papers/weight_initialization_grokking.pdf)
37
+ 8. [Grokking Accelerated: Layer-wise Learning Rates for Transformer Generalization](https://github.com/SakanaAI/AI-Scientist/tree/main/example_papers/layerwise_lr_grokking.pdf)
38
+ 9. [Grokking Through Compression: Unveiling Sudden Generalization via Minimal Description Length](https://github.com/SakanaAI/AI-Scientist/tree/main/example_papers/mdl_grokking_correlation.pdf)
39
+ 10. [Accelerating Mathematical Insight: Boosting Grokking Through Strategic Data Augmentation](https://github.com/SakanaAI/AI-Scientist/tree/main/example_papers/data_augmentation_grokking.pdf)
40
+
41
+ > **Note:**
42
+ > **Caution!** This codebase will execute LLM-written code. There are various risks and challenges associated with this autonomy, including the use of potentially dangerous packages, web access, and potential spawning of processes. Use at your own discretion. Please make sure to [containerize](#containerization) and restrict web access appropriately.
43
+
44
+ <p align="center">
45
+ <a href="https://github.com/SakanaAI/AI-Scientist/blob/main/example_papers/adaptive_dual_scale_denoising/adaptive_dual_scale_denoising.pdf"><img src="https://github.com/SakanaAI/AI-Scientist/blob/main/docs/anim-ai-scientist.gif" alt="Adaptive Dual Scale Denoising" width="80%" />
46
+ </a></p>
47
+
48
+ ## Table of Contents
49
+
50
+ 1. [Introduction](#introduction)
51
+ 2. [Requirements](#requirements)
52
+ - [Installation](#installation)
53
+ - [Supported Models and API Keys](#supported-models-and-api-keys)
54
+ 3. [Setting Up the Templates](#setting-up-the-templates)
55
+ - [NanoGPT Template](#nanogpt-template)
56
+ - [2D Diffusion Template](#2d-diffusion-template)
57
+ - [Grokking Template](#grokking-template)
58
+ 4. [Run AI Scientist Paper Generation Experiments](#run-ai-scientist-paper-generation-experiments)
59
+ 5. [Getting an LLM-Generated Paper Review](#getting-an-llm-generated-paper-review)
60
+ 6. [Making Your Own Template](#making-your-own-template)
61
+ - [Community-Contributed Templates](#community-contributed-templates)
62
+ 7. [Template Resources](#template-resources)
63
+ 8. [Citing The AI Scientist](#citing-the-ai-scientist)
64
+ 9. [Frequently Asked Questions](#frequently-asked-questions)
65
+ 10. [Containerization](#containerization)
66
+
67
+ ## Introduction
68
+
69
+ We provide three templates, which were used in our paper, covering the following domains: **NanoGPT**, **2D Diffusion**, and **Grokking**. These templates enable The AI Scientist to generate ideas and conduct experiments in these areas. We accept contributions of new templates from the community, but please note that they are not maintained by us. All other templates beyond the three provided are community contributions.
70
+
71
+ ## Requirements
72
+
73
+ This code is designed to run on Linux with NVIDIA GPUs using CUDA and PyTorch. Support for other GPU architectures may be possible by following the [PyTorch guidelines](https://pytorch.org/get-started/locally/). The current templates would likely take an infeasible amount of time on CPU-only machines. Running on other operating systems may require significant adjustments.
74
+
75
+ ### Installation
76
+
77
+ ```bash
78
+ conda create -n ai_scientist python=3.11
79
+ conda activate ai_scientist
80
+ # Install pdflatex
81
+ sudo apt-get install texlive-full
82
+
83
+ # Install PyPI requirements
84
+ pip install -r requirements.txt
85
+ ```
86
+
87
+ **Note:** Installing `texlive-full` can take a long time. You may need to [hold Enter](https://askubuntu.com/questions/956006/pregenerating-context-markiv-format-this-may-take-some-time-takes-forever) during the installation.
88
+
89
+ ### Supported Models and API Keys
90
+
91
+ We support a wide variety of models, including open-weight and API-only models. In general, we recommend using only frontier models above the capability of the original GPT-4. To see a full list of supported models, see [here](https://github.com/SakanaAI/AI-Scientist/blob/main/ai_scientist/llm.py).
92
+
93
+ #### OpenAI API (GPT-4o, GPT-4o-mini, o1 models)
94
+
95
+ By default, this uses the `OPENAI_API_KEY` environment variable.
96
+
97
+ #### Anthropic API (Claude Sonnet 3.5)
98
+
99
+ By default, this uses the `ANTHROPIC_API_KEY` environment variable.
100
+
101
+ ##### Claude Models via Bedrock
102
+
103
+ For Claude models provided by [Amazon Bedrock](https://aws.amazon.com/bedrock/), please install these additional packages:
104
+
105
+ ```bash
106
+ pip install anthropic[bedrock]
107
+ ```
108
+
109
+ Next, specify a set of valid [AWS Credentials](https://docs.aws.amazon.com/cli/v1/userguide/cli-configure-envvars.html) and the target [AWS Region](https://docs.aws.amazon.com/bedrock/latest/userguide/bedrock-regions.html):
110
+
111
+ Set the environment variables: `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_REGION_NAME`.
112
+
113
+ ##### Claude Models via Vertex AI
114
+
115
+ For Claude models provided by [Vertex AI Model Garden](https://cloud.google.com/model-garden?hl=en), please install these additional packages:
116
+
117
+ ```bash
118
+ pip install google-cloud-aiplatform
119
+ pip install anthropic[vertex]
120
+ ```
121
+
122
+ Next, set up valid authentication for a [Google Cloud project](https://cloud.google.com/vertex-ai/docs/authentication), for example by providing the region and project ID:
123
+
124
+ ```bash
125
+ export CLOUD_ML_REGION="REGION" # for Model Garden call
126
+ export ANTHROPIC_VERTEX_PROJECT_ID="PROJECT_ID" # for Model Garden call
127
+ export VERTEXAI_LOCATION="REGION" # for Aider/LiteLLM call
128
+ export VERTEXAI_PROJECT="PROJECT_ID" # for Aider/LiteLLM call
129
+ ```
130
+
131
+ #### DeepSeek API (deepseek-chat, deepseek-reasoner)
132
+ By default, this uses the `DEEPSEEK_API_KEY` environment variable.
133
+
134
+ #### OpenRouter API (Llama3.1)
135
+
136
+ By default, this uses the `OPENROUTER_API_KEY` environment variable.
137
+
138
+ #### Google Gemini
139
+ We support Google Gemini models (e.g., "gemini-1.5-flash", "gemini-1.5-pro") via the [google-generativeai](https://pypi.org/project/google-generativeai) Python library. By default, it uses the environment variable:
140
+
141
+ ```bash
142
+ export GEMINI_API_KEY="YOUR GEMINI API KEY"
143
+ ```
144
+
145
+ #### Semantic Scholar API (Literature Search)
146
+
147
+ Our code can also optionally use a Semantic Scholar API Key (`S2_API_KEY`) for higher throughput [if you have one](https://www.semanticscholar.org/product/api), though it should work without it in principle. If you have problems with Semantic Scholar, you can skip the literature search and citation phases of paper generation.
148
+
149
+ Be sure to provide the key for the model used for your runs, e.g.:
150
+
151
+ ```bash
152
+ export OPENAI_API_KEY="YOUR KEY HERE"
153
+ export S2_API_KEY="YOUR KEY HERE"
154
+ ```
155
+
156
+ #### OpenAlex API (Literature Search Alternative)
157
+
158
+ The OpenAlex API can be used as an alternative if you do not have a Semantic Scholar API key.
159
+ OpenAlex does not require an API key.
160
+
161
+ ```bash
162
+ pip install pyalex
163
+ export OPENALEX_MAIL_ADDRESS="YOUR EMAIL ADDRESS"
164
+ ```
165
+
166
+ And specify `--engine openalex` when you execute the AI Scientist code.
167
+
168
+ Note that this option is experimental and intended for those who do not have a Semantic Scholar API key.
169
+
170
+ ## Setting Up the Templates
171
+
172
+ This section provides instructions for setting up each of the three templates used in our paper. Before running The AI Scientist experiments, please ensure you have completed the setup steps for the templates you are interested in.
173
+
174
+ ### NanoGPT Template
175
+
176
+ **Description:** This template investigates transformer-based autoregressive next-token prediction tasks.
177
+
178
+ **Setup Steps:**
179
+
180
+ 1. **Prepare the data:**
181
+
182
+ ```bash
183
+ python data/enwik8/prepare.py
184
+ python data/shakespeare_char/prepare.py
185
+ python data/text8/prepare.py
186
+ ```
187
+
188
+ 2. **Create baseline runs (machine dependent):**
189
+
190
+ ```bash
191
+ # Set up NanoGPT baseline run
192
+ # NOTE: YOU MUST FIRST RUN THE PREPARE SCRIPTS ABOVE!
193
+ cd templates/nanoGPT
194
+ python experiment.py --out_dir run_0
195
+ python plot.py
196
+ ```
197
+
198
+ ### 2D Diffusion Template
199
+
200
+ **Description:** This template studies improving the performance of diffusion generative models on low-dimensional datasets.
201
+
202
+ **Setup Steps:**
203
+
204
+ 1. **Install dependencies:**
205
+
206
+ ```bash
207
+ # Set up 2D Diffusion
208
+ git clone https://github.com/gregversteeg/NPEET.git
209
+ cd NPEET
210
+ pip install .
211
+ pip install scikit-learn
212
+ ```
213
+
214
+ 2. **Create baseline runs:**
215
+
216
+ ```bash
217
+ # Set up 2D Diffusion baseline run
218
+ cd templates/2d_diffusion
219
+ python experiment.py --out_dir run_0
220
+ python plot.py
221
+ ```
222
+
223
+ ### Grokking Template
224
+
225
+ **Description:** This template investigates questions about generalization and learning speed in deep neural networks.
226
+
227
+ **Setup Steps:**
228
+
229
+ 1. **Install dependencies:**
230
+
231
+ ```bash
232
+ # Set up Grokking
233
+ pip install einops
234
+ ```
235
+
236
+ 2. **Create baseline runs:**
237
+
238
+ ```bash
239
+ # Set up Grokking baseline run
240
+ cd templates/grokking
241
+ python experiment.py --out_dir run_0
242
+ python plot.py
243
+ ```
244
+
245
+ ## Run AI Scientist Paper Generation Experiments
246
+
247
+ **Note:** Please ensure the setup steps above are completed before running these experiments.
248
+
249
+ ```bash
250
+ conda activate ai_scientist
251
+ # Run the paper generation.
252
+ python launch_scientist.py --model "gpt-4o-2024-05-13" --experiment nanoGPT_lite --num-ideas 2
253
+ python launch_scientist.py --model "claude-3-5-sonnet-20241022" --experiment nanoGPT_lite --num-ideas 2
254
+ ```
255
+
256
+ If you have more than one GPU, use the `--parallel` option to parallelize ideas across multiple GPUs.
257
+
258
+ ## Getting an LLM-Generated Paper Review
259
+
260
+ ```python
261
+ import openai
262
+ from ai_scientist.perform_review import load_paper, perform_review
263
+
264
+ client = openai.OpenAI()
265
+ model = "gpt-4o-2024-05-13"
266
+
267
+ # Load paper from PDF file (raw text)
268
+ paper_txt = load_paper("report.pdf")
269
+
270
+ # Get the review dictionary
271
+ review = perform_review(
272
+ paper_txt,
273
+ model,
274
+ client,
275
+ num_reflections=5,
276
+ num_fs_examples=1,
277
+ num_reviews_ensemble=5,
278
+ temperature=0.1,
279
+ )
280
+
281
+ # Inspect review results
282
+ review["Overall"] # Overall score (1-10)
283
+ review["Decision"] # 'Accept' or 'Reject'
284
+ review["Weaknesses"] # List of weaknesses (strings)
285
+ ```
286
+
287
+ To run batch analysis:
288
+
289
+ ```bash
290
+ cd review_iclr_bench
291
+ python iclr_analysis.py --num_reviews 500 --batch_size 100 --num_fs_examples 1 --num_reflections 5 --temperature 0.1 --num_reviews_ensemble 5
292
+ ```
293
+
294
+ ## Making Your Own Template
295
+
296
+ If there is an area of study you would like **The AI Scientist** to explore, it is straightforward to create your own templates. In general, follow the structure of the existing templates, which consist of:
297
+
298
+ - `experiment.py` — This is the main script where the core content is. It takes an argument `--out_dir`, which specifies where it should create the folder and save the relevant information from the run.
299
+ - `plot.py` — This script takes the information from the `run` folders and creates plots. The code should be clear and easy to edit.
300
+ - `prompt.json` — Put information about your template here.
301
+ - `seed_ideas.json` — Place example ideas here. You can also try to generate ideas without any examples and then pick the best one or two to put here.
302
+ - `latex/template.tex` — We recommend using our LaTeX folder, but be sure to replace the pre-loaded citations with ones you expect to be more relevant.
303
+
304
+ The key to making new templates work is matching the base filenames and output JSONs to the existing format; everything else is free to change.
305
+ You should also ensure that the `template.tex` file is updated to use the correct citation style / base plots for your template.
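+
+ As a rough, hedged sketch of the expected interface (the metric names and values below are placeholders; see the existing templates for the authoritative structure), a new `experiment.py` needs to accept `--out_dir` and write a `final_info.json` whose entries expose a `"means"` field, since `launch_scientist.py` reads `run_0/final_info.json` and extracts `v["means"]` for each metric:
+
+ ```python
+ # Minimal skeleton for a new template's experiment.py; metrics are placeholders.
+ import argparse
+ import json
+ import os
+
+
+ def run_experiment():
+     # Replace with your actual experiment; return metric name -> mean value.
+     return {"eval_loss": 0.123, "train_time_seconds": 42.0}
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--out_dir", type=str, required=True)
+     args = parser.parse_args()
+
+     os.makedirs(args.out_dir, exist_ok=True)
+     results = run_experiment()
+     # launch_scientist.py expects each entry to carry a "means" field.
+     final_info = {k: {"means": v} for k, v in results.items()}
+     with open(os.path.join(args.out_dir, "final_info.json"), "w") as f:
+         json.dump(final_info, f, indent=4)
+ ```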
306
+
307
+ ### Community-Contributed Templates
308
+
309
+ We welcome community contributions in the form of new templates. While these are not maintained by us, we are delighted to highlight your templates to others. Below, we list community-contributed templates along with links to their pull requests (PRs):
310
+
311
+ - Infectious Disease Modeling (`seir`) - [PR #137](https://github.com/SakanaAI/AI-Scientist/pull/137)
312
+ - Image Classification with MobileNetV3 (`mobilenetV3`) - [PR #141](https://github.com/SakanaAI/AI-Scientist/pull/141)
313
+ - Sketch RNN (`sketch_rnn`) - [PR #143](https://github.com/SakanaAI/AI-Scientist/pull/143)
314
+ - AI in Quantum Chemistry (`MACE`) - [PR #157](https://github.com/SakanaAI/AI-Scientist/pull/157)
315
+ - Earthquake Prediction (`earthquake-prediction`) - [PR #167](https://github.com/SakanaAI/AI-Scientist/pull/167)
316
+ - Tensorial Radiance Fields (`tensorf`) - [PR #175](https://github.com/SakanaAI/AI-Scientist/pull/175)
317
+ - Large Language Model Steering / Probes (`probes`) - [PR #215](https://github.com/SakanaAI/AI-Scientist/pull/215)
318
+
319
+ *This section is reserved for community contributions. Please submit a pull request to add your template to the list! Describe the template in the PR description, and also show examples of the generated papers.*
320
+
321
+ ## Template Resources
322
+
323
+ We provide three templates, which heavily use code from other repositories, credited below:
324
+
325
+ - **NanoGPT Template** uses code from [NanoGPT](https://github.com/karpathy/nanoGPT) and this [PR](https://github.com/karpathy/nanoGPT/pull/254).
326
+ - **2D Diffusion Template** uses code from [tiny-diffusion](https://github.com/tanelp/tiny-diffusion), [ema-pytorch](https://github.com/lucidrains/ema-pytorch), and [Datasaur](https://www.research.autodesk.com/publications/same-stats-different-graphs/).
327
+ - **Grokking Template** uses code from [Sea-Snell/grokking](https://github.com/Sea-Snell/grokking) and [danielmamay/grokking](https://github.com/danielmamay/grokking).
328
+
329
+ We would like to thank the developers of the open-source models and packages for their contributions and for making their work available.
330
+
331
+ ## Citing The AI Scientist
332
+
333
+ If you use **The AI Scientist** in your research, please cite it as follows:
334
+
335
+ ```
336
+ @article{lu2024aiscientist,
337
+ title={The {AI} {S}cientist: Towards Fully Automated Open-Ended Scientific Discovery},
338
+ author={Lu, Chris and Lu, Cong and Lange, Robert Tjarko and Foerster, Jakob and Clune, Jeff and Ha, David},
339
+ journal={arXiv preprint arXiv:2408.06292},
340
+ year={2024}
341
+ }
342
+ ```
343
+
344
+ ## Frequently Asked Questions
345
+
346
+ We recommend reading our paper first for any questions you have on The AI Scientist.
347
+
348
+ **Why am I missing files when running The AI Scientist?**
349
+
350
+ Ensure you have completed all the setup and preparation steps before running the main experiment script.
351
+
352
+ **Why has a PDF or a review not been generated?**
353
+
354
+ The rate at which The AI Scientist successfully completes an idea depends on the template, the base foundation model, and the complexity of the idea; we advise referring to our main paper for details. The highest success rates are observed with Claude Sonnet 3.5. Reviews are best done with GPT-4o; all other models have issues with positivity bias or fail to conform to the required output format.
355
+
356
+ **What is the cost of each idea generated?**
357
+
358
+ Typically less than $15 per paper with Claude Sonnet 3.5. We recommend DeepSeek Coder V2 for a much more cost-effective approach. A good place to look for new models is the [Aider leaderboard](https://aider.chat/docs/leaderboards/).
359
+
360
+ **How do I change the base conference format associated with the write-ups?**
361
+
362
+ Change the base `template.tex` files contained within each template.
363
+
364
+ **How do I run The AI Scientist for different subject fields?**
365
+
366
+ Please refer to the instructions for different templates. In the current iteration, The AI Scientist is restricted to ideas that can be expressed in code. However, lifting this restriction would represent exciting future work! :)
367
+
368
+ **How do I add support for a new foundation model?**
369
+
370
+ You may modify `ai_scientist/llm.py` to add support for a new foundation model. We do not advise using any model that is significantly weaker than GPT-4 level for **The AI Scientist**.
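+
+ The structure of `ai_scientist/llm.py` is not reproduced in this document, but the launch scripts import `AVAILABLE_LLMS` and `create_client(model)` from it, so adding a model roughly amounts to registering its name and teaching `create_client` how to construct a client for it. A purely illustrative sketch under that assumption (the model name `my-new-model` and the endpoint URL are hypothetical):
+
+ ```python
+ # Purely illustrative sketch; the real ai_scientist/llm.py may be organized differently.
+ import openai
+
+ AVAILABLE_LLMS = [
+     "gpt-4o-2024-05-13",
+     "claude-3-5-sonnet-20240620",
+     "my-new-model",  # hypothetical new entry
+ ]
+
+
+ def create_client(model):
+     if model == "my-new-model":
+         # Hypothetical OpenAI-compatible endpoint serving the new model.
+         client = openai.OpenAI(base_url="https://example.com/v1")
+         return client, model
+     if model.startswith("gpt-"):
+         return openai.OpenAI(), model
+     raise ValueError(f"Model {model} not supported.")
+ ```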
371
+
372
+ **Why do I need to run the baseline runs myself?**
373
+
374
+ These appear as `run_0` and should be executed on each machine you run **The AI Scientist** on, so that run-time comparisons are accurate despite hardware differences.
375
+
376
+ **What if I have problems accessing the Semantic Scholar API?**
377
+
378
+ We use the Semantic Scholar API to check ideas for novelty and to collect citations for the paper write-up. If you don't have an API key, or the API is slow to access, you may be able to skip these phases (for example, `launch_scientist.py` provides a `--skip-novelty-check` flag).
379
+
380
+ ## Containerization
381
+
382
+ We include a [community-contributed](https://github.com/SakanaAI/AI-Scientist/pull/21) Docker image that may assist with your containerization efforts in `experimental/Dockerfile`.
383
+
384
+ You can use this image as follows:
385
+
386
+ ```bash
387
+ # Endpoint Script
388
+ docker run -e OPENAI_API_KEY=$OPENAI_API_KEY -v `pwd`/templates:/app/AI-Scientist/templates <AI_SCIENTIST_IMAGE> \
389
+ --model gpt-4o-2024-05-13 \
390
+ --experiment 2d_diffusion \
391
+ --num-ideas 2
392
+ ```
393
+
394
+ ```bash
395
+ # Interactive
396
+ docker run -it -e OPENAI_API_KEY=$OPENAI_API_KEY \
397
+ --entrypoint /bin/bash \
398
+ <AI_SCIENTIST_IMAGE>
399
+ ```
400
+
401
+ ## ⚖️ License & Responsible Use
402
+
403
+ This project is licensed under **The AI Scientist Source Code License** (a derivative of the Responsible AI License).
404
+
405
+ **Mandatory Disclosure:** By using this code, you are legally bound to clearly and prominently disclose the use of AI in any resulting scientific manuscripts or papers.
406
+
407
+ We recommend the following attribution in your paper's Abstract or Methods section:
408
+ > "This manuscript was autonomously generated using [The AI Scientist](https://github.com/SakanaAI/AI-Scientist)."
409
+
410
+ ## Star History
411
+
412
+ [![Star History Chart](https://api.star-history.com/svg?repos=SakanaAI/AI-Scientist&type=Date)](https://star-history.com/#SakanaAI/AI-Scientist&Date)
experimental/Dockerfile ADDED
@@ -0,0 +1,89 @@
1
+ # Use Python 3.11 as the base image
2
+ FROM python:3.11-bullseye
3
+
4
+ # Avoid prompts from apt
5
+ ENV DEBIAN_FRONTEND=noninteractive
6
+
7
+ # Set working directory
8
+ WORKDIR /app
9
+
10
+ # Install system dependencies including texlive-full
11
+ RUN apt-get update && apt-get install -y --no-install-recommends \
12
+ wget=1.21-1+deb11u1 \
13
+ git=1:2.30.2-1+deb11u2 \
14
+ build-essential=12.9 \
15
+ libssl-dev=1.1.1w-0+deb11u1 \
16
+ zlib1g-dev=1:1.2.11.dfsg-2+deb11u2 \
17
+ libbz2-dev=1.0.8-4 \
18
+ libreadline-dev=8.1-1 \
19
+ libsqlite3-dev=3.34.1-3 \
20
+ libncursesw5-dev=6.2+20201114-2+deb11u2 \
21
+ xz-utils=5.2.5-2.1~deb11u1 \
22
+ tk-dev=8.6.11+1 \
23
+ libxml2-dev=2.9.10+dfsg-6.7+deb11u4 \
24
+ libxmlsec1-dev=1.2.31-1 \
25
+ libffi-dev=3.3-6 \
26
+ liblzma-dev=5.2.5-2.1~deb11u1 \
27
+ texlive-full=2020.20210202-3 \
28
+ && rm -rf /var/lib/apt/lists/*
29
+
30
+ # Upgrade pip
31
+ RUN pip install --no-cache-dir --upgrade pip==24.2
32
+
33
+ # Install Python packages
34
+ RUN pip install --no-cache-dir \
35
+ anthropic==0.34.0 \
36
+ aider-chat==0.50.1 \
37
+ backoff==2.2.1 \
38
+ openai==1.40.6 \
39
+ matplotlib==3.9.2 \
40
+ pypdf==4.3.1 \
41
+ pymupdf4llm==0.0.10 \
42
+ torch==2.4.0 \
43
+ numpy==1.26.4 \
44
+ transformers==4.44.0 \
45
+ datasets==2.21.0 \
46
+ tiktoken==0.7.0 \
47
+ wandb==0.17.7 \
48
+ tqdm==4.66.5 \
49
+ scikit-learn==1.5.1 \
50
+ einops==0.8.0
51
+
52
+ # Clone and install NPEET with a specific commit
53
+ RUN git clone https://github.com/gregversteeg/NPEET.git
54
+ WORKDIR /app/NPEET
55
+ RUN git checkout 8b0d9485423f74e5eb199324cf362765596538d3 \
56
+ && pip install .
57
+
58
+ # Clone the AI-Scientist repository
59
+ WORKDIR /app
60
+ RUN git clone https://github.com/SakanaAI/AI-Scientist.git
61
+
62
+ # Set working directory to AI-Scientist
63
+ WORKDIR /app/AI-Scientist
64
+
65
+ # Prepare NanoGPT data
66
+ RUN python data/enwik8/prepare.py && \
67
+ python data/shakespeare_char/prepare.py && \
68
+ python data/text8/prepare.py
69
+
70
+ # Set up baseline runs
71
+ RUN for dir in templates/*/; do \
72
+ if [ -f "${dir}experiment.py" ]; then \
73
+ cd "${dir}" || continue; \
74
+ python experiment.py --out_dir run_0 && \
75
+ python plot.py; \
76
+ cd /app/AI-Scientist || exit; \
77
+ fi \
78
+ done
79
+
80
+ # Create entrypoint script
81
+ RUN printf '#!/bin/bash\n\
82
+ python launch_scientist.py "$@"\n' > /app/entrypoint.sh && \
83
+ chmod +x /app/entrypoint.sh
84
+
85
+ # Set the entrypoint
86
+ ENTRYPOINT ["/app/entrypoint.sh"]
87
+
88
+ # Set the default command to an empty array
89
+ CMD []
experimental/launch_oe_scientist.py ADDED
@@ -0,0 +1,394 @@
1
+ import argparse
2
+ import json
3
+ import multiprocessing
4
+ import openai
5
+ import os
6
+ import os.path as osp
7
+ import shutil
8
+ import sys
9
+ import time
10
+ import torch
11
+ from aider.coders import Coder
12
+ from aider.io import InputOutput
13
+ from aider.models import Model
14
+ from datetime import datetime
15
+
16
+ from ai_scientist.generate_ideas import generate_next_idea, check_idea_novelty
17
+ from ai_scientist.llm import create_client, AVAILABLE_LLMS
18
+ from ai_scientist.perform_experiments import perform_experiments
19
+ from ai_scientist.perform_review import perform_review, load_paper, perform_improvement
20
+ from ai_scientist.perform_writeup import perform_writeup, generate_latex
21
+
22
+ NUM_REFLECTIONS = 3
23
+
24
+
25
+ def print_time():
26
+ print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
27
+
28
+
29
+ def parse_arguments():
30
+ parser = argparse.ArgumentParser(description="Run AI scientist experiments")
31
+ # add type of experiment (nanoGPT, Boston, etc.)
32
+ parser.add_argument(
33
+ "--experiment",
34
+ type=str,
35
+ default="nanoGPT",
36
+ help="Experiment to run AI Scientist on.",
37
+ )
38
+ parser.add_argument(
39
+ "--model",
40
+ type=str,
41
+ default="claude-3-5-sonnet-20240620",
42
+ choices=AVAILABLE_LLMS,
43
+ help="Model to use for AI Scientist.",
44
+ )
45
+ parser.add_argument(
46
+ "--writeup",
47
+ type=str,
48
+ default="latex",
49
+ choices=["latex"],
50
+ help="What format to use for writeup",
51
+ )
52
+ parser.add_argument(
53
+ "--parallel",
54
+ type=int,
55
+ default=0,
56
+ help="Number of parallel processes to run. 0 for sequential execution.",
57
+ )
58
+ parser.add_argument(
59
+ "--improvement",
60
+ action="store_true",
61
+ help="Improve based on reviews.",
62
+ )
63
+ parser.add_argument(
64
+ "--gpus",
65
+ type=str,
66
+ default=None,
67
+ help="Comma-separated list of GPU IDs to use (e.g., '0,1,2'). If not specified, all available GPUs will be used.",
68
+ )
69
+ parser.add_argument(
70
+ "--num-ideas",
71
+ type=int,
72
+ default=50,
73
+ help="Number of ideas to generate",
74
+ )
75
+ return parser.parse_args()
76
+
77
+
78
+ def get_available_gpus(gpu_ids=None):
79
+ if gpu_ids is not None:
80
+ return [int(gpu_id) for gpu_id in gpu_ids.split(",")]
81
+ return list(range(torch.cuda.device_count()))
82
+
83
+
84
+ def worker(
85
+ queue,
86
+ base_dir,
87
+ results_dir,
88
+ model,
89
+ client,
90
+ client_model,
91
+ writeup,
92
+ improvement,
93
+ gpu_id,
94
+ idea_archive,
95
+ lock,
96
+ ):
97
+ os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
98
+ print(f"Worker {gpu_id} started.")
99
+ while True:
100
+ _ = queue.get()
101
+ with lock:
102
+ idea_archive = generate_next_idea(
103
+ base_dir,
104
+ client=client,
105
+ model=client_model,
106
+ prev_idea_archive=idea_archive,
107
+ num_reflections=NUM_REFLECTIONS,
108
+ )
109
+ idea_archive = check_idea_novelty(
110
+ idea_archive,
111
+ base_dir=base_dir,
112
+ client=client,
113
+ model=client_model,
114
+ )
115
+ idea = idea_archive[-1]
116
+ if _ is None:
117
+ break
118
+ success, score, _ = do_idea(
119
+ base_dir,
120
+ results_dir,
121
+ idea,
122
+ model,
123
+ client,
124
+ client_model,
125
+ writeup,
126
+ improvement,
127
+ log_file=True,
128
+ )
129
+ print(f"Completed idea: {idea['Name']}, Success: {success}, Score: {score}")
130
+ with lock:
131
+ for x in idea_archive:
132
+ if x["Name"] == idea["Name"] and x["Title"] == idea["Title"]:
133
+ x["Score"] = score
134
+ break
135
+ print(f"Worker {gpu_id} finished.")
136
+
137
+
138
+ def do_idea(
139
+ base_dir,
140
+ results_dir,
141
+ idea,
142
+ model,
143
+ client,
144
+ client_model,
145
+ writeup,
146
+ improvement,
147
+ log_file=False,
148
+ ):
149
+ ## CREATE PROJECT FOLDER
150
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
151
+ idea_name = f"{timestamp}_{idea['Name']}"
152
+ folder_name = osp.join(results_dir, idea_name)
153
+ assert not osp.exists(folder_name), f"Folder {folder_name} already exists."
154
+ destination_dir = folder_name
155
+ shutil.copytree(base_dir, destination_dir, dirs_exist_ok=True)
156
+ with open(osp.join(base_dir, "run_0", "final_info.json"), "r") as f:
157
+ baseline_results = json.load(f)
158
+ baseline_results = {k: v["means"] for k, v in baseline_results.items()}
159
+ exp_file = osp.join(folder_name, "experiment.py")
160
+ vis_file = osp.join(folder_name, "plot.py")
161
+ notes = osp.join(folder_name, "notes.txt")
162
+ with open(notes, "w") as f:
163
+ f.write(f"# Title: {idea['Title']}\n")
164
+ f.write(f"# Experiment description: {idea['Experiment']}\n")
165
+ f.write(f"## Run 0: Baseline\n")
166
+ f.write(f"Results: {baseline_results}\n")
167
+ f.write(f"Description: Baseline results.\n")
168
+ if log_file:
169
+ original_stdout = sys.stdout
170
+ original_stderr = sys.stderr
171
+ log_path = osp.join(folder_name, "log.txt")
172
+ log = open(log_path, "a")
173
+ sys.stdout = log
174
+ sys.stderr = log
175
+ try:
176
+ print_time()
177
+ print(f"*Starting idea: {idea_name}*")
178
+ ## PERFORM EXPERIMENTS
179
+ fnames = [exp_file, vis_file, notes]
180
+ io = InputOutput(
181
+ yes=True, chat_history_file=f"{folder_name}/{idea_name}_aider.txt"
182
+ )
183
+ if model == "deepseek-coder-v2-0724":
184
+ main_model = Model("deepseek/deepseek-coder")
185
+ elif model == "llama3.1-405b":
186
+ main_model = Model("openrouter/meta-llama/llama-3.1-405b-instruct")
187
+ else:
188
+ main_model = Model(model)
189
+ coder = Coder.create(
190
+ main_model=main_model,
191
+ fnames=fnames,
192
+ io=io,
193
+ stream=False,
194
+ use_git=False,
195
+ edit_format="diff",
196
+ )
197
+
198
+ print_time()
199
+ print(f"*Starting Experiments*")
200
+ try:
201
+ success = perform_experiments(idea, folder_name, coder, baseline_results)
202
+ except Exception as e:
203
+ print(f"Error during experiments: {e}")
204
+ print(f"Experiments failed for idea {idea_name}")
205
+ return False, 0, idea
206
+
207
+ if not success:
208
+ print(f"Experiments failed for idea {idea_name}")
209
+ return False, 0, idea
210
+
211
+ print_time()
212
+ print(f"*Starting Writeup*")
213
+ ## PERFORM WRITEUP
214
+ if writeup == "latex":
215
+ writeup_file = osp.join(folder_name, "latex", "template.tex")
216
+ fnames = [exp_file, writeup_file, notes]
217
+ if model == "deepseek-coder-v2-0724":
218
+ main_model = Model("deepseek/deepseek-coder")
219
+ elif model == "llama3.1-405b":
220
+ main_model = Model("openrouter/meta-llama/llama-3.1-405b-instruct")
221
+ else:
222
+ main_model = Model(model)
223
+ coder = Coder.create(
224
+ main_model=main_model,
225
+ fnames=fnames,
226
+ io=io,
227
+ stream=False,
228
+ use_git=False,
229
+ edit_format="diff",
230
+ )
231
+ try:
232
+ perform_writeup(idea, folder_name, coder, client, client_model)
233
+ except Exception as e:
234
+ print(f"Failed to perform writeup: {e}")
235
+ return False, 0, idea
236
+ print("Done writeup")
237
+ else:
238
+ raise ValueError(f"Writeup format {writeup} not supported.")
239
+
240
+ print_time()
241
+ print(f"*Starting Review*")
242
+ ## REVIEW PAPER
243
+ if writeup == "latex":
244
+ try:
245
+ paper_text = load_paper(f"{folder_name}/{idea['Name']}.pdf")
246
+ review = perform_review(
247
+ paper_text,
248
+ model="gpt-4o-2024-05-13",
249
+ client=openai.OpenAI(),
250
+ num_reflections=5,
251
+ num_fs_examples=1,
252
+ num_reviews_ensemble=5,
253
+ temperature=0.1,
254
+ )
255
+ review_score = review["Overall"]
256
+ # Store the review in separate review.txt file
257
+ with open(osp.join(folder_name, "review.txt"), "w") as f:
258
+ f.write(json.dumps(review))
259
+ except Exception as e:
260
+ print(f"Failed to perform review: {e}")
261
+ return False, 0, idea
262
+
263
+ ## IMPROVE WRITEUP
264
+ if writeup == "latex" and improvement:
265
+ print_time()
266
+ print(f"*Starting Improvement*")
267
+ try:
268
+ perform_improvement(review, coder)
269
+ generate_latex(
270
+ coder, folder_name, f"{folder_name}/{idea['Name']}_improved.pdf"
271
+ )
272
+ paper_text = load_paper(f"{folder_name}/{idea['Name']}_improved.pdf")
273
+ review = perform_review(
274
+ paper_text,
275
+ model="gpt-4o-2024-05-13",
276
+ client=openai.OpenAI(),
277
+ num_reflections=5,
278
+ num_fs_examples=1,
279
+ num_reviews_ensemble=5,
280
+ temperature=0.1,
281
+ )
282
+ review_score = review["Overall"]
283
+ # Store the review in separate review.txt file
284
+ with open(osp.join(folder_name, "review_improved.txt"), "w") as f:
285
+ f.write(json.dumps(review))
286
+ except Exception as e:
287
+ print(f"Failed to perform improvement: {e}")
288
+ return False, 0, idea
289
+ return True, review_score, idea
290
+ except Exception as e:
291
+ print(f"Failed to evaluate idea {idea_name}: {str(e)}")
292
+ return False, 0, idea
293
+ finally:
294
+ print("FINISHED IDEA")
295
+ if log_file:
296
+ sys.stdout = original_stdout
297
+ sys.stderr = original_stderr
298
+ log.close()
299
+
300
+
301
+ if __name__ == "__main__":
302
+ args = parse_arguments()
303
+
304
+ # Check available GPUs and adjust parallel processes if necessary
305
+ available_gpus = get_available_gpus(args.gpus)
306
+ if args.parallel > len(available_gpus):
307
+ print(
308
+ f"Warning: Requested {args.parallel} parallel processes, but only {len(available_gpus)} GPUs available. Adjusting to {len(available_gpus)}."
309
+ )
310
+ args.parallel = len(available_gpus)
311
+
312
+ print(f"Using GPUs: {available_gpus}")
313
+
314
+ # Create client
315
+ client, client_model = create_client(args.model)
316
+
317
+ base_dir = osp.join("templates", args.experiment)
318
+ results_dir = osp.join("results", args.experiment)
319
+ idea_archive = []
320
+
321
+ if args.parallel > 0:
322
+ print(f"Running {args.parallel} parallel processes")
323
+ queue = multiprocessing.Queue()
324
+ lock = multiprocessing.Lock()
325
+ for _ in range(args.num_ideas):
326
+ queue.put(_)
327
+
328
+ processes = []
329
+ for i in range(args.parallel):
330
+ gpu_id = available_gpus[i % len(available_gpus)]
331
+ p = multiprocessing.Process(
332
+ target=worker,
333
+ args=(
334
+ queue,
335
+ base_dir,
336
+ results_dir,
337
+ args.model,
338
+ client,
339
+ client_model,
340
+ args.writeup,
341
+ args.improvement,
342
+ gpu_id,
343
+ idea_archive,
344
+ lock,
345
+ ),
346
+ )
347
+ p.start()
348
+ time.sleep(150)
349
+ processes.append(p)
350
+
351
+ # Signal workers to exit
352
+ for _ in range(args.parallel):
353
+ queue.put(None)
354
+
355
+ for p in processes:
356
+ p.join()
357
+
358
+ print("All parallel processes completed.")
359
+ else:
360
+ for _ in range(args.num_ideas):
361
+ idea_archive = generate_next_idea(
362
+ base_dir,
363
+ client=client,
364
+ model=client_model,
365
+ prev_idea_archive=idea_archive,
366
+ num_reflections=NUM_REFLECTIONS,
367
+ )
368
+ idea_archive = check_idea_novelty(
369
+ idea_archive,
370
+ base_dir=base_dir,
371
+ client=client,
372
+ model=client_model,
373
+ )
374
+ idea = idea_archive[-1]
375
+ print(f"Processing idea: {idea['Name']}")
376
+ try:
377
+ success, score, _ = do_idea(
378
+ base_dir,
379
+ results_dir,
380
+ idea,
381
+ args.model,
382
+ client,
383
+ client_model,
384
+ args.writeup,
385
+ args.improvement,
386
+ )
387
+ print(
388
+ f"Completed idea: {idea['Name']}, Success: {success}, Score: {score}"
389
+ )
390
+ idea["Score"] = score
391
+ except Exception as e:
392
+ print(f"Failed to evaluate idea {idea['Name']}: {str(e)}")
393
+
394
+ print("All ideas evaluated.")
hf_app.py ADDED
@@ -0,0 +1,37 @@
1
+ import gradio as gr
2
+ from fastapi import FastAPI
3
+ from CriticalThinking.app.main import app as fastapi_app
4
+
5
+ # The FastAPI app is already initialized in CriticalThinking.app.main
6
+ # We can just mount it or use it as the main app.
7
+ # Here we will mount Gradio onto the existing FastAPI app.
8
+
9
+ def analyze_interface(repo_url, project_description):
10
+ # This is a placeholder for the Gradio UI to interact with the API
11
+ # In a real scenario, we might want to use the background task or just call the service.
12
+ return f"Analysis request for {repo_url} received. Please use the API endpoints to monitor progress."
13
+
14
+ with gr.Blocks(title="Critical Code Agent") as demo:
15
+ gr.Markdown("# 🦀 Critical Code Agent")
16
+ gr.Markdown("Autonomous agent system for deep architectural analysis and software weakness identification.")
17
+
18
+ with gr.Row():
19
+ repo_url = gr.Textbox(label="Repository URL", placeholder="https://github.com/username/repo")
20
+ project_desc = gr.Textbox(label="Project Description", placeholder="Brief description of the project")
21
+
22
+ analyze_btn = gr.Button("Analyze Repository", variant="primary")
23
+ output = gr.Textbox(label="Status")
24
+
25
+ analyze_btn.click(analyze_interface, inputs=[repo_url, project_desc], outputs=output)
26
+
27
+ gr.Markdown("### API Endpoints")
28
+ gr.Markdown("- `POST /analyze`: Submit a repository for analysis")
29
+ gr.Markdown("- `GET /report/{task_id}`: Retrieve analysis report")
30
+ gr.Markdown("- `GET /health`: Check service health")
31
+
32
+ # Mount Gradio to the FastAPI app
33
+ app = gr.mount_gradio_app(fastapi_app, demo, path="/")
34
+
35
+ if __name__ == "__main__":
36
+ import uvicorn
37
+ uvicorn.run(app, host="0.0.0.0", port=7860)
launch_scientist.py ADDED
@@ -0,0 +1,420 @@
1
+ import argparse
2
+ import json
3
+ import multiprocessing
4
+ import openai
5
+ import os
6
+ import os.path as osp
7
+ import shutil
8
+ import sys
9
+ import time
10
+ import torch
11
+ from aider.coders import Coder
12
+ from aider.io import InputOutput
13
+ from aider.models import Model
14
+ from datetime import datetime
15
+
16
+ from ai_scientist.generate_ideas import generate_ideas, check_idea_novelty
17
+ from ai_scientist.llm import create_client, AVAILABLE_LLMS
18
+ from ai_scientist.perform_experiments import perform_experiments
19
+ from ai_scientist.perform_review import perform_review, load_paper, perform_improvement
20
+ from ai_scientist.perform_writeup import perform_writeup, generate_latex
21
+
22
+ NUM_REFLECTIONS = 3
23
+
24
+
25
+ def print_time():
26
+ print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
27
+
28
+
29
+ def parse_arguments():
30
+ parser = argparse.ArgumentParser(description="Run AI scientist experiments")
31
+ parser.add_argument(
32
+ "--skip-idea-generation",
33
+ action="store_true",
34
+ help="Skip idea generation and load existing ideas",
35
+ )
36
+ parser.add_argument(
37
+ "--skip-novelty-check",
38
+ action="store_true",
39
+ help="Skip novelty check and use existing ideas",
40
+ )
41
+ # add type of experiment (nanoGPT, Boston, etc.)
42
+ parser.add_argument(
43
+ "--experiment",
44
+ type=str,
45
+ default="nanoGPT",
46
+ help="Experiment to run AI Scientist on.",
47
+ )
48
+ parser.add_argument(
49
+ "--model",
50
+ type=str,
51
+ default="claude-3-5-sonnet-20240620",
52
+ choices=AVAILABLE_LLMS,
53
+ help="Model to use for AI Scientist.",
54
+ )
55
+ parser.add_argument(
56
+ "--writeup",
57
+ type=str,
58
+ default="latex",
59
+ choices=["latex"],
60
+ help="What format to use for writeup",
61
+ )
62
+ parser.add_argument(
63
+ "--parallel",
64
+ type=int,
65
+ default=0,
66
+ help="Number of parallel processes to run. 0 for sequential execution.",
67
+ )
68
+ parser.add_argument(
69
+ "--improvement",
70
+ action="store_true",
71
+ help="Improve based on reviews.",
72
+ )
73
+ parser.add_argument(
74
+ "--gpus",
75
+ type=str,
76
+ default=None,
77
+ help="Comma-separated list of GPU IDs to use (e.g., '0,1,2'). If not specified, all available GPUs will be used.",
78
+ )
79
+ parser.add_argument(
80
+ "--num-ideas",
81
+ type=int,
82
+ default=50,
83
+ help="Number of ideas to generate",
84
+ )
85
+ parser.add_argument(
86
+ "--engine",
87
+ type=str,
88
+ default="semanticscholar",
89
+ choices=["semanticscholar", "openalex"],
90
+ help="Scholar engine to use.",
91
+ )
92
+ return parser.parse_args()
93
+
94
+
95
+ def get_available_gpus(gpu_ids=None):
96
+ if gpu_ids is not None:
97
+ return [int(gpu_id) for gpu_id in gpu_ids.split(",")]
98
+ return list(range(torch.cuda.device_count()))
99
+
100
+
101
+ def check_latex_dependencies():
102
+ """
103
+ Check if required LaTeX dependencies are installed on the system.
104
+ Returns True if all dependencies are found, False otherwise.
105
+ """
106
+ import shutil
107
+ import sys
108
+
109
+ required_dependencies = ['pdflatex', 'chktex']
110
+ missing_deps = []
111
+
112
+ for dep in required_dependencies:
113
+ if shutil.which(dep) is None:
114
+ missing_deps.append(dep)
115
+
116
+ if missing_deps:
117
+ print("Error: Required LaTeX dependencies not found:", file=sys.stderr)
118
+ return False
119
+
120
+ return True
121
+
122
+ def worker(
123
+ queue,
124
+ base_dir,
125
+ results_dir,
126
+ model,
127
+ client,
128
+ client_model,
129
+ writeup,
130
+ improvement,
131
+ gpu_id,
132
+ ):
133
+ os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
134
+ print(f"Worker {gpu_id} started.")
135
+ while True:
136
+ idea = queue.get()
137
+ if idea is None:
138
+ break
139
+ success = do_idea(
140
+ base_dir,
141
+ results_dir,
142
+ idea,
143
+ model,
144
+ client,
145
+ client_model,
146
+ writeup,
147
+ improvement,
148
+ log_file=True,
149
+ )
150
+ print(f"Completed idea: {idea['Name']}, Success: {success}")
151
+ print(f"Worker {gpu_id} finished.")
152
+
153
+
154
+ def do_idea(
155
+ base_dir,
156
+ results_dir,
157
+ idea,
158
+ model,
159
+ client,
160
+ client_model,
161
+ writeup,
162
+ improvement,
163
+ log_file=False,
164
+ ):
165
+ ## CREATE PROJECT FOLDER
166
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
167
+ idea_name = f"{timestamp}_{idea['Name']}"
168
+ folder_name = osp.join(results_dir, idea_name)
169
+ assert not osp.exists(folder_name), f"Folder {folder_name} already exists."
170
+ destination_dir = folder_name
171
+ shutil.copytree(base_dir, destination_dir, dirs_exist_ok=True)
172
+ with open(osp.join(base_dir, "run_0", "final_info.json"), "r") as f:
173
+ baseline_results = json.load(f)
174
+ # Check if baseline_results is a dictionary before extracting means
175
+ if isinstance(baseline_results, dict):
176
+ baseline_results = {k: v["means"] for k, v in baseline_results.items()}
177
+ exp_file = osp.join(folder_name, "experiment.py")
178
+ vis_file = osp.join(folder_name, "plot.py")
179
+ notes = osp.join(folder_name, "notes.txt")
180
+ with open(notes, "w") as f:
181
+ f.write(f"# Title: {idea['Title']}\n")
182
+ f.write(f"# Experiment description: {idea['Experiment']}\n")
183
+ f.write(f"## Run 0: Baseline\n")
184
+ f.write(f"Results: {baseline_results}\n")
185
+ f.write(f"Description: Baseline results.\n")
186
+ if log_file:
187
+ original_stdout = sys.stdout
188
+ original_stderr = sys.stderr
189
+ log_path = osp.join(folder_name, "log.txt")
190
+ log = open(log_path, "a")
191
+ sys.stdout = log
192
+ sys.stderr = log
193
+ try:
194
+ print_time()
195
+ print(f"*Starting idea: {idea_name}*")
196
+ ## PERFORM EXPERIMENTS
197
+ fnames = [exp_file, vis_file, notes]
198
+ io = InputOutput(
199
+ yes=True, chat_history_file=f"{folder_name}/{idea_name}_aider.txt"
200
+ )
201
+ if model == "deepseek-coder-v2-0724":
202
+ main_model = Model("deepseek/deepseek-coder")
203
+ elif model == "deepseek-reasoner":
204
+ main_model = Model("deepseek/deepseek-reasoner")
205
+ elif model == "llama3.1-405b":
206
+ main_model = Model("openrouter/meta-llama/llama-3.1-405b-instruct")
207
+ else:
208
+ main_model = Model(model)
209
+ coder = Coder.create(
210
+ main_model=main_model,
211
+ fnames=fnames,
212
+ io=io,
213
+ stream=False,
214
+ use_git=False,
215
+ edit_format="diff",
216
+ )
217
+
218
+ print_time()
219
+ print(f"*Starting Experiments*")
220
+ try:
221
+ success = perform_experiments(idea, folder_name, coder, baseline_results)
222
+ except Exception as e:
223
+ print(f"Error during experiments: {e}")
224
+ print(f"Experiments failed for idea {idea_name}")
225
+ return False
226
+
227
+ if not success:
228
+ print(f"Experiments failed for idea {idea_name}")
229
+ return False
230
+
231
+ print_time()
232
+ print(f"*Starting Writeup*")
233
+ ## PERFORM WRITEUP
234
+ if writeup == "latex":
235
+ writeup_file = osp.join(folder_name, "latex", "template.tex")
236
+ fnames = [exp_file, writeup_file, notes]
237
+ if model == "deepseek-coder-v2-0724":
238
+ main_model = Model("deepseek/deepseek-coder")
239
+ elif model == "deepseek-reasoner":
240
+ main_model = Model("deepseek/deepseek-reasoner")
241
+ elif model == "llama3.1-405b":
242
+ main_model = Model("openrouter/meta-llama/llama-3.1-405b-instruct")
243
+ else:
244
+ main_model = Model(model)
245
+ coder = Coder.create(
246
+ main_model=main_model,
247
+ fnames=fnames,
248
+ io=io,
249
+ stream=False,
250
+ use_git=False,
251
+ edit_format="diff",
252
+ )
253
+ try:
254
+ perform_writeup(idea, folder_name, coder, client, client_model, engine=args.engine)
255
+ except Exception as e:
256
+ print(f"Failed to perform writeup: {e}")
257
+ return False
258
+ print("Done writeup")
259
+ else:
260
+ raise ValueError(f"Writeup format {writeup} not supported.")
261
+
262
+ print_time()
263
+ print(f"*Starting Review*")
264
+ ## REVIEW PAPER
265
+ if writeup == "latex":
266
+ try:
267
+ paper_text = load_paper(f"{folder_name}/{idea['Name']}.pdf")
268
+ review = perform_review(
269
+ paper_text,
270
+ model="gpt-4o-2024-05-13",
271
+ client=openai.OpenAI(),
272
+ num_reflections=5,
273
+ num_fs_examples=1,
274
+ num_reviews_ensemble=5,
275
+ temperature=0.1,
276
+ )
277
+ # Store the review in separate review.txt file
278
+ with open(osp.join(folder_name, "review.txt"), "w") as f:
279
+ f.write(json.dumps(review, indent=4))
280
+ except Exception as e:
281
+ print(f"Failed to perform review: {e}")
282
+ return False
283
+
284
+ ## IMPROVE WRITEUP
285
+ if writeup == "latex" and improvement:
286
+ print_time()
287
+ print(f"*Starting Improvement*")
288
+ try:
289
+ perform_improvement(review, coder)
290
+ generate_latex(
291
+ coder, folder_name, f"{folder_name}/{idea['Name']}_improved.pdf"
292
+ )
293
+ paper_text = load_paper(f"{folder_name}/{idea['Name']}_improved.pdf")
294
+ review = perform_review(
295
+ paper_text,
296
+ model="gpt-4o-2024-05-13",
297
+ client=openai.OpenAI(),
298
+ num_reflections=5,
299
+ num_fs_examples=1,
300
+ num_reviews_ensemble=5,
301
+ temperature=0.1,
302
+ )
303
+ # Store the review in separate review.txt file
304
+ with open(osp.join(folder_name, "review_improved.txt"), "w") as f:
305
+ f.write(json.dumps(review))
306
+ except Exception as e:
307
+ print(f"Failed to perform improvement: {e}")
308
+ return False
309
+ return True
310
+ except Exception as e:
311
+ print(f"Failed to evaluate idea {idea_name}: {str(e)}")
312
+ return False
313
+ finally:
314
+ print("FINISHED IDEA")
315
+ if log_file:
316
+ sys.stdout = original_stdout
317
+ sys.stderr = original_stderr
318
+ log.close()
319
+
320
+
321
+ if __name__ == "__main__":
322
+ args = parse_arguments()
323
+
324
+ # Check available GPUs and adjust parallel processes if necessary
325
+ available_gpus = get_available_gpus(args.gpus)
326
+ if args.parallel > len(available_gpus):
327
+ print(
328
+ f"Warning: Requested {args.parallel} parallel processes, but only {len(available_gpus)} GPUs available. Adjusting to {len(available_gpus)}."
329
+ )
330
+ args.parallel = len(available_gpus)
331
+
332
+ print(f"Using GPUs: {available_gpus}")
333
+
334
+ # Check LaTeX dependencies before proceeding
335
+ if args.writeup == "latex" and not check_latex_dependencies():
336
+ sys.exit(1)
337
+
338
+ # Create client
339
+ client, client_model = create_client(args.model)
340
+
341
+ base_dir = osp.join("templates", args.experiment)
342
+ results_dir = osp.join("results", args.experiment)
343
+ ideas = generate_ideas(
344
+ base_dir,
345
+ client=client,
346
+ model=client_model,
347
+ skip_generation=args.skip_idea_generation,
348
+ max_num_generations=args.num_ideas,
349
+ num_reflections=NUM_REFLECTIONS,
350
+ )
351
+ if not args.skip_novelty_check:
352
+ ideas = check_idea_novelty(
353
+ ideas,
354
+ base_dir=base_dir,
355
+ client=client,
356
+ model=client_model,
357
+ engine=args.engine,
358
+ )
359
+
360
+ with open(osp.join(base_dir, "ideas.json"), "w") as f:
361
+ json.dump(ideas, f, indent=4)
362
+
363
+ novel_ideas = [idea for idea in ideas if idea["novel"]]
364
+ # novel_ideas = list(reversed(novel_ideas))
365
+
366
+ if args.parallel > 0:
367
+ print(f"Running {args.parallel} parallel processes")
368
+ queue = multiprocessing.Queue()
369
+ for idea in novel_ideas:
370
+ queue.put(idea)
371
+
372
+ processes = []
373
+ for i in range(args.parallel):
374
+ gpu_id = available_gpus[i % len(available_gpus)]
375
+ p = multiprocessing.Process(
376
+ target=worker,
377
+ args=(
378
+ queue,
379
+ base_dir,
380
+ results_dir,
381
+ args.model,
382
+ client,
383
+ client_model,
384
+ args.writeup,
385
+ args.improvement,
386
+ gpu_id,
387
+ ),
388
+ )
389
+ p.start()
390
+ time.sleep(150)
391
+ processes.append(p)
392
+
393
+ # Signal workers to exit
394
+ for _ in range(args.parallel):
395
+ queue.put(None)
396
+
397
+ for p in processes:
398
+ p.join()
399
+
400
+ print("All parallel processes completed.")
401
+ else:
402
+ for idea in novel_ideas:
403
+ print(f"Processing idea: {idea['Name']}")
404
+ try:
405
+ success = do_idea(
406
+ base_dir,
407
+ results_dir,
408
+ idea,
409
+ args.model,
410
+ client,
411
+ client_model,
412
+ args.writeup,
413
+ args.improvement,
414
+ )
415
+ print(f"Completed idea: {idea['Name']}, Success: {success}")
416
+ except Exception as e:
417
+ print(f"Failed to evaluate idea {idea['Name']}: {str(e)}")
418
+ import traceback
419
+ print(traceback.format_exc())
420
+ print("All ideas evaluated.")
requirements.txt ADDED
@@ -0,0 +1,18 @@
1
+ # LLM APIs
2
+ anthropic
3
+ aider-chat
4
+ backoff
5
+ openai
6
+ google-generativeai
7
+ # Viz
8
+ matplotlib
9
+ pypdf
10
+ pymupdf4llm
11
+ # Common Requirements
12
+ torch
13
+ numpy
14
+ transformers
15
+ datasets
16
+ tiktoken
17
+ wandb
18
+ tqdm