Spaces:

Baction
/

Vulnerability_Scanner_Client

Running

File size: 9,981 Bytes

import gradio as gr
import re
import requests
from markdownify import markdownify
from requests.exceptions import RequestException
from smolagents import (
    CodeAgent,
    ToolCallingAgent,
    InferenceClientModel,
    WebSearchTool,
    MCPClient,
    tool
)

# Endpoint of the remote MCP server that provides the GitHub tools.
# MCPClient connects to this URL and the tools it returns are handed to the
# analysis agent.
MCP_SERVER_URL = "https://baction-vulnerability-scanner-server.hf.space/gradio_api/mcp/"

@tool
def visit_webpage(url: str) -> str:
    """Visits a webpage at the given URL and returns its content as a markdown string.
    
    Args:
        url: The URL of the webpage to visit.
        
    Returns:
        The content of the webpage converted to Markdown, or an error message if the request fails.
    """
    # NOTE(review): the docstring above is left untouched on purpose; the
    # @tool decorator presumably exposes it to the agent as the tool
    # description, so rewording it could change agent behaviour.

    # Upper bound on returned characters so a huge page cannot flood the model.
    max_chars = 5000
    # Browser-like User-Agent: some sites reject the default client identifier.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        page = requests.get(url, headers=browser_headers, timeout=30)
        # Turn 4xx/5xx responses into exceptions handled below.
        page.raise_for_status()

        # HTML -> Markdown, then squeeze runs of 3+ newlines down to one blank line.
        text = re.sub(r"\n{3,}", "\n\n", markdownify(page.text).strip())

        if len(text) <= max_chars:
            return text
        return text[:max_chars] + "\n\n[Content truncated due to length...]"

    except RequestException as exc:
        # Network-level failures and bad HTTP statuses end up here.
        return f"Error fetching the webpage: {exc!s}"
    except Exception as exc:
        # Anything else (e.g. conversion problems) is reported, never raised,
        # so the calling agent always receives a string.
        return f"An unexpected error occurred: {exc!s}"

def parse_github_url(url):
    """Split a GitHub repository or file URL into ``(owner, repo, file_path)``.

    Two URL shapes are recognised:

    * ``https://github.com/<owner>/<repo>`` (optional trailing slash)
      -> ``(owner, repo, None)``
    * ``https://github.com/<owner>/<repo>/blob/<ref>/<path>``
      -> ``(owner, repo, path)``

    Surrounding whitespace is ignored, and any query string or fragment
    (``?plain=1``, ``#L10-L20``) is removed before matching so that URLs
    copied straight from the browser still yield a clean file path.

    Limitation: ``<ref>`` is matched as a single path segment, so a branch
    name containing ``/`` is split after its first segment and the rest ends
    up in the returned path.

    Args:
        url: The URL text to parse. Non-string values are treated as invalid.

    Returns:
        A 3-tuple ``(owner, repo, file_path)``. ``file_path`` is ``None`` for
        a repository URL. All three are ``None`` when the URL matches neither
        shape.
    """
    if not isinstance(url, str):
        return None, None, None

    # Literal '?' and '#' cannot appear inside a URL path (they would be
    # percent-encoded), so everything from the first one onwards is a query
    # string or fragment and is irrelevant to locating the file.
    cleaned = re.split(r'[?#]', url.strip(), maxsplit=1)[0].strip()

    # Repository root: exactly two path segments, optional trailing slash.
    repo_match = re.fullmatch(r'https://github\.com/([^/]+)/([^/]+)/?', cleaned)
    if repo_match:
        return repo_match.group(1), repo_match.group(2), None

    # File view: /<owner>/<repo>/blob/<ref>/<path...>
    file_match = re.fullmatch(
        r'https://github\.com/([^/]+)/([^/]+)/blob/[^/]+/(.+)', cleaned
    )
    if file_match:
        return file_match.group(1), file_match.group(2), file_match.group(3)

    return None, None, None

def _build_analysis_prompt(message, owner, repo, file_path):
    """Return the instruction prompt given to the agent for one GitHub file.

    The four arguments are interpolated verbatim into the prompt text.

    NOTE: this is an f-string, so literal braces and backslashes that must
    reach the agent unchanged (the CVE regex) are escaped as ``{{ }}`` and
    ``\\\\``. Unescaped, ``\\d{4}`` would be rendered as ``\\d4``.

    NOTE(review): the prompt names both NVD (step 3) and CVE Details
    (critical instructions) as the page to visit; left as-is, but worth
    reconciling.
    """
    return f"""
You are a cybersecurity expert. Analyze this GitHub file for security vulnerabilities.

GitHub URL: {message}
Repository: {owner}/{repo}
File Path: {file_path}

**ANALYSIS STEPS:**

1. **Get File Data**:
   - Use get_repository_info with owner="{owner}", repo="{repo}"
   - Use get_file_content with owner="{owner}", repo="{repo}", path="{file_path}"

2. **Find Vulnerabilities**:
   Analyze code for:
   - SQL injection patterns
   - Command injection (eval, exec, os.system)
   - XSS vulnerabilities
   - Path traversal
   - Hardcoded secrets
   - Input validation issues

3. **CVE Research**:
   - Search for CVEs: simple_cve_search("SQL injection", 3)
   - Extract CVE IDs from the string result using regex
   - Visit NVD for the first CVE: visit_webpage("https://nvd.nist.gov/vuln/detail/CVE-XXXX-XXXX")
   - Include the full NVD webpage content in your report

4. **Generate Report**:

# 🛡️ Security Analysis Report

## 🔍 File Overview
- **Path**: {file_path}
- **Repository**: {owner}/{repo}

## 🚨 Vulnerabilities Found
[List vulnerabilities with line numbers]

## 📊 CVE Research
**Top Related CVE**: [First CVE ID from regex extraction]
**CVE Details Webpage Content**: [Complete content from visit_webpage call]
**Key Details from CVE Details**: [CVSS score, attack vector, impact extracted from webpage]

## ⚠️ Other Possible CVEs
[Show other 2 CVE IDs from search]

## 🛠️ Remediation
[Specific fixes]

## ⚠️ Disclaimer
AI analysis may not be 100% accurate. Manual security review recommended.

**REMEMBER**: Always call visit_webpage for the first CVE ID to get detailed CVE information!

**CRITICAL INSTRUCTIONS**:
- simple_cve_search returns a STRING with CVE IDs and descriptions
- Extract CVE IDs using: re.findall(r'CVE-\\d{{4}}-\\d+', cve_search_string)
- TRY to visit CVE Details webpage for the first CVE ID found (more reliable than NVD)
- Use this exact pattern:
  1. Call simple_cve_search("SQL injection", 3)
  2. Extract CVE IDs with regex from the returned string
  3. Take the first CVE ID from the list
  4. Call visit_webpage("https://www.cvedetails.com/cve/CVE-YYYY-NNNNN/") with the EXACT CVE ID (keep hyphens)
  5. If webpage fails (403 error), continue with analysis using CVE search results only
- Keep variable names simple and avoid complex operations
- ALWAYS use keyword arguments for MCP tools (e.g., owner="user", repo="repo", path="file.py")
- NOTE: CVE format is standard CVE-YYYY-NNNNN (like CVE-2024-54762)
- Example: If you get "CVE-2024-54762", visit "https://www.cvedetails.com/cve/CVE-2024-54762/"
- DO NOT remove hyphens from CVE IDs when visiting CVE Details URLs
- If CVE Details access fails, use the CVE descriptions from simple_cve_search results
"""


def analyze_vulnerabilities_multiagent(message, history, hf_token):
    """Run an agent-based vulnerability analysis on a single GitHub file.

    Despite the name, one ``CodeAgent`` is used; it is given the tools
    returned by the MCP server plus ``visit_webpage``.

    Args:
        message: The user's chat message, expected to be a GitHub *file* URL
            (``https://github.com/<owner>/<repo>/blob/<ref>/<path>``).
        history: Chat history supplied by the chat interface. Unused.
        hf_token: Hugging Face API token used to create the inference model.

    Returns:
        The agent's final answer as a string, or a user-facing message
        starting with "❌" when the token is missing, the URL is not a
        supported file URL, or any step of the analysis raises.
    """
    # Validate HF token input. Also covers None, which would otherwise crash
    # on .strip() outside the try block below.
    if not hf_token or not hf_token.strip():
        return "❌ Please provide a Hugging Face API key. Get one from [Hugging Face](https://huggingface.co/settings/tokens)"

    try:
        owner, repo, file_path = parse_github_url(message)

        if not owner or not repo:
            return "❌ Invalid GitHub URL. Please provide a valid GitHub repository or file URL."

        if not file_path:
            return "❌ Please provide a specific file URL for analysis. Repository-wide analysis is not supported in multi-agent mode."

        # Connect to MCP server for GitHub tools
        mcp_client = MCPClient({
            "url": MCP_SERVER_URL,
            "timeout": 120
        })
        try:
            github_tools = mcp_client.get_tools()

            model = InferenceClientModel(token=hf_token.strip())

            # A single agent with every tool is used instead of a multi-agent
            # setup to avoid tool_choice issues.
            agent = CodeAgent(
                tools=github_tools + [visit_webpage],
                model=model,
                additional_authorized_imports=["re", "requests"],
                max_steps=12
            )

            result = agent.run(
                _build_analysis_prompt(message, owner, repo, file_path)
            )
            return str(result)
        finally:
            # Always release the MCP connection, including when tool
            # discovery or the agent run fails part-way through.
            mcp_client.disconnect()

    except Exception as e:
        # Top-level boundary for the chat UI: report the failure as a message
        # instead of letting the exception propagate.
        return f"❌ Error in multi-agent analysis: {str(e)}\n\nPlease ensure:\n• Valid GitHub file URL (not repository URL)\n• Hugging Face token is correct\n• File is accessible"

# Gradio UI
# Builds the page top to bottom: title and feature description, the API-key
# input, then the chat interface that drives the analysis. Components appear
# in the order they are created inside the Blocks context.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.Markdown("## 🛡️ Enhanced GitHub Vulnerability Scanner")
    gr.Markdown("""
    **Advanced Security Analysis with Web Scraping**
    
    This intelligent vulnerability scanner uses AI agents with web scraping capabilities to perform comprehensive security analysis of GitHub files.
    
    **Key Features:**
    -  **🤖 AI Agent System**: Single agent with multiple tools for efficient analysis
    -  **🌐 Web Scraping**: Automatically visits NVD webpages to get detailed CVE information
    -  **📊 CVE Database Integration**: Searches CVE knowledge base and gets top 3 matches
    -  **🔍 Smart Analysis**: AI-generated vulnerability descriptions (not hardcoded)
    -  **📋 Detailed Reports**: Comprehensive reports with NVD data and remediation advice
    -  **⚠️ Accuracy Disclaimer**: Shows alternative CVEs and warns about AI limitations
    
    **Project Links:**
    - 📂 **Source Code**: [GitHub Repository](https://github.com/banno-0720/vulnerability-scanner)
    - 🔧 **MCP Server**: [Hugging Face Space](https://huggingface.co/spaces/HimanshuGoyal2004/github-mcp-server)
    
    ⚠️ **Important Notice**: This tool is designed for legitimate security research and vulnerability assessment purposes only. Do not use this scanner for malicious activities, unauthorized access, or any illegal purposes. Always ensure you have proper authorization before scanning repositories that don't belong to you.
    """)
    gr.Markdown("---")

    # API Configuration Section
    # The token is entered per session in a password-type textbox and passed
    # to the analysis function on every chat message (see additional_inputs).
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 🔑 API Configuration")
            hf_token_box = gr.Textbox(
                label="🤗 Hugging Face API Key",
                placeholder="Enter your Hugging Face API key for AI model access",
                type="password",
                info="🔗 Get your free key: https://huggingface.co/settings/tokens"
            )

    gr.Markdown("---")
    gr.Markdown("### 💬 Enhanced Security Analysis")
    gr.Markdown("Paste a GitHub **FILE URL** (not repository URL) below to start the enhanced security analysis.")

    # Chatbot Interface
    # The lambda only forwards (message, history, token) unchanged to
    # analyze_vulnerabilities_multiagent.
    # Each example row is [message, hf_token]; the token column is left empty,
    # so the user still has to enter a key before an example can be analysed.
    chatbot = gr.ChatInterface(
        fn=lambda msg, hist, hf_token: analyze_vulnerabilities_multiagent(msg, hist, hf_token),
        additional_inputs=[hf_token_box],
        type="messages",
        examples=[
            ["https://github.com/ayushmittal62/vunreability_scanner_testing/blob/master/database/schema.sql", ""],
            ["https://github.com/ayushmittal62/vunreability_scanner_testing/blob/master/python/database.py", ""],
            ["https://github.com/banno-0720/documentation-agent/blob/main/code.py", ""]
        ],
    )

if __name__ == "__main__":
    # Start the web app only when this file is executed as a script, not when
    # it is imported.
    demo.launch(server_port=7860)  # Different port to avoid conflict with server