Baction committed on
Commit
b788ba2
·
verified ·
1 Parent(s): c324061

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +157 -103
app.py CHANGED
@@ -1,13 +1,59 @@
1
  import gradio as gr
2
- from smolagents import InferenceClientModel, CodeAgent, MCPClient
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  # MCP Server URL for GitHub tools
5
  MCP_SERVER_URL = "https://baction-vulnerability-scanner-server.hf.space/gradio_api/mcp/"
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def parse_github_url(url):
8
  """Parse GitHub URL to extract owner, repo, and file path"""
9
- import re
10
-
11
  # Handle repository URLs
12
  repo_pattern = r'https://github\.com/([^/]+)/([^/]+)/?$'
13
  repo_match = re.match(repo_pattern, url.strip())
@@ -22,110 +68,119 @@ def parse_github_url(url):
22
 
23
  return None, None, None
24
 
25
-
26
- def analyze_vulnerabilities(message, history, hf_token):
27
- """Analyze GitHub repository or specific file for vulnerabilities using AI agent"""
28
 
29
  # Validate HF token input
30
  if not hf_token.strip():
31
  return "❌ Please provide a Hugging Face API key. Get one from [Hugging Face](https://huggingface.co/settings/tokens)"
32
 
33
  try:
 
 
 
 
 
 
 
 
 
 
34
  mcp_client = MCPClient({
35
  "url": MCP_SERVER_URL,
36
  "timeout": 120
37
  })
38
- tools = mcp_client.get_tools()
39
 
40
- # Initialize AI model with user's token
41
  model = InferenceClientModel(token=hf_token.strip())
42
 
43
- # Create AI agent with GitHub MCP tools and CVE database
 
 
 
44
  agent = CodeAgent(
45
- tools=tools,
46
- model=model,
47
- additional_authorized_imports=["json", "ast", "urllib", "base64", "re"],
48
  max_steps=12
49
  )
50
 
51
- # Parse the GitHub URL
52
- owner, repo, file_path = parse_github_url(message)
53
-
54
- if not owner or not repo:
55
- return "❌ Invalid GitHub URL. Please provide a valid GitHub repository or file URL."
56
-
57
- # Generate different prompts based on whether it's a file or repository
58
- if file_path:
59
- enhanced_prompt = f"""
60
- You are a cybersecurity expert with access to a comprehensive CVE knowledge base. Analyze the specific GitHub file for security vulnerabilities.
61
  GitHub URL: {message}
62
  Repository: {owner}/{repo}
63
  File Path: {file_path}
64
- Please follow this enhanced analysis workflow:
65
- 1. **Repository & File Analysis**:
66
- - Get repository information to verify it exists
67
- - Get the content of the specific file: {file_path}
68
- - Identify the programming language and framework used
69
- 2. **CVE Knowledge Base Research**:
70
- - Use the search_cve_database tool to search for relevant vulnerability patterns based on the code you find
71
- - Search for common weaknesses related to the programming language/framework
72
- - Look up specific vulnerability types you identify in the code
73
- 3. **Comprehensive Security Analysis**:
74
- - Command injection: os.system, exec, eval calls
75
- - Input validation: unvalidated user inputs, missing sanitization
76
- - Error handling: unhandled exceptions that could leak info
77
- - Hardcoded secrets: API keys, passwords, tokens, database credentials
78
- - Unsafe operations: file operations without validation
79
- - Authentication/authorization flaws
80
- - Cross-site scripting (XSS) vulnerabilities
81
- - SQL injection vulnerabilities
82
- 4. **Enhanced Security Report**:
83
- - 🔍 **File Overview** (path, language, size, framework)
84
- - 📊 **Vulnerability Summary** (counts by severity with CWE mappings)
85
- - 🚨 **Detailed Findings** with:
86
- - Line numbers and code snippets
87
- - **CWE Classification** from CVE knowledge base
88
- - **CVSS Severity** based on similar CVEs
89
- - Security impact and exploitation scenarios
90
- - **Remediation advice** with best practices
91
- - **Related CVE examples** from knowledge base
92
- Use the search_cve_database tool extensively to provide context-aware analysis based on real-world vulnerability data.
93
- """
94
- else:
95
- enhanced_prompt = f"""
96
- You are a cybersecurity expert with access to a comprehensive CVE knowledge base. Analyze the GitHub repository for security vulnerabilities.
97
- Repository: {message}
98
- Please follow this enhanced analysis workflow:
99
- 1. **Repository Discovery**:
100
- - Get repository information to verify it exists and understand the tech stack
101
- - Scan for code files (.py, .js, .ts, .php, .java, .cpp, .c, .cs, .go, .rb, .rs, .swift, .kt, .scala, .sh, .bash, .ps1, .ipynb, .sql, .xml, .yaml, .yml, .json, .config, .ini, .env)
102
- - Prioritize the most critical files (main application files, configuration files, database schemas)
103
- 2. **CVE Knowledge Base Research**:
104
- - Use the search_cve_database tool to research common vulnerabilities for the identified tech stack
105
- - Search for framework-specific vulnerabilities (e.g., "Django SQL injection", "React XSS", "Node.js command injection")
106
- - Look up configuration-related vulnerabilities for the technologies used
107
- 3. **Comprehensive Security Analysis** (analyze 5-8 most important files):
108
- - **Injection Vulnerabilities**: SQL injection, command injection, code injection
109
- - **Input Validation**: Unvalidated inputs, missing sanitization, parameter tampering
110
- - **Authentication & Authorization**: Broken access controls, session management
111
- - **Data Exposure**: Hardcoded secrets, information disclosure, insecure storage
112
- - **Configuration Issues**: Debug mode, insecure defaults, missing security headers
113
- - **Framework-Specific**: Technology-specific vulnerability patterns from CVE database
114
- 4. **Enhanced Security Report**:
115
- - 🔍 **Repository Overview** (tech stack, architecture, security posture)
116
- - 📁 **Files Analyzed** (prioritized list with rationale)
117
- - 📊 **Vulnerability Summary** with CWE classifications and CVSS scores
118
- - 🚨 **Detailed Findings** including:
119
- - File paths and line numbers
120
- - **CWE Classification** from CVE knowledge base
121
- - **Severity Assessment** based on CVSS scores from similar CVEs
122
- - Code snippets and exploitation scenarios
123
- - **Remediation Strategies** with best practices
124
- - **Related CVE References** for context
125
- Use the search_cve_database tool extensively to provide evidence-based analysis grounded in real-world vulnerability data.
 
 
 
126
  """
127
 
128
- # Run the AI agent analysis
129
  result = agent.run(enhanced_prompt)
130
 
131
  # Disconnect MCP client
@@ -134,24 +189,23 @@ Use the search_cve_database tool extensively to provide evidence-based analysis
134
  return str(result)
135
 
136
  except Exception as e:
137
- return f"❌ Error analyzing repository: {str(e)}\n\nPlease ensure:\n• Valid GitHub repository URL\n• Hugging Face token is correct\n• Repository is accessible"
138
-
139
 
140
  # Gradio UI
141
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
142
- gr.Markdown("## 🛡️ AI-Powered GitHub Vulnerability Scanner")
143
  gr.Markdown("""
144
- **Advanced Security Analysis Tool for GitHub Repositories**
145
 
146
- This intelligent vulnerability scanner leverages cutting-edge AI agents and Model Context Protocol (MCP) tools to perform comprehensive security analysis of GitHub repositories and individual files.
147
 
148
  **Key Features:**
149
- - **🤖 AI-Powered Analysis**: Uses advanced language models with agentic RAG for intelligent vulnerability detection
150
- - **📊 CVE Knowledge Base**: Leverages real CVE data to provide CWE classifications and CVSS severity scores
151
- - **🔍 Deep Code Analysis**: Scans for SQL injection, XSS, command injection, and framework-specific vulnerabilities
152
- - **📁 Repository & File Support**: Analyze entire repositories or focus on specific files
153
- - **📋 Enhanced Reports**: Comprehensive security reports with CVE references, CWE mappings, and remediation strategies
154
- - **🔒 Secure Processing**: Your API keys are used securely and never stored
155
 
156
  **Project Links:**
157
  - 📂 **Source Code**: [GitHub Repository](https://github.com/banno-0720/vulnerability-scanner)
@@ -173,20 +227,20 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
173
  )
174
 
175
  gr.Markdown("---")
176
- gr.Markdown("### 💬 Security Analysis Chat")
177
- gr.Markdown("Paste any GitHub repository or file URL below to start the security analysis.")
178
 
179
  # Chatbot Interface
180
  chatbot = gr.ChatInterface(
181
- fn=lambda msg, hist, hf_token: analyze_vulnerabilities(msg, hist, hf_token),
182
  additional_inputs=[hf_token_box],
183
  type="messages",
184
  examples=[
185
- ["https://github.com/ayushmittal62/vunreability_scanner_testing", ""],
186
- ["https://github.com/ayushmittal62/vunreability_scanner_testing/blob/master/database/schema.sql", ""],
187
- ["https://github.com/ayushmittal62/vunreability_scanner_testing/blob/master/python/database.py", ""]
188
  ],
189
  )
190
 
191
  if __name__ == "__main__":
192
- demo.launch()
 
1
  import gradio as gr
2
+ import re
3
+ import requests
4
+ from markdownify import markdownify
5
+ from requests.exceptions import RequestException
6
+ from smolagents import (
7
+ CodeAgent,
8
+ ToolCallingAgent,
9
+ InferenceClientModel,
10
+ WebSearchTool,
11
+ MCPClient,
12
+ tool
13
+ )
14
 
15
  # MCP Server URL for GitHub tools
16
  MCP_SERVER_URL = "https://baction-vulnerability-scanner-server.hf.space/gradio_api/mcp/"
17
 
18
@tool
def visit_webpage(url: str) -> str:
    """Fetches the webpage at the given URL and returns its content rendered as Markdown.

    Args:
        url: The URL of the webpage to visit.

    Returns:
        The page content converted to Markdown, capped at 5000 characters
        (a truncation notice is appended when the cap is hit), or an error
        message string if the request or the conversion fails.
    """
    # Present a browser-like User-Agent so the request is less likely to be blocked.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Upper bound on returned characters so the agent is not flooded with text.
    char_limit = 5000

    try:
        page = requests.get(url, headers=browser_headers, timeout=30)
        # Turn 4xx/5xx responses into an exception handled below.
        page.raise_for_status()

        # HTML -> Markdown, then squeeze runs of 3+ newlines down to one blank line.
        text = re.sub(r"\n{3,}", "\n\n", markdownify(page.text).strip())
    except RequestException as err:
        return f"Error fetching the webpage: {err}"
    except Exception as err:
        # Errors are reported as text rather than raised, matching the handler above.
        return f"An unexpected error occurred: {err}"

    if len(text) <= char_limit:
        return text
    return text[:char_limit] + "\n\n[Content truncated due to length...]"
54
+
55
  def parse_github_url(url):
56
  """Parse GitHub URL to extract owner, repo, and file path"""
 
 
57
  # Handle repository URLs
58
  repo_pattern = r'https://github\.com/([^/]+)/([^/]+)/?$'
59
  repo_match = re.match(repo_pattern, url.strip())
 
68
 
69
  return None, None, None
70
 
71
+ def analyze_vulnerabilities_multiagent(message, history, hf_token):
72
+ """Multi-agent vulnerability analysis with web scraping capabilities"""
 
73
 
74
  # Validate HF token input
75
  if not hf_token.strip():
76
  return "❌ Please provide a Hugging Face API key. Get one from [Hugging Face](https://huggingface.co/settings/tokens)"
77
 
78
  try:
79
+ # Parse the GitHub URL
80
+ owner, repo, file_path = parse_github_url(message)
81
+
82
+ if not owner or not repo:
83
+ return "❌ Invalid GitHub URL. Please provide a valid GitHub repository or file URL."
84
+
85
+ if not file_path:
86
+ return "❌ Please provide a specific file URL for analysis. Repository-wide analysis is not supported in multi-agent mode."
87
+
88
+ # Connect to MCP server for GitHub tools
89
  mcp_client = MCPClient({
90
  "url": MCP_SERVER_URL,
91
  "timeout": 120
92
  })
93
+ github_tools = mcp_client.get_tools()
94
 
95
+ # Initialize AI model
96
  model = InferenceClientModel(token=hf_token.strip())
97
 
98
+ # Create a single agent with all tools (simpler approach)
99
+ all_tools = github_tools + [visit_webpage]
100
+
101
+ # Create single agent instead of multi-agent to avoid tool_choice issues
102
  agent = CodeAgent(
103
+ tools=all_tools,
104
+ model=model,
105
+ additional_authorized_imports=["re", "requests"],
106
  max_steps=12
107
  )
108
 
109
+ # Simplified prompt for single agent analysis
110
+ enhanced_prompt = f"""
111
+ You are a cybersecurity expert. Analyze this GitHub file for security vulnerabilities.
112
+
 
 
 
 
 
 
113
  GitHub URL: {message}
114
  Repository: {owner}/{repo}
115
  File Path: {file_path}
116
+
117
+ **ANALYSIS STEPS:**
118
+
119
+ 1. **Get File Data**:
120
+ - Use get_repository_info with owner="{owner}", repo="{repo}"
121
+ - Use get_file_content with owner="{owner}", repo="{repo}", path="{file_path}"
122
+
123
+ 2. **Find Vulnerabilities**:
124
+ Analyze code for:
125
+ - SQL injection patterns
126
+ - Command injection (eval, exec, os.system)
127
+ - XSS vulnerabilities
128
+ - Path traversal
129
+ - Hardcoded secrets
130
+ - Input validation issues
131
+
132
+ 3. **CVE Research**:
133
+ - Search for CVEs: simple_cve_search("SQL injection", 3)
134
+ - Extract CVE IDs from the string result using regex
135
+ - Visit NVD for the first CVE: visit_webpage("https://nvd.nist.gov/vuln/detail/CVE-XXXX-XXXX")
136
+ - Include the full NVD webpage content in your report
137
+
138
+ 4. **Generate Report**:
139
+
140
+ # 🛡️ Security Analysis Report
141
+
142
+ ## 🔍 File Overview
143
+ - **Path**: {file_path}
144
+ - **Repository**: {owner}/{repo}
145
+
146
+ ## 🚨 Vulnerabilities Found
147
+ [List vulnerabilities with line numbers]
148
+
149
+ ## 📊 CVE Research
150
+ **Top Related CVE**: [First CVE ID from regex extraction]
151
+ **CVE Details Webpage Content**: [Complete content from visit_webpage call]
152
+ **Key Details from CVE Details**: [CVSS score, attack vector, impact extracted from webpage]
153
+
154
+ ## ⚠️ Other Possible CVEs
155
+ [Show other 2 CVE IDs from search]
156
+
157
+ ## 🛠️ Remediation
158
+ [Specific fixes]
159
+
160
+ ## ⚠️ Disclaimer
161
+ AI analysis may not be 100% accurate. Manual security review recommended.
162
+
163
+ **REMEMBER**: Always call visit_webpage for the first CVE ID to get detailed CVE information!
164
+
165
+ **CRITICAL INSTRUCTIONS**:
166
+ - simple_cve_search returns a STRING with CVE IDs and descriptions
167
+ - Extract CVE IDs using: re.findall(r'CVE-\d{4}-\d+', cve_search_string)
168
+ - TRY to visit CVE Details webpage for the first CVE ID found (more reliable than NVD)
169
+ - Use this exact pattern:
170
+ 1. Call simple_cve_search("SQL injection", 3)
171
+ 2. Extract CVE IDs with regex from the returned string
172
+ 3. Take the first CVE ID from the list
173
+ 4. Call visit_webpage("https://www.cvedetails.com/cve/CVE-YYYY-NNNNN/") with the EXACT CVE ID (keep hyphens)
174
+ 5. If webpage fails (403 error), continue with analysis using CVE search results only
175
+ - Keep variable names simple and avoid complex operations
176
+ - ALWAYS use keyword arguments for MCP tools (e.g., owner="user", repo="repo", path="file.py")
177
+ - NOTE: CVE format is standard CVE-YYYY-NNNNN (like CVE-2024-54762)
178
+ - Example: If you get "CVE-2024-54762", visit "https://www.cvedetails.com/cve/CVE-2024-54762/"
179
+ - DO NOT remove hyphens from CVE IDs when visiting CVE Details URLs
180
+ - If CVE Details access fails, use the CVE descriptions from simple_cve_search results
181
  """
182
 
183
+ # Run the agent analysis
184
  result = agent.run(enhanced_prompt)
185
 
186
  # Disconnect MCP client
 
189
  return str(result)
190
 
191
  except Exception as e:
192
+ return f"❌ Error in multi-agent analysis: {str(e)}\n\nPlease ensure:\n• Valid GitHub file URL (not repository URL)\n• Hugging Face token is correct\n• File is accessible"
 
193
 
194
  # Gradio UI
195
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
196
+ gr.Markdown("## 🛡️ Enhanced GitHub Vulnerability Scanner")
197
  gr.Markdown("""
198
+ **Advanced Security Analysis with Web Scraping**
199
 
200
+ This intelligent vulnerability scanner uses AI agents with web scraping capabilities to perform comprehensive security analysis of GitHub files.
201
 
202
  **Key Features:**
203
+ - **🤖 AI Agent System**: Single agent with multiple tools for efficient analysis
204
+ - **🌐 Web Scraping**: Automatically visits NVD webpages to get detailed CVE information
205
+ - **📊 CVE Database Integration**: Searches CVE knowledge base and gets top 3 matches
206
+ - **🔍 Smart Analysis**: AI-generated vulnerability descriptions (not hardcoded)
207
+ - **📋 Detailed Reports**: Comprehensive reports with NVD data and remediation advice
208
+ - **⚠️ Accuracy Disclaimer**: Shows alternative CVEs and warns about AI limitations
209
 
210
  **Project Links:**
211
  - 📂 **Source Code**: [GitHub Repository](https://github.com/banno-0720/vulnerability-scanner)
 
227
  )
228
 
229
  gr.Markdown("---")
230
+ gr.Markdown("### 💬 Enhanced Security Analysis")
231
+ gr.Markdown("Paste a GitHub **FILE URL** (not repository URL) below to start the enhanced security analysis.")
232
 
233
  # Chatbot Interface
234
  chatbot = gr.ChatInterface(
235
+ fn=lambda msg, hist, hf_token: analyze_vulnerabilities_multiagent(msg, hist, hf_token),
236
  additional_inputs=[hf_token_box],
237
  type="messages",
238
  examples=[
239
+ ["https://github.com/ayushmittal62/vunreability_scanner_testing/blob/master/database/schema.sql", ""],
240
+ ["https://github.com/ayushmittal62/vunreability_scanner_testing/blob/master/python/database.py", ""],
241
+ ["https://github.com/banno-0720/documentation-agent/blob/main/code.py", ""]
242
  ],
243
  )
244
 
245
  if __name__ == "__main__":
246
+ demo.launch(server_port=7860) # Different port to avoid conflict with server