import requests
import time
import os
from urllib.parse import urlparse
from treelib import Tree
from typing import Dict, List, Optional, Tuple
import logging
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor, as_completed
from groq import Groq, GroqError
import gradio as gr
from tqdm.auto import tqdm
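
# Third-party packages assumed by the imports above (a plausible
# requirements.txt for this Space): requests, treelib, groq, gradio, tqdm.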

# --- Basic Configuration ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


# --- Data Structures ---
@dataclass
class FileInfo:
    """Data class to store file information."""
    path: str
    name: str
    content: str
    explanation: str
    size: int
    file_type: str


# --- Core Application Logic ---
class GitHubRepositoryAnalyzer:
    """
    A class to analyze GitHub repositories by fetching file structures,
    downloading content, and using an LLM to explain the code.
    """
    def __init__(self, github_token: Optional[str] = None, groq_api_key: Optional[str] = None):
        self.github_token = github_token
        self.session = requests.Session()
        self.file_contents: Dict[str, FileInfo] = {}
        # Configure GitHub API access
        if self.github_token:
            logger.info("Using provided GitHub token for higher rate limits.")
            self.session.headers.update({'Authorization': f'token {self.github_token}'})
            # Authenticated GitHub API: 5000 requests/hour
            self.rate_limiter = RateLimiter(max_calls=4500, time_window=3600)
        else:
            logger.warning("No GitHub token. Using unauthenticated access with lower rate limits.")
            # Unauthenticated: 60 requests/hour
            self.rate_limiter = RateLimiter(max_calls=50, time_window=3600)
        # Configure Groq client
        if groq_api_key:
            self.groq_client = Groq(api_key=groq_api_key)
            logger.info("Groq client initialized.")
        else:
            self.groq_client = None
            logger.warning("Groq API key not provided. Code analysis will be skipped.")
        # File types to analyze
        self.analyzable_extensions = {
            '.py', '.js', '.ts', '.java', '.cpp', '.c', '.h', '.cs', '.php',
            '.rb', '.go', '.rs', '.swift', '.kt', '.scala', '.sh', '.bash',
            '.sql', '.html', '.css', '.scss', '.less', '.json', '.xml', '.yaml', '.yml',
            '.md', '.txt', '.cfg', '.conf', '.ini', '.env', '.dockerfile', 'requirements.txt'
        }
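
    # Example (illustrative): extract_repo_info("https://github.com/google/generative-ai-python")
    # returns ("google", "generative-ai-python").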
    def extract_repo_info(self, repo_url: str) -> Tuple[str, str]:
        """Extract owner and repository name from a GitHub URL."""
        try:
            parsed_url = urlparse(repo_url)
            # Strip only a trailing '.git', not every occurrence in the path.
            path = parsed_url.path.strip('/').removesuffix('.git')
            parts = path.split('/')
            if len(parts) >= 2:
                return parts[0], parts[1]
            raise ValueError("Invalid repository URL format")
        except Exception as e:
            logger.error(f"Error parsing repository URL: {e}")
            raise
    def get_repository_structure(self, owner: str, repo: str, path: str = "") -> List[Dict]:
        """Recursively fetch the entire file structure of the repository."""
        all_files = []
        try:
            contents = self._fetch_contents(owner, repo, path)
            for item in contents:
                if item['type'] == 'dir':
                    all_files.extend(self.get_repository_structure(owner, repo, item['path']))
                else:
                    all_files.append(item)
        except Exception as e:
            logger.error(f"Failed to get repository structure for {path}: {e}")
        return all_files
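
    # Note: requests parses the HTTP Link header into response.links, so the loop
    # below simply follows any rel="next" page the GitHub API returns.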
    def _fetch_contents(self, owner: str, repo: str, path: str) -> List[Dict]:
        """Helper to fetch contents of a specific directory with pagination."""
        url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
        items = []
        while url:
            self.rate_limiter.wait_if_needed()
            response = self.session.get(url)
            response.raise_for_status()
            items.extend(response.json())
            url = response.links.get('next', {}).get('url')
        return items
    def map_reduce_analysis(self, owner: str, repo: str, files: List[Dict], progress: gr.Progress):
        """
        Analyzes files in parallel (Map phase) and aggregates results (Reduce phase).
        This method uses a ThreadPoolExecutor to perform I/O-bound tasks concurrently.
        """
        # --- MAP PHASE ---
        # Each file is processed independently in a separate thread.
        # This is efficient for tasks that wait for network responses (API calls).
        logger.info(f"Starting Map phase: Analyzing {len(files)} files in parallel.")
        with ThreadPoolExecutor(max_workers=10) as executor:
            future_to_file = {
                executor.submit(self._process_single_file, owner, repo, file_item): file_item
                for file_item in files
            }
            # tqdm progress tracking integrated with Gradio
            pbar = tqdm(as_completed(future_to_file), total=len(files), desc="Analyzing Files")
            for future in pbar:
                try:
                    file_info = future.result()
                    if file_info:
                        # Store the result of the map phase
                        self.file_contents[file_info.path] = file_info
                        pbar.set_description(f"Analyzed {file_info.name}")
                    # Update Gradio progress bar
                    progress(pbar.n / pbar.total, desc=pbar.desc)
                except Exception as e:
                    file_item = future_to_file[future]
                    logger.error(f"Error processing {file_item['path']}: {e}")
        # --- REDUCE PHASE ---
        # The reduce phase is the aggregation and structuring of the mapped results,
        # which happens after the loop when creating the tree and summary.
        logger.info("Reduce phase: Aggregating results.")
        tree = self._create_directory_tree(owner, repo)
        details = self._format_detailed_explanations()
        summary = self._format_summary(owner, repo)
        return tree, details, summary
    def _process_single_file(self, owner: str, repo: str, file_item: Dict) -> Optional[FileInfo]:
        """Processes a single file: download, check, and analyze."""
        file_path = file_item['path']
        file_size = file_item.get('size', 0)
        if not self._should_analyze_file(file_path, file_size):
            return None
        content = self._get_raw_file(owner, repo, file_path)
        if content is None:
            return None
        explanation = self._analyze_code_with_llm(content, file_path)
        return FileInfo(
            path=file_path, name=file_item['name'], content=content,
            explanation=explanation, size=file_size, file_type=os.path.splitext(file_path)[1]
        )
    def _should_analyze_file(self, file_path: str, file_size: int) -> bool:
        """Determine if a file should be analyzed based on extension and size."""
        if file_size > 1024 * 1024:
            return False  # Skip files > 1MB
        file_name = os.path.basename(file_path)
        _, file_ext = os.path.splitext(file_name)
        return file_ext.lower() in self.analyzable_extensions or file_name in self.analyzable_extensions
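
    # Note: the method below simply tries the two most common default branches in
    # order; the repo's actual default branch could instead be looked up via the
    # GitHub API (GET /repos/{owner}/{repo}, "default_branch" field).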
    def _get_raw_file(self, owner: str, repo: str, file_path: str) -> Optional[str]:
        """Fetch raw file content with fallback branches."""
        for branch in ['main', 'master']:
            url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{file_path}"
            try:
                response = self.session.get(url, timeout=10)
                if response.status_code == 200:
                    # Simple check for binary content
                    return response.text if '\x00' not in response.text else None
            except (requests.RequestException, UnicodeDecodeError) as e:
                logger.warning(f"Could not fetch or decode {file_path} from branch {branch}: {e}")
        return None
    def _analyze_code_with_llm(self, code: str, file_path: str) -> str:
        """Analyze code using the Groq LLM API."""
        if not self.groq_client:
            return "Analysis skipped: Groq API key not provided."
        max_code_length = 8000
        if len(code) > max_code_length:
            code = code[:max_code_length] + "\n... (truncated)"
        prompt = f"""Analyze the following code from file '{file_path}'. Provide a concise explanation of its functionality, purpose, and key components.
```
{code}
```
Structure your analysis with these points:
1. **Main Purpose**: What is the primary goal of this file?
2. **Key Functions/Classes**: What are the main components and what do they do?
3. **Overall Role**: How does this file fit into the larger project?
"""
        try:
            chat_completion = self.groq_client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama3-8b-8192",
                temperature=0.2,
                max_tokens=1024
            )
            return chat_completion.choices[0].message.content.strip()
        except GroqError as e:
            logger.error(f"Groq API error for {file_path}: {e}")
            return f"Error: Groq API request failed - {e}"
        except Exception as e:
            logger.error(f"Error calling Groq API for {file_path}: {e}")
            return f"Error: {e}"
    def _create_directory_tree(self, owner: str, repo: str) -> str:
        """Creates a string representation of the directory tree."""
        tree = Tree()
        root_id = f"{owner}/{repo}"
        tree.create_node(f"🌳 {root_id}", root_id)
        created_nodes = {root_id}
        for file_path in sorted(self.file_contents.keys()):
            path_parts = file_path.split('/')
            parent_id = root_id
            for i, part in enumerate(path_parts[:-1]):
                node_id = f"{root_id}/{'/'.join(path_parts[:i+1])}"
                if node_id not in created_nodes:
                    tree.create_node(f"📁 {part}", node_id, parent=parent_id)
                    created_nodes.add(node_id)
                parent_id = node_id
            file_name = path_parts[-1]
            file_type = self.file_contents[file_path].file_type
            emoji = self.get_file_emoji(file_type)
            tree.create_node(f"{emoji} {file_name}", f"{root_id}/{file_path}", parent=parent_id)
        return f"```\n{tree.show(line_type='ascii-ex')}\n```"
    def _format_detailed_explanations(self) -> str:
        """Formats all file explanations into a single Markdown string."""
        if not self.file_contents:
            return "No files were analyzed."
        output = []
        for path, info in sorted(self.file_contents.items()):
            output.append(f"### 📄 `{path}`")
            output.append(f"**Size**: {info.size:,} bytes")
            output.append("---")
            output.append(info.explanation)
            output.append("\n---\n")
        return "\n".join(output)
    def _format_summary(self, owner: str, repo: str) -> str:
        """Creates a summary of the analysis."""
        total_files = len(self.file_contents)
        total_size = sum(info.size for info in self.file_contents.values())
        return (
            f"## Analysis Summary for `{owner}/{repo}`\n"
            f"- **Total Files Analyzed**: {total_files}\n"
            f"- **Total Code Size Analyzed**: {total_size:,} bytes"
        )
    @staticmethod
    def get_file_emoji(file_type: str) -> str:
        """Returns an emoji for a given file type."""
        emoji_map = {
            '.py': '🐍', '.js': '🟨', '.ts': '🔷', '.java': '☕', '.html': '🌐',
            '.css': '🎨', '.json': '📋', '.md': '📝', '.sh': '🐚', '.yml': '⚙️',
            '.yaml': '⚙️', '.dockerfile': '🐳', '.sql': '🗄️', 'requirements.txt': '📦'
        }
        return emoji_map.get(file_type.lower(), '📄')


class RateLimiter:
    """Simple rate limiter to avoid exceeding API limits."""
    def __init__(self, max_calls: int, time_window: int):
        self.max_calls = max_calls
        self.time_window = time_window
        self.calls = []

    def wait_if_needed(self):
        now = time.time()
        self.calls = [t for t in self.calls if now - t < self.time_window]
        if len(self.calls) >= self.max_calls:
            sleep_time = self.time_window - (now - self.calls[0])
            if sleep_time > 0:
                logger.info(f"Rate limit reached. Sleeping for {sleep_time:.2f} seconds.")
                time.sleep(sleep_time)
        self.calls.append(time.time())
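
# Example (illustrative): RateLimiter(max_calls=2, time_window=60) lets two calls
# through wait_if_needed() immediately; the third call sleeps until the oldest
# recorded call is more than 60 seconds old, keeping at most 2 calls per window.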


# --- Gradio Interface ---
def analyze_repo_gradio(repo_url: str, github_token: str, groq_key: str, progress=gr.Progress(track_tqdm=True)):
    """The main function executed by the Gradio interface."""
    if not repo_url:
        return "Please enter a GitHub repository URL.", "", ""
    if not groq_key:
        return "Please enter your Groq API Key.", "", ""
    try:
        analyzer = GitHubRepositoryAnalyzer(github_token=github_token, groq_api_key=groq_key)
        progress(0, desc="Extracting repo info...")
        owner, repo = analyzer.extract_repo_info(repo_url)
        progress(0.1, desc="Fetching repository file structure...")
        all_files = analyzer.get_repository_structure(owner, repo)
        if not all_files:
            return "Could not retrieve repository structure. Check URL or token.", "", ""
        tree, details, summary = analyzer.map_reduce_analysis(owner, repo, all_files, progress)
        return tree, details, summary
    except Exception as e:
        logger.error(f"A critical error occurred: {e}", exc_info=True)
        return f"An error occurred: {e}", "", ""


def create_gradio_interface():
    """Builds and returns the Gradio web interface."""
    with gr.Blocks(theme=gr.themes.Soft(), title="GitHub Repo Analyzer") as demo:
        gr.Markdown("# 🤖 AI-Powered GitHub Repository Analyzer")
        gr.Markdown("Enter a GitHub repository URL to generate a file tree, get AI-powered explanations for each file, and see a summary.")
        with gr.Row():
            with gr.Column(scale=2):
                repo_url = gr.Textbox(label="GitHub Repository URL", placeholder="e.g., https://github.com/google/generative-ai-python")
                with gr.Accordion("API Keys (Optional but Recommended)", open=False):
                    github_token = gr.Textbox(label="GitHub Token", placeholder="Enter your GitHub token for a higher rate limit", type="password")
                    groq_key = gr.Textbox(label="Groq API Key", placeholder="Enter your Groq API key for code analysis", type="password")
                analyze_btn = gr.Button("Analyze Repository", variant="primary")
        with gr.Tabs():
            with gr.TabItem("📊 Summary"):
                summary_output = gr.Markdown()
            with gr.TabItem("🌳 File Tree"):
                tree_output = gr.Markdown()
            with gr.TabItem("📋 Detailed Analysis"):
                details_output = gr.Markdown()
        analyze_btn.click(
            fn=analyze_repo_gradio,
            inputs=[repo_url, github_token, groq_key],
            outputs=[tree_output, details_output, summary_output]
        )
    return demo


if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch(debug=True)
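
# When hosted as a Hugging Face Space this script is typically saved as app.py and
# launched automatically; to run it locally, execute the file with Python and open
# the local URL that Gradio prints.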