"""
Backend parser utilities for AnyCoder.

Handles parsing of various code formats, including transformers.js output,
multi-file Python output, and more.
"""

import re
import ast
from typing import Dict, List, Optional

from backend_models import get_inference_client, get_real_model_id


def parse_transformers_js_output(code: str) -> Dict[str, str]:
    """Parse transformers.js output into separate files (index.html, index.js, style.css).

    Tries several parsing strategies to handle the various formats LLMs emit:
    fenced code blocks, `=== filename ===` sections, and numbered/markdown
    headings. Rewrites any transformers.js CDN import to the v3.8.0 CDN.
    """
    print(f"[Parser] Received code length: {len(code)} characters")
    print(f"[Parser] First 200 chars: {code[:200]}")

    files = {
        'index.html': '',
        'index.js': '',
        'style.css': ''
    }

    # Strategy 1: fenced code blocks (```html, ```js, ```css and variants).
    # \Z lets a block match even when the closing fence was truncated.
    html_patterns = [
        r'```html\s*\n([\s\S]*?)(?:```|\Z)',
        r'```htm\s*\n([\s\S]*?)(?:```|\Z)',
        r'```\s*(?:index\.html|html)\s*\n([\s\S]*?)(?:```|\Z)'
    ]
    js_patterns = [
        r'```javascript\s*\n([\s\S]*?)(?:```|\Z)',
        r'```js\s*\n([\s\S]*?)(?:```|\Z)',
        r'```\s*(?:index\.js|javascript|js)\s*\n([\s\S]*?)(?:```|\Z)'
    ]
    css_patterns = [
        r'```css\s*\n([\s\S]*?)(?:```|\Z)',
        r'```\s*(?:style\.css|css)\s*\n([\s\S]*?)(?:```|\Z)'
    ]

    for pattern in html_patterns:
        html_match = re.search(pattern, code, re.IGNORECASE)
        if html_match:
            files['index.html'] = html_match.group(1).strip()
            break

    for pattern in js_patterns:
        js_match = re.search(pattern, code, re.IGNORECASE)
        if js_match:
            files['index.js'] = js_match.group(1).strip()
            break

    for pattern in css_patterns:
        css_match = re.search(pattern, code, re.IGNORECASE)
        if css_match:
            files['style.css'] = css_match.group(1).strip()
            break

    # Strategy 2: `=== filename ===` sections. Each section ends at the next
    # section marker, a horizontal rule, a trailing prose paragraph, or the
    # end of the output.
    if not (files['index.html'] and files['index.js'] and files['style.css']):
        section_end = r'(?=\n===|\n\s*---|\n\n(?:This |✨|🎨|🚀|\*\*Key Features|\*\*Design)|$)'
        html_fallback = re.search(
            r'===\s*index\.html\s*===\s*\n([\s\S]+?)' + section_end,
            code, re.IGNORECASE)
        js_fallback = re.search(
            r'===\s*(?:index\.js|app\.js)\s*===\s*\n([\s\S]+?)' + section_end,
            code, re.IGNORECASE)
        css_fallback = re.search(
            r'===\s*(?:style\.css|styles\.css)\s*===\s*\n([\s\S]+?)' + section_end,
            code, re.IGNORECASE)

        print(f"[Parser] Fallback extraction - HTML found: {bool(html_fallback)}, JS found: {bool(js_fallback)}, CSS found: {bool(css_fallback)}")

        if html_fallback:
            files['index.html'] = html_fallback.group(1).strip()
        if js_fallback:
            js_content = js_fallback.group(1).strip()
            # Heuristic repair for string literals the LLM broke across lines:
            # turn `"` + newline + text into `" + "` + text so the JS parses.
            js_content = re.sub(r'"\s*\n\s*([^"])', r'" + "\1', js_content)
            files['index.js'] = js_content
        if css_fallback:
            files['style.css'] = css_fallback.group(1).strip()

    # Normalize a styles.css reference to the style.css filename we emit.
    if files['index.html'] and 'styles.css' in files['index.html']:
        print("[Parser] Normalizing styles.css reference to style.css in HTML")
        files['index.html'] = files['index.html'].replace('href="styles.css"', 'href="style.css"')
        files['index.html'] = files['index.html'].replace("href='styles.css'", "href='style.css'")

    # Strategy 3: numbered, `##` heading, or `**bold**` filename sections.
    if not (files['index.html'] and files['index.js'] and files['style.css']):
        patterns = [
            (r'(?:^\d+\.\s*|^##\s*|^\*\*\s*)index\.html(?:\s*:|\*\*:?)\s*\n([\s\S]+?)(?=\n(?:\d+\.|##|\*\*|===)|$)', 'index.html'),
            (r'(?:^\d+\.\s*|^##\s*|^\*\*\s*)(?:index\.js|app\.js)(?:\s*:|\*\*:?)\s*\n([\s\S]+?)(?=\n(?:\d+\.|##|\*\*|===)|$)', 'index.js'),
            (r'(?:^\d+\.\s*|^##\s*|^\*\*\s*)(?:style\.css|styles\.css)(?:\s*:|\*\*:?)\s*\n([\s\S]+?)(?=\n(?:\d+\.|##|\*\*|===)|$)', 'style.css')
        ]

        for pattern, file_key in patterns:
            if not files[file_key]:
                match = re.search(pattern, code, re.IGNORECASE | re.MULTILINE)
                if match:
                    # Strip any code fences wrapped around the section body.
                    content = match.group(1).strip()
                    content = re.sub(r'^```\w*\s*\n', '', content)
                    content = re.sub(r'\n```\s*$', '', content)
                    files[file_key] = content.strip()

    # Re-run filename normalization for files found by the later strategies.
    if files['index.html'] and files['style.css']:
        if 'styles.css' in files['index.html']:
            print("[Parser] Normalizing styles.css reference to style.css in HTML")
            files['index.html'] = files['index.html'].replace('href="styles.css"', 'href="style.css"')
            files['index.html'] = files['index.html'].replace("href='styles.css'", "href='style.css'")

    if files['index.html'] and files['index.js']:
        if 'app.js' in files['index.html']:
            print("[Parser] Normalizing app.js reference to index.js in HTML")
            files['index.html'] = files['index.html'].replace('src="app.js"', 'src="index.js"')
            files['index.html'] = files['index.html'].replace("src='app.js'", "src='index.js'")

    # Pin any transformers.js CDN import (old @xenova packages or other
    # @huggingface versions) to the v3.8.0 CDN.
    cdn_url = "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.8.0"

    for file_key in ['index.html', 'index.js']:
        if files[file_key]:
            content = files[file_key]
            content = re.sub(
                r"from\s+['\"]https://cdn\.jsdelivr\.net/npm/@huggingface/transformers@[^'\"]+['\"]",
                f"from '{cdn_url}'",
                content
            )
            content = re.sub(
                r"from\s+['\"]https://cdn\.jsdelivr\.net/npm/@xenova/transformers@[^'\"]+['\"]",
                f"from '{cdn_url}'",
                content
            )
            files[file_key] = content

    return files
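

# Illustrative usage sketch. The sample below assumes one common LLM response
# shape (three fenced blocks); real outputs vary, which is why the parser
# tries several strategies:
#
#   sample = (
#       "```html\n<h1>Hi</h1>\n```\n"
#       "```js\nconsole.log('hi');\n```\n"
#       "```css\nh1 { color: red; }\n```"
#   )
#   files = parse_transformers_js_output(sample)
#   # files == {'index.html': '<h1>Hi</h1>',
#   #           'index.js': "console.log('hi');",
#   #           'style.css': 'h1 { color: red; }'}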


def parse_html_code(code: str) -> str:
    """Extract HTML code from various formats."""
    code = code.strip()

    # Raw HTML document: return as-is.
    if code.startswith('<!DOCTYPE') or code.startswith('<html'):
        return code

    # HTML inside an ```html fence.
    if '```html' in code:
        match = re.search(r'```html\s*(.*?)\s*```', code, re.DOTALL)
        if match:
            return match.group(1).strip()

    # HTML inside a generic fence.
    if '```' in code:
        match = re.search(r'```\s*(.*?)\s*```', code, re.DOTALL)
        if match:
            return match.group(1).strip()

    return code
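

# Illustrative sketch (assumed input shape):
#
#   parse_html_code("```html\n<!DOCTYPE html><html></html>\n```")
#   # -> '<!DOCTYPE html><html></html>'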


def parse_python_requirements(code: str) -> Optional[str]:
    """Extract requirements.txt content from code if present."""
    # Look for a `=== requirements.txt ===` section; it ends at the next
    # `===` marker or at the end of the output.
    req_pattern = r'===\s*requirements\.txt\s*===\s*(.*?)(?====|$)'
    match = re.search(req_pattern, code, re.DOTALL | re.IGNORECASE)

    if match:
        requirements = match.group(1).strip()
        # Strip any code fences wrapped around the section body.
        requirements = re.sub(r'^```\w*\s*', '', requirements, flags=re.MULTILINE)
        requirements = re.sub(r'```\s*$', '', requirements, flags=re.MULTILINE)
        return requirements

    return None
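

# Illustrative sketch (assumed input shape):
#
#   parse_python_requirements("=== requirements.txt ===\ngradio\nnumpy\n")
#   # -> 'gradio\nnumpy'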


def parse_multi_file_python_output(code: str) -> Dict[str, str]:
    """Parse multi-file Python output (e.g., Gradio, Streamlit)."""
    files = {}

    # Match `=== filename.py ===` / `=== filename.txt ===` sections; each
    # section ends at the next `===` marker or at the end of the output.
    pattern = r'===\s*(\S+\.(?:py|txt))\s*===\s*(.*?)(?====|$)'
    matches = re.finditer(pattern, code, re.DOTALL | re.IGNORECASE)

    for match in matches:
        filename = match.group(1).strip()
        content = match.group(2).strip()

        # Strip any code fences wrapped around the section body.
        content = re.sub(r'^```\w*\s*', '', content, flags=re.MULTILINE)
        content = re.sub(r'```\s*$', '', content, flags=re.MULTILINE)

        files[filename] = content

    return files
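

# Illustrative sketch (assumed input shape):
#
#   parse_multi_file_python_output(
#       "=== app.py ===\nimport gradio\n=== requirements.txt ===\ngradio\n")
#   # -> {'app.py': 'import gradio', 'requirements.txt': 'gradio'}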


def strip_tool_call_markers(text: str) -> str:
    """Remove TOOL_CALL markers and thinking tags that some LLMs add to their output."""
    if not text:
        return text

    # [TOOL_CALL] / [/TOOL_CALL] markers.
    text = re.sub(r'\[/?TOOL_CALL\]', '', text, flags=re.IGNORECASE)
    # Complete <think>...</think> blocks.
    text = re.sub(r'<think>[\s\S]*?</think>', '', text, flags=re.IGNORECASE)
    # An unclosed <think> at the start of a line (strip to end of line).
    text = re.sub(r'^<think>[\s\S]*?(?=\n|$)', '', text, flags=re.IGNORECASE | re.MULTILINE)
    # Any stray closing tags.
    text = re.sub(r'</think>', '', text, flags=re.IGNORECASE)
    # Stray `}}` lines left over from malformed tool-call JSON.
    text = re.sub(r'^\s*\}\}\s*$', '', text, flags=re.MULTILINE)
    return text.strip()
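

# Illustrative sketch:
#
#   strip_tool_call_markers("<think>plan</think>\n[TOOL_CALL]print('hi')[/TOOL_CALL]")
#   # -> "print('hi')"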


def remove_code_block(text: str) -> str:
    """Remove code block markers from text."""
    # Clean tool-call markers and thinking tags first.
    text = strip_tool_call_markers(text)

    # Try the most specific fence pattern first, then fall back to looser ones.
    patterns = [
        r'```(?:html|HTML)\n([\s\S]+?)\n```',
        r'```\n([\s\S]+?)\n```',
        r'```([\s\S]+?)```'
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.DOTALL)
        if match:
            extracted = match.group(1).strip()
            # Drop a leading language identifier line (e.g. from ```python).
            first_line = extracted.split('\n', 1)[0].strip().lower()
            if first_line in ['python', 'html', 'css', 'javascript', 'json', 'c', 'cpp', 'markdown', 'latex', 'jinja2', 'typescript', 'yaml', 'dockerfile', 'shell', 'r', 'sql']:
                return extracted.split('\n', 1)[1] if '\n' in extracted else ''
            return extracted

    return text.strip()
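

# Illustrative sketch:
#
#   remove_code_block("```python\nprint('hi')\n```")
#   # -> "print('hi')"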


def extract_import_statements(code: str) -> List[str]:
    """Extract third-party import statements from generated code."""
    import_statements = []

    # Standard-library modules to exclude from the result.
    builtin_modules = {
        'os', 'sys', 'json', 'time', 'datetime', 'random', 'math', 're', 'collections',
        'itertools', 'functools', 'pathlib', 'urllib', 'http', 'email', 'html', 'xml',
        'csv', 'tempfile', 'shutil', 'subprocess', 'threading', 'multiprocessing',
        'asyncio', 'logging', 'typing', 'base64', 'hashlib', 'secrets', 'uuid',
        'copy', 'pickle', 'io', 'contextlib', 'warnings', 'sqlite3', 'gzip', 'zipfile',
        'tarfile', 'socket', 'ssl', 'platform', 'getpass', 'pwd', 'grp', 'stat',
        'glob', 'fnmatch', 'linecache', 'traceback', 'inspect', 'keyword', 'token',
        'tokenize', 'ast', 'code', 'codeop', 'dis', 'py_compile', 'compileall',
        'importlib', 'pkgutil', 'modulefinder', 'runpy', 'site', 'sysconfig'
    }

    try:
        # Preferred path: walk the AST for import nodes.
        tree = ast.parse(code)
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                for alias in node.names:
                    module_name = alias.name.split('.')[0]
                    if module_name not in builtin_modules and not module_name.startswith('_'):
                        import_statements.append(f"import {alias.name}")
            elif isinstance(node, ast.ImportFrom):
                if node.module:
                    module_name = node.module.split('.')[0]
                    if module_name not in builtin_modules and not module_name.startswith('_'):
                        names = [alias.name for alias in node.names]
                        import_statements.append(f"from {node.module} import {', '.join(names)}")
    except SyntaxError:
        # Fallback for unparseable code: scan lines that look like imports.
        for line in code.split('\n'):
            line = line.strip()
            if line.startswith('import ') or line.startswith('from '):
                parts = line.split()
                if len(parts) < 2:
                    continue
                module_name = parts[1].split('.')[0]
                if module_name not in builtin_modules and not module_name.startswith('_'):
                    import_statements.append(line)

    # De-duplicate; sort for a deterministic result.
    return sorted(set(import_statements))
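

# Illustrative sketch:
#
#   extract_import_statements("import gradio\nimport os\nfrom PIL import Image")
#   # -> ['from PIL import Image', 'import gradio']   ('os' is stdlib, so skipped)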


def generate_requirements_txt_with_llm(import_statements: List[str]) -> str:
    """Generate requirements.txt content using an LLM, based on import statements."""
    if not import_statements:
        return "# No additional dependencies required\n"

    try:
        client = get_inference_client("zai-org/GLM-4.6", "auto")
        actual_model_id = get_real_model_id("zai-org/GLM-4.6")

        imports_text = '\n'.join(import_statements)

        prompt = f"""Based on the following Python import statements, generate a comprehensive requirements.txt file with all necessary and commonly used related packages:

{imports_text}

Instructions:
- Include the direct packages needed for the imports
- Include commonly used companion packages and dependencies for better functionality
- Use correct PyPI package names (e.g., PIL -> Pillow, sklearn -> scikit-learn)
- IMPORTANT: For diffusers, ALWAYS use: git+https://github.com/huggingface/diffusers
- IMPORTANT: For transformers, ALWAYS use: git+https://github.com/huggingface/transformers
- IMPORTANT: If diffusers is installed, also include transformers and sentencepiece as they usually go together
- Examples of comprehensive dependencies:
  * diffusers often needs: git+https://github.com/huggingface/transformers, sentencepiece, accelerate, torch, tokenizers
  * transformers often needs: accelerate, torch, tokenizers, datasets
  * gradio often needs: gradio>=6.0, requests, Pillow for image handling (ALWAYS use gradio>=6.0)
  * pandas often needs: numpy, openpyxl for Excel files
  * matplotlib often needs: numpy, pillow for image saving
  * sklearn often needs: numpy, scipy, joblib
  * streamlit often needs: pandas, numpy, requests
  * opencv-python often needs: numpy, pillow
  * fastapi often needs: uvicorn, pydantic
  * torch often needs: torchvision, torchaudio (if doing computer vision/audio)
- Include packages for common file formats if relevant (openpyxl, python-docx, PyPDF2)
- Do not include Python built-in modules
- Do not specify versions unless there are known compatibility issues
- One package per line
- If no external packages are needed, return "# No additional dependencies required"

🚨 CRITICAL OUTPUT FORMAT:
- Output ONLY the package names, one per line (plain text format)
- Do NOT use markdown formatting (no ```, no bold, no headings, no lists)
- Do NOT add any explanatory text before or after the package list
- Do NOT wrap the output in code blocks
- Just output raw package names as they would appear in requirements.txt

Generate a comprehensive requirements.txt that ensures the application will work smoothly:"""

        messages = [
            {"role": "system", "content": "You are a Python packaging expert specializing in creating comprehensive, production-ready requirements.txt files. Output ONLY plain text package names without any markdown formatting, code blocks, or explanatory text. Your goal is to ensure applications work smoothly by including not just direct dependencies but also commonly needed companion packages, popular extensions, and supporting libraries that developers typically need together."},
            {"role": "user", "content": prompt}
        ]

        response = client.chat.completions.create(
            model=actual_model_id,
            messages=messages,
            max_tokens=1024,
            temperature=0.1
        )

        requirements_content = response.choices[0].message.content.strip()

        # Strip code fences in case the model ignored the format instructions.
        if '```' in requirements_content:
            requirements_content = remove_code_block(requirements_content)

        # Filter out any remaining markdown or explanatory lines, keeping only
        # lines that look like requirements.txt entries.
        lines = requirements_content.split('\n')
        clean_lines = []
        for line in lines:
            stripped_line = line.strip()

            # Skip fences, markdown markup, section rules, prose, and blanks.
            if (stripped_line.startswith('```') or
                    (stripped_line.startswith('#') and not stripped_line.startswith('# ')) or
                    stripped_line.startswith('**') or
                    (stripped_line.startswith('*') and not stripped_line[1:2].isalnum()) or
                    (stripped_line.startswith('-') and not stripped_line[1:2].isalnum()) or
                    stripped_line.startswith('===') or
                    stripped_line.startswith('---') or
                    stripped_line.lower().startswith('here') or
                    stripped_line.lower().startswith('this') or
                    stripped_line.lower().startswith('the') or
                    stripped_line.lower().startswith('based on') or
                    stripped_line == ''):
                continue

            # Keep comments, VCS requirements, package names, and pinned versions.
            if (stripped_line.startswith('# ') or
                    stripped_line.startswith('git+') or
                    stripped_line[0].isalnum() or
                    '==' in stripped_line or
                    '>=' in stripped_line or
                    '<=' in stripped_line):
                clean_lines.append(line)

        requirements_content = '\n'.join(clean_lines).strip()

        # Ensure the file ends with a newline.
        if requirements_content and not requirements_content.endswith('\n'):
            requirements_content += '\n'

        return requirements_content if requirements_content else "# No additional dependencies required\n"

    except Exception as e:
        # Fallback: naive module-to-package mapping without the LLM.
        print(f"[Parser] Warning: LLM requirements generation failed: {e}, using fallback")
        dependencies = set()
        special_cases = {
            'PIL': 'Pillow',
            'sklearn': 'scikit-learn',
            'skimage': 'scikit-image',
            'bs4': 'beautifulsoup4'
        }

        for stmt in import_statements:
            # Both `import x` and `from x import y` carry the module in position 1.
            if stmt.startswith('import ') or stmt.startswith('from '):
                module_name = stmt.split()[1].split('.')[0]
                package_name = special_cases.get(module_name, module_name)
                dependencies.add(package_name)

        if dependencies:
            return '\n'.join(sorted(dependencies)) + '\n'
        return "# No additional dependencies required\n"
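

# End-to-end usage sketch (illustrative; `llm_response` is a hypothetical
# variable holding raw model output):
#
#   code = remove_code_block(llm_response)        # unwrap code fences
#   imports = extract_import_statements(code)     # e.g. ['import gradio']
#   requirements = generate_requirements_txt_with_llm(imports)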