""" Code parsing and formatting utilities for different frameworks. Handles parsing of transformers.js, React, multi-file HTML, Streamlit, and Gradio code. """ import re import os import json import base64 from typing import Dict, List, Optional, Tuple from bs4 import BeautifulSoup import html from .config import SEARCH_START, DIVIDER, REPLACE_END # Type definitions History = List[Dict[str, str]] def strip_tool_call_markers(text): """Remove TOOL_CALL markers that some LLMs (like Qwen) add to their output.""" if not text: return text # Remove [TOOL_CALL] and [/TOOL_CALL] markers text = re.sub(r'\[/?TOOL_CALL\]', '', text, flags=re.IGNORECASE) # Remove standalone }} that appears with tool calls # Only remove if it's on its own line or at the end text = re.sub(r'^\s*\}\}\s*$', '', text, flags=re.MULTILINE) return text.strip() def remove_code_block(text): # First strip any tool call markers text = strip_tool_call_markers(text) # Try to match code blocks with language markers patterns = [ r'```(?:html|HTML)\n([\s\S]+?)\n```', # Match ```html or ```HTML r'```\n([\s\S]+?)\n```', # Match code blocks without language markers r'```([\s\S]+?)```' # Match code blocks without line breaks ] for pattern in patterns: match = re.search(pattern, text, re.DOTALL) if match: extracted = match.group(1).strip() # Remove a leading language marker line (e.g., 'python') if present if extracted.split('\n', 1)[0].strip().lower() in ['python', 'html', 'css', 'javascript', 'json', 'c', 'cpp', 'markdown', 'latex', 'jinja2', 'typescript', 'yaml', 'dockerfile', 'shell', 'r', 'sql', 'sql-mssql', 'sql-mysql', 'sql-mariadb', 'sql-sqlite', 'sql-cassandra', 'sql-plSQL', 'sql-hive', 'sql-pgsql', 'sql-gql', 'sql-gpsql', 'sql-sparksql', 'sql-esper']: return extracted.split('\n', 1)[1] if '\n' in extracted else '' # If HTML markup starts later in the block (e.g., Poe injected preface), trim to first HTML root html_root_idx = None for tag in [' 0: return extracted[html_root_idx:].strip() return extracted # If no code block is found, check if the entire text is HTML stripped = text.strip() if stripped.startswith('') or stripped.startswith(' 0: return stripped[idx:].strip() return stripped # Special handling for python: remove python marker if text.strip().startswith('```python'): return text.strip()[9:-3].strip() # Remove a leading language marker line if present (fallback) lines = text.strip().split('\n', 1) if lines[0].strip().lower() in ['python', 'html', 'css', 'javascript', 'json', 'c', 'cpp', 'markdown', 'latex', 'jinja2', 'typescript', 'yaml', 'dockerfile', 'shell', 'r', 'sql', 'sql-mssql', 'sql-mysql', 'sql-mariadb', 'sql-sqlite', 'sql-cassandra', 'sql-plSQL', 'sql-hive', 'sql-pgsql', 'sql-gql', 'sql-gpsql', 'sql-sparksql', 'sql-esper']: return lines[1] if len(lines) > 1 else '' return text.strip() ## React CDN compatibility fixer removed per user preference def strip_placeholder_thinking(text: str) -> str: """Remove placeholder 'Thinking...' status lines from streamed text.""" if not text: return text # Matches lines like: "Thinking..." or "Thinking... (12s elapsed)" return re.sub(r"(?mi)^[\t ]*Thinking\.\.\.(?:\s*\(\d+s elapsed\))?[\t ]*$\n?", "", text) def is_placeholder_thinking_only(text: str) -> bool: """Return True if text contains only 'Thinking...' 
## React CDN compatibility fixer removed per user preference


def strip_placeholder_thinking(text: str) -> str:
    """Remove placeholder 'Thinking...' status lines from streamed text."""
    if not text:
        return text
    # Matches lines like: "Thinking..." or "Thinking... (12s elapsed)"
    return re.sub(r"(?mi)^[\t ]*Thinking\.\.\.(?:\s*\(\d+s elapsed\))?[\t ]*$\n?", "", text)


def is_placeholder_thinking_only(text: str) -> bool:
    """Return True if text contains only 'Thinking...' placeholder lines (with optional elapsed)."""
    if not text:
        return False
    stripped = text.strip()
    if not stripped:
        return False
    return re.fullmatch(r"(?s)(?:\s*Thinking\.\.\.(?:\s*\(\d+s elapsed\))?\s*)+", stripped) is not None


def extract_last_thinking_line(text: str) -> str:
    """Extract the last 'Thinking...' line to display as status."""
    matches = list(re.finditer(r"Thinking\.\.\.(?:\s*\(\d+s elapsed\))?", text))
    return matches[-1].group(0) if matches else "Thinking..."


def parse_transformers_js_output(text):
    """Parse transformers.js output and extract the three files (index.html, index.js, style.css)."""
    files = {
        'index.html': '',
        'index.js': '',
        'style.css': ''
    }

    # Multiple patterns to match the three code blocks with different variations
    html_patterns = [
        r'```html\s*\n([\s\S]*?)(?:```|\Z)',
        r'```htm\s*\n([\s\S]*?)(?:```|\Z)',
        r'```\s*(?:index\.html|html)\s*\n([\s\S]*?)(?:```|\Z)'
    ]
    js_patterns = [
        r'```javascript\s*\n([\s\S]*?)(?:```|\Z)',
        r'```js\s*\n([\s\S]*?)(?:```|\Z)',
        r'```\s*(?:index\.js|javascript|js)\s*\n([\s\S]*?)(?:```|\Z)'
    ]
    css_patterns = [
        r'```css\s*\n([\s\S]*?)(?:```|\Z)',
        r'```\s*(?:style\.css|css)\s*\n([\s\S]*?)(?:```|\Z)'
    ]

    # Extract HTML content
    for pattern in html_patterns:
        html_match = re.search(pattern, text, re.IGNORECASE)
        if html_match:
            files['index.html'] = html_match.group(1).strip()
            break

    # Extract JavaScript content
    for pattern in js_patterns:
        js_match = re.search(pattern, text, re.IGNORECASE)
        if js_match:
            files['index.js'] = js_match.group(1).strip()
            break

    # Extract CSS content
    for pattern in css_patterns:
        css_match = re.search(pattern, text, re.IGNORECASE)
        if css_match:
            files['style.css'] = css_match.group(1).strip()
            break

    # Fallback: support the "=== index.html ===" format if any file is missing
    if not (files['index.html'] and files['index.js'] and files['style.css']):
        html_fallback = re.search(r'===\s*index\.html\s*===\s*\n([\s\S]+?)(?=\n===|$)', text, re.IGNORECASE)
        js_fallback = re.search(r'===\s*index\.js\s*===\s*\n([\s\S]+?)(?=\n===|$)', text, re.IGNORECASE)
        css_fallback = re.search(r'===\s*style\.css\s*===\s*\n([\s\S]+?)(?=\n===|$)', text, re.IGNORECASE)
        if html_fallback:
            files['index.html'] = html_fallback.group(1).strip()
        if js_fallback:
            files['index.js'] = js_fallback.group(1).strip()
        if css_fallback:
            files['style.css'] = css_fallback.group(1).strip()

    # Additional fallback: extract from numbered sections or file headers,
    # e.g. "1. index.html:" or "**index.html**"
    if not (files['index.html'] and files['index.js'] and files['style.css']):
        patterns = [
            (r'(?:^\d+\.\s*|^##\s*|^\*\*\s*)index\.html(?:\s*:|\*\*:?)\s*\n([\s\S]+?)(?=\n(?:\d+\.|##|\*\*|===)|$)', 'index.html'),
            (r'(?:^\d+\.\s*|^##\s*|^\*\*\s*)index\.js(?:\s*:|\*\*:?)\s*\n([\s\S]+?)(?=\n(?:\d+\.|##|\*\*|===)|$)', 'index.js'),
            (r'(?:^\d+\.\s*|^##\s*|^\*\*\s*)style\.css(?:\s*:|\*\*:?)\s*\n([\s\S]+?)(?=\n(?:\d+\.|##|\*\*|===)|$)', 'style.css')
        ]
        for pattern, file_key in patterns:
            if not files[file_key]:
                match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
                if match:
                    # Clean up the content by removing any code block markers
                    content = match.group(1).strip()
                    content = re.sub(r'^```\w*\s*\n', '', content)
                    content = re.sub(r'\n```\s*$', '', content)
                    files[file_key] = content.strip()

    return files
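
# Illustrative usage of parse_transformers_js_output (a sketch with made-up
# model output; any three fenced blocks in this shape would do):
#
#   >>> reply = (
#   ...     "```html\n<div id=\"app\"></div>\n```\n"
#   ...     "```js\nconsole.log('ready');\n```\n"
#   ...     "```css\nbody { margin: 0; }\n```\n"
#   ... )
#   >>> files = parse_transformers_js_output(reply)
#   >>> files['index.js']
#   "console.log('ready');"
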
index.html:" or "**index.html**" patterns = [ (r'(?:^\d+\.\s*|^##\s*|^\*\*\s*)index\.html(?:\s*:|\*\*:?)\s*\n([\s\S]+?)(?=\n(?:\d+\.|##|\*\*|===)|$)', 'index.html'), (r'(?:^\d+\.\s*|^##\s*|^\*\*\s*)index\.js(?:\s*:|\*\*:?)\s*\n([\s\S]+?)(?=\n(?:\d+\.|##|\*\*|===)|$)', 'index.js'), (r'(?:^\d+\.\s*|^##\s*|^\*\*\s*)style\.css(?:\s*:|\*\*:?)\s*\n([\s\S]+?)(?=\n(?:\d+\.|##|\*\*|===)|$)', 'style.css') ] for pattern, file_key in patterns: if not files[file_key]: match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE) if match: # Clean up the content by removing any code block markers content = match.group(1).strip() content = re.sub(r'^```\w*\s*\n', '', content) content = re.sub(r'\n```\s*$', '', content) files[file_key] = content.strip() return files def format_transformers_js_output(files): """Format the three files into a single display string""" output = [] output.append("=== index.html ===") output.append(files['index.html']) output.append("\n=== index.js ===") output.append(files['index.js']) output.append("\n=== style.css ===") output.append(files['style.css']) return '\n'.join(output) def build_transformers_inline_html(files: dict) -> str: """Merge transformers.js three-file output into a single self-contained HTML document. - Inlines style.css into a " if css else "" if style_tag: if '' in doc.lower(): # Preserve original casing by finding closing head case-insensitively match = _re.search(r"", doc, flags=_re.IGNORECASE) if match: idx = match.start() doc = doc[:idx] + style_tag + doc[idx:] else: # No head; insert at top of body match = _re.search(r"
]*>", doc, flags=_re.IGNORECASE) if match: idx = match.end() doc = doc[:idx] + "\n" + style_tag + doc[idx:] else: # Append at beginning doc = style_tag + doc # Inline JS: insert before script_tag = f"" if js else "" # Lightweight debug console overlay to surface runtime errors inside the iframe debug_overlay = ( "\n" "\n" "" ) # Cleanup script to clear Cache Storage and IndexedDB on unload to free model weights cleanup_tag = ( "" ) if script_tag: match = _re.search(r"\n", doc, flags=_re.IGNORECASE) if match: idx = match.start() doc = doc[:idx] + debug_overlay + script_tag + cleanup_tag + doc[idx:] else: # Append at end doc = doc + debug_overlay + script_tag + cleanup_tag return doc def send_transformers_to_sandbox(files: dict) -> str: """Build a self-contained HTML document from transformers.js files and return an iframe preview.""" merged_html = build_transformers_inline_html(files) return send_to_sandbox(merged_html) def parse_multipage_html_output(text: str) -> Dict[str, str]: """Parse multi-page HTML output formatted as repeated "=== filename ===" sections. Returns a mapping of filename → file content. Supports nested paths like assets/css/styles.css. """ if not text: return {} # First, strip any markdown fences cleaned = remove_code_block(text) files: Dict[str, str] = {} import re as _re pattern = _re.compile(r"^===\s*([^=\n]+?)\s*===\s*\n([\s\S]*?)(?=\n===\s*[^=\n]+?\s*===|\Z)", _re.MULTILINE) for m in pattern.finditer(cleaned): name = m.group(1).strip() content = m.group(2).strip() # Remove accidental trailing fences if present content = _re.sub(r"^```\w*\s*\n|\n```\s*$", "", content) files[name] = content return files def format_multipage_output(files: Dict[str, str]) -> str: """Format a dict of files back into === filename === sections. Ensures `index.html` appears first if present; others follow sorted by path. """ if not isinstance(files, dict) or not files: return "" ordered_paths = [] if 'index.html' in files: ordered_paths.append('index.html') for path in sorted(files.keys()): if path == 'index.html': continue ordered_paths.append(path) parts: list[str] = [] for path in ordered_paths: parts.append(f"=== {path} ===") # Avoid trailing extra newlines to keep blocks compact parts.append((files.get(path) or '').rstrip()) return "\n".join(parts) def validate_and_autofix_files(files: Dict[str, str]) -> Dict[str, str]: """Ensure minimal contract for multi-file sites; auto-fix missing pieces. Rules: - Ensure at least one HTML entrypoint (index.html). If none, synthesize a simple index.html linking discovered pages. - For each HTML file, ensure referenced local assets exist in files; if missing, add minimal stubs. - Normalize relative paths (strip leading '/'). """ if not isinstance(files, dict) or not files: return files or {} import re as _re normalized: Dict[str, str] = {} for k, v in files.items(): safe_key = k.strip().lstrip('/') normalized[safe_key] = v html_files = [p for p in normalized.keys() if p.lower().endswith('.html')] has_index = 'index.html' in normalized # If no index.html but some HTML pages exist, create a simple hub index linking to them if not has_index and html_files: links = '\n'.join([f"
" for p in html_files]) normalized['index.html'] = ( "\n\n
\n\n" "\n" "
\n\n
\n