Spaces:
Sleeping
Sleeping
| """Diff visualization utilities.""" | |
| import difflib | |
| from typing import Iterator | |
def generate_html_diff(original: str, cleaned: str) -> str:
    """Render a side-by-side HTML comparison of two texts.

    Both inputs are split into lines (line endings kept) and handed to
    ``difflib.HtmlDiff`` configured with a tab size of 2 and an 80-column
    wrap. The page is produced in contextual mode with 3 lines of context
    and the column titles "Original Text" / "Cleaned Text", then passed
    through ``_style_diff_html`` before being returned.

    Args:
        original: Text shown in the left-hand column.
        cleaned: Text shown in the right-hand column.

    Returns:
        The styled HTML document as a string.
    """
    renderer = difflib.HtmlDiff(tabsize=2, wrapcolumn=80)
    raw_page = renderer.make_file(
        original.splitlines(keepends=True),
        cleaned.splitlines(keepends=True),
        fromdesc="Original Text",
        todesc="Cleaned Text",
        context=True,
        numlines=3,
    )
    return _style_diff_html(raw_page)
| def _style_diff_html(html_diff: str) -> str: | |
| """Add custom styling to the diff HTML.""" | |
| custom_style = """ | |
| <style> | |
| .diff { | |
| font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace; | |
| font-size: 12px; | |
| line-height: 1.4; | |
| border: 1px solid #ddd; | |
| border-radius: 8px; | |
| overflow: auto; | |
| max-height: 600px; | |
| } | |
| .diff table { | |
| width: 100%; | |
| border-collapse: collapse; | |
| margin: 0; | |
| } | |
| .diff td { | |
| padding: 2px 8px; | |
| vertical-align: top; | |
| white-space: pre-wrap; | |
| word-wrap: break-word; | |
| } | |
| .diff_header { | |
| background: #f8f9fa; | |
| font-weight: bold; | |
| text-align: center; | |
| border-bottom: 1px solid #dee2e6; | |
| } | |
| .diff_next { | |
| background: #e9ecef; | |
| text-align: center; | |
| font-size: 10px; | |
| } | |
| .diff_add { | |
| background: #d4edda; | |
| color: #155724; | |
| } | |
| .diff_chg { | |
| background: #fff3cd; | |
| color: #856404; | |
| } | |
| .diff_sub { | |
| background: #f8d7da; | |
| color: #721c24; | |
| } | |
| </style> | |
| """ | |
| # Insert custom style before closing </head> tag | |
| styled_html = html_diff.replace('</head>', f'{custom_style}</head>') | |
| # Add diff class to the table | |
| styled_html = styled_html.replace('<table class="diff"', '<div class="diff"><table class="diff"') | |
| styled_html = styled_html.replace('</table>', '</table></div>') | |
| return styled_html | |
def get_diff_stats(original: str, cleaned: str) -> dict[str, float]:
    """Get statistics about the diff between original and cleaned text.

    Args:
        original: Text before cleaning.
        cleaned: Text after cleaning.

    Returns:
        A dict with the keys:

        * ``original_length``: ``len(original)``.
        * ``cleaned_length``: ``len(cleaned)``.
        * ``similarity_ratio``: character-level ``SequenceMatcher`` ratio
          as a percentage rounded to 2 decimals (a float).
        * ``characters_removed``: ``len(original) - len(cleaned)``; this is
          negative when ``cleaned`` is longer than ``original``.
    """
    # autojunk is disabled because the default heuristic drops frequently
    # occurring characters from the match index once an input reaches 200
    # characters, which can report near-identical texts as almost entirely
    # different. The trade-off is more work on very large inputs.
    matcher = difflib.SequenceMatcher(None, original, cleaned, autojunk=False)
    original_length = len(original)
    cleaned_length = len(cleaned)
    return {
        'original_length': original_length,
        'cleaned_length': cleaned_length,
        'similarity_ratio': round(matcher.ratio() * 100, 2),
        'characters_removed': original_length - cleaned_length,
    }