Spaces:
Running
Running
| <html> | |
| <head> | |
| <meta charset="utf-8" /> | |
| <meta name="viewport" content="width=device-width" /> | |
| <title>Parquet Visualization Studio</title> | |
| <link rel="stylesheet" href="style.css" /> | |
| <script src="https://cdn.jsdelivr.net/npm/@duckdb/duckdb-wasm@latest/dist/duckdb-mvp.wasm.js"></script> | |
| <script src="https://cdn.jsdelivr.net/npm/@duckdb/duckdb-wasm@latest/dist/duckdb-browser-mvp.worker.js"></script> | |
| <script type="module" src="https://cdn.jsdelivr.net/npm/@duckdb/duckdb-wasm@latest/dist/duckdb-browser-mvp.worker.js"></script> | |
| <script src="https://cdn.jsdelivr.net/npm/vega@5"></script> | |
| <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script> | |
| <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script> | |
| </head> | |
| <body> | |
| <div class="container"> | |
| <h1>π Parquet Visualization Studio</h1> | |
| <p class="subtitle">Query parquet files using SQL with DuckDB WASM</p> | |
| <form id="queryForm"> | |
| <div class="form-group"> | |
| <label for="urlSelect">Select Example Dataset</label> | |
| <select id="urlSelect"> | |
| <option value="">-- Choose a dataset or enter custom URL below --</option> | |
| <option value="https://huggingface.co/datasets/PleIAs/SYNTH/resolve/refs%2Fconvert%2Fparquet/default/partial-train/0000.parquet">PleIAs/SYNTH</option> | |
| <option value="https://huggingface.co/datasets/facebook/omnilingual-asr-corpus/resolve/refs%2Fconvert%2Fparquet/gby_Latn/train/0000.parquet">facebook/omnilingual-asr-corpus</option> | |
| <option value="https://example.com/dataset3.parquet">Dataset 3</option> | |
| <option value="https://example.com/dataset4.parquet">Dataset 4</option> | |
| <option value="https://example.com/dataset5.parquet">Dataset 5</option> | |
| <option value="https://example.com/dataset6.parquet">Dataset 6</option> | |
| <option value="https://example.com/dataset7.parquet">Dataset 7</option> | |
| <option value="https://example.com/dataset8.parquet">Dataset 8</option> | |
| <option value="https://example.com/dataset9.parquet">Dataset 9</option> | |
| <option value="https://example.com/dataset10.parquet">Dataset 10</option> | |
| </select> | |
| </div> | |
| <div class="form-group"> | |
| <label for="parquetUrl">Parquet File URL</label> | |
| <input | |
| type="text" | |
| id="parquetUrl" | |
| placeholder="https://example.com/data.parquet" | |
| required | |
| /> | |
| </div> | |
| <div class="form-group"> | |
| <label for="sqlQuery">SQL Query</label> | |
| <textarea | |
| id="sqlQuery" | |
| rows="4" | |
| required | |
| >SELECT * FROM data LIMIT 10</textarea> | |
| </div> | |
| <button type="submit" id="submitBtn">Run Query</button> | |
| </form> | |
| <div id="status" class="status"></div> | |
| <div id="visualizationSection" class="visualization-section" style="display: none;"> | |
| <h2>Visualization</h2> | |
| <div class="form-group"> | |
| <label for="vizSelect">Select a 2D plot</label> | |
| <select id="vizSelect"> | |
| <option value="">-- Choose columns to visualize --</option> | |
| </select> | |
| </div> | |
| <button type="button" id="renderVizBtn" disabled>Render Visualization</button> | |
| <div id="vizContainer" class="viz-container"></div> | |
| </div> | |
| <div id="resultsContainer" class="results-container" style="display: none;"> | |
| <h2>Results</h2> | |
| <div id="results" class="results"></div> | |
| </div> | |
| </div> | |
| <script type="module"> | |
| import * as duckdb from 'https://cdn.jsdelivr.net/npm/@duckdb/duckdb-wasm@latest/+esm'; | |
| let db = null; | |
| let conn = null; | |
| let currentDatasetUrl = null; | |
| let columnInfo = []; | |
| // Initialize DuckDB | |
| async function initDuckDB() { | |
| const JSDELIVR_BUNDLES = duckdb.getJsDelivrBundles(); | |
| const bundle = await duckdb.selectBundle(JSDELIVR_BUNDLES); | |
| const worker_url = URL.createObjectURL( | |
| new Blob([`importScripts("${bundle.mainWorker}");`], { type: 'text/javascript' }) | |
| ); | |
| const worker = new Worker(worker_url); | |
| const logger = new duckdb.ConsoleLogger(); | |
| db = new duckdb.AsyncDuckDB(logger, worker); | |
| await db.instantiate(bundle.mainModule, bundle.pthreadWorker); | |
| URL.revokeObjectURL(worker_url); | |
| conn = await db.connect(); | |
| } | |
| // Update status message | |
| function setStatus(message, type = 'info') { | |
| const statusEl = document.getElementById('status'); | |
| statusEl.textContent = message; | |
| statusEl.className = `status status-${type}`; | |
| statusEl.style.display = 'block'; | |
| } | |
| // Render results as HTML table | |
| function renderResults(result) { | |
| const resultsContainer = document.getElementById('resultsContainer'); | |
| const resultsEl = document.getElementById('results'); | |
| resultsContainer.style.display = 'block'; | |
| if (!result || result.numRows === 0) { | |
| resultsEl.innerHTML = '<p class="no-results">No results found.</p>'; | |
| return; | |
| } | |
| // Get column names and rows | |
| const columns = result.schema.fields.map(field => field.name); | |
| const rows = result.toArray(); | |
| // Build HTML table | |
| let tableHtml = '<div class="table-wrapper"><table><thead><tr>'; | |
| // Add headers | |
| columns.forEach(col => { | |
| tableHtml += `<th>${escapeHtml(col)}</th>`; | |
| }); | |
| tableHtml += '</tr></thead><tbody>'; | |
| // Add rows | |
| rows.forEach(row => { | |
| tableHtml += '<tr>'; | |
| columns.forEach(col => { | |
| const value = row[col]; | |
| tableHtml += `<td>${escapeHtml(String(value ?? ''))}</td>`; | |
| }); | |
| tableHtml += '</tr>'; | |
| }); | |
| tableHtml += '</tbody></table></div>'; | |
| tableHtml += `<p class="row-count">Showing ${result.numRows} row(s)</p>`; | |
| resultsEl.innerHTML = tableHtml; | |
| } | |
| // Escape HTML to prevent XSS | |
| function escapeHtml(text) { | |
| const div = document.createElement('div'); | |
| div.textContent = text; | |
| return div.innerHTML; | |
| } | |
| // Determine if a DuckDB type is numeric | |
| function isNumericType(type) { | |
| const numericTypes = ['TINYINT', 'SMALLINT', 'INTEGER', 'BIGINT', 'HUGEINT', | |
| 'FLOAT', 'DOUBLE', 'DECIMAL', 'NUMERIC', 'REAL']; | |
| return numericTypes.some(t => type.toUpperCase().includes(t)); | |
| } | |
| // Determine if a DuckDB type is text | |
| function isTextType(type) { | |
| const textTypes = ['VARCHAR', 'CHAR', 'TEXT', 'STRING']; | |
| return textTypes.some(t => type.toUpperCase().includes(t)); | |
| } | |
| // Detect columns and their types from the dataset | |
| async function detectColumns(url) { | |
| try { | |
| setStatus('Detecting column types...', 'info'); | |
| // Initialize DuckDB if not already done | |
| if (!db) { | |
| await initDuckDB(); | |
| } | |
| // Register the parquet file | |
| try { | |
| await db.registerFileURL( | |
| 'data.parquet', | |
| url, | |
| duckdb.DuckDBDataProtocol.HTTP, | |
| false | |
| ); | |
| } catch {} | |
| // Query to get column information | |
| const result = await conn.query("DESCRIBE 'data.parquet'"); | |
| const rows = result.toArray(); | |
| columnInfo = rows.map(row => ({ | |
| name: row.column_name, | |
| type: row.column_type | |
| })); | |
| setStatus(`Detected ${columnInfo.length} columns`, 'success'); | |
| buildVisualizationDropdown(); | |
| } catch (error) { | |
| console.error('Error detecting columns:', error); | |
| setStatus(`Error detecting columns: ${error.message}`, 'error'); | |
| columnInfo = []; | |
| } | |
| } | |
| // Build the visualization dropdown with valid combinations | |
| function buildVisualizationDropdown() { | |
| const vizSelect = document.getElementById('vizSelect'); | |
| const vizSection = document.getElementById('visualizationSection'); | |
| // Clear existing options except first | |
| vizSelect.innerHTML = '<option value="">-- Choose columns to visualize --</option>'; | |
| const validCombinations = []; | |
| // Generate all valid column combinations | |
| for (let i = 0; i < columnInfo.length; i++) { | |
| const xCol = columnInfo[i]; | |
| const xIsNumeric = isNumericType(xCol.type); | |
| const xIsText = isTextType(xCol.type); | |
| for (let j = 0; j < columnInfo.length; j++) { | |
| if (i === j) continue; // Skip same column | |
| const yCol = columnInfo[j]; | |
| const yIsNumeric = isNumericType(yCol.type); | |
| const yIsText = isTextType(yCol.type); | |
| // Valid combinations: | |
| // 1. Numeric x Numeric (scatter plot) | |
| // 2. Numeric x Text (bar chart with count) | |
| // 3. Text x Numeric (bar chart with count) | |
| // Skip: Text x Text | |
| if ((xIsNumeric && yIsNumeric) || | |
| (xIsNumeric && yIsText) || | |
| (xIsText && yIsNumeric)) { | |
| let chartType; | |
| if (xIsNumeric && yIsNumeric) { | |
| chartType = 'scatter'; | |
| } else if (xIsNumeric && yIsText) { | |
| chartType = 'bar'; | |
| } else if (xIsText && yIsNumeric) { | |
| chartType = 'bar'; | |
| } | |
| validCombinations.push({ | |
| xCol: xCol.name, | |
| xType: xCol.type, | |
| yCol: yCol.name, | |
| yType: yCol.type, | |
| chartType: chartType | |
| }); | |
| const option = document.createElement('option'); | |
| option.value = JSON.stringify({ xCol: xCol.name, yCol: yCol.name, chartType }); | |
| option.textContent = `${xCol.name} (${xCol.type}) Γ ${yCol.name} (${yCol.type}) - ${chartType}`; | |
| vizSelect.appendChild(option); | |
| } else if (!xIsNumeric && !xIsText && !yIsNumeric && !yIsText) { | |
| // Other unsupported type combinations - we'll handle error when selected | |
| validCombinations.push({ | |
| xCol: xCol.name, | |
| xType: xCol.type, | |
| yCol: yCol.name, | |
| yType: yCol.type, | |
| chartType: 'unsupported' | |
| }); | |
| const option = document.createElement('option'); | |
| option.value = JSON.stringify({ xCol: xCol.name, yCol: yCol.name, chartType: 'unsupported' }); | |
| option.textContent = `${xCol.name} (${xCol.type}) Γ ${yCol.name} (${yCol.type}) - unsupported`; | |
| vizSelect.appendChild(option); | |
| } | |
| } | |
| } | |
| if (validCombinations.length > 0) { | |
| vizSection.style.display = 'block'; | |
| } else { | |
| vizSection.style.display = 'none'; | |
| setStatus('No valid column combinations found for visualization', 'error'); | |
| } | |
| } | |
| // Render visualization using Vega-Lite | |
| async function renderVisualization(xCol, yCol, chartType) { | |
| const vizContainer = document.getElementById('vizContainer'); | |
| vizContainer.innerHTML = ''; // Clear previous | |
| if (chartType === 'unsupported') { | |
| setStatus(`Error: Unsupported column type combination. X column type: ${columnInfo.find(c => c.name === xCol)?.type}, Y column type: ${columnInfo.find(c => c.name === yCol)?.type}`, 'error'); | |
| return; | |
| } | |
| try { | |
| setStatus('Fetching data for visualization...', 'info'); | |
| // Fetch data (limit to reasonable amount for visualization) | |
| const query = `SELECT "${xCol}", "${yCol}" FROM 'data.parquet' LIMIT 1000`; | |
| const result = await conn.query(query); | |
| const data = result.toArray(); | |
| setStatus('Rendering visualization...', 'info'); | |
| let spec; | |
| if (chartType === 'scatter') { | |
| // Numeric x Numeric: Scatter plot | |
| spec = { | |
| $schema: 'https://vega.github.io/schema/vega-lite/v5.json', | |
| description: `Scatter plot of ${xCol} vs ${yCol}`, | |
| data: { values: data }, | |
| mark: 'point', | |
| encoding: { | |
| x: { field: xCol, type: 'quantitative' }, | |
| y: { field: yCol, type: 'quantitative' } | |
| }, | |
| width: 600, | |
| height: 400 | |
| }; | |
| } else if (chartType === 'bar') { | |
| // One numeric, one text: Bar chart with count aggregation | |
| const xColInfo = columnInfo.find(c => c.name === xCol); | |
| const yColInfo = columnInfo.find(c => c.name === yCol); | |
| const xIsNumeric = isNumericType(xColInfo.type); | |
| const yIsNumeric = isNumericType(yColInfo.type); | |
| if (xIsNumeric && !yIsNumeric) { | |
| // X is numeric, Y is text: count text on Y, aggregate on X | |
| spec = { | |
| $schema: 'https://vega.github.io/schema/vega-lite/v5.json', | |
| description: `Bar chart of ${yCol} counts`, | |
| data: { values: data }, | |
| mark: 'bar', | |
| encoding: { | |
| y: { field: yCol, type: 'nominal' }, | |
| x: { aggregate: 'count', type: 'quantitative' } | |
| }, | |
| width: 600, | |
| height: 400 | |
| }; | |
| } else { | |
| // X is text, Y is numeric: count text on X, aggregate on Y | |
| spec = { | |
| $schema: 'https://vega.github.io/schema/vega-lite/v5.json', | |
| description: `Bar chart of ${xCol} counts`, | |
| data: { values: data }, | |
| mark: 'bar', | |
| encoding: { | |
| x: { field: xCol, type: 'nominal' }, | |
| y: { aggregate: 'count', type: 'quantitative' } | |
| }, | |
| width: 600, | |
| height: 400 | |
| }; | |
| } | |
| } | |
| // Render using vega-embed | |
| await vegaEmbed('#vizContainer', spec); | |
| setStatus('Visualization rendered successfully!', 'success'); | |
| } catch (error) { | |
| console.error('Error rendering visualization:', error); | |
| setStatus(`Error rendering visualization: ${error.message}`, 'error'); | |
| } | |
| } | |
| // Handle form submission | |
| async function handleSubmit(e) { | |
| e.preventDefault(); | |
| const parquetUrl = document.getElementById('parquetUrl').value.trim(); | |
| const sqlQuery = document.getElementById('sqlQuery').value.trim(); | |
| const submitBtn = document.getElementById('submitBtn'); | |
| if (!parquetUrl || !sqlQuery) { | |
| setStatus('Please provide both a parquet URL and SQL query.', 'error'); | |
| return; | |
| } | |
| try { | |
| submitBtn.disabled = true; | |
| submitBtn.textContent = 'Running...'; | |
| setStatus('Initializing DuckDB...', 'info'); | |
| // Initialize DuckDB if not already done | |
| if (!db) { | |
| await initDuckDB(); | |
| } | |
| setStatus('Loading parquet file...', 'info'); | |
| // Register the parquet file from URL | |
| try { | |
| await db.registerFileURL( | |
| 'data.parquet', | |
| parquetUrl, | |
| duckdb.DuckDBDataProtocol.HTTP, | |
| false | |
| ); | |
| } | |
| catch {} | |
| setStatus('Executing query...', 'info'); | |
| // Execute the query | |
| const result = await conn.query(sqlQuery.replace(/\bdata\b/gi, "'data.parquet'")); | |
| setStatus('Query completed successfully!', 'success'); | |
| // Render results | |
| renderResults(result); | |
| } catch (error) { | |
| console.error('Error:', error); | |
| setStatus(`Error: ${error.message}`, 'error'); | |
| document.getElementById('resultsContainer').style.display = 'none'; | |
| } finally { | |
| submitBtn.disabled = false; | |
| submitBtn.textContent = 'Run Query'; | |
| } | |
| } | |
| // Handle dropdown selection | |
| document.getElementById('urlSelect').addEventListener('change', async function(e) { | |
| const selectedUrl = e.target.value; | |
| if (selectedUrl) { | |
| document.getElementById('parquetUrl').value = selectedUrl; | |
| currentDatasetUrl = selectedUrl; | |
| await detectColumns(selectedUrl); | |
| } | |
| }); | |
| // Handle manual URL input (detect when user blurs or presses enter) | |
| document.getElementById('parquetUrl').addEventListener('blur', async function(e) { | |
| const url = e.target.value.trim(); | |
| if (url && url !== currentDatasetUrl) { | |
| currentDatasetUrl = url; | |
| await detectColumns(url); | |
| } | |
| }); | |
| // Handle visualization selection | |
| document.getElementById('vizSelect').addEventListener('change', function(e) { | |
| const renderBtn = document.getElementById('renderVizBtn'); | |
| renderBtn.disabled = !e.target.value; | |
| }); | |
| // Handle render visualization button | |
| document.getElementById('renderVizBtn').addEventListener('click', async function() { | |
| const vizSelect = document.getElementById('vizSelect'); | |
| const selectedValue = vizSelect.value; | |
| if (selectedValue) { | |
| const { xCol, yCol, chartType } = JSON.parse(selectedValue); | |
| await renderVisualization(xCol, yCol, chartType); | |
| } | |
| }); | |
| // Set up event listeners | |
| document.getElementById('queryForm').addEventListener('submit', handleSubmit); | |
| // Initialize on load | |
| setStatus('Ready to query parquet files!', 'success'); | |
| </script> | |
| </body> | |
| </html> | |