visualization-studio / index.html
znation's picture
znation HF Staff
claude: add vega-lite vis
8e5c2f5
raw
history blame
15.8 kB
<!doctype html>
<html>
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width" />
<title>Parquet Visualization Studio</title>
<link rel="stylesheet" href="style.css" />
<script src="https://cdn.jsdelivr.net/npm/@duckdb/duckdb-wasm@latest/dist/duckdb-mvp.wasm.js"></script>
<script src="https://cdn.jsdelivr.net/npm/@duckdb/duckdb-wasm@latest/dist/duckdb-browser-mvp.worker.js"></script>
<script type="module" src="https://cdn.jsdelivr.net/npm/@duckdb/duckdb-wasm@latest/dist/duckdb-browser-mvp.worker.js"></script>
<script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
<script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
<script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
</head>
<body>
<div class="container">
<h1>πŸ“Š Parquet Visualization Studio</h1>
<p class="subtitle">Query parquet files using SQL with DuckDB WASM</p>
<form id="queryForm">
<div class="form-group">
<label for="urlSelect">Select Example Dataset</label>
<select id="urlSelect">
<option value="">-- Choose a dataset or enter custom URL below --</option>
<option value="https://huggingface.co/datasets/PleIAs/SYNTH/resolve/refs%2Fconvert%2Fparquet/default/partial-train/0000.parquet">PleIAs/SYNTH</option>
<option value="https://huggingface.co/datasets/facebook/omnilingual-asr-corpus/resolve/refs%2Fconvert%2Fparquet/gby_Latn/train/0000.parquet">facebook/omnilingual-asr-corpus</option>
<option value="https://example.com/dataset3.parquet">Dataset 3</option>
<option value="https://example.com/dataset4.parquet">Dataset 4</option>
<option value="https://example.com/dataset5.parquet">Dataset 5</option>
<option value="https://example.com/dataset6.parquet">Dataset 6</option>
<option value="https://example.com/dataset7.parquet">Dataset 7</option>
<option value="https://example.com/dataset8.parquet">Dataset 8</option>
<option value="https://example.com/dataset9.parquet">Dataset 9</option>
<option value="https://example.com/dataset10.parquet">Dataset 10</option>
</select>
</div>
<div class="form-group">
<label for="parquetUrl">Parquet File URL</label>
<input
type="text"
id="parquetUrl"
placeholder="https://example.com/data.parquet"
required
/>
</div>
<div class="form-group">
<label for="sqlQuery">SQL Query</label>
<textarea
id="sqlQuery"
rows="4"
required
>SELECT * FROM data LIMIT 10</textarea>
</div>
<button type="submit" id="submitBtn">Run Query</button>
</form>
<div id="status" class="status"></div>
<div id="visualizationSection" class="visualization-section" style="display: none;">
<h2>Visualization</h2>
<div class="form-group">
<label for="vizSelect">Select a 2D plot</label>
<select id="vizSelect">
<option value="">-- Choose columns to visualize --</option>
</select>
</div>
<button type="button" id="renderVizBtn" disabled>Render Visualization</button>
<div id="vizContainer" class="viz-container"></div>
</div>
<div id="resultsContainer" class="results-container" style="display: none;">
<h2>Results</h2>
<div id="results" class="results"></div>
</div>
</div>
<script type="module">
import * as duckdb from 'https://cdn.jsdelivr.net/npm/@duckdb/duckdb-wasm@latest/+esm';
let db = null;
let conn = null;
let currentDatasetUrl = null;
let columnInfo = [];
// Initialize DuckDB
async function initDuckDB() {
const JSDELIVR_BUNDLES = duckdb.getJsDelivrBundles();
const bundle = await duckdb.selectBundle(JSDELIVR_BUNDLES);
const worker_url = URL.createObjectURL(
new Blob([`importScripts("${bundle.mainWorker}");`], { type: 'text/javascript' })
);
const worker = new Worker(worker_url);
const logger = new duckdb.ConsoleLogger();
db = new duckdb.AsyncDuckDB(logger, worker);
await db.instantiate(bundle.mainModule, bundle.pthreadWorker);
URL.revokeObjectURL(worker_url);
conn = await db.connect();
}
// Update status message
function setStatus(message, type = 'info') {
const statusEl = document.getElementById('status');
statusEl.textContent = message;
statusEl.className = `status status-${type}`;
statusEl.style.display = 'block';
}
// Render results as HTML table
function renderResults(result) {
const resultsContainer = document.getElementById('resultsContainer');
const resultsEl = document.getElementById('results');
resultsContainer.style.display = 'block';
if (!result || result.numRows === 0) {
resultsEl.innerHTML = '<p class="no-results">No results found.</p>';
return;
}
// Get column names and rows
const columns = result.schema.fields.map(field => field.name);
const rows = result.toArray();
// Build HTML table
let tableHtml = '<div class="table-wrapper"><table><thead><tr>';
// Add headers
columns.forEach(col => {
tableHtml += `<th>${escapeHtml(col)}</th>`;
});
tableHtml += '</tr></thead><tbody>';
// Add rows
rows.forEach(row => {
tableHtml += '<tr>';
columns.forEach(col => {
const value = row[col];
tableHtml += `<td>${escapeHtml(String(value ?? ''))}</td>`;
});
tableHtml += '</tr>';
});
tableHtml += '</tbody></table></div>';
tableHtml += `<p class="row-count">Showing ${result.numRows} row(s)</p>`;
resultsEl.innerHTML = tableHtml;
}
// Escape HTML to prevent XSS
function escapeHtml(text) {
const div = document.createElement('div');
div.textContent = text;
return div.innerHTML;
}
// Determine if a DuckDB type is numeric
function isNumericType(type) {
const numericTypes = ['TINYINT', 'SMALLINT', 'INTEGER', 'BIGINT', 'HUGEINT',
'FLOAT', 'DOUBLE', 'DECIMAL', 'NUMERIC', 'REAL'];
return numericTypes.some(t => type.toUpperCase().includes(t));
}
// Determine if a DuckDB type is text
function isTextType(type) {
const textTypes = ['VARCHAR', 'CHAR', 'TEXT', 'STRING'];
return textTypes.some(t => type.toUpperCase().includes(t));
}
// Detect columns and their types from the dataset
async function detectColumns(url) {
try {
setStatus('Detecting column types...', 'info');
// Initialize DuckDB if not already done
if (!db) {
await initDuckDB();
}
// Register the parquet file
try {
await db.registerFileURL(
'data.parquet',
url,
duckdb.DuckDBDataProtocol.HTTP,
false
);
} catch {}
// Query to get column information
const result = await conn.query("DESCRIBE 'data.parquet'");
const rows = result.toArray();
columnInfo = rows.map(row => ({
name: row.column_name,
type: row.column_type
}));
setStatus(`Detected ${columnInfo.length} columns`, 'success');
buildVisualizationDropdown();
} catch (error) {
console.error('Error detecting columns:', error);
setStatus(`Error detecting columns: ${error.message}`, 'error');
columnInfo = [];
}
}
// Build the visualization dropdown with valid combinations
function buildVisualizationDropdown() {
const vizSelect = document.getElementById('vizSelect');
const vizSection = document.getElementById('visualizationSection');
// Clear existing options except first
vizSelect.innerHTML = '<option value="">-- Choose columns to visualize --</option>';
const validCombinations = [];
// Generate all valid column combinations
for (let i = 0; i < columnInfo.length; i++) {
const xCol = columnInfo[i];
const xIsNumeric = isNumericType(xCol.type);
const xIsText = isTextType(xCol.type);
for (let j = 0; j < columnInfo.length; j++) {
if (i === j) continue; // Skip same column
const yCol = columnInfo[j];
const yIsNumeric = isNumericType(yCol.type);
const yIsText = isTextType(yCol.type);
// Valid combinations:
// 1. Numeric x Numeric (scatter plot)
// 2. Numeric x Text (bar chart with count)
// 3. Text x Numeric (bar chart with count)
// Skip: Text x Text
if ((xIsNumeric && yIsNumeric) ||
(xIsNumeric && yIsText) ||
(xIsText && yIsNumeric)) {
let chartType;
if (xIsNumeric && yIsNumeric) {
chartType = 'scatter';
} else if (xIsNumeric && yIsText) {
chartType = 'bar';
} else if (xIsText && yIsNumeric) {
chartType = 'bar';
}
validCombinations.push({
xCol: xCol.name,
xType: xCol.type,
yCol: yCol.name,
yType: yCol.type,
chartType: chartType
});
const option = document.createElement('option');
option.value = JSON.stringify({ xCol: xCol.name, yCol: yCol.name, chartType });
option.textContent = `${xCol.name} (${xCol.type}) Γ— ${yCol.name} (${yCol.type}) - ${chartType}`;
vizSelect.appendChild(option);
} else if (!xIsNumeric && !xIsText && !yIsNumeric && !yIsText) {
// Other unsupported type combinations - we'll handle error when selected
validCombinations.push({
xCol: xCol.name,
xType: xCol.type,
yCol: yCol.name,
yType: yCol.type,
chartType: 'unsupported'
});
const option = document.createElement('option');
option.value = JSON.stringify({ xCol: xCol.name, yCol: yCol.name, chartType: 'unsupported' });
option.textContent = `${xCol.name} (${xCol.type}) Γ— ${yCol.name} (${yCol.type}) - unsupported`;
vizSelect.appendChild(option);
}
}
}
if (validCombinations.length > 0) {
vizSection.style.display = 'block';
} else {
vizSection.style.display = 'none';
setStatus('No valid column combinations found for visualization', 'error');
}
}
// Render visualization using Vega-Lite
async function renderVisualization(xCol, yCol, chartType) {
const vizContainer = document.getElementById('vizContainer');
vizContainer.innerHTML = ''; // Clear previous
if (chartType === 'unsupported') {
setStatus(`Error: Unsupported column type combination. X column type: ${columnInfo.find(c => c.name === xCol)?.type}, Y column type: ${columnInfo.find(c => c.name === yCol)?.type}`, 'error');
return;
}
try {
setStatus('Fetching data for visualization...', 'info');
// Fetch data (limit to reasonable amount for visualization)
const query = `SELECT "${xCol}", "${yCol}" FROM 'data.parquet' LIMIT 1000`;
const result = await conn.query(query);
const data = result.toArray();
setStatus('Rendering visualization...', 'info');
let spec;
if (chartType === 'scatter') {
// Numeric x Numeric: Scatter plot
spec = {
$schema: 'https://vega.github.io/schema/vega-lite/v5.json',
description: `Scatter plot of ${xCol} vs ${yCol}`,
data: { values: data },
mark: 'point',
encoding: {
x: { field: xCol, type: 'quantitative' },
y: { field: yCol, type: 'quantitative' }
},
width: 600,
height: 400
};
} else if (chartType === 'bar') {
// One numeric, one text: Bar chart with count aggregation
const xColInfo = columnInfo.find(c => c.name === xCol);
const yColInfo = columnInfo.find(c => c.name === yCol);
const xIsNumeric = isNumericType(xColInfo.type);
const yIsNumeric = isNumericType(yColInfo.type);
if (xIsNumeric && !yIsNumeric) {
// X is numeric, Y is text: count text on Y, aggregate on X
spec = {
$schema: 'https://vega.github.io/schema/vega-lite/v5.json',
description: `Bar chart of ${yCol} counts`,
data: { values: data },
mark: 'bar',
encoding: {
y: { field: yCol, type: 'nominal' },
x: { aggregate: 'count', type: 'quantitative' }
},
width: 600,
height: 400
};
} else {
// X is text, Y is numeric: count text on X, aggregate on Y
spec = {
$schema: 'https://vega.github.io/schema/vega-lite/v5.json',
description: `Bar chart of ${xCol} counts`,
data: { values: data },
mark: 'bar',
encoding: {
x: { field: xCol, type: 'nominal' },
y: { aggregate: 'count', type: 'quantitative' }
},
width: 600,
height: 400
};
}
}
// Render using vega-embed
await vegaEmbed('#vizContainer', spec);
setStatus('Visualization rendered successfully!', 'success');
} catch (error) {
console.error('Error rendering visualization:', error);
setStatus(`Error rendering visualization: ${error.message}`, 'error');
}
}
// Handle form submission
async function handleSubmit(e) {
e.preventDefault();
const parquetUrl = document.getElementById('parquetUrl').value.trim();
const sqlQuery = document.getElementById('sqlQuery').value.trim();
const submitBtn = document.getElementById('submitBtn');
if (!parquetUrl || !sqlQuery) {
setStatus('Please provide both a parquet URL and SQL query.', 'error');
return;
}
try {
submitBtn.disabled = true;
submitBtn.textContent = 'Running...';
setStatus('Initializing DuckDB...', 'info');
// Initialize DuckDB if not already done
if (!db) {
await initDuckDB();
}
setStatus('Loading parquet file...', 'info');
// Register the parquet file from URL
try {
await db.registerFileURL(
'data.parquet',
parquetUrl,
duckdb.DuckDBDataProtocol.HTTP,
false
);
}
catch {}
setStatus('Executing query...', 'info');
// Execute the query
const result = await conn.query(sqlQuery.replace(/\bdata\b/gi, "'data.parquet'"));
setStatus('Query completed successfully!', 'success');
// Render results
renderResults(result);
} catch (error) {
console.error('Error:', error);
setStatus(`Error: ${error.message}`, 'error');
document.getElementById('resultsContainer').style.display = 'none';
} finally {
submitBtn.disabled = false;
submitBtn.textContent = 'Run Query';
}
}
// Handle dropdown selection
document.getElementById('urlSelect').addEventListener('change', async function(e) {
const selectedUrl = e.target.value;
if (selectedUrl) {
document.getElementById('parquetUrl').value = selectedUrl;
currentDatasetUrl = selectedUrl;
await detectColumns(selectedUrl);
}
});
// Handle manual URL input (detect when user blurs or presses enter)
document.getElementById('parquetUrl').addEventListener('blur', async function(e) {
const url = e.target.value.trim();
if (url && url !== currentDatasetUrl) {
currentDatasetUrl = url;
await detectColumns(url);
}
});
// Handle visualization selection
document.getElementById('vizSelect').addEventListener('change', function(e) {
const renderBtn = document.getElementById('renderVizBtn');
renderBtn.disabled = !e.target.value;
});
// Handle render visualization button
document.getElementById('renderVizBtn').addEventListener('click', async function() {
const vizSelect = document.getElementById('vizSelect');
const selectedValue = vizSelect.value;
if (selectedValue) {
const { xCol, yCol, chartType } = JSON.parse(selectedValue);
await renderVisualization(xCol, yCol, chartType);
}
});
// Set up event listeners
document.getElementById('queryForm').addEventListener('submit', handleSubmit);
// Initialize on load
setStatus('Ready to query parquet files!', 'success');
</script>
</body>
</html>