Spaces:
Running
Running
| <html lang="ja"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <title>Moonshine Tiny JA - リアルタイム日本語文字起こし</title> | |
| <meta name="description" content="ブラウザ上で動作する日本語音声認識デモ。Moonshine Tiny JAモデルをTransformers.jsで実行。"> | |
| <style> | |
| * { | |
| box-sizing: border-box; | |
| } | |
| body { | |
| font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; | |
| max-width: 800px; | |
| margin: 0 auto; | |
| padding: 20px; | |
| background: #1a1a2e; | |
| color: #eee; | |
| min-height: 100vh; | |
| } | |
| h1 { | |
| text-align: center; | |
| color: #00d4ff; | |
| margin-bottom: 5px; | |
| } | |
| .subtitle { | |
| text-align: center; | |
| color: #888; | |
| font-size: 14px; | |
| margin-bottom: 20px; | |
| } | |
| .status { | |
| text-align: center; | |
| padding: 10px; | |
| border-radius: 8px; | |
| margin: 20px 0; | |
| background: #16213e; | |
| } | |
| .status.loading { | |
| color: #ffa500; | |
| } | |
| .status.ready { | |
| color: #00ff88; | |
| } | |
| .status.recording { | |
| color: #ff4757; | |
| } | |
| .status.error { | |
| color: #ff4757; | |
| background: #2d1f1f; | |
| } | |
/* Base look shared by every <button> on the page (pill shape, centred). */
button {
  display: block;
  width: 200px;
  margin: 20px auto;
  padding: 15px 30px;
  font-size: 18px;
  border: none;
  border-radius: 50px;
  cursor: pointer;
  transition: all 0.3s;
}
button:disabled {
  background: #555;
  cursor: not-allowed;
}
/* Primary record/stop button. */
#startBtn {
  background: linear-gradient(135deg, #00d4ff, #00ff88);
  color: #1a1a2e;
  font-weight: bold;
}
/*
 * The #startBtn rule above has ID specificity and therefore beats the
 * generic `button:disabled` background. Without this explicit override the
 * start button keeps its bright gradient while disabled (model loading).
 */
#startBtn:disabled {
  background: #555;
  color: #ccc;
}
#startBtn:hover:not(:disabled) {
  transform: scale(1.05);
  box-shadow: 0 0 20px rgba(0, 212, 255, 0.5);
}
/* Red pulsing state while a recording session is active. */
#startBtn.recording {
  background: linear-gradient(135deg, #ff4757, #ff6b81);
  animation: pulse 1s infinite;
}
| @keyframes pulse { | |
| 0%, 100% { | |
| box-shadow: 0 0 0 0 rgba(255, 71, 87, 0.4); | |
| } | |
| 50% { | |
| box-shadow: 0 0 0 15px rgba(255, 71, 87, 0); | |
| } | |
| } | |
| .control-container { | |
| margin: 20px 0; | |
| padding: 15px; | |
| background: #16213e; | |
| border-radius: 8px; | |
| } | |
| .control-container label { | |
| display: block; | |
| margin-bottom: 10px; | |
| } | |
| .control-container input[type="range"] { | |
| width: 100%; | |
| cursor: pointer; | |
| } | |
| .slider-labels { | |
| display: flex; | |
| justify-content: space-between; | |
| font-size: 12px; | |
| color: #888; | |
| margin-top: 5px; | |
| } | |
| .mode-switch { | |
| display: flex; | |
| gap: 10px; | |
| margin-bottom: 15px; | |
| } | |
| .mode-switch button { | |
| flex: 1; | |
| width: auto; | |
| margin: 0; | |
| padding: 10px 15px; | |
| font-size: 14px; | |
| border-radius: 8px; | |
| background: #0f0f23; | |
| color: #888; | |
| } | |
| .mode-switch button.active { | |
| background: linear-gradient(135deg, #00d4ff, #00ff88); | |
| color: #1a1a2e; | |
| font-weight: bold; | |
| } | |
| .mode-switch button:disabled { | |
| opacity: 0.5; | |
| } | |
| .mode-description { | |
| font-size: 12px; | |
| color: #888; | |
| margin-top: 10px; | |
| padding: 10px; | |
| background: #0f0f23; | |
| border-radius: 6px; | |
| } | |
| #transcript { | |
| background: #16213e; | |
| border-radius: 12px; | |
| padding: 20px; | |
| min-height: 200px; | |
| margin-top: 20px; | |
| font-size: 18px; | |
| line-height: 1.8; | |
| white-space: pre-wrap; | |
| word-wrap: break-word; | |
| } | |
| #transcript:empty::before { | |
| content: "文字起こし結果がここに表示されます..."; | |
| color: #666; | |
| } | |
| #currentText { | |
| color: #00d4ff; | |
| font-style: italic; | |
| min-height: 30px; | |
| margin-top: 10px; | |
| text-align: center; | |
| } | |
| .info { | |
| background: #16213e; | |
| border-radius: 8px; | |
| padding: 15px; | |
| margin: 20px 0; | |
| font-size: 14px; | |
| color: #aaa; | |
| } | |
| .info a { | |
| color: #00d4ff; | |
| text-decoration: none; | |
| } | |
| .info a:hover { | |
| text-decoration: underline; | |
| } | |
| .progress-container { | |
| background: #0f0f23; | |
| border-radius: 10px; | |
| height: 20px; | |
| margin: 10px 0; | |
| overflow: hidden; | |
| } | |
| .progress-bar { | |
| height: 100%; | |
| background: linear-gradient(90deg, #00d4ff, #00ff88); | |
| width: 0%; | |
| transition: width 0.3s; | |
| } | |
| .footer { | |
| text-align: center; | |
| margin-top: 30px; | |
| padding-top: 20px; | |
| border-top: 1px solid #333; | |
| font-size: 12px; | |
| color: #666; | |
| } | |
| .footer a { | |
| color: #00d4ff; | |
| text-decoration: none; | |
| } | |
| </style> | |
| </head> | |
| <body> | |
<h1>Moonshine Tiny JA</h1>
<p class="subtitle">ブラウザで動作する日本語リアルタイム文字起こし</p>
<!-- Status banner: a leading text node followed by the download progress bar. -->
<div id="status" class="status loading">
  モデルを読み込み中...
  <div class="progress-container">
    <div id="progressBar" class="progress-bar"></div>
  </div>
</div>
<button type="button" id="startBtn" disabled>読み込み中...</button>
<div class="control-container">
  <div class="mode-switch">
    <button type="button" id="modeChunk" class="active">区切りモード</button>
    <button type="button" id="modeOverlap">オーバーラップモード</button>
  </div>
  <div id="modeDescription" class="mode-description">
    指定間隔ごとに録音を区切って処理。シンプルだが境界で言葉が途切れる可能性あり。
  </div>
</div>
<div class="control-container">
  <label for="intervalSlider">
    <span id="intervalLabel">録音間隔</span>: <span id="intervalValue">3</span>秒
  </label>
  <input type="range" id="intervalSlider" min="1" max="6" step="0.5" value="3">
  <div class="slider-labels">
    <span>1秒 (高速)</span>
    <span>6秒 (高精度)</span>
  </div>
</div>
<div id="currentText"></div>
<!-- Must stay completely empty: the placeholder text relies on the :empty selector. -->
<div id="transcript" aria-live="polite"></div>
<div class="info">
  <strong>使い方:</strong>
  <ol>
    <li>モデルの読み込みを待つ(初回は数分かかります)</li>
    <li>モードを選択(オーバーラップ推奨)</li>
    <li>「録音開始」ボタンをクリック</li>
    <li>マイクに向かって話す</li>
  </ol>
  <strong>モデル:</strong> <a href="https://huggingface.co/wmoto-ai/moonshine-tiny-ja-ONNX" target="_blank" rel="noopener noreferrer">wmoto-ai/moonshine-tiny-ja-ONNX</a><br>
  <strong>ベース:</strong> <a href="https://huggingface.co/UsefulSensors/moonshine-tiny-ja" target="_blank" rel="noopener noreferrer">UsefulSensors/moonshine-tiny-ja</a>
</div>
<footer class="footer">
  Powered by <a href="https://www.moonshine.ai/" target="_blank" rel="noopener noreferrer">Moonshine AI</a> |
  <a href="https://huggingface.co/docs/transformers.js" target="_blank" rel="noopener noreferrer">Transformers.js</a><br>
  Licensed under <a href="https://github.com/usefulsensors/moonshine/blob/main/LICENSE" target="_blank" rel="noopener noreferrer">Moonshine AI Community License</a>
</footer>
| <script type="module"> | |
| import { | |
| MoonshineForConditionalGeneration, | |
| AutoProcessor, | |
| AutoTokenizer, | |
| } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3'; | |
// ---- DOM handles ----------------------------------------------------
const byId = (id) => document.getElementById(id);
const statusEl = byId('status');
const startBtn = byId('startBtn');
const transcriptEl = byId('transcript');
const currentTextEl = byId('currentText');
const progressBar = byId('progressBar');
const intervalSlider = byId('intervalSlider');
const intervalValue = byId('intervalValue');
const intervalLabel = byId('intervalLabel');
const modeChunkBtn = byId('modeChunk');
const modeOverlapBtn = byId('modeOverlap');
const modeDescription = byId('modeDescription');

// ---- Inference objects (assigned by loadModel) ------------------------
let model = null,
  processor = null,
  tokenizer = null;

// ---- Session flags --------------------------------------------------
let isRecording = false; // true while a recording session is active
let isProcessing = false; // true while an overlap-mode inference is in flight
let currentMode = 'chunk'; // 'chunk' or 'overlap'

// ---- Chunk mode state -----------------------------------------------
let mediaRecorder = null;
let audioChunks = []; // Blob pieces delivered by MediaRecorder

// ---- Overlap mode state ---------------------------------------------
let audioContext = null;
let mediaStream = null;
let scriptProcessor = null;
let audioBuffer = []; // rolling buffer of raw samples
let processTimer = null; // setInterval handle for processAudioWindow
let lastTranscript = ''; // previous window's text, used for de-duplication

// ---- Constants ------------------------------------------------------
const SAMPLE_RATE = 16000; // Hz
const WINDOW_SEC = 4; // length of the overlap-mode analysis window
const MODEL_ID = 'wmoto-ai/moonshine-tiny-ja-ONNX';
| // Mode switching | |
/**
 * Select the transcription mode and reset the interval slider to that
 * mode's defaults. Ignored while a recording session is active.
 *
 * @param {string} mode 'chunk' selects chunk mode; any other value selects
 *   overlap mode.
 */
function setMode(mode) {
  if (isRecording) return;
  currentMode = mode;

  const chunkSelected = mode === 'chunk';
  const preset = chunkSelected
    ? {
        description: '指定間隔ごとに録音を区切って処理。シンプルだが境界で言葉が途切れる可能性あり。',
        label: '録音間隔',
        max: '6',
        value: '3',
      }
    : {
        description: '連続バッファ + オーバーラップ処理。境界での途切れを防ぎ、滑らかな文字起こしを実現。',
        label: '処理間隔',
        max: '4',
        value: '2',
      };

  modeChunkBtn.classList.toggle('active', chunkSelected);
  modeOverlapBtn.classList.toggle('active', !chunkSelected);
  modeDescription.textContent = preset.description;
  intervalLabel.textContent = preset.label;
  intervalSlider.min = '1';
  intervalSlider.max = preset.max;
  intervalSlider.value = preset.value;
  intervalValue.textContent = preset.value;
}
// Wire each mode button to its mode name.
for (const [button, mode] of [[modeChunkBtn, 'chunk'], [modeOverlapBtn, 'overlap']]) {
  button.addEventListener('click', () => setMode(mode));
}
// Mirror the slider position into the numeric read-out next to its label.
intervalSlider.addEventListener('input', (event) => {
  intervalValue.textContent = event.target.value;
});
/**
 * Download the model, processor and tokenizer in parallel and enable the
 * start button once all three are available.
 *
 * While loading, only the leading text node of the status banner is
 * rewritten. Assigning `statusEl.textContent` would replace every child of
 * the banner, including the progress bar, leaving `progressBar` detached
 * and invisible.
 *
 * Errors are reported in the status banner; the returned promise never
 * rejects.
 *
 * @returns {Promise<void>}
 */
async function loadModel() {
  // Update the banner message without discarding the nested progress bar.
  const setLoadingText = (text) => {
    const head = statusEl.firstChild;
    if (head && head.nodeType === Node.TEXT_NODE) {
      head.nodeValue = text;
    } else {
      // insertBefore(node, null) appends when the banner has no children.
      statusEl.insertBefore(document.createTextNode(text), head);
    }
  };

  try {
    setLoadingText('モデルを読み込み中... (初回は数分かかることがあります)');
    const progressCallback = (progress) => {
      if (progress.status !== 'progress') return;
      // Skip events without a usable total; they would render as "NaN%".
      if (!(progress.total > 0)) return;
      const percent = Math.min(100, Math.round((progress.loaded / progress.total) * 100));
      progressBar.style.width = percent + '%';
      setLoadingText(`モデルを読み込み中... ${percent}%`);
    };
    [model, processor, tokenizer] = await Promise.all([
      MoonshineForConditionalGeneration.from_pretrained(MODEL_ID, {
        dtype: 'fp32',
        progress_callback: progressCallback
      }),
      AutoProcessor.from_pretrained(MODEL_ID),
      AutoTokenizer.from_pretrained(MODEL_ID)
    ]);
    // Replacing textContent here intentionally removes the progress bar.
    statusEl.textContent = '準備完了!録音を開始できます';
    statusEl.className = 'status ready';
    startBtn.textContent = '録音開始';
    startBtn.disabled = false;
  } catch (error) {
    statusEl.textContent = `エラー: ${error.message}`;
    statusEl.className = 'status error';
  }
}
| // ============ Common transcription function ============ | |
/**
 * Run speech recognition on one buffer of mono samples.
 *
 * Returns null (instead of text) when the model is not loaded, when the
 * buffer is shorter than 1600 samples, when the signal is too quiet, or
 * when the output matches a known hallucination phrase.
 *
 * @param {Float32Array} audioData Samples to transcribe; the duration is
 *   computed assuming SAMPLE_RATE samples per second.
 * @returns {Promise<string|null>} Cleaned-up transcript or null.
 */
async function transcribe(audioData) {
  const pipelineReady = model && processor && tokenizer;
  if (!pipelineReady) return null;
  if (audioData.length < 1600) return null;

  // Silence gate: measure both peak amplitude and RMS energy.
  let peak = 0;
  let energy = 0;
  for (const sample of audioData) {
    const magnitude = Math.abs(sample);
    if (magnitude > peak) peak = magnitude;
    energy += sample * sample;
  }
  const rms = Math.sqrt(energy / audioData.length);
  const tooQuiet = rms < 0.01 || peak < 0.05;
  if (tooQuiet) return null;

  const inputs = await processor(audioData);
  // Token budget scales with clip length (25 tokens per second, max 150).
  const seconds = audioData.length / SAMPLE_RATE;
  const outputs = await model.generate({
    ...inputs,
    max_new_tokens: Math.min(Math.round(seconds * 25), 150),
  });

  const decoded = tokenizer.decode(outputs[0], { skip_special_tokens: true }).trim();
  // When a fragment repeats five or more times in a row, collapse runs of
  // four or more repetitions down to a single occurrence.
  const text = /(.{2,}?)\1{4,}/.test(decoded)
    ? decoded.replace(/(.{2,}?)\1{3,}/g, '$1')
    : decoded;

  // Drop long outputs that contain a phrase from the hallucination list.
  const knownHallucinations = ['彼は私', '彼女は私', 'そう、そう'];
  if (text.length > 30 && knownHallucinations.some((phrase) => text.includes(phrase))) {
    return null;
  }
  return text;
}
| // ============ Chunk Mode ============ | |
/**
 * Decode a recorded blob, render it to mono at SAMPLE_RATE, transcribe it
 * and append the result to the transcript (one line per blob).
 *
 * Lazily creates the shared `audioContext` if none exists yet. Errors are
 * shown in the preview line rather than thrown.
 *
 * @param {Blob} audioBlob Encoded audio produced by the recorder.
 * @returns {Promise<void>}
 */
async function transcribeAudioBlob(audioBlob) {
  try {
    currentTextEl.textContent = '処理中...';
    const encoded = await audioBlob.arrayBuffer();

    const ContextCtor = window.AudioContext || window.webkitAudioContext;
    audioContext = audioContext || new ContextCtor({ sampleRate: SAMPLE_RATE });
    const decoded = await audioContext.decodeAudioData(encoded);

    // Render through a one-channel offline graph to get mono samples at
    // the target rate.
    const frameCount = Math.ceil(decoded.duration * SAMPLE_RATE);
    const renderCtx = new OfflineAudioContext(1, frameCount, SAMPLE_RATE);
    const player = renderCtx.createBufferSource();
    player.buffer = decoded;
    player.connect(renderCtx.destination);
    player.start();
    const rendered = await renderCtx.startRendering();

    const text = await transcribe(rendered.getChannelData(0));
    if (!text) {
      currentTextEl.textContent = '(音声が検出されませんでした)';
      return;
    }
    currentTextEl.textContent = text;
    transcriptEl.textContent += text + '\n';
  } catch (error) {
    currentTextEl.textContent = `エラー: ${error.message}`;
  }
}
/**
 * Start chunk mode: record for the slider interval, stop, transcribe the
 * recorded blob, then start the next recording cycle.
 *
 * Changes from the naive loop:
 *  - WebM/Opus is used only when the browser supports it; otherwise the
 *    recorder falls back to its default container instead of throwing.
 *  - The microphone stream is released if setup fails after permission
 *    was granted.
 *  - Timers and stop handlers belonging to an earlier session are ignored,
 *    so a quick stop/start cannot cut the new recording short.
 *  - A cycle that produced no data no longer ends the loop silently.
 *
 * Errors are reported in the status banner; the promise never rejects.
 *
 * @returns {Promise<void>}
 */
async function startChunkRecording() {
  let stream = null;
  try {
    stream = await navigator.mediaDevices.getUserMedia({
      audio: { channelCount: 1, sampleRate: SAMPLE_RATE }
    });

    const preferredType = 'audio/webm;codecs=opus';
    const recorderOptions = MediaRecorder.isTypeSupported(preferredType)
      ? { mimeType: preferredType }
      : {};
    const recorder = new MediaRecorder(stream, recorderOptions);
    mediaRecorder = recorder;
    audioChunks = [];

    recorder.ondataavailable = (event) => {
      if (event.data.size > 0) {
        audioChunks.push(event.data);
      }
    };

    // True only while this recorder is the one driving the active session.
    const ownsSession = () => isRecording && mediaRecorder === recorder;

    const processAndRestart = () => {
      // Ignore timers left over from a previous session, and never call
      // stop() on a recorder that is not currently recording.
      if (!ownsSession() || recorder.state !== 'recording') return;
      recorder.stop();
    };

    const scheduleNextChunk = () => {
      // Re-read the slider each cycle so changes apply while recording.
      const intervalMs = parseFloat(intervalSlider.value) * 1000;
      setTimeout(processAndRestart, intervalMs);
    };

    recorder.onstop = async () => {
      if (!ownsSession()) return;
      if (audioChunks.length > 0) {
        const audioBlob = new Blob(audioChunks, { type: recorder.mimeType || preferredType });
        audioChunks = [];
        await transcribeAudioBlob(audioBlob);
      }
      // The user may have stopped while transcription was running.
      if (ownsSession() && stream.active) {
        recorder.start(500);
        scheduleNextChunk();
      }
    };

    recorder.start(500);
    isRecording = true;
    scheduleNextChunk();
    updateRecordingUI();
  } catch (error) {
    // Do not leave the microphone open when setup fails part-way.
    if (stream) {
      stream.getTracks().forEach(track => track.stop());
    }
    statusEl.textContent = `マイクエラー: ${error.message}`;
    statusEl.className = 'status error';
  }
}
/**
 * End the chunk-mode session and release the microphone.
 *
 * The tracks are stopped even when the recorder is already 'inactive'.
 * That is its state while a chunk is being transcribed, and skipping the
 * cleanup in that window would leave the microphone open after the user
 * pressed stop.
 */
function stopChunkRecording() {
  // Clear the flag first so the recorder's stop handler sees the session
  // as finished and does not start another cycle.
  isRecording = false;
  if (mediaRecorder) {
    if (mediaRecorder.state !== 'inactive') {
      mediaRecorder.stop();
    }
    mediaRecorder.stream.getTracks().forEach(track => track.stop());
  }
  audioChunks = [];
}
| // ============ Overlap Mode ============ | |
/**
 * Strip from the start of `newText` the longest prefix (2 to 20
 * characters) that also ends `prevText`.
 *
 * @param {string} prevText Previously emitted text.
 * @param {string} newText  Freshly recognised text.
 * @returns {string} `newText` without the overlapping prefix; `newText`
 *   unchanged when either argument is empty or no overlap of at least two
 *   characters exists.
 */
function removeDuplicateText(prevText, newText) {
  if (!prevText || !newText) return newText;

  const shortest = 2;
  let len = Math.min(prevText.length, newText.length, 20);
  // Try the longest candidate first so the largest overlap wins.
  while (len >= shortest) {
    if (prevText.endsWith(newText.slice(0, len))) {
      return newText.slice(len);
    }
    len -= 1;
  }
  return newText;
}
/**
 * Overlap-mode tick: transcribe the most recent WINDOW_SEC seconds of the
 * rolling buffer and append only the part not already emitted.
 *
 * Skips the tick when the model is not loaded, recording is off, or a
 * previous tick is still running. Results that arrive after recording was
 * stopped are discarded so they cannot overwrite the stopped-state UI.
 * The preview line always shows the latest recognised text, even when
 * de-duplication leaves nothing new to append.
 *
 * @returns {Promise<void>}
 */
async function processAudioWindow() {
  if (!model || !processor || !tokenizer || !isRecording) return;
  if (isProcessing) return;
  isProcessing = true;
  try {
    const windowSamples = WINDOW_SEC * SAMPLE_RATE;
    // Wait until at least half a window of audio has been collected.
    if (audioBuffer.length < windowSamples * 0.5) {
      currentTextEl.textContent = '(音声を収集中...)';
      return;
    }
    // A negative start takes the trailing window, or the whole buffer
    // when it is still shorter than one window.
    const audioData = new Float32Array(audioBuffer.slice(-windowSamples));
    currentTextEl.textContent = '処理中...';
    const text = await transcribe(audioData);

    // The user may have pressed stop while inference was running.
    if (!isRecording) return;

    if (!text) {
      currentTextEl.textContent = '(音声が検出されませんでした)';
      return;
    }
    // Always replace the "processing" placeholder with the latest text.
    currentTextEl.textContent = text;
    const uniqueText = removeDuplicateText(lastTranscript, text);
    if (uniqueText && uniqueText.length > 0) {
      transcriptEl.textContent += uniqueText;
      lastTranscript = text;
    }
  } catch (error) {
    currentTextEl.textContent = `エラー: ${error.message}`;
  } finally {
    isProcessing = false;
  }
}
/**
 * Start overlap mode: stream microphone samples into a rolling buffer of
 * at most 10 seconds and run processAudioWindow on a fixed interval taken
 * from the slider at start time.
 *
 * Changes from the naive version:
 *  - Any AudioContext left behind by an earlier session is closed before
 *    a new one is created, instead of being overwritten and leaked.
 *  - The buffer is trimmed with a single splice per callback rather than
 *    one O(n) shift() per excess sample.
 *  - If setup fails after the microphone was opened, everything acquired
 *    so far is released via stopOverlapRecording().
 *
 * Errors are reported in the status banner; the promise never rejects.
 *
 * @returns {Promise<void>}
 */
async function startOverlapRecording() {
  try {
    mediaStream = await navigator.mediaDevices.getUserMedia({
      audio: { channelCount: 1, sampleRate: SAMPLE_RATE }
    });

    if (audioContext) {
      // close() returns a promise; a failure to close is not actionable.
      audioContext.close().catch(() => {});
      audioContext = null;
    }
    audioContext = new (window.AudioContext || window.webkitAudioContext)({
      sampleRate: SAMPLE_RATE
    });
    const source = audioContext.createMediaStreamSource(mediaStream);
    const bufferSize = 4096;
    scriptProcessor = audioContext.createScriptProcessor(bufferSize, 1, 1);
    const maxBufferSize = SAMPLE_RATE * 10;

    scriptProcessor.onaudioprocess = (e) => {
      if (!isRecording) return;
      const inputData = e.inputBuffer.getChannelData(0);
      for (let i = 0; i < inputData.length; i++) {
        audioBuffer.push(inputData[i]);
      }
      // Drop the oldest samples in one operation once the cap is exceeded.
      const overflow = audioBuffer.length - maxBufferSize;
      if (overflow > 0) {
        audioBuffer.splice(0, overflow);
      }
    };

    source.connect(scriptProcessor);
    scriptProcessor.connect(audioContext.destination);

    audioBuffer = [];
    lastTranscript = '';
    isRecording = true;
    isProcessing = false;

    const intervalMs = parseFloat(intervalSlider.value) * 1000;
    processTimer = setInterval(processAudioWindow, intervalMs);
    updateRecordingUI();
  } catch (error) {
    // Release the microphone, context and timer acquired before the failure.
    stopOverlapRecording();
    statusEl.textContent = `マイクエラー: ${error.message}`;
    statusEl.className = 'status error';
  }
}
/**
 * Tear down the overlap-mode pipeline: clear the processing timer,
 * disconnect the script processor, close the audio context, stop the
 * microphone tracks and empty the sample buffer. Safe to call when some
 * or all of these were never created.
 */
function stopOverlapRecording() {
  isRecording = false;

  const timer = processTimer;
  if (timer) {
    processTimer = null;
    clearInterval(timer);
  }

  const node = scriptProcessor;
  if (node) {
    scriptProcessor = null;
    node.disconnect();
  }

  const context = audioContext;
  if (context) {
    audioContext = null;
    context.close();
  }

  const stream = mediaStream;
  if (stream) {
    mediaStream = null;
    for (const track of stream.getTracks()) {
      track.stop();
    }
  }

  audioBuffer = [];
}
| // ============ UI Helpers ============ | |
/**
 * Put the page into its "recording" look: red status banner, stop label
 * on the main button, and mode buttons locked.
 */
function updateRecordingUI() {
  Object.assign(statusEl, {
    textContent: '録音中... マイクに向かって話してください',
    className: 'status recording',
  });
  startBtn.textContent = '録音停止';
  startBtn.classList.add('recording');
  for (const modeButton of [modeChunkBtn, modeOverlapBtn]) {
    modeButton.disabled = true;
  }
}
/**
 * Put the page back into its idle look: ready-state banner, start label
 * on the main button, preview line cleared, and mode buttons unlocked.
 */
function updateStoppedUI() {
  Object.assign(statusEl, {
    textContent: '録音停止。再開するにはボタンをクリック',
    className: 'status ready',
  });
  startBtn.textContent = '録音開始';
  startBtn.classList.remove('recording');
  currentTextEl.textContent = '';
  for (const modeButton of [modeChunkBtn, modeOverlapBtn]) {
    modeButton.disabled = false;
  }
}
| // ============ Main Controls ============ | |
/**
 * Start a session in the currently selected mode.
 *
 * @returns {Promise<void>} Settles once the mode-specific start routine
 *   has finished, whether it succeeded or reported an error itself.
 */
function startRecording() {
  return currentMode === 'chunk' ? startChunkRecording() : startOverlapRecording();
}

/** Stop the session of the currently selected mode and reset the UI. */
function stopRecording() {
  if (currentMode === 'chunk') {
    stopChunkRecording();
  } else {
    stopOverlapRecording();
  }
  updateStoppedUI();
}

// True between a click on "start" and the end of the asynchronous start
// routine (which includes the microphone permission prompt).
let isStarting = false;

startBtn.addEventListener('click', async () => {
  // Ignore clicks while a start is pending; a second click would open a
  // second microphone session because isRecording is still false.
  if (isStarting) return;
  if (isRecording) {
    stopRecording();
    return;
  }
  isStarting = true;
  // Lock the mode for the duration of the start. Switching it while the
  // permission prompt is open would make stopRecording tear down the
  // wrong pipeline later.
  modeChunkBtn.disabled = true;
  modeOverlapBtn.disabled = true;
  try {
    await startRecording();
  } finally {
    isStarting = false;
    if (!isRecording) {
      // Start failed: give mode selection back to the user.
      modeChunkBtn.disabled = false;
      modeOverlapBtn.disabled = false;
    }
  }
});

loadModel();
| </script> | |
| </body> | |
| </html> | |