tscr-369 committed
Commit 83a2db2 · verified · 1 Parent(s): 0229d5e

Update main.py

Files changed (1)
  1. main.py +298 -25
main.py CHANGED
@@ -1,17 +1,19 @@
  import os
- from fastapi import FastAPI, File, UploadFile
+ import torch
+ import librosa
+ import numpy as np
+ from fastapi import FastAPI, File, UploadFile, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
- from huggingface_hub import login
- from NatureLM.models import NatureLM
- from NatureLM.infer import Pipeline
- import tempfile
+ from pydantic import BaseModel
+ from typing import Optional, Dict, Any
+ import json
+ import re
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from naturelm_audio import NatureLMAudio

- # Authenticate with HuggingFace to access gated models
- login(token=os.environ.get("HF_TOKEN"))
+ app = FastAPI(title="NatureLM Audio Analysis API")

- app = FastAPI()
-
- # Allow CORS for all origins (for frontend integration)
+ # CORS middleware
  app.add_middleware(
      CORSMiddleware,
      allow_origins=["*"],
@@ -20,20 +22,291 @@ app.add_middleware(
      allow_headers=["*"],
  )

- # Load the model once at startup
- model = NatureLM.from_pretrained("EarthSpeciesProject/NatureLM-audio").eval()
- pipeline = Pipeline(model=model)
+ # Initialize NatureLM model
+ model = None
+ tokenizer = None
+
+ def load_model():
+     global model, tokenizer
+     try:
+         # Load NatureLM-audio model
+         model = NatureLMAudio.from_pretrained("NatureLM/NatureLM-audio")
+         tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
+
+         # Set model to evaluation mode
+         model.eval()
+         if torch.cuda.is_available():
+             model = model.cuda()
+         print("✅ NatureLM model loaded successfully")
+     except Exception as e:
+         print(f"❌ Error loading model: {e}")
+         raise e
+
+ # Load model on startup
+ @app.on_event("startup")
+ async def startup_event():
+     load_model()
+
+ class AnalysisResponse(BaseModel):
+     species: str
+     interpretation: str
+     confidence: float
+     signal_type: str
+     common_name: str
+     scientific_name: str
+     habitat: str
+     behavior: str
+     audio_characteristics: Dict[str, Any]
+     model_confidence: float
+     llama_confidence: float
+     additional_insights: str
+     cluster_group: str
+
+ def extract_confidence_from_response(response_text: str) -> Dict[str, float]:
+     """Extract confidence scores from NatureLM response"""
+     confidence_scores = {
+         "model_confidence": 0.0,
+         "llama_confidence": 0.0
+     }
+
+     # Look for confidence patterns in the response
+     confidence_patterns = [
+         r"confidence[:\s]*(\d+(?:\.\d+)?)",
+         r"certainty[:\s]*(\d+(?:\.\d+)?)",
+         r"(\d+(?:\.\d+)?)%?\s*confidence",
+         r"confidence\s*level[:\s]*(\d+(?:\.\d+)?)"
+     ]
+
+     for pattern in confidence_patterns:
+         matches = re.findall(pattern, response_text.lower())
+         if matches:
+             try:
+                 confidence_scores["model_confidence"] = float(matches[0])
+                 confidence_scores["llama_confidence"] = float(matches[0])
+                 break
+             except ValueError:
+                 continue
+
+     return confidence_scores

- @app.post("/analyze")
+ def extract_species_info(response_text: str) -> Dict[str, str]:
+     """Extract detailed species information from NatureLM response"""
+     info = {
+         "common_name": "",
+         "scientific_name": "",
+         "habitat": "",
+         "behavior": "",
+         "signal_type": ""
+     }
+
+     # Extract common name
+     common_patterns = [
+         r"common name[:\s]*([A-Za-z\s]+)",
+         r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+\(common\)",
+         r"species[:\s]*([A-Za-z\s]+)"
+     ]
+
+     for pattern in common_patterns:
+         match = re.search(pattern, response_text, re.IGNORECASE)
+         if match:
+             info["common_name"] = match.group(1).strip()
+             break
+
+     # Extract scientific name
+     sci_patterns = [
+         r"scientific name[:\s]*([A-Z][a-z]+\s+[a-z]+)",
+         r"([A-Z][a-z]+\s+[a-z]+)\s+\(scientific\)",
+         r"genus[:\s]*([A-Z][a-z]+)\s+species[:\s]*([a-z]+)"
+     ]
+
+     for pattern in sci_patterns:
+         match = re.search(pattern, response_text, re.IGNORECASE)
+         if match:
+             if len(match.groups()) == 2:
+                 info["scientific_name"] = f"{match.group(1)} {match.group(2)}"
+             else:
+                 info["scientific_name"] = match.group(1).strip()
+             break
+
+     # Extract signal type
+     signal_patterns = [
+         r"signal type[:\s]*([A-Za-z\s]+)",
+         r"call type[:\s]*([A-Za-z\s]+)",
+         r"vocalization[:\s]*([A-Za-z\s]+)",
+         r"sound type[:\s]*([A-Za-z\s]+)"
+     ]
+
+     for pattern in signal_patterns:
+         match = re.search(pattern, response_text, re.IGNORECASE)
+         if match:
+             info["signal_type"] = match.group(1).strip()
+             break
+
+     # Extract habitat
+     habitat_patterns = [
+         r"habitat[:\s]*([A-Za-z\s,]+)",
+         r"environment[:\s]*([A-Za-z\s,]+)",
+         r"found in[:\s]*([A-Za-z\s,]+)"
+     ]
+
+     for pattern in habitat_patterns:
+         match = re.search(pattern, response_text, re.IGNORECASE)
+         if match:
+             info["habitat"] = match.group(1).strip()
+             break
+
+     # Extract behavior
+     behavior_patterns = [
+         r"behavior[:\s]*([A-Za-z\s,]+)",
+         r"purpose[:\s]*([A-Za-z\s,]+)",
+         r"function[:\s]*([A-Za-z\s,]+)"
+     ]
+
+     for pattern in behavior_patterns:
+         match = re.search(pattern, response_text, re.IGNORECASE)
+         if match:
+             info["behavior"] = match.group(1).strip()
+             break
+
+     return info
+
+ def analyze_audio_characteristics(audio_path: str) -> Dict[str, Any]:
+     """Analyze audio characteristics using librosa"""
+     try:
+         # Load audio file
+         y, sr = librosa.load(audio_path, sr=None)
+
+         # Calculate audio features
+         duration = librosa.get_duration(y=y, sr=sr)
+
+         # Spectral features
+         spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
+         spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
+
+         # MFCC features
+         mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
+
+         # Pitch features
+         pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
+
+         # Rhythm features
+         tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
+
+         # Energy features
+         rms = librosa.feature.rms(y=y)[0]
+
+         characteristics = {
+             "duration_seconds": float(duration),
+             "sample_rate": int(sr),
+             "tempo_bpm": float(tempo),
+             "mean_spectral_centroid": float(np.mean(spectral_centroids)),
+             "mean_spectral_rolloff": float(np.mean(spectral_rolloff)),
+             "mean_rms_energy": float(np.mean(rms)),
+             "mfcc_mean": [float(x) for x in np.mean(mfccs, axis=1)],
+             "pitch_range": {
+                 "min": float(np.min(pitches[magnitudes > 0.1]) if np.any(magnitudes > 0.1) else 0),
+                 "max": float(np.max(pitches[magnitudes > 0.1]) if np.any(magnitudes > 0.1) else 0),
+                 "mean": float(np.mean(pitches[magnitudes > 0.1]) if np.any(magnitudes > 0.1) else 0)
+             }
+         }
+
+         return characteristics
+     except Exception as e:
+         print(f"Error analyzing audio characteristics: {e}")
+         return {}
+
+ @app.post("/analyze", response_model=AnalysisResponse)
  async def analyze_audio(file: UploadFile = File(...)):
-     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
-         tmp.write(await file.read())
-         tmp_path = tmp.name
-     results = pipeline([tmp_path], ["What is the common name for the focal species in the audio? Answer:"])
-     return {
-         "species": results[0],  # Adjust parsing as needed
-         "interpretation": "TODO: parse from model output",
-         "confidence": 90,  # TODO: parse or estimate
-         "clusterGroup": "TODO: parse from model output",
-         "additionalInfo": "TODO: parse from model output"
-     }
+     try:
+         # Save uploaded file temporarily
+         temp_path = f"/tmp/{file.filename}"
+         with open(temp_path, "wb") as buffer:
+             content = await file.read()
+             buffer.write(content)
+
+         # Analyze audio characteristics
+         audio_chars = analyze_audio_characteristics(temp_path)
+
+         # Create enhanced prompt for NatureLM
+         enhanced_prompt = f"""
+         Analyze this animal audio recording and provide detailed information including:
+
+         1. Species identification (common name and scientific name)
+         2. Signal type and purpose
+         3. Habitat and behavior context
+         4. Audio characteristics analysis
+         5. Confidence level in your assessment
+
+         Please provide a comprehensive analysis with specific details about:
+         - Common name of the species
+         - Scientific name (genus and species)
+         - Type of vocalization (call, song, alarm, etc.)
+         - Habitat where this species is typically found
+         - Behavioral context of this sound
+         - Confidence level (0-100%)
+
+         Audio file: {file.filename}
+         Duration: {audio_chars.get('duration_seconds', 'Unknown')} seconds
+         Sample rate: {audio_chars.get('sample_rate', 'Unknown')} Hz
+         """
+
+         # Get NatureLM prediction
+         with torch.no_grad():
+             inputs = tokenizer(enhanced_prompt, return_tensors="pt")
+             if torch.cuda.is_available():
+                 inputs = {k: v.cuda() for k, v in inputs.items()}
+
+             outputs = model.generate(
+                 **inputs,
+                 max_length=512,
+                 temperature=0.7,
+                 do_sample=True,
+                 pad_token_id=tokenizer.eos_token_id
+             )
+
+         response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+         # Extract information from response
+         confidence_scores = extract_confidence_from_response(response_text)
+         species_info = extract_species_info(response_text)
+
+         # Calculate overall confidence
+         overall_confidence = max(
+             confidence_scores["model_confidence"],
+             confidence_scores["llama_confidence"],
+             50.0  # Default fallback
+         )
+
+         # Clean up temp file
+         os.remove(temp_path)
+
+         return AnalysisResponse(
+             species=species_info["common_name"] or "Unknown species",
+             interpretation=response_text,
+             confidence=overall_confidence,
+             signal_type=species_info["signal_type"] or "Vocalization",
+             common_name=species_info["common_name"] or "Unknown",
+             scientific_name=species_info["scientific_name"] or "Unknown",
+             habitat=species_info["habitat"] or "Unknown habitat",
+             behavior=species_info["behavior"] or "Unknown behavior",
+             audio_characteristics=audio_chars,
+             model_confidence=confidence_scores["model_confidence"],
+             llama_confidence=confidence_scores["llama_confidence"],
+             additional_insights=response_text,
+             cluster_group="NatureLM Analysis"
+         )
+
+     except Exception as e:
+         # Clean up temp file if it exists
+         if os.path.exists(temp_path):
+             os.remove(temp_path)
+
+         raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
+
+ @app.get("/health")
+ async def health_check():
+     return {"status": "healthy", "model_loaded": model is not None}
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
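A quick way to exercise the new endpoints, as a minimal client-side sketch (assumptions: the server is running locally on port 8000 as in the __main__ block, and "recording.mp3" is a placeholder for a real audio file; the multipart field is named "file" to match the UploadFile parameter of analyze_audio):

import requests

BASE_URL = "http://localhost:8000"  # assumed local deployment

# Confirm the model finished loading before sending audio.
print(requests.get(f"{BASE_URL}/health").json())

# POST the audio as multipart form data under the "file" field.
with open("recording.mp3", "rb") as f:  # placeholder filename
    resp = requests.post(f"{BASE_URL}/analyze", files={"file": f})
resp.raise_for_status()

# The JSON body follows the AnalysisResponse schema defined in the diff.
result = resp.json()
print(result["species"], result["confidence"], result["signal_type"])

Note that the field name in `files={"file": f}` must match the endpoint's parameter name, since FastAPI binds multipart fields by name.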