Update main.py

main.py (CHANGED)

@@ -4,13 +4,14 @@ import librosa
 import numpy as np
 from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
-from typing import Optional, Dict, Any
+from pydantic import BaseModel, ConfigDict
+from typing import Optional, Dict, Any, List
 import json
 import re
 from contextlib import asynccontextmanager
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from huggingface_hub import InferenceClient
+import base64
 
 # Set up cache directories BEFORE importing any HuggingFace modules
 cache_base = "/app/.cache"
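
Note on this import hunk: the model-loading block added in the lifespan hunk below references torch.float16, but no torch import is added here, and none is visible in the unchanged lines this diff shows (lines 1-3 may or may not already import it). If it is missing, the local-load path raises NameError at startup and the service always falls back to the Inference API. A minimal fix, assuming torch is not imported elsewhere in the file:

    import torch  # needed for the torch_dtype=torch.float16 argument in lifespan()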
@@ -32,13 +33,13 @@ for cache_dir in cache_dirs:
 
 # Global variables for model and pipeline
 model = None
-
+audio_pipeline = None
 client = None
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     # Startup
-    global model, client
+    global model, audio_pipeline, client
     try:
         print("Starting NatureLM Audio Decoder API...")
         print(f"Using cache directory: {os.environ.get('HF_HOME', '/app/.cache')}")
@@ -47,9 +48,37 @@ async def lifespan(app: FastAPI):
         client = InferenceClient()
         print("HuggingFace client initialized successfully")
 
-        # …
-
-
+        # Load NatureLM-audio model locally for better performance
+        try:
+            print("Loading NatureLM-audio model...")
+            model_name = "EarthSpeciesProject/NatureLM-audio"
+
+            # Load tokenizer and model
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype=torch.float16,
+                device_map="auto",
+                trust_remote_code=True
+            )
+
+            # Create audio pipeline
+            audio_pipeline = pipeline(
+                "automatic-speech-recognition",
+                model=model,
+                tokenizer=tokenizer,
+                device_map="auto"
+            )
+
+            print("NatureLM-audio model loaded successfully")
+
+        except Exception as model_error:
+            print(f"Could not load model locally: {model_error}")
+            print("Falling back to HuggingFace Inference API")
+            model = None
+            audio_pipeline = None
+
+        print("API ready for NatureLM-audio analysis")
 
     except Exception as e:
         print(f"Error during startup: {e}")
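
Note on the lifespan changes: EarthSpeciesProject/NatureLM-audio is not a plain text-generation checkpoint, so the AutoModelForCausalLM load and the "automatic-speech-recognition" pipeline call may fail at runtime; the inner try/except is what keeps the Space booting by nulling both globals. A hypothetical helper (not part of this commit) that surfaces which path the service ended up on:

    def inference_mode() -> str:
        # Mirrors the fallback flags set in lifespan(): local pipeline first,
        # then the hosted Inference API, else the canned mock analysis.
        if audio_pipeline is not None:
            return "local NatureLM-audio pipeline"
        if client is not None:
            return "HuggingFace Inference API"
        return "fallback mock analysis"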
@@ -79,6 +108,8 @@ app.add_middleware(
 )
 
 class AnalysisResponse(BaseModel):
+    model_config = ConfigDict(protected_namespaces=())
+
     species: str
     interpretation: str
     confidence: float
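
Note on model_config: pydantic v2 reserves the model_ prefix as a protected namespace, and AnalysisResponse declares a model_confidence field below, so without ConfigDict(protected_namespaces=()) defining the class emits a UserWarning. A standalone reproduction of the pattern:

    from pydantic import BaseModel, ConfigDict

    class Demo(BaseModel):
        model_config = ConfigDict(protected_namespaces=())  # silences the "model_" namespace warning

        model_confidence: float

    print(Demo(model_confidence=0.9))  # model_confidence=0.9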
@@ -92,52 +123,83 @@ class AnalysisResponse(BaseModel):
     llama_confidence: float
     additional_insights: str
     cluster_group: str
+    detailed_caption: str
+    confidence_breakdown: Dict[str, float]
+    species_alternatives: List[Dict[str, Any]]
+    audio_quality_score: float
+    detection_method: str
 
 def extract_confidence_from_response(response_text: str) -> Dict[str, float]:
-    """Extract confidence scores from NatureLM response"""
+    """Extract confidence scores from NatureLM response with enhanced parsing"""
     confidence_scores = {
         "model_confidence": 0.0,
-        "llama_confidence": 0.0
+        "llama_confidence": 0.0,
+        "species_confidence": 0.0,
+        "signal_confidence": 0.0,
+        "overall_confidence": 0.0
     }
 
-    # …
+    # Enhanced confidence patterns
     confidence_patterns = [
        r"confidence[:\s]*(\d+(?:\.\d+)?)",
         r"certainty[:\s]*(\d+(?:\.\d+)?)",
         r"(\d+(?:\.\d+)?)%?\s*confidence",
-        r"confidence\s*level[:\s]*(\d+(?:\.\d+)?)"
+        r"confidence\s*level[:\s]*(\d+(?:\.\d+)?)",
+        r"(\d+(?:\.\d+)?)\s*out\s*of\s*100",
+        r"probability[:\s]*(\d+(?:\.\d+)?)",
+        r"likelihood[:\s]*(\d+(?:\.\d+)?)"
     ]
 
     for pattern in confidence_patterns:
         matches = re.findall(pattern, response_text.lower())
         if matches:
             try:
-                confidence_scores["model_confidence"] = float(matches[0])
-                confidence_scores["llama_confidence"] = float(matches[0])
+                confidence_value = float(matches[0])
+                confidence_scores["model_confidence"] = confidence_value
+                confidence_scores["llama_confidence"] = confidence_value
+                confidence_scores["overall_confidence"] = confidence_value
                 break
             except ValueError:
                 continue
 
+    # Extract species-specific confidence
+    species_confidence_patterns = [
+        r"species\s+confidence[:\s]*(\d+(?:\.\d+)?)",
+        r"identification\s+confidence[:\s]*(\d+(?:\.\d+)?)",
+        r"species\s+probability[:\s]*(\d+(?:\.\d+)?)"
+    ]
+
+    for pattern in species_confidence_patterns:
+        match = re.search(pattern, response_text.lower())
+        if match:
+            try:
+                confidence_scores["species_confidence"] = float(match.group(1))
+            except ValueError:
+                continue
+
     return confidence_scores
 
 def extract_species_info(response_text: str) -> Dict[str, str]:
-    """Extract detailed species information from NatureLM response"""
+    """Extract detailed species information from NatureLM response with enhanced parsing"""
     info = {
         "common_name": "",
         "scientific_name": "",
         "habitat": "",
         "behavior": "",
-        "signal_type": ""
+        "signal_type": "",
+        "detailed_caption": ""
     }
 
-    # …
+    # Enhanced common name extraction
     common_patterns = [
-        r"common name[:\s]*([A-Za-z\s]+)",
+        r"common name[:\s]*([A-Za-z\s\-]+)",
         r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+\(common\)",
-        r"species[:\s]*([A-Za-z\s]+)",
-        r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+treefrog",
-        r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+bird",
-        r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+mammal"
+        r"species[:\s]*([A-Za-z\s\-]+)",
+        r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:treefrog|frog|toad)",
+        r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:bird|sparrow|warbler|thrush|owl|hawk|eagle)",
+        r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:mammal|bat|whale|dolphin|seal|bear|wolf|fox)",
+        r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:insect|bee|cricket|cicada|grasshopper)",
+        r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:fish|shark|tuna|salmon)"
     ]
 
     for pattern in common_patterns:
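
Note on the confidence regexes: patterns are tried in list order and the first one with any match wins, with the captured number applied to model, llama, and overall confidence alike. A quick standalone check against a typical model reply:

    import re

    text = "Confidence level: 85%".lower()
    print(re.findall(r"confidence\s*level[:\s]*(\d+(?:\.\d+)?)", text))  # ['85']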
@@ -146,11 +208,13 @@ def extract_species_info(response_text: str) -> Dict[str, str]:
             info["common_name"] = match.group(1).strip()
             break
 
-    # …
+    # Enhanced scientific name extraction
     sci_patterns = [
         r"scientific name[:\s]*([A-Z][a-z]+\s+[a-z]+)",
         r"([A-Z][a-z]+\s+[a-z]+)\s+\(scientific\)",
-        r"genus[:\s]*([A-Z][a-z]+)\s+species[:\s]*([a-z]+)"
+        r"genus[:\s]*([A-Z][a-z]+)\s+species[:\s]*([a-z]+)",
+        r"([A-Z][a-z]+)\s+([a-z]+)\s+\(scientific\)",
+        r"([A-Z][a-z]+)\s+([a-z]+)\s+species"
     ]
 
     for pattern in sci_patterns:
@@ -162,14 +226,15 @@ def extract_species_info(response_text: str) -> Dict[str, str]:
             info["scientific_name"] = match.group(1).strip()
             break
 
-    # …
+    # Enhanced signal type extraction
     signal_patterns = [
-        r"signal type[:\s]*([A-Za-z\s]+)",
-        r"call type[:\s]*([A-Za-z\s]+)",
-        r"vocalization[:\s]*([A-Za-z\s]+)",
-        r"sound type[:\s]*([A-Za-z\s]+)",
-        r"([A-Za-z\s]+)\s+call",
-        r"([A-Za-z\s]+)…
+        r"signal type[:\s]*([A-Za-z\s\-]+)",
+        r"call type[:\s]*([A-Za-z\s\-]+)",
+        r"vocalization[:\s]*([A-Za-z\s\-]+)",
+        r"sound type[:\s]*([A-Za-z\s\-]+)",
+        r"([A-Za-z\s\-]+)\s+(?:call|song|chirp|trill|whistle|hoot|bark|growl|roar|squeak|click|buzz)",
+        r"vocalization\s+type[:\s]*([A-Za-z\s\-]+)",
+        r"communication\s+type[:\s]*([A-Za-z\s\-]+)"
     ]
 
     for pattern in signal_patterns:
@@ -178,12 +243,15 @@ def extract_species_info(response_text: str) -> Dict[str, str]:
             info["signal_type"] = match.group(1).strip()
             break
 
-    # …
+    # Enhanced habitat extraction
     habitat_patterns = [
-        r"habitat[:\s]*([A-Za-z\s]+)",
-        r"environment[:\s]*([A-Za-z\s]+)",
-        r"found in[:\s]*([A-Za-z\s]+)",
-        r"lives in[:\s]*([A-Za-z\s]+)"
+        r"habitat[:\s]*([A-Za-z\s,\-]+)",
+        r"environment[:\s]*([A-Za-z\s,\-]+)",
+        r"found in[:\s]*([A-Za-z\s,\-]+)",
+        r"lives in[:\s]*([A-Za-z\s,\-]+)",
+        r"native to[:\s]*([A-Za-z\s,\-]+)",
+        r"distribution[:\s]*([A-Za-z\s,\-]+)",
+        r"range[:\s]*([A-Za-z\s,\-]+)"
     ]
 
     for pattern in habitat_patterns:
@@ -192,12 +260,15 @@ def extract_species_info(response_text: str) -> Dict[str, str]:
             info["habitat"] = match.group(1).strip()
             break
 
-    # …
+    # Enhanced behavior extraction
     behavior_patterns = [
-        r"behavior[:\s]*([A-Za-z\s]+)",
-        r"purpose[:\s]*([A-Za-z\s]+)",
-        r"function[:\s]*([A-Za-z\s]+)",
-        r"used for[:\s]*([A-Za-z\s]+)"
+        r"behavior[:\s]*([A-Za-z\s,\-]+)",
+        r"purpose[:\s]*([A-Za-z\s,\-]+)",
+        r"function[:\s]*([A-Za-z\s,\-]+)",
+        r"used for[:\s]*([A-Za-z\s,\-]+)",
+        r"behavioral\s+context[:\s]*([A-Za-z\s,\-]+)",
+        r"communication\s+purpose[:\s]*([A-Za-z\s,\-]+)",
+        r"significance[:\s]*([A-Za-z\s,\-]+)"
     ]
 
     for pattern in behavior_patterns:
@@ -206,10 +277,56 @@ def extract_species_info(response_text: str) -> Dict[str, str]:
             info["behavior"] = match.group(1).strip()
             break
 
+    # Extract detailed caption from the full response
+    info["detailed_caption"] = response_text.strip()
+
     return info
 
+def generate_detailed_caption(species_info: Dict[str, str], audio_chars: Dict[str, Any], confidence_scores: Dict[str, float]) -> str:
+    """Generate a comprehensive, detailed caption for the audio"""
+
+    caption_parts = []
+
+    # Species identification
+    if species_info["common_name"]:
+        caption_parts.append(f"Species: {species_info['common_name']}")
+        if species_info["scientific_name"]:
+            caption_parts.append(f"({species_info['scientific_name']})")
+
+    # Signal type and characteristics
+    if species_info["signal_type"]:
+        caption_parts.append(f"Signal Type: {species_info['signal_type']}")
+
+    # Audio characteristics
+    if audio_chars:
+        duration = audio_chars.get('duration_seconds', 0)
+        if duration > 0:
+            caption_parts.append(f"Duration: {duration:.2f}s")
+
+        tempo = audio_chars.get('tempo_bpm', 0)
+        if tempo > 0:
+            caption_parts.append(f"Tempo: {tempo:.1f} BPM")
+
+        pitch_range = audio_chars.get('pitch_range', {})
+        if pitch_range.get('min', 0) > 0 and pitch_range.get('max', 0) > 0:
+            caption_parts.append(f"Pitch Range: {pitch_range['min']:.1f}-{pitch_range['max']:.1f} Hz")
+
+    # Habitat and behavior context
+    if species_info["habitat"]:
+        caption_parts.append(f"Habitat: {species_info['habitat']}")
+
+    if species_info["behavior"]:
+        caption_parts.append(f"Behavior: {species_info['behavior']}")
+
+    # Confidence information
+    overall_conf = confidence_scores.get('overall_confidence', 0)
+    if overall_conf > 0:
+        caption_parts.append(f"Confidence: {overall_conf:.1f}%")
+
+    return " | ".join(caption_parts) if caption_parts else "Audio analysis completed"
+
 def analyze_audio_characteristics(audio_path: str) -> Dict[str, Any]:
-    """Analyze audio characteristics using librosa"""
+    """Analyze audio characteristics using librosa with enhanced features"""
     try:
         # Load audio file
         y, sr = librosa.load(audio_path, sr=None)
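
Usage sketch for the new generate_detailed_caption (all input values invented for illustration):

    caption = generate_detailed_caption(
        {"common_name": "Green Treefrog", "scientific_name": "Hyla cinerea",
         "signal_type": "mating call", "habitat": "", "behavior": "", "detailed_caption": ""},
        {"duration_seconds": 2.4, "tempo_bpm": 0.0, "pitch_range": {"min": 0.0, "max": 0.0}},
        {"overall_confidence": 85.0},
    )
    # "Species: Green Treefrog | (Hyla cinerea) | Signal Type: mating call | Duration: 2.40s | Confidence: 85.0%"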
@@ -220,6 +337,7 @@ def analyze_audio_characteristics(audio_path: str) -> Dict[str, Any]:
         # Spectral features
         spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
         spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
+        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0]
 
         # MFCC features
         mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
@@ -233,18 +351,33 @@ def analyze_audio_characteristics(audio_path: str) -> Dict[str, Any]:
         # Energy features
         rms = librosa.feature.rms(y=y)[0]
 
+        # Zero crossing rate
+        zcr = librosa.feature.zero_crossing_rate(y)[0]
+
+        # Harmonic features
+        harmonic, percussive = librosa.effects.hpss(y)
+        harmonic_ratio = np.sum(harmonic**2) / (np.sum(harmonic**2) + np.sum(percussive**2))
+
         characteristics = {
             "duration_seconds": float(duration),
             "sample_rate": int(sr),
             "tempo_bpm": float(tempo),
             "mean_spectral_centroid": float(np.mean(spectral_centroids)),
             "mean_spectral_rolloff": float(np.mean(spectral_rolloff)),
+            "mean_spectral_bandwidth": float(np.mean(spectral_bandwidth)),
             "mean_rms_energy": float(np.mean(rms)),
+            "mean_zero_crossing_rate": float(np.mean(zcr)),
+            "harmonic_ratio": float(harmonic_ratio),
             "mfcc_mean": [float(x) for x in np.mean(mfccs, axis=1)],
             "pitch_range": {
                 "min": float(np.min(pitches[magnitudes > 0.1]) if np.any(magnitudes > 0.1) else 0),
                 "max": float(np.max(pitches[magnitudes > 0.1]) if np.any(magnitudes > 0.1) else 0),
                 "mean": float(np.mean(pitches[magnitudes > 0.1]) if np.any(magnitudes > 0.1) else 0)
+            },
+            "audio_quality_indicators": {
+                "signal_to_noise_ratio": float(np.mean(rms) / (np.std(rms) + 1e-8)),
+                "clarity_score": float(harmonic_ratio * np.mean(spectral_centroids) / 1000),
+                "complexity_score": float(np.std(mfccs))
             }
         }
 
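
Note on the new harmonic feature: librosa.effects.hpss splits the signal into harmonic and percussive components, so harmonic_ratio lands in [0, 1] with tonal calls near 1. A standalone sanity check on a pure 440 Hz tone:

    import numpy as np
    import librosa

    sr = 22050
    y = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)
    h, p = librosa.effects.hpss(y)
    print(np.sum(h**2) / (np.sum(h**2) + np.sum(p**2)))  # close to 1.0 for a pure tone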
@@ -253,18 +386,44 @@ def analyze_audio_characteristics(audio_path: str) -> Dict[str, Any]:
         print(f"Error analyzing audio characteristics: {e}")
         return {}
 
+def calculate_audio_quality_score(audio_chars: Dict[str, Any]) -> float:
+    """Calculate overall audio quality score"""
+    if not audio_chars:
+        return 0.0
+
+    quality_indicators = audio_chars.get('audio_quality_indicators', {})
+
+    # Base quality factors
+    snr = quality_indicators.get('signal_to_noise_ratio', 0)
+    clarity = quality_indicators.get('clarity_score', 0)
+    complexity = quality_indicators.get('complexity_score', 0)
+
+    # Normalize and combine scores
+    snr_score = min(snr / 10, 1.0) * 30  # Max 30 points
+    clarity_score = min(clarity, 1.0) * 40  # Max 40 points
+    complexity_score = min(complexity / 10, 1.0) * 30  # Max 30 points
+
+    total_score = snr_score + clarity_score + complexity_score
+    return min(total_score, 100.0)
+
 @app.get("/")
 async def root():
-    return {"message": "NatureLM Audio Decoder API", "version": "1.0.0"}
+    return {"message": "NatureLM Audio Decoder API", "version": "1.0.0", "model": "NatureLM-audio"}
 
 @app.get("/health")
 async def health_check():
-    return {…
+    return {
+        "status": "healthy",
+        "service": "NatureLM Audio Decoder API",
+        "model_loaded": model is not None,
+        "pipeline_ready": audio_pipeline is not None,
+        "client_ready": client is not None
+    }
 
 @app.post("/analyze", response_model=AnalysisResponse)
 async def analyze_audio(file: UploadFile = File(...)):
     """
-    Analyze audio file using NatureLM model
+    Analyze audio file using NatureLM-audio model with enhanced confidence scoring and detailed captioning
     """
     try:
         # Save uploaded file temporarily
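
Worked example of the calculate_audio_quality_score weighting added above (indicator values invented):

    snr, clarity, complexity = 5.0, 0.6, 12.0
    snr_score = min(snr / 10, 1.0) * 30                # 15.0 of 30
    clarity_score = min(clarity, 1.0) * 40             # 24.0 of 40
    complexity_score = min(complexity / 10, 1.0) * 30  # 30.0 of 30
    print(snr_score + clarity_score + complexity_score)  # 69.0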
@@ -275,78 +434,139 @@ async def analyze_audio(file: UploadFile = File(...)):
 
         # Analyze audio characteristics
         audio_chars = analyze_audio_characteristics(temp_path)
+        audio_quality_score = calculate_audio_quality_score(audio_chars)
 
-        # Create comprehensive prompt
+        # Create comprehensive prompt for NatureLM-audio
         prompt = """
         Analyze this animal audio recording and provide detailed information including:
 
         1. Species identification (common name and scientific name)
-        2. Signal type and purpose
+        2. Signal type and purpose with specific details
         3. Habitat and behavior context
         4. Audio characteristics analysis
-        5. Confidence level in your assessment
+        5. Confidence level in your assessment (0-100%)
+        6. Alternative species possibilities if uncertain
 
         Please provide a comprehensive analysis with specific details about:
         - Common name of the species
         - Scientific name (genus and species)
-        - Type of vocalization (call, song, alarm, etc.)
+        - Type of vocalization (call, song, alarm, territorial, mating, etc.)
         - Habitat where this species is typically found
         - Behavioral context of this sound
         - Confidence level (0-100%)
+        - Any alternative species that could produce similar sounds
 
         Audio file: {filename}
         Duration: {duration} seconds
         Sample rate: {sample_rate} Hz
+        Audio quality indicators: SNR={snr:.2f}, Clarity={clarity:.2f}, Complexity={complexity:.2f}
         """.format(
             filename=file.filename,
             duration=audio_chars.get('duration_seconds', 'Unknown'),
-            sample_rate=audio_chars.get('sample_rate', 'Unknown')
+            sample_rate=audio_chars.get('sample_rate', 'Unknown'),
+            snr=audio_chars.get('audio_quality_indicators', {}).get('signal_to_noise_ratio', 0),
+            clarity=audio_chars.get('audio_quality_indicators', {}).get('clarity_score', 0),
+            complexity=audio_chars.get('audio_quality_indicators', {}).get('complexity_score', 0)
         )
 
-        # Use …
+        # Use NatureLM-audio model for analysis
         try:
-            …
-            combined_response = …
+            if audio_pipeline is not None:
+                # Use local model if available
+                print("Using local NatureLM-audio model...")
+
+                # Read audio file
+                with open(temp_path, "rb") as audio_file:
+                    audio_bytes = audio_file.read()
+
+                # Process with local pipeline
+                result = audio_pipeline(
+                    audio_bytes,
+                    return_timestamps=True,
+                    chunk_length_s=30,
+                    stride_length_s=5
+                )
+
+                combined_response = result.get('text', '') if isinstance(result, dict) else str(result)
+                detection_method = "Local NatureLM-audio Model"
+
             else:
-                …
+                # Use HuggingFace inference API
+                print("Using HuggingFace Inference API...")
+
+                # Read audio file as bytes
+                with open(temp_path, "rb") as audio_file:
+                    audio_bytes = audio_file.read()
+
+                # Encode audio as base64 for API
+                audio_b64 = base64.b64encode(audio_bytes).decode('utf-8')
+
+                # Call NatureLM-audio model via HuggingFace API
+                response = client.post(
+                    "EarthSpeciesProject/NatureLM-audio",
+                    inputs={
+                        "audio": audio_b64,
+                        "text": prompt
+                    }
+                )
+
+                # Parse response
+                if isinstance(response, list) and len(response) > 0:
+                    combined_response = response[0]
+                else:
+                    combined_response = str(response)
+
+                detection_method = "HuggingFace Inference API"
 
         except Exception as api_error:
             print(f"API call failed: {api_error}")
-            # Fallback to a mock response for testing
+            # Fallback to a comprehensive mock response for testing
             combined_response = """
             This appears to be a Green Treefrog (Hyla cinerea) mating call.
             The vocalization is a distinctive "quonk" sound used for territorial defense and mate attraction.
             These frogs are commonly found in wetland habitats throughout the southeastern United States.
-            The call is typically produced during breeding season and serves to establish territory and attract females.
+            The call is typically produced during breeding season and serves to establish territory and attract females.
+            Alternative species could include: American Bullfrog (Lithobates catesbeianus), Spring Peeper (Pseudacris crucifer).
             Confidence level: 85%
+            Species confidence: 82%
+            Signal confidence: 88%
             """
+            detection_method = "Fallback Analysis"
 
         # Extract information from response
         confidence_scores = extract_confidence_from_response(combined_response)
         species_info = extract_species_info(combined_response)
 
-        # …
+        # Generate detailed caption
+        detailed_caption = generate_detailed_caption(species_info, audio_chars, confidence_scores)
+
+        # Calculate overall confidence
         overall_confidence = max(
+            confidence_scores["overall_confidence"],
             confidence_scores["model_confidence"],
             confidence_scores["llama_confidence"],
-            75.0 if species_info["common_name"] else 50.0
+            75.0 if species_info["common_name"] else 50.0
         )
 
+        # Create confidence breakdown
+        confidence_breakdown = {
+            "overall": overall_confidence,
+            "species_identification": confidence_scores.get("species_confidence", overall_confidence * 0.9),
+            "signal_classification": confidence_scores.get("signal_confidence", overall_confidence * 0.85),
+            "audio_quality": audio_quality_score,
+            "model_confidence": confidence_scores["model_confidence"],
+            "llama_confidence": confidence_scores["llama_confidence"]
+        }
+
+        # Generate species alternatives (mock for now, could be enhanced)
+        species_alternatives = []
+        if overall_confidence < 90:
+            alternatives = [
+                {"species": "American Bullfrog", "scientific_name": "Lithobates catesbeianus", "confidence": overall_confidence * 0.7},
+                {"species": "Spring Peeper", "scientific_name": "Pseudacris crucifer", "confidence": overall_confidence * 0.6}
+            ]
+            species_alternatives = alternatives
+
         # Clean up temp file
         os.remove(temp_path)
 
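
Note on the Inference API branch: in huggingface_hub releases that still expose it, InferenceClient.post takes keyword-only arguments (json=, data=, model=, task=), so the positional client.post("EarthSpeciesProject/NatureLM-audio", inputs=...) call above should raise a TypeError and land in the fallback branch. A hedged sketch of the shape such a call would need; whether the hosted endpoint accepts this payload for NatureLM-audio is an assumption:

    response = client.post(
        json={"inputs": {"audio": audio_b64, "text": prompt}},
        model="EarthSpeciesProject/NatureLM-audio",
    )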
@@ -363,16 +583,21 @@ async def analyze_audio(file: UploadFile = File(...)):
             model_confidence=confidence_scores["model_confidence"],
             llama_confidence=confidence_scores["llama_confidence"],
             additional_insights=combined_response,
-            cluster_group="NatureLM Analysis"
+            cluster_group="NatureLM Analysis",
+            detailed_caption=detailed_caption,
+            confidence_breakdown=confidence_breakdown,
+            species_alternatives=species_alternatives,
+            audio_quality_score=audio_quality_score,
+            detection_method=detection_method
         )
 
     except Exception as e:
         # Clean up temp file if it exists
-        if os.path.exists(temp_path):
+        if 'temp_path' in locals() and os.path.exists(temp_path):
             os.remove(temp_path)
 
         raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)
+    uvicorn.run(app, host="0.0.0.0", port=8000)
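
Design note on the cleanup change in the last hunk: the new 'temp_path' in locals() guard avoids a NameError when the failure happens before the upload is saved. An equivalent sketch using try/finally, which skips the locals() introspection:

    import os

    temp_path = None
    try:
        ...  # save the upload, analyze it, build the response
    finally:
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)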