Spaces:

Starberry15
/

data_analysis

Sleeping

App Files Files Community

Starberry15 committed on Oct 22

Commit

16e585d

verified ·

1 Parent(s): 20dc7a6

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +119 -68

src/streamlit_app.py CHANGED Viewed

@@ -7,14 +7,13 @@ import plotly.figure_factory as ff
 from dotenv import load_dotenv
 from huggingface_hub import InferenceClient, login
 from io import StringIO
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 # ======================================================
 # ⚙️ APP CONFIGURATION
 # ======================================================
 st.set_page_config(page_title="📊 Smart Data Analyst Pro", layout="wide")
 st.title("📊 Smart Data Analyst Pro")
-st.caption("AI that cleans, analyzes, and visualizes your data — powered by Hugging Face Inference API and local open-source models.")
 # ======================================================
 # 🔐 Load Environment Variables
@@ -27,7 +26,7 @@ else:
     login(token=HF_TOKEN)
 # ======================================================
-# 🧠 MODEL SETTINGS
 # ======================================================
 with st.sidebar:
     st.header("⚙️ Model Settings")
@@ -37,18 +36,17 @@ with st.sidebar:
         [
             "Qwen/Qwen2.5-Coder-7B-Instruct",
             "meta-llama/Meta-Llama-3-8B-Instruct",
-            "microsoft/Phi-3-mini-4k-instruct"
         ],
         index=0
     )
     ANALYST_MODEL = st.selectbox(
-        "Select Analysis Model (Local Open-Source Recommended):",
-        [
-            "meta-llama/Meta-Llama-3-8B-Instruct",
-            "Qwen/Qwen2.5-Coder-7B-Instruct",
-            "HuggingFaceH4/zephyr-7b-beta",
-            "mistralai/Mistral-7B-Instruct-v0.3"
         ],
         index=0
     )
@@ -56,91 +54,122 @@ with st.sidebar:
     temperature = st.slider("Temperature", 0.0, 1.0, 0.3)
     max_tokens = st.slider("Max Tokens", 128, 2048, 512)
-# Initialize cleaner client (HF API)
 cleaner_client = InferenceClient(model=CLEANER_MODEL, token=HF_TOKEN)
-# Initialize local analyst if open-source
-local_analyst = None
-if ANALYST_MODEL in ["meta-llama/Meta-Llama-3-8B-Instruct"]:
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(ANALYST_MODEL)
-        model = AutoModelForCausalLM.from_pretrained(ANALYST_MODEL)
-        local_analyst = pipeline("text-generation", model=model, tokenizer=tokenizer)
-    except Exception as e:
-        st.warning(f"⚠️ Failed to load local analyst: {e}")
 # ======================================================
-# 🧩 DATA CLEANING FUNCTIONS
 # ======================================================
 def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
     df = df.copy()
     df.dropna(axis=1, how="all", inplace=True)
     df.columns = [c.strip().replace(" ", "_").lower() for c in df.columns]
     for col in df.columns:
         if df[col].dtype == "O":
-            df[col].fillna(df[col].mode()[0] if not df[col].mode().empty else "Unknown", inplace=True)
         else:
             df[col].fillna(df[col].median(), inplace=True)
     df.drop_duplicates(inplace=True)
     return df
 def ai_clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
     raw_preview = df.head(5).to_csv(index=False)
     prompt = f"""
-You are a Python data cleaning expert.
-Clean and standardize the dataset dynamically:
-- Handle missing values logically
-- Correct and normalize column names
-- Detect and fix datatype inconsistencies
-- Remove duplicates or invalid rows
-Return ONLY valid CSV text (no Markdown).
 --- RAW SAMPLE ---
 {raw_preview}
 """
     try:
-        response = cleaner_client.text_generation(prompt, max_new_tokens=1024, temperature=0.1, return_full_text=False)
         cleaned_str = response.strip()
     except Exception as e:
-        st.warning(f"⚠️ AI cleaning failed: {e}")
-        return fallback_clean(df)
-    cleaned_str = cleaned_str.replace("```csv","").replace("```","").replace("###","").replace(";",",").strip()
-    lines = [l for l in cleaned_str.splitlines() if "," in l]
     cleaned_str = "\n".join(lines)
     try:
         cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
-        cleaned_df.dropna(axis=1, how="all", inplace=True)
         cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
         return cleaned_df
     except Exception as e:
-        st.warning(f"⚠️ CSV parse failed: {e}")
         return fallback_clean(df)
 def summarize_dataframe(df: pd.DataFrame) -> str:
     lines = [f"Rows: {len(df)} | Columns: {len(df.columns)}", "Column summaries:"]
     for col in df.columns[:10]:
         non_null = int(df[col].notnull().sum())
         if pd.api.types.is_numeric_dtype(df[col]):
-            mean = df[col].mean()
-            median = df[col].median() if non_null > 0 else None
             lines.append(f"- {col}: mean={mean:.3f}, median={median}, non_null={non_null}")
         else:
             top = df[col].value_counts().head(3).to_dict()
             lines.append(f"- {col}: top_values={top}, non_null={non_null}")
     return "\n".join(lines)
-# ======================================================
-# 🧠 ANALYSIS FUNCTION
-# ======================================================
 def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
     df_summary = summarize_dataframe(df)
     sample = df.head(6).to_csv(index=False)
     prompt = f"""
-You are a data analyst.
-Analyze '{dataset_name}' and answer the question below.
-Base your insights only on the provided data.
 --- SUMMARY ---
 {df_summary}
@@ -148,32 +177,41 @@ Base your insights only on the provided data.
 --- SAMPLE DATA ---
 {sample}
---- QUESTION ---
 {user_query}
-Respond concisely with key insights, numbers, patterns, and recommended steps.
 """
-    if local_analyst:
-        try:
-            response = local_analyst(prompt, max_new_tokens=max_tokens, temperature=temperature)
-            return response[0]['generated_text']
-        except Exception as e:
-            return f"⚠️ Local analysis failed: {e}"
-    else:
-        st.warning("⚠️ Analyst model is not local. Using HF API may require payment.")
-        return "Analysis not available for free model."
 # ======================================================
-# 🚀 MAIN APP
 # ======================================================
 uploaded = st.file_uploader("📎 Upload CSV or Excel file", type=["csv", "xlsx"])
 if uploaded:
-    try:
-        df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
-    except Exception as e:
-        st.error(f"❌ File load failed: {e}")
-        st.stop()
     with st.spinner("🧼 AI Cleaning your dataset..."):
         cleaned_df = ai_clean_dataset(df)
@@ -181,33 +219,46 @@ if uploaded:
     st.subheader("✅ Cleaned Dataset Preview")
     st.dataframe(cleaned_df.head(), use_container_width=True)
-    with st.expander("📋 Cleaning Summary"):
         st.text(summarize_dataframe(cleaned_df))
     with st.expander("📈 Quick Visualizations", expanded=True):
         numeric_cols = cleaned_df.select_dtypes(include="number").columns.tolist()
         categorical_cols = cleaned_df.select_dtypes(exclude="number").columns.tolist()
-        viz_type = st.selectbox("Visualization Type", ["Scatter Plot", "Histogram", "Box Plot", "Correlation Heatmap", "Categorical Count"])
         if viz_type == "Scatter Plot" and len(numeric_cols) >= 2:
             x = st.selectbox("X-axis", numeric_cols)
-            y = st.selectbox("Y-axis", numeric_cols, index=min(1,len(numeric_cols)-1))
             color = st.selectbox("Color", ["None"] + categorical_cols)
             fig = px.scatter(cleaned_df, x=x, y=y, color=None if color=="None" else color)
             st.plotly_chart(fig, use_container_width=True)
         elif viz_type == "Histogram" and numeric_cols:
             col = st.selectbox("Column", numeric_cols)
             fig = px.histogram(cleaned_df, x=col, nbins=30)
             st.plotly_chart(fig, use_container_width=True)
         elif viz_type == "Box Plot" and numeric_cols:
             col = st.selectbox("Column", numeric_cols)
             fig = px.box(cleaned_df, y=col)
             st.plotly_chart(fig, use_container_width=True)
         elif viz_type == "Correlation Heatmap" and len(numeric_cols) > 1:
             corr = cleaned_df[numeric_cols].corr()
-            fig = ff.create_annotated_heatmap(z=corr.values, x=list(corr.columns), y=list(corr.index),
-                                              annotation_text=corr.round(2).values, showscale=True)
             st.plotly_chart(fig, use_container_width=True)
         elif viz_type == "Categorical Count" and categorical_cols:
             cat = st.selectbox("Category", categorical_cols)
             fig = px.bar(cleaned_df[cat].value_counts().reset_index(), x="index", y=cat)
@@ -216,7 +267,7 @@ if uploaded:
             st.warning("⚠️ Not enough columns for this visualization type.")
     st.subheader("💬 Ask AI About Your Data")
-    user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales?")
     if st.button("Analyze with AI", use_container_width=True) and user_query:
         with st.spinner("🤖 Interpreting data..."):
             result = query_analysis_model(cleaned_df, user_query, uploaded.name)

 from dotenv import load_dotenv
 from huggingface_hub import InferenceClient, login
 from io import StringIO
 # ======================================================
 # ⚙️ APP CONFIGURATION
 # ======================================================
 st.set_page_config(page_title="📊 Smart Data Analyst Pro", layout="wide")
 st.title("📊 Smart Data Analyst Pro")
+st.caption("AI that cleans, analyzes, and visualizes your data — powered by Hugging Face Inference API.")
 # ======================================================
 # 🔐 Load Environment Variables
     login(token=HF_TOKEN)
 # ======================================================
+# 🧠 MODEL SETUP
 # ======================================================
 with st.sidebar:
     st.header("⚙️ Model Settings")
         [
             "Qwen/Qwen2.5-Coder-7B-Instruct",
             "meta-llama/Meta-Llama-3-8B-Instruct",
+            "microsoft/Phi-3-mini-4k-instruct",
+            "mistralai/Mistral-7B-Instruct-v0.3"
         ],
         index=0
     )
     ANALYST_MODEL = st.selectbox(
+        "Select Analysis Model:",
+        [   "Qwen/Qwen2.5-14B-Instruct",
+            "mistralai/Mistral-7B-Instruct-v0.3",
+            "HuggingFaceH4/zephyr-7b-beta"
         ],
         index=0
     )
     temperature = st.slider("Temperature", 0.0, 1.0, 0.3)
     max_tokens = st.slider("Max Tokens", 128, 2048, 512)
+# Initialize inference clients
 cleaner_client = InferenceClient(model=CLEANER_MODEL, token=HF_TOKEN)
+analyst_client = InferenceClient(model=ANALYST_MODEL, token=HF_TOKEN)
 # ======================================================
+# 🧩 SMART DATA CLEANING
 # ======================================================
 def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
     """Backup rule-based cleaner used when AI cleaning is unavailable.

     Works on a copy of ``df`` (the caller's frame is not modified) and:
       1. drops columns that are entirely null,
       2. normalizes column labels (stringified, stripped, spaces -> "_", lowercased),
       3. fills missing values: most frequent value for object columns,
          median for everything else that supports it,
       4. drops duplicate rows.

     Args:
         df: Raw dataframe as loaded from the uploaded file.

     Returns:
         A new, cleaned dataframe.
     """
     df = df.copy()
     df.dropna(axis=1, how="all", inplace=True)
     # Labels are not guaranteed to be strings (e.g. integer headers), so
     # stringify before calling str methods on them.
     df.columns = [str(c).strip().replace(" ", "_").lower() for c in df.columns]
     for col in df.columns:
         series = df[col]
         if not series.isna().any():
             continue  # nothing to fill in this column
         use_mode = series.dtype == "O"
         if not use_mode:
             try:
                 fill_value = series.median()
             except TypeError:
                 # Dtypes such as category/string have no median; fall back
                 # to the most frequent value instead of crashing.
                 use_mode = True
         if use_mode:
             modes = series.mode()
             fill_value = modes.iloc[0] if not modes.empty else "Unknown"
         # Assign the filled column back explicitly: calling
         # fillna(inplace=True) on the df[col] selection is chained
         # assignment and does not update df under pandas Copy-on-Write.
         df[col] = series.fillna(fill_value)
     df.drop_duplicates(inplace=True)
     return df
 def ai_clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
     """Clean the dataset with the selected AI model, falling back to rules on failure.

     The first five rows of ``df`` are serialized to CSV and sent to
     ``cleaner_client`` (text-generation first, chat-completion as a retry when
     the error message indicates the model only supports conversational use).
     The reply is stripped of markdown fences, reduced to CSV-looking lines and
     parsed with pandas. Any failure along the way (request error, parse error,
     or a reply with no usable rows) returns ``fallback_clean(df)`` instead.

     NOTE(review): only the five-row preview is sent, and the returned frame is
     built solely from the model's reply to that preview, not from the full
     upload. Confirm whether dropping the remaining rows is intended.

     Args:
         df: Raw dataframe as loaded from the uploaded file.

     Returns:
         The AI-cleaned dataframe, or the rule-based result on any failure.
     """
     raw_preview = df.head(5).to_csv(index=False)
     prompt = f"""
 You are a professional data cleaning assistant.
 Clean and standardize the dataset below dynamically:
 1. Handle missing values
 2. Fix column name inconsistencies
 3. Convert data types (dates, numbers, categories)
 4. Remove irrelevant or duplicate rows
 Return ONLY a valid CSV text (no markdown, no explanations).
 --- RAW SAMPLE ---
 {raw_preview}
 """
     try:
         # Try text-generation task first
         response = cleaner_client.text_generation(
             prompt,
             max_new_tokens=1024,
             temperature=0.1,
             return_full_text=False,
         )
         cleaned_str = response.strip()
     except Exception as e:
         message = str(e)
         if "Supported task: conversational" not in message and "not supported" not in message:
             st.warning(f"⚠️ AI cleaning failed ({e})")
             return fallback_clean(df)
         # The model only exposes the conversational task: retry as chat.
         try:
             chat_resp = cleaner_client.chat_completion(
                 messages=[{"role": "user", "content": prompt}],
                 max_tokens=1024,
                 temperature=0.1,
             )
             cleaned_str = chat_resp["choices"][0]["message"]["content"].strip()
         except Exception as e2:
             st.warning(f"⚠️ AI cleaning failed (chat mode): {e2}")
             return fallback_clean(df)
     # Remove possible markdown/code fences
     cleaned_str = (
         cleaned_str.replace("```csv", "")
         .replace("```", "")
         .replace("###", "")
         .strip()
     )
     # Treat ';' as the delimiter only when the reply has no commas at all;
     # rewriting every ';' would corrupt values in a comma-delimited reply.
     if "," not in cleaned_str:
         cleaned_str = cleaned_str.replace(";", ",")
     # Keep only valid CSV-like lines (ignore leading whitespace when
     # spotting "Note ..." / "Summary ..." commentary from the model).
     lines = [
         line
         for line in cleaned_str.splitlines()
         if "," in line and not line.lstrip().lower().startswith(("note", "summary"))
     ]
     cleaned_str = "\n".join(lines)
     # Try parsing robustly
     try:
         cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
         cleaned_df = cleaned_df.dropna(axis=1, how="all")
         cleaned_df.columns = [str(c).strip().replace(" ", "_").lower() for c in cleaned_df.columns]
     except Exception as e:
         st.warning(f"⚠️ AI CSV parse failed: {e}")
         return fallback_clean(df)
     # A header-only reply parses without error but carries no data; do not
     # hand an empty frame to the rest of the app.
     if cleaned_df.empty:
         st.warning("⚠️ AI cleaning returned no usable rows; using rule-based cleaning instead.")
         return fallback_clean(df)
     return cleaned_df
 def summarize_dataframe(df: pd.DataFrame) -> str:
     """Generate a concise text summary of the dataframe.

     The first line reports the row and column counts. Then, for at most the
     first ten columns, one line per column is emitted: mean, median and
     non-null count for numeric columns, or the three most frequent values
     and non-null count for everything else.

     Args:
         df: Dataframe to describe.

     Returns:
         The summary as a newline-joined string.
     """
     lines = [f"Rows: {len(df)} | Columns: {len(df.columns)}", "Column summaries:"]
     for col in df.columns[:10]:
         series = df[col]
         non_null = int(series.notnull().sum())
         if pd.api.types.is_numeric_dtype(series):
             if non_null > 0:
                 # Compute the mean directly: describe() has no "mean" entry
                 # for boolean columns and also computes unused statistics.
                 mean = float(series.mean())
                 median = float(series.median())
             else:
                 # All-null column: avoid float(<NA>) on nullable dtypes.
                 mean = float("nan")
                 median = None
             lines.append(f"- {col}: mean={mean:.3f}, median={median}, non_null={non_null}")
         else:
             top = series.value_counts().head(3).to_dict()
             lines.append(f"- {col}: top_values={top}, non_null={non_null}")
     return "\n".join(lines)
 def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
+    """Send the dataframe and user query to the analysis model for interpretation."""
     df_summary = summarize_dataframe(df)
     sample = df.head(6).to_csv(index=False)
     prompt = f"""
+You are a professional data analyst.
+Analyze the dataset '{dataset_name}' and answer the user's question.
 --- SUMMARY ---
 {df_summary}
 --- SAMPLE DATA ---
 {sample}
+--- USER QUESTION ---
 {user_query}
+Respond with:
+1. Key insights and patterns
+2. Quantitative findings
+3. Notable relationships or anomalies
+4. Data-driven recommendations
 """
+    try:
+        response = analyst_client.text_generation(
+            prompt, temperature=temperature, max_new_tokens=max_tokens, return_full_text=False
+        )
+        return response.strip()
+    except Exception as e:
+        if "Supported task: conversational" in str(e) or "not supported" in str(e):
+            try:
+                chat_resp = analyst_client.chat_completion(
+                    messages=[{"role": "user", "content": prompt}],
+                    max_tokens=max_tokens,
+                    temperature=temperature,
+                )
+                return chat_resp["choices"][0]["message"]["content"].strip()
+            except Exception as e2:
+                return f"⚠️ Analysis failed (chat mode): {e2}"
+        return f"⚠️ Analysis failed: {e}"
 # ======================================================
+# 🚀 MAIN APP LOGIC
 # ======================================================
 uploaded = st.file_uploader("📎 Upload CSV or Excel file", type=["csv", "xlsx"])
 if uploaded:
+    df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
     with st.spinner("🧼 AI Cleaning your dataset..."):
         cleaned_df = ai_clean_dataset(df)
     st.subheader("✅ Cleaned Dataset Preview")
     st.dataframe(cleaned_df.head(), use_container_width=True)
+    with st.expander("📋 Cleaning Summary", expanded=False):
         st.text(summarize_dataframe(cleaned_df))
     with st.expander("📈 Quick Visualizations", expanded=True):
         numeric_cols = cleaned_df.select_dtypes(include="number").columns.tolist()
         categorical_cols = cleaned_df.select_dtypes(exclude="number").columns.tolist()
+        viz_type = st.selectbox(
+            "Visualization Type",
+            ["Scatter Plot", "Histogram", "Box Plot", "Correlation Heatmap", "Categorical Count"]
+        )
         if viz_type == "Scatter Plot" and len(numeric_cols) >= 2:
             x = st.selectbox("X-axis", numeric_cols)
+            y = st.selectbox("Y-axis", numeric_cols, index=min(1, len(numeric_cols)-1))
             color = st.selectbox("Color", ["None"] + categorical_cols)
             fig = px.scatter(cleaned_df, x=x, y=y, color=None if color=="None" else color)
             st.plotly_chart(fig, use_container_width=True)
         elif viz_type == "Histogram" and numeric_cols:
             col = st.selectbox("Column", numeric_cols)
             fig = px.histogram(cleaned_df, x=col, nbins=30)
             st.plotly_chart(fig, use_container_width=True)
         elif viz_type == "Box Plot" and numeric_cols:
             col = st.selectbox("Column", numeric_cols)
             fig = px.box(cleaned_df, y=col)
             st.plotly_chart(fig, use_container_width=True)
         elif viz_type == "Correlation Heatmap" and len(numeric_cols) > 1:
             corr = cleaned_df[numeric_cols].corr()
+            fig = ff.create_annotated_heatmap(
+                z=corr.values,
+                x=list(corr.columns),
+                y=list(corr.index),
+                annotation_text=corr.round(2).values,
+                showscale=True
+            )
             st.plotly_chart(fig, use_container_width=True)
         elif viz_type == "Categorical Count" and categorical_cols:
             cat = st.selectbox("Category", categorical_cols)
             fig = px.bar(cleaned_df[cat].value_counts().reset_index(), x="index", y=cat)
             st.warning("⚠️ Not enough columns for this visualization type.")
     st.subheader("💬 Ask AI About Your Data")
+    user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales the most?")
     if st.button("Analyze with AI", use_container_width=True) and user_query:
         with st.spinner("🤖 Interpreting data..."):
             result = query_analysis_model(cleaned_df, user_query, uploaded.name)