Spaces:
Sleeping
Sleeping
File size: 4,250 Bytes
d4d1ca8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
import pandas as pd
from pathlib import Path
def _build_cause_profiles(known, cat_features, num_features):
    """Summarise every CauseType group once so scoring never rescans it per row.

    Returns a dict keyed by cause (in ``groupby`` order). Each value holds:
    ``value_fracs`` -- per categorical feature, the fraction of the group's
    rows carrying each value; ``medians`` -- per numeric feature, the group
    median; ``sub_mode`` -- the most frequent non-null SubCauseType or None.
    """
    profiles = {}
    for cause, group in known.groupby('CauseType'):
        size = max(1, len(group))
        value_fracs = {
            feat: {
                val: count / size
                for val, count in group[feat].value_counts().items()
            }
            for feat in cat_features
        }
        medians = {feat: group[feat].median() for feat in num_features}

        sub_values = group['SubCauseType'].dropna()
        try:
            sub_mode = sub_values.mode().iloc[0] if not sub_values.empty else None
        except (TypeError, ValueError, IndexError):
            # Best effort: a sub-cause suggestion is optional.
            sub_mode = None

        profiles[cause] = {
            'value_fracs': value_fracs,
            'medians': medians,
            'sub_mode': sub_mode,
        }
    return profiles


def _score_row(row, profile, cat_features, num_features):
    """Return the combined similarity (0.7 categorical + 0.3 numeric) of one row to one cause profile."""
    cat_total = 0.0
    cat_used = 0
    for feat in cat_features:
        val = row.get(feat)
        if pd.isna(val):
            continue
        # Fraction of the cause group's rows sharing this exact value.
        cat_total += profile['value_fracs'][feat].get(val, 0.0)
        cat_used += 1
    cat_score = cat_total / cat_used if cat_used > 0 else 0.0

    num_total = 0.0
    num_used = 0
    for feat in num_features:
        row_val = row.get(feat)
        if pd.isna(row_val):
            continue
        median = profile['medians'][feat]
        if pd.isna(median):
            continue
        # Relative distance to the group median; a zero median falls back to scale 1.
        scale = abs(median) if abs(median) > 0 else 1.0
        num_total += 1.0 / (1.0 + abs(row_val - median) / scale)
        num_used += 1
    num_score = num_total / num_used if num_used > 0 else 0.0

    return 0.7 * cat_score + 0.3 * num_score


def suggest_labels(df: pd.DataFrame, top_k: int = 1) -> pd.DataFrame:
    """
    Suggest a CauseType for events whose cause is flagged as unknown or missing.

    A row is "unknown" when its KnowUnknowCause value, after stripping
    whitespace, equals 'ไม่พบสาเหตุ' ("cause not found"), or when the value is
    missing. Every other row is "known" and is grouped by CauseType. Each
    unknown row is scored against each group: the categorical score is the
    mean, over the row's non-null categorical features, of the fraction of
    group rows sharing the same value; the numeric score is the mean closeness
    of the row's numeric features to the group medians. The combined score is
    0.7 * categorical + 0.3 * numeric.

    Parameters:
        df: Event dataframe. Any expected column that is absent is added as
            all-None on an internal copy; the caller's frame is not modified.
        top_k: Number of best-scoring causes listed in the 'Scores' text.
            Values below 1 yield no suggestion.

    Returns:
        A DataFrame with one row per unknown event (rows flagged
        'ไม่พบสาเหตุ' first, then rows with a missing flag) and columns
        Number, EventNumber, OutageDateTime, KnowUnknowCause, Orig_CauseType,
        Orig_SubCauseType, SuggestedCause, SuggestedSubCauseType, Scores.
        An empty DataFrame is returned when there are no known rows or no
        unknown rows.
    """
    unknown_flag = 'ไม่พบสาเหตุ'
    cat_features = ['OpDeviceType', 'FaultDeviceType', 'FaultDeviceCondition',
                    'Weather', 'OpDeviceSysType', 'OpDevicePhase']
    num_features = ['Capacity(kVA)', 'Load(MW)']
    needed = ['KnowUnknowCause', 'CauseType', 'SubCauseType'] + cat_features + num_features

    df = df.copy()
    # Ensure every column the heuristic reads exists.
    for col in needed:
        if col not in df.columns:
            df[col] = None

    # Coerce numerics once on our private copy, before splitting, so no
    # assignment is ever made into a filtered slice.
    for col in num_features:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Classify by value instead of the .str accessor: .str raises on
    # non-string columns (e.g. an all-NaN float column read from an empty
    # CSV field), which is precisely the "missing" case we must support.
    flag_col = df['KnowUnknowCause']
    is_missing = flag_col.isna()
    is_flagged_unknown = pd.Series(
        [isinstance(v, str) and v.strip() == unknown_flag for v in flag_col],
        index=flag_col.index,
        dtype=bool,
    )

    known = df[~is_missing & ~is_flagged_unknown]
    unknown = pd.concat([df[is_flagged_unknown], df[is_missing]], ignore_index=False)
    if known.empty or unknown.empty:
        return pd.DataFrame([])

    profiles = _build_cause_profiles(known, cat_features, num_features)

    # A negative slice bound would mean "all but the last k"; clamp instead.
    limit = max(0, int(top_k))

    suggestions = []
    for _, row in unknown.iterrows():
        scores = {
            cause: _score_row(row, profile, cat_features, num_features)
            for cause, profile in profiles.items()
        }
        # Stable sort: ties keep the groupby order of the causes.
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        top = ranked[:limit]
        suggested_cause = top[0][0] if top else None
        suggested_subcause = (
            profiles[suggested_cause]['sub_mode'] if suggested_cause is not None else None
        )
        suggestions.append({
            'Number': row.get('Number'),
            'EventNumber': row.get('EventNumber'),
            'OutageDateTime': row.get('OutageDateTime'),
            'KnowUnknowCause': row.get('KnowUnknowCause'),
            'Orig_CauseType': row.get('CauseType'),
            'Orig_SubCauseType': row.get('SubCauseType'),
            'SuggestedCause': suggested_cause,
            'SuggestedSubCauseType': suggested_subcause,
            'Scores': '; '.join([f"{c}: {s:.3f}" for c, s in top]),
        })
    return pd.DataFrame(suggestions)
def suggest_labels_to_file(df: pd.DataFrame, out_path: str = None, top_k: int = 1):
    """
    Run suggest_labels on df and optionally write the result to a CSV file.

    When out_path is truthy, its parent directory is created if needed and the
    suggestions are written there without the index, encoded as 'utf-8-sig'.
    The suggestions DataFrame is returned in either case.
    """
    suggestions = suggest_labels(df, top_k=top_k)
    if not out_path:
        return suggestions

    target = Path(out_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    suggestions.to_csv(target, index=False, encoding='utf-8-sig')
    return suggestions
|