"""Compute reliability indices (SAIFI, SAIDI, CAIDI, MAIFI) from outage event CSV.

Usage (programmatic):
from scripts.compute_reliability import compute_reliability
summary = compute_reliability('data/data_1.csv', total_customers=500000)

Usage (CLI):
python scripts/compute_reliability.py --input data/data_1.csv --total-customers 500000

Assumptions & mapping (from the inspected CSV):
- Outage start: `OutageDateTime`
- Outage end: prefer `CloseEventDateTime`, else `LastRestoDateTime`, else `FirstRestoDateTime`, else `CreateEventDateTime`
- Customers affected: prefer the `AffectedCustomer` column; else sum `AffectedCustomer1..5`; else fall back to `AllStepCusXTime` or the sum of `AllStepCusXTime1..5`
- Planned outages: rows whose `EventType` contains the Thai word 'แผน' ("plan", e.g. 'แผนดับไฟ', "planned outage") are treated as planned and are excluded by default
- Dates are day-first, e.g. '10-01-2025 10:28:00'

Outputs are saved to `outputs/events_cleaned.csv`, `outputs/reliability_overall.csv`, and `outputs/reliability_by_group.csv`.
"""

from __future__ import annotations
import argparse
from typing import Optional, Dict
import pandas as pd
import numpy as np
from pathlib import Path

DATE_COLS = ['OutageDateTime', 'FirstRestoDateTime', 'LastRestoDateTime', 'CreateEventDateTime', 'CloseEventDateTime']


def parse_dates(df: pd.DataFrame) -> pd.DataFrame:
    for c in DATE_COLS:
        if c in df.columns:
            # many dates are in format dd-mm-YYYY HH:MM:SS
            df[c] = pd.to_datetime(df[c], dayfirst=True, errors='coerce')
    return df
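
# Day-first parsing example (pandas behavior, matching the format noted in the
# module docstring):
#   pd.to_datetime('10-01-2025 10:28:00', dayfirst=True)
#   -> Timestamp('2025-01-10 10:28:00')  # 10 January, not 1 October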


def coalesce_end_time(row: pd.Series) -> pd.Timestamp:
    """Return the first available end timestamp, or pd.NaT if none is present."""
    for c in ('CloseEventDateTime', 'LastRestoDateTime', 'FirstRestoDateTime', 'CreateEventDateTime'):
        if c in row and pd.notna(row[c]):
            return row[c]
    return pd.NaT


def estimate_customers(row: pd.Series) -> float:
    # Prefer AffectedCustomer if present and numeric
    def to_num(x):
        try:
            if pd.isna(x) or x == '':
                return np.nan
            return float(x)
        except Exception:
            return np.nan

    cols = row.index
    # Try AffectedCustomer
    if 'AffectedCustomer' in cols:
        v = to_num(row['AffectedCustomer'])
        if not np.isnan(v):
            return v
    # Sum AffectedCustomer1..5
    acs = []
    for i in range(1, 6):
        k = f'AffectedCustomer{i}'
        if k in cols:
            acs.append(to_num(row[k]))
    acs = [x for x in acs if not np.isnan(x)]
    if acs:
        return float(sum(acs))
    # Try AllStepCusXTime or AllStepCusXTime1..5
    if 'AllStepCusXTime' in cols:
        v = to_num(row['AllStepCusXTime'])
        if not np.isnan(v):
            return v
    asts = []
    for i in range(1, 6):
        k = f'AllStepCusXTime{i}'
        if k in cols:
            asts.append(to_num(row[k]))
    asts = [x for x in asts if not np.isnan(x)]
    if asts:
        return float(sum(asts))
    # No usable customer-count column remains; Capacity(kVA) and Load(MW) are
    # equipment/load figures, not customer counts, so report missing.
    return np.nan
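
# Column precedence, illustrated on hypothetical row values (not from the dataset):
#   AffectedCustomer=120                                 -> 120.0
#   AffectedCustomer=NaN, AffectedCustomer1=40, ...2=60  -> 100.0
#   only AllStepCusXTime=95 populated                    -> 95.0
#   none of the above populated                          -> NaN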


def flag_planned(event_type: Optional[str]) -> bool:
    """Return True when the event type marks a planned outage."""
    if pd.isna(event_type):
        return False
    # In this dataset planned outages are labelled with the Thai word 'แผน' ("plan");
    # anything else is treated as unplanned.
    return 'แผน' in str(event_type)
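
# For example ('แผนดับไฟ' appears in the dataset description above; 'Fault' is a
# hypothetical unplanned label):
#   flag_planned('แผนดับไฟ')  -> True
#   flag_planned('Fault')     -> False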


def compute_reliability(
    input_csv: str | Path,
    total_customers: Optional[float] = None,
    customers_map: Optional[Dict[str, float]] = None,
    exclude_planned: bool = True,
    momentary_threshold_min: float = 1.0,
    groupby_cols: list[str] | None = None,
    out_dir: str | Path | None = 'outputs',
) -> Dict[str, pd.DataFrame]:
    """Reads CSV and computes reliability indices.

    Returns a dict of DataFrames: overall, by_group.
    """
    input_csv = Path(input_csv)
    out_dir = Path(out_dir) if out_dir is not None else Path('outputs')
    out_dir.mkdir(parents=True, exist_ok=True)

    df = pd.read_csv(input_csv, dtype=str)
    # parse dates
    df = parse_dates(df)

    # Coalesce start/end times. The apply() result can be object-dtype even when
    # every value is a Timestamp/NaT, so re-wrap with to_datetime before subtracting.
    df['OutageStart'] = df.get('OutageDateTime')
    df['OutageEnd'] = df.apply(coalesce_end_time, axis=1)
    # duration in minutes
    df['DurationMin'] = (pd.to_datetime(df['OutageEnd']) - pd.to_datetime(df['OutageStart'])).dt.total_seconds() / 60.0

    # customers affected
    df['CustomersAffected'] = df.apply(estimate_customers, axis=1)

    # flag planned
    df['IsPlanned'] = df['EventType'].apply(flag_planned) if 'EventType' in df.columns else False

    if exclude_planned:
        df_work = df[~df['IsPlanned']].copy()
    else:
        df_work = df.copy()

    # Clean durations: treat missing values as 0 and clip negatives (bad timestamps) to 0
    df_work['DurationMin'] = df_work['DurationMin'].fillna(0)
    df_work.loc[df_work['DurationMin'] < 0, 'DurationMin'] = 0

    # ensure numeric customers
    df_work['CustomersAffected'] = pd.to_numeric(df_work['CustomersAffected'], errors='coerce').fillna(0)

    # Choose grouping
    if groupby_cols is None:
        groupby_cols = []

    # helper to compute indices given total customers
    def compute_from_df(dfall: pd.DataFrame, cust_total: float) -> Dict[str, float]:
        total_interruptions = dfall['CustomersAffected'].sum()
        total_customer_minutes = (dfall['CustomersAffected'] * dfall['DurationMin']).sum()
        # momentary interruptions: durations less than threshold
        momentary_interruptions = dfall.loc[dfall['DurationMin'] < momentary_threshold_min, 'CustomersAffected'].sum()
        saifi = total_interruptions / cust_total if cust_total and cust_total > 0 else np.nan
        saidi = total_customer_minutes / cust_total if cust_total and cust_total > 0 else np.nan
        caidi = (saidi / saifi) if (saifi and saifi > 0) else np.nan
        maifi = momentary_interruptions / cust_total if cust_total and cust_total > 0 else np.nan
        return {
            'TotalInterruptions': total_interruptions,
            'TotalCustomerMinutes': total_customer_minutes,
            'MomentaryInterruptions': momentary_interruptions,
            'SAIFI': saifi,
            'SAIDI': saidi,
            'CAIDI': caidi,
            'MAIFI': maifi,
        }

    results = {}

    if customers_map is not None:
        # customers_map keys must match the grouping values (e.g. Feeder or
        # AffectedAreaID). For the overall figure, use total_customers when
        # given, otherwise the sum of the map values.
        total_customers_map_sum = sum(customers_map.values())
        overall = compute_from_df(df_work, total_customers if total_customers is not None else total_customers_map_sum)
        results['overall'] = pd.DataFrame([overall])

        # Default grouping when none was provided: Feeder, then AffectedAreaID
        if not groupby_cols:
            if 'Feeder' in df_work.columns:
                groupby_cols = ['Feeder']
            elif 'AffectedAreaID' in df_work.columns:
                groupby_cols = ['AffectedAreaID']

        if groupby_cols:
            rows = []
            for key, sub in df_work.groupby(groupby_cols):
                # key is a scalar for one grouping column, a tuple for several
                keyname = key if isinstance(key, str) else '_'.join(map(str, key))
                cust = customers_map.get(keyname, np.nan)
                metrics = compute_from_df(sub, cust)
                metrics.update({'Group': keyname})
                rows.append(metrics)
            results['by_group'] = pd.DataFrame(rows)
        else:
            results['by_group'] = pd.DataFrame()
    else:
        # No customers_map: total_customers is required for the system-wide indices
        if total_customers is None:
            raise ValueError('Either total_customers or customers_map must be provided to compute per-customer indices')
        overall = compute_from_df(df_work, float(total_customers))
        results['overall'] = pd.DataFrame([overall])
        # Without per-group customer counts we can only report raw interruption
        # and customer-minute totals per group, not normalized indices.
        if groupby_cols:
            rows = []
            for key, sub in df_work.groupby(groupby_cols):
                keyname = key if isinstance(key, str) else '_'.join(map(str, key))
                rows.append({
                    'Group': keyname,
                    'TotalInterruptions': sub['CustomersAffected'].sum(),
                    'TotalCustomerMinutes': (sub['CustomersAffected'] * sub['DurationMin']).sum(),
                    'Events': len(sub),
                })
            results['by_group'] = pd.DataFrame(rows)
        else:
            results['by_group'] = pd.DataFrame()

    # Save CSVs
    results['raw'] = df_work
    results['raw'].to_csv(out_dir / 'events_cleaned.csv', index=False)
    results['overall'].to_csv(out_dir / 'reliability_overall.csv', index=False)
    results['by_group'].to_csv(out_dir / 'reliability_by_group.csv', index=False)

    return results
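
# Example of per-group indices via customers_map (a sketch; the feeder names and
# customer counts below are hypothetical, not taken from the dataset):
#
#   res = compute_reliability(
#       'data/data_1.csv',
#       customers_map={'FDR-001': 12000, 'FDR-002': 8000},
#       groupby_cols=['Feeder'],
#   )
#   print(res['by_group'][['Group', 'SAIFI', 'SAIDI', 'CAIDI', 'MAIFI']])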


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', '-i', required=True, help='Input CSV file')
    parser.add_argument('--total-customers', type=float, help='Total customers served in the system (required if no customers map)')
    parser.add_argument('--exclude-planned', action=argparse.BooleanOptionalAction, default=True, help='Exclude planned outages (default: True; pass --no-exclude-planned to include them)')
    parser.add_argument('--momentary-threshold-min', type=float, default=1.0, help='Threshold in minutes for momentary interruption')
    parser.add_argument('--groupby', nargs='*', default=['Feeder'], help='Columns to group by for breakdown (default: Feeder)')
    args = parser.parse_args()

    res = compute_reliability(
        args.input,
        total_customers=args.total_customers,
        exclude_planned=args.exclude_planned,
        momentary_threshold_min=args.momentary_threshold_min,
        groupby_cols=args.groupby,
    )
    print('Wrote outputs to outputs/ (events_cleaned.csv, reliability_overall.csv, reliability_by_group.csv)')