|
|
from datetime import datetime |
|
|
from io import StringIO |
|
|
|
|
|
import boto3 |
|
|
import pandas as pd |
|
|
|
|
|
from tools.config import ( |
|
|
AWS_ACCESS_KEY, |
|
|
AWS_REGION, |
|
|
AWS_SECRET_KEY, |
|
|
DOCUMENT_REDACTION_BUCKET, |
|
|
OUTPUT_FOLDER, |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Build the S3 client: pass explicit credentials only when all three are
# configured, otherwise fall back to boto3's default credential chain
# (env vars, shared config, instance profile, ...).
client_kwargs = {}
if AWS_ACCESS_KEY and AWS_SECRET_KEY and AWS_REGION:
    client_kwargs = {
        "aws_access_key_id": AWS_ACCESS_KEY,
        "aws_secret_access_key": AWS_SECRET_KEY,
        "region_name": AWS_REGION,
    }
s3 = boto3.client("s3", **client_kwargs)

bucket_name = DOCUMENT_REDACTION_BUCKET
prefix = "usage/"
# Inclusive YYYYMMDD bounds of the log window to consolidate.
earliest_date = "20250409"
latest_date = "20250423"
|
|
|
|
|
|
|
|
|
|
|
def list_files_in_s3(bucket, prefix, s3_client=None):
    """Return every object key under *prefix* in *bucket*.

    Fixes a silent truncation bug: ``list_objects_v2`` returns at most
    1000 keys per call, so the result must be paginated via
    ``NextContinuationToken`` or files beyond the first page are dropped.

    Args:
        bucket: S3 bucket name.
        prefix: Key prefix to list under.
        s3_client: Optional boto3 S3 client; defaults to the module-level
            ``s3`` client so existing callers are unaffected.

    Returns:
        List of object keys (empty when nothing matches).
    """
    client = s3_client if s3_client is not None else s3
    keys = []
    continuation_token = None
    while True:
        kwargs = {"Bucket": bucket, "Prefix": prefix}
        if continuation_token:
            kwargs["ContinuationToken"] = continuation_token
        response = client.list_objects_v2(**kwargs)
        keys.extend(content["Key"] for content in response.get("Contents", []))
        # IsTruncated is False (or absent) on the final page.
        if not response.get("IsTruncated"):
            break
        continuation_token = response.get("NextContinuationToken")
    return keys
|
|
|
|
|
|
|
|
|
|
|
def is_within_date_range(date_str, start_date, end_date):
    """Return True if *date_str* (``YYYYMMDD``) falls inside the inclusive
    ``[start_date, end_date]`` window.

    Strings that do not parse as ``YYYYMMDD`` (e.g. a non-date folder name
    in the bucket) return False instead of raising ``ValueError``, so one
    stray key cannot crash the whole consolidation run.

    Args:
        date_str: Candidate date string in ``YYYYMMDD`` form.
        start_date: Inclusive lower bound as a ``datetime``.
        end_date: Inclusive upper bound as a ``datetime``.
    """
    try:
        date_obj = datetime.strptime(date_str, "%Y%m%d")
    except ValueError:
        return False
    return start_date <= date_obj <= end_date
|
|
|
|
|
|
|
|
|
|
|
# Parse the inclusive window bounds once, up front.
start_date = datetime.strptime(earliest_date, "%Y%m%d")
end_date = datetime.strptime(latest_date, "%Y%m%d")

all_files = list_files_in_s3(bucket_name, prefix)

# Keep only keys shaped like "usage/<YYYYMMDD>/.../log.csv" whose date
# folder falls inside the window.
log_files = []
for key in all_files:
    segments = key.split("/")
    if len(segments) < 3:
        continue
    # Same evaluation order as before: date check first, filename second.
    if is_within_date_range(segments[1], start_date, end_date) and segments[-1] == "log.csv":
        log_files.append(key)
|
|
|
|
|
|
|
|
# Download each matched log and parse it into a DataFrame.
df_list = []
for log_file in log_files:
    obj = s3.get_object(Bucket=bucket_name, Key=log_file)

    # Read the body exactly once: botocore's StreamingBody is exhausted
    # after the first read(), so the original pattern of calling read()
    # again in the except branch always decoded empty bytes. Decode the
    # cached bytes with a latin-1 fallback instead.
    raw_bytes = obj["Body"].read()
    try:
        csv_content = raw_bytes.decode("utf-8")
    except UnicodeDecodeError as e:
        print("Could not load in log file:", log_file, "due to:", e)
        # latin-1 maps every byte to a code point, so this cannot fail.
        csv_content = raw_bytes.decode("latin-1")

    try:
        df = pd.read_csv(StringIO(csv_content))
    except Exception as e:
        # Skip malformed/empty CSVs rather than aborting the whole run.
        print("Could not load in log file:", log_file, "due to:", e)
        continue

    df_list.append(df)
|
|
|
|
|
|
|
|
# Write the consolidated output, or report that nothing matched.
if not df_list:
    print("No log files found in the given date range.")
else:
    concatenated_df = pd.concat(df_list, ignore_index=True)
    # Destination keeps the configured output-folder prefix.
    concatenated_df.to_csv(OUTPUT_FOLDER + "consolidated_s3_logs.csv", index=False)
    print("Consolidated CSV saved as 'consolidated_s3_logs.csv'")
|
|
|