Spaces:

seanpedrickcase
/

document_redaction_vlm

Running on Zero

File size: 14,124 Bytes

d864d45

import os
from typing import List, Type

import boto3
import pandas as pd

from tools.config import (
    AWS_REGION,
    DOCUMENT_REDACTION_BUCKET,
    RUN_AWS_FUNCTIONS,
    S3_OUTPUTS_BUCKET,
    SAVE_LOGS_TO_CSV,
)
from tools.secure_path_utils import secure_join

# Type alias, presumably intended for annotating pandas DataFrame values; it is
# not referenced by any of the functions visible in this file.
# NOTE(review): typing.Type[pd.DataFrame] denotes the DataFrame class object
# itself rather than a DataFrame instance - confirm this is the intended meaning.
PandasDataFrame = Type[pd.DataFrame]


def get_assumed_role_info():
    """
    Ask AWS STS who the current caller is.

    Returns:
    - Tuple of (ARN reported by STS for the caller, the text after the final
      "/" in that ARN, which is treated as the role name).
    """
    # Build the regional STS endpoint URL from the configured region
    regional_endpoint = "".join(("https://sts.", AWS_REGION, ".amazonaws.com"))

    sts_client = boto3.client(
        "sts", region_name=AWS_REGION, endpoint_url=regional_endpoint
    )

    # The caller identity response carries the full ARN under "Arn"
    caller_arn = sts_client.get_caller_identity()["Arn"]

    # Everything after the last "/" of the ARN is used as the role name
    role_name = caller_arn.rsplit("/", 1)[-1]

    return caller_arn, role_name


if RUN_AWS_FUNCTIONS:
    # Start a boto3 session for the configured region. Failure is reported to
    # the console only, so importing this module never raises because of AWS.
    try:
        session = boto3.Session(region_name=AWS_REGION)
    except Exception as e:
        print("Could not start boto3 session:", e)

    # Look up the caller identity via STS. On failure the two names below are
    # simply left undefined and a message is printed.
    try:
        assumed_role_arn, assumed_role_name = get_assumed_role_info()
    except Exception as e:
        print("Could not get assumed role from STS:", e)
    else:
        print("Successfully assumed ARN role")


# Download direct from S3 - requires login credentials
def download_file_from_s3(
    bucket_name: str,
    key: str,
    local_file_path_and_name: str,
    RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
):
    """
    Download a single object from S3 to a local file.

    Args:
    - bucket_name: Name of the S3 bucket to download from.
    - key: Key of the object inside the bucket.
    - local_file_path_and_name: Local destination path, including file name.
    - RUN_AWS_FUNCTIONS: When falsy the function does nothing.

    Returns:
    - None. Success and failure are both reported by printing to the console;
      exceptions raised during the download are caught and printed.
    """
    if not RUN_AWS_FUNCTIONS:
        return

    try:
        # Ensure the local directory exists. A bare file name has an empty
        # dirname, and os.makedirs("") raises FileNotFoundError, so only
        # create the directory when there is one to create.
        target_dir = os.path.dirname(local_file_path_and_name)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        s3 = boto3.client("s3", region_name=AWS_REGION)
        s3.download_file(bucket_name, key, local_file_path_and_name)
        print(
            f"File downloaded from s3://{bucket_name}/{key} to {local_file_path_and_name}"
        )
    except Exception as e:
        print("Could not download file:", key, "from s3 due to", e)


def download_folder_from_s3(
    bucket_name: str,
    s3_folder: str,
    local_folder: str,
    RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
):
    """
    Download all files from an S3 folder to a local folder.

    Args:
    - bucket_name: Name of the S3 bucket.
    - s3_folder: Key prefix whose objects should be downloaded.
    - local_folder: Local directory that receives the files. Each object is
      written at its path relative to s3_folder.
    - RUN_AWS_FUNCTIONS: When falsy the function does nothing.

    Returns:
    - None. Progress and per-file download errors are printed to the console.
      Errors raised while listing the bucket are not caught here.
    """
    if not RUN_AWS_FUNCTIONS:
        return

    if not (bucket_name and s3_folder and local_folder):
        print("One or more required variables are empty, could not download from S3")
        return

    s3 = boto3.client("s3", region_name=AWS_REGION)

    # A single list_objects_v2 call returns at most 1,000 keys, so use the
    # paginator to walk every page of results for the prefix.
    paginator = s3.get_paginator("list_objects_v2")

    for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_folder):
        for obj in page.get("Contents", []):
            object_key = obj["Key"]

            # Keys ending in "/" are folder placeholders, not files. Writing
            # one locally would create a file where a directory is needed for
            # the objects underneath it.
            if object_key.endswith("/"):
                continue

            # Construct the local file path from the key relative to the prefix
            local_file_path = secure_join(
                local_folder, os.path.relpath(object_key, s3_folder)
            )

            # Create directories if necessary (dirname may be empty)
            parent_dir = os.path.dirname(local_file_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            # Download the object; a failure only affects this one file
            try:
                s3.download_file(bucket_name, object_key, local_file_path)
                print(
                    f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'"
                )
            except Exception as e:
                print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)


def download_files_from_s3(
    bucket_name: str,
    s3_folder: str,
    local_folder: str,
    filenames: List[str],
    RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
):
    """
    Download specific files from an S3 folder to a local folder.

    Args:
    - bucket_name: Name of the S3 bucket.
    - s3_folder: Key prefix (folder) that contains the files.
    - local_folder: Local directory that receives the files.
    - filenames: List of file names inside s3_folder, a single file name as a
      string, or the string "*" to download every object listed under
      s3_folder (by the last "/"-separated part of its key).
    - RUN_AWS_FUNCTIONS: When falsy the function does nothing.

    Returns:
    - None. Progress and per-file download errors are printed to the console.
    """
    if not RUN_AWS_FUNCTIONS:
        return

    if not (bucket_name and s3_folder and local_folder and filenames):
        print("One or more required variables are empty, could not download from S3")
        return

    s3 = boto3.client("s3", region_name=AWS_REGION)

    print("Trying to download file: ", filenames)

    if filenames == "*":
        # List all objects in the S3 folder. One list_objects_v2 call returns
        # at most 1,000 keys, so collect every page via the paginator.
        print("Trying to download all files in AWS folder: ", s3_folder)
        paginator = s3.get_paginator("list_objects_v2")
        found_objects = []
        for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_folder):
            found_objects.extend(page.get("Contents", []))

        print("Found files in AWS folder: ", found_objects)

        # Keep only the final path component of each key. Folder placeholder
        # keys end in "/" and yield an empty name, which is dropped.
        key_names = (obj["Key"].split("/")[-1] for obj in found_objects)
        filenames = [name for name in key_names if name]

        print("Found filenames in AWS folder: ", filenames)
    elif isinstance(filenames, str):
        # A single file name passed as a string would otherwise be iterated
        # character by character below.
        filenames = [filenames]

    for filename in filenames:
        object_key = secure_join(s3_folder, filename)
        local_file_path = secure_join(local_folder, filename)

        # Create directories if necessary (dirname may be empty)
        parent_dir = os.path.dirname(local_file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Download the object; a failure only affects this one file
        try:
            s3.download_file(bucket_name, object_key, local_file_path)
            print(
                f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'"
            )
        except Exception as e:
            print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)


def upload_file_to_s3(
    local_file_paths: List[str],
    s3_key: str,
    s3_bucket: str = DOCUMENT_REDACTION_BUCKET,
    RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
):
    """
    Upload one or more local files to Amazon S3.

    Args:
    - local_file_paths: Local path of the file to upload, or a list of paths.
    - s3_key: Key prefix in the bucket. Each file's base name is appended to
        it directly, with no separator added.
    - s3_bucket: Name of the S3 bucket.
    - RUN_AWS_FUNCTIONS: When falsy nothing is uploaded.

    Returns:
    - Status message string: one line per file on the normal path, otherwise
        a single message explaining why nothing was uploaded.
    """
    if not RUN_AWS_FUNCTIONS:
        return "App not set to run AWS functions"

    per_file_messages = []
    result_message = ""

    try:
        if not (s3_bucket and s3_key and local_file_paths):
            return "At least one essential variable is empty, could not upload to S3"

        s3_client = boto3.client("s3", region_name=AWS_REGION)

        # Accept a bare string as a one-element list
        if isinstance(local_file_paths, str):
            paths_to_upload = [local_file_paths]
        else:
            paths_to_upload = local_file_paths

        for local_path in paths_to_upload:
            if not s3_client:
                result_message = "Could not connect to AWS."
                continue

            # A failure for one file is recorded and the loop carries on
            try:
                base_name = os.path.basename(local_path)
                destination_key = s3_key + base_name

                s3_client.upload_file(local_path, s3_bucket, destination_key)
                status = "File " + base_name + " uploaded successfully!"
            except Exception as e:
                status = f"Error uploading file(s): {e}"
                print(status)

            per_file_messages.append(status)
            result_message = "\n".join(per_file_messages)

    except Exception as e:
        result_message = "Could not upload files to S3 due to: " + str(e)
        print(result_message)

    return result_message


def upload_log_file_to_s3(
    local_file_paths: List[str],
    s3_key: str,
    s3_bucket: str = DOCUMENT_REDACTION_BUCKET,
    RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
    SAVE_LOGS_TO_CSV: bool = SAVE_LOGS_TO_CSV,
):
    """
    Uploads log file(s) from local machine to Amazon S3.

    Nothing is uploaded unless both RUN_AWS_FUNCTIONS and SAVE_LOGS_TO_CSV
    are truthy.

    Args:
    - local_file_paths: Local path of the log file to upload, or a list of paths.
    - s3_key: Key prefix in the bucket. Each file's base name is appended to
        it directly, with no separator added.
    - s3_bucket: Name of the S3 bucket.
    - RUN_AWS_FUNCTIONS: When falsy nothing is uploaded.
    - SAVE_LOGS_TO_CSV: When falsy nothing is uploaded.

    Returns:
    - Status message string: one line per file on the normal path, otherwise
        a single message explaining why nothing was uploaded.
    """
    if not RUN_AWS_FUNCTIONS:
        return "App not set to run AWS functions"

    if not SAVE_LOGS_TO_CSV:
        # AWS functions are enabled here, so report the setting that actually
        # prevented the upload instead of blaming the AWS toggle.
        return "App not set to save logs to CSV, log files not uploaded to S3"

    final_out_message = []
    final_out_message_str = ""

    try:
        if not (s3_bucket and s3_key and local_file_paths):
            return "At least one essential variable is empty, could not upload to S3"

        s3_client = boto3.client("s3", region_name=AWS_REGION)

        # Accept a bare string as a one-element list
        if isinstance(local_file_paths, str):
            local_file_paths = [local_file_paths]

        for file in local_file_paths:
            # Defensive check kept from the original implementation
            if not s3_client:
                final_out_message_str = "Could not connect to AWS."
                continue

            # A failure for one file is recorded and the loop carries on
            try:
                # Get file name off file path
                file_name = os.path.basename(file)
                s3_key_full = s3_key + file_name

                s3_client.upload_file(file, s3_bucket, s3_key_full)
                out_message = "File " + file_name + " uploaded successfully!"
            except Exception as e:
                out_message = f"Error uploading file(s): {e}"
                print(out_message)

            final_out_message.append(out_message)
            final_out_message_str = "\n".join(final_out_message)

    except Exception as e:
        final_out_message_str = "Could not upload files to S3 due to: " + str(e)
        print(final_out_message_str)

    return final_out_message_str


# Helper to upload outputs to S3 when enabled in config.
def export_outputs_to_s3(
    file_list_state,
    s3_output_folder_state_value: str,
    save_outputs_to_s3_flag: bool,
    base_file_state=None,
    s3_bucket: str = S3_OUTPUTS_BUCKET,
):
    """
    Upload a list of local output files to the configured S3 outputs folder.

    - file_list_state: Gradio dropdown state that holds a list of file paths or a
        single path/string. If blank/empty, no action is taken.
    - s3_output_folder_state_value: Final S3 key prefix (including any session hash)
        to use as the destination folder for uploads. If empty, no action is taken.
    - save_outputs_to_s3_flag: Runtime toggle; when falsy no action is taken.
    - base_file_state: Optional original input file(s) being analysed: a path
        string, an object with a `.name` attribute, or a list of either (only
        the first list item is used). Its file stem becomes the subfolder name
        for every upload. When absent, each output file's own stem is used.
    - s3_bucket: Name of the S3 bucket.

    Returns None. Any exception is caught and printed so that a failed upload
    does not interrupt the calling app flow.
    """
    try:

        # Respect the runtime toggle as well as environment configuration
        if not save_outputs_to_s3_flag:
            return

        if not s3_output_folder_state_value:
            # No configured S3 outputs folder - nothing to do
            return

        # Normalise input to a Python list of strings
        file_paths = file_list_state
        if not file_paths:
            return

        # Gradio dropdown may return a single string or a list
        if isinstance(file_paths, str):
            file_paths = [file_paths]

        # Filter out any non-truthy values
        file_paths = [p for p in file_paths if p]
        if not file_paths:
            return

        # Derive a base file stem (name without extension) from the original
        # file(s) being analysed, if provided, so that outputs are grouped
        # under the analysed file name rather than under each output file name.
        base_stem = None
        if base_file_state:
            if isinstance(base_file_state, str):
                base_path = base_file_state
            else:
                # For a list only the first entry is considered
                source_item = (
                    base_file_state[0]
                    if isinstance(base_file_state, list)
                    else base_file_state
                )
                base_path = getattr(source_item, "name", None) or str(source_item)

            if base_path:
                base_stem, _ = os.path.splitext(os.path.basename(base_path))

        # Ensure base S3 prefix (session/date) ends with a trailing slash
        base_prefix = s3_output_folder_state_value
        if not base_prefix.endswith("/"):
            base_prefix = base_prefix + "/"

        # Final key pattern per file:
        #   <s3_output_folder>/<base_file_stem>/<file_name>
        # or, if base_file_stem is not available:
        #   <s3_output_folder>/<output_file_stem>/<file_name>
        failed_count = 0
        for file in file_paths:
            file_name = os.path.basename(file)

            if base_stem:
                folder_stem = base_stem
            else:
                folder_stem, _ = os.path.splitext(file_name)

            per_file_prefix = base_prefix + folder_stem + "/"

            out_message = upload_file_to_s3(
                local_file_paths=[file],
                s3_key=per_file_prefix,
                s3_bucket=s3_bucket,
            )

            # upload_file_to_s3 reports success for a file as
            # "File <name> uploaded successfully!". Anything else (upload
            # error, empty argument, AWS functions disabled, no connection)
            # means the file did not reach S3, so log it and count it.
            uploaded = out_message.startswith("File ") and out_message.endswith(
                " uploaded successfully!"
            )
            if not uploaded:
                failed_count += 1
                print("export_outputs_to_s3 encountered issues:", out_message)

        # Only claim success when every file was actually uploaded
        if failed_count == 0:
            print("Successfully uploaded outputs to S3")
        else:
            print(
                f"export_outputs_to_s3: {failed_count} of {len(file_paths)} "
                "file(s) were not uploaded to S3"
            )

    except Exception as e:
        # Do not break the app flow if S3 upload fails - just report to console
        print(f"export_outputs_to_s3 failed with error: {e}")

    # No GUI outputs to update
    return