import json
import os
import random
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional

import dateutil.tz
import numpy as np
import torch

from .config import DEFAULT_SEED
from .data.dataset import (
    SPACE_BAND_GROUPS_IDX,
    SPACE_TIME_BANDS_GROUPS_IDX,
    STATIC_BAND_GROUPS_IDX,
    TIME_BAND_GROUPS_IDX,
)
from .masking import MASKING_MODES, MaskedOutput

data_dir = Path(__file__).parent.parent / "data"
logging_dir = Path(__file__).parent.parent / "logs"
config_dir = Path(__file__).parent.parent / "config"

if not torch.cuda.is_available():
    device = torch.device("cpu")
else:
    device = torch.device("cuda:0")
    torch.cuda.set_device(device)


def will_cause_nans(x: torch.Tensor):
    return torch.isnan(x).any() or torch.isinf(x).any()


# From https://gist.github.com/ihoromi4/b681a9088f348942b01711f251e5f964
def seed_everything(seed: int = DEFAULT_SEED):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark is disabled so cuDNN algorithm selection stays deterministic across runs
    torch.backends.cudnn.benchmark = False
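
# Example (illustrative): seed_everything is intended to be called once, at the top of a
# training or evaluation entry point, before any dataloaders or models are built:
#
#   seed_everything()      # uses DEFAULT_SEED
#   seed_everything(1234)  # or an explicit seed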


def masked_output_np_to_tensor(
    s_t_x, sp_x, t_x, st_x, s_t_m, sp_m, t_m, st_m, month
) -> MaskedOutput:
    """Converts the numpy arrays yielded by an eval task into a MaskedOutput of
    float32 tensors (and a long tensor for the month)."""
    return MaskedOutput(
        torch.as_tensor(s_t_x, dtype=torch.float32),
        torch.as_tensor(sp_x, dtype=torch.float32),
        torch.as_tensor(t_x, dtype=torch.float32),
        torch.as_tensor(st_x, dtype=torch.float32),
        torch.as_tensor(s_t_m, dtype=torch.float32),
        torch.as_tensor(sp_m, dtype=torch.float32),
        torch.as_tensor(t_m, dtype=torch.float32),
        torch.as_tensor(st_m, dtype=torch.float32),
        torch.as_tensor(month, dtype=torch.long),
    )


class AverageMeter:
    """Computes and stores a running sum, count and average."""

    average: float
    sum: float
    count: int

    def __init__(self):
        self.average = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n
        self.average = self.sum / self.count
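
# Example (illustrative sketch) of tracking a running loss with AverageMeter over one
# epoch; `train_loader`, `compute_loss` and `batch` are hypothetical names that are
# not defined in this module:
#
#   loss_meter = AverageMeter()
#   for batch in train_loader:
#       loss = compute_loss(batch)
#       loss_meter.update(loss.item(), n=batch.shape[0])
#   print(f"epoch loss: {loss_meter.average:.4f}")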


def check_config(config):
    """
    Validates a config dict and modifies it in place,
    primarily so that legacy configs keep working with
    updates to the code.
    """
    expected_training_keys_type = {
        "num_epochs": int,
        "batch_size": int,
        "effective_batch_size": int,
        "patch_sizes": list,
        "max_lr": float,
        "final_lr": float,
        "warmup_epochs": (int, float),
        "shape_time_combinations": list,
        "augmentation": dict,
        "masking_probabilities": list,
        "grad_clip": bool,
        "normalization": str,
        "random_masking": str,
    }
    optional_training_keys_type_default = {
        "target_masking": (str, "decoder_only"),
        "double_loss": (bool, False),
        "double_predictors": (bool, False),
        "ignore_band_groups": (list, None),
    }
    training_dict = config["training"]
    for key, val in expected_training_keys_type.items():
        assert key in training_dict, f"Expected {key} in training dict"
        assert isinstance(
            training_dict[key],
            val,  # type: ignore
        ), f"Expected {key} to be {val}, got {type(training_dict[key])}"
    for key, val in optional_training_keys_type_default.items():
        if key in training_dict:
            assert isinstance(
                training_dict[key], val[0]
            ), f"Expected {key} to be {val[0]}, got {type(training_dict[key])}"
        else:
            print(f"{key} missing from training dict. Filling with default value {val[1]}")
            config["training"][key] = val[1]
    assert ("target_exit_after" in training_dict.keys()) or (
        "token_exit_cfg" in training_dict.keys()
    )
    # legacy: encode_ratio, decode_ratio
    # new: separate ratios for st and random
    if "encode_ratio" in training_dict:
        # then we assign it for both st and random
        training_dict["st_encode_ratio"] = training_dict["encode_ratio"]
        training_dict["random_encode_ratio"] = training_dict["encode_ratio"]
        del training_dict["encode_ratio"]
    else:
        assert "st_encode_ratio" in training_dict
        assert "random_encode_ratio" in training_dict
    if "decode_ratio" in training_dict:
        # then we assign it for both st and random
        training_dict["st_decode_ratio"] = training_dict["decode_ratio"]
        training_dict["random_decode_ratio"] = training_dict["decode_ratio"]
        del training_dict["decode_ratio"]
    else:
        assert "st_decode_ratio" in training_dict
        assert "random_decode_ratio" in training_dict

    # we have three options for the losses:
    # 1. the loss keys are in the training config at the top level (legacy)
    # 2. the loss dict has a single entry "loss_dict" (if double loss is False)
    # 3. the loss dict has two entries, "loss_dict_random" and "loss_dict_st",
    #    if double loss is True
    if "loss_type" in training_dict:
        loss_dict = {"loss_type": training_dict["loss_type"]}
        for additional_key in ["loss_mask_other_samples", "pred2unit", "tau"]:
            if additional_key in training_dict:
                loss_dict[additional_key] = training_dict[additional_key]
        training_dict["loss_dict"] = loss_dict

    if training_dict["double_loss"]:
        assert "target_exit_after" in training_dict
        assert isinstance(training_dict["target_exit_after"], int)
        assert "token_exit_cfg" in training_dict
        assert isinstance(training_dict["token_exit_cfg"], dict)
        assert (
            training_dict["random_masking"] == "half"
        ), "double_loss only possible with random_masking == half"
        if "loss_dict" not in training_dict:
            assert "loss_dict_st" in training_dict
            assert "loss_dict_random" in training_dict
    else:
        if "target_exit_after" in training_dict.keys():
            assert isinstance(training_dict["target_exit_after"], int)
            assert "token_exit_cfg" not in training_dict.keys()
            training_dict["token_exit_cfg"] = None
        elif "token_exit_cfg" in training_dict.keys():
            assert isinstance(training_dict["token_exit_cfg"], dict)
            assert "target_exit_after" not in training_dict.keys()
            training_dict["target_exit_after"] = None

    if training_dict["double_predictors"]:
        if not training_dict["double_loss"]:
            raise ValueError("double_predictors only possible with double loss")

    if isinstance(training_dict["warmup_epochs"], float):
        training_dict["warmup_epochs"] = int(
            training_dict["warmup_epochs"] * training_dict["num_epochs"]
        )
    assert isinstance(training_dict["warmup_epochs"], int)
    assert training_dict["num_epochs"] > training_dict["warmup_epochs"]
    assert training_dict["normalization"] in ["std", "scaling"]
    assert training_dict["random_masking"] in ["half", "full", "none"]
    assert len(training_dict["masking_probabilities"]) == len(
        MASKING_MODES
    ), f"Expected {len(MASKING_MODES)}, got {len(training_dict['masking_probabilities'])}"
    for combination in training_dict["shape_time_combinations"]:
        assert "timesteps" in combination.keys()
        assert "size" in combination.keys()
        assert combination["timesteps"] >= 3

    expected_encoder_decoder_keys_type = {
        "embedding_size": int,
        "depth": int,
        "mlp_ratio": int,
        "num_heads": int,
        "max_sequence_length": int,
    }
    expected_encoder_only_keys_type = {"freeze_projections": bool, "drop_path": float}
    expected_decoder_only_keys_type = {"learnable_channel_embeddings": bool}
    model_dict = config["model"]
    for model in ["encoder", "decoder"]:
        assert model in model_dict
        for key, val in expected_encoder_decoder_keys_type.items():
            assert key in model_dict[model], f"Expected {key} in {model} dict"
            assert isinstance(model_dict[model][key], val)
        if model == "encoder":
            for key, val in expected_encoder_only_keys_type.items():
                assert key in model_dict[model], f"Expected {key} in {model} dict"
                assert isinstance(model_dict[model][key], val)
        elif model == "decoder":
            for key, val in expected_decoder_only_keys_type.items():
                assert key in model_dict[model], f"Expected {key} in {model} dict"
                assert isinstance(model_dict[model][key], val)

    config["model"]["encoder"]["max_patch_size"] = max(config["training"]["patch_sizes"])
    config["model"]["decoder"]["max_patch_size"] = max(config["training"]["patch_sizes"])
    config["model"]["decoder"]["encoder_embedding_size"] = config["model"]["encoder"][
        "embedding_size"
    ]
    config["model"]["decoder"]["decoder_embedding_size"] = config["model"]["decoder"].pop(
        "embedding_size"
    )
    # "loss_dict" is itself a dict, so the MAE check compares against its loss type
    if ("loss_dict" in config["training"]) and (
        config["training"]["loss_dict"].get("loss_type") == "MAE"
    ):
        max_patch_size = max(config["training"]["patch_sizes"])
        max_group_length = max(
            [
                max([len(v) for _, v in SPACE_TIME_BANDS_GROUPS_IDX.items()]),
                max([len(v) for _, v in TIME_BAND_GROUPS_IDX.items()]),
                max([len(v) for _, v in SPACE_BAND_GROUPS_IDX.items()]),
                max([len(v) for _, v in STATIC_BAND_GROUPS_IDX.items()]),
            ]
        )
        config["model"]["decoder"]["output_embedding_size"] = (
            max_patch_size**2
        ) * max_group_length
    return config
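
# Illustrative sketch of the nested dict shape check_config expects (after json.load);
# the values below are placeholders rather than recommended settings, and
# "masking_probabilities" is elided:
#
#   {
#       "training": {
#           "num_epochs": 100, "batch_size": 32, "effective_batch_size": 256,
#           "patch_sizes": [1, 2, 4], "max_lr": 1e-3, "final_lr": 1e-5,
#           "warmup_epochs": 0.1, "shape_time_combinations": [{"size": 4, "timesteps": 12}],
#           "augmentation": {}, "masking_probabilities": [...],  # one value per MASKING_MODES entry
#           "grad_clip": True, "normalization": "std", "random_masking": "half",
#           "encode_ratio": 0.25, "decode_ratio": 0.75,
#           "target_exit_after": 0, "loss_type": "MAE",
#       },
#       "model": {
#           "encoder": {"embedding_size": 128, "depth": 2, "mlp_ratio": 4, "num_heads": 8,
#                       "max_sequence_length": 24, "freeze_projections": False, "drop_path": 0.1},
#           "decoder": {"embedding_size": 128, "depth": 2, "mlp_ratio": 4, "num_heads": 8,
#                       "max_sequence_length": 24, "learnable_channel_embeddings": False},
#       },
#   }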


def load_check_config(name: str) -> Dict:
    with (config_dir / "models" / name).open("r") as f:
        config = json.load(f)
    config = check_config(config)
    return config


def timestamp_dirname(suffix: Optional[str] = None) -> str:
    ts = datetime.now(dateutil.tz.tzlocal()).strftime("%Y_%m_%d_%H_%M_%S_%f")
    return f"{ts}_{suffix}" if suffix is not None else ts
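
# Example (illustrative): timestamp_dirname("run1") returns a name like
# "2024_05_31_14_02_07_123456_run1"; with no suffix, just the timestamp is returned.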


def is_bf16_available():
    # https://github.com/huggingface/transformers/blob/d91841315aab55cf1347f4eb59332858525fad0f/src/transformers/utils/import_utils.py#L275
    # https://github.com/pytorch/pytorch/blob/2289a12f21c54da93bf5d696e3f9aea83dd9c10d/torch/testing/_internal/common_cuda.py#L51
    # to succeed:
    # 1. the hardware needs to support bf16 (arch >= Ampere)
    # 2. torch >= 1.10 (1.9 might be enough, but the AMP API changed in 1.10, so 1.10 is used as the minimum)
    # 3. CUDA >= 11
    # 4. torch.autocast exists
    # XXX: one problem here is that it may give invalid results on a mixed-GPU setup, so it's
    # really only correct for the 0th GPU (or the currently set default device, if different from 0)
    if not torch.cuda.is_available() or torch.version.cuda is None:
        return False
    if torch.cuda.get_device_properties(torch.cuda.current_device()).major < 8:
        return False
    if int(torch.version.cuda.split(".")[0]) < 11:
        return False
    if not hasattr(torch, "autocast"):
        return False
    return True
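
# Example (illustrative sketch): this check can be used to pick an autocast dtype for
# mixed-precision training; `model` and `batch` are hypothetical names:
#
#   amp_dtype = torch.bfloat16 if is_bf16_available() else torch.float16
#   with torch.autocast(device_type="cuda", dtype=amp_dtype):
#       output = model(batch)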