DarthReca
/

actu-magnitude-regression

@@ -1,203 +1,89 @@
-from dataclasses import dataclass
 import numpy as np
 import timm
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from einops import rearrange, repeat
 from segmentation_models_pytorch.base import SegmentationHead
 from segmentation_models_pytorch.decoders.unet.decoder import UnetDecoder
 from timm.layers.create_act import create_act_layer
-from transformers import PretrainedConfig, PreTrainedModel
-from transformers.modeling_outputs import SemanticSegmenterOutput
 from .convlstm import ConvLSTM
class ACTUConfig(PretrainedConfig):
    """Configuration for the ACTU model and its optional DEM / climate variants.

    Every constructor argument is stored as an attribute of the same name.
    ``in_channels`` is the exception: the stored value is the *effective*
    channel count, i.e. the requested count plus one when ``use_dem_input``
    is true. The requested (pre-DEM) count is kept in ``base_in_channels``.

    Args:
        in_channels: Image channels before the optional DEM channel is added.
        kernel_size, padding, stride, bias, batch_first, bidirectional:
            Stored unchanged; consumers decide how to use them.
        backbone: Backbone name, stored unchanged.
        original_resolution: ``(H, W)`` of the input images.
        act_layer: Name of the output activation.
        n_classes: Number of output channels.
        use_dem_input: When true, one extra input channel is reserved.
        use_climate_branch: Flag for the climate branch variant.
        climate_seq_len, climate_input_dim, lstm_hidden_dim, num_lstm_layers:
            Climate-branch hyper-parameters, stored unchanged.
        **kwargs: Forwarded to ``PretrainedConfig``. A ``base_in_channels``
            entry (present when a saved config is reloaded) is consumed here.
    """

    model_type = "actu"

    def __init__(
        self,
        # Base ACTU parameters
        in_channels: int = 3,
        kernel_size: tuple[int, int] = (3, 3),
        padding="same",
        stride=(1, 1),
        backbone="resnet34",
        bias=True,
        batch_first=True,
        bidirectional=False,
        original_resolution=(256, 256),
        act_layer="sigmoid",
        n_classes=1,
        # Variant control parameters
        use_dem_input: bool = False,
        use_climate_branch: bool = False,
        # Climate branch parameters
        # NOTE(review): ClimateBranchLSTM defaults to climate_input_dim=5 /
        # climate_seq_len=6, the reverse of the defaults here - confirm which
        # pairing matches the data.
        climate_seq_len=5,
        climate_input_dim=6,
        lstm_hidden_dim=128,
        num_lstm_layers=1,
        **kwargs,
    ):
        # A config serialized by this class carries ``base_in_channels`` next
        # to the already-adjusted ``in_channels``. Rebuilding from the base
        # value keeps a save/load round trip from adding the DEM channel a
        # second time. Configs that lack the key behave exactly as before.
        base_in_channels = kwargs.pop("base_in_channels", in_channels)
        super().__init__(**kwargs)
        self.base_in_channels = base_in_channels
        self.in_channels = base_in_channels
        self.kernel_size = kernel_size
        self.padding = padding
        self.stride = stride
        self.backbone = backbone
        self.bias = bias
        self.batch_first = batch_first
        self.bidirectional = bidirectional
        self.original_resolution = original_resolution
        self.act_layer = act_layer
        self.n_classes = n_classes
        # Parameters to control variants
        self.use_dem_input = use_dem_input
        self.use_climate_branch = use_climate_branch
        self.climate_seq_len = climate_seq_len
        self.climate_input_dim = climate_input_dim
        self.lstm_hidden_dim = lstm_hidden_dim
        self.num_lstm_layers = num_lstm_layers
        # Reserve one extra input channel for the DEM band.
        if self.use_dem_input:
            self.in_channels += 1
-class ACTUForImageSegmentation(PreTrainedModel):
-    config_class = ACTUConfig
-    def __init__(self, config: ACTUConfig):
-        super().__init__(config)
-        self.config = config
         self.encoder: nn.Module = timm.create_model(
-            config.backbone, features_only=True, in_chans=config.in_channels
         )
         with torch.no_grad():
-            dummy_input_channels = config.in_channels
-            dummy_input = torch.randn(
-                1, dummy_input_channels, *config.original_resolution
             )
-            embs = self.encoder(dummy_input)
-            self.embs_shape = [e.shape for e in embs]
-            self.encoder_channels = [e[1] for e in self.embs_shape]
         self.convlstm = nn.ModuleList(
-            [
-                ConvLSTM(
-                    in_channels=shape[1],
-                    hidden_channels=shape[1],
-                    kernel_size=config.kernel_size,
-                    padding=config.padding,
-                    stride=config.stride,
-                    bias=config.bias,
-                    batch_first=config.batch_first,
-                    bidirectional=config.bidirectional,
-                )
-                for shape in self.embs_shape
-            ]
-        )
-        if self.config.use_climate_branch:
-            self.climate_branch = ClimateBranchLSTM(
-                output_shapes=[e[1:] for e in self.embs_shape],
-                lstm_hidden_dim=config.lstm_hidden_dim,
-                climate_seq_len=config.climate_seq_len,
-                climate_input_dim=config.climate_input_dim,
-                num_lstm_layers=config.num_lstm_layers,
-            )
-            self.fusers = nn.ModuleList(
-                GatedFusion(enc, enc) for enc in self.encoder_channels
             )
         self.decoder = UnetDecoder(
-            encoder_channels=[1] + self.encoder_channels,
-            decoder_channels=self.encoder_channels[::-1],
-            n_blocks=len(self.encoder_channels),
         )
         self.seg_head = nn.Sequential(
             SegmentationHead(
-                in_channels=self.encoder_channels[0],
-                out_channels=config.n_classes,
             ),
-            create_act_layer(config.act_layer, inplace=True),
         )
-    def forward(
-        self,
-        pixel_values: torch.Tensor,
-        climate: torch.Tensor = None,
-        dem: torch.Tensor = None,
-        labels: torch.Tensor = None,
-        **kwargs,
-    ) -> SemanticSegmenterOutput:
-        b, t = pixel_values.shape[:2]
-        original_size = pixel_values.shape[-2:]
-        # Handle DEM input
-        if self.config.use_dem_input:
-            if dem is None:
-                raise ValueError(
-                    "DEM tensor must be provided when use_dem_input is True."
-                )
-            dem_repeated = repeat(dem, "b c h w -> b t c h w", t=t)
-            pixel_values = torch.cat([pixel_values, dem_repeated], dim=2)
-        # 1. Encode images per time step
-        encoded_sequence = self._encode_images(pixel_values)
-        # 2. Handle Climate Branch Fusion
-        if self.config.use_climate_branch:
-            if climate is None:
-                raise ValueError(
-                    "Climate tensor must be provided when use_climate_branch is True."
-                )
-            climate_features = self.climate_branch(climate)
-            # Reshape for fusion
-            encoded_sequence_reshaped = [
-                rearrange(f, "b t c h w -> (b t) c h w") for f in encoded_sequence
-            ]
-            climate_features_reshaped = [
-                rearrange(f, "b t c h w -> (b t) c h w") for f in climate_features
-            ]
-            # Fuse features
-            fused_features = [
-                fuser(img, clim)
-                for fuser, img, clim in zip(
-                    self.fusers, encoded_sequence_reshaped, climate_features_reshaped
-                )
-            ]
-            # Reshape back to sequence
-            encoded_sequence = [
-                rearrange(f, "(b t) c h w -> b t c h w", b=b) for f in fused_features
-            ]
-        # 3. Process sequence with ConvLSTM
-        temporal_features = self._encode_timeseries(encoded_sequence)
-        # 4. Decode to get the segmentation map
-        logits = self._decode(temporal_features, size=original_size)
-        loss = None
-        if labels is not None:
-            loss_fct = nn.CrossEntropyLoss()
-            loss = loss_fct(logits, labels.float().unsqueeze(1))
-        return SemanticSegmenterOutput(
-            loss=loss,
-            logits=logits,
-        )
     def _encode_images(self, x: torch.Tensor) -> list[torch.Tensor]:
         B = x.size(0)
@@ -221,112 +107,3 @@ class ACTUForImageSegmentation(PreTrainedModel):
             trend_map, size=size, mode="bilinear", align_corners=False
         )
         return trend_map
class ClimateBranchLSTM(nn.Module):
    """Encode per-time-step climate series and expand them into feature maps.

    Input:  ``(B, T, T_1, C_clim)``, e.g. ``(B, 5, 6, 5)``. Each of the
            ``B * T`` climate series of length ``T_1`` is run through the
            LSTM independently.
    Output: a list with one tensor per entry of ``output_shapes``, each laid
            out ``(B, T, C_i, H_i, W_i)``. ``C_i`` is that entry's channel
            count; the spatial size is whatever the matching
            ``_build_upsampler`` stack produces from a 1x1 map.

    Args:
        output_shapes: ``(C, H, W)`` of every feature map to produce. Only
            ``C`` and ``H`` are passed to the upsampler builder.
        climate_input_dim: Feature size of one climate step (LSTM input size).
        climate_seq_len: Stored on the instance; not otherwise used here.
        lstm_hidden_dim: LSTM hidden size.
        num_lstm_layers: Number of stacked LSTM layers. Dropout of 0.3 is
            enabled only when there is more than one layer.
    """

    def __init__(
        self,
        output_shapes: list[tuple[int, int, int]],
        climate_input_dim=5,
        climate_seq_len=6,
        lstm_hidden_dim=64,
        num_lstm_layers=1,
    ):
        super().__init__()
        self.climate_seq_len = climate_seq_len
        self.climate_input_dim = climate_input_dim
        self.lstm_hidden_dim = lstm_hidden_dim
        self.num_lstm_layers = num_lstm_layers
        self.proj_dim = 128
        self.output_shapes = output_shapes

        self.lstm = nn.LSTM(
            input_size=climate_input_dim,
            hidden_size=lstm_hidden_dim,
            num_layers=num_lstm_layers,
            batch_first=True,  # Crucial: expects input shape (batch, seq_len, features)
            dropout=0.3 if num_lstm_layers > 1 else 0,
            bidirectional=False,
        )
        # Linear layer to project LSTM output to the desired final dimension
        self.fc = nn.Linear(lstm_hidden_dim, self.proj_dim)
        # One upsampling stack per requested feature map.
        self.upsamples = nn.ModuleList(
            _build_upsampler(self.proj_dim, *shape[:2]) for shape in output_shapes
        )

    def forward(self, climate_data: torch.Tensor) -> list[torch.Tensor]:
        """Return one ``(B, T, C_i, H_i, W_i)`` climate map per configured scale."""
        # climate_data shape: (B, T, T_1, C_clim), e.g., (B, 5, 6, 5)
        batch, steps, _, _ = climate_data.shape

        # Fold the image-time axis into the batch so every climate series is
        # processed independently.
        lstm_input = rearrange(climate_data, "b t s c -> (b t) s c")
        _, (hidden, _) = self.lstm(lstm_input)

        # ``hidden`` is (num_layers * num_directions, N, H), ordered layer by
        # layer with the directions adjacent. The top layer is therefore the
        # final entry (unidirectional) or the final two entries
        # (bidirectional), which are averaged to keep the width at H.
        if self.lstm.bidirectional:
            last_hidden = hidden[-2:].mean(dim=0)
        else:
            last_hidden = hidden[-1]

        # Project, view as a 1x1 map, then upsample once per target scale.
        climate_features = self.fc(last_hidden)
        climate_features = rearrange(climate_features, "n c -> n c 1 1")
        return [
            rearrange(
                upsample(climate_features),
                "(b t) c h w -> b t c h w",
                b=batch,
                t=steps,
            )
            for upsample in self.upsamples
        ]
class GatedFusion(nn.Module):
    """Blend image and climate feature maps with a learned sigmoid gate.

    The gate is computed from the channel-wise concatenation of both inputs
    and has ``img_channels`` channels. The output is
    ``gate * img_feat + (1 - gate) * clim_feat``.
    """

    def __init__(self, img_channels, clim_channels):
        super().__init__()
        stacked_channels = img_channels + clim_channels
        gate_layers = nn.Sequential(
            nn.Conv2d(stacked_channels, img_channels, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(img_channels, img_channels, kernel_size=1),
            nn.Sigmoid(),  # Gate values between 0 and 1
        )
        # The outer wrapper is kept on purpose: it is part of the parameter
        # names ("gate.0.<index>...") that existing checkpoints use.
        self.gate = nn.Sequential(gate_layers)

    def forward(self, img_feat, clim_feat):
        stacked = torch.cat([img_feat, clim_feat], dim=1)
        gate = self.gate(stacked)
        return gate * img_feat + (1 - gate) * clim_feat
-def _build_upsampler(
-    in_channels: int, target_channels: int, target_h: int
-) -> nn.Sequential:
-    layers = []
-    current_h = 1
-    # Expand to target channels early (e.g., 1x1 → 1x1 with target_channels)
-    layers += [nn.Conv2d(in_channels, target_channels, kernel_size=1), nn.GELU()]
-    # Upsample spatially to target_h
-    while current_h < target_h:
-        next_h = min(current_h * 2, target_h)
-        layers += [
-            nn.Upsample(scale_factor=2, mode="nearest"),
-            nn.Conv2d(target_channels, target_channels, kernel_size=3, padding=1),
-            nn.GELU(),
-        ]
-        current_h = next_h
-    return nn.Sequential(*layers)

 import numpy as np
 import timm
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from einops import rearrange
 from segmentation_models_pytorch.base import SegmentationHead
 from segmentation_models_pytorch.decoders.unet.decoder import UnetDecoder
 from timm.layers.create_act import create_act_layer
 from .convlstm import ConvLSTM
+class ACTU(nn.Module):
     def __init__(
         self,
+        in_channels,
+        kernel_size,
+        padding,
+        stride,
+        backbone: str,
         bias=True,
         batch_first=True,
         bidirectional=False,
         original_resolution=(256, 256),
+        act_layer: str = "sigmoid",
+        n_classes: int = 1,
         **kwargs,
     ):
+        super(ACTU, self).__init__()
         self.n_classes = n_classes
+        self.backbone = backbone
         self.encoder: nn.Module = timm.create_model(
+            backbone, features_only=True, in_chans=in_channels
         )
         with torch.no_grad():
+            embs = self.encoder.forward(
+                torch.randn(1, in_channels, *original_resolution)
             )
+            embs_shape = [e.shape for e in embs]
+        # The ConvLSTM expects inputs of shape (B, T, feature_dim, H_enc, W_enc)
+        # We assume the provided ConvLSTM code is available.
         self.convlstm = nn.ModuleList(
+            ConvLSTM(
+                in_channels=shape[1],
+                hidden_channels=shape[1],
+                kernel_size=kernel_size,
+                padding=padding,
+                stride=stride,
+                bias=bias,
+                batch_first=batch_first,
+                bidirectional=bidirectional,
             )
+            for shape in embs_shape
+        )
+        # If bidirectional, the hidden representation is concatenated from both directions.
+        n_upsamples = int(np.log2(original_resolution[0] / embs_shape[-1][-2]))
+        skip_channels_list = [shape[1] for shape in embs_shape[-(n_upsamples + 1) : -1]]
+        skip_channels_list = skip_channels_list[::-1]  # Reverse the list.
+        encoder_channels = [e[1] for e in embs_shape]
         self.decoder = UnetDecoder(
+            encoder_channels=[1, *encoder_channels],
+            decoder_channels=encoder_channels[::-1],
+            n_blocks=len(encoder_channels),
         )
         self.seg_head = nn.Sequential(
             SegmentationHead(
+                in_channels=encoder_channels[0],
+                out_channels=n_classes,
             ),
+            create_act_layer(act_layer, inplace=True),
         )
+        self.encoder_channels = encoder_channels
+        self.embs_shape = embs_shape
+    def forward(self, x: torch.Tensor, **kwargs):
+        size = x.size()[-2:]
+        # Process each time step through the encoder.
+        x = self._encode_images(x)
+        # Pass the encoded sequence through the ConvLSTM.
+        x = self._encode_timeseries(x)
+        return self._decode(x, size=size)
     def _encode_images(self, x: torch.Tensor) -> list[torch.Tensor]:
         B = x.size(0)
             trend_map, size=size, mode="bilinear", align_corners=False
         )
         return trend_map