diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..72d118862af448bf5262c64a961be6d19504877e --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +dav2_models +ckpts +mast3r +Metric3D +Depth-Anything-V2/metric_depth \ No newline at end of file diff --git a/Depth-Anything-V2/DA-2K.md b/Depth-Anything-V2/DA-2K.md new file mode 100644 index 0000000000000000000000000000000000000000..70290bc96c0201aa68e4e69f1c78c928dd61e497 --- /dev/null +++ b/Depth-Anything-V2/DA-2K.md @@ -0,0 +1,51 @@ +# DA-2K Evaluation Benchmark + +## Introduction + +![DA-2K](assets/DA-2K.png) + +DA-2K is proposed in [Depth Anything V2](https://depth-anything-v2.github.io) to evaluate the relative depth estimation capability. It encompasses eight representative scenarios of `indoor`, `outdoor`, `non_real`, `transparent_reflective`, `adverse_style`, `aerial`, `underwater`, and `object`. It consists of 1K diverse high-quality images and 2K precise pair-wise relative depth annotations. + +Please refer to our [paper](https://arxiv.org/abs/2406.09414) for details in constructing this benchmark. + + +## Usage + +Please first [download the benchmark](https://huggingface.co/datasets/depth-anything/DA-2K/tree/main). + +All annotations are stored in `annotations.json`. The annotation file is a JSON object where each key is the path to an image file, and the value is a list of annotations associated with that image. Each annotation describes two points and identifies which point is closer to the camera. The structure is detailed below: + +``` +{ + "image_path": [ + { + "point1": [h1, w1], # (vertical position, horizontal position) + "point2": [h2, w2], # (vertical position, horizontal position) + "closer_point": "point1" # we always set "point1" as the closer one + }, + ... + ], + ... +} +``` + +To visualize the annotations: +```bash +python visualize.py [--scene-type ] +``` + +**Options** +- `--scene-type ` (optional): Specify the scene type (`indoor`, `outdoor`, `non_real`, `transparent_reflective`, `adverse_style`, `aerial`, `underwater`, and `object`). Skip this argument or set as `""` to include all scene types. + +## Citation + +If you find this benchmark useful, please consider citing: + +```bibtex +@article{depth_anything_v2, + title={Depth Anything V2}, + author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Zhao, Zhen and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang}, + journal={arXiv:2406.09414}, + year={2024} +} +``` \ No newline at end of file diff --git a/Depth-Anything-V2/LICENSE b/Depth-Anything-V2/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/Depth-Anything-V2/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/Depth-Anything-V2/README.md b/Depth-Anything-V2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c3bb68dd1b8798ac17b02b9181c431953be1cfae --- /dev/null +++ b/Depth-Anything-V2/README.md @@ -0,0 +1,201 @@
+<div align="center">
+<h1>Depth Anything V2</h1>
+
+[**Lihe Yang**](https://liheyoung.github.io/)<sup>1</sup> · [**Bingyi Kang**](https://bingykang.github.io/)<sup>2†</sup> · [**Zilong Huang**](http://speedinghzl.github.io/)<sup>2</sup>
+<br>
+[**Zhen Zhao**](http://zhaozhen.me/) · [**Xiaogang Xu**](https://xiaogang00.github.io/) · [**Jiashi Feng**](https://sites.google.com/site/jshfeng/)<sup>2</sup> · [**Hengshuang Zhao**](https://hszhao.github.io/)<sup>1*</sup>
+
+<sup>1</sup>HKU&emsp;&emsp;&emsp;<sup>2</sup>TikTok
+<br>
+†project lead&emsp;*corresponding author
+
+[Paper PDF](https://arxiv.org/abs/2406.09414) &emsp; [Project Page](https://depth-anything-v2.github.io) &emsp; [Benchmark](https://huggingface.co/datasets/depth-anything/DA-2K)
+</div>
+ +This work presents Depth Anything V2. It significantly outperforms [V1](https://github.com/LiheYoung/Depth-Anything) in fine-grained details and robustness. Compared with SD-based models, it enjoys faster inference speed, fewer parameters, and higher depth accuracy. + +![teaser](assets/teaser.png) + + +## News +- **2025-01-22:** [Video Depth Anything](https://videodepthanything.github.io) has been released. It generates consistent depth maps for super-long videos (e.g., over 5 minutes). +- **2024-12-22:** [Prompt Depth Anything](https://promptda.github.io/) has been released. It supports 4K resolution metric depth estimation when low-res LiDAR is used to prompt the DA models. +- **2024-07-06:** Depth Anything V2 is supported in [Transformers](https://github.com/huggingface/transformers/). See the [instructions](https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything_v2) for convenient usage. +- **2024-06-25:** Depth Anything is integrated into [Apple Core ML Models](https://developer.apple.com/machine-learning/models/). See the instructions ([V1](https://huggingface.co/apple/coreml-depth-anything-small), [V2](https://huggingface.co/apple/coreml-depth-anything-v2-small)) for usage. +- **2024-06-22:** We release [smaller metric depth models](https://github.com/DepthAnything/Depth-Anything-V2/tree/main/metric_depth#pre-trained-models) based on Depth-Anything-V2-Small and Base. +- **2024-06-20:** Our repository and project page are flagged by GitHub and removed from the public for 6 days. Sorry for the inconvenience. +- **2024-06-14:** Paper, project page, code, models, demo, and benchmark are all released. + + +## Pre-trained Models + +We provide **four models** of varying scales for robust relative depth estimation: + +| Model | Params | Checkpoint | +|:-|-:|:-:| +| Depth-Anything-V2-Small | 24.8M | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Small/resolve/main/depth_anything_v2_vits.pth?download=true) | +| Depth-Anything-V2-Base | 97.5M | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Base/resolve/main/depth_anything_v2_vitb.pth?download=true) | +| Depth-Anything-V2-Large | 335.3M | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Large/resolve/main/depth_anything_v2_vitl.pth?download=true) | +| Depth-Anything-V2-Giant | 1.3B | Coming soon | + + +## Usage + +### Prepraration + +```bash +git clone https://github.com/DepthAnything/Depth-Anything-V2 +cd Depth-Anything-V2 +pip install -r requirements.txt +``` + +Download the checkpoints listed [here](#pre-trained-models) and put them under the `checkpoints` directory. 
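For reference, a minimal sketch of fetching one of the listed checkpoints into the `checkpoints` directory (using the Depth-Anything-V2-Small URL from the table above; the other checkpoints download the same way, and any download tool works equally well):

```python
# Hedged sketch: plain-HTTP download of the Small checkpoint into ./checkpoints.
# The URL is copied from the pre-trained models table above.
import os
import urllib.request

CKPT_URL = (
    "https://huggingface.co/depth-anything/Depth-Anything-V2-Small"
    "/resolve/main/depth_anything_v2_vits.pth?download=true"
)

os.makedirs("checkpoints", exist_ok=True)
target = os.path.join("checkpoints", "depth_anything_v2_vits.pth")
if not os.path.exists(target):
    urllib.request.urlretrieve(CKPT_URL, target)  # download once, skip if already present
print(f"checkpoint ready at {target}")
```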
+ +### Use our models +```python +import cv2 +import torch + +from depth_anything_v2.dpt import DepthAnythingV2 + +DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' + +model_configs = { + 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, + 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, + 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, + 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} +} + +encoder = 'vitl' # or 'vits', 'vitb', 'vitg' + +model = DepthAnythingV2(**model_configs[encoder]) +model.load_state_dict(torch.load(f'checkpoints/depth_anything_v2_{encoder}.pth', map_location='cpu')) +model = model.to(DEVICE).eval() + +raw_img = cv2.imread('your/image/path') +depth = model.infer_image(raw_img) # HxW raw depth map in numpy +``` + +If you do not want to clone this repository, you can also load our models through [Transformers](https://github.com/huggingface/transformers/). Below is a simple code snippet. Please refer to the [official page](https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything_v2) for more details. + +- Note 1: Make sure you can connect to Hugging Face and have installed the latest Transformers. +- Note 2: Due to the [upsampling difference](https://github.com/huggingface/transformers/pull/31522#issuecomment-2184123463) between OpenCV (we used) and Pillow (HF used), predictions may differ slightly. So you are more recommended to use our models through the way introduced above. +```python +from transformers import pipeline +from PIL import Image + +pipe = pipeline(task="depth-estimation", model="depth-anything/Depth-Anything-V2-Small-hf") +image = Image.open('your/image/path') +depth = pipe(image)["depth"] +``` + +### Running script on *images* + +```bash +python run.py \ + --encoder \ + --img-path --outdir \ + [--input-size ] [--pred-only] [--grayscale] +``` +Options: +- `--img-path`: You can either 1) point it to an image directory storing all interested images, 2) point it to a single image, or 3) point it to a text file storing all image paths. +- `--input-size` (optional): By default, we use input size `518` for model inference. ***You can increase the size for even more fine-grained results.*** +- `--pred-only` (optional): Only save the predicted depth map, without raw image. +- `--grayscale` (optional): Save the grayscale depth map, without applying color palette. + +For example: +```bash +python run.py --encoder vitl --img-path assets/examples --outdir depth_vis +``` + +### Running script on *videos* + +```bash +python run_video.py \ + --encoder \ + --video-path assets/examples_video --outdir video_depth_vis \ + [--input-size ] [--pred-only] [--grayscale] +``` + +***Our larger model has better temporal consistency on videos.*** + +### Gradio demo + +To use our gradio demo locally: + +```bash +python app.py +``` + +You can also try our [online demo](https://huggingface.co/spaces/Depth-Anything/Depth-Anything-V2). + +***Note: Compared to V1, we have made a minor modification to the DINOv2-DPT architecture (originating from this [issue](https://github.com/LiheYoung/Depth-Anything/issues/81)).*** In V1, we *unintentionally* used features from the last four layers of DINOv2 for decoding. 
In V2, we use [intermediate features](https://github.com/DepthAnything/Depth-Anything-V2/blob/2cbc36a8ce2cec41d38ee51153f112e87c8e42d8/depth_anything_v2/dpt.py#L164-L169) instead. Although this modification did not improve details or accuracy, we decided to follow this common practice. + + +## Fine-tuned to Metric Depth Estimation + +Please refer to [metric depth estimation](./metric_depth). + + +## DA-2K Evaluation Benchmark + +Please refer to [DA-2K benchmark](./DA-2K.md). + + +## Community Support + +**We sincerely appreciate all the community support for our Depth Anything series. Thank you a lot!** + +- Apple Core ML: + - https://developer.apple.com/machine-learning/models + - https://huggingface.co/apple/coreml-depth-anything-v2-small + - https://huggingface.co/apple/coreml-depth-anything-small +- Transformers: + - https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything_v2 + - https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything +- TensorRT: + - https://github.com/spacewalk01/depth-anything-tensorrt + - https://github.com/zhujiajian98/Depth-Anythingv2-TensorRT-python +- ONNX: https://github.com/fabio-sim/Depth-Anything-ONNX +- ComfyUI: https://github.com/kijai/ComfyUI-DepthAnythingV2 +- Transformers.js (real-time depth in web): https://huggingface.co/spaces/Xenova/webgpu-realtime-depth-estimation +- Android: + - https://github.com/shubham0204/Depth-Anything-Android + - https://github.com/FeiGeChuanShu/ncnn-android-depth_anything + + +## Acknowledgement + +We are sincerely grateful to the awesome Hugging Face team ([@Pedro Cuenca](https://huggingface.co/pcuenq), [@Niels Rogge](https://huggingface.co/nielsr), [@Merve Noyan](https://huggingface.co/merve), [@Amy Roberts](https://huggingface.co/amyeroberts), et al.) for their huge efforts in supporting our models in Transformers and Apple Core ML. + +We also thank the [DINOv2](https://github.com/facebookresearch/dinov2) team for contributing such impressive models to our community. + + +## LICENSE + +Depth-Anything-V2-Small model is under the Apache-2.0 license. Depth-Anything-V2-Base/Large/Giant models are under the CC-BY-NC-4.0 license. 
+ + +## Citation + +If you find this project useful, please consider citing: + +```bibtex +@article{depth_anything_v2, + title={Depth Anything V2}, + author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Zhao, Zhen and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang}, + journal={arXiv:2406.09414}, + year={2024} +} + +@inproceedings{depth_anything_v1, + title={Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data}, + author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang}, + booktitle={CVPR}, + year={2024} +} +``` diff --git a/Depth-Anything-V2/app.py b/Depth-Anything-V2/app.py new file mode 100644 index 0000000000000000000000000000000000000000..41399be3b316f38af653692fc9895d32b06e09f1 --- /dev/null +++ b/Depth-Anything-V2/app.py @@ -0,0 +1,88 @@ +import glob +import gradio as gr +import matplotlib +import numpy as np +from PIL import Image +import torch +import tempfile +from gradio_imageslider import ImageSlider + +from depth_anything_v2.dpt import DepthAnythingV2 + +css = """ +#img-display-container { + max-height: 100vh; +} +#img-display-input { + max-height: 80vh; +} +#img-display-output { + max-height: 80vh; +} +#download { + height: 62px; +} +""" +DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' +model_configs = { + 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, + 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, + 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, + 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} +} +encoder = 'vitl' +model = DepthAnythingV2(**model_configs[encoder]) +state_dict = torch.load(f'checkpoints/depth_anything_v2_{encoder}.pth', map_location="cpu") +model.load_state_dict(state_dict) +model = model.to(DEVICE).eval() + +title = "# Depth Anything V2" +description = """Official demo for **Depth Anything V2**. 
+Please refer to our [paper](https://arxiv.org/abs/2406.09414), [project page](https://depth-anything-v2.github.io), or [github](https://github.com/DepthAnything/Depth-Anything-V2) for more details.""" + +def predict_depth(image): + return model.infer_image(image) + +with gr.Blocks(css=css) as demo: + gr.Markdown(title) + gr.Markdown(description) + gr.Markdown("### Depth Prediction demo") + + with gr.Row(): + input_image = gr.Image(label="Input Image", type='numpy', elem_id='img-display-input') + depth_image_slider = ImageSlider(label="Depth Map with Slider View", elem_id='img-display-output', position=0.5) + submit = gr.Button(value="Compute Depth") + gray_depth_file = gr.File(label="Grayscale depth map", elem_id="download",) + raw_file = gr.File(label="16-bit raw output (can be considered as disparity)", elem_id="download",) + + cmap = matplotlib.colormaps.get_cmap('Spectral_r') + + def on_submit(image): + original_image = image.copy() + + h, w = image.shape[:2] + + depth = predict_depth(image[:, :, ::-1]) + + raw_depth = Image.fromarray(depth.astype('uint16')) + tmp_raw_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False) + raw_depth.save(tmp_raw_depth.name) + + depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 + depth = depth.astype(np.uint8) + colored_depth = (cmap(depth)[:, :, :3] * 255).astype(np.uint8) + + gray_depth = Image.fromarray(depth) + tmp_gray_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False) + gray_depth.save(tmp_gray_depth.name) + + return [(original_image, colored_depth), tmp_gray_depth.name, tmp_raw_depth.name] + + submit.click(on_submit, inputs=[input_image], outputs=[depth_image_slider, gray_depth_file, raw_file]) + + example_files = glob.glob('assets/examples/*') + examples = gr.Examples(examples=example_files, inputs=[input_image], outputs=[depth_image_slider, gray_depth_file, raw_file], fn=on_submit) + + +if __name__ == '__main__': + demo.queue().launch() \ No newline at end of file diff --git a/Depth-Anything-V2/depth_anything_v2/__pycache__/dinov2.cpython-310.pyc b/Depth-Anything-V2/depth_anything_v2/__pycache__/dinov2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e79d1218e1268c3168490b8439dd6d0d22540d43 Binary files /dev/null and b/Depth-Anything-V2/depth_anything_v2/__pycache__/dinov2.cpython-310.pyc differ diff --git a/Depth-Anything-V2/depth_anything_v2/__pycache__/dpt.cpython-310.pyc b/Depth-Anything-V2/depth_anything_v2/__pycache__/dpt.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b2dec39a45011d169ecbbcce2e2b903014af78ca Binary files /dev/null and b/Depth-Anything-V2/depth_anything_v2/__pycache__/dpt.cpython-310.pyc differ diff --git a/Depth-Anything-V2/depth_anything_v2/dinov2.py b/Depth-Anything-V2/depth_anything_v2/dinov2.py new file mode 100644 index 0000000000000000000000000000000000000000..83d250818c721c6df3b30d3f4352945527701615 --- /dev/null +++ b/Depth-Anything-V2/depth_anything_v2/dinov2.py @@ -0,0 +1,415 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +# References: +# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +from functools import partial +import math +import logging +from typing import Sequence, Tuple, Union, Callable + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn.init import trunc_normal_ + +from .dinov2_layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block + + +logger = logging.getLogger("dinov2") + + +def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module: + if not depth_first and include_root: + fn(module=module, name=name) + for child_name, child_module in module.named_children(): + child_name = ".".join((name, child_name)) if name else child_name + named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True) + if depth_first and include_root: + fn(module=module, name=name) + return module + + +class BlockChunk(nn.ModuleList): + def forward(self, x): + for b in self: + x = b(x) + return x + + +class DinoVisionTransformer(nn.Module): + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + ffn_bias=True, + proj_bias=True, + drop_path_rate=0.0, + drop_path_uniform=False, + init_values=None, # for layerscale: None or 0 => no layerscale + embed_layer=PatchEmbed, + act_layer=nn.GELU, + block_fn=Block, + ffn_layer="mlp", + block_chunks=1, + num_register_tokens=0, + interpolate_antialias=False, + interpolate_offset=0.1, + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + proj_bias (bool): enable bias for proj in attn if True + ffn_bias (bool): enable bias for ffn if True + drop_path_rate (float): stochastic depth rate + drop_path_uniform (bool): apply uniform drop rate across blocks + weight_init (str): weight init scheme + init_values (float): layer-scale init values + embed_layer (nn.Module): patch embedding layer + act_layer (nn.Module): MLP activation layer + block_fn (nn.Module): transformer block class + ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity" + block_chunks: (int) split block sequence into block_chunks units for FSDP wrap + num_register_tokens: (int) number of extra cls tokens (so-called "registers") + interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings + interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings + """ + super().__init__() + norm_layer = partial(nn.LayerNorm, eps=1e-6) + + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 1 + self.n_blocks = depth + self.num_heads = num_heads + self.patch_size = patch_size + self.num_register_tokens = num_register_tokens + self.interpolate_antialias = interpolate_antialias + self.interpolate_offset = interpolate_offset + + self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = 
nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + assert num_register_tokens >= 0 + self.register_tokens = ( + nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None + ) + + if drop_path_uniform is True: + dpr = [drop_path_rate] * depth + else: + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + + if ffn_layer == "mlp": + logger.info("using MLP layer as FFN") + ffn_layer = Mlp + elif ffn_layer == "swiglufused" or ffn_layer == "swiglu": + logger.info("using SwiGLU layer as FFN") + ffn_layer = SwiGLUFFNFused + elif ffn_layer == "identity": + logger.info("using Identity layer as FFN") + + def f(*args, **kwargs): + return nn.Identity() + + ffn_layer = f + else: + raise NotImplementedError + + blocks_list = [ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + ffn_bias=ffn_bias, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + ffn_layer=ffn_layer, + init_values=init_values, + ) + for i in range(depth) + ] + if block_chunks > 0: + self.chunked_blocks = True + chunked_blocks = [] + chunksize = depth // block_chunks + for i in range(0, depth, chunksize): + # this is to keep the block index consistent if we chunk the block list + chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize]) + self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks]) + else: + self.chunked_blocks = False + self.blocks = nn.ModuleList(blocks_list) + + self.norm = norm_layer(embed_dim) + self.head = nn.Identity() + + self.mask_token = nn.Parameter(torch.zeros(1, embed_dim)) + + self.init_weights() + + def init_weights(self): + trunc_normal_(self.pos_embed, std=0.02) + nn.init.normal_(self.cls_token, std=1e-6) + if self.register_tokens is not None: + nn.init.normal_(self.register_tokens, std=1e-6) + named_apply(init_weights_vit_timm, self) + + def interpolate_pos_encoding(self, x, w, h): + previous_dtype = x.dtype + npatch = x.shape[1] - 1 + N = self.pos_embed.shape[1] - 1 + if npatch == N and w == h: + return self.pos_embed + pos_embed = self.pos_embed.float() + class_pos_embed = pos_embed[:, 0] + patch_pos_embed = pos_embed[:, 1:] + dim = x.shape[-1] + w0 = w // self.patch_size + h0 = h // self.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + # DINOv2 with register modify the interpolate_offset from 0.1 to 0.0 + w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset + # w0, h0 = w0 + 0.1, h0 + 0.1 + + sqrt_N = math.sqrt(N) + sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2), + scale_factor=(sx, sy), + # (int(w0), int(h0)), # to solve the upsampling shape issue + mode="bicubic", + antialias=self.interpolate_antialias + ) + + assert int(w0) == patch_pos_embed.shape[-2] + assert int(h0) == patch_pos_embed.shape[-1] + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype) + + def prepare_tokens_with_masks(self, x, masks=None): + B, nc, w, h = x.shape + x = self.patch_embed(x) + if masks is not None: + x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x) + 
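+        # prepend the learnable class token, then add positional embeddings interpolated to the input resolution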
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + x = x + self.interpolate_pos_encoding(x, w, h) + + if self.register_tokens is not None: + x = torch.cat( + ( + x[:, :1], + self.register_tokens.expand(x.shape[0], -1, -1), + x[:, 1:], + ), + dim=1, + ) + + return x + + def forward_features_list(self, x_list, masks_list): + x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)] + for blk in self.blocks: + x = blk(x) + + all_x = x + output = [] + for x, masks in zip(all_x, masks_list): + x_norm = self.norm(x) + output.append( + { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1], + "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :], + "x_prenorm": x, + "masks": masks, + } + ) + return output + + def forward_features(self, x, masks=None): + if isinstance(x, list): + return self.forward_features_list(x, masks) + + x = self.prepare_tokens_with_masks(x, masks) + + for blk in self.blocks: + x = blk(x) + + x_norm = self.norm(x) + return { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1], + "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :], + "x_prenorm": x, + "masks": masks, + } + + def _get_intermediate_layers_not_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + # If n is an int, take the n last blocks. If it's a list, take them + output, total_block_len = [], len(self.blocks) + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n + for i, blk in enumerate(self.blocks): + x = blk(x) + if i in blocks_to_take: + output.append(x) + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def _get_intermediate_layers_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + output, i, total_block_len = [], 0, len(self.blocks[-1]) + # If n is an int, take the n last blocks. 
If it's a list, take them + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n + for block_chunk in self.blocks: + for blk in block_chunk[i:]: # Passing the nn.Identity() + x = blk(x) + if i in blocks_to_take: + output.append(x) + i += 1 + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def get_intermediate_layers( + self, + x: torch.Tensor, + n: Union[int, Sequence] = 1, # Layers or n last layers to take + reshape: bool = False, + return_class_token: bool = False, + norm=True + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]: + if self.chunked_blocks: + outputs = self._get_intermediate_layers_chunked(x, n) + else: + outputs = self._get_intermediate_layers_not_chunked(x, n) + if norm: + outputs = [self.norm(out) for out in outputs] + class_tokens = [out[:, 0] for out in outputs] + outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs] + if reshape: + B, _, w, h = x.shape + outputs = [ + out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous() + for out in outputs + ] + if return_class_token: + return tuple(zip(outputs, class_tokens)) + return tuple(outputs) + + def forward(self, *args, is_training=False, **kwargs): + ret = self.forward_features(*args, **kwargs) + if is_training: + return ret + else: + return self.head(ret["x_norm_clstoken"]) + + +def init_weights_vit_timm(module: nn.Module, name: str = ""): + """ViT weight initialization, original timm impl (for reproducibility)""" + if isinstance(module, nn.Linear): + trunc_normal_(module.weight, std=0.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + + +def vit_small(patch_size=16, num_register_tokens=0, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def vit_base(patch_size=16, num_register_tokens=0, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def vit_large(patch_size=16, num_register_tokens=0, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs): + """ + Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64 + """ + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1536, + depth=40, + num_heads=24, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def DINOv2(model_name): + model_zoo = { + "vits": vit_small, + "vitb": vit_base, + "vitl": vit_large, + "vitg": vit_giant2 + } + + return model_zoo[model_name]( + img_size=518, + patch_size=14, + init_values=1.0, + ffn_layer="mlp" if model_name != "vitg" else "swiglufused", + block_chunks=0, + num_register_tokens=0, + interpolate_antialias=False, + interpolate_offset=0.1 + ) diff --git a/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__init__.py 
b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8120f4bc83066cb3f825ce32daa3b437f88486f1 --- /dev/null +++ b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from .mlp import Mlp +from .patch_embed import PatchEmbed +from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused +from .block import NestedTensorBlock +from .attention import MemEffAttention diff --git a/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-310.pyc b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1533f5a4fdf332a2fc3013d4227d8169900c17e6 Binary files /dev/null and b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-310.pyc differ diff --git a/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-310.pyc b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6b19a9f74f7e048ae90772e30dd0225fe05712f Binary files /dev/null and b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-310.pyc differ diff --git a/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-310.pyc b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..317a93229610f3b4a6fc49427afcf42115029e64 Binary files /dev/null and b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-310.pyc differ diff --git a/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-310.pyc b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..585f592ff728e095394abfe75b47c424c36b3d21 Binary files /dev/null and b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-310.pyc differ diff --git a/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-310.pyc b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f55f0cf25f7ca65ed73908abb46a38f5653bf215 Binary files /dev/null and b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-310.pyc differ diff --git a/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-310.pyc b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0475a42ad40f86d91a81e04f023639689561646 Binary files /dev/null and b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-310.pyc differ diff --git a/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-310.pyc b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..063b7a7f57b0b38d5377380fa54fc7f092b4a339 Binary files /dev/null and 
b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-310.pyc differ diff --git a/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-310.pyc b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79c26c56f51052b4d06d0f0a5f2a1e8b9c897ded Binary files /dev/null and b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-310.pyc differ diff --git a/Depth-Anything-V2/depth_anything_v2/dinov2_layers/attention.py b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..815a2bf53dbec496f6a184ed7d03bcecb7124262 --- /dev/null +++ b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/attention.py @@ -0,0 +1,83 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +import logging + +from torch import Tensor +from torch import nn + + +logger = logging.getLogger("dinov2") + + +try: + from xformers.ops import memory_efficient_attention, unbind, fmha + + XFORMERS_AVAILABLE = True +except ImportError: + logger.warning("xFormers not available") + XFORMERS_AVAILABLE = False + + +class Attention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + proj_bias: bool = True, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + ) -> None: + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim, bias=proj_bias) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: Tensor) -> Tensor: + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + attn = q @ k.transpose(-2, -1) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class MemEffAttention(Attention): + def forward(self, x: Tensor, attn_bias=None) -> Tensor: + if not XFORMERS_AVAILABLE: + assert attn_bias is None, "xFormers is required for nested tensors usage" + return super().forward(x) + + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) + + q, k, v = unbind(qkv, 2) + + x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) + x = x.reshape([B, N, C]) + + x = self.proj(x) + x = self.proj_drop(x) + return x + + \ No newline at end of file diff --git a/Depth-Anything-V2/depth_anything_v2/dinov2_layers/block.py b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/block.py new file mode 100644 index 0000000000000000000000000000000000000000..25488f57cc0ad3c692f86b62555f6668e2a66db1 --- /dev/null +++ b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/block.py @@ -0,0 +1,252 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +import logging +from typing import Callable, List, Any, Tuple, Dict + +import torch +from torch import nn, Tensor + +from .attention import Attention, MemEffAttention +from .drop_path import DropPath +from .layer_scale import LayerScale +from .mlp import Mlp + + +logger = logging.getLogger("dinov2") + + +try: + from xformers.ops import fmha + from xformers.ops import scaled_index_add, index_select_cat + + XFORMERS_AVAILABLE = True +except ImportError: + logger.warning("xFormers not available") + XFORMERS_AVAILABLE = False + + +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + proj_bias: bool = True, + ffn_bias: bool = True, + drop: float = 0.0, + attn_drop: float = 0.0, + init_values=None, + drop_path: float = 0.0, + act_layer: Callable[..., nn.Module] = nn.GELU, + norm_layer: Callable[..., nn.Module] = nn.LayerNorm, + attn_class: Callable[..., nn.Module] = Attention, + ffn_layer: Callable[..., nn.Module] = Mlp, + ) -> None: + super().__init__() + # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}") + self.norm1 = norm_layer(dim) + self.attn = attn_class( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = ffn_layer( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + bias=ffn_bias, + ) + self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.sample_drop_ratio = drop_path + + def forward(self, x: Tensor) -> Tensor: + def attn_residual_func(x: Tensor) -> Tensor: + return self.ls1(self.attn(self.norm1(x))) + + def ffn_residual_func(x: Tensor) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + if self.training and self.sample_drop_ratio > 0.1: + # the overhead is compensated only for a drop path rate larger than 0.1 + x = drop_add_residual_stochastic_depth( + x, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + x = drop_add_residual_stochastic_depth( + x, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + elif self.training and self.sample_drop_ratio > 0.0: + x = x + self.drop_path1(attn_residual_func(x)) + x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2 + else: + x = x + attn_residual_func(x) + x = x + ffn_residual_func(x) + return x + + +def drop_add_residual_stochastic_depth( + x: Tensor, + residual_func: Callable[[Tensor], Tensor], + sample_drop_ratio: float = 0.0, +) -> Tensor: + # 1) extract subset using permutation + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + x_subset = x[brange] + + # 2) apply residual_func to get residual + residual = residual_func(x_subset) + + x_flat = x.flatten(1) + residual = 
residual.flatten(1) + + residual_scale_factor = b / sample_subset_size + + # 3) add the residual + x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor) + return x_plus_residual.view_as(x) + + +def get_branges_scales(x, sample_drop_ratio=0.0): + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + residual_scale_factor = b / sample_subset_size + return brange, residual_scale_factor + + +def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None): + if scaling_vector is None: + x_flat = x.flatten(1) + residual = residual.flatten(1) + x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor) + else: + x_plus_residual = scaled_index_add( + x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor + ) + return x_plus_residual + + +attn_bias_cache: Dict[Tuple, Any] = {} + + +def get_attn_bias_and_cat(x_list, branges=None): + """ + this will perform the index select, cat the tensors, and provide the attn_bias from cache + """ + batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list] + all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list)) + if all_shapes not in attn_bias_cache.keys(): + seqlens = [] + for b, x in zip(batch_sizes, x_list): + for _ in range(b): + seqlens.append(x.shape[1]) + attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens) + attn_bias._batch_sizes = batch_sizes + attn_bias_cache[all_shapes] = attn_bias + + if branges is not None: + cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1]) + else: + tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list) + cat_tensors = torch.cat(tensors_bs1, dim=1) + + return attn_bias_cache[all_shapes], cat_tensors + + +def drop_add_residual_stochastic_depth_list( + x_list: List[Tensor], + residual_func: Callable[[Tensor, Any], Tensor], + sample_drop_ratio: float = 0.0, + scaling_vector=None, +) -> Tensor: + # 1) generate random set of indices for dropping samples in the batch + branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list] + branges = [s[0] for s in branges_scales] + residual_scale_factors = [s[1] for s in branges_scales] + + # 2) get attention bias and index+concat the tensors + attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges) + + # 3) apply residual_func to get residual, and split the result + residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore + + outputs = [] + for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors): + outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x)) + return outputs + + +class NestedTensorBlock(Block): + def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]: + """ + x_list contains a list of tensors to nest together and run + """ + assert isinstance(self.attn, MemEffAttention) + + if self.training and self.sample_drop_ratio > 0.0: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.attn(self.norm1(x), attn_bias=attn_bias) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.mlp(self.norm2(x)) + + x_list = drop_add_residual_stochastic_depth_list( + x_list, + 
residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None, + ) + x_list = drop_add_residual_stochastic_depth_list( + x_list, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None, + ) + return x_list + else: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias)) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + attn_bias, x = get_attn_bias_and_cat(x_list) + x = x + attn_residual_func(x, attn_bias=attn_bias) + x = x + ffn_residual_func(x) + return attn_bias.split(x) + + def forward(self, x_or_x_list): + if isinstance(x_or_x_list, Tensor): + return super().forward(x_or_x_list) + elif isinstance(x_or_x_list, list): + assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage" + return self.forward_nested(x_or_x_list) + else: + raise AssertionError diff --git a/Depth-Anything-V2/depth_anything_v2/dinov2_layers/drop_path.py b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/drop_path.py new file mode 100644 index 0000000000000000000000000000000000000000..af05625984dd14682cc96a63bf0c97bab1f123b1 --- /dev/null +++ b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/drop_path.py @@ -0,0 +1,35 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py + + +from torch import nn + + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0: + random_tensor.div_(keep_prob) + output = x * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) diff --git a/Depth-Anything-V2/depth_anything_v2/dinov2_layers/layer_scale.py b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/layer_scale.py new file mode 100644 index 0000000000000000000000000000000000000000..ca5daa52bd81d3581adeb2198ea5b7dba2a3aea1 --- /dev/null +++ b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/layer_scale.py @@ -0,0 +1,28 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 + +from typing import Union + +import torch +from torch import Tensor +from torch import nn + + +class LayerScale(nn.Module): + def __init__( + self, + dim: int, + init_values: Union[float, Tensor] = 1e-5, + inplace: bool = False, + ) -> None: + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x: Tensor) -> Tensor: + return x.mul_(self.gamma) if self.inplace else x * self.gamma diff --git a/Depth-Anything-V2/depth_anything_v2/dinov2_layers/mlp.py b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/mlp.py new file mode 100644 index 0000000000000000000000000000000000000000..5e4b315f972f9a9f54aef1e4ef4e81b52976f018 --- /dev/null +++ b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/mlp.py @@ -0,0 +1,41 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py + + +from typing import Callable, Optional + +from torch import Tensor, nn + + +class Mlp(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = nn.GELU, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) + self.drop = nn.Dropout(drop) + + def forward(self, x: Tensor) -> Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x diff --git a/Depth-Anything-V2/depth_anything_v2/dinov2_layers/patch_embed.py b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/patch_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..574abe41175568d700a389b8b96d1ba554914779 --- /dev/null +++ b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/patch_embed.py @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +from typing import Callable, Optional, Tuple, Union + +from torch import Tensor +import torch.nn as nn + + +def make_2tuple(x): + if isinstance(x, tuple): + assert len(x) == 2 + return x + + assert isinstance(x, int) + return (x, x) + + +class PatchEmbed(nn.Module): + """ + 2D image to patch embedding: (B,C,H,W) -> (B,N,D) + + Args: + img_size: Image size. + patch_size: Patch token size. + in_chans: Number of input image channels. + embed_dim: Number of linear projection output channels. + norm_layer: Normalization layer. 
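+        flatten_embedding: If True (default), flatten patches to (B, N, D); otherwise keep the (B, H', W', D) grid layout.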
+ """ + + def __init__( + self, + img_size: Union[int, Tuple[int, int]] = 224, + patch_size: Union[int, Tuple[int, int]] = 16, + in_chans: int = 3, + embed_dim: int = 768, + norm_layer: Optional[Callable] = None, + flatten_embedding: bool = True, + ) -> None: + super().__init__() + + image_HW = make_2tuple(img_size) + patch_HW = make_2tuple(patch_size) + patch_grid_size = ( + image_HW[0] // patch_HW[0], + image_HW[1] // patch_HW[1], + ) + + self.img_size = image_HW + self.patch_size = patch_HW + self.patches_resolution = patch_grid_size + self.num_patches = patch_grid_size[0] * patch_grid_size[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.flatten_embedding = flatten_embedding + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x: Tensor) -> Tensor: + _, _, H, W = x.shape + patch_H, patch_W = self.patch_size + + assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" + assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" + + x = self.proj(x) # B C H W + H, W = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) # B HW C + x = self.norm(x) + if not self.flatten_embedding: + x = x.reshape(-1, H, W, self.embed_dim) # B H W C + return x + + def flops(self) -> float: + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops diff --git a/Depth-Anything-V2/depth_anything_v2/dinov2_layers/swiglu_ffn.py b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/swiglu_ffn.py new file mode 100644 index 0000000000000000000000000000000000000000..b3324b266fb0a50ccf8c3a0ede2ae10ac4dfa03e --- /dev/null +++ b/Depth-Anything-V2/depth_anything_v2/dinov2_layers/swiglu_ffn.py @@ -0,0 +1,63 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
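+
+# SwiGLU feed-forward (descriptive comment, added for clarity): w12 packs both input
+# projections into a single linear layer; the forward pass computes w3(F.silu(x1) * x2)
+# with x1, x2 = w12(x).chunk(2, dim=-1).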
+ +from typing import Callable, Optional + +from torch import Tensor, nn +import torch.nn.functional as F + + +class SwiGLUFFN(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) + self.w3 = nn.Linear(hidden_features, out_features, bias=bias) + + def forward(self, x: Tensor) -> Tensor: + x12 = self.w12(x) + x1, x2 = x12.chunk(2, dim=-1) + hidden = F.silu(x1) * x2 + return self.w3(hidden) + + +try: + from xformers.ops import SwiGLU + + XFORMERS_AVAILABLE = True +except ImportError: + SwiGLU = SwiGLUFFN + XFORMERS_AVAILABLE = False + + +class SwiGLUFFNFused(SwiGLU): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + out_features = out_features or in_features + hidden_features = hidden_features or in_features + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + super().__init__( + in_features=in_features, + hidden_features=hidden_features, + out_features=out_features, + bias=bias, + ) diff --git a/Depth-Anything-V2/depth_anything_v2/dpt.py b/Depth-Anything-V2/depth_anything_v2/dpt.py new file mode 100644 index 0000000000000000000000000000000000000000..df9f9db510d376d92ac09f5f0883c815ca7eb447 --- /dev/null +++ b/Depth-Anything-V2/depth_anything_v2/dpt.py @@ -0,0 +1,233 @@ +import cv2 +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchvision.transforms import Compose + +from .dinov2 import DINOv2 +from .util.blocks import FeatureFusionBlock, _make_scratch +from .util.transform import Resize, NormalizeImage, PrepareForNet + + +def _make_fusion_block(features, use_bn, size=None): + return FeatureFusionBlock( + features, + nn.ReLU(False), + deconv=False, + bn=use_bn, + expand=False, + align_corners=True, + size=size, + ) + + +class ConvBlock(nn.Module): + def __init__(self, in_feature, out_feature): + super().__init__() + + self.conv_block = nn.Sequential( + nn.Conv2d(in_feature, out_feature, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(out_feature), + nn.ReLU(True) + ) + + def forward(self, x): + return self.conv_block(x) + + +class DPTHead(nn.Module): + def __init__( + self, + in_channels, + features=256, + use_bn=False, + out_channels=[256, 512, 1024, 1024], + use_clstoken=False + ): + super(DPTHead, self).__init__() + + self.use_clstoken = use_clstoken + + self.projects = nn.ModuleList([ + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channel, + kernel_size=1, + stride=1, + padding=0, + ) for out_channel in out_channels + ]) + + self.resize_layers = nn.ModuleList([ + nn.ConvTranspose2d( + in_channels=out_channels[0], + out_channels=out_channels[0], + kernel_size=4, + stride=4, + padding=0), + nn.ConvTranspose2d( + in_channels=out_channels[1], + out_channels=out_channels[1], + kernel_size=2, + stride=2, + padding=0), + nn.Identity(), + nn.Conv2d( + in_channels=out_channels[3], + out_channels=out_channels[3], + kernel_size=3, + stride=2, + padding=1) + ]) + + if use_clstoken: + self.readout_projects = nn.ModuleList() + for _ in range(len(self.projects)): + self.readout_projects.append( + 
nn.Sequential( + nn.Linear(2 * in_channels, in_channels), + nn.GELU())) + + self.scratch = _make_scratch( + out_channels, + features, + groups=1, + expand=False, + ) + + self.scratch.stem_transpose = None + + self.scratch.refinenet1 = _make_fusion_block(features, use_bn) + self.scratch.refinenet2 = _make_fusion_block(features, use_bn) + self.scratch.refinenet3 = _make_fusion_block(features, use_bn) + self.scratch.refinenet4 = _make_fusion_block(features, use_bn) + + head_features_1 = features + head_features_2 = 32 + + self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1) + self.scratch.output_conv2 = nn.Sequential( + nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(True), + nn.Identity(), + ) + + def forward(self, out_features, patch_h, patch_w): + out = [] + for i, x in enumerate(out_features): + if self.use_clstoken: + x, cls_token = x[0], x[1] + readout = cls_token.unsqueeze(1).expand_as(x) + x = self.readout_projects[i](torch.cat((x, readout), -1)) + else: + x = x[0] + + x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w)) + + x = self.projects[i](x) + x = self.resize_layers[i](x) + + # project&resize 0: torch.Size([1, 256, 148, 216]) + # project&resize 1: torch.Size([1, 512, 74, 108]) + # project&resize 2: torch.Size([1, 1024, 37, 54]) + # project&resize 3: torch.Size([1, 1024, 19, 27]) + + out.append(x) + + layer_1, layer_2, layer_3, layer_4 = out + + layer_1_rn = self.scratch.layer1_rn(layer_1) + layer_2_rn = self.scratch.layer2_rn(layer_2) + layer_3_rn = self.scratch.layer3_rn(layer_3) + layer_4_rn = self.scratch.layer4_rn(layer_4) + + path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:]) + path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:]) + path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:]) + path_1 = self.scratch.refinenet1(path_2, layer_1_rn) + + out_fea = self.scratch.output_conv1(path_1) + # scratch.output_conv1: torch.Size([1, 128, 296, 432]) + out = F.interpolate(out_fea, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True) + # interpolate: torch.Size([1, 128, 518, 756]) + out = self.scratch.output_conv2(out) + # scratch.output_conv2: torch.Size([1, 1, 518, 756]) + + return out, out_fea + + +class DepthAnythingV2(nn.Module): + def __init__( + self, + encoder='vitl', + features=256, + out_channels=[256, 512, 1024, 1024], + use_bn=False, + use_clstoken=False + ): + super(DepthAnythingV2, self).__init__() + + self.intermediate_layer_idx = { + 'vits': [2, 5, 8, 11], + 'vitb': [2, 5, 8, 11], + 'vitl': [4, 11, 17, 23], + 'vitg': [9, 19, 29, 39] + } + + self.encoder = encoder + self.pretrained = DINOv2(model_name=encoder) + + self.depth_head = DPTHead(self.pretrained.embed_dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken) + + def forward(self, x): + patch_h, patch_w = x.shape[-2] // 14, x.shape[-1] // 14 + + # features 0: torch.Size([1, 1998, 1024]) torch.Size([1, 1024]) + # features 1: torch.Size([1, 1998, 1024]) torch.Size([1, 1024]) + # features 2: torch.Size([1, 1998, 1024]) torch.Size([1, 1024]) + # features 3: torch.Size([1, 1998, 1024]) torch.Size([1, 1024]) + features = self.pretrained.get_intermediate_layers(x, self.intermediate_layer_idx[self.encoder], return_class_token=True) + + depth, out_fea = self.depth_head(features, patch_h, 
patch_w) + depth = F.relu(depth) + + return depth, out_fea + + @torch.no_grad() + def infer_image(self, raw_image, input_size=518): + image, (h, w) = self.image2tensor(raw_image, input_size) + + depth = self.forward(image) + + depth = F.interpolate(depth[:, None], (h, w), mode="bilinear", align_corners=True)[0, 0] + + return depth.cpu().numpy() + + def image2tensor(self, raw_image, input_size=518): + transform = Compose([ + Resize( + width=input_size, + height=input_size, + resize_target=False, + keep_aspect_ratio=True, + ensure_multiple_of=14, + resize_method='lower_bound', + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + PrepareForNet(), + ]) + + h, w = raw_image.shape[:2] + + image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0 + + image = transform({'image': image})['image'] + image = torch.from_numpy(image).unsqueeze(0) + + DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' + image = image.to(DEVICE) + + return image, (h, w) diff --git a/Depth-Anything-V2/depth_anything_v2/util/__pycache__/blocks.cpython-310.pyc b/Depth-Anything-V2/depth_anything_v2/util/__pycache__/blocks.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80ba251d3f20c17e68c23af5819b8708a092cdcd Binary files /dev/null and b/Depth-Anything-V2/depth_anything_v2/util/__pycache__/blocks.cpython-310.pyc differ diff --git a/Depth-Anything-V2/depth_anything_v2/util/__pycache__/transform.cpython-310.pyc b/Depth-Anything-V2/depth_anything_v2/util/__pycache__/transform.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e14c14973ba18a7a7975fa7093bd038f0e33faa Binary files /dev/null and b/Depth-Anything-V2/depth_anything_v2/util/__pycache__/transform.cpython-310.pyc differ diff --git a/Depth-Anything-V2/depth_anything_v2/util/blocks.py b/Depth-Anything-V2/depth_anything_v2/util/blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..382ea183a40264056142afffc201c992a2b01d37 --- /dev/null +++ b/Depth-Anything-V2/depth_anything_v2/util/blocks.py @@ -0,0 +1,148 @@ +import torch.nn as nn + + +def _make_scratch(in_shape, out_shape, groups=1, expand=False): + scratch = nn.Module() + + out_shape1 = out_shape + out_shape2 = out_shape + out_shape3 = out_shape + if len(in_shape) >= 4: + out_shape4 = out_shape + + if expand: + out_shape1 = out_shape + out_shape2 = out_shape * 2 + out_shape3 = out_shape * 4 + if len(in_shape) >= 4: + out_shape4 = out_shape * 8 + + scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) + scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) + scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) + if len(in_shape) >= 4: + scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) + + return scratch + + +class ResidualConvUnit(nn.Module): + """Residual convolution module. + """ + + def __init__(self, features, activation, bn): + """Init. 
+ + Args: + features (int): number of features + """ + super().__init__() + + self.bn = bn + + self.groups=1 + + self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups) + + self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups) + + if self.bn == True: + self.bn1 = nn.BatchNorm2d(features) + self.bn2 = nn.BatchNorm2d(features) + + self.activation = activation + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + + out = self.activation(x) + out = self.conv1(out) + if self.bn == True: + out = self.bn1(out) + + out = self.activation(out) + out = self.conv2(out) + if self.bn == True: + out = self.bn2(out) + + if self.groups > 1: + out = self.conv_merge(out) + + return self.skip_add.add(out, x) + + +class FeatureFusionBlock(nn.Module): + """Feature fusion block. + """ + + def __init__( + self, + features, + activation, + deconv=False, + bn=False, + expand=False, + align_corners=True, + size=None + ): + """Init. + + Args: + features (int): number of features + """ + super(FeatureFusionBlock, self).__init__() + + self.deconv = deconv + self.align_corners = align_corners + + self.groups=1 + + self.expand = expand + out_features = features + if self.expand == True: + out_features = features // 2 + + self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1) + + self.resConfUnit1 = ResidualConvUnit(features, activation, bn) + self.resConfUnit2 = ResidualConvUnit(features, activation, bn) + + self.skip_add = nn.quantized.FloatFunctional() + + self.size=size + + def forward(self, *xs, size=None): + """Forward pass. + + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + res = self.resConfUnit1(xs[1]) + output = self.skip_add.add(output, res) + + output = self.resConfUnit2(output) + + if (size is None) and (self.size is None): + modifier = {"scale_factor": 2} + elif size is None: + modifier = {"size": self.size} + else: + modifier = {"size": size} + + output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners) + + output = self.out_conv(output) + + return output diff --git a/Depth-Anything-V2/depth_anything_v2/util/transform.py b/Depth-Anything-V2/depth_anything_v2/util/transform.py new file mode 100644 index 0000000000000000000000000000000000000000..b14aacd44ea086b01725a9ca68bb49eadcf37d73 --- /dev/null +++ b/Depth-Anything-V2/depth_anything_v2/util/transform.py @@ -0,0 +1,158 @@ +import numpy as np +import cv2 + + +class Resize(object): + """Resize sample to given size (width, height). + """ + + def __init__( + self, + width, + height, + resize_target=True, + keep_aspect_ratio=False, + ensure_multiple_of=1, + resize_method="lower_bound", + image_interpolation_method=cv2.INTER_AREA, + ): + """Init. + + Args: + width (int): desired output width + height (int): desired output height + resize_target (bool, optional): + True: Resize the full sample (image, mask, target). + False: Resize image only. + Defaults to True. + keep_aspect_ratio (bool, optional): + True: Keep the aspect ratio of the input sample. + Output sample might not have the given width and height, and + resize behaviour depends on the parameter 'resize_method'. + Defaults to False. + ensure_multiple_of (int, optional): + Output width and height is constrained to be multiple of this parameter. + Defaults to 1. 
+ resize_method (str, optional): + "lower_bound": Output will be at least as large as the given size. + "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.) + "minimal": Scale as least as possible. (Output size might be smaller than given size.) + Defaults to "lower_bound". + """ + self.__width = width + self.__height = height + + self.__resize_target = resize_target + self.__keep_aspect_ratio = keep_aspect_ratio + self.__multiple_of = ensure_multiple_of + self.__resize_method = resize_method + self.__image_interpolation_method = image_interpolation_method + + def constrain_to_multiple_of(self, x, min_val=0, max_val=None): + y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if max_val is not None and y > max_val: + y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if y < min_val: + y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int) + + return y + + def get_size(self, width, height): + # determine new height and width + scale_height = self.__height / height + scale_width = self.__width / width + + if self.__keep_aspect_ratio: + if self.__resize_method == "lower_bound": + # scale such that output size is lower bound + if scale_width > scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "upper_bound": + # scale such that output size is upper bound + if scale_width < scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "minimal": + # scale as least as possbile + if abs(1 - scale_width) < abs(1 - scale_height): + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + else: + raise ValueError(f"resize_method {self.__resize_method} not implemented") + + if self.__resize_method == "lower_bound": + new_height = self.constrain_to_multiple_of(scale_height * height, min_val=self.__height) + new_width = self.constrain_to_multiple_of(scale_width * width, min_val=self.__width) + elif self.__resize_method == "upper_bound": + new_height = self.constrain_to_multiple_of(scale_height * height, max_val=self.__height) + new_width = self.constrain_to_multiple_of(scale_width * width, max_val=self.__width) + elif self.__resize_method == "minimal": + new_height = self.constrain_to_multiple_of(scale_height * height) + new_width = self.constrain_to_multiple_of(scale_width * width) + else: + raise ValueError(f"resize_method {self.__resize_method} not implemented") + + return (new_width, new_height) + + def __call__(self, sample): + width, height = self.get_size(sample["image"].shape[1], sample["image"].shape[0]) + + # resize sample + sample["image"] = cv2.resize(sample["image"], (width, height), interpolation=self.__image_interpolation_method) + + if self.__resize_target: + if "depth" in sample: + sample["depth"] = cv2.resize(sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST) + + if "mask" in sample: + sample["mask"] = cv2.resize(sample["mask"].astype(np.float32), (width, height), interpolation=cv2.INTER_NEAREST) + + return sample + + +class NormalizeImage(object): + """Normlize image by given mean and std. 
+ """ + + def __init__(self, mean, std): + self.__mean = mean + self.__std = std + + def __call__(self, sample): + sample["image"] = (sample["image"] - self.__mean) / self.__std + + return sample + + +class PrepareForNet(object): + """Prepare sample for usage as network input. + """ + + def __init__(self): + pass + + def __call__(self, sample): + image = np.transpose(sample["image"], (2, 0, 1)) + sample["image"] = np.ascontiguousarray(image).astype(np.float32) + + if "depth" in sample: + depth = sample["depth"].astype(np.float32) + sample["depth"] = np.ascontiguousarray(depth) + + if "mask" in sample: + sample["mask"] = sample["mask"].astype(np.float32) + sample["mask"] = np.ascontiguousarray(sample["mask"]) + + return sample \ No newline at end of file diff --git a/Depth-Anything-V2/requirements.txt b/Depth-Anything-V2/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8698f1bc9fd4f6e98147c042d6c978e343b0f494 --- /dev/null +++ b/Depth-Anything-V2/requirements.txt @@ -0,0 +1,6 @@ +gradio_imageslider +gradio==4.29.0 +matplotlib +opencv-python +torch +torchvision diff --git a/Depth-Anything-V2/run.py b/Depth-Anything-V2/run.py new file mode 100644 index 0000000000000000000000000000000000000000..14810ff39a17e60354e2199d62708b197e97d750 --- /dev/null +++ b/Depth-Anything-V2/run.py @@ -0,0 +1,73 @@ +import argparse +import cv2 +import glob +import matplotlib +import numpy as np +import os +import torch + +from depth_anything_v2.dpt import DepthAnythingV2 + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Depth Anything V2') + + parser.add_argument('--img-path', type=str) + parser.add_argument('--input-size', type=int, default=518) + parser.add_argument('--outdir', type=str, default='./vis_depth') + + parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg']) + + parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction') + parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette') + + args = parser.parse_args() + + DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' + + model_configs = { + 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, + 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, + 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, + 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} + } + + depth_anything = DepthAnythingV2(**model_configs[args.encoder]) + depth_anything.load_state_dict(torch.load(f'checkpoints/depth_anything_v2_{args.encoder}.pth', map_location='cpu')) + depth_anything = depth_anything.to(DEVICE).eval() + + if os.path.isfile(args.img_path): + if args.img_path.endswith('txt'): + with open(args.img_path, 'r') as f: + filenames = f.read().splitlines() + else: + filenames = [args.img_path] + else: + filenames = glob.glob(os.path.join(args.img_path, '**/*'), recursive=True) + + os.makedirs(args.outdir, exist_ok=True) + + cmap = matplotlib.colormaps.get_cmap('Spectral_r') + + for k, filename in enumerate(filenames): + print(f'Progress {k+1}/{len(filenames)}: {filename}') + + raw_image = cv2.imread(filename) + + depth = depth_anything.infer_image(raw_image, args.input_size) + + depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 + depth = depth.astype(np.uint8) + + if 
args.grayscale: + depth = np.repeat(depth[..., np.newaxis], 3, axis=-1) + else: + depth = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8) + + if args.pred_only: + cv2.imwrite(os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.png'), depth) + else: + split_region = np.ones((raw_image.shape[0], 50, 3), dtype=np.uint8) * 255 + combined_result = cv2.hconcat([raw_image, split_region, depth]) + + cv2.imwrite(os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.png'), combined_result) \ No newline at end of file diff --git a/Depth-Anything-V2/run_video.py b/Depth-Anything-V2/run_video.py new file mode 100644 index 0000000000000000000000000000000000000000..cc3c5b6a33585396d91aaaa4301110a6179f452d --- /dev/null +++ b/Depth-Anything-V2/run_video.py @@ -0,0 +1,92 @@ +import argparse +import cv2 +import glob +import matplotlib +import numpy as np +import os +import torch + +from depth_anything_v2.dpt import DepthAnythingV2 + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Depth Anything V2') + + parser.add_argument('--video-path', type=str) + parser.add_argument('--input-size', type=int, default=518) + parser.add_argument('--outdir', type=str, default='./vis_video_depth') + + parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg']) + + parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction') + parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette') + + args = parser.parse_args() + + DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' + + model_configs = { + 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, + 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, + 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, + 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} + } + + depth_anything = DepthAnythingV2(**model_configs[args.encoder]) + depth_anything.load_state_dict(torch.load(f'checkpoints/depth_anything_v2_{args.encoder}.pth', map_location='cpu')) + depth_anything = depth_anything.to(DEVICE).eval() + + if os.path.isfile(args.video_path): + if args.video_path.endswith('txt'): + with open(args.video_path, 'r') as f: + lines = f.read().splitlines() + else: + filenames = [args.video_path] + else: + filenames = glob.glob(os.path.join(args.video_path, '**/*'), recursive=True) + + os.makedirs(args.outdir, exist_ok=True) + + margin_width = 50 + cmap = matplotlib.colormaps.get_cmap('Spectral_r') + + for k, filename in enumerate(filenames): + print(f'Progress {k+1}/{len(filenames)}: {filename}') + + raw_video = cv2.VideoCapture(filename) + frame_width, frame_height = int(raw_video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(raw_video.get(cv2.CAP_PROP_FRAME_HEIGHT)) + frame_rate = int(raw_video.get(cv2.CAP_PROP_FPS)) + + if args.pred_only: + output_width = frame_width + else: + output_width = frame_width * 2 + margin_width + + output_path = os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.mp4') + out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (output_width, frame_height)) + + while raw_video.isOpened(): + ret, raw_frame = raw_video.read() + if not ret: + break + + depth = depth_anything.infer_image(raw_frame, args.input_size) + + 
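+            # normalize the predicted depth map to [0, 255] for colorization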
depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 + depth = depth.astype(np.uint8) + + if args.grayscale: + depth = np.repeat(depth[..., np.newaxis], 3, axis=-1) + else: + depth = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8) + + if args.pred_only: + out.write(depth) + else: + split_region = np.ones((frame_height, margin_width, 3), dtype=np.uint8) * 255 + combined_frame = cv2.hconcat([raw_frame, split_region, depth]) + + out.write(combined_frame) + + raw_video.release() + out.release() diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..14d0651e7d7c341e1ce2423c5b6e536fbc99d949 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Princeton Vision & Learning Lab + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md index ee0588231a839b13f221752c0a8bfba3e03d4e26..9bff6cb2c8227f11bf4e9becb1384feb7dc9df5d 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,8 @@ --- -title: >- - Diving Into The Fusion Of Monocular Priors For Generalized Stereo Matching - Demo -emoji: 🏆 -colorFrom: indigo -colorTo: blue +title: Diving Into The Fusion Of Monocular Priors For Generalized Stereo Matching +emoji: 😻 +colorFrom: red +colorTo: indigo sdk: gradio sdk_version: 5.38.0 app_file: app.py @@ -12,3 +10,368 @@ pinned: false --- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# [ICCV25] Diving into the Fusion of Monocular Priors for Generalized Stereo Matching + +Detailed images can be found at [Google Driver](https://drive.google.com/file/d/1u2u_-AgxkdtnkQENEf1d2JjtutwrtCPb/view?usp=sharing) + + + + + +## Requirements +```Shell +conda env create -f envs/environment_GStereo.yaml +conda activate raftstereo +``` + + +## Required Data +```Shell +├── datasets + ├── sceneflow + ├── driving + │   ├── disparity + │   ├── frames_cleanpass + │   └── frames_finalpass + ├── flying3d + │   ├── disparity + │   ├── frames_cleanpass + │   └── frames_finalpass + └── monkaa + ├── disparity + ├── frames_cleanpass + └── frames_finalpass + ├── Kitti15 + ├── testing + │   ├── image_2 + │   └── image_3 + └── training + ├── disp_noc_0 + ├── disp_noc_1 + ├── disp_occ_0 + ├── disp_occ_1 + ├── flow_noc + ├── flow_occ + ├── image_2 + ├── image_3 + └── obj_map + ├── Kitti12 + ├── testing + │   ├── calib + │   ├── colored_0 + │   ├── colored_1 + │   ├── disp_noc + │   ├── disp_occ + │   ├── flow_noc + │   ├── flow_occ + │   ├── image_0 + │   └── image_1 + └── training + ├── calib + ├── colored_0 + └── colored_1 + ├── Middlebury + └── MiddEval3 + ├── testF + ├── testH + ├── testQ + ├── trainingF + ├── trainingH + └── trainingQ + ├── ETH3D + ├── two_view_testing + └── two_view_training +    ├── delivery_area_1l +    ├── delivery_area_1s +    ├── delivery_area_2l + ├── Booster + ├── test + │   ├── balanced + │   └── unbalanced + └── train + ├── balanced + └── unbalanced +``` + + + +## Code +All codes are provided here, including DepthAnything v2. +Since we modified `dpt.py` to get intermediate features and depth output, please use the modified code. + + +- ### Training + All training script is presented in [script/train_stereo_raftstereo.sh](script/train_stereo_raftstereo.sh) and [script/train_stereo_raftstereo_depthany.sh](script/train_stereo_raftstereo_depthany.sh). + Please specify the following variable in scripts before training. + | variable | meaning | + |---------------|----------------------| + | `NCCL_P2P_DISABLE` | We set `NCCL_P2P_DISABLE=1` as the distributed training went wrong at our `A40` GPU. | + | `CUDA_VISIBLE_DEVICES` | avaliable GPU id, e.g., `CUDA_VISIBLE_DEVICES=0,1,2,3` | + | `DATASET_ROOT` | the training dataset path, e.g., `./datasets/sceneflow` | + | `LOG_ROOT` | path to save log file | + | `TB_ROOT` | path to save tensorboard data | + | `CKPOINT_ROOT` | path to save checkpoint | + + + In order to reproduce our results, please download `depth_anything_v2_vitl.pth` from DepthAnything v2 before training and specify `--depthany_model_dir` in script shell to path of directory where `depth_anything_v2_vitl.pth` is saved. Here, we do not provide the link as it maybe conflicts to the CVPR guideline. + We also explain the code for ablation study, in which each experiment is mostly controlled by the `--model_name` used in the training shell. 
+ | `--model_name` | meaning | + |-----------------|-------------------------| + | `RaftStereo` | Original RaftStereo model | + | `RaftStereoDisp` | The output of GRU is a single channel for disparity instead of two channels for optical flow, `Baseline` in Table 3 of the main text. | + | `RAFTStereoMast3r` | The pre-trained MASt3R is used as the backbone, and its features are used for cost volume construction, `RaftStereo + backbone Mast3r` in supplemental text. | + | `RaftStereoNoCTX` | RaftStereo model without context network, `Baseline w/o mono feature` in Table 3 of the main text. | + | `RAFTStereoDepthAny` | RaftStereo model with our monocular encoder, `Baseline + ME` in Table 3 of the main text. | + | `RAFTStereoDepthFusion` | RaftStereo model with our monocular encoder, `Baseline + ME + IDF` in Table 3 of the main text. | + | `RAFTStereoDepthBeta` | RaftStereo model with our monocular encoder and iterative local fusion, `Baseline + ME + ILF` in Table 3 of the main text. | + | `RAFTStereoDepthBetaNoLBP` | RaftStereo model with our monocular encoder and iterative local fusion without LBPEncoder, `L(6)` and `L(7)` in Table 4 of the main text. | + | `RAFTStereoDepthMatch` | RaftStereo model with DepthAnything v2 as feature extractor for cost volume construction, `RaftStereo + backbone DepthAnything` in the supplemental text. | + | `RAFTStereoDepthPostFusion` | RaftStereo model with our monocular encoder, iterative local fusion and post fusion, `Baseline + ME + PF` in Table 3 of the main text. | + | `RAFTStereoDepthBetaRefine` | RaftStereo model with our monocular encoder, iterative local fusion, and global fusion, `Baseline + ME + ILF + GF` in Table 3 of the main text. | + + + | variable | meaning | + |--------------------------|-------------------------| + | `--lbp_neighbor_offsets` | control `LBP Kernel` used in Table 4 of the main text. | + | `--modulation_ratio` | control `r` amplitude parameter used in Table 4 of the main text. | + | `--conf_from_fea` | `Cost` or `Hybrid` for `Confidence` used in Table 4 of the main text. | + | `--refine_pool` | learning registration parameters via pooling in the supplemental text. | + + + The training is launched by following + ```Shell + bash ./script/train_stereo_raftstereo_depthany.sh EXP_NAME + ``` + `EXP_NAME` specifies the experiment name. We use this name to save each log file, tensorboard data, and checkpoint for different experiments. The corresponding file structure is as follows + ```Shell + ├── runs +    ├── ckpoint + │ ├── RaftStereoDepthAny + │ ├── RaftStereoMast3r + │ └── RaftStereoNoCTX +    ├── log + │ ├── RaftStereoDepthAny + │ ├── RaftStereoMast3r + │ └── RaftStereoNoCTX +    └── tboard + ├── RaftStereoDepthAny + ├── RaftStereoMast3r + └── RaftStereoNoCTX + ``` + > ⚠️ **Warning**: **Please follow the training process mentioned in our main text.** We first train the model without the global fusion module. Then, we train the monocular registration of the global fusion module while keeping the other modules frozen with a well-trained model from the first stage. Finally, we train the entire global fusion module while keeping the other modules frozen with a well-trained model from the second stage. + +- ### Evaluation + The evaluation script is presented in [script/evaluate_stereo_raftstereo.sh](script/evaluate_stereo_raftstereo.sh). + We use `--test_exp_name` to specify the evaluation experiment name. + The results of each experiment are restored in `LOG_ROOT/eval.xlsx`. 
  We also merge the results of all experiments into `LOG_ROOT/merged_eval.xlsx` by running `python3 merge_sheet.py`.
+  The evaluation metrics are identical across all methods.
+  The `mean ± std` statistics are computed via [tools/get_statistics.py](tools/get_statistics.py).
+
+- ### Visualization
+  We visualize error maps via [script/gen_sample_stereo_raftstereo.sh](script/gen_sample_stereo_raftstereo.sh) and intermediate results via [script/vis_inter_stereo_raftstereo.sh](script/vis_inter_stereo_raftstereo.sh).
+  We provide an easy-to-use visualization toolbox for inspecting each module.
+
+- ### Demo
+  The model weights, pre-trained on SceneFlow, can be downloaded from [Google Drive](https://drive.google.com/file/d/1T1o7soh3p4C_tHzmUd0ZCtnQbVczPmXz/view?usp=sharing).
+  The demo for inferring disparity maps from custom image pairs is provided in `infer_stereo_raftstereo.py`; for usage, please refer to `script/infer_stereo_raftstereo.sh`.
+
+
+## More Results
+The following results were obtained after training with our custom synthetic [Trans Dataset](https://github.com/BFZD233/TranScene), which is built for multi-label transparent scenes.
+
+**Booster (ALL / Trans / No_Trans regions):**
+
+| Method | Subset | EPE | RMSE | 2px | 3px | 5px | 6px | 8px |
+|--------|--------|-----|------|-----|-----|-----|-----|-----|
+| Ours | ALL | 2.26 | 5.60 | 11.02 | 8.59 | 6.60 | 6.00 | 5.35 |
+| Ours | Trans | 7.93 | 11.03 | 59.83 | 50.36 | 38.44 | 33.87 | 27.56 |
+| Ours | No_Trans | 1.52 | 3.93 | 6.98 | 4.97 | 3.64 | 3.27 | 2.89 |
+| Ours+Trans | ALL | 1.24 | 4.19 | 7.91 | 5.97 | 4.52 | 4.08 | 3.44 |
+| Ours+Trans | Trans | 5.67 | 8.42 | 46.78 | 38.55 | 28.65 | 25.41 | 21.30 |
+| Ours+Trans | No_Trans | 0.75 | 3.07 | 4.77 | 3.23 | 2.29 | 2.01 | 1.59 |
+
+**Booster (per class):**
+
+| Method | Class | EPE | RMSE | 2px | 3px | 5px | 6px | 8px |
+|--------|-------|-----|------|-----|-----|-----|-----|-----|
+| Ours | Class 0 | 0.79 | 3.02 | 5.90 | 4.57 | 3.17 | 2.58 | 1.45 |
+| Ours | Class 1 | 1.53 | 4.70 | 12.67 | 7.80 | 4.88 | 3.96 | 3.14 |
+| Ours | Class 2 | 5.32 | 6.39 | 23.34 | 17.62 | 13.50 | 12.80 | 12.15 |
+| Ours | Class 3 | 7.93 | 11.03 | 59.83 | 50.36 | 38.44 | 33.87 | 27.56 |
+| Ours+Trans | Class 0 | 0.75 | 2.99 | 5.15 | 4.08 | 3.00 | 2.59 | 1.73 |
+| Ours+Trans | Class 1 | 1.40 | 4.74 | 9.17 | 5.63 | 3.80 | 3.37 | 2.86 |
+| Ours+Trans | Class 2 | 1.62 | 2.26 | 13.51 | 10.23 | 7.40 | 6.50 | 4.93 |
+| Ours+Trans | Class 3 | 5.67 | 8.42 | 46.78 | 38.55 | 28.65 | 25.41 | 21.30 |
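+
+For programmatic use, the snippet below is a minimal sketch of running the pre-trained checkpoint on a stereo pair. It mirrors the `predict` function of the Gradio demo in `app.py`; `args` stands for the argument namespace built by the parser in `app.py`, the image paths are placeholders, the weights are assumed to be saved at `./ckpts/diving_stereo.pth` (the default in `app.py`), and a single CUDA device is assumed.
+
+```python
+import cv2
+import torch
+from core.raft_stereo_depthbeta_refine import RAFTStereoDepthBetaRefine
+from core.utils.utils import InputPadder
+
+# `args` is the argparse.Namespace defined in app.py (with conf_from_fea=True, eval=True).
+model = torch.nn.DataParallel(RAFTStereoDepthBetaRefine(args), device_ids=[0]).cuda()
+ckpt = torch.load('./ckpts/diving_stereo.pth', map_location='cpu')
+# drop the LBP conv weights, as done in app.py, and load the remaining parameters
+model.load_state_dict({k: v for k, v in ckpt.items() if 'lbp_encoder.lbp_conv' not in k}, strict=False)
+model.eval()
+
+left = cv2.cvtColor(cv2.imread('left.png'), cv2.COLOR_BGR2RGB)
+right = cv2.cvtColor(cv2.imread('right.png'), cv2.COLOR_BGR2RGB)
+left = torch.from_numpy(left).permute(2, 0, 1).float()[None].cuda()
+right = torch.from_numpy(right).permute(2, 0, 1).float()[None].cuda()
+
+with torch.no_grad():
+    # pad to a multiple of 32, run the model, and take the last disparity prediction
+    padder = InputPadder(left.shape, divis_by=32)
+    left, right = padder.pad(left, right)
+    out = model(left, right, iters=32, test_mode=False, vis_mode=True)
+    disp = padder.unpad(out['disp_predictions'][-1].abs().cpu().numpy()).squeeze()
+```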
+ diff --git a/abs_cost/abs_cost_kernel.cu b/abs_cost/abs_cost_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..58c29ee1b45be474864eda4b053721b8fad1973e --- /dev/null +++ b/abs_cost/abs_cost_kernel.cu @@ -0,0 +1,191 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define BLOCK 16 + +// (B,H,W1,C) (B,H,W2,C) -> (B,H,W1,W2) + +__forceinline__ __device__ bool within_bounds(int h, int w1, int w2, int H, int W1, int W2) { + return h >= 0 && h < H && w1 >= 0 && w1 < W1 && w2 >= 0 && w2 < W2; +} + +template +__global__ void absolute_difference_forward_kernel( + const torch::PackedTensorAccessor32 fmap1, + const torch::PackedTensorAccessor32 fmap2, + torch::PackedTensorAccessor32 result) +{ + const int C = fmap1.size(3); + const int H = fmap1.size(1); + const int W1 = fmap1.size(2); + const int W2 = fmap2.size(2); + + // 获取当前线程的索引 + const int w1 = blockIdx.x * blockDim.x + threadIdx.x; + const int w2 = blockIdx.y * blockDim.y + threadIdx.y; + const int h = blockIdx.z % H; + const int b = blockIdx.z / H; + + if (!within_bounds(h, w1, w2, H, W1, W2)) { + return; + } + + scalar_t sum = 0.0; + for (int c = 0; i < C; ++c) { + scalar_t diff = fabs(fmap1[b][h][w1][c] - fmap2[b][h][w2][c]); + sum += diff; + } + + result[b][h][w1][w2] = sum; +} + +template +__global__ void absolute_difference_backward_kernel_fmap1( + const torch::PackedTensorAccessor32 fmap1, + const torch::PackedTensorAccessor32 fmap2, + const torch::PackedTensorAccessor32 grad_output, + torch::PackedTensorAccessor32 grad_fmap1) +{ + const int k = blockIdx.x * blockDim.x + threadIdx.x; + const int h = blockIdx.y * blockDim.y + threadIdx.y; + const int n = blockIdx.z; + + const int i_size = fmap1.size(1); + const int j_size = fmap1.size(2); + const int k_size = fmap1.size(3); + const int h_size = fmap2.size(3); + + if (!within_bounds(h, k, j_size, k_size)) { + return; + } + + for (int i = 0; i < i_size; ++i) { + for (int j = 0; j < j_size; ++j) { + scalar_t grad = 0.0; + + scalar_t diff = fmap1[n][i][j][k] - fmap2[n][i][j][h]; + if (diff >= 0) { + grad = grad_output[n][h][k][h]; + } else { + grad = -grad_output[n][h][k][h]; + } + + grad_fmap1[n][i][j][k] += grad; + } + } +} + +template +__global__ void absolute_difference_backward_kernel_fmap2( + const torch::PackedTensorAccessor32 fmap1, + const torch::PackedTensorAccessor32 fmap2, + const torch::PackedTensorAccessor32 grad_output, + torch::PackedTensorAccessor32 grad_fmap2) +{ + const int k = blockIdx.x * blockDim.x + threadIdx.x; + const int h = blockIdx.y * blockDim.y + threadIdx.y; + const int n = blockIdx.z; + + const int i_size = fmap1.size(1); + const int j_size = fmap1.size(2); + const int k_size = fmap1.size(3); + const int h_size = fmap2.size(3); + + if (!within_bounds(h, k, j_size, k_size)) { + return; + } + + for (int i = 0; i < i_size; ++i) { + for (int j = 0; j < j_size; ++j) { + scalar_t grad = 0.0; + + scalar_t diff = fmap2[n][i][j][h] - fmap1[n][i][j][k]; + if (diff >= 0) { + grad = grad_output[n][h][k][h]; + } else { + grad = -grad_output[n][h][k][h]; + } + + grad_fmap2[n][i][j][h] += grad; + } + } +} + +/** + * compute correlation between each element (h,w1)~(h,w2). 
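+ * Each output element is the L1 distance over channels, i.e. in PyTorch terms
+ * (illustration only): cost = (fmap1.unsqueeze(3) - fmap2.unsqueeze(2)).abs().sum(-1).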
+ * (B,H,W1,C) (B,H,W2,C) -> (B,H,W1,W2) + */ +std::vector absolute_difference_cuda_forward( + torch::Tensor fmap1, + torch::Tensor fmap2) +{ + const auto B = fmap1.size(0); + const auto H = fmap1.size(1); + const auto W1 = fmap1.size(2); + const auto W2 = fmap2.size(2); + + const dim3 blocks((W1 + BLOCK - 1) / BLOCK, + (W2 + BLOCK - 1) / BLOCK, + B*H); + + const dim3 threads(BLOCK, BLOCK); + + auto opts = fmap1.options(); + torch::Tensor result = torch::zeros({B, H, W1, W2}, opts); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(fmap1.scalar_type(), "absolute_difference_forward_kernel", ([&] { + absolute_difference_forward_kernel<<>>( + fmap1.packed_accessor32(), + fmap2.packed_accessor32(), + result.packed_accessor32()); + })); + + return {result}; +} + +std::vector absolute_difference_cuda_backward( + torch::Tensor fmap1, + torch::Tensor fmap2, + torch::Tensor grad_output) +{ + const auto B = fmap1.size(0); + const auto H = fmap1.size(1); + const auto W1 = fmap1.size(2); + const auto W2 = fmap2.size(2); + + auto grad_fmap1 = torch::zeros_like(fmap1); + auto grad_fmap2 = torch::zeros_like(fmap2); + + const dim3 blocks((k_size + BLOCK - 1) / BLOCK, + (h_size + BLOCK - 1) / BLOCK, + batch_size); + + const dim3 threads(BLOCK, BLOCK); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(fmap1.scalar_type(), "absolute_difference_backward_kernel_fmap1", ([&] { + absolute_difference_backward_kernel_fmap1<<>>( + fmap1.packed_accessor32(), + fmap2.packed_accessor32(), + grad_output.packed_accessor32(), + grad_fmap1.packed_accessor32()); + })); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(fmap2.scalar_type(), "absolute_difference_backward_kernel_fmap2", ([&] { + absolute_difference_backward_kernel_fmap2<<>>( + fmap1.packed_accessor32(), + fmap2.packed_accessor32(), + grad_output.packed_accessor32(), + grad_fmap2.packed_accessor32()); + })); + + return {grad_fmap1, grad_fmap2}; +} diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..f0b86c794af65676d827b014387c32524068bb4a --- /dev/null +++ b/app.py @@ -0,0 +1,103 @@ +from __future__ import print_function, division +import sys +sys.path.insert(0,'core') +sys.path.append('core/utils') + +import os +import argparse +import gradio as gr +import cv2 +from core.raft_stereo_depthbeta_refine import RAFTStereoDepthBetaRefine +import torch +import torch.nn as nn +from core.utils.utils import InputPadder +import matplotlib.pyplot as plt +from huggingface_hub import hf_hub_download + + +parser = argparse.ArgumentParser() +parser.add_argument('--root', help="dataset root", default=None) +parser.add_argument('--sv_root', help="visualization root", default=None) +parser.add_argument('--test_exp_name', default='', help="name your experiment in testing") +parser.add_argument('--mast3r_model_path', default='MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth', help="pretrained model path for MaSt3R") +parser.add_argument('--depthany_model_dir', default='./dav2_models', help="directory of pretrained model path for DepthAnything") +parser.add_argument('--restore_ckpt', help="restore checkpoint", default="./ckpts/diving_stereo.pth") +parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision') +parser.add_argument('--valid_iters', type=int, default=32, help='number of flow-field updates during forward pass') +parser.add_argument('--eval', action='store_true', help='evaluation mode') +parser.add_argument('--is_test', action='store_true', help='on testing') + +# Architecure choices +parser.add_argument('--hidden_dims', 
nargs='+', type=int, default=[128]*3, help="hidden state and context dimensions") +parser.add_argument('--corr_implementation', choices=["reg", "alt", "reg_cuda", "alt_cuda"], default="reg", help="correlation volume implementation") +parser.add_argument('--shared_backbone', action='store_true', help="use a single backbone for the context and feature encoders") +parser.add_argument('--corr_levels', type=int, default=4, help="number of levels in the correlation pyramid") +parser.add_argument('--corr_radius', type=int, default=4, help="width of the correlation pyramid") +parser.add_argument('--n_downsample', type=int, default=2, help="resolution of the disparity field (1/2^K)") +parser.add_argument('--context_norm', type=str, default="batch", choices=['group', 'batch', 'instance', 'none'], help="normalization of context encoder") +parser.add_argument('--slow_fast_gru', action='store_true', help="iterate the low-res GRUs more frequently") +parser.add_argument('--n_gru_layers', type=int, default=3, help="number of hidden GRU levels") + +parser.add_argument('--lbp_neighbor_offsets', default='(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)', help="determine the neighbors used in LBP encoder") +parser.add_argument('--modulation_ratio', type=float, default=1., help="hyperparameters for modulation") +parser.add_argument('--modulation_alg', choices=["linear", "sigmoid"], default="linear", help="rescale modulation") +parser.add_argument('--conf_from_fea', action='store_true', help="confidence in refinement not only from cost volume but also from other features") +parser.add_argument('--refine_pool', action='store_true', help="use pooling in refinement") +parser.add_argument('--refine_unet', action='store_true', help="use EfficientUnet in refinement") + +parser.add_argument('--improvement', action='store_true', help="visualize improvement map (error_map[i] - error_map[i-1])") +parser.add_argument('--movement', action='store_true', help="visualize movement map (flow_pr[i] - flow_pr[i-1])") +parser.add_argument('--acceleration', action='store_true', help="visualize acceleration map (movement_map[i] - movement_map[i-1])") +parser.add_argument('--mask', action='store_true', help="visualize mask") +parser.add_argument('--binary_thold', type=float, default=0.5, help="visualize binary mask") + +args = parser.parse_args() +args.conf_from_fea = True +args.eval = True + +model = RAFTStereoDepthBetaRefine(args) +model = torch.nn.DataParallel(model, device_ids=[0]) + + +checkpoint_path = hf_hub_download( + repo_id="BFZD/Diving-into-the-Fusion-of-Monocular-Priors-for-Generalized-Stereo-Matching", + filename="ckpts/diving_stereo.pth", +) + +checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu')) +# model.load_state_dict(checkpoint, strict=True) +new_state_dict = {} +for key, value in checkpoint.items(): + if key.find("lbp_encoder.lbp_conv") != -1: + continue + new_state_dict[key] = value +# model.load_state_dict(new_state_dict, strict=True) +model.load_state_dict(new_state_dict, strict=False) + +# model.cuda() +model.eval() + + + +def predict(image1, image2): + with torch.no_grad(): + image1 = torch.from_numpy(image1).permute(2, 0, 1).float() + image2 = torch.from_numpy(image2).permute(2, 0, 1).float() + image1 = image1[None][:,:3,:,:] + image2 = image2[None][:,:3,:,:] + padder = InputPadder(image1.shape, divis_by=32) + image1, image2 = padder.pad(image1, image2) + atom_dict = model(image1, image2, iters=args.valid_iters, test_mode=False, vis_mode=True) + output = 
atom_dict['disp_predictions'][-1].abs().cpu().numpy() + disp = padder.unpad(output) + disp = disp.squeeze() + normalized_disp = (disp - disp.min()) / (disp.max() - disp.min()) + cmap = plt.get_cmap('jet') + colored_disp = cmap(normalized_disp)[:, :, :3] # Get RGB channels + + return colored_disp +interface = gr.Interface(fn=predict, + inputs=[gr.Image(label="Left Image"), + gr.Image(label="Right Image")], + outputs="image") +interface.launch() diff --git a/core/ManStereo.py b/core/ManStereo.py new file mode 100644 index 0000000000000000000000000000000000000000..7e34a42bdc155cdcfb04faf90a016a69b1d7e45e --- /dev/null +++ b/core/ManStereo.py @@ -0,0 +1,302 @@ +import os +import sys +import logging +import numpy as np +from datetime import datetime + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from core.update import ManifoldBasicMultiUpdateBlock +from core.extractor import BasicEncoder, MultiBasicEncoder, ResidualBlock +from core.corr import CorrBlock1D, PytorchAlternateCorrBlock1D, CorrBlockFast1D, AlternateCorrBlock +from core.utils.utils import coords_grid, upflow8, LoggerCommon +from core.confidence import OffsetConfidence +from core.refinement import Refinement, UpdateHistory +from core import geometry as GEO +from core.utils.plane import get_pos, convert2patch, predict_disp + +logger = LoggerCommon("ARCHI") + +try: + autocast = torch.cuda.amp.autocast +except: + # dummy autocast for PyTorch < 1.6 + class autocast: + def __init__(self, enabled): + pass + def __enter__(self): + pass + def __exit__(self, *args): + pass + +class RAFTStereo(nn.Module): + def __init__(self, args): + super().__init__() + self.args = args + + context_dims = args.hidden_dims + + self.cnet = MultiBasicEncoder(output_dim=[args.hidden_dims, context_dims], norm_fn=args.context_norm, downsample=args.n_downsample) + self.update_block = ManifoldBasicMultiUpdateBlock(self.args, hidden_dims=args.hidden_dims) + + self.context_zqr_convs = nn.ModuleList([nn.Conv2d(context_dims[i], args.hidden_dims[i]*3, 3, padding=3//2) for i in range(self.args.n_gru_layers)]) + + if args.shared_backbone: + self.conv2 = nn.Sequential( + ResidualBlock(128, 128, 'instance', stride=1), + nn.Conv2d(128, 256, 3, padding=1)) + else: + self.fnet = BasicEncoder(output_dim=256, norm_fn='instance', downsample=args.n_downsample) + + if args.confidence: + self.confidence_computer = OffsetConfidence(args) + + if args.geo_estimator=="geometry_mlp": + self.geometry_builder = GEO.Geometry_MLP(args) + elif args.geo_estimator=="geometry_conv": + self.geometry_builder = GEO.Geometry_Conv(args) + elif args.geo_estimator=="geometry_conv_split": + self.geometry_builder = GEO.Geometry_Conv_Split(args) + + if args.refinement is not None and len(args.refinement)>0: + if self.args.slant is None or len(self.args.slant)==0 : + dim_disp = 1 + elif self.args.slant in ["slant", "slant_local"] : + dim_disp = 6 + + if args.refinement.lower()=="refinement": + self.refine = Refinement(args, in_chans=256, dim_fea=96, dim_disp=dim_disp) + else: + raise Exception("No such refinement: {}".format(args.refinement)) + + if self.args.update_his: + self.update_hist = UpdateHistory(args, 128, dim_disp) + + logger.info(f"RAFTStereo ~ " +\ + f"Confidence: {args.confidence}, offset_memory_size: {args.offset_memory_size}, " +\ + f"offset_memory_last_iter: {args.offset_memory_last_iter}, " +\ + f"slant: {args.slant}, slant_norm: {args.slant_norm}, " +\ + f"geo estimator: {args.geo_estimator}, geo_fusion: {args.geo_fusion}, " +\ + f"refine: {args.refinement}, 
refine_win_size: {args.refine_win_size}, num_heads:{args.num_heads}, " +\ + f"split_win: {args.split_win}, refine_start_itr: {args.refine_start_itr}, " +\ + f"update_his: {args.update_his}, U_thold: {args.U_thold}, " +\ + f"stop_freeze_bn: {args.stop_freeze_bn}" ) + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_flow(self, img): + """ Flow is represented as difference between two coordinate grids flow = coords1 - coords0""" + N, _, H, W = img.shape + + coords0 = coords_grid(N, H, W).to(img.device) + coords1 = coords_grid(N, H, W).to(img.device) + + return coords0, coords1 + + def upsample_flow(self, flow, mask): + """ Upsample flow field [H/8, W/8, 2] -> [H, W, 2] using convex combination """ + N, D, H, W = flow.shape + factor = 2 ** self.args.n_downsample + mask = mask.view(N, 1, 9, factor, factor, H, W) + mask = torch.softmax(mask, dim=2) + + up_flow = F.unfold(factor * flow, [3,3], padding=1) + up_flow = up_flow.view(N, D, 9, 1, 1, H, W) + up_flow = torch.sum(mask * up_flow, dim=2) + + img_coord = None + if self.args.geo_estimator is not None and len(self.args.geo_estimator)>0: + img_coord = get_pos(H*factor, W*factor, disp=None, + slant=self.args.slant, + slant_norm=self.args.slant_norm, + patch_size=factor, + device=flow.device) # (1,2,H*factor,W*factor) + img_coord = img_coord.repeat(N,1,1,1) + + up_flow = up_flow.permute(0, 1, 4, 2, 5, 3) + return up_flow.reshape(N, D, factor*H, factor*W), img_coord + + def upsample_geo(self, mask=None, mask_disp=None, params=None): + """ Upsample flow field [H/8, W/8, 2] -> [H, W, 2] using convex combination """ + N, D, H, W = params.shape + factor = 2 ** self.args.n_downsample + if mask is not None: + mask = mask.view(N, 1, 9, factor, factor, H, W) + mask = torch.softmax(mask, dim=2) # (B,1,9,factor,factor,H,W) + if mask_disp is not None: + mask_disp = mask_disp.view(N, 1, 9, factor, factor, H, W) + mask_disp = torch.softmax(mask_disp, dim=2) # (B,1,9,factor,factor,H,W) + + # d_p = a_q\cdot\Delta u_{q\to p} + b_q\cdot\Delta v_{q\to p} + d_q + delta_pq = get_pos(H*factor, W*factor, disp=None, + slant=self.args.slant, + slant_norm=self.args.slant_norm, + patch_size=factor, + device=params.device) # (1,2,H*factor,W*factor) + patch_delta_pq = convert2patch(delta_pq, patch_size=factor, div_last=False).detach() # (1,2,factor*factor,H,W) + + disp = predict_disp(params, patch_delta_pq, patch_size=factor, mul_last=True) # (B,factor*factor,H,W) + + if mask_disp is not None: + disp = F.unfold(disp, [3,3], padding=1) # (B,factor*factor*9,H,W) + disp = disp.view(N, 1, factor, factor, 9, H, W) # (B,1,factor,factor,9,H,W) + disp = disp.permute((0,1,4,2,3,5,6)) # (B,1,9,factor,factor,H,W) + disp = torch.sum(mask_disp * disp, dim=2) # (B,1,factor,factor,H,W) + disp = disp.permute(0, 1, 4, 2, 5, 3) # (B,1,H,factor,W,factor) + return disp.reshape(N, 1, factor*H, factor*W) + + elif mask is not None: + disp = F.unfold(disp, [3,3], padding=1) # (B,factor*factor*9,H,W) + disp = disp.view(N, 1, factor, factor, 9, H, W) # (B,1,factor,factor,9,H,W) + disp = disp.permute((0,1,4,2,3,5,6)) # (B,1,9,factor,factor,H,W) + disp = torch.sum(mask * disp, dim=2) # (B,1,factor,factor,H,W) + disp = disp.permute(0, 1, 4, 2, 5, 3) # (B,1,H,factor,W,factor) + return disp.reshape(N, 1, factor*H, factor*W) + + disp = F.fold(disp.flatten(-2,-1), (H*factor,W*factor), kernel_size=factor, stride=factor).view(N,1,H*factor,W*factor) + return disp + + + def forward(self, image1, image2, iters=12, flow_init=None, + 
test_mode=False, vis_mode=False, enable_refinement=True): + """ Estimate optical flow between pair of frames """ + + image1 = (2 * (image1 / 255.0) - 1.0).contiguous() + image2 = (2 * (image2 / 255.0) - 1.0).contiguous() + + # run the context network + with autocast(enabled=self.args.mixed_precision): + if self.args.shared_backbone: + *cnet_list, x = self.cnet(torch.cat((image1, image2), dim=0), dual_inp=True, num_layers=self.args.n_gru_layers) + fmap1, fmap2 = self.conv2(x).split(dim=0, split_size=x.shape[0]//2) + else: + cnet_list = self.cnet(image1, num_layers=self.args.n_gru_layers) + fmap1, fmap2 = self.fnet([image1, image2]) + net_list = [torch.tanh(x[0]) for x in cnet_list] + inp_list = [torch.relu(x[1]) for x in cnet_list] + + # Rather than running the GRU's conv layers on the context features multiple times, we do it once at the beginning + inp_list = [list(conv(i).split(split_size=conv.out_channels//3, dim=1)) for i,conv in zip(inp_list, self.context_zqr_convs)] + + if self.args.corr_implementation == "reg": # Default + corr_block = CorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "alt": # More memory efficient than reg + corr_block = PytorchAlternateCorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "reg_cuda": # Faster version of reg + corr_block = CorrBlockFast1D + elif self.args.corr_implementation == "alt_cuda": # Faster version of alt + corr_block = AlternateCorrBlock + corr_fn = corr_block(fmap1, fmap2, radius=self.args.corr_radius, num_levels=self.args.corr_levels) + + coords0, coords1 = self.initialize_flow(net_list[0]) + + if flow_init is not None: + coords1 = coords1 + flow_init + + flow_predictions = [] + disp_predictions = [] + disp_predictions_refine = [] + params_list = [] + params_list_refine = [] + confidence_list = [] + offset_memory = [] + for itr in range(iters): + coords1 = coords1.detach() + corr = corr_fn(coords1) # index correlation volume + flow = coords1 - coords0 + + with autocast(enabled=self.args.mixed_precision): + ## first-stage in geometry estimation + if self.args.n_gru_layers == 3 and self.args.slow_fast_gru: # Update low-res GRU + net_list = self.update_block(net_list, inp_list, iter32=True, iter16=False, iter08=False, update=False) + if self.args.n_gru_layers >= 2 and self.args.slow_fast_gru:# Update low-res GRU and mid-res GRU + net_list = self.update_block(net_list, inp_list, iter32=self.args.n_gru_layers==3, iter16=True, iter08=False, update=False) + net_list, up_mask, delta_flow, up_mask_disp = self.update_block(net_list, inp_list, corr, flow, iter32=self.args.n_gru_layers==3, iter16=self.args.n_gru_layers>=2) + + ## region detection: acquire confidence + if self.args.confidence: + offset_memory.append(delta_flow[:,0:2]) + if itr0: + geo_params = self.geometry_builder(img_coord, -flow_up, disparity) + + # disp_up = self.upsample_geo(up_mask, params=geo_params) + disp_up = self.upsample_geo(mask=None, mask_disp=up_mask_disp, params=geo_params) + params_list.append(geo_params) + disp_predictions.append(disp_up) + + ## curvature-aware propagation + disparity_refine = None + geo_params_refine = None + if self.args.refinement is not None and len(self.args.refinement)>0 and enable_refinement: + if itr>=self.args.refine_start_itr: + geo_params_refine = self.refine(geo_params, inp_list[0], confidence, + if_shift=(itr-self.args.refine_start_itr)%2>0) + coords1 = coords0 - geo_params_refine[:,:1] + disparity_refine = geo_params_refine[:,:1] + ### update 
hidden state + if self.args.update_his: + net_list[0] = self.update_hist(net_list[0], -disparity_refine) + params_list_refine.append(geo_params_refine) + + # upsample refinement + disp_up_refine = None + if geo_params_refine is not None: + # disp_up_refine = self.upsample_geo(up_mask, params=geo_params_refine) + disp_up_refine = self.upsample_geo(mask=None, mask_disp=up_mask_disp, params=geo_params_refine) + # disp_up_refine = disp_up_refine[:,:1] + disp_predictions_refine.append(disp_up_refine) + + if test_mode: + if self.args.refinement is not None and len(self.args.refinement)>0 and enable_refinement: + return coords1 - coords0, flow_up_refine + return coords1 - coords0, flow_up + # return coords1 - coords0, -disp_up + + if vis_mode: + return flow_predictions, disp_predictions, disp_predictions_refine, confidence_list + + return flow_predictions, disp_predictions, disp_predictions_refine, confidence_list, params_list, params_list_refine diff --git a/core/__init__.py b/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/core/__pycache__/__init__.cpython-310.pyc b/core/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76034fde4e9fd0e5e201ff61e4a38a069edb68ff Binary files /dev/null and b/core/__pycache__/__init__.cpython-310.pyc differ diff --git a/core/__pycache__/confidence.cpython-310.pyc b/core/__pycache__/confidence.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2c4869244d19b77ae5cfd83cc6f8acd98b96dee Binary files /dev/null and b/core/__pycache__/confidence.cpython-310.pyc differ diff --git a/core/__pycache__/corr.cpython-310.pyc b/core/__pycache__/corr.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6bb748f8a063d4b9b03d1415b8d1f43c78496182 Binary files /dev/null and b/core/__pycache__/corr.cpython-310.pyc differ diff --git a/core/__pycache__/extractor.cpython-310.pyc b/core/__pycache__/extractor.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ce2aaa229a984008e4d3c4d991ec52561575f10 Binary files /dev/null and b/core/__pycache__/extractor.cpython-310.pyc differ diff --git a/core/__pycache__/extractor_depthany.cpython-310.pyc b/core/__pycache__/extractor_depthany.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f6434ee5e2fa2923ab04cff8225d7212555bc1aa Binary files /dev/null and b/core/__pycache__/extractor_depthany.cpython-310.pyc differ diff --git a/core/__pycache__/fusion.cpython-310.pyc b/core/__pycache__/fusion.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc5a39577f51a8a2af1492e822030e3943c54aa8 Binary files /dev/null and b/core/__pycache__/fusion.cpython-310.pyc differ diff --git a/core/__pycache__/geometry.cpython-310.pyc b/core/__pycache__/geometry.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9a92005c44e27faa67ff8034910b74c8b5a2ef6 Binary files /dev/null and b/core/__pycache__/geometry.cpython-310.pyc differ diff --git a/core/__pycache__/raft_stereo_depthbeta_refine.cpython-310.pyc b/core/__pycache__/raft_stereo_depthbeta_refine.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5da61742119fb18b07b4dfc7e5c6c8aca382e9b4 Binary files /dev/null and b/core/__pycache__/raft_stereo_depthbeta_refine.cpython-310.pyc differ diff --git a/core/__pycache__/update_disp.cpython-310.pyc 
b/core/__pycache__/update_disp.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3328af66ffeb7fac5eb2ea7e8954fc6fe7bfdf0 Binary files /dev/null and b/core/__pycache__/update_disp.cpython-310.pyc differ diff --git a/core/confidence.py b/core/confidence.py new file mode 100644 index 0000000000000000000000000000000000000000..c61d53352ae432761ce223d3395edb82b96fbc6f --- /dev/null +++ b/core/confidence.py @@ -0,0 +1,169 @@ +import os +import sys +import logging +import numpy as np +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F + + + +class OffsetConfidence(nn.Module): + def __init__(self, args): + super(OffsetConfidence, self).__init__() + self.detach = args.detach_in_confidence + self.offset_memory_size = args.offset_memory_size + self.conv_fea = nn.Conv2d(256, 16, 3, padding=1) + self.conv_offset = nn.Conv2d(2*args.offset_memory_size, 16, 3, padding=1) + self.fusion = nn.Sequential(OrderedDict([ + ('conv1', nn.Conv2d(32, 8, 3, padding=1)), + ('relu1', nn.LeakyReLU(inplace=True)), + ('conv2', nn.Conv2d(8, 2, 3, padding=1)), + ('relu2', nn.LeakyReLU(inplace=True)), + ('conv3', nn.Conv2d(2, 1, 1, padding=0)), + ])) + + if "local_rank" not in args or args.local_rank==0 : + logging.info(f"OffsetConfidence: " + \ + f"detach: {args.detach_in_confidence}") + + def forward(self, fea, offset_memory): + if type(fea) is list: + fea = torch.cat(fea, dim=1) + context = self.conv_fea(fea.detach() if self.detach else fea) + offset_memory = torch.cat([offset.detach() if self.detach else offset for offset in offset_memory], dim=1) + confidence = self.conv_offset( -offset_memory ) + confidence = self.fusion( torch.cat([confidence,context], dim=1) ) + return confidence + + + +class MBConvBlockSimple(nn.Module): + def __init__(self, in_channels, out_channels, expand_ratio=1, kernel_size=3, stride=1, se_ratio=0.25): + super(MBConvBlockSimple, self).__init__() + + self.has_se = se_ratio is not None and 0 < se_ratio <= 1 + self.expand_ratio = expand_ratio + mid_channels = in_channels * expand_ratio + if expand_ratio != 1: + self.expand_conv = nn.Conv2d(in_channels, mid_channels, kernel_size=1, bias=False) + self.bn0 = nn.BatchNorm2d(mid_channels) + + self.depthwise_conv = nn.Conv2d(mid_channels, mid_channels, kernel_size=kernel_size, stride=stride, + padding=kernel_size // 2, groups=mid_channels, bias=False) + self.bn1 = nn.BatchNorm2d(mid_channels) + + if self.has_se: + se_channels = max(1, int(in_channels * se_ratio)) + self.se_reduce = nn.Conv2d(mid_channels, se_channels, kernel_size=1) + self.se_expand = nn.Conv2d(se_channels, mid_channels, kernel_size=1) + + self.project_conv = nn.Conv2d(mid_channels, out_channels, kernel_size=1, bias=False) + self.bn2 = nn.BatchNorm2d(out_channels) + + self.swish = nn.SiLU(inplace=True) + self.use_residual = (stride == 1 and in_channels == out_channels) + + def forward(self, x): + identity = x + if self.expand_ratio != 1: + x = self.swish(self.bn0(self.expand_conv(x))) + + x = self.swish(self.bn1(self.depthwise_conv(x))) + + if self.has_se: + se = F.adaptive_avg_pool2d(x, 1) + se = self.swish(self.se_reduce(se)) + se = torch.sigmoid(self.se_expand(se)) + x = x * se + + x = self.bn2(self.project_conv(x)) + + if self.use_residual: + x = x + identity + + return x + + +class EfficientNetB1SimpleEncoder(nn.Module): + def __init__(self, in_C=2): + super(EfficientNetB1SimpleEncoder, self).__init__() + + self.pre_pro = nn.Sequential( + nn.Conv2d(in_C, 8, 3, padding=1), + 
nn.BatchNorm2d(8), + nn.SiLU(inplace=True), + nn.Conv2d(8, 8, 3, padding=1), + nn.BatchNorm2d(8), + nn.SiLU(inplace=True), + ) + + # Stem, first downsampling + self.stem = nn.Sequential( + nn.Conv2d(8, 32, kernel_size=3, stride=2, padding=1, bias=False), + nn.BatchNorm2d(32), + nn.SiLU(inplace=True) + ) + + # EfficientNet-B1 Layers Configuration + layers_config = [ + (32, 16, 1, 3, 1, 1), # Stage 1 (no downsampling) + (16, 24, 6, 3, 2, 2), # Stage 2 (second downsampling) + (24, 40, 6, 5, 2, 2), # Stage 3 (third downsampling) + ] + + # Building EfficientNet-B1 stages + self.blocks = nn.ModuleList() + for in_channels, out_channels, expand_ratio, kernel_size, stride, repeats in layers_config: + block_layers = [] + block_layers.append(MBConvBlockSimple(in_channels, out_channels, expand_ratio, kernel_size, stride)) + for _ in range(repeats - 1): + block_layers.append(MBConvBlockSimple(out_channels, out_channels, expand_ratio, kernel_size, stride=1)) + self.blocks.append(nn.Sequential(*block_layers)) + + def forward(self, x): + features = [] + x = self.pre_pro(x) + features.append(x) # Store features for skip connections + x = self.stem(x) + for block in self.blocks: + x = block(x) + features.append(x) # Store features for skip connections + return features + + +class EfficientUNetSimple(nn.Module): + def __init__(self, num_classes=1): + super(EfficientUNetSimple, self).__init__() + + # Encoder using EfficientNet-B1 with only three stages + self.encoder = EfficientNetB1SimpleEncoder() + + # Decoder layers (Upsampling) + self.upconv3 = nn.Conv2d(40, 24, kernel_size=1) + self.up3 = nn.ConvTranspose2d(24, 24, kernel_size=2, stride=2) + + self.upconv2 = nn.Conv2d(24, 16, kernel_size=1) + self.up2 = nn.ConvTranspose2d(16, 16, kernel_size=2, stride=2) + + self.upconv1 = nn.Conv2d(16, 8, kernel_size=1) + self.up1 = nn.ConvTranspose2d(8, 8, kernel_size=2, stride=2) + + # Final conv layer + self.final_conv = nn.Conv2d(8, num_classes, kernel_size=1) + + def forward(self, x): + # Encoder + features = self.encoder(x) + # print("-"*30, features[-1].shape, features[-2].shape, features[-3].shape, features[-4].shape) + + # Decoder with skip connections + x = self.up3(self.upconv3(features[-1])) + features[-2] # 1/8 ~ 1/4 + x = self.up2(self.upconv2(x)) + features[-3] # 1/4 ~ 1/2 + x = self.up1(self.upconv1(x)) + features[-4] # 1/2 ~ 1 + + # Final output layer + x = self.final_conv(x) + return x diff --git a/core/corr.py b/core/corr.py new file mode 100644 index 0000000000000000000000000000000000000000..1bac31a8e31b6fa34c1c43982975aa2f940d2f11 --- /dev/null +++ b/core/corr.py @@ -0,0 +1,309 @@ +import torch +import torch.nn.functional as F +from core.utils.utils import bilinear_sampler + +try: + import corr_sampler +except: + pass + +try: + import alt_cuda_corr +except: + # alt_cuda_corr is not compiled + pass + + +class CorrSampler(torch.autograd.Function): + @staticmethod + def forward(ctx, volume, coords, radius): + ctx.save_for_backward(volume,coords) + ctx.radius = radius + corr, = corr_sampler.forward(volume, coords, radius) + return corr + @staticmethod + def backward(ctx, grad_output): + volume, coords = ctx.saved_tensors + grad_output = grad_output.contiguous() + grad_volume, = corr_sampler.backward(volume, coords, grad_output, ctx.radius) + return grad_volume, None, None + +class CorrBlockFast1D: + def __init__(self, fmap1, fmap2, num_levels=4, radius=4): + self.num_levels = num_levels + self.radius = radius + self.corr_pyramid = [] + # all pairs correlation + corr = CorrBlockFast1D.corr(fmap1, fmap2) 
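+ # corr: (batch, h1, w1, 1, w2) all-pairs correlation volume; the left-view pixels are flattened next so 1-D average pooling over w2 can build the multi-level pyramid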
+ batch, h1, w1, dim, w2 = corr.shape + corr = corr.reshape(batch*h1*w1, dim, 1, w2) + for i in range(self.num_levels): + self.corr_pyramid.append(corr.view(batch, h1, w1, -1, w2//2**i)) + corr = F.avg_pool2d(corr, [1,2], stride=[1,2]) + + def __call__(self, coords): + out_pyramid = [] + bz, _, ht, wd = coords.shape + coords = coords[:, [0]] + for i in range(self.num_levels): + corr = CorrSampler.apply(self.corr_pyramid[i].squeeze(3), coords/2**i, self.radius) + out_pyramid.append(corr.view(bz, -1, ht, wd)) + return torch.cat(out_pyramid, dim=1) + + @staticmethod + def corr(fmap1, fmap2): + B, D, H, W1 = fmap1.shape + _, _, _, W2 = fmap2.shape + fmap1 = fmap1.view(B, D, H, W1) + fmap2 = fmap2.view(B, D, H, W2) + corr = torch.einsum('aijk,aijh->ajkh', fmap1, fmap2) + corr = corr.reshape(B, H, W1, 1, W2).contiguous() + return corr / torch.sqrt(torch.tensor(D).float()) + + +class PytorchAlternateCorrBlock1D: + def __init__(self, fmap1, fmap2, num_levels=4, radius=4): + self.num_levels = num_levels + self.radius = radius + self.corr_pyramid = [] + self.fmap1 = fmap1 + self.fmap2 = fmap2 + + def corr(self, fmap1, fmap2, coords): + B, D, H, W = fmap2.shape + # map grid coordinates to [-1,1] + xgrid, ygrid = coords.split([1,1], dim=-1) + xgrid = 2*xgrid/(W-1) - 1 + ygrid = 2*ygrid/(H-1) - 1 + + grid = torch.cat([xgrid, ygrid], dim=-1) + output_corr = [] + for grid_slice in grid.unbind(3): + fmapw_mini = F.grid_sample(fmap2, grid_slice, align_corners=True) + corr = torch.sum(fmapw_mini * fmap1, dim=1) + output_corr.append(corr) + corr = torch.stack(output_corr, dim=1).permute(0,2,3,1) + + return corr / torch.sqrt(torch.tensor(D).float()) + + def __call__(self, coords): + r = self.radius + coords = coords.permute(0, 2, 3, 1) + batch, h1, w1, _ = coords.shape + fmap1 = self.fmap1 + fmap2 = self.fmap2 + out_pyramid = [] + for i in range(self.num_levels): + dx = torch.zeros(1) + dy = torch.linspace(-r, r, 2*r+1) + delta = torch.stack(torch.meshgrid(dy, dx), axis=-1).to(coords.device) + centroid_lvl = coords.reshape(batch, h1, w1, 1, 2).clone() + centroid_lvl[...,0] = centroid_lvl[...,0] / 2**i + coords_lvl = centroid_lvl + delta.view(-1, 2) + corr = self.corr(fmap1, fmap2, coords_lvl) + fmap2 = F.avg_pool2d(fmap2, [1, 2], stride=[1, 2]) + out_pyramid.append(corr) + out = torch.cat(out_pyramid, dim=-1) + return out.permute(0, 3, 1, 2).contiguous().float() + + +class PytorchAlternateAbsCorrBlock1D: + def __init__(self, fmap1, fmap2, num_levels=4, radius=4): + self.num_levels = num_levels + self.radius = radius + self.corr_pyramid = [] + self.fmap1 = fmap1 + + self.fmap2_pyramid = [fmap2] + for i in range(num_levels): + fmap2 = F.avg_pool2d(fmap2, [1, 2], stride=[1, 2]) + self.fmap2_pyramid.append(fmap2) + + def corr(self, fmap1, fmap2, coords): + B, C, H, W = fmap1.shape + # map grid coordinates to [-1,1] + xgrid, ygrid = coords.split([1,1], dim=-1) + xgrid = 2*xgrid/(W-1) - 1 + ygrid = 2*ygrid/(H-1) - 1 + + grid = torch.cat([xgrid, ygrid], dim=-1) + + disp_num = 2 * self.radius + 1 + fmapw_mini = F.grid_sample(fmap2, grid.view(B, H, W*disp_num, 2), mode='bilinear', + padding_mode='zeros').view(B, C, H, W, disp_num) # (B, C, H, W, S) + corr = torch.sum(fmap1.unsqueeze(-1) * fmapw_mini, dim=1) + + return corr / torch.sqrt(torch.tensor(C).float()) + + def __call__(self, coords): + print(f"当前显存消耗量: {torch.distributed.get_rank()} {torch.cuda.memory_allocated() / 1024 / 1024:.2f} MB") + + # in case of only disparity used in coordinates + B, D, H, W = coords.shape + if D==1: + y_coord = 
torch.arange(H).unsqueeze(1).float().repeat(B, 1, 1, W).to(coords.device) + coords = torch.cat([coords,y_coord], dim=1) + + r = self.radius + coords = coords.permute(0, 2, 3, 1) + batch, h1, w1, _ = coords.shape + + fmap1 = self.fmap1 + out_pyramid = [] + for i in range(self.num_levels): + fmap2 = self.fmap2_pyramid[i] + + dx = torch.zeros(1) + dy = torch.linspace(-r, r, 2*r+1) + delta = torch.stack(torch.meshgrid(dy, dx), axis=-1).to(coords.device) + centroid_lvl = coords.reshape(batch, h1, w1, 1, 2).clone() + centroid_lvl[...,0] = centroid_lvl[...,0] / 2**i + coords_lvl = centroid_lvl + delta.view(-1, 2) + + corr = self.corr(fmap1, fmap2, coords_lvl) + out_pyramid.append(corr) + out = torch.cat(out_pyramid, dim=-1) + return out.permute(0, 3, 1, 2).contiguous().float() + + +class CorrBlock1D: + def __init__(self, fmap1, fmap2, num_levels=4, radius=4): + self.num_levels = num_levels + self.radius = radius + self.corr_pyramid = [] + + # all pairs correlation + corr = CorrBlock1D.corr(fmap1, fmap2) + + batch, h1, w1, _, w2 = corr.shape + corr = corr.reshape(batch*h1*w1, 1, 1, w2) + + self.corr_pyramid.append(corr) + for i in range(self.num_levels): + corr = F.avg_pool2d(corr, [1,2], stride=[1,2]) + self.corr_pyramid.append(corr) + + def __call__(self, coords): + r = self.radius + coords = coords[:, :1].permute(0, 2, 3, 1) + batch, h1, w1, _ = coords.shape + + # print(f"当前显存消耗量: {torch.distributed.get_rank()} {torch.cuda.memory_allocated() / 1024 / 1024:.2f} MB") + + out_pyramid = [] + for i in range(self.num_levels): + corr = self.corr_pyramid[i] + dx = torch.linspace(-r, r, 2*r+1) + dx = dx.view(2*r+1, 1).to(coords.device) + x0 = dx + coords.reshape(batch*h1*w1, 1, 1, 1) / 2**i + y0 = torch.zeros_like(x0) + + coords_lvl = torch.cat([x0,y0], dim=-1) + corr = bilinear_sampler(corr, coords_lvl) + corr = corr.view(batch, h1, w1, -1) + out_pyramid.append(corr) + + out = torch.cat(out_pyramid, dim=-1) + return out.permute(0, 3, 1, 2).contiguous().float() + + @staticmethod + def corr(fmap1, fmap2): + B, D, H, W1 = fmap1.shape + _, _, _, W2 = fmap2.shape + fmap1 = fmap1.view(B, D, H, W1) + fmap2 = fmap2.view(B, D, H, W2) + corr = torch.einsum('aijk,aijh->ajkh', fmap1, fmap2) + corr = corr.reshape(B, H, W1, 1, W2).contiguous() + return corr / torch.sqrt(torch.tensor(D).float()) + +class AbsCorrBlock1D: + def __init__(self, fmap1, fmap2, num_levels=4, radius=4): + self.num_levels = num_levels + self.radius = radius + self.abs_corr_matrix_pyramid = [] + + # all pairs correlation + abs_corr_matrix = AbsCorrBlock1D.abs_corr(fmap1, fmap2) + + batch, h1, w1, _, w2 = abs_corr_matrix.shape + abs_corr_matrix = abs_corr_matrix.reshape(batch*h1*w1, 1, 1, w2) + + self.abs_corr_matrix_pyramid.append(abs_corr_matrix) + for i in range(self.num_levels): + abs_corr_matrix = F.avg_pool2d(abs_corr_matrix, [1,2], stride=[1,2]) + self.abs_corr_matrix_pyramid.append(abs_corr_matrix) + + def __call__(self, coords): + r = self.radius + coords = coords[:, :1].permute(0, 2, 3, 1) + batch, h1, w1, _ = coords.shape + + out_pyramid = [] + for i in range(self.num_levels): + abs_corr_matrix = self.abs_corr_matrix_pyramid[i] + dx = torch.linspace(-r, r, 2*r+1) + dx = dx.view(2*r+1, 1).to(coords.device) + x0 = dx + coords.reshape(batch*h1*w1, 1, 1, 1) / 2**i + y0 = torch.zeros_like(x0) + + coords_lvl = torch.cat([x0,y0], dim=-1) + abs_corr_matrix = bilinear_sampler(abs_corr_matrix, coords_lvl) + abs_corr_matrix = abs_corr_matrix.view(batch, h1, w1, -1) + out_pyramid.append(abs_corr_matrix) + + out = torch.cat(out_pyramid, dim=-1) + 
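+ # out: (batch, h1, w1, num_levels*(2*r+1)) L1 matching costs sampled from every pyramid level, moved to the channel dimension before returning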
return out.permute(0, 3, 1, 2).contiguous().float() + + @staticmethod + def abs_corr(fmap1, fmap2): + """fucntion: build the correlation matrix (not traditional cost volume) for each pixel in the same line. + args: + fmap1: feature maps from left view, B*C*H*W1; + fmap2: feature maps from right view, B*C*H*W2; + return: + the correlation matrix, B*H*W1*W2; + """ + B, D, H, W1 = fmap1.shape + _, _, _, W2 = fmap2.shape + + # 计算 L1 匹配代价 + # corr_matrix = torch.einsum('aijk,aijh->ajkh', fmap1, fmap2) + # corr_matrix = torch.sum(torch.abs(fmap1.unsqueeze(-1) - fmap2.unsqueeze(-2)), dim=1) # shape (B, H, W1, W2) + corr_matrix = (fmap1.unsqueeze(-1) - fmap2.unsqueeze(-2)).abs_().sum(dim=1) # shape (B, H, W1, W2) + # corr_matrix = fmap1.sum(dim=1).unsqueeze(-1) - fmap2.sum(dim=1).unsqueeze(-2) # shape (B, H, W1, W2) + print("-"*10, " AbsCorrBlock1D: {} ".format(corr_matrix.shape), "-"*10) + print(f"当前显存消耗量: {torch.distributed.get_rank()} {torch.cuda.memory_allocated() / 1024 / 1024:.2f} MB") + + corr_matrix = corr_matrix.reshape(B, H, W1, 1, W2).contiguous() + return corr_matrix / torch.sqrt(torch.tensor(D).float()) + +class AlternateCorrBlock: + def __init__(self, fmap1, fmap2, num_levels=4, radius=4): + raise NotImplementedError + self.num_levels = num_levels + self.radius = radius + + self.pyramid = [(fmap1, fmap2)] + for i in range(self.num_levels): + fmap1 = F.avg_pool2d(fmap1, 2, stride=2) + fmap2 = F.avg_pool2d(fmap2, 2, stride=2) + self.pyramid.append((fmap1, fmap2)) + + def __call__(self, coords): + coords = coords.permute(0, 2, 3, 1) + B, H, W, _ = coords.shape + dim = self.pyramid[0][0].shape[1] + + corr_list = [] + for i in range(self.num_levels): + r = self.radius + fmap1_i = self.pyramid[0][0].permute(0, 2, 3, 1).contiguous() + fmap2_i = self.pyramid[i][1].permute(0, 2, 3, 1).contiguous() + + coords_i = (coords / 2**i).reshape(B, 1, H, W, 2).contiguous() + corr, = alt_cuda_corr.forward(fmap1_i, fmap2_i, coords_i, r) + corr_list.append(corr.squeeze(1)) + + corr = torch.stack(corr_list, dim=1) + corr = corr.reshape(B, -1, H, W) + return corr / torch.sqrt(torch.tensor(dim).float()) diff --git a/core/extractor.py b/core/extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..edd71e29bff969f1d0e5f2e6e673627811c2bc4d --- /dev/null +++ b/core/extractor.py @@ -0,0 +1,300 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ResidualBlock(nn.Module): + def __init__(self, in_planes, planes, norm_fn='group', stride=1): + super(ResidualBlock, self).__init__() + + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, stride=stride) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1) + self.relu = nn.ReLU(inplace=True) + + num_groups = planes // 8 + + if norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) + self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) + if not (stride == 1 and in_planes == planes): + self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) + + elif norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(planes) + self.norm2 = nn.BatchNorm2d(planes) + if not (stride == 1 and in_planes == planes): + self.norm3 = nn.BatchNorm2d(planes) + + elif norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(planes) + self.norm2 = nn.InstanceNorm2d(planes) + if not (stride == 1 and in_planes == planes): + self.norm3 = nn.InstanceNorm2d(planes) + + elif norm_fn == 'none': + self.norm1 = nn.Sequential() + self.norm2 = nn.Sequential() + if 
not (stride == 1 and in_planes == planes): + self.norm3 = nn.Sequential() + + if stride == 1 and in_planes == planes: + self.downsample = None + + else: + self.downsample = nn.Sequential( + nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3) + + + def forward(self, x): + y = x + y = self.conv1(y) + y = self.norm1(y) + y = self.relu(y) + y = self.conv2(y) + y = self.norm2(y) + y = self.relu(y) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x+y) + + + +class BottleneckBlock(nn.Module): + def __init__(self, in_planes, planes, norm_fn='group', stride=1): + super(BottleneckBlock, self).__init__() + + self.conv1 = nn.Conv2d(in_planes, planes//4, kernel_size=1, padding=0) + self.conv2 = nn.Conv2d(planes//4, planes//4, kernel_size=3, padding=1, stride=stride) + self.conv3 = nn.Conv2d(planes//4, planes, kernel_size=1, padding=0) + self.relu = nn.ReLU(inplace=True) + + num_groups = planes // 8 + + if norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes//4) + self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes//4) + self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) + if not stride == 1: + self.norm4 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) + + elif norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(planes//4) + self.norm2 = nn.BatchNorm2d(planes//4) + self.norm3 = nn.BatchNorm2d(planes) + if not stride == 1: + self.norm4 = nn.BatchNorm2d(planes) + + elif norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(planes//4) + self.norm2 = nn.InstanceNorm2d(planes//4) + self.norm3 = nn.InstanceNorm2d(planes) + if not stride == 1: + self.norm4 = nn.InstanceNorm2d(planes) + + elif norm_fn == 'none': + self.norm1 = nn.Sequential() + self.norm2 = nn.Sequential() + self.norm3 = nn.Sequential() + if not stride == 1: + self.norm4 = nn.Sequential() + + if stride == 1: + self.downsample = None + + else: + self.downsample = nn.Sequential( + nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm4) + + + def forward(self, x): + y = x + y = self.relu(self.norm1(self.conv1(y))) + y = self.relu(self.norm2(self.conv2(y))) + y = self.relu(self.norm3(self.conv3(y))) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x+y) + +class BasicEncoder(nn.Module): + def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0, downsample=3): + super(BasicEncoder, self).__init__() + self.norm_fn = norm_fn + self.downsample = downsample + + if self.norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64) + + elif self.norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(64) + + elif self.norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(64) + + elif self.norm_fn == 'none': + self.norm1 = nn.Sequential() + + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=1 + (downsample > 2), padding=3) + self.relu1 = nn.ReLU(inplace=True) + + self.in_planes = 64 + self.layer1 = self._make_layer(64, stride=1) + self.layer2 = self._make_layer(96, stride=1 + (downsample > 1)) + self.layer3 = self._make_layer(128, stride=1 + (downsample > 0)) + + # output convolution + self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1) + + self.dropout = None + if dropout > 0: + self.dropout = nn.Dropout2d(p=dropout) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): + if m.weight is not None: 
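+ # normalization layers start as an identity transform: affine weight set to 1 here, bias set to 0 below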
+ nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def _make_layer(self, dim, stride=1): + layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride) + layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + + def forward(self, x, dual_inp=False): + + # if input is list, combine batch dimension + is_list = isinstance(x, tuple) or isinstance(x, list) + if is_list: + batch_dim = x[0].shape[0] + x = torch.cat(x, dim=0) + + x = self.conv1(x) + x = self.norm1(x) + x = self.relu1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + + x = self.conv2(x) + + if self.training and self.dropout is not None: + x = self.dropout(x) + + if is_list: + x = x.split(split_size=batch_dim, dim=0) + + return x + +class MultiBasicEncoder(nn.Module): + def __init__(self, output_dim=[128], norm_fn='batch', dropout=0.0, downsample=3): + super(MultiBasicEncoder, self).__init__() + self.norm_fn = norm_fn + self.downsample = downsample + + if self.norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64) + + elif self.norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(64) + + elif self.norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(64) + + elif self.norm_fn == 'none': + self.norm1 = nn.Sequential() + + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=1 + (downsample > 2), padding=3) + self.relu1 = nn.ReLU(inplace=True) + + self.in_planes = 64 + self.layer1 = self._make_layer(64, stride=1) + self.layer2 = self._make_layer(96, stride=1 + (downsample > 1)) + self.layer3 = self._make_layer(128, stride=1 + (downsample > 0)) + self.layer4 = self._make_layer(128, stride=2) + self.layer5 = self._make_layer(128, stride=2) + + output_list = [] + for dim in output_dim: + conv_out = nn.Sequential( + ResidualBlock(128, 128, self.norm_fn, stride=1), + nn.Conv2d(128, dim[2], 3, padding=1)) + output_list.append(conv_out) + + self.outputs08 = nn.ModuleList(output_list) + + output_list = [] + for dim in output_dim: + conv_out = nn.Sequential( + ResidualBlock(128, 128, self.norm_fn, stride=1), + nn.Conv2d(128, dim[1], 3, padding=1)) + output_list.append(conv_out) + + self.outputs16 = nn.ModuleList(output_list) + + output_list = [] + for dim in output_dim: + conv_out = nn.Conv2d(128, dim[0], 3, padding=1) + output_list.append(conv_out) + + self.outputs32 = nn.ModuleList(output_list) + + if dropout > 0: + self.dropout = nn.Dropout2d(p=dropout) + else: + self.dropout = None + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): + if m.weight is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def _make_layer(self, dim, stride=1): + layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride) + layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + def forward(self, x, dual_inp=False, num_layers=3): + + x = self.conv1(x) + x = self.norm1(x) + x = self.relu1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + if dual_inp: + v = x + x = x[:(x.shape[0]//2)] + + outputs08 = [f(x) for f in self.outputs08] + if num_layers == 1: + return (outputs08, v) if dual_inp else (outputs08,) + + y = self.layer4(x) + outputs16 = [f(y) for f in 
self.outputs16] + + if num_layers == 2: + return (outputs08, outputs16, v) if dual_inp else (outputs08, outputs16) + + z = self.layer5(y) + outputs32 = [f(z) for f in self.outputs32] + + return (outputs08, outputs16, outputs32, v) if dual_inp else (outputs08, outputs16, outputs32) diff --git a/core/extractor_depthany.py b/core/extractor_depthany.py new file mode 100644 index 0000000000000000000000000000000000000000..70edb61bf44d46a822178a6c9e003019028dcbfa --- /dev/null +++ b/core/extractor_depthany.py @@ -0,0 +1,278 @@ +import os +import sys +import numpy as np +sys.path.insert(0,'Depth-Anything-V2') + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision.transforms as T + +from core.extractor import ResidualBlock +from depth_anything_v2.dpt import DepthAnythingV2 +from core.utils.utils import sv_intermediate_results +from huggingface_hub import hf_hub_download + + +def resize_tensor(tensor, target_size=512, ratio=16): + # 获取输入 tensor 的尺寸 (B, C, H, W) + _, _, H, W = tensor.shape + + # 计算 H 和 W 中较长的一边 + if H > W: + new_H = target_size + new_W = int(W * (target_size / H)) + else: + new_W = target_size + new_H = int(H * (target_size / W)) + + new_W = (np.ceil(new_W / ratio) * ratio).astype(int) + new_H = (np.ceil(new_H / ratio) * ratio).astype(int) + + # 使用 interpolate 进行缩放 + resized_tensor = F.interpolate(tensor, size=(new_H, new_W), mode='bicubic', align_corners=False) + + return resized_tensor + + +def resize_to_quarter(tensor, original_size, ratio): + # 将尺寸缩小为原始尺寸的 1/4 + quarter_H = original_size[0] // ratio + quarter_W = original_size[1] // ratio + + # 使用 interpolate 进行缩小 + resized_tensor = F.interpolate(tensor, size=(quarter_H, quarter_W), mode='bilinear', align_corners=False) + + return resized_tensor + + +class DepthAnyExtractor(nn.Module): + def __init__(self, model_dir, output_dim=[128], norm_fn='batch', downsample=2, args=None): + super(DepthAnyExtractor, self).__init__() + self.args = args + self.norm_fn = norm_fn + self.downsample = downsample + + output_list = [] + for dim in output_dim: + conv_out = nn.Sequential( + ResidualBlock(128, 128, self.norm_fn, stride=1), + nn.Conv2d(128, dim[2], 3, padding=1)) + output_list.append(conv_out) + + self.outputs08 = nn.ModuleList(output_list) + + output_list = [] + for dim in output_dim: + conv_out = nn.Sequential( + ResidualBlock(128, 128, self.norm_fn, stride=1), + nn.Conv2d(128, dim[1], 3, padding=1)) + output_list.append(conv_out) + + self.outputs16 = nn.ModuleList(output_list) + + output_list = [] + for dim in output_dim: + conv_out = nn.Conv2d(128, dim[0], 3, padding=1) + output_list.append(conv_out) + + self.outputs32 = nn.ModuleList(output_list) + + self.layer1 = nn.Sequential( + nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 128, kernel_size=3, stride=1, padding=1), + nn.ReLU(inplace=True), + ) + + self.in_planes = 128 + self.layer2 = self._make_layer(128, stride=2) + self.layer3 = self._make_layer(128, stride=2) + + # self._init_weights() + + model_configs = { + 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, + 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, + 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, + 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} + } + + encoder = "vitl" + depth_anything = DepthAnythingV2(**model_configs[encoder]) + + + checkpoint_path = hf_hub_download( + 
repo_id="BFZD/Diving-into-the-Fusion-of-Monocular-Priors-for-Generalized-Stereo-Matching", + filename="dav2_models/depth_anything_v2_vitl.pth", + ) + # depth_anything.load_state_dict(torch.load(os.path.join(model_dir, f'depth_anything_v2_{encoder}.pth'), + # map_location='cpu')) + depth_anything.load_state_dict(torch.load(checkpoint_path,map_location='cpu')) + # self.depth_anything = depth_anything.to('cuda') + self.depth_anything = depth_anything + + mean = [0.485, 0.456, 0.406] + std = [0.229, 0.224, 0.225] + self.mean = torch.tensor(mean).view(1, 3, 1, 1) + self.std = torch.tensor(std).view(1, 3, 1, 1) + + # 冻结 depth_anything 模型的所有参数 + for param in self.depth_anything.parameters(): + param.requires_grad = False + + # def _init_weights(self): + # for m in self.modules(): + # if isinstance(m, nn.Conv2d): + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + # elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): + # if m.weight is not None: + # nn.init.constant_(m.weight, 1) + # if m.bias is not None: + # nn.init.constant_(m.bias, 0) + + def _make_layer(self, dim, stride=1): + layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride) + layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + def forward(self, image, dual_inp=False, num_layers=3): + # resize image + B, _, H, W = image.shape + img = resize_tensor(image, target_size=518, ratio=14) + + # normalization + img = ((img+1)/2 - self.mean) / self.std + + # DepthAnything + with torch.no_grad(): + # out_depth: [1, 1, 518, 756] + # out_fea: [1, 128, 296, 432] + depth, depth_fea = self.depth_anything(img) + + # resize image + # [1, 128, H//4, W//4] + depth = resize_to_quarter(depth, (H,W), 2**self.downsample) + x = resize_to_quarter(depth_fea, (H,W), 2**self.downsample) + + if self.args is not None and hasattr(self.args, "vis_inter") and self.args.vis_inter: + sv_intermediate_results(x, "depthAnything_features", self.args.sv_root) + + x = self.layer1(x) + outputs08 = [f(x) for f in self.outputs08] + if num_layers == 1: + return (outputs08, v) if dual_inp else (outputs08,) + + # [1, 128, H//8, W//8] + y = self.layer2(x) + outputs16 = [f(y) for f in self.outputs16] + if num_layers == 2: + return (outputs08, outputs16, v) if dual_inp else (outputs08, outputs16) + + # [1, 128, H//16, W//16] + z = self.layer3(y) + outputs32 = [f(z) for f in self.outputs32] + + return (outputs08, outputs16, outputs32), depth + + + + +class DepthMatchExtractor(nn.Module): + def __init__(self, model_dir, output_dim=256, norm_fn='batch', downsample=2): + super(DepthMatchExtractor, self).__init__() + self.norm_fn = norm_fn + self.downsample = downsample + + self.layer1 = nn.Sequential( + nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 128, kernel_size=3, stride=1, padding=1), + nn.ReLU(inplace=True), + ) + + self.in_planes = 128 + self.layer2 = self._make_layer(128, stride=1) + self.conv = nn.Conv2d(128, output_dim, kernel_size=1) + + # self._init_weights() + + model_configs = { + 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, + 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, + 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, + 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} + } + + encoder = "vitl" + depth_anything = 
DepthAnythingV2(**model_configs[encoder]) + checkpoint_path = hf_hub_download( + repo_id="BFZD/Diving-into-the-Fusion-of-Monocular-Priors-for-Generalized-Stereo-Matching", + filename="dav2_models/depth_anything_v2_vitl.pth", + ) + # depth_anything.load_state_dict(torch.load(os.path.join(model_dir, f'depth_anything_v2_{encoder}.pth'), + # map_location='cpu')) + depth_anything.load_state_dict(torch.load(checkpoint_path,map_location='cpu')) + self.depth_anything = depth_anything.to('cuda') + + mean = [0.485, 0.456, 0.406] + std = [0.229, 0.224, 0.225] + self.mean = torch.tensor(mean).view(1, 3, 1, 1).cuda() + self.std = torch.tensor(std).view(1, 3, 1, 1).cuda() + + # 冻结 depth_anything 模型的所有参数 + for param in self.depth_anything.parameters(): + param.requires_grad = False + + # def _init_weights(self): + # for m in self.modules(): + # if isinstance(m, nn.Conv2d): + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + # elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): + # if m.weight is not None: + # nn.init.constant_(m.weight, 1) + # if m.bias is not None: + # nn.init.constant_(m.bias, 0) + + def _make_layer(self, dim, stride=1): + layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride) + layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + def forward(self, x, dual_inp=False, num_layers=3): + # if input is list, combine batch dimension + is_list = isinstance(x, tuple) or isinstance(x, list) + if is_list: + batch_dim = x[0].shape[0] + x = torch.cat(x, dim=0) + + # resize image + B, _, H, W = x.shape + x = resize_tensor(x, target_size=518, ratio=14) + + # normalization + x = ((x+1)/2 - self.mean) / self.std + + # DepthAnything + with torch.no_grad(): + # out_depth: [1, 1, 518, 756] + # out_fea: [1, 128, 296, 432] + depth, depth_fea = self.depth_anything(x) + + # resize image + # [1, 128, H//4, W//4] + x = resize_to_quarter(depth_fea, (H,W), 2**self.downsample) + x = self.layer1(x) + x = self.layer2(x) + x = self.conv(x) + + if is_list: + x = x.split(split_size=batch_dim, dim=0) + + return x \ No newline at end of file diff --git a/core/extractor_mast3r.py b/core/extractor_mast3r.py new file mode 100644 index 0000000000000000000000000000000000000000..9899c4c22982722c49bbe12d73aead0c8bed1584 --- /dev/null +++ b/core/extractor_mast3r.py @@ -0,0 +1,194 @@ +import os +import sys +import numpy as np +sys.path.insert(0,'mast3r') + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision.transforms as T + +from core.extractor import ResidualBlock +from mast3r.model import AsymmetricMASt3R + + + +# def resize_and_pad_tensor(tensor, target_size=512): +# # 获取输入 tensor 的尺寸 (B, C, H, W) +# _, _, H, W = tensor.shape + +# # 计算 H 和 W 中较长的一边 +# if H > W: +# new_H = target_size +# new_W = int(W * (target_size / H)) +# else: +# new_W = target_size +# new_H = int(H * (target_size / W)) + +# # 使用 interpolate 进行缩放 +# resized_tensor = F.interpolate(tensor, size=(new_H, new_W), mode='bilinear', align_corners=False) + +# # 计算是否需要填充,使得尺寸可以被16整除 +# pad_H = (16 - new_H % 16) if new_H % 16 != 0 else 0 +# pad_W = (16 - new_W % 16) if new_W % 16 != 0 else 0 + +# # 进行填充,确保两边可以被16整除 +# padding = (0, pad_W, 0, pad_H) # (left, right, top, bottom) +# padded_tensor = F.pad(resized_tensor, padding) + +# return padded_tensor + +def resize_tensor(tensor, target_size=512, ratio=16): + # 获取输入 tensor 的尺寸 (B, C, H, W) + _, _, H, W = tensor.shape + + # 计算 
H 和 W 中较长的一边 + if H > W: + new_H = target_size + new_W = int(W * (target_size / H)) + else: + new_W = target_size + new_H = int(H * (target_size / W)) + + new_W = (np.ceil(new_W / ratio) * ratio).astype(int) + new_H = (np.ceil(new_H / ratio) * ratio).astype(int) + + # 使用 interpolate 进行缩放 + resized_tensor = F.interpolate(tensor, size=(new_H, new_W), mode='bicubic', align_corners=False) + + return resized_tensor + + +def resize_to_quarter(tensor, original_size, ratio): + # 将尺寸缩小为原始尺寸的 1/4 + quarter_H = original_size[0] // ratio + quarter_W = original_size[1] // ratio + + # 使用 interpolate 进行缩小 + resized_tensor = F.interpolate(tensor, size=(quarter_H, quarter_W), mode='bilinear', align_corners=False) + + return resized_tensor + + +class Mast3rExtractor(nn.Module): + def __init__(self, model_name, output_dim=128, norm_fn='batch', downsample=2): + super(Mast3rExtractor, self).__init__() + self.norm_fn = norm_fn + self.downsample = downsample + + if self.norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64) + elif self.norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(64) + elif self.norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(64) + elif self.norm_fn == 'none': + self.norm1 = nn.Sequential() + + # self.layer1 = nn.Sequential( + # nn.Conv2d(32, 64, kernel_size=7, stride=1, padding=3), + # self.norm1, + # nn.ReLU(inplace=True), + # ) + + self.layer1 = nn.Sequential( + nn.Conv2d(32, 64, kernel_size=7, stride=1, padding=3), + self.norm1, + nn.ReLU(inplace=True), + nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1), + self.norm1, + nn.ReLU(inplace=True), + ) + + self.in_planes = 64 + self.layer2 = self._make_layer(128, stride=1) + + # output convolution + self.conv = nn.Conv2d(128, output_dim, kernel_size=1) + + # self._init_weights() + + self.mast3r = AsymmetricMASt3R.from_pretrained(model_name).to('cuda') + + # 冻结 Mast3r 模型的所有参数 + for param in self.mast3r.parameters(): + param.requires_grad = False + + # def _init_weights(self): + # for m in self.modules(): + # if isinstance(m, nn.Conv2d): + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + # elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): + # if m.weight is not None: + # nn.init.constant_(m.weight, 1) + # if m.bias is not None: + # nn.init.constant_(m.bias, 0) + + # def _make_layer(self, dim, stride=1): + # layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride) + # layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1) + # layers = (layer1, layer2) + + # self.in_planes = dim + # return nn.Sequential(*layers) + + # def _make_layer(self, dim, stride=1): + # layer1 = ResidualBlock(self.in_planes, self.in_planes, self.norm_fn, stride=stride) + # layer2 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=1) + # layer3 = ResidualBlock(dim, dim, self.norm_fn, stride=1) + # layers = (layer1, layer2, layer3) + + # self.in_planes = dim + # return nn.Sequential(*layers) + + def _make_layer(self, dim, stride=1): + layer1 = ResidualBlock(self.in_planes, self.in_planes, self.norm_fn, stride=stride) + layer1 = ResidualBlock(self.in_planes, self.in_planes, self.norm_fn, stride=stride) + layer2 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=1) + layer3 = ResidualBlock(dim, dim, self.norm_fn, stride=1) + layer3 = ResidualBlock(dim, dim, self.norm_fn, stride=1) + layers = (layer1, layer2, layer3) + + self.in_planes = dim + return nn.Sequential(*layers) + + def forward(self, image1, image2, dual_inp=False): + # resize image + B, _, H, W = 
image1.shape + image1 = resize_tensor(image1) + image2 = resize_tensor(image2) + + # data format for MaSt3R + _, _, H1, W1 = image1.shape + view1 = dict(img=image1, + true_shape=torch.tensor([[H1,W1]], dtype=torch.int32).to(image1.device), + idx=B, instance=str(B)) + view2 = dict(img=image2, + true_shape=torch.tensor([[H1,W1]], dtype=torch.int32).to(image1.device), + idx=B, instance=str(B)) + + # mast3r + with torch.no_grad(): + pred1, pred2 = self.mast3r(view1, view2) + + # fetch features + ## 3, 1, 24, 1 + fea1 = [pred1['pts3d'], pred1['conf'][...,None], pred1['desc'], pred1['desc_conf'][...,None]] + fea1 = torch.cat(fea1, dim=-1).permute((0,3,1,2)) + fea1 = torch.cat([image1, fea1], dim=1) # 32 + fea2 = [pred2['pts3d_in_other_view'], pred2['conf'][...,None], pred2['desc'], pred2['desc_conf'][...,None]] + fea2 = torch.cat(fea2, dim=-1).permute((0,3,1,2)) + fea2 = torch.cat([image2, fea2], dim=1) # 32 + x = torch.cat([fea1,fea2], dim=0) # 32 + + # resize image + x = resize_to_quarter(x, (H,W), 2**self.downsample) + + # conv + x = self.layer1(x) + x = self.layer2(x) + x = self.conv(x) + + x = x.split(split_size=B, dim=0) + + return x \ No newline at end of file diff --git a/core/extractor_metric3d.py b/core/extractor_metric3d.py new file mode 100644 index 0000000000000000000000000000000000000000..d0cdf6d230f2bdf7bda81123aeafc42d8a5bdd05 --- /dev/null +++ b/core/extractor_metric3d.py @@ -0,0 +1,337 @@ +import os +import sys +import numpy as np +sys.path.insert(0,'Metric3D') + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision.transforms as T + +from attrdict import AttrDict + +from core.extractor import ResidualBlock +from depth_anything_v2.dpt import DepthAnythingV2 +from core.utils.utils import sv_intermediate_results + + + +def resize_tensor(tensor, target_size=512, ratio=16): + # 获取输入 tensor 的尺寸 (B, C, H, W) + _, _, H, W = tensor.shape + + # 计算 H 和 W 中较长的一边 + if H > W: + new_H = target_size + new_W = int(W * (target_size / H)) + else: + new_W = target_size + new_H = int(H * (target_size / W)) + + new_W = (np.ceil(new_W / ratio) * ratio).astype(int) + new_H = (np.ceil(new_H / ratio) * ratio).astype(int) + + # 使用 interpolate 进行缩放 + resized_tensor = F.interpolate(tensor, size=(new_H, new_W), mode='bicubic', align_corners=False) + + return resized_tensor + + +def resize_to_quarter(tensor, original_size, ratio): + # 将尺寸缩小为原始尺寸的 1/4 + quarter_H = original_size[0] // ratio + quarter_W = original_size[1] // ratio + + # 使用 interpolate 进行缩小 + resized_tensor = F.interpolate(tensor, size=(quarter_H, quarter_W), mode='bilinear', align_corners=False) + + return resized_tensor + + + +from mono.utils.comm import get_func + +class Metric3DExtractor(nn.Module): + def __init__(self, args) -> None: + super(Metric3DExtractor, self).__init__() + self.args = args + + cfg = dict( + model = dict( + type='DensePredModel', + backbone=dict( + type='vit_large_reg', + prefix='backbones.', + out_channels=[1024, 1024, 1024, 1024], + drop_path_rate = 0.0, + checkpoint="./pretrained/metric3d/dinov2_vitl14_reg4_pretrain.pth", + ), + decode_head=dict( + type='RAFTDepthNormalDPT5', + # type='RAFTDepthDPT', + prefix='decode_heads.', + in_channels=[1024, 1024, 1024, 1024], + use_cls_token=True, + feature_channels = [256, 512, 1024, 1024], # [2/7, 1/7, 1/14, 1/14] + decoder_channels = [128, 256, 512, 1024, 1024], # [4/7, 2/7, 1/7, 1/14, 1/14] + up_scale = 7, + hidden_channels=[128, 128, 128, 128], # [x_4, x_8, x_16, x_32] [192, 384, 768, 1536] + n_gru_layers=3, + n_downsample=2, + 
iters=8, + slow_fast_gru=True, + num_register_tokens=4, + # detach=False + ), + ), + + data_basic = dict( + canonical_space = dict( + # img_size=(540, 960), + focal_length=1000.0, + ), + depth_range=(0, 1), + depth_normalize=(0.1, 200), + crop_size = (616, 1064), # %28 = 0 + clip_depth_range=(0.1, 200), + vit_size=(616,1064) + ), + ) + self.cfg = AttrDict(cfg) + + self.encoder = get_func('mono.model.' + self.cfg.model.backbone.prefix + self.cfg.model.backbone.type)(**self.cfg.model.backbone) + self.decoder = get_func('mono.model.' + self.cfg.model.decode_head.prefix + self.cfg.model.decode_head.type)(self.cfg) + # print(get_func('mono.model.' + self.cfg.model.backbone.prefix + self.cfg.model.backbone.type)) + # print(self.encoder) + + self.hidden_dims = self.cfg.model.decode_head.hidden_channels + self.n_gru_layers = self.cfg.model.decode_head.n_gru_layers + self.inp_convs = nn.ModuleList([ + nn.Sequential( + nn.Conv2d(self.hidden_dims[i]*3, self.hidden_dims[i]*3, kernel_size=3, stride=1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(self.hidden_dims[i]*3, self.hidden_dims[i]*3, kernel_size=3, stride=1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(self.hidden_dims[i]*3, self.hidden_dims[i]*3, kernel_size=3, stride=1, padding=1), + ) for i in range(self.n_gru_layers) + ]) + self.net_convs = nn.ModuleList([ + nn.Sequential( + nn.Conv2d(self.hidden_dims[i], self.hidden_dims[i], 3, padding=3//2), + nn.ReLU(inplace=True), + nn.Conv2d(self.hidden_dims[i], self.hidden_dims[i], 3, padding=3//2), + nn.ReLU(inplace=True), + nn.Conv2d(self.hidden_dims[i], self.hidden_dims[i], 3, padding=3//2), + ) for i in range(self.n_gru_layers) + ]) + + load_path = "./pretrained/metric3d/metric_depth_vit_large_800k.pth" + checkpoint = torch.load(load_path, map_location="cpu") + state_dict = checkpoint['model_state_dict'] + + encoder_state_dict = {k.replace("depth_model.encoder.", ""): v for k, v in state_dict.items() if k.startswith("depth_model.encoder")} + decoder_state_dict = {k.replace("depth_model.decoder.", ""): v for k, v in state_dict.items() if k.startswith("depth_model.decoder")} + + self.encoder.load_state_dict(encoder_state_dict) + self.decoder.load_state_dict(decoder_state_dict) + + self.encoder = self.encoder.to('cuda') + self.decoder = self.decoder.to('cuda') + + # 冻结 depth_anything 模型的所有参数 + for param in self.encoder.parameters(): + param.requires_grad = False + for param in self.decoder.parameters(): + param.requires_grad = False + + + mean = [123.675, 116.28, 103.53] + std = [58.395, 57.12, 57.375] + self.mean = torch.tensor(mean).view(1, 3, 1, 1).cuda() + self.std = torch.tensor(std).view(1, 3, 1, 1).cuda() + self.pad_val = torch.tensor(mean).view(1, 3, 1, 1).cuda() + + + def forward(self, rgb, intrinsic, baseline=1): + + with torch.no_grad(): + focal_length = (intrinsic[:, 0] + intrinsic[:, 1]) / 2 + rgb_input, cam_model_stacks, pad, label_scale_factor, (ori_h, ori_w) = self.aug_data(rgb, intrinsic) + + # [f_32, f_16, f_8, f_4] + features = self.encoder(rgb_input) + output = self.decoder(features, cam_model=cam_model_stacks) + + # outputs=dict( + # prediction=flow_predictions[-1], + # predictions_list=flow_predictions, + # confidence=conf_predictions[-1], + # confidence_list=conf_predictions, + # pred_logit=None, + # # samples_pred_list=samples_pred_list, + # # coord_list=coord_list, + # prediction_normal=norma`l_outs[-1], + # normal_out_list=normal_outs, + # low_resolution_init=low_resolution_init, + # net_list = net_list, + # inp_list = inp_list, + # ) + pred_depth, confidence = 
output['prediction'], output['confidence'] + net_list, inp_list = output['net_list'], output['inp_list'] + + B, C, H_new, W_new = pred_depth.shape + normalize_scale = self.cfg.data_basic.depth_range[1] + pred_depth = pred_depth[:, :, pad[0] : H_new - pad[1], pad[2] : W_new - pad[3]] + pred_depth = F.interpolate(pred_depth, [ori_h, ori_w], mode='bilinear') # to original size + # print("-"*10, f"pred_depth: {pred_depth.shape}, confidence: {confidence.shape}", pred_depth.max(), pred_depth.min()) + pred_depth = pred_depth * normalize_scale / label_scale_factor.unsqueeze(1).unsqueeze(1).unsqueeze(1) + # print("-"*10, pred_depth.max(), pred_depth.min(), normalize_scale, label_scale_factor, baseline, focal_length) + + pred_disp = (baseline * focal_length).unsqueeze(1).unsqueeze(1).unsqueeze(1) / pred_depth + pred_disp_down = F.interpolate(pred_disp, scale_factor=1/2**self.cfg.model.decode_head.n_downsample, mode='bilinear') * (1/2**self.cfg.model.decode_head.n_downsample) + # print("*"*30, rgb.shape, rgb_input.shape, pred_depth.shape, confidence.shape, pred_disp_down.max(), pred_disp_down.min()) + + + # with autocast(enabled=self.args.mixed_precision): + net_list = [F.interpolate(x, size=(ori_h//(2**(self.cfg.model.decode_head.n_downsample+i)), + ori_w//(2**(self.cfg.model.decode_head.n_downsample+i))), + mode='bilinear', align_corners=False) for i, x in enumerate(net_list)] + inp_list = [F.interpolate(torch.cat(x,dim=1), + size=(ori_h//(2**(self.cfg.model.decode_head.n_downsample+i)), + ori_w//(2**(self.cfg.model.decode_head.n_downsample+i))), + mode='bilinear', align_corners=False) for i, x in enumerate(inp_list)] + # Update the hidden states and context features + net_list = [conv(x) for x, conv in zip(net_list, self.net_convs)] + inp_list = [list( conv(x).chunk(3, dim=1) ) for x, conv in zip(inp_list, self.inp_convs)] + + return net_list, inp_list, pred_disp_down + + + def aug_data(self, rgb, intrinsic): + B, C, ori_h, ori_w = rgb.shape + ori_focal = (intrinsic[:,0] + intrinsic[:,1]) / 2 + canonical_focal = self.cfg.data_basic['canonical_space']['focal_length'] + cano_label_scale_ratio = canonical_focal / ori_focal # Shape: (B,) + + canonical_intrinsic = torch.stack([ + intrinsic[:,0] * cano_label_scale_ratio, + intrinsic[:,1] * cano_label_scale_ratio, + intrinsic[:,2], + intrinsic[:,3], + ], dim=1) + + # resize + rgb, cam_model, pad, resize_label_scale_ratio = resize_for_input(rgb, self.cfg.data_basic.crop_size, canonical_intrinsic, [ori_h, ori_w], 1.0, self.pad_val) + + # label scale factor + label_scale_factor = cano_label_scale_ratio * resize_label_scale_ratio # Shape: (B,) + + rgb = torch.div(((rgb+1)/2*255 - self.mean), self.std) + + cam_model = cam_model.permute((0, 3, 1, 2)).float() + cam_model = cam_model.cuda() + cam_model_stacks = [ + torch.nn.functional.interpolate(cam_model, size=(cam_model.shape[2]//i, cam_model.shape[3]//i), mode='bilinear', align_corners=False) + for i in [2, 4, 8, 16, 32] + ] + + return rgb, cam_model_stacks, pad, label_scale_factor, (ori_h, ori_w) + + +def resize_for_input(image, output_shape, intrinsic, canonical_shape, to_canonical_ratio, pad_values): + """ + Resize the input using PyTorch tensors. 
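+ Scales the image by a single factor chosen so canonical_shape fits inside output_shape, pads the result up to output_shape with pad_values, and rescales the intrinsic parameters to match. + Returns the padded image, the 4-channel camera model map, the per-side padding, and the label scale factor (1 / to_scale_ratio).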
+ """ + h, w = image.shape[-2:] + + resize_ratio_h = output_shape[0] / canonical_shape[0] + resize_ratio_w = output_shape[1] / canonical_shape[1] + to_scale_ratio = min(resize_ratio_h, resize_ratio_w) + + resize_ratio = to_canonical_ratio * to_scale_ratio + + reshape_h = int(resize_ratio * h) + reshape_w = int(resize_ratio * w) + + pad_h = max(output_shape[0] - reshape_h, 0) + pad_w = max(output_shape[1] - reshape_w, 0) + pad_h_half = pad_h // 2 + pad_w_half = pad_w // 2 + + # Resize image + image = F.interpolate(image, size=(reshape_h, reshape_w), mode='bilinear', align_corners=False) + + # Padding + # image = F.pad(image, (pad_w_half, pad_w - pad_w_half, pad_h_half, pad_h - pad_h_half), value=pad_values) + image = pad_with_channel_values(image, (pad_w_half, pad_w - pad_w_half, pad_h_half, pad_h - pad_h_half), pad_values) + + # Adjust intrinsic parameters + intrinsic[:, 2] *= to_scale_ratio # fx + intrinsic[:, 3] *= to_scale_ratio # fy + + # Build camera model (dummy implementation, replace with actual function) + cam_model = build_camera_model(reshape_h, reshape_w, intrinsic) + cam_model = F.pad(cam_model, (pad_w_half, pad_w - pad_w_half, pad_h_half, pad_h - pad_h_half), value=-1) + + pad = [pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half] + label_scale_factor = 1 / to_scale_ratio + + return image, cam_model, pad, label_scale_factor + +def pad_with_channel_values(input_tensor, padding, pad_values): + if isinstance(padding, int): + pad_left = pad_right = pad_top = pad_bottom = padding + else: + pad_left, pad_right, pad_top, pad_bottom = padding + + B, C, H, W = input_tensor.shape + new_H = H + pad_top + pad_bottom + new_W = W + pad_left + pad_right + + pad_values = pad_values.view(1, C, 1, 1) + + padded_tensor = pad_values.expand(B, C, new_H, new_W).clone() + + # 计算中间区域并复制数据 + h_start, h_end = pad_top, new_H - pad_bottom + w_start, w_end = pad_left, new_W - pad_right + padded_tensor[:, :, h_start:h_end, w_start:w_end] = input_tensor + + return padded_tensor + + +def build_camera_model(H: int, W: int, intrinsics: torch.Tensor) -> torch.Tensor: + """ + Encode the camera intrinsic parameters (focal length and principle point) to a 4-channel map. 
+ Args: + H (int): Image height + W (int): Image width + intrinsics (torch.Tensor): Tensor of shape (B, 4) containing fx, fy, u0, v0 + Returns: + torch.Tensor: Camera model tensor of shape (B, H, W, 4) + """ + B = intrinsics.shape[0] + fx, fy, u0, v0 = intrinsics[:, 0:1], intrinsics[:, 1:2], intrinsics[:, 2:3], intrinsics[:, 3:4] + f = (fx + fy) / 2.0 # Shape: (B,1) + + # Generate normalized coordinate grids + x_row = torch.arange(W, dtype=torch.float32, device=intrinsics.device).view(1, W) + y_col = torch.arange(H, dtype=torch.float32, device=intrinsics.device).view(1, H) + + # Normalize based on principal point + x_center = (x_row - u0) / W # Shape: (B, W) + y_center = (y_col - v0) / H # Shape: (B, H) + + # Expand dimensions for batch processing + x_center = x_center.unsqueeze(1).expand(B, H, W) # Shape: (B, H, W) + y_center = y_center.unsqueeze(2).expand(B, H, W) # Shape: (B, H, W) + + # Compute FoV angles + fov_x = torch.atan(x_center / (f.unsqueeze(1) / W)) # Shape: (B, H, W) + fov_y = torch.atan(y_center / (f.unsqueeze(1) / H)) # Shape: (B, H, W) + + # Stack channels + cam_model = torch.stack([x_center, y_center, fov_x, fov_y], dim=-1) # Shape: (B, H, W, 4) + + return cam_model + diff --git a/core/fusion.py b/core/fusion.py new file mode 100644 index 0000000000000000000000000000000000000000..9df9a3249abda6ce42062b3bb61d4471a2d74c57 --- /dev/null +++ b/core/fusion.py @@ -0,0 +1,179 @@ +import os +import sys +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.init as init +import torch.nn.functional as F +from torch.distributions import Beta + +from core.extractor import ResidualBlock +from core.confidence import EfficientUNetSimple +from core.utils.utils import sv_intermediate_results + + + +class FusionDepth(nn.Module): + def __init__(self, args, norm_fn='batch', ): + super(FusionDepth, self).__init__() + self.args = args + self.norm_fn = norm_fn + + self.conv1 = nn.Sequential( + nn.Conv2d(3, 4, kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(4, 4, kernel_size=3, padding=1, bias=True), + ) + self.down = nn.Sequential( + ResidualBlock(4, 2*4, self.norm_fn, stride=2), + ResidualBlock(2*4, 2*4, self.norm_fn, stride=1) + ) + self.up = nn.ConvTranspose2d(2*4, 4, kernel_size=2, stride=2) + self.conv2 = nn.Sequential( + nn.Conv2d(8, 4, kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(4, 1, kernel_size=3, padding=1, bias=True), + ) + + + def forward(self, disp, depth, delta_disp): + x = disp + x1 = self.conv1( torch.cat([disp, depth, delta_disp], dim=1) ) + + x2 = self.up(self.down(x1)) + + x3 = self.conv2( torch.cat([x1,x2], dim=1) ) + + return x3 + + +class UpdateHistory(nn.Module): + def __init__(self, args, in_chans1, in_chans2): + super(UpdateHistory, self).__init__() + self.conv = nn.Conv2d(in_chans2, in_chans2, kernel_size=1, stride=1, padding=0) + self.update = nn.Sequential(nn.Conv2d(in_chans1+in_chans2, in_chans1, kernel_size=3, stride=1, padding=1),) + + def forward(self, his, disp): + hist_update = self.update( torch.cat([his,self.conv(disp)], dim=1) ) + return hist_update + + +class BetaModulator(nn.Module): + def __init__(self, args, lbp_dim, hidden_dim=None, norm_fn='batch'): + super(BetaModulator, self).__init__() + self.norm_fn = norm_fn + self.modulation_ratio = args.modulation_ratio + # self.conv_depth = nn.Sequential( + # nn.Conv2d(8, 16, kernel_size=1, padding=0, bias=True), + # nn.ReLU(inplace=True), + # nn.Conv2d(16, 16, kernel_size=3, padding=1, bias=True), + # ) + # self.conv_disp = 
nn.Sequential( + # nn.Conv2d(8, 16, kernel_size=1, padding=0, bias=True), + # nn.ReLU(inplace=True), + # nn.Conv2d(16, 16, kernel_size=3, padding=1, bias=True), + # ) + if hidden_dim is None: + hidden_dim = lbp_dim + self.conv1 = nn.Sequential( + nn.Conv2d(lbp_dim*2, hidden_dim*2, kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(hidden_dim*2, hidden_dim*2, kernel_size=3, padding=1, bias=True), + ) + down_dim = 64 if hidden_dim*2<64 else 128 + self.down = nn.Sequential( + ResidualBlock(hidden_dim*2, down_dim, self.norm_fn, stride=2), + ResidualBlock(down_dim, 128, self.norm_fn, stride=1) + ) + self.up = nn.ConvTranspose2d(128, hidden_dim*2, kernel_size=2, stride=2) + self.conv2 = nn.Sequential( + nn.Conv2d(hidden_dim*4, hidden_dim, kernel_size=3, padding=1, bias=False), + nn.Softplus(), + nn.Conv2d(hidden_dim, 2, kernel_size=1, padding=0, bias=False), + nn.Softplus(), + ) + + def forward(self, lbp_disp, lbp_depth, out_distribution=False): + x1 = self.conv1( torch.cat([lbp_disp, lbp_depth], dim=1) ) + x2 = self.up(self.down(x1)) + beta_paras = self.conv2( torch.cat([x1,x2], dim=1) ) + 1 # enforcing alpha>=1, beta>=1 + + # build Beta distribution + alpha, beta = torch.split(beta_paras, 1, dim=1) + distribution = Beta(alpha, beta) + + if self.training: + modulation = distribution.rsample() + else: + modulation = distribution.mean + + if not out_distribution: + return modulation + return modulation, distribution + + # # modulation = modulation*2 - 1 + # modulation_rescale = 1 + modulation * (self.modulation_ratio * itr_ratio) # we hope modulation has less effect at the first several iterations as the disp is unreliable and the lcoal LBP disp is unreliable + # return modulation_rescale + + + +class RefinementMonStereo(nn.Module): + def __init__(self, args, norm_fn='batch', hidden_dim=32): + super(RefinementMonStereo, self).__init__() + self.args = args + + corr_channel = self.args.corr_levels * (self.args.corr_radius*2 + 1) + if not args.conf_from_fea: + conf_in_dim = corr_channel + else: + conf_in_dim = corr_channel + hidden_dim + 2 + self.conf_estimate = nn.Sequential( + nn.Conv2d(conf_in_dim, 128, 3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 128, 3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 1, 1, padding=0),) + self.norm_conf = nn.Sigmoid() + + if self.args.refine_unet: + self.mono_params_estimate = EfficientUNetSimple(num_classes=2) + else: + self.mono_params_estimate = nn.Sequential( + nn.Conv2d(2, 32, 3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(32, 32, 3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(32, 2, 1, padding=0)) + if self.args.refine_pool: + self.mono_params_estimate.add_module("global_avg_pool", nn.AdaptiveAvgPool2d((1, 1))) + + factor = 2**self.args.n_downsample + self.mask = nn.Sequential( + nn.Conv2d(hidden_dim+1, 256, 3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, (factor**2)*9, 1, padding=0)) + + def forward(self, disp, depth, hidden, cost_volume, Beta_distribution=None): + if not self.args.conf_from_fea: + conf = self.conf_estimate(cost_volume) + else: + conf = self.conf_estimate( torch.cat([cost_volume,hidden,Beta_distribution.mean,Beta_distribution.variance], dim=1) ) + conf_normed = self.norm_conf(conf) + + mono_params = self.mono_params_estimate( torch.cat([disp, depth], dim=1) ) + a, b = torch.split(mono_params, 1, dim=1) + depth_registered = depth * a + b + + disp = disp * conf_normed + (1-conf_normed) * depth_registered + + up_mask= self.mask( torch.cat([hidden, disp], dim=1) ) + + if self.args 
is not None and hasattr(self.args, "vis_inter") and self.args.vis_inter: + sv_intermediate_results(disp, f"disp_refine", self.args.sv_root) + sv_intermediate_results(depth_registered, f"depth_registered", self.args.sv_root) + sv_intermediate_results(conf_normed, f"conf", self.args.sv_root) + sv_intermediate_results(a, f"a", self.args.sv_root) + sv_intermediate_results(b, f"b", self.args.sv_root) + + return disp, up_mask, depth_registered, conf \ No newline at end of file diff --git a/core/geometry.py b/core/geometry.py new file mode 100644 index 0000000000000000000000000000000000000000..ed938172220349a0b238773e2820d6638051c10f --- /dev/null +++ b/core/geometry.py @@ -0,0 +1,193 @@ +import re +import os +import sys +import logging +import numpy as np +from collections import OrderedDict + +# logging.basicConfig(level=logging.INFO, +# format='%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',) + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from core.utils.plane import convert2patch + + + +class Geometry_MLP(nn.Module): + def __init__(self, args): + super(Geometry_MLP, self).__init__() + self.args = args + self.reg = nn.Sequential( + nn.Linear(3,3), + nn.Linear(3,2), + ) + + if args.geo_fusion.lower()=="max": + self.fusion = nn.AdaptiveMaxPool1d(1) + elif args.geo_fusion.lower()=="mean": + self.fusion = nn.AdaptiveAvgPool1d(1) + else: + raise Exception(f"{args.geo_fusion} is not supported") + + def forward(self, img_coord, flow_up): + # (1,4,factor*factor,H,W) + factor = 2 ** self.args.n_downsample + fit_points = torch.cat([img_coord, flow_up], dim=1) + fit_points = convert2patch(fit_points, + patch_size=factor, + div_last=False) # (1,3,factor*factor,H,W) + + A = fit_points[:,:3].permute((0,2,3,4,1)) # (1,factor*factor,H,W,3) + ab_proposals = self.reg(A) # (1,factor*factor,H,W,2) + B,L,H,W,C = ab_proposals.shape + ab = self.fusion(ab_proposals.view(B,L,-1).transpose(-1,-2)) # (1,H*W*2,1) + ab = ab.view(B,H,W,C).permute((0,3,1,2)) # (1,2,H,W) + geo = torch.cat([disparity[:,:1],ab], dim=1) + return ab + + +class Geometry_Conv(nn.Module): + def __init__(self, args): + super(Geometry_Conv, self).__init__() + self.args = args + self.reg = nn.Sequential( + nn.Conv2d(3, 4, kernel_size=3, padding=1, stride=1), + nn.LeakyReLU(inplace=True), + nn.Conv2d(4, 8, kernel_size=3, padding=1, stride=2), + nn.LeakyReLU(inplace=True), + nn.Conv2d(8, 5, kernel_size=3, padding=1, stride=2), + nn.LeakyReLU(inplace=True), + nn.Conv2d(5, 5, kernel_size=1, padding=0, stride=1), + ) + + def forward(self, img_coord, disparity_up, disparity): + # img_coord: (1,2,H*factor,W*factor) + # disparity_up: (1,1,H*factor,W*factor) + # disparity: (1,1,H,W) + # factor = 2 ** self.args.n_downsample + + # points = torch.cat([img_coord, disparity_up], dim=1) # (1,3,factor*H,factor*W) + points = torch.cat([img_coord, disparity_up.detach()], dim=1) # (1,3,factor*H,factor*W) + + rest_params = self.reg(points) # (1,5,H,W) + params = torch.cat([disparity,rest_params], dim=1) # (1,6,H,W) + return params + + +class Geometry_Conv_Split(nn.Module): + def __init__(self, args): + super(Geometry_Conv_Split, self).__init__() + self.args = args + self.encode = nn.Sequential( + nn.Conv2d(3, 4, kernel_size=3, padding=1, stride=1), + nn.LeakyReLU(inplace=True), + nn.Conv2d(4, 8, kernel_size=3, padding=1, stride=2), + nn.LeakyReLU(inplace=True), + ) + self.decode_plane = nn.Sequential( + nn.Conv2d(8, 4, kernel_size=3, padding=1, stride=2), + nn.LeakyReLU(inplace=True), + nn.Conv2d(4, 2, kernel_size=1, 
padding=0, stride=1), + ) + self.decode_curvature = nn.Sequential( + nn.Conv2d(8, 4, kernel_size=3, padding=1, stride=2), + nn.LeakyReLU(inplace=True), + nn.Conv2d(4, 3, kernel_size=1, padding=0, stride=1), + ) + + def forward(self, img_coord, disparity_up, disparity): + # img_coord: (1,2,H*factor,W*factor) + # disparity_up: (1,1,H*factor,W*factor) + # disparity: (1,1,H,W) + # factor = 2 ** self.args.n_downsample + points = torch.cat([img_coord, disparity_up], dim=1) # (1,3,factor*H,factor*W) + + latten = self.encode(points) # (1,8,factor*H/2,factor*W/2) + plane_ab = self.decode_plane(latten) # (1,2,H,W) + hessian_g = self.decode_curvature(latten) # (1,3,H,W) + params = torch.cat([disparity,plane_ab,hessian_g], dim=1) # (1,6,H,W) + return params + + +class LBPEncoder(nn.Module): + """ + Computes the modified Local Binary Patterns (LBP) of an image using custom neighbor offsets. + """ + def __init__(self, args): + super(LBPEncoder, self).__init__() + self.args = args + self.lbp_neighbor_offsets = self._parse_offsets(self.args.lbp_neighbor_offsets) + + self._build_lbp_kernel() + self.sigmoid = nn.Sigmoid() + + def _build_lbp_kernel(self): + # Determine the kernel size based on the maximum offset + self.num_neighbors = len(self.lbp_neighbor_offsets) + self.max_offset = int(np.abs(self.lbp_neighbor_offsets).max()) + self.kernel_size = 2 * self.max_offset + 1 + self.padding = self.max_offset + + # Initialize the convolution layer for depthwise convolution + self.lbp_conv = nn.Conv2d( + in_channels=1, + out_channels=self.num_neighbors, + kernel_size=self.kernel_size, + padding=self.padding, + padding_mode="replicate", + bias=False, + groups=1 # Since in_channels=1, groups=1 makes it depthwise + ) + + self.lbp_weight = torch.zeros(self.num_neighbors, 1, + self.kernel_size, self.kernel_size).float() + center_y, center_x = self.max_offset, self.max_offset + for idx, (dy, dx) in enumerate(self.lbp_neighbor_offsets): + # Compute the position in the kernel for the neighbor + y, x = center_y + dy, center_x + dx + if 0 <= y < self.kernel_size and 0 <= x < self.kernel_size: + self.lbp_weight[idx, 0, y, x] = 1.0 + self.lbp_weight[idx, 0, center_y, center_x] = -1.0 + else: + raise ValueError(f"Offset ({dy}, {dx}) is out of kernel bounds.") + + # Assign the weight to the convolution layer + self.lbp_conv.weight = nn.Parameter(self.lbp_weight) + self.lbp_conv.weight.requires_grad = False # Ensure weights are not updated during training + + def _parse_offsets(self, offsets_str): + """ + Parses a string to extract neighbor offsets. + + Parameters: + offsets_str (str): String defining neighbor offsets, e.g., "(-1,-1), (1,1), (-1,1), (1,-1)" + + Returns: + list of tuples: List of neighbor offsets. + """ + # extract coordinate pairs + pattern = r'\((-?\d+),\s*(-?\d+)\)' + matches = re.findall(pattern, offsets_str) + if not matches: + raise ValueError(offsets_str + ": not suppoted format, please check it!") + offsets = [(int(y), int(x)) for y, x in matches] + return np.array(offsets) + + + def forward(self, img): + """ + Parameters: + img (torch.Tensor): Grayscale image tensor of shape [N, 1, H, W]. + Returns: + torch.Tensor: Modified LBP image of shape [N, C, H, W]. 
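+        Illustrative example (the actual offsets come from args.lbp_neighbor_offsets; the
+        string below is only an assumption): with lbp_neighbor_offsets = "(-1,-1), (-1,1), (1,-1), (1,1)"
+        the kernel size is 3, num_neighbors is 4, and a [N, 1, H, W] input yields a
+        [N, 4, H, W] encoding whose k-th channel is sigmoid(img[y+dy_k, x+dx_k] - img[y, x]).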
+ """ + with torch.no_grad(): + # Apply convolution to compute differences directly + differences = self.lbp_conv(img) # Shape: [1, N, H, W] due to padding + + # Apply sigmoid to the differences to get encoding values between 0 and 1 + encoding = self.sigmoid(differences) # Shape: [1, N, H, W] + return encoding diff --git a/core/loss.py b/core/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..c877ba6279440fdf926cea81f362dd09f7f4f55f --- /dev/null +++ b/core/loss.py @@ -0,0 +1,363 @@ +import os +import sys +import logging +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from core.utils.utils import coords_grid, disparity_computation +from core.utils.utils import LoggerCommon + +logger = LoggerCommon("LOSS") + +try: + autocast = torch.cuda.amp.autocast +except: + # dummy autocast for PyTorch < 1.6 + class autocast: + def __init__(self, enabled): + pass + def __enter__(self): + pass + def __exit__(self, *args): + pass + +def sequence_loss(flow_preds, flow_gt, valid, loss_gamma=0.9, max_flow=700): + """ Loss function defined over sequence of flow predictions """ + + n_predictions = len(flow_preds) + assert n_predictions >= 1 + flow_loss = 0.0 + + # exlude invalid pixels and extremely large diplacements + mag = torch.sum(flow_gt**2, dim=1).sqrt() + + # exclude extremly large displacements + valid = ((valid >= 0.5) & (mag < max_flow)).unsqueeze(1) + assert valid.shape == flow_gt.shape, [valid.shape, flow_gt.shape] + assert not torch.isinf(flow_gt[valid.bool()]).any() + + for i in range(n_predictions): + if not torch.isnan(flow_preds[i]).any() and not torch.isinf(flow_preds[i]).any(): + # We adjust the loss_gamma so it is consistent for any number of RAFT-Stereo iterations + adjusted_loss_gamma = loss_gamma**(15/(n_predictions - 1)) + i_weight = adjusted_loss_gamma**(n_predictions - i - 1) + i_loss = (flow_preds[i] - flow_gt).abs() + assert i_loss.shape == valid.shape, [i_loss.shape, valid.shape, flow_gt.shape, flow_preds[i].shape] + flow_loss += i_weight * i_loss[valid.bool()].mean() + + epe = torch.sum((flow_preds[-1] - flow_gt)**2, dim=1).sqrt() + epe = epe.view(-1)[valid.view(-1)] + + metrics = { + 'epe': epe.mean().item(), + '1px': (epe < 1).float().mean().item(), + '3px': (epe < 3).float().mean().item(), + '5px': (epe < 5).float().mean().item(), + } + + return flow_loss, metrics + + +def my_loss(res, flow_gt, valid, loss_gamma=0.9, max_flow=700): + pass + +class Loss(nn.Module): + def __init__(self, loss_gamma=0.9, max_flow=700, loss_zeta=0.3, + smoothness=None, slant=None, slant_norm=False, + ner_kernel_size=3, ner_weight_reduce=False, + local_rank=None, mixed_precision=True, + args=None): + super(Loss, self).__init__() + self.loss_gamma = loss_gamma + self.loss_zeta = loss_zeta + self.max_flow = max_flow + self.smoothness = smoothness + self.mixed_precision = mixed_precision + self.conf_disp = args.conf_disp + self.args = args + + if self.smoothness is not None and len(self.smoothness)>0: + self.smooth_loss_computer = SmoothLoss(self.smoothness, + slant=slant, + slant_norm=slant_norm, + kernel_size=ner_kernel_size, + ner_weight_reduce=ner_weight_reduce) + + logger.info(f"smoothness: {smoothness}, " +\ + f"slant: {slant}, slant_norm: {slant_norm}, " +\ + f"ner_kernel_size: {ner_kernel_size}, " +\ + f"ner_weight_reduce: {ner_weight_reduce}, " +\ + f"conf_disp: {self.conf_disp}. 
" ) + + def forward(self, flow_preds, flow_gt, valid, + disp_preds=None, disp_preds_refine=None, + confidence_list=None, + params_list=None, params_list_refine=None, + plane_abc=None, + imgL=None, imgR=None, + global_batch_num=None,): + """ Loss function defined over sequence of flow predictions """ + n_predictions = len(flow_preds) + assert n_predictions >= 1 + flow_loss = 0.0 + disp_loss = 0.0 + disp_refine_loss = 0.0 + smooth_loss = 0.0 + confidence_loss = 0.0 + params_loss = 0.0 + params_refine_loss = 0.0 + + # exlude invalid pixels and extremely large diplacements + mag = torch.sum(flow_gt**2, dim=1).sqrt() + + # exclude extremly large displacements + valid = ((valid >= 0.5) & (mag < self.max_flow)).unsqueeze(1) + assert valid.shape == flow_gt.shape, [valid.shape, flow_gt.shape] + assert not torch.isinf(flow_gt[valid.bool()]).any() + + for i in range(n_predictions): + assert not torch.isnan(flow_preds[i]).any() and not torch.isinf(flow_preds[i]).any() + # We adjust the loss_gamma so it is consistent for any number of RAFT-Stereo iterations + adjusted_loss_gamma = self.loss_gamma**(15/(n_predictions - 1)) + i_weight = adjusted_loss_gamma**(n_predictions - i - 1) + + # confidence loss + if confidence_list[i] is not None and \ + (self.args.offset_memory_last_iter<0 or \ + (self.args.offset_memory_last_iter>0 and i<=self.args.offset_memory_last_iter)): + with autocast(enabled=self.mixed_precision): + gt_error = (flow_preds[i].detach() - flow_gt).abs().detach() + gt_error = F.interpolate(gt_error,scale_factor=1/4,mode='bilinear') + # confidence_loss += i_weight * F.smooth_l1_loss(confidence_list[i], gt_error) + # confidence_loss += i_weight * F.binary_cross_entropy_with_logits(confidence_list[i], + # torch.sigmoid(gt_error-4)) + gt_conf = (gt_error>4).float() + weight = torch.pow(F.sigmoid(confidence_list[i])-gt_conf, 2) + tmp_confidence_loss = (1+gt_conf*0.5) * weight *\ + F.binary_cross_entropy_with_logits(confidence_list[i], + gt_conf, reduction='none') + confidence_loss += i_weight * tmp_confidence_loss.mean() + + # flow loss + i_loss = (flow_preds[i] - flow_gt).abs() + if self.conf_disp and global_batch_num>3 and confidence_list[i] is not None: + weight = F.interpolate(confidence_list[i],scale_factor=4,mode='bilinear') + i_loss = i_loss * (F.sigmoid(weight.detach()/3)*1.5 + 1) + assert i_loss.shape == valid.shape, [i_loss.shape, valid.shape, flow_gt.shape, flow_preds[i].shape] + flow_loss += i_weight * i_loss[valid.bool()].mean() + + # disparity loss + if disp_preds is not None and len(disp_preds)>0 and disp_preds[i] is not None: + i_loss = (-disp_preds[i] - flow_gt).abs() + disp_loss += i_weight * i_loss[valid.bool()].mean() + + # plane loss + if params_list is not None and len(params_list)>0 and plane_abc is not None and plane_abc.shape[1]==3: + # print("~"*30, params_list[-1].shape, plane_abc.shape) + i_loss = (params_list[i] - plane_abc).abs() + params_loss += i_weight * 0.5 * i_loss.mean() + + # refinement loss + if disp_preds_refine is not None and len(disp_preds_refine)>0 and disp_preds_refine[i] is not None: + i_loss = (-disp_preds_refine[i] - flow_gt).abs() + disp_refine_loss += i_weight * i_loss[valid.bool()].mean() + + # plane loss + if params_list_refine is not None and len(params_list_refine)>0 and plane_abc is not None and plane_abc.shape[1]==3: + # print("~"*30, params_list_refine[-1].shape, plane_abc.shape) + i_loss = (params_list_refine[i] - plane_abc).abs() + params_refine_loss += i_weight * 0.5 * i_loss.mean() + + if i>n_predictions//2: + with 
autocast(enabled=self.mixed_precision): + if self.smoothness=="gradient": + smooth_loss += i_weight * self.smooth_loss_computer(flow_preds[i], imgL).mean() + elif self.smoothness=="curvature": + smooth_loss += i_weight * self.smooth_loss_computer(params_list[i], imgL).mean() + + epe = torch.sum((flow_preds[-1] - flow_gt)**2, dim=1).sqrt() + epe = epe.view(-1)[valid.view(-1)] + + metrics = { + 'epe': epe.mean().item(), + '1px': (epe < 1).float().mean().item(), + '3px': (epe < 3).float().mean().item(), + '5px': (epe < 5).float().mean().item(), + } + + if disp_preds is not None and len(disp_preds)>0 and disp_preds[-1] is not None: + epe = torch.sum((-disp_preds[-1] - flow_gt)**2, dim=1).sqrt() + epe = epe.view(-1)[valid.view(-1)] + metrics.update({'epe_disp': epe.mean().item(), + '3px_disp': (epe < 3).float().mean().item(),}) + + if disp_preds_refine is not None and len(disp_preds_refine)>0 and disp_preds_refine[-1] is not None: + epe = torch.sum((-disp_preds_refine[-1] - flow_gt)**2, dim=1).sqrt() + epe = epe.view(-1)[valid.view(-1)] + metrics.update({'epe_disp_refine': epe.mean().item(), + '3px_disp_refine': (epe < 3).float().mean().item(),}) + + if self.smoothness is not None and len(self.smoothness)>0: + loss = flow_loss + disp_loss + params_loss + disp_refine_loss + params_refine_loss + confidence_loss + self.loss_zeta * smooth_loss + else: + loss = flow_loss + disp_loss + params_loss + disp_refine_loss + params_refine_loss + confidence_loss + smooth_loss = torch.Tensor([0.0]).to(flow_loss.device) + + return loss, metrics, flow_loss, disp_loss, disp_refine_loss, confidence_loss, smooth_loss, params_loss, params_refine_loss + + +class SmoothLoss(nn.Module): + """Smooth constaint for prediction. + - gradient-based smooth regularization: + \psi_{pq} = max(w_{pq},\epsilon) min(\hat{\psi}_{pq}(f_p,f_q), \tau_{dis}) \\ + w_{pq} = e^{-||I_L(p)-I_L(q)||_1 / \eta} \\ + \hat{\psi}_{pq} = |d_p(f_p) - d_q(f_q)| \\ + d_p(f_p) = a_p p_u + b_p p_v + c_p \\ + d_q(f_q) = a_q q_u + b_q q_v + c_q + - curvature-based smooth regularization: + \psi_{pq} = max(w_{pq},\epsilon) min(\hat{\psi}_{pq}(f_p,f_q), \tau_{dis}) \\ + w_{pq} = e^{-||I_L(p)-I_L(q)||_1 / \eta} \\ + \hat{\psi}_{pq} = |d_p(f_p) - d_p(f_q)| + |d_q(f_q) - d_q(f_p)| \\ + d_p(f_p) = a_p p_u + b_p p_v + c_p \\ + d_p(f_q) = a_p q_u + b_p q_v + c_p + """ + def __init__(self, smoothness, slant=None, slant_norm=False, kernel_size=3, + ner_weight_reduce=False, epsilon=0.01, tau=3, eta=10): + super(SmoothLoss, self).__init__() + self.smoothness = smoothness + self.slant = slant + self.slant_norm = slant_norm + + self.eta = eta + self.tau = tau + self.epsilon = epsilon + + self.reduce = ner_weight_reduce + self.img_ner_extractor = NerghborExtractor(3, kernel_size, reduce=self.reduce) + self.coord_ner_extractor = NerghborExtractor(2, kernel_size) + self.params_ner_extractor = NerghborExtractor(3, kernel_size) + + def forward(self, params, imgL): + """Function: compute smoothe loss + args: + params: (B,3,H,W) + imgL: (B,3,H,W) + coordL: (B,2,H,W) + corrdR: (B,2,H,W) + """ + img_ner = self.img_ner_extractor(imgL) # B,3,N,H,W + B, _, H, W = imgL.shape + coord = coords_grid(B, H, W).to(imgL.device) # B,2,H,W + coord_ner = self.coord_ner_extractor(coord) # B,2,N,H,W + coord = coord.unsqueeze(2) # B,2,1,H,W + params_ner = self.params_ner_extractor(params) # B,3,N,H,W + params = params.unsqueeze(2) # B,3,1,H,W + + # w_{pq} = e^{-||I_L(p)-I_L(q)||_1 / \eta} + if not self.reduce: + weight = torch.exp(-torch.abs(img_ner-imgL.unsqueeze(2)).mean(dim=1) / self.eta) # 
B,N,H,W + else: + weight = torch.exp(-torch.abs(img_ner).mean(dim=1) / self.eta) # B,N,H,W + + if self.smoothness=="gradient": + # \hat{\psi}_{pq} = |d_p(f_p) - d_q(f_q)| + psi_p = disparity_computation(params, coords0=coord, + slant=self.slant, slant_norm=self.slant_norm) - \ + disparity_computation(params_ner, coords0=coord_ner, + slant=self.slant, slant_norm=self.slant_norm) + psi = torch.abs(psi_p) # B,N,H,W + elif self.smoothness=="curvature": + # |d_p(f_p) - d_p(f_q)| + psi_p = disparity_computation(params, coords0=coord, + slant=self.slant, slant_norm=self.slant_norm) - \ + disparity_computation(params, coords0=coord_ner, + slant=self.slant, slant_norm=self.slant_norm) + # d_q(f_q) - d_q(f_p) + psi_q = disparity_computation(params_ner, coords0=coord_ner, + slant=self.slant, slant_norm=self.slant_norm) - \ + disparity_computation(params_ner, coords0=coord, + slant=self.slant, slant_norm=self.slant_norm) + # \hat{\psi} = |d_p(f_p) - d_p(f_q)| + |d_q(f_q) - d_q(f_p)| + psi = torch.abs(psi_p) + torch.abs(psi_q) # B,N,H,W + + # \psi_{pq} = max(w_{pq},\epsilon) min(\hat{\psi_{pq}(f_p,f_q)}, \tau_{dis}) + smooth_loss = torch.clip(weight, min=self.epsilon,) * \ + F.sigmoid(psi/self.tau*8-4) * self.tau + smooth_loss = smooth_loss.mean() + return smooth_loss + + +def diamond(n): + a = np.arange(n) + b = np.minimum(a,a[::-1]) + return (b[:,None]+b)>=(n-1)//2 +def diamond_edge(n): + arr = np.diagflat(np.ones(n//2+1), n//2) + arr = np.maximum(arr,np.flip(arr,1)) + return np.maximum(arr,np.flip(arr,0)) +kernel_dict = {} +kernel_dict["diamond"] = diamond +kernel_dict["diamond_edge"] = diamond_edge + +class NerghborExtractor(nn.Module): + """Extarct the neighbors of each pixel using depthwise convolution. + Input: (B,C,H,W), Output: (B,C,N,H,W). + """ + def __init__(self, input_channel, kernel_size=3, reduce=False): + super(NerghborExtractor, self).__init__() + self.reduce = reduce + self.input_channel = input_channel + + # build kernel matrix + if isinstance(kernel_size, int): + H, W = kernel_size, kernel_size + self.neighbors_num = kernel_size*kernel_size + neighbor_kernel = np.zeros((self.neighbors_num, H, W), dtype=np.float16) + for idx in range(self.neighbors_num): + neighbor_kernel[idx, idx//H, idx%W] = 1 + + elif isinstance(kernel_size, str): + ## obatin the compressed kernel + kernel_type, size = kernel_size.split("-") + kernel_size = int(size) + compressed_kernel = kernel_dict[kernel_type](kernel_size) + ## decode the compressed kernel into a series of kernels + H, W = compressed_kernel.shape + self.neighbors_num = np.count_nonzero(compressed_kernel) + neighbors_pos = np.nonzero(compressed_kernel) + neighbor_kernel = np.zeros((self.neighbors_num, H, W), dtype=np.float16) + for idx_k, (idx_h, idx_w) in enumerate(zip(neighbors_pos[0],neighbors_pos[1])): + neighbor_kernel[idx_k, idx_h, idx_w] = compressed_kernel[idx_h, idx_w] + else: + raise Exception("kernel_size currently only supports integer") + if self.reduce: + neighbor_kernel[:, H//2, W//2] = -1 + + if not self.reduce: + neighbor_kernel = np.tile(neighbor_kernel, (input_channel,1,1)) + neighbor_kernel = neighbor_kernel[:,np.newaxis] # in*neighbors_num, 1, k, k + output_channel = input_channel*self.neighbors_num + groups = input_channel + else: + neighbor_kernel = np.tile(neighbor_kernel[:, np.newaxis], + (1,input_channel,1,1)) # neighbors_num, in, k, k + output_channel = self.neighbors_num + groups = 1 + + # extract neighbors through depthwise conv + self.conv = nn.Conv2d(input_channel, output_channel, + kernel_size=kernel_size, 
padding=kernel_size//2, bias=False, + groups=groups, padding_mode="reflect") + neighbor_kernel = torch.Tensor(neighbor_kernel) + self.conv.weight = nn.Parameter(neighbor_kernel, requires_grad=False) + + def forward(self, x): + B,C,H,W = x.shape + neighbors = self.conv(x) + neighbors = neighbors.reshape((B,-1,self.neighbors_num,H,W)) + if self.reduce: + neighbors = neighbors / self.input_channel + return neighbors diff --git a/core/raft_stereo.py b/core/raft_stereo.py new file mode 100644 index 0000000000000000000000000000000000000000..9f323c9a5f3bf661a76e16c0d6732d22d39efe83 --- /dev/null +++ b/core/raft_stereo.py @@ -0,0 +1,144 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from core.update import BasicMultiUpdateBlock +from core.extractor import BasicEncoder, MultiBasicEncoder, ResidualBlock +from core.corr import CorrBlock1D, PytorchAlternateCorrBlock1D, CorrBlockFast1D, AlternateCorrBlock +from core.utils.utils import coords_grid, upflow8 + + +try: + autocast = torch.cuda.amp.autocast +except: + # dummy autocast for PyTorch < 1.6 + class autocast: + def __init__(self, enabled): + pass + def __enter__(self): + pass + def __exit__(self, *args): + pass + +class RAFTStereo(nn.Module): + def __init__(self, args): + super().__init__() + self.args = args + + context_dims = args.hidden_dims + + self.cnet = MultiBasicEncoder(output_dim=[args.hidden_dims, context_dims], norm_fn=args.context_norm, downsample=args.n_downsample) + self.update_block = BasicMultiUpdateBlock(self.args, hidden_dims=args.hidden_dims) + + self.context_zqr_convs = nn.ModuleList([nn.Conv2d(context_dims[i], args.hidden_dims[i]*3, 3, padding=3//2) for i in range(self.args.n_gru_layers)]) + + if args.shared_backbone: + self.conv2 = nn.Sequential( + ResidualBlock(128, 128, 'instance', stride=1), + nn.Conv2d(128, 256, 3, padding=1)) + else: + self.fnet = BasicEncoder(output_dim=256, norm_fn='instance', downsample=args.n_downsample) + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_flow(self, img): + """ Flow is represented as difference between two coordinate grids flow = coords1 - coords0""" + N, _, H, W = img.shape + + coords0 = coords_grid(N, H, W).to(img.device) + coords1 = coords_grid(N, H, W).to(img.device) + + return coords0, coords1 + + def upsample_flow(self, flow, mask): + """ Upsample flow field [H/8, W/8, 2] -> [H, W, 2] using convex combination """ + N, D, H, W = flow.shape + factor = 2 ** self.args.n_downsample + mask = mask.view(N, 1, 9, factor, factor, H, W) + mask = torch.softmax(mask, dim=2) + + up_flow = F.unfold(factor * flow, [3,3], padding=1) + up_flow = up_flow.view(N, D, 9, 1, 1, H, W) + + up_flow = torch.sum(mask * up_flow, dim=2) + up_flow = up_flow.permute(0, 1, 4, 2, 5, 3) + return up_flow.reshape(N, D, factor*H, factor*W) + + + def forward(self, image1, image2, iters=12, flow_init=None, test_mode=False, vis_mode=False, intrinsic=None): + """ Estimate optical flow between pair of frames """ + + image1 = (2 * (image1 / 255.0) - 1.0).contiguous() + image2 = (2 * (image2 / 255.0) - 1.0).contiguous() + + # run the context network + with autocast(enabled=self.args.mixed_precision): + if self.args.shared_backbone: + *cnet_list, x = self.cnet(torch.cat((image1, image2), dim=0), dual_inp=True, num_layers=self.args.n_gru_layers) + fmap1, fmap2 = self.conv2(x).split(dim=0, split_size=x.shape[0]//2) + else: + cnet_list = self.cnet(image1, num_layers=self.args.n_gru_layers) + fmap1, fmap2 = self.fnet([image1, image2]) 
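+            # tanh of the first split initializes the GRU hidden state at each resolution,
+            # while relu of the second split forms the static context input; context_zqr_convs
+            # below projects that context once into the three per-gate inputs of the GRU
+            # (the z/q/r in its name), so this projection is not recomputed every iteration.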
+ net_list = [torch.tanh(x[0]) for x in cnet_list] + inp_list = [torch.relu(x[1]) for x in cnet_list] + + # Rather than running the GRU's conv layers on the context features multiple times, we do it once at the beginning + inp_list = [list(conv(i).split(split_size=conv.out_channels//3, dim=1)) for i,conv in zip(inp_list, self.context_zqr_convs)] + + if self.args.corr_implementation == "reg": # Default + corr_block = CorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "alt": # More memory efficient than reg + corr_block = PytorchAlternateCorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "reg_cuda": # Faster version of reg + corr_block = CorrBlockFast1D + elif self.args.corr_implementation == "alt_cuda": # Faster version of alt + corr_block = AlternateCorrBlock + corr_fn = corr_block(fmap1, fmap2, radius=self.args.corr_radius, num_levels=self.args.corr_levels) + + coords0, coords1 = self.initialize_flow(net_list[0]) + + if flow_init is not None: + coords1 = coords1 + flow_init + + flow_predictions = [] + for itr in range(iters): + coords1 = coords1.detach() + corr = corr_fn(coords1) # index correlation volume + flow = coords1 - coords0 + with autocast(enabled=self.args.mixed_precision): + if self.args.n_gru_layers == 3 and self.args.slow_fast_gru: # Update low-res GRU + net_list = self.update_block(net_list, inp_list, iter32=True, iter16=False, iter08=False, update=False) + if self.args.n_gru_layers >= 2 and self.args.slow_fast_gru:# Update low-res GRU and mid-res GRU + net_list = self.update_block(net_list, inp_list, iter32=self.args.n_gru_layers==3, iter16=True, iter08=False, update=False) + net_list, up_mask, delta_flow = self.update_block(net_list, inp_list, corr, flow, iter32=self.args.n_gru_layers==3, iter16=self.args.n_gru_layers>=2) + + # in stereo mode, project flow onto epipolar + delta_flow[:,1] = 0.0 + + # F(t+1) = F(t) + \Delta(t) + coords1 = coords1 + delta_flow + + # We do not need to upsample or output intermediate results in test_mode + if test_mode and itr < iters-1: + continue + + # upsample predictions + if up_mask is None: + flow_up = upflow8(coords1 - coords0) + else: + flow_up = self.upsample_flow(coords1 - coords0, up_mask) + flow_up = flow_up[:,:1] + + flow_predictions.append(flow_up) + + if test_mode: + return coords1 - coords0, flow_up + + if vis_mode: + return {"disp_predictions": flow_predictions} + + return flow_predictions \ No newline at end of file diff --git a/core/raft_stereo_depth_postfusion.py b/core/raft_stereo_depth_postfusion.py new file mode 100644 index 0000000000000000000000000000000000000000..67a8a5385f4e6d744af6baac301a26603e978397 --- /dev/null +++ b/core/raft_stereo_depth_postfusion.py @@ -0,0 +1,173 @@ +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from core.update_disp import DispBasicMultiUpdateBlock +from core.extractor import BasicEncoder, ResidualBlock +from core.extractor_depthany import DepthAnyExtractor +from core.corr import CorrBlock1D, PytorchAlternateCorrBlock1D, CorrBlockFast1D, AlternateCorrBlock +from core.utils.utils import hor_coords_grid, rescale_modulation +from core.geometry import LBPEncoder +from core.fusion import BetaModulator, RefinementMonStereo + + +try: + autocast = torch.cuda.amp.autocast +except: + # dummy autocast for PyTorch < 1.6 + class autocast: + def __init__(self, enabled): + pass + def __enter__(self): + pass + def __exit__(self, *args): + pass + +class 
RAFTStereoDepthPostFusion(nn.Module): + def __init__(self, args): + super(RAFTStereoDepthPostFusion, self).__init__() + self.args = args + + context_dims = args.hidden_dims + + self.cnet = DepthAnyExtractor(model_dir=args.depthany_model_dir, + output_dim=[args.hidden_dims, context_dims], + norm_fn=args.context_norm, + downsample=args.n_downsample) + self.update_block = DispBasicMultiUpdateBlock(self.args, hidden_dims=args.hidden_dims) + + self.context_zqr_convs = nn.ModuleList([nn.Conv2d(context_dims[i], args.hidden_dims[i]*3, 3, padding=3//2) for i in range(self.args.n_gru_layers)]) + + self.refinement = RefinementMonStereo(args, hidden_dim=args.hidden_dims[-1]) + + if args.shared_backbone: + self.conv2 = nn.Sequential( + ResidualBlock(128, 128, 'instance', stride=1), + nn.Conv2d(128, 256, 3, padding=1)) + else: + self.fnet = BasicEncoder(output_dim=256, norm_fn='instance', downsample=args.n_downsample) + + # 冻结 除refinement以外 模块的所有参数 + for module in [self.cnet, self.update_block, self.context_zqr_convs, self.fnet]: + for param in module.parameters(): + param.requires_grad = False + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_disp(self, img): + """ Disparity is represented as difference between two horizontal coordinate grids disp = hor_coords1 - hor_coords0""" + N, _, H, W = img.shape + + hor_coords0 = hor_coords_grid(N, H, W).to(img.device) + hor_coords1 = hor_coords_grid(N, H, W).to(img.device) + + return hor_coords0, hor_coords1 + + def upsample_disp(self, disp, mask): + """ Upsample disp field [H/8, W/8, 1] -> [H, W, 1] using convex combination """ + N, D, H, W = disp.shape + factor = 2 ** self.args.n_downsample + mask = mask.view(N, 1, 9, factor, factor, H, W) + mask = torch.softmax(mask, dim=2) + + up_disp = F.unfold(factor * disp, [3,3], padding=1) + up_disp = up_disp.view(N, D, 9, 1, 1, H, W) + + up_disp = torch.sum(mask * up_disp, dim=2) + up_disp = up_disp.permute(0, 1, 4, 2, 5, 3) + return up_disp.reshape(N, D, factor*H, factor*W) + + + + def forward(self, image1, image2, iters=12, disp_init=None, test_mode=False, vis_mode=False, intrinsic=None): + """ Estimate optical flow between pair of frames """ + + image1 = (2 * (image1 / 255.0) - 1.0).contiguous() + image2 = (2 * (image2 / 255.0) - 1.0).contiguous() + + # run the context network + with autocast(enabled=self.args.mixed_precision): + if self.args.shared_backbone: + *cnet_list, x = self.cnet(torch.cat((image1, image2), dim=0), dual_inp=True, num_layers=self.args.n_gru_layers) + fmap1, fmap2 = self.conv2(x).split(dim=0, split_size=x.shape[0]//2) + else: + # cnet_list: [[(128,248,360), (128,248,360)], [(128,124,180),(128,124,180)], [(128,62,90),(128,62,90)]] + cnet_list, depth = self.cnet(image1, num_layers=self.args.n_gru_layers) + # fmap1: (128,248,360), fmap2: (128,248,360) + fmap1, fmap2 = self.fnet([image1, image2]) + + net_list = [torch.tanh(x[0]) for x in cnet_list] + inp_list = [torch.relu(x[1]) for x in cnet_list] + + # Rather than running the GRU's conv layers on the context features multiple times, we do it once at the beginning + inp_list = [list(conv(i).split(split_size=conv.out_channels//3, dim=1)) for i,conv in zip(inp_list, self.context_zqr_convs)] + + if self.args.corr_implementation == "reg": # Default + corr_block = CorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "alt": # More memory efficient than reg + corr_block = PytorchAlternateCorrBlock1D + fmap1, fmap2 = fmap1.float(), 
fmap2.float() + elif self.args.corr_implementation == "reg_cuda": # Faster version of reg + corr_block = CorrBlockFast1D + elif self.args.corr_implementation == "alt_cuda": # Faster version of alt + corr_block = AlternateCorrBlock + corr_fn = corr_block(fmap1, fmap2, radius=self.args.corr_radius, num_levels=self.args.corr_levels) + + hor_coords0, hor_coords1 = self.initialize_disp(net_list[0]) + + if disp_init is not None: + hor_coords1 = hor_coords1 + disp_init + + disp_predictions = [] + for itr in range(iters): + hor_coords1 = hor_coords1.detach() + corr = corr_fn(hor_coords1) # index correlation volume + disp = hor_coords1 - hor_coords0 + + with autocast(enabled=self.args.mixed_precision): + if self.args.n_gru_layers == 3 and self.args.slow_fast_gru: # Update low-res GRU + net_list = self.update_block(net_list, inp_list, iter32=True, iter16=False, iter08=False, update=False) + if self.args.n_gru_layers >= 2 and self.args.slow_fast_gru:# Update low-res GRU and mid-res GRU + net_list = self.update_block(net_list, inp_list, iter32=self.args.n_gru_layers==3, iter16=True, iter08=False, update=False) + net_list, up_mask, delta_disp = self.update_block(net_list, inp_list, corr, disp, iter32=self.args.n_gru_layers==3, iter16=self.args.n_gru_layers>=2) + + # F(t+1) = F(t) + \Delta(t) + hor_coords1 = hor_coords1 + delta_disp + + # We do not need to upsample or output intermediate results in test_mode + if test_mode and itr < iters-1: + continue + + # upsample predictions + disp_up = self.upsample_disp(hor_coords1 - hor_coords0, up_mask) + + disp_predictions.append(disp_up) + + + # refinement + corr = corr_fn(hor_coords1) + disp = -hor_coords1 + hor_coords0 + disp_refine, up_mask, depth_registered, conf = self.refinement(disp, depth, net_list[0], corr) + disp_up = self.upsample_disp(-disp_refine, up_mask) + depth_registered_up = self.upsample_disp(-depth_registered, up_mask) + disp_predictions.append(depth_registered_up) + disp_predictions.append(disp_up) + + if test_mode: + return hor_coords1 - hor_coords0, disp_up + + # if test_mode: + # return hor_coords1 - hor_coords0, depth_registered_up + + if vis_mode: + return {"disp_predictions": disp_predictions, + "depth": depth, } + + return {"disp_predictions": disp_predictions, + "conf": conf} \ No newline at end of file diff --git a/core/raft_stereo_depthany.py b/core/raft_stereo_depthany.py new file mode 100644 index 0000000000000000000000000000000000000000..90db9bc60418b493269bdddf9d82cb31ced9184d --- /dev/null +++ b/core/raft_stereo_depthany.py @@ -0,0 +1,147 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from core.update_disp import DispBasicMultiUpdateBlock +from core.extractor import BasicEncoder, ResidualBlock +from core.extractor_depthany import DepthAnyExtractor +from core.corr import CorrBlock1D, PytorchAlternateCorrBlock1D, CorrBlockFast1D, AlternateCorrBlock +from core.utils.utils import hor_coords_grid + + +try: + autocast = torch.cuda.amp.autocast +except: + # dummy autocast for PyTorch < 1.6 + class autocast: + def __init__(self, enabled): + pass + def __enter__(self): + pass + def __exit__(self, *args): + pass + +class RAFTStereoDepthAny(nn.Module): + def __init__(self, args): + super(RAFTStereoDepthAny, self).__init__() + self.args = args + + context_dims = args.hidden_dims + + self.cnet = DepthAnyExtractor(model_dir=args.depthany_model_dir, + output_dim=[args.hidden_dims, context_dims], + norm_fn=args.context_norm, + downsample=args.n_downsample) + self.update_block = DispBasicMultiUpdateBlock(self.args, 
hidden_dims=args.hidden_dims) + + self.context_zqr_convs = nn.ModuleList([nn.Conv2d(context_dims[i], args.hidden_dims[i]*3, 3, padding=3//2) for i in range(self.args.n_gru_layers)]) + + if args.shared_backbone: + self.conv2 = nn.Sequential( + ResidualBlock(128, 128, 'instance', stride=1), + nn.Conv2d(128, 256, 3, padding=1)) + else: + self.fnet = BasicEncoder(output_dim=256, norm_fn='instance', downsample=args.n_downsample) + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_disp(self, img): + """ Disparity is represented as difference between two horizontal coordinate grids disp = hor_coords1 - hor_coords0""" + N, _, H, W = img.shape + + hor_coords0 = hor_coords_grid(N, H, W).to(img.device) + hor_coords1 = hor_coords_grid(N, H, W).to(img.device) + + return hor_coords0, hor_coords1 + + def upsample_disp(self, disp, mask): + """ Upsample disp field [H/8, W/8, 1] -> [H, W, 1] using convex combination """ + N, D, H, W = disp.shape + factor = 2 ** self.args.n_downsample + mask = mask.view(N, 1, 9, factor, factor, H, W) + mask = torch.softmax(mask, dim=2) + + up_disp = F.unfold(factor * disp, [3,3], padding=1) + up_disp = up_disp.view(N, D, 9, 1, 1, H, W) + + up_disp = torch.sum(mask * up_disp, dim=2) + up_disp = up_disp.permute(0, 1, 4, 2, 5, 3) + return up_disp.reshape(N, D, factor*H, factor*W) + + + def forward(self, image1, image2, iters=12, disp_init=None, test_mode=False, vis_mode=False, intrinsic=None): + """ Estimate optical flow between pair of frames """ + + image1 = (2 * (image1 / 255.0) - 1.0).contiguous() + image2 = (2 * (image2 / 255.0) - 1.0).contiguous() + + # run the context network + with autocast(enabled=self.args.mixed_precision): + if self.args.shared_backbone: + *cnet_list, x = self.cnet(torch.cat((image1, image2), dim=0), dual_inp=True, num_layers=self.args.n_gru_layers) + fmap1, fmap2 = self.conv2(x).split(dim=0, split_size=x.shape[0]//2) + else: + # cnet_list: [[(128,248,360), (128,248,360)], [(128,124,180),(128,124,180)], [(128,62,90),(128,62,90)]] + cnet_list, depth = self.cnet(image1, num_layers=self.args.n_gru_layers) + # fmap1: (128,248,360), fmap2: (128,248,360) + fmap1, fmap2 = self.fnet([image1, image2]) + + # from IPython import embed + # embed() + + net_list = [torch.tanh(x[0]) for x in cnet_list] + inp_list = [torch.relu(x[1]) for x in cnet_list] + + # Rather than running the GRU's conv layers on the context features multiple times, we do it once at the beginning + inp_list = [list(conv(i).split(split_size=conv.out_channels//3, dim=1)) for i,conv in zip(inp_list, self.context_zqr_convs)] + + if self.args.corr_implementation == "reg": # Default + corr_block = CorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "alt": # More memory efficient than reg + corr_block = PytorchAlternateCorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "reg_cuda": # Faster version of reg + corr_block = CorrBlockFast1D + elif self.args.corr_implementation == "alt_cuda": # Faster version of alt + corr_block = AlternateCorrBlock + corr_fn = corr_block(fmap1, fmap2, radius=self.args.corr_radius, num_levels=self.args.corr_levels) + + hor_coords0, hor_coords1 = self.initialize_disp(net_list[0]) + + if disp_init is not None: + hor_coords1 = hor_coords1 + disp_init + + disp_predictions = [] + for itr in range(iters): + hor_coords1 = hor_coords1.detach() + corr = corr_fn(hor_coords1) # index correlation volume + disp = hor_coords1 - 
hor_coords0 + with autocast(enabled=self.args.mixed_precision): + if self.args.n_gru_layers == 3 and self.args.slow_fast_gru: # Update low-res GRU + net_list = self.update_block(net_list, inp_list, iter32=True, iter16=False, iter08=False, update=False) + if self.args.n_gru_layers >= 2 and self.args.slow_fast_gru:# Update low-res GRU and mid-res GRU + net_list = self.update_block(net_list, inp_list, iter32=self.args.n_gru_layers==3, iter16=True, iter08=False, update=False) + net_list, up_mask, delta_disp = self.update_block(net_list, inp_list, corr, disp, iter32=self.args.n_gru_layers==3, iter16=self.args.n_gru_layers>=2) + + # F(t+1) = F(t) + \Delta(t) + hor_coords1 = hor_coords1 + delta_disp + + # We do not need to upsample or output intermediate results in test_mode + if test_mode and itr < iters-1: + continue + + # upsample predictions + disp_up = self.upsample_disp(hor_coords1 - hor_coords0, up_mask) + + disp_predictions.append(disp_up) + + if test_mode: + return hor_coords1 - hor_coords0, disp_up + + if vis_mode: + return {"disp_predictions": disp_predictions, } + + return {"disp_predictions": disp_predictions,} \ No newline at end of file diff --git a/core/raft_stereo_depthbeta.py b/core/raft_stereo_depthbeta.py new file mode 100644 index 0000000000000000000000000000000000000000..d4ed25705a211b874a8d1faa9d908f5f09a13679 --- /dev/null +++ b/core/raft_stereo_depthbeta.py @@ -0,0 +1,173 @@ +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from core.update_disp import DispBasicMultiUpdateBlock +from core.extractor import BasicEncoder, ResidualBlock +from core.extractor_depthany import DepthAnyExtractor +from core.corr import CorrBlock1D, PytorchAlternateCorrBlock1D, CorrBlockFast1D, AlternateCorrBlock +from core.utils.utils import hor_coords_grid, rescale_modulation +from core.geometry import LBPEncoder +from core.fusion import BetaModulator + + +try: + autocast = torch.cuda.amp.autocast +except: + # dummy autocast for PyTorch < 1.6 + class autocast: + def __init__(self, enabled): + pass + def __enter__(self): + pass + def __exit__(self, *args): + pass + +class RAFTStereoDepthBeta(nn.Module): + def __init__(self, args): + super(RAFTStereoDepthBeta, self).__init__() + self.args = args + + context_dims = args.hidden_dims + + self.cnet = DepthAnyExtractor(model_dir=args.depthany_model_dir, + output_dim=[args.hidden_dims, context_dims], + norm_fn=args.context_norm, + downsample=args.n_downsample) + self.update_block = DispBasicMultiUpdateBlock(self.args, hidden_dims=args.hidden_dims) + + self.lbp_encoder = LBPEncoder(args=args) + self.modulater = BetaModulator(args, lbp_dim=self.lbp_encoder.num_neighbors) + + self.context_zqr_convs = nn.ModuleList([nn.Conv2d(context_dims[i], args.hidden_dims[i]*3, 3, padding=3//2) for i in range(self.args.n_gru_layers)]) + + if args.shared_backbone: + self.conv2 = nn.Sequential( + ResidualBlock(128, 128, 'instance', stride=1), + nn.Conv2d(128, 256, 3, padding=1)) + else: + self.fnet = BasicEncoder(output_dim=256, norm_fn='instance', downsample=args.n_downsample) + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_disp(self, img): + """ Disparity is represented as difference between two horizontal coordinate grids disp = hor_coords1 - hor_coords0""" + N, _, H, W = img.shape + + hor_coords0 = hor_coords_grid(N, H, W).to(img.device) + hor_coords1 = hor_coords_grid(N, H, W).to(img.device) + + return hor_coords0, hor_coords1 + + def upsample_disp(self, 
disp, mask): + """ Upsample disp field [H/8, W/8, 1] -> [H, W, 1] using convex combination """ + N, D, H, W = disp.shape + factor = 2 ** self.args.n_downsample + mask = mask.view(N, 1, 9, factor, factor, H, W) + mask = torch.softmax(mask, dim=2) + + up_disp = F.unfold(factor * disp, [3,3], padding=1) + up_disp = up_disp.view(N, D, 9, 1, 1, H, W) + + up_disp = torch.sum(mask * up_disp, dim=2) + up_disp = up_disp.permute(0, 1, 4, 2, 5, 3) + return up_disp.reshape(N, D, factor*H, factor*W) + + + + def forward(self, image1, image2, iters=12, disp_init=None, test_mode=False, vis_mode=False, intrinsic=None): + """ Estimate optical flow between pair of frames """ + + image1 = (2 * (image1 / 255.0) - 1.0).contiguous() + image2 = (2 * (image2 / 255.0) - 1.0).contiguous() + + # run the context network + with autocast(enabled=self.args.mixed_precision): + if self.args.shared_backbone: + *cnet_list, x = self.cnet(torch.cat((image1, image2), dim=0), dual_inp=True, num_layers=self.args.n_gru_layers) + fmap1, fmap2 = self.conv2(x).split(dim=0, split_size=x.shape[0]//2) + else: + # cnet_list: [[(128,248,360), (128,248,360)], [(128,124,180),(128,124,180)], [(128,62,90),(128,62,90)]] + cnet_list, depth = self.cnet(image1, num_layers=self.args.n_gru_layers) + # fmap1: (128,248,360), fmap2: (128,248,360) + fmap1, fmap2 = self.fnet([image1, image2]) + + # from IPython import embed + # embed() + + depth_lbp = self.lbp_encoder(depth) + + net_list = [torch.tanh(x[0]) for x in cnet_list] + inp_list = [torch.relu(x[1]) for x in cnet_list] + + # Rather than running the GRU's conv layers on the context features multiple times, we do it once at the beginning + inp_list = [list(conv(i).split(split_size=conv.out_channels//3, dim=1)) for i,conv in zip(inp_list, self.context_zqr_convs)] + + if self.args.corr_implementation == "reg": # Default + corr_block = CorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "alt": # More memory efficient than reg + corr_block = PytorchAlternateCorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "reg_cuda": # Faster version of reg + corr_block = CorrBlockFast1D + elif self.args.corr_implementation == "alt_cuda": # Faster version of alt + corr_block = AlternateCorrBlock + corr_fn = corr_block(fmap1, fmap2, radius=self.args.corr_radius, num_levels=self.args.corr_levels) + + hor_coords0, hor_coords1 = self.initialize_disp(net_list[0]) + + if disp_init is not None: + hor_coords1 = hor_coords1 + disp_init + + disp_predictions = [] + modulation_predictions = [] + for itr in range(iters): + hor_coords1 = hor_coords1.detach() + corr = corr_fn(hor_coords1) # index correlation volume + disp = hor_coords1 - hor_coords0 + + with autocast(enabled=self.args.mixed_precision): + disp_lbp = self.lbp_encoder(-disp) + modulation = self.modulater(disp_lbp, depth_lbp) + + if self.args.n_gru_layers == 3 and self.args.slow_fast_gru: # Update low-res GRU + net_list = self.update_block(net_list, inp_list, iter32=True, iter16=False, iter08=False, update=False) + if self.args.n_gru_layers >= 2 and self.args.slow_fast_gru:# Update low-res GRU and mid-res GRU + net_list = self.update_block(net_list, inp_list, iter32=self.args.n_gru_layers==3, iter16=True, iter08=False, update=False) + net_list, up_mask, delta_disp = self.update_block(net_list, inp_list, corr, disp, iter32=self.args.n_gru_layers==3, iter16=self.args.n_gru_layers>=2) + + modulation_weight = rescale_modulation(itr, iters, + self.args.modulation_alg, + 
self.args.modulation_ratio) + delta_disp = delta_disp * (1 + modulation * modulation_weight) + + # F(t+1) = F(t) + \Delta(t) + hor_coords1 = hor_coords1 + delta_disp + + # We do not need to upsample or output intermediate results in test_mode + if test_mode and itr < iters-1: + continue + + # upsample predictions + disp_up = self.upsample_disp(hor_coords1 - hor_coords0, up_mask) + + disp_predictions.append(disp_up) + + if vis_mode: + modulation_predictions.append(modulation) + + if test_mode: + return hor_coords1 - hor_coords0, disp_up + + if vis_mode: + return {"disp_predictions": disp_predictions, + "depth": depth, + "modulation_predictions": modulation_predictions} + + return {"disp_predictions": disp_predictions,} \ No newline at end of file diff --git a/core/raft_stereo_depthbeta_nolbp.py b/core/raft_stereo_depthbeta_nolbp.py new file mode 100644 index 0000000000000000000000000000000000000000..efad212e1f35e2f37accf3cb6733487ae2fe0913 --- /dev/null +++ b/core/raft_stereo_depthbeta_nolbp.py @@ -0,0 +1,169 @@ +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from core.update_disp import DispBasicMultiUpdateBlock +from core.extractor import BasicEncoder, ResidualBlock +from core.extractor_depthany import DepthAnyExtractor +from core.corr import CorrBlock1D, PytorchAlternateCorrBlock1D, CorrBlockFast1D, AlternateCorrBlock +from core.utils.utils import hor_coords_grid, rescale_modulation +from core.geometry import LBPEncoder +from core.fusion import BetaModulator + + +try: + autocast = torch.cuda.amp.autocast +except: + # dummy autocast for PyTorch < 1.6 + class autocast: + def __init__(self, enabled): + pass + def __enter__(self): + pass + def __exit__(self, *args): + pass + +class RAFTStereoDepthBetaNoLBP(nn.Module): + def __init__(self, args): + super(RAFTStereoDepthBetaNoLBP, self).__init__() + self.args = args + + context_dims = args.hidden_dims + + self.cnet = DepthAnyExtractor(model_dir=args.depthany_model_dir, + output_dim=[args.hidden_dims, context_dims], + norm_fn=args.context_norm, + downsample=args.n_downsample) + self.update_block = DispBasicMultiUpdateBlock(self.args, hidden_dims=args.hidden_dims) + + self.modulater = BetaModulator(args, lbp_dim=1, hidden_dim=self.args.noLBP_hidden_dim) + + self.context_zqr_convs = nn.ModuleList([nn.Conv2d(context_dims[i], args.hidden_dims[i]*3, 3, padding=3//2) for i in range(self.args.n_gru_layers)]) + + if args.shared_backbone: + self.conv2 = nn.Sequential( + ResidualBlock(128, 128, 'instance', stride=1), + nn.Conv2d(128, 256, 3, padding=1)) + else: + self.fnet = BasicEncoder(output_dim=256, norm_fn='instance', downsample=args.n_downsample) + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_disp(self, img): + """ Disparity is represented as difference between two horizontal coordinate grids disp = hor_coords1 - hor_coords0""" + N, _, H, W = img.shape + + hor_coords0 = hor_coords_grid(N, H, W).to(img.device) + hor_coords1 = hor_coords_grid(N, H, W).to(img.device) + + return hor_coords0, hor_coords1 + + def upsample_disp(self, disp, mask): + """ Upsample disp field [H/8, W/8, 1] -> [H, W, 1] using convex combination """ + N, D, H, W = disp.shape + factor = 2 ** self.args.n_downsample + mask = mask.view(N, 1, 9, factor, factor, H, W) + mask = torch.softmax(mask, dim=2) + + up_disp = F.unfold(factor * disp, [3,3], padding=1) + up_disp = up_disp.view(N, D, 9, 1, 1, H, W) + + up_disp = torch.sum(mask * up_disp, dim=2) + up_disp = 
up_disp.permute(0, 1, 4, 2, 5, 3) + return up_disp.reshape(N, D, factor*H, factor*W) + + + + def forward(self, image1, image2, iters=12, disp_init=None, test_mode=False, vis_mode=False, intrinsic=None): + """ Estimate optical flow between pair of frames """ + + image1 = (2 * (image1 / 255.0) - 1.0).contiguous() + image2 = (2 * (image2 / 255.0) - 1.0).contiguous() + + # run the context network + with autocast(enabled=self.args.mixed_precision): + if self.args.shared_backbone: + *cnet_list, x = self.cnet(torch.cat((image1, image2), dim=0), dual_inp=True, num_layers=self.args.n_gru_layers) + fmap1, fmap2 = self.conv2(x).split(dim=0, split_size=x.shape[0]//2) + else: + # cnet_list: [[(128,248,360), (128,248,360)], [(128,124,180),(128,124,180)], [(128,62,90),(128,62,90)]] + cnet_list, depth = self.cnet(image1, num_layers=self.args.n_gru_layers) + # fmap1: (128,248,360), fmap2: (128,248,360) + fmap1, fmap2 = self.fnet([image1, image2]) + + # from IPython import embed + # embed() + + net_list = [torch.tanh(x[0]) for x in cnet_list] + inp_list = [torch.relu(x[1]) for x in cnet_list] + + # Rather than running the GRU's conv layers on the context features multiple times, we do it once at the beginning + inp_list = [list(conv(i).split(split_size=conv.out_channels//3, dim=1)) for i,conv in zip(inp_list, self.context_zqr_convs)] + + if self.args.corr_implementation == "reg": # Default + corr_block = CorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "alt": # More memory efficient than reg + corr_block = PytorchAlternateCorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "reg_cuda": # Faster version of reg + corr_block = CorrBlockFast1D + elif self.args.corr_implementation == "alt_cuda": # Faster version of alt + corr_block = AlternateCorrBlock + corr_fn = corr_block(fmap1, fmap2, radius=self.args.corr_radius, num_levels=self.args.corr_levels) + + hor_coords0, hor_coords1 = self.initialize_disp(net_list[0]) + + if disp_init is not None: + hor_coords1 = hor_coords1 + disp_init + + disp_predictions = [] + modulation_predictions = [] + for itr in range(iters): + hor_coords1 = hor_coords1.detach() + corr = corr_fn(hor_coords1) # index correlation volume + disp = hor_coords1 - hor_coords0 + + with autocast(enabled=self.args.mixed_precision): + modulation = self.modulater(disp, depth) + + if self.args.n_gru_layers == 3 and self.args.slow_fast_gru: # Update low-res GRU + net_list = self.update_block(net_list, inp_list, iter32=True, iter16=False, iter08=False, update=False) + if self.args.n_gru_layers >= 2 and self.args.slow_fast_gru:# Update low-res GRU and mid-res GRU + net_list = self.update_block(net_list, inp_list, iter32=self.args.n_gru_layers==3, iter16=True, iter08=False, update=False) + net_list, up_mask, delta_disp = self.update_block(net_list, inp_list, corr, disp, iter32=self.args.n_gru_layers==3, iter16=self.args.n_gru_layers>=2) + + modulation_weight = rescale_modulation(itr, iters, + self.args.modulation_alg, + self.args.modulation_ratio) + delta_disp = delta_disp * (1 + modulation * modulation_weight) + + # F(t+1) = F(t) + \Delta(t) + hor_coords1 = hor_coords1 + delta_disp + + # We do not need to upsample or output intermediate results in test_mode + if test_mode and itr < iters-1: + continue + + # upsample predictions + disp_up = self.upsample_disp(hor_coords1 - hor_coords0, up_mask) + + disp_predictions.append(disp_up) + + if vis_mode: + modulation_predictions.append(modulation) + + if test_mode: + 
return hor_coords1 - hor_coords0, disp_up + + if vis_mode: + return {"disp_predictions": disp_predictions, + "depth": depth, + "modulation_predictions": modulation_predictions} + + return {"disp_predictions": disp_predictions,} \ No newline at end of file diff --git a/core/raft_stereo_depthbeta_refine.py b/core/raft_stereo_depthbeta_refine.py new file mode 100644 index 0000000000000000000000000000000000000000..855ee7956b967f9d1ef376034cad6a26559fb52d --- /dev/null +++ b/core/raft_stereo_depthbeta_refine.py @@ -0,0 +1,236 @@ +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from core.update_disp import DispBasicMultiUpdateBlock +from core.extractor import BasicEncoder, ResidualBlock +from core.extractor_depthany import DepthAnyExtractor +from core.corr import CorrBlock1D, PytorchAlternateCorrBlock1D, CorrBlockFast1D, AlternateCorrBlock +from core.utils.utils import hor_coords_grid, rescale_modulation +from core.geometry import LBPEncoder +from core.fusion import BetaModulator, RefinementMonStereo +from core.utils.utils import sv_intermediate_results + + +try: + autocast = torch.cuda.amp.autocast +except: + # dummy autocast for PyTorch < 1.6 + class autocast: + def __init__(self, enabled): + pass + def __enter__(self): + pass + def __exit__(self, *args): + pass + +class RAFTStereoDepthBetaRefine(nn.Module): + def __init__(self, args): + super(RAFTStereoDepthBetaRefine, self).__init__() + self.args = args + + context_dims = args.hidden_dims + + self.cnet = DepthAnyExtractor(model_dir=args.depthany_model_dir, + output_dim=[args.hidden_dims, context_dims], + norm_fn=args.context_norm, + downsample=args.n_downsample, + args=args) + self.update_block = DispBasicMultiUpdateBlock(self.args, hidden_dims=args.hidden_dims) + + self.lbp_encoder = LBPEncoder(args=args) + self.modulater = BetaModulator(args, lbp_dim=self.lbp_encoder.num_neighbors) + + self.context_zqr_convs = nn.ModuleList([nn.Conv2d(context_dims[i], args.hidden_dims[i]*3, 3, padding=3//2) for i in range(self.args.n_gru_layers)]) + + self.refinement = RefinementMonStereo(args, hidden_dim=args.hidden_dims[-1]) + + if args.shared_backbone: + self.conv2 = nn.Sequential( + ResidualBlock(128, 128, 'instance', stride=1), + nn.Conv2d(128, 256, 3, padding=1)) + else: + self.fnet = BasicEncoder(output_dim=256, norm_fn='instance', downsample=args.n_downsample) + + if not hasattr(self.args, "fintune_info") or "tune_refine" in self.args.fintune_info.lower().split(" ") : + # Freeze the parameters of every module except the refinement head + for module in [self.cnet, self.update_block, self.lbp_encoder, + self.modulater, self.context_zqr_convs, self.fnet]: + for param in module.parameters(): + param.requires_grad = False + + elif "tune_raft" in self.args.fintune_info.lower().split(" ") : + # Freeze all parameters of the refinement module + for module in [self.refinement]: + for param in module.parameters(): + param.requires_grad = False + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_disp(self, img): + """ Disparity is represented as difference between two horizontal coordinate grids disp = hor_coords1 - hor_coords0""" + N, _, H, W = img.shape + + hor_coords0 = hor_coords_grid(N, H, W).to(img.device) + hor_coords1 = hor_coords_grid(N, H, W).to(img.device) + + return hor_coords0, hor_coords1 + + def upsample_disp(self, disp, mask): + """ Upsample disp field [H/8, W/8, 1] -> [H, W, 1] using convex combination """ + N, D, H, W = disp.shape + factor = 2 ** self.args.n_downsample + mask = mask.view(N, 1, 9,
factor, factor, H, W) + mask = torch.softmax(mask, dim=2) + + up_disp = F.unfold(factor * disp, [3,3], padding=1) + up_disp = up_disp.view(N, D, 9, 1, 1, H, W) + + up_disp = torch.sum(mask * up_disp, dim=2) + up_disp = up_disp.permute(0, 1, 4, 2, 5, 3) + return up_disp.reshape(N, D, factor*H, factor*W) + + + + def forward(self, image1, image2, iters=12, disp_init=None, test_mode=False, vis_mode=False, intrinsic=None): + """ Estimate optical flow between pair of frames """ + + image1 = (2 * (image1 / 255.0) - 1.0).contiguous() + image2 = (2 * (image2 / 255.0) - 1.0).contiguous() + + # run the context network + with autocast(enabled=self.args.mixed_precision): + if self.args.shared_backbone: + *cnet_list, x = self.cnet(torch.cat((image1, image2), dim=0), dual_inp=True, num_layers=self.args.n_gru_layers) + fmap1, fmap2 = self.conv2(x).split(dim=0, split_size=x.shape[0]//2) + else: + # cnet_list: [[(128,248,360), (128,248,360)], [(128,124,180),(128,124,180)], [(128,62,90),(128,62,90)]] + cnet_list, depth = self.cnet(image1, num_layers=self.args.n_gru_layers) + # fmap1: (128,248,360), fmap2: (128,248,360) + fmap1, fmap2 = self.fnet([image1, image2]) + + # from IPython import embed + # embed() + + depth_lbp = self.lbp_encoder(depth) + + net_list = [torch.tanh(x[0]) for x in cnet_list] + inp_list = [torch.relu(x[1]) for x in cnet_list] + + # Rather than running the GRU's conv layers on the context features multiple times, we do it once at the beginning + inp_list = [list(conv(i).split(split_size=conv.out_channels//3, dim=1)) for i,conv in zip(inp_list, self.context_zqr_convs)] + + if hasattr(self.args, "vis_inter") and self.args.vis_inter: + sv_intermediate_results(depth, "monocular_depth", self.args.sv_root) + sv_intermediate_results(depth_lbp, "depth_lbp", self.args.sv_root) + for i in range(len(inp_list)): + for j in range(3): + sv_intermediate_results(inp_list[i][j], f"inp_list-{i}-{j}", self.args.sv_root) + for i in range(len(net_list)): + sv_intermediate_results(net_list[i], f"net_list-{i}", self.args.sv_root) + + if self.args.corr_implementation == "reg": # Default + corr_block = CorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "alt": # More memory efficient than reg + corr_block = PytorchAlternateCorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "reg_cuda": # Faster version of reg + corr_block = CorrBlockFast1D + elif self.args.corr_implementation == "alt_cuda": # Faster version of alt + corr_block = AlternateCorrBlock + corr_fn = corr_block(fmap1, fmap2, radius=self.args.corr_radius, num_levels=self.args.corr_levels) + + hor_coords0, hor_coords1 = self.initialize_disp(net_list[0]) + + if disp_init is not None: + hor_coords1 = hor_coords1 + disp_init + + disp_predictions = [] + modulation_predictions = [] + for itr in range(iters): + hor_coords1 = hor_coords1.detach() + corr = corr_fn(hor_coords1) # index correlation volume + disp = hor_coords1 - hor_coords0 + + with autocast(enabled=self.args.mixed_precision): + disp_lbp = self.lbp_encoder(disp) + modulation, distribution = self.modulater(disp_lbp, depth_lbp, out_distribution=True) + if vis_mode: + modulation_predictions.append(modulation) + + if self.args.n_gru_layers == 3 and self.args.slow_fast_gru: # Update low-res GRU + net_list = self.update_block(net_list, inp_list, iter32=True, iter16=False, iter08=False, update=False) + if self.args.n_gru_layers >= 2 and self.args.slow_fast_gru:# Update low-res GRU and mid-res GRU + net_list = 
self.update_block(net_list, inp_list, iter32=self.args.n_gru_layers==3, iter16=True, iter08=False, update=False) + net_list, up_mask, delta_disp = self.update_block(net_list, inp_list, corr, disp, iter32=self.args.n_gru_layers==3, iter16=self.args.n_gru_layers>=2) + + if hasattr(self.args, "vis_inter") and self.args.vis_inter: + sv_intermediate_results(disp_lbp, f"disp_lbp-itr{itr+1}", self.args.sv_root) + sv_intermediate_results(delta_disp, f"delta_disp-itr{itr+1}", self.args.sv_root) + for i in range(len(net_list)): + sv_intermediate_results(net_list[i], f"net_list-{i}-itr{itr+1}", self.args.sv_root) + + modulation_weight = rescale_modulation(itr, iters, + self.args.modulation_alg, + self.args.modulation_ratio) + delta_disp = delta_disp * (1 + modulation * modulation_weight) + + if hasattr(self.args, "vis_inter") and self.args.vis_inter: + sv_intermediate_results(modulation, f"modulation-itr{itr+1}", self.args.sv_root) + sv_intermediate_results(delta_disp, f"reweighted_delta_disp-itr{itr+1}", self.args.sv_root) + + # F(t+1) = F(t) + \Delta(t) + hor_coords1 = hor_coords1 + delta_disp + + # We do not need to upsample or output intermediate results in test_mode + if test_mode and itr < iters-1: + continue + + # upsample predictions + disp_up = self.upsample_disp(hor_coords1 - hor_coords0, up_mask) + disp_predictions.append(disp_up) + + if hasattr(self.args, "vis_inter") and self.args.vis_inter: + sv_intermediate_results(disp_up, f"disp_up-itr{itr+1}", self.args.sv_root) + + conf, depth_registered_up, depth_registered = None, None, None + if not hasattr(self.args, "fintune_info") or "tune_refine" in self.args.fintune_info.lower().split(" ") : + # refinement + corr = corr_fn(hor_coords1) + disp = -hor_coords1 + hor_coords0 + disp_refine, up_mask, depth_registered, conf = self.refinement(disp, depth, net_list[0], corr, distribution) + + disp_up = self.upsample_disp(-disp_refine, up_mask) + depth_registered_up = self.upsample_disp(-depth_registered, up_mask) + disp_predictions.append(depth_registered_up) + if not hasattr(self.args, 'train_refine_mono') or not self.args.train_refine_mono: + disp_predictions.append(disp_up) + + if hasattr(self.args, "vis_inter") and self.args.vis_inter: + sv_intermediate_results(disp_up, f"disp_refine_up", self.args.sv_root) + sv_intermediate_results(depth_registered_up, f"depth_registered_up", self.args.sv_root) + + if test_mode: + if hasattr(self.args, 'train_refine_mono') and self.args.train_refine_mono: + return hor_coords1 - hor_coords0, depth_registered_up + return hor_coords1 - hor_coords0, disp_up + + # if test_mode: + # return hor_coords1 - hor_coords0, depth_registered_up + + if vis_mode: + return {"disp_predictions": disp_predictions, + "depth": depth, + "depth_registered": depth_registered, + "depth_registered_up": -depth_registered_up, + "conf_fusion": conf, + "modulation_predictions": modulation_predictions} + + return {"disp_predictions": disp_predictions, + "conf": conf} \ No newline at end of file diff --git a/core/raft_stereo_depthfusion.py b/core/raft_stereo_depthfusion.py new file mode 100644 index 0000000000000000000000000000000000000000..6298846fbd60151493ca272740264dbb860f0536 --- /dev/null +++ b/core/raft_stereo_depthfusion.py @@ -0,0 +1,157 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from core.update_disp import DispBasicMultiUpdateBlock +from core.extractor import BasicEncoder, ResidualBlock +from core.extractor_depthany import DepthAnyExtractor +from core.corr import CorrBlock1D, 
PytorchAlternateCorrBlock1D, CorrBlockFast1D, AlternateCorrBlock +from core.utils.utils import hor_coords_grid +from core.fusion import FusionDepth, UpdateHistory + + +try: + autocast = torch.cuda.amp.autocast +except: + # dummy autocast for PyTorch < 1.6 + class autocast: + def __init__(self, enabled): + pass + def __enter__(self): + pass + def __exit__(self, *args): + pass + +class RAFTStereoDepthFusion(nn.Module): + def __init__(self, args): + super(RAFTStereoDepthFusion, self).__init__() + self.args = args + + context_dims = args.hidden_dims + + self.cnet = DepthAnyExtractor(model_dir=args.depthany_model_dir, + output_dim=[args.hidden_dims, context_dims], + norm_fn=args.context_norm, + downsample=args.n_downsample) + self.update_block = DispBasicMultiUpdateBlock(self.args, hidden_dims=args.hidden_dims) + + self.fusion = FusionDepth(self.args) + self.update_hist = UpdateHistory(self.args, 128, 1) + + self.context_zqr_convs = nn.ModuleList([nn.Conv2d(context_dims[i], args.hidden_dims[i]*3, 3, padding=3//2) for i in range(self.args.n_gru_layers)]) + + if args.shared_backbone: + self.conv2 = nn.Sequential( + ResidualBlock(128, 128, 'instance', stride=1), + nn.Conv2d(128, 256, 3, padding=1)) + else: + self.fnet = BasicEncoder(output_dim=256, norm_fn='instance', downsample=args.n_downsample) + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_disp(self, img): + """ Disparity is represented as difference between two horizontal coordinate grids disp = hor_coords1 - hor_coords0""" + N, _, H, W = img.shape + + hor_coords0 = hor_coords_grid(N, H, W).to(img.device) + hor_coords1 = hor_coords_grid(N, H, W).to(img.device) + + return hor_coords0, hor_coords1 + + def upsample_disp(self, disp, mask): + """ Upsample disp field [H/8, W/8, 1] -> [H, W, 1] using convex combination """ + N, D, H, W = disp.shape + factor = 2 ** self.args.n_downsample + mask = mask.view(N, 1, 9, factor, factor, H, W) + mask = torch.softmax(mask, dim=2) + + up_disp = F.unfold(factor * disp, [3,3], padding=1) + up_disp = up_disp.view(N, D, 9, 1, 1, H, W) + + up_disp = torch.sum(mask * up_disp, dim=2) + up_disp = up_disp.permute(0, 1, 4, 2, 5, 3) + return up_disp.reshape(N, D, factor*H, factor*W) + + + def forward(self, image1, image2, iters=12, disp_init=None, test_mode=False, vis_mode=False, intrinsic=None): + """ Estimate optical flow between pair of frames """ + + image1 = (2 * (image1 / 255.0) - 1.0).contiguous() + image2 = (2 * (image2 / 255.0) - 1.0).contiguous() + + # run the context network + with autocast(enabled=self.args.mixed_precision): + if self.args.shared_backbone: + *cnet_list, x = self.cnet(torch.cat((image1, image2), dim=0), dual_inp=True, num_layers=self.args.n_gru_layers) + fmap1, fmap2 = self.conv2(x).split(dim=0, split_size=x.shape[0]//2) + else: + # cnet_list: [[(128,248,360), (128,248,360)], [(128,124,180),(128,124,180)], [(128,62,90),(128,62,90)]] + cnet_list, depth = self.cnet(image1, num_layers=self.args.n_gru_layers) + # fmap1: (128,248,360), fmap2: (128,248,360) + fmap1, fmap2 = self.fnet([image1, image2]) + + # from IPython import embed + # embed() + + net_list = [torch.tanh(x[0]) for x in cnet_list] + inp_list = [torch.relu(x[1]) for x in cnet_list] + + # Rather than running the GRU's conv layers on the context features multiple times, we do it once at the beginning + inp_list = [list(conv(i).split(split_size=conv.out_channels//3, dim=1)) for i,conv in zip(inp_list, self.context_zqr_convs)] + + if self.args.corr_implementation == 
"reg": # Default + corr_block = CorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "alt": # More memory efficient than reg + corr_block = PytorchAlternateCorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "reg_cuda": # Faster version of reg + corr_block = CorrBlockFast1D + elif self.args.corr_implementation == "alt_cuda": # Faster version of alt + corr_block = AlternateCorrBlock + corr_fn = corr_block(fmap1, fmap2, radius=self.args.corr_radius, num_levels=self.args.corr_levels) + + hor_coords0, hor_coords1 = self.initialize_disp(net_list[0]) + + if disp_init is not None: + hor_coords1 = hor_coords1 + disp_init + + disp_predictions = [] + for itr in range(iters): + hor_coords1 = hor_coords1.detach() + corr = corr_fn(hor_coords1) # index correlation volume + disp = hor_coords1 - hor_coords0 + with autocast(enabled=self.args.mixed_precision): + if self.args.n_gru_layers == 3 and self.args.slow_fast_gru: # Update low-res GRU + net_list = self.update_block(net_list, inp_list, iter32=True, iter16=False, iter08=False, update=False) + if self.args.n_gru_layers >= 2 and self.args.slow_fast_gru:# Update low-res GRU and mid-res GRU + net_list = self.update_block(net_list, inp_list, iter32=self.args.n_gru_layers==3, iter16=True, iter08=False, update=False) + net_list, up_mask, delta_disp = self.update_block(net_list, inp_list, corr, disp, iter32=self.args.n_gru_layers==3, iter16=self.args.n_gru_layers>=2) + + disp_new = disp + delta_disp + delta_disp_new = self.fusion(disp_new, depth, delta_disp) + disp_refine = disp + delta_disp + delta_disp_new + net_list[0] = self.update_hist(net_list[0], disp_refine.detach()) + + # F(t+1) = F(t) + \Delta(t) + # hor_coords1 = hor_coords1 + delta_disp + hor_coords1 = hor_coords1 + delta_disp_new + + # We do not need to upsample or output intermediate results in test_mode + if test_mode and itr < iters-1: + continue + + # upsample predictions + disp_up = self.upsample_disp(hor_coords1 - hor_coords0, up_mask) + + disp_predictions.append(disp_up) + + if test_mode: + return hor_coords1 - hor_coords0, disp_up + + if vis_mode: + return {"disp_predictions": disp_predictions, } + + return {"disp_predictions": disp_predictions,} \ No newline at end of file diff --git a/core/raft_stereo_depthmatch.py b/core/raft_stereo_depthmatch.py new file mode 100644 index 0000000000000000000000000000000000000000..3dfce2341e1b204702cbbe93a7120a2fb00ec077 --- /dev/null +++ b/core/raft_stereo_depthmatch.py @@ -0,0 +1,151 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from core.update_disp import DispBasicMultiUpdateBlock +from core.extractor import BasicEncoder, ResidualBlock +from core.extractor_depthany import DepthAnyExtractor, DepthMatchExtractor +from core.corr import CorrBlock1D, PytorchAlternateCorrBlock1D, CorrBlockFast1D, AlternateCorrBlock +from core.utils.utils import hor_coords_grid + + +try: + autocast = torch.cuda.amp.autocast +except: + # dummy autocast for PyTorch < 1.6 + class autocast: + def __init__(self, enabled): + pass + def __enter__(self): + pass + def __exit__(self, *args): + pass + +class RAFTStereoDepthMatch(nn.Module): + def __init__(self, args): + super(RAFTStereoDepthMatch, self).__init__() + self.args = args + + context_dims = args.hidden_dims + + self.cnet = DepthAnyExtractor(model_dir=args.depthany_model_dir, + output_dim=[args.hidden_dims, context_dims], + norm_fn=args.context_norm, + downsample=args.n_downsample) + self.update_block = 
DispBasicMultiUpdateBlock(self.args, hidden_dims=args.hidden_dims) + + self.context_zqr_convs = nn.ModuleList([nn.Conv2d(context_dims[i], args.hidden_dims[i]*3, 3, padding=3//2) for i in range(self.args.n_gru_layers)]) + + if args.shared_backbone: + self.conv2 = nn.Sequential( + ResidualBlock(128, 128, 'instance', stride=1), + nn.Conv2d(128, 256, 3, padding=1)) + else: + # self.fnet = BasicEncoder(output_dim=256, norm_fn='instance', downsample=args.n_downsample) + self.fnet = DepthMatchExtractor(model_dir=args.depthany_model_dir, + output_dim=256, + norm_fn='instance', + downsample=args.n_downsample) + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_disp(self, img): + """ Disparity is represented as difference between two horizontal coordinate grids disp = hor_coords1 - hor_coords0""" + N, _, H, W = img.shape + + hor_coords0 = hor_coords_grid(N, H, W).to(img.device) + hor_coords1 = hor_coords_grid(N, H, W).to(img.device) + + return hor_coords0, hor_coords1 + + def upsample_disp(self, disp, mask): + """ Upsample disp field [H/8, W/8, 1] -> [H, W, 1] using convex combination """ + N, D, H, W = disp.shape + factor = 2 ** self.args.n_downsample + mask = mask.view(N, 1, 9, factor, factor, H, W) + mask = torch.softmax(mask, dim=2) + + up_disp = F.unfold(factor * disp, [3,3], padding=1) + up_disp = up_disp.view(N, D, 9, 1, 1, H, W) + + up_disp = torch.sum(mask * up_disp, dim=2) + up_disp = up_disp.permute(0, 1, 4, 2, 5, 3) + return up_disp.reshape(N, D, factor*H, factor*W) + + + def forward(self, image1, image2, iters=12, disp_init=None, test_mode=False, vis_mode=False, intrinsic=None): + """ Estimate optical flow between pair of frames """ + + image1 = (2 * (image1 / 255.0) - 1.0).contiguous() + image2 = (2 * (image2 / 255.0) - 1.0).contiguous() + + # run the context network + with autocast(enabled=self.args.mixed_precision): + if self.args.shared_backbone: + *cnet_list, x = self.cnet(torch.cat((image1, image2), dim=0), dual_inp=True, num_layers=self.args.n_gru_layers) + fmap1, fmap2 = self.conv2(x).split(dim=0, split_size=x.shape[0]//2) + else: + # cnet_list: [[(128,248,360), (128,248,360)], [(128,124,180),(128,124,180)], [(128,62,90),(128,62,90)]] + cnet_list, depth = self.cnet(image1, num_layers=self.args.n_gru_layers) + # fmap1: (128,248,360), fmap2: (128,248,360) + fmap1, fmap2 = self.fnet([image1, image2]) + + # from IPython import embed + # embed() + + net_list = [torch.tanh(x[0]) for x in cnet_list] + inp_list = [torch.relu(x[1]) for x in cnet_list] + + # Rather than running the GRU's conv layers on the context features multiple times, we do it once at the beginning + inp_list = [list(conv(i).split(split_size=conv.out_channels//3, dim=1)) for i,conv in zip(inp_list, self.context_zqr_convs)] + + if self.args.corr_implementation == "reg": # Default + corr_block = CorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "alt": # More memory efficient than reg + corr_block = PytorchAlternateCorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "reg_cuda": # Faster version of reg + corr_block = CorrBlockFast1D + elif self.args.corr_implementation == "alt_cuda": # Faster version of alt + corr_block = AlternateCorrBlock + corr_fn = corr_block(fmap1, fmap2, radius=self.args.corr_radius, num_levels=self.args.corr_levels) + + hor_coords0, hor_coords1 = self.initialize_disp(net_list[0]) + + if disp_init is not None: + hor_coords1 = hor_coords1 + 
disp_init + + disp_predictions = [] + for itr in range(iters): + hor_coords1 = hor_coords1.detach() + corr = corr_fn(hor_coords1) # index correlation volume + disp = hor_coords1 - hor_coords0 + with autocast(enabled=self.args.mixed_precision): + if self.args.n_gru_layers == 3 and self.args.slow_fast_gru: # Update low-res GRU + net_list = self.update_block(net_list, inp_list, iter32=True, iter16=False, iter08=False, update=False) + if self.args.n_gru_layers >= 2 and self.args.slow_fast_gru:# Update low-res GRU and mid-res GRU + net_list = self.update_block(net_list, inp_list, iter32=self.args.n_gru_layers==3, iter16=True, iter08=False, update=False) + net_list, up_mask, delta_disp = self.update_block(net_list, inp_list, corr, disp, iter32=self.args.n_gru_layers==3, iter16=self.args.n_gru_layers>=2) + + # F(t+1) = F(t) + \Delta(t) + hor_coords1 = hor_coords1 + delta_disp + + # We do not need to upsample or output intermediate results in test_mode + if test_mode and itr < iters-1: + continue + + # upsample predictions + disp_up = self.upsample_disp(hor_coords1 - hor_coords0, up_mask) + + disp_predictions.append(disp_up) + + if test_mode: + return hor_coords1 - hor_coords0, disp_up + + if vis_mode: + return {"disp_predictions": disp_predictions, } + + return {"disp_predictions": disp_predictions,} \ No newline at end of file diff --git a/core/raft_stereo_disp.py b/core/raft_stereo_disp.py new file mode 100644 index 0000000000000000000000000000000000000000..a4ef23327428af86bc014434a4002841846d943d --- /dev/null +++ b/core/raft_stereo_disp.py @@ -0,0 +1,143 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from core.update_disp import DispBasicMultiUpdateBlock +from core.extractor import BasicEncoder, MultiBasicEncoder, ResidualBlock +from core.corr import CorrBlock1D, PytorchAlternateCorrBlock1D, CorrBlockFast1D, AlternateCorrBlock +from core.utils.utils import hor_coords_grid + + +try: + autocast = torch.cuda.amp.autocast +except: + # dummy autocast for PyTorch < 1.6 + class autocast: + def __init__(self, enabled): + pass + def __enter__(self): + pass + def __exit__(self, *args): + pass + +class RAFTStereoDisp(nn.Module): + def __init__(self, args): + super(RAFTStereoDisp, self).__init__() + self.args = args + + context_dims = args.hidden_dims + + self.cnet = MultiBasicEncoder(output_dim=[args.hidden_dims, context_dims], norm_fn=args.context_norm, downsample=args.n_downsample) + self.update_block = DispBasicMultiUpdateBlock(self.args, hidden_dims=args.hidden_dims) + + self.context_zqr_convs = nn.ModuleList([nn.Conv2d(context_dims[i], args.hidden_dims[i]*3, 3, padding=3//2) for i in range(self.args.n_gru_layers)]) + + if args.shared_backbone: + self.conv2 = nn.Sequential( + ResidualBlock(128, 128, 'instance', stride=1), + nn.Conv2d(128, 256, 3, padding=1)) + else: + self.fnet = BasicEncoder(output_dim=256, norm_fn='instance', downsample=args.n_downsample) + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_disp(self, img): + """ Disparity is represented as difference between two horizontal coordinate grids disp = hor_coords1 - hor_coords0""" + N, _, H, W = img.shape + + hor_coords0 = hor_coords_grid(N, H, W).to(img.device) + hor_coords1 = hor_coords_grid(N, H, W).to(img.device) + + return hor_coords0, hor_coords1 + + def upsample_disp(self, disp, mask): + """ Upsample disp field [H/8, W/8, 1] -> [H, W, 1] using convex combination """ + N, D, H, W = disp.shape + factor = 2 ** self.args.n_downsample + mask = 
mask.view(N, 1, 9, factor, factor, H, W) + mask = torch.softmax(mask, dim=2) + + up_disp = F.unfold(factor * disp, [3,3], padding=1) + up_disp = up_disp.view(N, D, 9, 1, 1, H, W) + + up_disp = torch.sum(mask * up_disp, dim=2) + up_disp = up_disp.permute(0, 1, 4, 2, 5, 3) + return up_disp.reshape(N, D, factor*H, factor*W) + + + def forward(self, image1, image2, iters=12, disp_init=None, test_mode=False, vis_mode=False, intrinsic=None): + """ Estimate optical flow between pair of frames """ + + image1 = (2 * (image1 / 255.0) - 1.0).contiguous() + image2 = (2 * (image2 / 255.0) - 1.0).contiguous() + + # run the context network + with autocast(enabled=self.args.mixed_precision): + if self.args.shared_backbone: + *cnet_list, x = self.cnet(torch.cat((image1, image2), dim=0), dual_inp=True, num_layers=self.args.n_gru_layers) + fmap1, fmap2 = self.conv2(x).split(dim=0, split_size=x.shape[0]//2) + else: + # cnet_list: [[(128,248,360), (128,248,360)], [(128,124,180),(128,124,180)], [(128,62,90),(128,62,90)]] + cnet_list = self.cnet(image1, num_layers=self.args.n_gru_layers) + # fmap1: (128,248,360), fmap2: (128,248,360) + fmap1, fmap2 = self.fnet([image1, image2]) + + # from IPython import embed + # embed() + + net_list = [torch.tanh(x[0]) for x in cnet_list] + inp_list = [torch.relu(x[1]) for x in cnet_list] + + # Rather than running the GRU's conv layers on the context features multiple times, we do it once at the beginning + inp_list = [list(conv(i).split(split_size=conv.out_channels//3, dim=1)) for i,conv in zip(inp_list, self.context_zqr_convs)] + + if self.args.corr_implementation == "reg": # Default + corr_block = CorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "alt": # More memory efficient than reg + corr_block = PytorchAlternateCorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "reg_cuda": # Faster version of reg + corr_block = CorrBlockFast1D + elif self.args.corr_implementation == "alt_cuda": # Faster version of alt + corr_block = AlternateCorrBlock + corr_fn = corr_block(fmap1, fmap2, radius=self.args.corr_radius, num_levels=self.args.corr_levels) + + hor_coords0, hor_coords1 = self.initialize_disp(net_list[0]) + + if disp_init is not None: + hor_coords1 = hor_coords1 + disp_init + + disp_predictions = [] + for itr in range(iters): + hor_coords1 = hor_coords1.detach() + corr = corr_fn(hor_coords1) # index correlation volume + disp = hor_coords1 - hor_coords0 + with autocast(enabled=self.args.mixed_precision): + if self.args.n_gru_layers == 3 and self.args.slow_fast_gru: # Update low-res GRU + net_list = self.update_block(net_list, inp_list, iter32=True, iter16=False, iter08=False, update=False) + if self.args.n_gru_layers >= 2 and self.args.slow_fast_gru:# Update low-res GRU and mid-res GRU + net_list = self.update_block(net_list, inp_list, iter32=self.args.n_gru_layers==3, iter16=True, iter08=False, update=False) + net_list, up_mask, delta_disp = self.update_block(net_list, inp_list, corr, disp, iter32=self.args.n_gru_layers==3, iter16=self.args.n_gru_layers>=2) + + # F(t+1) = F(t) + \Delta(t) + hor_coords1 = hor_coords1 + delta_disp + + # We do not need to upsample or output intermediate results in test_mode + if test_mode and itr < iters-1: + continue + + # upsample predictions + disp_up = self.upsample_disp(hor_coords1 - hor_coords0, up_mask) + + disp_predictions.append(disp_up) + + if test_mode: + return hor_coords1 - hor_coords0, disp_up + + if vis_mode: + return {"disp_predictions": 
disp_predictions, } + + return {"disp_predictions": disp_predictions,} \ No newline at end of file diff --git a/core/raft_stereo_mast3r.py b/core/raft_stereo_mast3r.py new file mode 100644 index 0000000000000000000000000000000000000000..bfc28761691b182e2c6fda1bf5cb1fbb962f6890 --- /dev/null +++ b/core/raft_stereo_mast3r.py @@ -0,0 +1,154 @@ +import os +import sys +sys.path.insert(0,'mast3r') + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from core.update_disp import DispBasicMultiUpdateBlock +from core.extractor_mast3r import Mast3rExtractor +from core.extractor import MultiBasicEncoder, ResidualBlock +from core.corr import CorrBlock1D, PytorchAlternateCorrBlock1D, CorrBlockFast1D, AlternateCorrBlock +from core.corr import AbsCorrBlock1D, PytorchAlternateAbsCorrBlock1D +from core.utils.utils import hor_coords_grid + +from mast3r.model import AsymmetricMASt3R + +try: + autocast = torch.cuda.amp.autocast +except: + # dummy autocast for PyTorch < 1.6 + class autocast: + def __init__(self, enabled): + pass + def __enter__(self): + pass + def __exit__(self, *args): + pass + +class RAFTStereoMast3r(nn.Module): + def __init__(self, args): + super(RAFTStereoMast3r, self).__init__() + self.args = args + + context_dims = args.hidden_dims + + self.cnet = MultiBasicEncoder(output_dim=[args.hidden_dims, context_dims], norm_fn=args.context_norm, downsample=args.n_downsample) + self.update_block = DispBasicMultiUpdateBlock(self.args, hidden_dims=args.hidden_dims) + + self.context_zqr_convs = nn.ModuleList([nn.Conv2d(context_dims[i], args.hidden_dims[i]*3, 3, padding=3//2) for i in range(self.args.n_gru_layers)]) + + if args.shared_backbone: + self.conv2 = nn.Sequential( + ResidualBlock(128, 128, 'instance', stride=1), + nn.Conv2d(128, 256, 3, padding=1)) + else: + self.fnet = Mast3rExtractor(model_name=args.mast3r_model_path, output_dim=256, norm_fn='instance', downsample=args.n_downsample) + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_disp(self, img): + """ Disparity is represented as difference between two horizontal coordinate grids disp = hor_coords1 - hor_coords0""" + N, _, H, W = img.shape + + hor_coords0 = hor_coords_grid(N, H, W).to(img.device) + hor_coords1 = hor_coords_grid(N, H, W).to(img.device) + + return hor_coords0, hor_coords1 + + def upsample_disp(self, disp, mask): + """ Upsample disp field [H/8, W/8, 1] -> [H, W, 1] using convex combination """ + N, D, H, W = disp.shape + factor = 2 ** self.args.n_downsample + mask = mask.view(N, 1, 9, factor, factor, H, W) + mask = torch.softmax(mask, dim=2) + + up_disp = F.unfold(factor * disp, [3,3], padding=1) + up_disp = up_disp.view(N, D, 9, 1, 1, H, W) + + up_disp = torch.sum(mask * up_disp, dim=2) + up_disp = up_disp.permute(0, 1, 4, 2, 5, 3) + return up_disp.reshape(N, D, factor*H, factor*W) + + + def forward(self, image1, image2, iters=12, disp_init=None, test_mode=False, vis_mode=False, intrinsic=None): + """ Estimate optical flow between pair of frames """ + + image1 = (2 * (image1 / 255.0) - 1.0).contiguous() + image2 = (2 * (image2 / 255.0) - 1.0).contiguous() + + # run the context network + with autocast(enabled=self.args.mixed_precision): + if self.args.shared_backbone: + *cnet_list, x = self.cnet(torch.cat((image1, image2), dim=0), dual_inp=True, num_layers=self.args.n_gru_layers) + fmap1, fmap2 = self.conv2(x).split(dim=0, split_size=x.shape[0]//2) + else: + # cnet_list: [[(128,248,360), (128,248,360)], [(128,124,180),(128,124,180)], 
[(128,62,90),(128,62,90)]] + cnet_list = self.cnet(image1, num_layers=self.args.n_gru_layers) + # fmap1: (128,248,360), fmap2: (128,248,360) + fmap1, fmap2 = self.fnet(image1, image2) + net_list = [torch.tanh(x[0]) for x in cnet_list] + inp_list = [torch.relu(x[1]) for x in cnet_list] + + # Rather than running the GRU's conv layers on the context features multiple times, we do it once at the beginning + inp_list = [list(conv(i).split(split_size=conv.out_channels//3, dim=1)) for i,conv in zip(inp_list, self.context_zqr_convs)] + + if self.args.corr_implementation == "reg": # Default a*b + corr_block = CorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "alt": # More memory efficient than reg + corr_block = PytorchAlternateCorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "abs_reg": # Default abs(a-B) + corr_block = AbsCorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "abs_alt": # More memory efficient abs_reg reg + corr_block = PytorchAlternateAbsCorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "reg_cuda": # Faster version of reg + corr_block = CorrBlockFast1D + elif self.args.corr_implementation == "alt_cuda": # Faster version of alt + corr_block = AlternateCorrBlock + corr_fn = corr_block(fmap1, fmap2, radius=self.args.corr_radius, num_levels=self.args.corr_levels) + + hor_coords0, hor_coords1 = self.initialize_disp(net_list[0]) + + if disp_init is not None: + hor_coords1 = hor_coords1 + disp_init + + disp_predictions = [] + for itr in range(iters): + hor_coords1 = hor_coords1.detach() + corr = corr_fn(hor_coords1) # index correlation volume + # torch.cuda.empty_cache() + disp = hor_coords1 - hor_coords0 + with autocast(enabled=self.args.mixed_precision): + if self.args.n_gru_layers == 3 and self.args.slow_fast_gru: # Update low-res GRU + net_list = self.update_block(net_list, inp_list, iter32=True, iter16=False, iter08=False, update=False) + if self.args.n_gru_layers >= 2 and self.args.slow_fast_gru:# Update low-res GRU and mid-res GRU + net_list = self.update_block(net_list, inp_list, iter32=self.args.n_gru_layers==3, iter16=True, iter08=False, update=False) + net_list, up_mask, delta_disp = self.update_block(net_list, inp_list, corr, disp, iter32=self.args.n_gru_layers==3, iter16=self.args.n_gru_layers>=2) + + # F(t+1) = F(t) + \Delta(t) + hor_coords1 = hor_coords1 + delta_disp + + # We do not need to upsample or output intermediate results in test_mode + if test_mode and itr < iters-1: + continue + + # upsample predictions + disp_up = self.upsample_disp(hor_coords1 - hor_coords0, up_mask) + + disp_predictions.append(disp_up) + + if test_mode: + return hor_coords1 - hor_coords0, disp_up + + if vis_mode: + return {"disp_predictions": disp_predictions, } + + return {"disp_predictions": disp_predictions,} \ No newline at end of file diff --git a/core/raft_stereo_metric3d.py b/core/raft_stereo_metric3d.py new file mode 100644 index 0000000000000000000000000000000000000000..c17c65d88f001179f2e3902f7b0c3d1de4518993 --- /dev/null +++ b/core/raft_stereo_metric3d.py @@ -0,0 +1,170 @@ +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from core.update_disp import DispBasicMultiUpdateBlock +from core.extractor import BasicEncoder, ResidualBlock +from core.extractor_metric3d import Metric3DExtractor +from core.corr import CorrBlock1D, PytorchAlternateCorrBlock1D, 
CorrBlockFast1D, AlternateCorrBlock +from core.utils.utils import hor_coords_grid, rescale_modulation +from core.geometry import LBPEncoder +from core.fusion import BetaModulator, RefinementMonStereo + + +try: + autocast = torch.cuda.amp.autocast +except: + # dummy autocast for PyTorch < 1.6 + class autocast: + def __init__(self, enabled): + pass + def __enter__(self): + pass + def __exit__(self, *args): + pass + +class RAFTStereoMetric3D(nn.Module): + def __init__(self, args): + super(RAFTStereoMetric3D, self).__init__() + self.args = args + + context_dims = args.hidden_dims + + self.cnet = Metric3DExtractor(args) + self.update_block = DispBasicMultiUpdateBlock(self.args, hidden_dims=args.hidden_dims) + + self.context_zqr_convs = nn.ModuleList([nn.Conv2d(context_dims[i], args.hidden_dims[i]*3, 3, padding=3//2) for i in range(self.args.n_gru_layers)]) + + self.refinement = RefinementMonStereo(args, hidden_dim=args.hidden_dims[-1]) + + if args.shared_backbone: + self.conv2 = nn.Sequential( + ResidualBlock(128, 128, 'instance', stride=1), + nn.Conv2d(128, 256, 3, padding=1)) + else: + self.fnet = BasicEncoder(output_dim=256, norm_fn='instance', downsample=args.n_downsample) + + # # Freeze the parameters of every module except the refinement head + # for module in [self.cnet, self.update_block, self.context_zqr_convs, self.fnet]: + # for param in module.parameters(): + # param.requires_grad = False + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_disp(self, img): + """ Disparity is represented as difference between two horizontal coordinate grids disp = hor_coords1 - hor_coords0""" + N, _, H, W = img.shape + + hor_coords0 = hor_coords_grid(N, H, W).to(img.device) + hor_coords1 = hor_coords_grid(N, H, W).to(img.device) + + return hor_coords0, hor_coords1 + + def upsample_disp(self, disp, mask): + """ Upsample disp field [H/8, W/8, 1] -> [H, W, 1] using convex combination """ + N, D, H, W = disp.shape + factor = 2 ** self.args.n_downsample + mask = mask.view(N, 1, 9, factor, factor, H, W) + mask = torch.softmax(mask, dim=2) + + up_disp = F.unfold(factor * disp, [3,3], padding=1) + up_disp = up_disp.view(N, D, 9, 1, 1, H, W) + + up_disp = torch.sum(mask * up_disp, dim=2) + up_disp = up_disp.permute(0, 1, 4, 2, 5, 3) + return up_disp.reshape(N, D, factor*H, factor*W) + + + + def forward(self, image1, image2, iters=12, disp_init=None, test_mode=False, vis_mode=False, intrinsic=None): + """ Estimate optical flow between pair of frames """ + + image1 = (2 * (image1 / 255.0) - 1.0).contiguous() + image2 = (2 * (image2 / 255.0) - 1.0).contiguous() + + # run the context network + with autocast(enabled=self.args.mixed_precision): + if self.args.shared_backbone: + *cnet_list, x = self.cnet(torch.cat((image1, image2), dim=0), dual_inp=True, num_layers=self.args.n_gru_layers) + fmap1, fmap2 = self.conv2(x).split(dim=0, split_size=x.shape[0]//2) + else: + # cnet_list: [[(128,248,360), (128,248,360)], [(128,124,180),(128,124,180)], [(128,62,90),(128,62,90)]] + net_list, inp_list, depth = self.cnet(image1, intrinsic) + # fmap1: (128,248,360), fmap2: (128,248,360) + fmap1, fmap2 = self.fnet([image1, image2]) + + # net_list = [torch.tanh(x[0]) for x in cnet_list] + # inp_list = [torch.relu(x[1]) for x in cnet_list] + + # # Rather than running the GRU's conv layers on the context features multiple times, we do it once at the beginning + # inp_list = [list(conv(i).split(split_size=conv.out_channels//3, dim=1)) for i,conv in zip(inp_list, self.context_zqr_convs)] + + if
self.args.corr_implementation == "reg": # Default + corr_block = CorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "alt": # More memory efficient than reg + corr_block = PytorchAlternateCorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "reg_cuda": # Faster version of reg + corr_block = CorrBlockFast1D + elif self.args.corr_implementation == "alt_cuda": # Faster version of alt + corr_block = AlternateCorrBlock + corr_fn = corr_block(fmap1, fmap2, radius=self.args.corr_radius, num_levels=self.args.corr_levels) + + hor_coords0, hor_coords1 = self.initialize_disp(net_list[0]) + + if disp_init is not None: + hor_coords1 = hor_coords1 + disp_init + + disp_predictions = [] + for itr in range(iters): + hor_coords1 = hor_coords1.detach() + corr = corr_fn(hor_coords1) # index correlation volume + disp = hor_coords1 - hor_coords0 + + with autocast(enabled=self.args.mixed_precision): + if self.args.n_gru_layers == 3 and self.args.slow_fast_gru: # Update low-res GRU + net_list = self.update_block(net_list, inp_list, iter32=True, iter16=False, iter08=False, update=False) + if self.args.n_gru_layers >= 2 and self.args.slow_fast_gru:# Update low-res GRU and mid-res GRU + net_list = self.update_block(net_list, inp_list, iter32=self.args.n_gru_layers==3, iter16=True, iter08=False, update=False) + net_list, up_mask, delta_disp = self.update_block(net_list, inp_list, corr, disp, iter32=self.args.n_gru_layers==3, iter16=self.args.n_gru_layers>=2) + + # F(t+1) = F(t) + \Delta(t) + hor_coords1 = hor_coords1 + delta_disp + + # We do not need to upsample or output intermediate results in test_mode + if test_mode and itr < iters-1: + continue + + # upsample predictions + disp_up = self.upsample_disp(hor_coords1 - hor_coords0, up_mask) + + disp_predictions.append(disp_up) + + + # refinement + corr = corr_fn(hor_coords1) + disp = -hor_coords1 + hor_coords0 + disp_refine, up_mask, depth_registered, conf = self.refinement(disp, depth, net_list[0], corr) + disp_up = self.upsample_disp(-disp_refine, up_mask) + depth_registered_up = self.upsample_disp(-depth_registered, up_mask) + disp_predictions.append(depth_registered_up) + disp_predictions.append(disp_up) + + if test_mode: + return hor_coords1 - hor_coords0, disp_up + + # if test_mode: + # return hor_coords1 - hor_coords0, depth_registered_up + + if vis_mode: + return {"disp_predictions": disp_predictions, + "depth": depth, } + + return {"disp_predictions": disp_predictions, + "conf": conf} \ No newline at end of file diff --git a/core/raft_stereo_noctx.py b/core/raft_stereo_noctx.py new file mode 100644 index 0000000000000000000000000000000000000000..e368ae500a65ee3cd5918df10f5ad8c0c40a7582 --- /dev/null +++ b/core/raft_stereo_noctx.py @@ -0,0 +1,143 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from core.update_noctx import DispBasicMultiUpdateBlock_NoCTX +from core.extractor import BasicEncoder, MultiBasicEncoder, ResidualBlock +from core.corr import CorrBlock1D, PytorchAlternateCorrBlock1D, CorrBlockFast1D, AlternateCorrBlock +from core.utils.utils import hor_coords_grid + + +try: + autocast = torch.cuda.amp.autocast +except: + # dummy autocast for PyTorch < 1.6 + class autocast: + def __init__(self, enabled): + pass + def __enter__(self): + pass + def __exit__(self, *args): + pass + +class RAFTStereoNoCTX(nn.Module): + def __init__(self, args): + super(RAFTStereoNoCTX, self).__init__() + self.args = args + + context_dims = args.hidden_dims + 
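# NOTE: in this no-context ablation the context encoder below still initializes the GRU hidden states (net_list), + # but the context ("inp") features and context_zqr_convs are never fed to DispBasicMultiUpdateBlock_NoCTX in forward() (see the commented-out inp_list lines there).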
+ self.cnet = MultiBasicEncoder(output_dim=[args.hidden_dims, context_dims], norm_fn=args.context_norm, downsample=args.n_downsample) + self.update_block = DispBasicMultiUpdateBlock_NoCTX(self.args, hidden_dims=args.hidden_dims) + + self.context_zqr_convs = nn.ModuleList([nn.Conv2d(context_dims[i], args.hidden_dims[i]*3, 3, padding=3//2) for i in range(self.args.n_gru_layers)]) + + if args.shared_backbone: + self.conv2 = nn.Sequential( + ResidualBlock(128, 128, 'instance', stride=1), + nn.Conv2d(128, 256, 3, padding=1)) + else: + self.fnet = BasicEncoder(output_dim=256, norm_fn='instance', downsample=args.n_downsample) + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_disp(self, img): + """ Disparity is represented as difference between two horizontal coordinate grids disp = hor_coords1 - hor_coords0""" + N, _, H, W = img.shape + + hor_coords0 = hor_coords_grid(N, H, W).to(img.device) + hor_coords1 = hor_coords_grid(N, H, W).to(img.device) + + return hor_coords0, hor_coords1 + + def upsample_disp(self, disp, mask): + """ Upsample disp field [H/8, W/8, 1] -> [H, W, 1] using convex combination """ + N, D, H, W = disp.shape + factor = 2 ** self.args.n_downsample + mask = mask.view(N, 1, 9, factor, factor, H, W) + mask = torch.softmax(mask, dim=2) + + up_disp = F.unfold(factor * disp, [3,3], padding=1) + up_disp = up_disp.view(N, D, 9, 1, 1, H, W) + + up_disp = torch.sum(mask * up_disp, dim=2) + up_disp = up_disp.permute(0, 1, 4, 2, 5, 3) + return up_disp.reshape(N, D, factor*H, factor*W) + + + def forward(self, image1, image2, iters=12, disp_init=None, test_mode=False, vis_mode=False, intrinsic=None): + """ Estimate optical flow between pair of frames """ + + image1 = (2 * (image1 / 255.0) - 1.0).contiguous() + image2 = (2 * (image2 / 255.0) - 1.0).contiguous() + + # run the context network + with autocast(enabled=self.args.mixed_precision): + if self.args.shared_backbone: + *cnet_list, x = self.cnet(torch.cat((image1, image2), dim=0), dual_inp=True, num_layers=self.args.n_gru_layers) + fmap1, fmap2 = self.conv2(x).split(dim=0, split_size=x.shape[0]//2) + else: + # cnet_list: [[(128,248,360), (128,248,360)], [(128,124,180),(128,124,180)], [(128,62,90),(128,62,90)]] + cnet_list = self.cnet(image1, num_layers=self.args.n_gru_layers) + # fmap1: (128,248,360), fmap2: (128,248,360) + fmap1, fmap2 = self.fnet([image1, image2]) + + # from IPython import embed + # embed() + + net_list = [torch.tanh(x[0]) for x in cnet_list] + # inp_list = [torch.relu(x[1]) for x in cnet_list] + + # Rather than running the GRU's conv layers on the context features multiple times, we do it once at the beginning + # inp_list = [list(conv(i).split(split_size=conv.out_channels//3, dim=1)) for i,conv in zip(inp_list, self.context_zqr_convs)] + + if self.args.corr_implementation == "reg": # Default + corr_block = CorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "alt": # More memory efficient than reg + corr_block = PytorchAlternateCorrBlock1D + fmap1, fmap2 = fmap1.float(), fmap2.float() + elif self.args.corr_implementation == "reg_cuda": # Faster version of reg + corr_block = CorrBlockFast1D + elif self.args.corr_implementation == "alt_cuda": # Faster version of alt + corr_block = AlternateCorrBlock + corr_fn = corr_block(fmap1, fmap2, radius=self.args.corr_radius, num_levels=self.args.corr_levels) + + hor_coords0, hor_coords1 = self.initialize_disp(net_list[0]) + + if disp_init is not None: + hor_coords1 = 
hor_coords1 + disp_init + + disp_predictions = [] + for itr in range(iters): + hor_coords1 = hor_coords1.detach() + corr = corr_fn(hor_coords1) # index correlation volume + disp = hor_coords1 - hor_coords0 + with autocast(enabled=self.args.mixed_precision): + if self.args.n_gru_layers == 3 and self.args.slow_fast_gru: # Update low-res GRU + net_list = self.update_block(net_list, iter32=True, iter16=False, iter08=False, update=False) + if self.args.n_gru_layers >= 2 and self.args.slow_fast_gru:# Update low-res GRU and mid-res GRU + net_list = self.update_block(net_list, iter32=self.args.n_gru_layers==3, iter16=True, iter08=False, update=False) + net_list, up_mask, delta_disp = self.update_block(net_list, corr, disp, iter32=self.args.n_gru_layers==3, iter16=self.args.n_gru_layers>=2) + + # F(t+1) = F(t) + \Delta(t) + hor_coords1 = hor_coords1 + delta_disp + + # We do not need to upsample or output intermediate results in test_mode + if test_mode and itr < iters-1: + continue + + # upsample predictions + disp_up = self.upsample_disp(hor_coords1 - hor_coords0, up_mask) + + disp_predictions.append(disp_up) + + if test_mode: + return hor_coords1 - hor_coords0, disp_up + + if vis_mode: + return {"disp_predictions": disp_predictions, } + + return {"disp_predictions": disp_predictions,} \ No newline at end of file diff --git a/core/refinement.py b/core/refinement.py new file mode 100644 index 0000000000000000000000000000000000000000..26b2865976928c4ca3f1653893c7569bb843a1df --- /dev/null +++ b/core/refinement.py @@ -0,0 +1,409 @@ +import os +import sys +import logging +import numpy as np +from collections import OrderedDict + +logging.basicConfig(level=logging.INFO, + format='%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',) + +import torch +import torch.nn as nn +import torch.nn.functional as F + + + +def to_2tuple(x): + if isinstance(x, tuple): + return x + if isinstance(x, list): + return tuple(x) + if isinstance(x, np.ndarray): + return tuple(x) + return (x,x) + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + window_size = to_2tuple(window_size) + B, H, W, C = x.shape + x = x.view(B, H // window_size[0], window_size[0], W // window_size[1], window_size[1], C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size[0], window_size[1], C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, H, W, C) + """ + window_size = to_2tuple(window_size) + B = int(windows.shape[0] / (H * W / window_size[0] / window_size[1])) + x = windows.view(B, H // window_size[0], W // window_size[1], window_size[0], window_size[1], -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class 
WindowAttention(nn.Module): + r""" Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + pretrained_window_size (tuple[int]): The height and width of the window in pre-training. + """ + + def __init__(self, dim_fea, dim_disp, window_size, num_heads, qkv_bias=True, attn_drop=0., proj_drop=0., + pretrained_window_size=[0, 0]): + + super().__init__() + self.dim_fea = dim_fea + self.dim_disp = dim_disp + self.window_size = to_2tuple(window_size) # Wh, Ww + self.pretrained_window_size = to_2tuple(pretrained_window_size) + self.num_heads = num_heads + + self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1))), requires_grad=True) + + # mlp to generate continuous relative position bias + self.cpb_mlp = nn.Sequential(nn.Linear(2, 512, bias=True), + nn.ReLU(inplace=True), + nn.Linear(512, num_heads, bias=False)) + + # get relative_coords_table + relative_coords_h = torch.arange(-(self.window_size[0] - 1), self.window_size[0], dtype=torch.float32) + relative_coords_w = torch.arange(-(self.window_size[1] - 1), self.window_size[1], dtype=torch.float32) + relative_coords_table = torch.stack( + torch.meshgrid([relative_coords_h, + relative_coords_w])).permute(1, 2, 0).contiguous().unsqueeze(0) # 1, 2*Wh-1, 2*Ww-1, 2 + if pretrained_window_size[0] > 0: + relative_coords_table[:, :, :, 0] /= (pretrained_window_size[0] - 1) + relative_coords_table[:, :, :, 1] /= (pretrained_window_size[1] - 1) + else: + relative_coords_table[:, :, :, 0] /= (self.window_size[0] - 1) + relative_coords_table[:, :, :, 1] /= (self.window_size[1] - 1) + relative_coords_table *= 8 # normalize to -8, 8 + relative_coords_table = torch.sign(relative_coords_table) * torch.log2( + torch.abs(relative_coords_table) + 1.0) / np.log2(8) + + self.register_buffer("relative_coords_table", relative_coords_table) + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qk = nn.Linear(dim_fea, dim_fea * 2, bias=False) + self.v = nn.Linear(dim_disp, dim_disp, bias=False) + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(dim_fea)) + self.v_bias = nn.Parameter(torch.zeros(dim_disp)) + else: + self.q_bias = None + self.v_bias = None + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim_disp, dim_disp) + self.proj_drop = nn.Dropout(proj_drop) + self.softmax = nn.Softmax(dim=-1) + + 
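# In forward() below, the guidance (context) features produce the queries and keys while x (the geometry/disparity signal) produces the values; + # attention uses cosine similarity with a clamped learnable logit scale and a continuous relative position bias (as in Swin Transformer V2), + # and, when provided, shift_mask / reliability_mask are added to the attention logits to block cross-window and unreliable tokens. +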
def forward(self, x, guidance, shift_mask=None, reliability_mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C_x = x.shape + qk_bias = None + v_bias = None + if self.q_bias is not None: + qk_bias = torch.cat((self.q_bias, torch.zeros_like(self.q_bias, requires_grad=False))) + v_bias = self.v_bias + qk = F.linear(input=guidance, weight=self.qk.weight, bias=qk_bias) + v = F.linear(input=x, weight=self.v.weight, bias=v_bias) + qk = qk.reshape(B_, N, 2, self.num_heads, -1).permute(2, 0, 3, 1, 4) # (2, B_, nH, N, C_fea/nH) + v = v.reshape(B_, N, 1, 1, -1).permute(2, 0, 3, 1, 4) # (1, B_, 1, N, C_x) + q, k = qk[0], qk[1] # make torchscript happy (cannot use tensor as tuple) + v = v.squeeze(0) + + # cosine attention + attn = (F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1)) # (B_, nH, N, N) + logit_scale = torch.clamp(self.logit_scale, max=torch.log(torch.tensor(1. / 0.01, device=self.logit_scale.device))).exp() + attn = attn * logit_scale + + relative_position_bias_table = self.cpb_mlp(self.relative_coords_table).view(-1, self.num_heads) # (1, 2*Wh-1, 2*Ww-1, nH) + relative_position_bias = relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + relative_position_bias = 16 * torch.sigmoid(relative_position_bias) + attn = attn + relative_position_bias.unsqueeze(0) # (B_, nH, N, N) + + if shift_mask is not None: + nW = shift_mask.shape[0] + # (B=B_/nW, nW, nH, N, N) + (nW, N, N) + (B=B_/nW, nW, N) + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + \ + shift_mask.unsqueeze(1).unsqueeze(0) +\ + reliability_mask.view(B_ // nW, nW, N).unsqueeze(2).unsqueeze(-2) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).mean(dim=1) # (B_, N, C_x) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock(nn.Module): + r""" Swin Transformer Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + pretrained_window_size (int): Window size in pre-training. 
+ """ + + def __init__(self, args, dim_fea, dim_disp, num_heads, window_size=7, shift_size=0, + mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0., drop_path=0., + act_layer=nn.GELU, norm_layer=nn.LayerNorm, pretrained_window_size=0): + super().__init__() + self.dim_fea = dim_fea + self.dim_disp = dim_disp + self.num_heads = num_heads + self.window_size = to_2tuple(window_size) + self.shift_size = to_2tuple(shift_size) + self.mlp_ratio = mlp_ratio + assert 0 <= self.shift_size[0] < self.window_size[0], "shift_size must in 0-window_size" + assert 0 <= self.shift_size[1] < self.window_size[1], "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim_disp) + self.attn = WindowAttention( + dim_fea, dim_disp, + window_size=to_2tuple(self.window_size), num_heads=num_heads, + qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop, + pretrained_window_size=to_2tuple(pretrained_window_size)) + + assert drop_path<=0, "no support for DropPath" + # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.drop_path = nn.Identity() + self.norm2 = norm_layer(dim_disp) + mlp_hidden_dim = int(dim_disp * mlp_ratio) + self.mlp = Mlp(in_features=dim_disp, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + self.apply(self._init_weights) + + def get_shift_mask(self, H, W, device): + if self.shift_size[0]>0 or self.shift_size[1]>0: + # calculate attention mask for SW-MSA + img_mask = torch.zeros((1, H, W, 1), device=device) # 1 H W 1 + first_end = -self.window_size[0] if self.window_size[0]>0 else None + second_end = -self.shift_size[0] if self.shift_size[0]>0 else None + h_slices = (slice(0, first_end), + slice(first_end, second_end), + slice(second_end, None)) + first_end = -self.window_size[1] if self.window_size[1]>0 else None + second_end = -self.shift_size[1] if self.shift_size[1]>0 else None + w_slices = (slice(0, first_end), + slice(first_end, second_end), + slice(second_end, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size[0] * self.window_size[1]) + shift_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + shift_mask = shift_mask.masked_fill(shift_mask != 0, float(-100.0)).masked_fill(shift_mask == 0, float(0.0)) + else: + shift_mask = None + return shift_mask + + def forward(self, x, guidance, reliability): + # padding + _,_,H,W = x.shape + wh,ww = to_2tuple(self.window_size) + padding_H = int(np.ceil(H/wh)*wh-H) + padding_W = int(np.ceil(W/ww)*ww-W) + x = F.pad(x,(padding_W,0,padding_H,0),mode="replicate") + guidance = F.pad(guidance,(padding_W,0,padding_H,0),mode="replicate") + reliability = F.pad(reliability,(padding_W,0,padding_H,0),mode="replicate") + + x = x.permute((0,2,3,1)) + guidance = guidance.permute((0,2,3,1)) + reliability = reliability.permute((0,2,3,1)) + + B, H, W, C_fea = guidance.shape + _, _, _, C_x = x.shape + # guidance = guidance.flatten(2).transpose(1, 2) # (B,H*W,C_fea) + # x = x.flatten(2).transpose(1, 2) # (B,H,W,C_x) + + shift_mask = self.get_shift_mask(H,W, x.device) + shortcut = x + # x = x.view(B, H, W, C) + + # cyclic shift + if self.shift_size[0]>0 or self.shift_size[1]>0: + shifted_x = torch.roll(x, shifts=(-self.shift_size[0], -self.shift_size[1]), dims=(1, 2)) # (80, 180) + shifted_guidance = torch.roll(guidance, shifts=(-self.shift_size[0], -self.shift_size[1]), dims=(1, 2)) + shifted_reliability = 
torch.roll(reliability, shifts=(-self.shift_size[0], -self.shift_size[1]), dims=(1, 2)) + else: + shifted_x = x + shifted_guidance = guidance + shifted_reliability = reliability + + # partition windows + x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C_x + x_windows = x_windows.view(-1, self.window_size[0] * self.window_size[1], C_x) # nW*B, window_size*window_size, C_x + guidance_windows = window_partition(shifted_guidance, self.window_size) # nW*B, window_size, window_size, C_fea + guidance_windows = guidance_windows.view(-1, self.window_size[0] * self.window_size[1], C_fea) # nW*B, window_size*window_size, C_fea + reliability_windows = window_partition(shifted_reliability, self.window_size) # nW*B, window_size, window_size, 1 + reliability_windows = reliability_windows.view(-1, self.window_size[0] * self.window_size[1]) # nW*B, window_size*window_size + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, guidance_windows, + shift_mask=shift_mask, + reliability_mask=reliability_windows) # nW*B, window_size*window_size, C_x + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size[0], self.window_size[1], C_x) + shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C + + # reverse cyclic shift + if self.shift_size[0]>0 or self.shift_size[1]>0: + x = torch.roll(shifted_x, shifts=(self.shift_size[0], self.shift_size[1]), dims=(1, 2)) + else: + x = shifted_x + x = x.view(B, H, W, C_x) + # x = shortcut + self.drop_path(self.norm1(x)) + + # FFN + # x = x + self.drop_path(self.norm2(self.mlp(x))) + # x = shortcut + self.drop_path(self.norm2(self.mlp(x))) + # x = shortcut + self.mlp(x) + x = shortcut + self.mlp(x)*(-shifted_reliability/100) + x = x.view(B,H,W,C_x).permute((0,3,1,2)) + + # unpadding + x = x[:,:,padding_H:,padding_W:] + return x + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + +class Refinement(nn.Module): + def __init__(self, args, in_chans, dim_fea, dim_disp): + super(Refinement, self).__init__() + self.args = args + self.detach = args.detach_in_refinement + self.window_size = to_2tuple(args.refine_win_size) + self.shift_size = (self.window_size[0]//2, self.window_size[1]//2) + self.patch_embed = nn.Conv2d(in_chans, dim_fea, kernel_size=3, stride=1, padding=1) + self.propagation_1 = SwinTransformerBlock(args, dim_fea, dim_disp, self.args.num_heads, + window_size=self.window_size, shift_size=0,) + self.propagation_2 = SwinTransformerBlock(args, dim_fea, dim_disp, self.args.num_heads, + window_size=self.window_size, shift_size=self.shift_size,) + if self.args.split_win: + rev_win_size = [self.window_size[1], self.window_size[0]] + rev_shift_size = [self.shift_size[1], self.shift_size[0]] + self.propagation_1_2 = SwinTransformerBlock(args, dim_fea, dim_disp, self.args.num_heads, + window_size=rev_win_size, shift_size=0,) + self.propagation_2_2 = SwinTransformerBlock(args, dim_fea, dim_disp, self.args.num_heads, + window_size=rev_win_size, shift_size=rev_shift_size,) + + def forward(self, geo_params, fea, confidence=None, if_shift=False): + if type(fea) is list: + fea = torch.cat(fea, dim=1) + guidance = self.patch_embed(fea.detach() if self.detach else fea) + + if confidence is not None : + uncertainty = F.sigmoid(confidence.detach()) + uncertainty = 
uncertainty.masked_fill(uncertainty>self.args.U_thold, float(-100.0)).masked_fill(uncertainty<=self.args.U_thold, float(0.0)) + reliability = uncertainty.detach() + else: + reliability = None + + if not if_shift: + geo_params_refine = self.propagation_1(geo_params.detach(), guidance, reliability) + if self.args.split_win: + geo_params_refine = self.propagation_1_2(geo_params_refine, guidance, reliability) + else: + geo_params_refine = self.propagation_2(geo_params.detach(), guidance, reliability) + if self.args.split_win: + geo_params_refine = self.propagation_2_2(geo_params_refine, guidance, reliability) + + return geo_params_refine + + +class UpdateHistory(nn.Module): + def __init__(self, args, in_chans1, in_chans2): + super(UpdateHistory, self).__init__() + self.conv = nn.Conv2d(in_chans2, in_chans2, kernel_size=1, stride=1, padding=0) + self.update = nn.Sequential(nn.Conv2d(in_chans1+in_chans2, in_chans1, kernel_size=3, stride=1, padding=1),) + + def forward(self, his, disp): + hist_update = self.update( torch.cat([his,self.conv(disp)], dim=1) ) + return hist_update + diff --git a/core/stereo_datasets.py b/core/stereo_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..ba1e581274c4fc39009f339e1234ceb2b7e5bba8 --- /dev/null +++ b/core/stereo_datasets.py @@ -0,0 +1,702 @@ +# Data loading based on https://github.com/NVIDIA/flownet2-pytorch + +import numpy as np +import torch +import torch.utils.data as data +import torch.nn.functional as F +import logging +import os +import re +import copy +import math +import random +from pathlib import Path +from glob import glob +import os.path as osp + +from core.utils import plane +from core.utils import frame_utils +from core.utils.ddp import get_loader +from core.utils.augmentor import FlowAugmentor, SparseFlowAugmentor +DATASET_ROOT = os.getenv('DATASET_ROOT') + + +class StereoDataset(data.Dataset): + def __init__(self, aug_params=None, sparse=False, reader=None, args=None): + self.augmentor = None + self.sparse = sparse + self.img_pad = aug_params.pop("img_pad", None) if aug_params is not None else None + if aug_params is not None and "crop_size" in aug_params: + if sparse: + self.augmentor = SparseFlowAugmentor(**aug_params) + else: + self.augmentor = FlowAugmentor(**aug_params) + + if reader is None: + self.disparity_reader = frame_utils.read_gen + else: + self.disparity_reader = reader + + # if args is not None: + # # self.plane = args.plane_datset + # self.slant = args.slant + # self.slant_norm = args.slant_norm + # else: + # # self.plane = False + # self.slant = None + # self.slant_norm = False + + self.is_test = args.is_test if hasattr(args, "is_test") and args.is_test else False + self.init_seed = False + self.flow_list = [] + self.disparity_list = [] + self.image_list = [] + self.extra_info = {} + + def __getitem__(self, index): + + if self.is_test: + img1 = frame_utils.read_gen(self.image_list[index][0]) + img2 = frame_utils.read_gen(self.image_list[index][1]) + img1 = np.array(img1).astype(np.uint8)[..., :3] + img2 = np.array(img2).astype(np.uint8)[..., :3] + img1 = torch.from_numpy(img1).permute(2, 0, 1).float() + img2 = torch.from_numpy(img2).permute(2, 0, 1).float() + return self.image_list[index] + [self.disparity_list[index]], \ + img1, img2, torch.zeros_like(torch.zeros_like(img1))[:1], torch.ones_like(torch.zeros_like(img1))[:1] + + if not self.init_seed: + worker_info = torch.utils.data.get_worker_info() + if worker_info is not None: + torch.manual_seed(worker_info.id) + 
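+                # also seed NumPy and Python's random module so each DataLoader worker gets its own
+                # deterministic augmentation stream instead of replaying the same random crops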
np.random.seed(worker_info.id) + random.seed(worker_info.id) + self.init_seed = True + + try: + index = index % len(self.image_list) + intrinsic = self.extra_info["intrinsics"][index] if "intrinsics" in self.extra_info else None + disp = self.disparity_reader(self.disparity_list[index]) + if isinstance(disp, tuple): + disp, valid = disp + else: + valid = disp < 512 + + img1 = frame_utils.read_gen(self.image_list[index][0]) + img2 = frame_utils.read_gen(self.image_list[index][1]) + + img1 = np.array(img1).astype(np.uint8) + img2 = np.array(img2).astype(np.uint8) + + disp = np.array(disp).astype(np.float32) + flow = np.stack([-disp, np.zeros_like(disp)], axis=-1) + + except Exception as err: + raise Exception(err, "{}, {}, {}".format(self.image_list[index][0], + self.image_list[index][1], + self.disparity_list[index] )) + + # grayscale images + if len(img1.shape) == 2: + img1 = np.tile(img1[...,None], (1, 1, 3)) + img2 = np.tile(img2[...,None], (1, 1, 3)) + else: + img1 = img1[..., :3] + img2 = img2[..., :3] + + if self.augmentor is not None: + if self.sparse: + img1, img2, flow, valid, intrinsic = self.augmentor(img1, img2, flow, valid, intrinsic) + else: + img1, img2, flow, intrinsic = self.augmentor(img1, img2, flow, intrinsic) + + try: + img1 = torch.from_numpy(img1).permute(2, 0, 1).float() + img2 = torch.from_numpy(img2).permute(2, 0, 1).float() + flow = torch.from_numpy(flow).permute(2, 0, 1).float() + intrinsic = torch.from_numpy(np.array(intrinsic)).float() if intrinsic is not None else torch.from_numpy(np.eye(3)).float() + except Exception as err: + raise Exception(err, "{}, {}, {}".format(self.image_list[index][0], + self.image_list[index][1], + self.disparity_list[index]), + "{}, {}, {}".format(img1.shape, img2.shape, flow.shape), ) + + if self.sparse: + valid = torch.from_numpy(valid) + else: + valid = (flow[0].abs() < 512) & (flow[1].abs() < 512) + + if self.img_pad is not None: + padH, padW = self.img_pad + img1 = F.pad(img1, [padW]*2 + [padH]*2) + img2 = F.pad(img2, [padW]*2 + [padH]*2) + + flow = flow[:1] + + return self.image_list[index] + [self.disparity_list[index]], \ + img1, img2, flow, valid.float(), intrinsic + + + def __mul__(self, v): + copy_of_self = copy.deepcopy(self) + copy_of_self.flow_list = v * copy_of_self.flow_list + copy_of_self.image_list = v * copy_of_self.image_list + copy_of_self.disparity_list = v * copy_of_self.disparity_list + if isinstance(copy_of_self.extra_info, list): + copy_of_self.extra_info = v * copy_of_self.extra_info + else: + copy_of_self.extra_info = {key: val*v for key, val in copy_of_self.extra_info.items()} + return copy_of_self + + def __len__(self): + return len(self.image_list) + + +class SceneFlowDatasets(StereoDataset): + def __init__(self, aug_params=None, root='', dstype='frames_cleanpass', + things_test=False, caching=False, args=None, eval=False): + super(SceneFlowDatasets, self).__init__(aug_params, args=args) + self.eval = args.eval if args is not None else eval + self.root = root if len(root)>0 else DATASET_ROOT + self.dstype = dstype + self.caching = caching + self.extra_info["intrinsics"] = [] + assert os.path.exists(self.root), "check the existence: {}".format(self.root) + + if things_test: + self._add_things("TEST") + else: + self._add_things("TRAIN") + self._add_monkaa() + self._add_driving() + + def _add_things(self, split='TRAIN'): + """ Add FlyingThings3D data """ + + original_length = len(self.disparity_list) + cache_file = osp.join(self.root, 'flying3d'+"-"+self.dstype+"-"+split+".npz") + if self.caching and 
os.path.exists(cache_file): + cache = np.load(cache_file) + root = cache["root"] + left_images = cache["left_images"] + right_images = cache["right_images"] + disparity_images = cache["disparity_images"] + else : + root = osp.join(self.root, 'flying3d') + left_images = sorted( glob(osp.join(root, self.dstype, split, '*/*/left/*.png')) ) + right_images = [ im.replace('left', 'right') for im in left_images ] + disparity_images = [ im.replace(self.dstype, 'disparity').replace('.png', '.pfm') for im in left_images ] + if self.caching : + np.savez(cache_file, + root=root, + left_images=left_images, + right_images=right_images, + disparity_images=disparity_images) + + # Choose a random subset of 400 images for validation + state = np.random.get_state() + np.random.seed(1000) + if not self.eval: + val_idxs = set(np.random.permutation(len(left_images))[:400]) + else: + val_idxs = set(np.random.permutation(len(left_images))) + np.random.set_state(state) + + for idx, (img1, img2, disp) in enumerate(zip(left_images, right_images, disparity_images)): + if (split == 'TEST' and idx in val_idxs) or split == 'TRAIN': + self.image_list += [ [img1, img2] ] + self.disparity_list += [ disp ] + self.extra_info["intrinsics"] += [ [1050, 1050, 479.5, 269.5] ] + + logging.info(f"Added {len(self.disparity_list) - original_length} from FlyingThings {self.dstype}") + + def _add_monkaa(self): + """ Add FlyingThings3D data """ + + original_length = len(self.disparity_list) + root = osp.join(self.root, 'monkaa') + left_images = sorted( glob(osp.join(root, self.dstype, '*/left/*.png')) ) + right_images = [ image_file.replace('left', 'right') for image_file in left_images ] + disparity_images = [ im.replace(self.dstype, 'disparity').replace('.png', '.pfm') for im in left_images ] + + for img1, img2, disp in zip(left_images, right_images, disparity_images): + self.image_list += [ [img1, img2] ] + self.disparity_list += [ disp ] + self.extra_info["intrinsics"] += [ [1050, 1050, 479.5, 269.5] ] + logging.info(f"Added {len(self.disparity_list) - original_length} from Monkaa {self.dstype}") + + + def _add_driving(self): + """ Add FlyingThings3D data """ + + original_length = len(self.disparity_list) + root = osp.join(self.root, 'driving') + left_images = sorted( glob(osp.join(root, self.dstype, '*/*/*/left/*.png')) ) + right_images = [ image_file.replace('left', 'right') for image_file in left_images ] + disparity_images = [ im.replace(self.dstype, 'disparity').replace('.png', '.pfm') for im in left_images ] + + for img1, img2, disp in zip(left_images, right_images, disparity_images): + self.image_list += [ [img1, img2] ] + self.disparity_list += [ disp ] + if img1.find("15mm_focallength") != -1: + self.extra_info["intrinsics"] += [ [450, 450, 479.5, 269.5] ] + elif img1.find("35mm_focallength") != -1: + self.extra_info["intrinsics"] += [ [1050, 1050, 479.5, 269.5] ] + else: + raise Exception(f"Unknown intrinsics: {im1}") + logging.info(f"Added {len(self.disparity_list) - original_length} from Driving {self.dstype}") + + +class ETH3D(StereoDataset): + def __init__(self, aug_params=None, root='datasets/ETH3D', split='training', args=None): + super(ETH3D, self).__init__(aug_params, sparse=True, args=args) + root = root if len(root)>0 else DATASET_ROOT + assert os.path.exists(root), "check the existence: {}".format(root) + + image1_list = sorted( glob(osp.join(root, f'two_view_{split}/*/im0.png')) ) + image2_list = sorted( glob(osp.join(root, f'two_view_{split}/*/im1.png')) ) + disp_list = sorted( glob(osp.join(root, 
'two_view_training/*/disp0GT.pfm')) ) if split == 'training' else [osp.join(root, 'two_view_training_gt/playground_1l/disp0GT.pfm')]*len(image1_list) + + for img1, img2, disp in zip(image1_list, image2_list, disp_list): + self.image_list += [ [img1, img2] ] + self.disparity_list += [ disp ] + +class SintelStereo(StereoDataset): + def __init__(self, aug_params=None, root='datasets/SintelStereo', args=None): + super().__init__(aug_params, sparse=True, reader=frame_utils.readDispSintelStereo, args=args) + root = root if len(root)>0 else DATASET_ROOT + + image1_list = sorted( glob(osp.join(root, 'training/*_left/*/frame_*.png')) ) + image2_list = sorted( glob(osp.join(root, 'training/*_right/*/frame_*.png')) ) + disp_list = sorted( glob(osp.join(root, 'training/disparities/*/frame_*.png')) ) * 2 + + for img1, img2, disp in zip(image1_list, image2_list, disp_list): + assert img1.split('/')[-2:] == disp.split('/')[-2:] + self.image_list += [ [img1, img2] ] + self.disparity_list += [ disp ] + +class FallingThings(StereoDataset): + def __init__(self, aug_params=None, root='datasets/FallingThings', args=None): + super().__init__(aug_params, reader=frame_utils.readDispFallingThings, args=args) + root = root if len(root)>0 else DATASET_ROOT + assert os.path.exists(root) + + with open(os.path.join(root, 'filenames.txt'), 'r') as f: + filenames = sorted(f.read().splitlines()) + + image1_list = [osp.join(root, e) for e in filenames] + image2_list = [osp.join(root, e.replace('left.jpg', 'right.jpg')) for e in filenames] + disp_list = [osp.join(root, e.replace('left.jpg', 'left.depth.png')) for e in filenames] + + for img1, img2, disp in zip(image1_list, image2_list, disp_list): + self.image_list += [ [img1, img2] ] + self.disparity_list += [ disp ] + +class TartanAir(StereoDataset): + def __init__(self, aug_params=None, root='datasets', keywords=[]): + super().__init__(aug_params, reader=frame_utils.readDispTartanAir) + root = root if len(root)>0 else DATASET_ROOT + assert os.path.exists(root) + + with open(os.path.join(root, 'tartanair_filenames.txt'), 'r') as f: + filenames = sorted(list(filter(lambda s: 'seasonsforest_winter/Easy' not in s, f.read().splitlines()))) + for kw in keywords: + filenames = sorted(list(filter(lambda s: kw in s.lower(), filenames))) + + image1_list = [osp.join(root, e) for e in filenames] + image2_list = [osp.join(root, e.replace('_left', '_right')) for e in filenames] + disp_list = [osp.join(root, e.replace('image_left', 'depth_left').replace('left.png', 'left_depth.npy')) for e in filenames] + + for img1, img2, disp in zip(image1_list, image2_list, disp_list): + self.image_list += [ [img1, img2] ] + self.disparity_list += [ disp ] + +class KITTI(StereoDataset): + def __init__(self, aug_params=None, root='datasets/KITTI', image_set='training', args=None): + super(KITTI, self).__init__(aug_params, sparse=True, reader=frame_utils.readDispKITTI, args=args) + root = root if len(root)>0 else DATASET_ROOT + assert os.path.exists(root), "check the existence: {}".format(root) + + image1_list = sorted(glob(os.path.join(root, image_set, 'image_2/*_10.png'))) + image2_list = sorted(glob(os.path.join(root, image_set, 'image_3/*_10.png'))) + disp_list = sorted(glob(os.path.join(root, 'training', 'disp_occ_0/*_10.png'))) if image_set == 'training' else [osp.join(root, 'training/disp_occ_0/000085_10.png')]*len(image1_list) + + for idx, (img1, img2, disp) in enumerate(zip(image1_list, image2_list, disp_list)): + self.image_list += [ [img1, img2] ] + self.disparity_list += [ disp ] + + 
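+# --- Editorial usage sketch (not part of the original patch) ----------------
+# Minimal example of how one of the StereoDataset subclasses above is consumed;
+# the root path, crop size and scale range are illustrative assumptions, not
+# values taken from this repository.
+def _example_kitti_usage():
+    aug_params = {'crop_size': [320, 720], 'min_scale': -0.2,
+                  'max_scale': 0.4, 'do_flip': False, 'yjitter': True}
+    dataset = KITTI(aug_params, root='datasets/KITTI', image_set='training')
+    meta, img1, img2, disp, valid, intrinsic = dataset[0]
+    # meta      : [left_path, right_path, disp_path]
+    # img1/img2 : float tensors of shape (3, H, W) after augmentation
+    # disp      : (1, H, W) tensor of negative x-flow, i.e. -disparity
+    # valid     : (H, W) mask of pixels with ground-truth disparity
+    # intrinsic : (3, 3) identity, since KITTI provides no intrinsics here
+    return data.DataLoader(dataset, batch_size=4, shuffle=True, drop_last=True)
+# ----------------------------------------------------------------------------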
+class KITTI2012(StereoDataset): + def __init__(self, aug_params=None, root='datasets/KITTI2012', image_set='training', args=None): + super(KITTI2012, self).__init__(aug_params, sparse=True, reader=frame_utils.readDispKITTI, args=args) + root = root if len(root)>0 else DATASET_ROOT + assert os.path.exists(root), "check the existence: {}".format(root) + + image1_list = sorted(glob(os.path.join(root, image_set, 'image_0/*_10.png'))) + image2_list = sorted(glob(os.path.join(root, image_set, 'image_1/*_10.png'))) + disp_list = sorted(glob(os.path.join(root, 'training', 'disp_occ/*_10.png'))) if image_set == 'training' else [osp.join(root, 'training/disp_occ_0/000085_10.png')]*len(image1_list) + + for idx, (img1, img2, disp) in enumerate(zip(image1_list, image2_list, disp_list)): + self.image_list += [ [img1, img2] ] + self.disparity_list += [ disp ] + + +class Middlebury(StereoDataset): + def __init__(self, aug_params=None, root='datasets/Middlebury', split='F', image_set='training', args=None): + super(Middlebury, self).__init__(aug_params, sparse=True, reader=frame_utils.readDispMiddlebury, args=args) + root = root if len(root)>0 else DATASET_ROOT + assert os.path.exists(root), "check the existence: {}".format(root) + assert split in ["F", "H", "Q", "2014"] + if split == "2014": # datasets/Middlebury/2014/Pipes-perfect/im0.png + scenes = list((Path(root) / "2014").glob("*")) + for scene in scenes: + for s in ["E","L",""]: + self.image_list += [ [str(scene / "im0.png"), str(scene / f"im1{s}.png")] ] + self.disparity_list += [ str(scene / "disp0.pfm") ] + else: + lines = list(map(osp.basename, glob(os.path.join(root, f"MiddEval3/{image_set}{split}/*")))) + image1_list = sorted([os.path.join(root, "MiddEval3", f'{image_set}{split}', f'{name}/im0.png') for name in lines]) + image2_list = sorted([os.path.join(root, "MiddEval3", f'{image_set}{split}', f'{name}/im1.png') for name in lines]) + disp_list = sorted([os.path.join(root, "MiddEval3", f'{image_set}{split}', f'{name}/disp0GT.pfm') for name in lines]) + if image_set=="training": + assert len(image1_list) == len(image2_list) == len(disp_list) > 0, [image1_list, root, image_set, split] + else: + assert len(image1_list) == len(image2_list) > 0, [image1_list, root, image_set, split] + for img1, img2, disp in zip(image1_list, image2_list, disp_list): + self.image_list += [ [img1, img2] ] + self.disparity_list += [ disp ] + + +class Booster(StereoDataset): + def __init__(self, aug_params=None, root='datasets/booster/train/balanced', image_set='train', args=None): + super(Booster, self).__init__(aug_params, sparse=True, reader=frame_utils.readDispBooster) + assert os.path.exists(root), print(root) + # image1_list = sorted(glob(os.path.join(root, image_set, "**/camera_00/im*.png"), recursive=True)) + image2_list = sorted(glob(os.path.join(root, image_set, "**/camera_02/im*.png"), recursive=True)) + image1_list = [img.replace("camera_02", "camera_00") for img in image2_list] + + disp_list = [os.path.join(os.path.split(x)[0].replace("camera_00", ""), 'disp_00.npy') for x in image1_list] + mask_list = [os.path.join(os.path.split(x)[0].replace("camera_00", ""), 'mask_cat.png') for x in image1_list] + right_disp_list = [os.path.join(os.path.split(x)[0].replace("camera_00", ""), 'disp_02.npy') for x in image1_list] + + for img1, img2, disp, disp_r, mask in zip(image1_list, image2_list, disp_list, right_disp_list,mask_list): + self.image_list += [[img1, img2]] + self.disparity_list += [disp] + # self.trans_mask += [mask] + + +class 
NerfStereoDataset(StereoDataset): + def __init__(self, aug_params=None, root='datasets/NerfStereo', image_set='training', args=None, txt_root=None): + super(NerfStereoDataset, self).__init__(aug_params, sparse=True, reader=frame_utils.readDispNerfS, args=args) + root = root if len(root)>0 else DATASET_ROOT + assert os.path.exists(root), "check the existence: {}".format(root) + + if txt_root is None: + left_list = sorted(glob(os.path.join(root, "*/*/baseline_*/left/*.jpg"), recursive=True)) + image1_list = [] + for path in left_list: + match = re.search(r"(.*?/Q/)", path) + prefix = match.group(1) # prefix + suffix = os.path.basename(path) # file name + path_new = f"{prefix}center/{suffix}" + image1_list.append( path_new ) + image2_list = sorted(glob(os.path.join(root, "*/*/baseline_*/right/*.jpg"), recursive=True)) + disp_list = sorted(glob(os.path.join(root, "*/*/baseline_*/disparity/*.png"), recursive=True)) + # dispr_list = sorted(glob(os.path.join(root, "**/*_right.disp.png"), recursive=True)) + else: + image1_list = np.load( os.path.join(txt_root, 'image1_list.npy') ) + image2_list = np.load( os.path.join(txt_root, 'image2_list.npy') ) + disp_list = np.load( os.path.join(txt_root, 'disp_list.npy') ) + + for idx, (img1, img2, disp) in enumerate(zip(image1_list, image2_list, disp_list)): + self.image_list += [ [img1, img2] ] + self.disparity_list += [ disp ] + + +class CREStereoDataset(StereoDataset): + def __init__(self, aug_params=None, root='datasets/CREStereo_dataset', image_set='training', args=None, txt_root=None): + super(CREStereoDataset, self).__init__(aug_params, sparse=True, reader=frame_utils.readDispCRES, args=args) + root = root if len(root)>0 else DATASET_ROOT + assert os.path.exists(root), "check the existence: {}".format(root) + + if txt_root is None: + image1_list = sorted(glob(os.path.join(root, "**/*_left.jpg"), recursive=True)) + image2_list = sorted(glob(os.path.join(root, "**/*_right.jpg"), recursive=True)) + disp_list = sorted(glob(os.path.join(root, "**/*_left.disp.png"), recursive=True)) + else: + image1_list = np.load( os.path.join(txt_root, 'image1_list.npy') ) + image2_list = np.load( os.path.join(txt_root, 'image2_list.npy') ) + disp_list = np.load( os.path.join(txt_root, 'disp_list.npy') ) + # dispr_list = sorted(glob(os.path.join(root, "**/*_right.disp.png"), recursive=True)) + + for idx, (img1, img2, disp) in enumerate(zip(image1_list, image2_list, disp_list)): + self.image_list += [ [img1, img2] ] + self.disparity_list += [ disp ] + +class Trans(StereoDataset): + def __init__(self, aug_params=None, root='./datasets/Trans', things_test=False, args=None): + super(Trans, self).__init__(aug_params) + self.root = root if len(root)>0 else DATASET_ROOT + self.args = args + self.extra_info["intrinsics"] = [] + + if things_test: + self._add_things("TEST") + else: + self._add_things("TRAIN") + + def _add_things(self, split='TRAIN'): + original_length = len(self.disparity_list) + + left_images = sorted(glob(osp.join(self.root, split, '*/*/left/img/*.jpg')) ) + assert len(left_images)>0, f"Loaded 0 images from {self.root}" + + right_images = [ im.replace('left', 'right') for im in left_images ] + disparity_images = [ im.replace('img', 'disparity').replace('.jpg', '.pfm') for im in left_images ] + disparity_images_noTran = [im.replace('img', 'disparity_without_trans').replace('.jpg', '.pfm') for im in left_images ] + + for idx, (img1, img2, disp, disp_noTran) in enumerate(zip(left_images, right_images, disparity_images, disparity_images_noTran)): + self.image_list 
+= [ [img1, img2] ] + self.disparity_list += [ disp ] + # self.multi_label.append([disp, disp_noTran]) + self.extra_info["intrinsics"] += [ [933.3333333333334, 787.5, 480.0, 270.0] ] + logging.info("-"*10 + f"Added {len(self.disparity_list) - original_length} from Trans") + +class Fooling3DDataset(StereoDataset): + def __init__(self, aug_params=None, root='datasets/Fooling3D', image_set='training', args=None): + super(Fooling3DDataset, self).__init__(aug_params, sparse=True, reader=frame_utils.readDispFooling3D) + assert os.path.exists(root) + self.root = root + self.image_set = image_set + self.video_frames_info = {} + + self._add_mono() + self._build_video_frames_info() + + def _add_mono(self): + origin_length = len(self.disparity_list) + print(f"using {self.image_set} in fooling3D") + + if self.image_set=="training": + df = pd.read_csv(os.path.join(self.root, 'meta_data/scale_factors.csv'), header=None) + + # df.columns = ['path', 'scale'] + # video_name = "Service_Cars_1_deleted_scene_3d_remake_Servio_Comunitrio" + # df = df[df['path'].str.contains(video_name, case=False, na=False)] + + self.scale_factor = dict(zip( + df.iloc[:, 0].apply(lambda x: x.replace('/data2', './datasets')), + df.iloc[:, 1] + )) + # right_images = sorted(glob(os.path.join(self.root, 'video_frame_sequence_right/*/*/*.png'))) + right_images = df.iloc[:, 0].apply(lambda x: x.replace('/data2', './datasets')).tolist() + disp_list = [ im.replace('video_frame_sequence_right', 'depth_rect') for im in right_images ] + left_images = [ im.replace('video_frame_sequence_right', 'video_frame_sequence') for im in right_images ] + + assert len(left_images) == len(right_images) == len(disp_list) > 0, [len(left_images), len(right_images), len(disp_list)] + for img1, img2, disp in zip(left_images, right_images, disp_list): + self.image_list += [ [img1, img2] ] + self.disparity_list += [ disp ] + + elif self.image_set=="testing": + with open(os.path.join(self.root, 'meta_data/testing_enter.pkl'), 'rb') as f: + data = pickle.load(f) + + self.extra_info["mask"] = [] + for key, frame_dict in data.items(): + left_image_path = os.path.join(self.root, "real_data/testing", frame_dict["left"]) + right_image_path = os.path.join(self.root, "real_data/testing", frame_dict["right"]) + disp_image_path = os.path.join(self.root, "real_data/testing", frame_dict["disp"]) + mask_image_path = os.path.join(self.root, "real_data/testing", frame_dict["mask"]) + + self.image_list += [ [left_image_path, right_image_path] ] + self.disparity_list += [ disp_image_path ] + self.extra_info["mask"] += [ mask_image_path ] + + assert len(self.image_list) == len(self.disparity_list) == len(self.extra_info["mask"]) > 0, \ + [len(self.image_list), len(self.disparity_list), len(self.extra_info["mask"])] + + else: + raise Exception(f"{self.image_set} is not in ['training', 'testing']") + + logging.info(f"Added {len(self.disparity_list) - origin_length} from Fooling3D Mono") + + def _build_video_frames_info(self): + for idx, img_path in enumerate(self.disparity_list): + parts = img_path.split('/') + video_name = parts[-2] + frame_name = parts[-1] + + if video_name not in self.video_frames_info: + self.video_frames_info[video_name] = [] + + self.video_frames_info[video_name].append(idx) + self.video_frames_info = list(self.video_frames_info.values()) + + + + +class Fooling3DBatchSampler(data.Sampler): + def __init__(self, dataset, batch_size): + """ + Args: + dataset (Dataset): The dataset to sample from. 
+ batch_size (int): The size of each batch (how many frames from the same video). + """ + self.dataset = dataset + self.batch_size = batch_size + + def __iter__(self): + """ + This will return indices of frames in a single video folder, ensuring batch contains only frames from that video. + """ + for video_idx in range(len(self.dataset.video_frames_info)): + frames_info = self.dataset.video_frames_info[video_idx] + num_frames = len(frames_info) + frame_idx_list = list(np.arange(num_frames)) + + # # Shuffle the frame indices if shuffle is True + # if self.shuffle: + # np.random.shuffle(frame_idx_list) + + # If frames count is not divisible by batch size, repeat the last frame + if num_frames % self.batch_size != 0: + num_repeat = self.batch_size - (num_frames % self.batch_size) + frame_idx_list += [frame_idx_list[-1]] * num_repeat # Add last frame to fill up batch + + # Yield frames in batches of batch_size + for i in range(0, len(frame_idx_list), self.batch_size): + batch_info = [frames_info[frame_idx] for frame_idx in frame_idx_list[i:i + self.batch_size]] + yield batch_info + + def __len__(self): + """ + The length of the sampler is the number of total batches in all videos. + """ + total_batches = 0 + for frames_info in self.dataset.video_frames_info: + total_batches += len(frames_info) // self.batch_size + (1 if len(frames_info) % self.batch_size != 0 else 0) + return total_batches + + +from torch.utils.data.distributed import DistributedSampler +class DistributedFooling3DBatchSampler(DistributedSampler): + def __init__(self, dataset, batch_size, num_replicas=None, rank=None): + """ + Args: + dataset (Dataset): The dataset to sample from. + batch_size (int): The size of each batch (how many frames from the same video). + num_replicas (int): Total number of processes (GPUs) across all nodes. + rank (int): Rank of the current process (GPU) in the group of workers. + """ + self.dataset = dataset + self.batch_size = batch_size + self.num_replicas = num_replicas if num_replicas is not None else torch.distributed.get_world_size() + self.rank = rank if rank is not None else torch.distributed.get_rank() + + def __iter__(self): + """ + This will return indices of frames in a single video folder, ensuring batch contains only frames from that video. + Distributes the frames across different processes. 
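+        Per video, batches are assigned to ranks in contiguous chunks (num_batches // num_replicas
+        each, with the first 'remainder' ranks receiving one extra batch), so every batch still
+        contains frames from a single video.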
+ """ + for video_idx in range(len(self.dataset.video_frames_info)): + frames_info = self.dataset.video_frames_info[video_idx] + num_frames = len(frames_info) + frame_idx_list = list(np.arange(num_frames)) + + # # Shuffle the frame indices if shuffle is True + # if self.shuffle: + # np.random.shuffle(frame_idx_list) + + # If frames count is not divisible by batch size, repeat the last frame + if num_frames % self.batch_size != 0: + num_repeat = self.batch_size - (num_frames % self.batch_size) + frame_idx_list += [frame_idx_list[-1]] * num_repeat # Add last frame to fill up batch + + # Total number of batches across all replicas + num_batches = len(frame_idx_list) // self.batch_size + (1 if len(frame_idx_list) % self.batch_size != 0 else 0) + + # Divide the dataset into chunks and ensure each rank gets its share + # Find out how many batches each rank should process + chunks_per_rank = num_batches // self.num_replicas + remainder = num_batches % self.num_replicas + start_idx = self.rank * chunks_per_rank + min(self.rank, remainder) + end_idx = (self.rank + 1) * chunks_per_rank + min(self.rank + 1, remainder) + + # Generate the frames indices for the current process's portion of the data + for i in range(start_idx, end_idx): + batch_info = [frames_info[frame_idx] for frame_idx in frame_idx_list[i * self.batch_size:(i + 1) * self.batch_size]] + yield batch_info + + def __len__(self): + """ + The length of the sampler is the total number of batches divided across all processes. + """ + total_batches = 0 + for frames_info in self.dataset.video_frames_info: + total_batches += len(frames_info) // self.batch_size + (1 if len(frames_info) % self.batch_size != 0 else 0) + + # Divide the total batches by the number of processes + return total_batches // self.num_replicas + (1 if total_batches % self.num_replicas > self.rank else 0) + + +def fetch_dataloader(args): + """ Create the data loader for the corresponding trainign set """ + + aug_params = {'crop_size': args.image_size, 'min_scale': args.spatial_scale[0], 'max_scale': args.spatial_scale[1], 'do_flip': False, 'yjitter': not args.noyjitter} + if hasattr(args, "saturation_range") and args.saturation_range is not None: + aug_params["saturation_range"] = args.saturation_range + if hasattr(args, "img_gamma") and args.img_gamma is not None: + aug_params["gamma"] = args.img_gamma + if hasattr(args, "do_flip") and args.do_flip is not None: + aug_params["do_flip"] = args.do_flip + + train_dataset = None + for dataset_name in args.train_datasets: + if dataset_name.startswith("middlebury_"): + new_dataset = Middlebury(aug_params, split=dataset_name.replace('middlebury_',''), args=args) + logging.info(f"Adding {len(new_dataset)} samples from Middlebury") + elif dataset_name == 'sceneflow': + clean_dataset = SceneFlowDatasets(aug_params, dstype='frames_cleanpass', args=args) + final_dataset = SceneFlowDatasets(aug_params, dstype='frames_finalpass', args=args) + new_dataset = (clean_dataset*4) + (final_dataset*4) + logging.info(f"Adding {len(new_dataset)} samples from SceneFlow") + elif 'kitti' in dataset_name: + new_dataset = KITTI(aug_params, split=dataset_name, args=args) + logging.info(f"Adding {len(new_dataset)} samples from KITTI") + elif dataset_name == 'sintel_stereo': + new_dataset = SintelStereo(aug_params, args=args)*140 + logging.info(f"Adding {len(new_dataset)} samples from Sintel Stereo") + elif dataset_name == 'falling_things': + new_dataset = FallingThings(aug_params, args=args)*5 + logging.info(f"Adding {len(new_dataset)} samples from 
FallingThings") + elif dataset_name.startswith('tartan_air'): + new_dataset = TartanAir(aug_params, keywords=dataset_name.split('_')[2:]) + logging.info(f"Adding {len(new_dataset)} samples from Tartain Air") + elif 'nerfstereo' in dataset_name: + new_dataset = NerfStereoDataset(aug_params, args=args, root='./datasets/NerfStereo', txt_root='./datasets/NerfStereo/../') + logging.info(f"Adding {len(new_dataset)} samples from NerfStereoDataset") + elif 'crestereo' in dataset_name: + new_dataset = CREStereoDataset(aug_params, args=args, txt_root='./datasets/CREStereo_dataset/../') + logging.info(f"Adding {len(new_dataset)} samples from CREStereoDataset") + elif dataset_name == 'Trans': + new_dataset = Trans(aug_params, args=args) + logging.info(f"Adding {len(new_dataset)} samples from Trans") + elif dataset_name.lower() == 'fooling3d': + new_dataset = Fooling3DDataset(aug_params, args=args, root='./datasets/Fooling3D') + # print("+"*10, hasattr(args, 'enable_sampler') and args.enable_sampler) + if hasattr(args, 'enable_sampler') and args.enable_sampler: + # sampler = Fooling3DBatchSampler(new_dataset, args.batch_size) + sampler = DistributedFooling3DBatchSampler(new_dataset, args.batch_size) + logging.info(f"Adding {len(new_dataset)} samples from Fooling3DDataset") + # TODO: Add Fooling3D dataset with only one sampler may cause conflict with other datasets + train_dataset = new_dataset if train_dataset is None else train_dataset + new_dataset + + # train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, + # pin_memory=True, shuffle=True, num_workers=int(os.environ.get('SLURM_CPUS_PER_TASK', 6))-2, drop_last=True) + train_loader = get_loader(train_dataset, args) + train_loader.sampler.set_epoch(0) + + logging.info('Training with %d image pairs' % len(train_dataset)) + return train_loader + diff --git a/core/update.py b/core/update.py new file mode 100644 index 0000000000000000000000000000000000000000..ad68412ed66db4e9c6785bbf2a9b87c85453b1fc --- /dev/null +++ b/core/update.py @@ -0,0 +1,204 @@ +import torch +import torch.nn as nn +import torch.nn.init as init +import torch.nn.functional as F +from opt_einsum import contract + +class FlowHead(nn.Module): + def __init__(self, input_dim=128, hidden_dim=256, output_dim=2): + super(FlowHead, self).__init__() + self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1) + self.conv2 = nn.Conv2d(hidden_dim, output_dim, 3, padding=1) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + return self.conv2(self.relu(self.conv1(x))) + +class ConvGRU(nn.Module): + def __init__(self, hidden_dim, input_dim, kernel_size=3): + super(ConvGRU, self).__init__() + self.convz = nn.Conv2d(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size//2) + self.convr = nn.Conv2d(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size//2) + self.convq = nn.Conv2d(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size//2) + + self._initialize_weights() + + def forward(self, h, cz, cr, cq, *x_list): + x = torch.cat(x_list, dim=1) + hx = torch.cat([h, x], dim=1) + + z = torch.sigmoid(self.convz(hx) + cz) + r = torch.sigmoid(self.convr(hx) + cr) + q = torch.tanh(self.convq(torch.cat([r*h, x], dim=1)) + cq) + + h = (1-z) * h + z * q + return h + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + init.kaiming_normal_(m.weight) + if m.bias is not None: + m.bias.data.zero_() + +class SepConvGRU(nn.Module): + def __init__(self, hidden_dim=128, input_dim=192+128): + 
super(SepConvGRU, self).__init__() + self.convz1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2)) + self.convr1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2)) + self.convq1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2)) + + self.convz2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0)) + self.convr2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0)) + self.convq2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0)) + + + def forward(self, h, *x): + # horizontal + x = torch.cat(x, dim=1) + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz1(hx)) + r = torch.sigmoid(self.convr1(hx)) + q = torch.tanh(self.convq1(torch.cat([r*h, x], dim=1))) + h = (1-z) * h + z * q + + # vertical + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz2(hx)) + r = torch.sigmoid(self.convr2(hx)) + q = torch.tanh(self.convq2(torch.cat([r*h, x], dim=1))) + h = (1-z) * h + z * q + + return h + +class BasicMotionEncoder(nn.Module): + def __init__(self, args): + super(BasicMotionEncoder, self).__init__() + self.args = args + + cor_planes = args.corr_levels * (2*args.corr_radius + 1) + + self.convc1 = nn.Conv2d(cor_planes, 64, 1, padding=0) + self.convc2 = nn.Conv2d(64, 64, 3, padding=1) + self.convf1 = nn.Conv2d(2, 64, 7, padding=3) + self.convf2 = nn.Conv2d(64, 64, 3, padding=1) + self.conv = nn.Conv2d(64+64, 128-2, 3, padding=1) + + def forward(self, flow, corr): + cor = F.relu(self.convc1(corr)) + cor = F.relu(self.convc2(cor)) + flo = F.relu(self.convf1(flow)) + flo = F.relu(self.convf2(flo)) + + cor_flo = torch.cat([cor, flo], dim=1) + out = F.relu(self.conv(cor_flo)) + return torch.cat([out, flow], dim=1) + +def pool2x(x): + return F.avg_pool2d(x, 3, stride=2, padding=1) + +def pool4x(x): + return F.avg_pool2d(x, 5, stride=4, padding=1) + +def interp(x, dest): + interp_args = {'mode': 'bilinear', 'align_corners': True} + return F.interpolate(x, dest.shape[2:], **interp_args) + + +class BasicMultiUpdateBlock(nn.Module): + def __init__(self, args, hidden_dims=[]): + super().__init__() + self.args = args + self.encoder = BasicMotionEncoder(args) + encoder_output_dim = 128 + + self.gru08 = ConvGRU(hidden_dims[2], encoder_output_dim + hidden_dims[1] * (args.n_gru_layers > 1)) + self.gru16 = ConvGRU(hidden_dims[1], hidden_dims[0] * (args.n_gru_layers == 3) + hidden_dims[2]) + self.gru32 = ConvGRU(hidden_dims[0], hidden_dims[1]) + self.flow_head = FlowHead(hidden_dims[2], hidden_dim=256, output_dim=2) + factor = 2**self.args.n_downsample + + self.mask = nn.Sequential( + nn.Conv2d(hidden_dims[2], 256, 3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, (factor**2)*9, 1, padding=0)) + + def forward(self, net, inp, corr=None, flow=None, iter08=True, iter16=True, iter32=True, update=True): + + if iter32: + net[2] = self.gru32(net[2], *(inp[2]), pool2x(net[1])) + if iter16: + if self.args.n_gru_layers > 2: + net[1] = self.gru16(net[1], *(inp[1]), pool2x(net[0]), interp(net[2], net[1])) + else: + net[1] = self.gru16(net[1], *(inp[1]), pool2x(net[0])) + if iter08: + motion_features = self.encoder(flow, corr) + if self.args.n_gru_layers > 1: + net[0] = self.gru08(net[0], *(inp[0]), motion_features, interp(net[1], net[0])) + else: + net[0] = self.gru08(net[0], *(inp[0]), motion_features) + + if not update: + return net + + delta_flow = self.flow_head(net[0]) + + # scale mask to balence gradients + mask = .25 * self.mask(net[0]) + return net, mask, delta_flow + + +class 
ManifoldBasicMultiUpdateBlock(nn.Module): + def __init__(self, args, hidden_dims=[]): + super().__init__() + self.args = args + self.encoder = BasicMotionEncoder(args) + + encoder_output_dim = 128 + output_dim = 2 + + self.gru08 = ConvGRU(hidden_dims[2], encoder_output_dim + hidden_dims[1] * (args.n_gru_layers > 1)) + self.gru16 = ConvGRU(hidden_dims[1], hidden_dims[0] * (args.n_gru_layers == 3) + hidden_dims[2]) + self.gru32 = ConvGRU(hidden_dims[0], hidden_dims[1]) + self.flow_head = FlowHead(hidden_dims[2], hidden_dim=256, output_dim=output_dim) + factor = 2**self.args.n_downsample + + self.mask = nn.Sequential( + nn.Conv2d(hidden_dims[2], 256, 3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, (factor**2)*9, 1, padding=0)) + + self.mask2 = nn.Sequential( + nn.Conv2d(hidden_dims[2], 256, 3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, (factor**2)*9, 1, padding=0)) + + def forward(self, net, inp, corr=None, flow=None, iter08=True, iter16=True, iter32=True, update=True): + + if iter32: + net[2] = self.gru32(net[2], *(inp[2]), pool2x(net[1])) + if iter16: + if self.args.n_gru_layers > 2: + net[1] = self.gru16(net[1], *(inp[1]), pool2x(net[0]), interp(net[2], net[1])) + else: + net[1] = self.gru16(net[1], *(inp[1]), pool2x(net[0])) + if iter08: + motion_features = self.encoder(flow, corr) + if self.args.n_gru_layers > 1: + net[0] = self.gru08(net[0], *(inp[0]), motion_features, interp(net[1], net[0])) + else: + net[0] = self.gru08(net[0], *(inp[0]), motion_features) + + if not update: + return net + + delta_flow = self.flow_head(net[0]) + + # scale mask to balence gradients + mask = .25 * self.mask(net[0]) + # return net, mask, delta_flow + + mask_disp = None + mask_disp = .25 * self.mask2(net[0]) + return net, mask, delta_flow, mask_disp diff --git a/core/update_disp.py b/core/update_disp.py new file mode 100644 index 0000000000000000000000000000000000000000..83762a7cf80770740476d6f54660b2ce977b2944 --- /dev/null +++ b/core/update_disp.py @@ -0,0 +1,150 @@ +import torch +import torch.nn as nn +import torch.nn.init as init +import torch.nn.functional as F +from opt_einsum import contract + +class DispHead(nn.Module): + def __init__(self, input_dim=128, hidden_dim=256, output_dim=1): + super(DispHead, self).__init__() + self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1) + self.conv2 = nn.Conv2d(hidden_dim, output_dim, 3, padding=1) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + return self.conv2(self.relu(self.conv1(x))) + +class ConvGRU(nn.Module): + def __init__(self, hidden_dim, input_dim, kernel_size=3): + super(ConvGRU, self).__init__() + self.convz = nn.Conv2d(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size//2) + self.convr = nn.Conv2d(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size//2) + self.convq = nn.Conv2d(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size//2) + + self._initialize_weights() + + def forward(self, h, cz, cr, cq, *x_list): + x = torch.cat(x_list, dim=1) + hx = torch.cat([h, x], dim=1) + + z = torch.sigmoid(self.convz(hx) + cz) + r = torch.sigmoid(self.convr(hx) + cr) + q = torch.tanh(self.convq(torch.cat([r*h, x], dim=1)) + cq) + + h = (1-z) * h + z * q + return h + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + init.kaiming_normal_(m.weight) + if m.bias is not None: + m.bias.data.zero_() + +class SepConvGRU(nn.Module): + def __init__(self, hidden_dim=128, input_dim=192+128): + super(SepConvGRU, self).__init__() + self.convz1 = 
nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2)) + self.convr1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2)) + self.convq1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2)) + + self.convz2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0)) + self.convr2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0)) + self.convq2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0)) + + + def forward(self, h, *x): + # horizontal + x = torch.cat(x, dim=1) + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz1(hx)) + r = torch.sigmoid(self.convr1(hx)) + q = torch.tanh(self.convq1(torch.cat([r*h, x], dim=1))) + h = (1-z) * h + z * q + + # vertical + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz2(hx)) + r = torch.sigmoid(self.convr2(hx)) + q = torch.tanh(self.convq2(torch.cat([r*h, x], dim=1))) + h = (1-z) * h + z * q + + return h + +class BasicShiftEncoder(nn.Module): + def __init__(self, args): + super(BasicShiftEncoder, self).__init__() + self.args = args + + cor_planes = args.corr_levels * (2*args.corr_radius + 1) + + self.convc1 = nn.Conv2d(cor_planes, 64, 1, padding=0) + self.convc2 = nn.Conv2d(64, 64, 3, padding=1) + self.convf1 = nn.Conv2d(1, 64, 7, padding=3) + self.convf2 = nn.Conv2d(64, 64, 3, padding=1) + self.conv = nn.Conv2d(64+64, 128-1, 3, padding=1) + + def forward(self, disp, corr): + cor = F.relu(self.convc1(corr)) + cor = F.relu(self.convc2(cor)) + dis = F.relu(self.convf1(disp)) + dis = F.relu(self.convf2(dis)) + + cor_dis = torch.cat([cor, dis], dim=1) + out = F.relu(self.conv(cor_dis)) + return torch.cat([out, disp], dim=1) + +def pool2x(x): + return F.avg_pool2d(x, 3, stride=2, padding=1) + +def pool4x(x): + return F.avg_pool2d(x, 5, stride=4, padding=1) + +def interp(x, dest): + interp_args = {'mode': 'bilinear', 'align_corners': True} + return F.interpolate(x, dest.shape[2:], **interp_args) + + +class DispBasicMultiUpdateBlock(nn.Module): + def __init__(self, args, hidden_dims=[]): + super(DispBasicMultiUpdateBlock, self).__init__() + self.args = args + self.encoder = BasicShiftEncoder(args) + encoder_output_dim = 128 + + self.gru08 = ConvGRU(hidden_dims[2], encoder_output_dim + hidden_dims[1] * (args.n_gru_layers > 1)) + self.gru16 = ConvGRU(hidden_dims[1], hidden_dims[0] * (args.n_gru_layers == 3) + hidden_dims[2]) + self.gru32 = ConvGRU(hidden_dims[0], hidden_dims[1]) + self.disp_head = DispHead(hidden_dims[2], hidden_dim=256, output_dim=1) + factor = 2**self.args.n_downsample + + self.mask = nn.Sequential( + nn.Conv2d(hidden_dims[2], 256, 3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, (factor**2)*9, 1, padding=0)) + + def forward(self, net, inp, corr=None, disp=None, iter08=True, iter16=True, iter32=True, update=True): + + if iter32: + net[2] = self.gru32(net[2], *(inp[2]), pool2x(net[1])) + if iter16: + if self.args.n_gru_layers > 2: + net[1] = self.gru16(net[1], *(inp[1]), pool2x(net[0]), interp(net[2], net[1])) + else: + net[1] = self.gru16(net[1], *(inp[1]), pool2x(net[0])) + if iter08: + motion_features = self.encoder(disp, corr) + if self.args.n_gru_layers > 1: + net[0] = self.gru08(net[0], *(inp[0]), motion_features, interp(net[1], net[0])) + else: + net[0] = self.gru08(net[0], *(inp[0]), motion_features) + + if not update: + return net + + delta_disp = self.disp_head(net[0]) + + # scale mask to balence gradients + mask = .25 * self.mask(net[0]) + return net, mask, delta_disp + diff --git a/core/update_noctx.py b/core/update_noctx.py 
new file mode 100644 index 0000000000000000000000000000000000000000..faa3868ded67948be9a017f12583b739059a5b7d --- /dev/null +++ b/core/update_noctx.py @@ -0,0 +1,150 @@ +import torch +import torch.nn as nn +import torch.nn.init as init +import torch.nn.functional as F +from opt_einsum import contract + +class DispHead(nn.Module): + def __init__(self, input_dim=128, hidden_dim=256, output_dim=1): + super(DispHead, self).__init__() + self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1) + self.conv2 = nn.Conv2d(hidden_dim, output_dim, 3, padding=1) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + return self.conv2(self.relu(self.conv1(x))) + +class ConvGRU_NoCTX(nn.Module): + def __init__(self, hidden_dim, input_dim, kernel_size=3): + super(ConvGRU_NoCTX, self).__init__() + self.convz = nn.Conv2d(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size//2) + self.convr = nn.Conv2d(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size//2) + self.convq = nn.Conv2d(hidden_dim+input_dim, hidden_dim, kernel_size, padding=kernel_size//2) + + self._initialize_weights() + + def forward(self, h, *x_list): + x = torch.cat(x_list, dim=1) + hx = torch.cat([h, x], dim=1) + + z = torch.sigmoid(self.convz(hx) ) + r = torch.sigmoid(self.convr(hx) ) + q = torch.tanh(self.convq(torch.cat([r*h, x], dim=1)) ) + + h = (1-z) * h + z * q + return h + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + init.kaiming_normal_(m.weight) + if m.bias is not None: + m.bias.data.zero_() + +class SepConvGRU(nn.Module): + def __init__(self, hidden_dim=128, input_dim=192+128): + super(SepConvGRU, self).__init__() + self.convz1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2)) + self.convr1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2)) + self.convq1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2)) + + self.convz2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0)) + self.convr2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0)) + self.convq2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0)) + + + def forward(self, h, *x): + # horizontal + x = torch.cat(x, dim=1) + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz1(hx)) + r = torch.sigmoid(self.convr1(hx)) + q = torch.tanh(self.convq1(torch.cat([r*h, x], dim=1))) + h = (1-z) * h + z * q + + # vertical + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz2(hx)) + r = torch.sigmoid(self.convr2(hx)) + q = torch.tanh(self.convq2(torch.cat([r*h, x], dim=1))) + h = (1-z) * h + z * q + + return h + +class BasicShiftEncoder(nn.Module): + def __init__(self, args): + super(BasicShiftEncoder, self).__init__() + self.args = args + + cor_planes = args.corr_levels * (2*args.corr_radius + 1) + + self.convc1 = nn.Conv2d(cor_planes, 64, 1, padding=0) + self.convc2 = nn.Conv2d(64, 64, 3, padding=1) + self.convf1 = nn.Conv2d(1, 64, 7, padding=3) + self.convf2 = nn.Conv2d(64, 64, 3, padding=1) + self.conv = nn.Conv2d(64+64, 128-1, 3, padding=1) + + def forward(self, disp, corr): + cor = F.relu(self.convc1(corr)) + cor = F.relu(self.convc2(cor)) + dis = F.relu(self.convf1(disp)) + dis = F.relu(self.convf2(dis)) + + cor_dis = torch.cat([cor, dis], dim=1) + out = F.relu(self.conv(cor_dis)) + return torch.cat([out, disp], dim=1) + +def pool2x(x): + return F.avg_pool2d(x, 3, stride=2, padding=1) + +def pool4x(x): + return F.avg_pool2d(x, 5, stride=4, padding=1) + +def interp(x, dest): + 
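+    # bilinearly resizes x to dest's spatial resolution (align_corners=True);
+    # used to lift the coarser GRU state up to the next finer level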
interp_args = {'mode': 'bilinear', 'align_corners': True} + return F.interpolate(x, dest.shape[2:], **interp_args) + + +class DispBasicMultiUpdateBlock_NoCTX(nn.Module): + def __init__(self, args, hidden_dims=[]): + super(DispBasicMultiUpdateBlock_NoCTX, self).__init__() + self.args = args + self.encoder = BasicShiftEncoder(args) + encoder_output_dim = 128 + + self.gru08 = ConvGRU_NoCTX(hidden_dims[2], encoder_output_dim + hidden_dims[1] * (args.n_gru_layers > 1)) + self.gru16 = ConvGRU_NoCTX(hidden_dims[1], hidden_dims[0] * (args.n_gru_layers == 3) + hidden_dims[2]) + self.gru32 = ConvGRU_NoCTX(hidden_dims[0], hidden_dims[1]) + self.disp_head = DispHead(hidden_dims[2], hidden_dim=256, output_dim=1) + factor = 2**self.args.n_downsample + + self.mask = nn.Sequential( + nn.Conv2d(hidden_dims[2], 256, 3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, (factor**2)*9, 1, padding=0)) + + def forward(self, net, corr=None, disp=None, iter08=True, iter16=True, iter32=True, update=True): + + if iter32: + net[2] = self.gru32(net[2], pool2x(net[1])) + if iter16: + if self.args.n_gru_layers > 2: + net[1] = self.gru16(net[1], pool2x(net[0]), interp(net[2], net[1])) + else: + net[1] = self.gru16(net[1], pool2x(net[0])) + if iter08: + motion_features = self.encoder(disp, corr) + if self.args.n_gru_layers > 1: + net[0] = self.gru08(net[0], motion_features, interp(net[1], net[0])) + else: + net[0] = self.gru08(net[0], motion_features) + + if not update: + return net + + delta_disp = self.disp_head(net[0]) + + # scale mask to balence gradients + mask = .25 * self.mask(net[0]) + return net, mask, delta_disp + diff --git a/core/utils/__init__.py b/core/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/core/utils/__pycache__/__init__.cpython-310.pyc b/core/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d64f897f19853e43f5ee69a5856a588bfaadebbb Binary files /dev/null and b/core/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/core/utils/__pycache__/frame_utils.cpython-310.pyc b/core/utils/__pycache__/frame_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff1cbddcae06100c125013854de61deed74a8515 Binary files /dev/null and b/core/utils/__pycache__/frame_utils.cpython-310.pyc differ diff --git a/core/utils/__pycache__/plane.cpython-310.pyc b/core/utils/__pycache__/plane.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b6f8fce99b015cc1c325d59a246f8d16e673c44 Binary files /dev/null and b/core/utils/__pycache__/plane.cpython-310.pyc differ diff --git a/core/utils/__pycache__/utils.cpython-310.pyc b/core/utils/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..844bc9edbabf0510f64addc2a40e466883b9ecc3 Binary files /dev/null and b/core/utils/__pycache__/utils.cpython-310.pyc differ diff --git a/core/utils/__pycache__/vis.cpython-310.pyc b/core/utils/__pycache__/vis.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..593250693e87f19c7a17a5e9575325c5ff9bdf11 Binary files /dev/null and b/core/utils/__pycache__/vis.cpython-310.pyc differ diff --git a/core/utils/augmentor.py b/core/utils/augmentor.py new file mode 100644 index 0000000000000000000000000000000000000000..b5efcf029b2c690988a80615c699b3167831c10d --- /dev/null +++ b/core/utils/augmentor.py @@ -0,0 +1,355 @@ +import numpy as np 
+import random +import warnings +import os +import time +from glob import glob +from skimage import color, io +from PIL import Image + +import cv2 +cv2.setNumThreads(0) +cv2.ocl.setUseOpenCL(False) + +import torch +from torchvision.transforms import ColorJitter, functional, Compose +import torch.nn.functional as F + +def get_middlebury_images(): + root = "datasets/Middlebury/MiddEval3" + with open(os.path.join(root, "official_train.txt"), 'r') as f: + lines = f.read().splitlines() + return sorted([os.path.join(root, 'trainingQ', f'{name}/im0.png') for name in lines]) + +def get_eth3d_images(): + return sorted(glob('datasets/ETH3D/two_view_training/*/im0.png')) + +def get_kitti_images(): + return sorted(glob('datasets/KITTI/training/image_2/*_10.png')) + +def transfer_color(image, style_mean, style_stddev): + reference_image_lab = color.rgb2lab(image) + reference_stddev = np.std(reference_image_lab, axis=(0,1), keepdims=True)# + 1 + reference_mean = np.mean(reference_image_lab, axis=(0,1), keepdims=True) + + reference_image_lab = reference_image_lab - reference_mean + lamb = style_stddev/reference_stddev + style_image_lab = lamb * reference_image_lab + output_image_lab = style_image_lab + style_mean + l, a, b = np.split(output_image_lab, 3, axis=2) + l = l.clip(0, 100) + output_image_lab = np.concatenate((l,a,b), axis=2) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + output_image_rgb = color.lab2rgb(output_image_lab) * 255 + return output_image_rgb + +class AdjustGamma(object): + + def __init__(self, gamma_min, gamma_max, gain_min=1.0, gain_max=1.0): + self.gamma_min, self.gamma_max, self.gain_min, self.gain_max = gamma_min, gamma_max, gain_min, gain_max + + def __call__(self, sample): + gain = random.uniform(self.gain_min, self.gain_max) + gamma = random.uniform(self.gamma_min, self.gamma_max) + return functional.adjust_gamma(sample, gamma, gain) + + def __repr__(self): + return f"Adjust Gamma {self.gamma_min}, ({self.gamma_max}) and Gain ({self.gain_min}, {self.gain_max})" + +class FlowAugmentor: + def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=True, yjitter=False, saturation_range=[0.6,1.4], gamma=[1,1,1,1]): + + # spatial augmentation params + self.crop_size = crop_size + self.min_scale = min_scale + self.max_scale = max_scale + self.spatial_aug_prob = 1.0 + self.stretch_prob = 0.8 + self.max_stretch = 0.2 + + # flip augmentation params + self.yjitter = yjitter + self.do_flip = do_flip + self.h_flip_prob = 0.5 + self.v_flip_prob = 0.1 + + # photometric augmentation params + self.photo_aug = Compose([ColorJitter(brightness=0.4, contrast=0.4, saturation=saturation_range, hue=0.5/3.14), AdjustGamma(*gamma)]) + self.asymmetric_color_aug_prob = 0.2 + self.eraser_aug_prob = 0.5 + + def color_transform(self, img1, img2): + """ Photometric augmentation """ + + # asymmetric + if np.random.rand() < self.asymmetric_color_aug_prob: + img1 = np.array(self.photo_aug(Image.fromarray(img1)), dtype=np.uint8) + img2 = np.array(self.photo_aug(Image.fromarray(img2)), dtype=np.uint8) + + # symmetric + else: + image_stack = np.concatenate([img1, img2], axis=0) + image_stack = np.array(self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8) + img1, img2 = np.split(image_stack, 2, axis=0) + + return img1, img2 + + def eraser_transform(self, img1, img2, bounds=[50, 100]): + """ Occlusion augmentation """ + + ht, wd = img1.shape[:2] + if np.random.rand() < self.eraser_aug_prob: + mean_color = np.mean(img2.reshape(-1, 3), axis=0) + for _ in 
range(np.random.randint(1, 3)): + x0 = np.random.randint(0, wd) + y0 = np.random.randint(0, ht) + dx = np.random.randint(bounds[0], bounds[1]) + dy = np.random.randint(bounds[0], bounds[1]) + img2[y0:y0+dy, x0:x0+dx, :] = mean_color + + return img1, img2 + + def spatial_transform(self, img1, img2, flow, intrinsic): + # randomly sample scale + ht, wd = img1.shape[:2] + min_scale = np.maximum( + (self.crop_size[0] + 8) / float(ht), + (self.crop_size[1] + 8) / float(wd)) + + scale = 2 ** np.random.uniform(self.min_scale, self.max_scale) + scale_x = scale + scale_y = scale + if np.random.rand() < self.stretch_prob: + scale_x *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) + scale_y *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) + + scale_x = np.clip(scale_x, min_scale, None) + scale_y = np.clip(scale_y, min_scale, None) + + if np.random.rand() < self.spatial_aug_prob: + # rescale the images + img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + flow = cv2.resize(flow, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + flow = flow * [scale_x, scale_y] + if intrinsic is not None: + intrinsic = [intrinsic[0]*scale_x, intrinsic[1]*scale_y, intrinsic[2]*scale_x, intrinsic[3]*scale_y] + + if self.do_flip: + if np.random.rand() < self.h_flip_prob and self.do_flip == 'hf': # h-flip + img1 = img1[:, ::-1] + img2 = img2[:, ::-1] + flow = flow[:, ::-1] * [-1.0, 1.0] + + if np.random.rand() < self.h_flip_prob and self.do_flip == 'h': # h-flip for stereo + tmp = img1[:, ::-1] + img1 = img2[:, ::-1] + img2 = tmp + + if np.random.rand() < self.v_flip_prob and self.do_flip == 'v': # v-flip + img1 = img1[::-1, :] + img2 = img2[::-1, :] + flow = flow[::-1, :] * [1.0, -1.0] + + if self.yjitter: + y0 = np.random.randint(2, img1.shape[0] - self.crop_size[0] - 2) + x0 = np.random.randint(2, img1.shape[1] - self.crop_size[1] - 2) + + y1 = y0 + np.random.randint(-2, 2 + 1) + img1 = img1[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + img2 = img2[y1:y1+self.crop_size[0], x0:x0+self.crop_size[1]] + flow = flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + + else: + y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0]) + x0 = np.random.randint(0, img1.shape[1] - self.crop_size[1]) + + img1 = img1[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + img2 = img2[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + flow = flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + if intrinsic is not None: + intrinsic[2] = intrinsic[2] - x0 + intrinsic[3] = intrinsic[3] - y0 + + return img1, img2, flow, intrinsic + + + def __call__(self, img1, img2, flow, intrinsic=None): + img1, img2 = self.color_transform(img1, img2) + img1, img2 = self.eraser_transform(img1, img2) + img1, img2, flow, intrinsic = self.spatial_transform(img1, img2, flow, intrinsic) + + img1 = np.ascontiguousarray(img1) + img2 = np.ascontiguousarray(img2) + flow = np.ascontiguousarray(flow) + if intrinsic is not None: + intrinsic= np.array(intrinsic) + + return img1, img2, flow, intrinsic + +class SparseFlowAugmentor: + def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=False, yjitter=False, saturation_range=[0.7,1.3], gamma=[1,1,1,1]): + # spatial augmentation params + self.crop_size = crop_size + self.min_scale = min_scale + self.max_scale = max_scale + self.spatial_aug_prob = 0.8 + self.stretch_prob = 0.8 + self.max_stretch = 0.2 + + # flip augmentation params 
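+        # do_flip selects the strategy applied in spatial_transform below: False disables
+        # flipping, 'h' swaps the stereo pair while mirroring both images horizontally,
+        # 'hf' mirrors both images and negates the horizontal flow component, and 'v'
+        # flips vertically and negates the vertical component.
+        # Construction sketch (hypothetical crop size, for illustration only):
+        #   aug = SparseFlowAugmentor(crop_size=(320, 736), do_flip='h')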
+ self.do_flip = do_flip + self.h_flip_prob = 0.5 + self.v_flip_prob = 0.1 + + # photometric augmentation params + self.photo_aug = Compose([ColorJitter(brightness=0.3, contrast=0.3, saturation=saturation_range, hue=0.3/3.14), AdjustGamma(*gamma)]) + self.asymmetric_color_aug_prob = 0.2 + self.eraser_aug_prob = 0.5 + + def color_transform(self, img1, img2): + image_stack = np.concatenate([img1, img2], axis=0) + image_stack = np.array(self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8) + img1, img2 = np.split(image_stack, 2, axis=0) + return img1, img2 + + def eraser_transform(self, img1, img2): + ht, wd = img1.shape[:2] + if np.random.rand() < self.eraser_aug_prob: + mean_color = np.mean(img2.reshape(-1, 3), axis=0) + for _ in range(np.random.randint(1, 3)): + x0 = np.random.randint(0, wd) + y0 = np.random.randint(0, ht) + dx = np.random.randint(50, 100) + dy = np.random.randint(50, 100) + img2[y0:y0+dy, x0:x0+dx, :] = mean_color + + return img1, img2 + + def resize_sparse_flow_map(self, flow, valid, fx=1.0, fy=1.0): + ht, wd = flow.shape[:2] + coords = np.meshgrid(np.arange(wd), np.arange(ht)) + coords = np.stack(coords, axis=-1) + + coords = coords.reshape(-1, 2).astype(np.float32) + flow = flow.reshape(-1, 2).astype(np.float32) + valid = valid.reshape(-1).astype(np.float32) + + coords0 = coords[valid>=1] + flow0 = flow[valid>=1] + + ht1 = int(round(ht * fy)) + wd1 = int(round(wd * fx)) + + coords1 = coords0 * [fx, fy] + flow1 = flow0 * [fx, fy] + + xx = np.round(coords1[:,0]).astype(np.int32) + yy = np.round(coords1[:,1]).astype(np.int32) + + v = (xx > 0) & (xx < wd1) & (yy > 0) & (yy < ht1) + xx = xx[v] + yy = yy[v] + flow1 = flow1[v] + + flow_img = np.zeros([ht1, wd1, 2], dtype=np.float32) + valid_img = np.zeros([ht1, wd1], dtype=np.int32) + + flow_img[yy, xx] = flow1 + valid_img[yy, xx] = 1 + + return flow_img, valid_img + + def pad_images(self, img1, img2, flow, valid, intrinsic): + ch, cw = self.crop_size + padded_data = [] + + for data in [img1, img2, flow, valid]: + h, w = data.shape[:2] + pad_h = max(0, ch - h) + pad_w = max(0, cw - w) + + if pad_h > 0 or pad_w > 0: + pad_width = ((0, pad_h), (0, pad_w)) + ((0, 0),) * (data.ndim - 2) + padded_data.append(np.pad(data, pad_width, mode='constant', constant_values=0)) + else: + padded_data.append(data) + + return padded_data, intrinsic + + def spatial_transform(self, img1, img2, flow, valid, intrinsic): + # randomly sample scale + + ht, wd = img1.shape[:2] + min_scale = np.maximum( + (self.crop_size[0] + 1) / float(ht), + (self.crop_size[1] + 1) / float(wd)) + + scale = 2 ** np.random.uniform(self.min_scale, self.max_scale) + scale_x = np.clip(scale, min_scale, None) + scale_y = np.clip(scale, min_scale, None) + + if np.random.rand() < self.spatial_aug_prob: + # rescale the images + img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + flow, valid = self.resize_sparse_flow_map(flow, valid, fx=scale_x, fy=scale_y) + if intrinsic is not None: + intrinsic = [intrinsic[0]*scale_x, intrinsic[1]*scale_y, intrinsic[2]*scale_x, intrinsic[3]*scale_y] + + if self.do_flip: + if np.random.rand() < self.h_flip_prob and self.do_flip == 'hf': # h-flip + img1 = img1[:, ::-1] + img2 = img2[:, ::-1] + flow = flow[:, ::-1] * [-1.0, 1.0] + + if np.random.rand() < self.h_flip_prob and self.do_flip == 'h': # h-flip for stereo + tmp = img1[:, ::-1] + img1 = img2[:, ::-1] + img2 = tmp + + if np.random.rand() < self.v_flip_prob 
and self.do_flip == 'v': # v-flip + img1 = img1[::-1, :] + img2 = img2[::-1, :] + flow = flow[::-1, :] * [1.0, -1.0] + + margin_y = 20 + margin_x = 50 + + img1, img2, flow, valid, intrinsic = self.pad_images(img1, img2, flow, valid, intrinsic) + # img1_raw_shape = img1.shape + # valid_raw_shape = valid.shape + + y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0] + margin_y) + x0 = np.random.randint(-margin_x, img1.shape[1] - self.crop_size[1] + margin_x) + + y0 = np.clip(y0, 0, img1.shape[0] - self.crop_size[0]) + x0 = np.clip(x0, 0, img1.shape[1] - self.crop_size[1]) + + img1 = img1[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + img2 = img2[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + flow = flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + valid = valid[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + + if intrinsic is not None: + intrinsic[2] = intrinsic[2] - x0 + intrinsic[3] = intrinsic[3] - y0 + + # print("-"*10, "SparseFlowAugmentor: ", self.crop_size, [x0,y0], img1.shape, img1_raw_shape, valid.shape, valid_raw_shape) + return img1, img2, flow, valid, intrinsic + + + def __call__(self, img1, img2, flow, valid, intrinsic=None): + img1, img2 = self.color_transform(img1, img2) + img1, img2 = self.eraser_transform(img1, img2) + img1, img2, flow, valid, intrinsic = self.spatial_transform(img1, img2, flow, valid, intrinsic) + + img1 = np.ascontiguousarray(img1) + img2 = np.ascontiguousarray(img2) + flow = np.ascontiguousarray(flow) + valid = np.ascontiguousarray(valid) + if intrinsic is not None: + intrinsic= np.array(intrinsic) + + return img1, img2, flow, valid, intrinsic diff --git a/core/utils/ddp.py b/core/utils/ddp.py new file mode 100644 index 0000000000000000000000000000000000000000..0c7ad281ecbfb52eb297bc72f40294f73155b2a4 --- /dev/null +++ b/core/utils/ddp.py @@ -0,0 +1,182 @@ +# ============================================================================== +# Copyright (c) 2022 The PersFormer Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import os +import random +import logging +import subprocess +import numpy as np + +logging.basicConfig(level=logging.INFO, + format='%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',) + +import torch +import torch.nn as nn +import torch.distributed as dist +import torch.multiprocessing as mp +from torch.nn import DataParallel as DP +from torch.utils.data import Dataset, DataLoader +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data.distributed import DistributedSampler as DS + +from core.raft_stereo import RAFTStereo +from core.raft_stereo_disp import RAFTStereoDisp +from core.raft_stereo_mast3r import RAFTStereoMast3r +from core.raft_stereo_depthany import RAFTStereoDepthAny +from core.raft_stereo_noctx import RAFTStereoNoCTX +from core.raft_stereo_depthfusion import RAFTStereoDepthFusion +from core.raft_stereo_depthbeta import RAFTStereoDepthBeta +from core.raft_stereo_depthbeta_nolbp import RAFTStereoDepthBetaNoLBP +from core.raft_stereo_depthmatch import RAFTStereoDepthMatch +from core.raft_stereo_depthbeta_refine import RAFTStereoDepthBetaRefine +from core.raft_stereo_depth_postfusion import RAFTStereoDepthPostFusion +from core.raft_stereo_metric3d import RAFTStereoMetric3D + + +def setup_distributed(args): + args.rank = int(os.getenv("RANK")) + args.local_rank = int(os.getenv("LOCAL_RANK")) + args.world_size = int(os.getenv("WORLD_SIZE")) + # print("-"*10, "local_rank: {}, world_size:{}".format(args.local_rank, args.world_size), + # " - {}, {}".format(dist.get_rank(), dist.get_world_size())) # they result in the same value + dist.init_process_group(backend='nccl') + torch.cuda.set_device(args.local_rank) + torch.set_printoptions(precision=10) + +def ddp_init(args): + if 'WORLD_SIZE' in os.environ: + args.distributed = int(os.environ['WORLD_SIZE']) >= 1 + + if args.distributed: + setup_distributed(args) + + # deterministic + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + torch.manual_seed(args.local_rank) + np.random.seed(args.local_rank) + random.seed(args.local_rank) + + print("complete initialization of local_rank:{}".format(args.local_rank)) + +def ddp_close(): + dist.destroy_process_group() + +def to_python_float(t): + if hasattr(t, 'item'): + return t.item() + else: + return t[0] + +def reduce_tensor(tensor, world_size): + rt = tensor.clone() + dist.all_reduce(rt, op=dist.ReduceOp.SUM) + rt /= world_size + return rt + +def reduce_tensors(*tensors, world_size): + return [reduce_tensor(tensor, world_size) for tensor in tensors] + + +def get_loader(dataset, args): + """ + create dataset from ground-truth + return a batch sampler based ont the dataset + """ + if args.distributed: + if args.local_rank == 0: + print('use distributed sampler') + data_sampler = DS(dataset, shuffle=True, drop_last=True) + data_loader = DataLoader(dataset, + batch_size=args.batch_size, + sampler=data_sampler, + num_workers=args.num_workers, + pin_memory=True, + persistent_workers=True) + else: + if args.local_rank == 0: + print("use default sampler") + # data_sampler = torch.utils.data.sampler.SubsetRandomSampler(sample_idx) + # data_loader = DataLoader(transformed_dataset, + # batch_size=args.batch_size, sampler=data_sampler, + # num_workers=args.nworkers, pin_memory=True, + # persistent_workers=True, + # worker_init_fn=seed_worker, + # generator=g) + train_loader = DataLoader(train_dataset, + batch_size=args.batch_size, + 
pin_memory=True, shuffle=True, + num_workers=args.num_workers, + drop_last=True) + return data_loader + +NODE_RANK = os.getenv('NODE_RANK', default=0) +LOCAL_RANK = os.getenv("LOCAL_RANK", default=0) + +def get_model_ddp(args): + if args.model_name.lower() == "raftstereo": + model = nn.SyncBatchNorm.convert_sync_batchnorm(RAFTStereo(args)) + elif args.model_name.lower() == "raftstereodisp": + model = nn.SyncBatchNorm.convert_sync_batchnorm(RAFTStereoDisp(args)) + elif args.model_name.lower() == "raftstereomast3r": + model = nn.SyncBatchNorm.convert_sync_batchnorm(RAFTStereoMast3r(args)) + elif args.model_name.lower() == "raftstereodepthany": + model = nn.SyncBatchNorm.convert_sync_batchnorm(RAFTStereoDepthAny(args)) + elif args.model_name.lower() == "raftstereodepthfusion": + model = nn.SyncBatchNorm.convert_sync_batchnorm(RAFTStereoDepthFusion(args)) + elif args.model_name.lower() == "RAFTStereoDepthBeta".lower(): + model = nn.SyncBatchNorm.convert_sync_batchnorm(RAFTStereoDepthBeta(args)) + elif args.model_name.lower() == "RAFTStereoDepthBetaNoLBP".lower(): + model = nn.SyncBatchNorm.convert_sync_batchnorm(RAFTStereoDepthBetaNoLBP(args)) + elif args.model_name.lower() == "RAFTStereoDepthMatch".lower(): + model = nn.SyncBatchNorm.convert_sync_batchnorm(RAFTStereoDepthMatch(args)) + elif args.model_name.lower() == "RAFTStereoDepthBetaRefine".lower(): + model = nn.SyncBatchNorm.convert_sync_batchnorm(RAFTStereoDepthBetaRefine(args)) + elif args.model_name.lower() == "RAFTStereoDepthPostFusion".lower(): + model = nn.SyncBatchNorm.convert_sync_batchnorm(RAFTStereoDepthPostFusion(args)) + elif args.model_name.lower() == "RAFTStereoMetric3D".lower(): + model = nn.SyncBatchNorm.convert_sync_batchnorm(RAFTStereoMetric3D(args)) + else : + raise Exception("No such model: {}".format(args.model_name)) + + device = torch.device("cuda", args.local_rank) + model = model.to(device) + + if args.restore_ckpt is not None: + assert args.restore_ckpt.endswith(".pth") or args.restore_ckpt.endswith(".tar") + if args.local_rank==0 : + logging.info("Loading checkpoint from {} ...".format(args.restore_ckpt)) + checkpoint = torch.load(args.restore_ckpt) + new_state_dict = {} + for key, value in checkpoint.items(): + new_key = key.replace('module.', '') # 去掉 'module.' 前缀 + # if key.find("refinement.conf_estimate") != -1: + # continue + new_state_dict[new_key] = value + # model.load_state_dict(new_state_dict, strict=True) + model.load_state_dict(new_state_dict, strict=False) + if args.local_rank==0 : + logging.info(f"Done loading checkpoint") + + dist.barrier() + + # DDP setting + if args.distributed: + model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, + find_unused_parameters=True) + else: + model = DP(RAFTStereo(args)) + return model diff --git a/core/utils/frame_utils.py b/core/utils/frame_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a51dabc0d8e17082848270f94e83126b271014a0 --- /dev/null +++ b/core/utils/frame_utils.py @@ -0,0 +1,261 @@ +import os +import numpy as np +from PIL import Image +from os.path import * +import re +import json +import imageio +import cv2 +cv2.setNumThreads(0) +cv2.ocl.setUseOpenCL(False) + +TAG_CHAR = np.array([202021.25], np.float32) + +def readFlow(fn): + """ Read .flo file in Middlebury format""" + # Code adapted from: + # http://stackoverflow.com/questions/28013200/reading-middlebury-flow-files-with-python-bytes-array-numpy + + # WARNING: this will work on little-endian architectures (eg Intel x86) only! 
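+    # Expected .flo layout (mirrored by writeFlow below): a float32 magic value of
+    # 202021.25, then int32 width and int32 height, then width*height*2 float32
+    # values with u and v interleaved per pixel; the result is returned as (h, w, 2).
+    # Read sketch (hypothetical path, for illustration only):
+    #   flow = readFlow('datasets/example/0000.flo')   # -> np.ndarray of shape (H, W, 2)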
+ # print 'fn = %s'%(fn) + with open(fn, 'rb') as f: + magic = np.fromfile(f, np.float32, count=1) + if 202021.25 != magic: + print('Magic number incorrect. Invalid .flo file') + return None + else: + w = np.fromfile(f, np.int32, count=1) + h = np.fromfile(f, np.int32, count=1) + # print 'Reading %d x %d flo file\n' % (w, h) + data = np.fromfile(f, np.float32, count=2*int(w)*int(h)) + # Reshape data into 3D array (columns, rows, bands) + # The reshape here is for visualization, the original code is (w,h,2) + return np.resize(data, (int(h), int(w), 2)) + +def readPFM(file): + file = open(file, 'rb') + + color = None + width = None + height = None + scale = None + endian = None + + header = file.readline().rstrip() + if header == b'PF': + color = True + elif header == b'Pf': + color = False + else: + raise Exception('Not a PFM file.') + + dim_match = re.match(rb'^(\d+)\s(\d+)\s$', file.readline()) + if dim_match: + width, height = map(int, dim_match.groups()) + else: + raise Exception('Malformed PFM header.') + + scale = float(file.readline().rstrip()) + if scale < 0: # little-endian + endian = '<' + scale = -scale + else: + endian = '>' # big-endian + + data = np.fromfile(file, endian + 'f') + shape = (height, width, 3) if color else (height, width) + + data = np.reshape(data, shape) + data = np.flipud(data) + return data + +def writePFM(file, array): + import os + assert type(file) is str and type(array) is np.ndarray and \ + os.path.splitext(file)[1] == ".pfm" + with open(file, 'wb') as f: + H, W = array.shape + headers = ["Pf\n", f"{W} {H}\n", "-1\n"] + for header in headers: + f.write(str.encode(header)) + array = np.flip(array, axis=0).astype(np.float32) + f.write(array.tobytes()) + + + +def writeFlow(filename,uv,v=None): + """ Write optical flow to file. + + If v is None, uv is assumed to contain both u and v channels, + stacked in depth. + Original code by Deqing Sun, adapted from Daniel Scharstein. 
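+
+    Args:
+        filename: output path of the .flo file.
+        uv: HxWx2 flow array, or the HxW u-component when v is given separately.
+        v: optional HxW v-component; must match the shape of the u-component.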
+ """ + nBands = 2 + + if v is None: + assert(uv.ndim == 3) + assert(uv.shape[2] == 2) + u = uv[:,:,0] + v = uv[:,:,1] + else: + u = uv + + assert(u.shape == v.shape) + height,width = u.shape + f = open(filename,'wb') + # write the header + f.write(TAG_CHAR) + np.array(width).astype(np.int32).tofile(f) + np.array(height).astype(np.int32).tofile(f) + # arrange into matrix form + tmp = np.zeros((height, width*nBands)) + tmp[:,np.arange(width)*2] = u + tmp[:,np.arange(width)*2 + 1] = v + tmp.astype(np.float32).tofile(f) + f.close() + + +def readFlowKITTI(filename): + flow = cv2.imread(filename, cv2.IMREAD_ANYDEPTH|cv2.IMREAD_COLOR) + flow = flow[:,:,::-1].astype(np.float32) + flow, valid = flow[:, :, :2], flow[:, :, 2] + flow = (flow - 2**15) / 64.0 + return flow, valid + +def readDispKITTI(filename): + disp = cv2.imread(filename, cv2.IMREAD_ANYDEPTH) / 256.0 + valid = disp > 0.0 + return disp, valid + +def writeDispKITTI(filename, disp): + disp = np.round(disp * 256).astype(np.uint16) + # skimage.io.imsave(filename, disp) + cv2.imwrite(filename, disp) + +def readDispCRES(filename): + try: + disp = cv2.imread(filename, cv2.IMREAD_ANYDEPTH).astype(np.float32) / 32.0 + valid = disp > 0.0 + return disp, valid + except Exception as err: + raise(Exception(err, "Something wrong with {}".format(filename), os.getcwd())) + +def writeDispCRES(filename, disp): + disp = np.round(disp * 32).astype(np.uint16) + # skimage.io.imsave(filename, disp) + cv2.imwrite(filename, disp) + +def readDispNerfS(filename): + disp = cv2.imread(filename, cv2.IMREAD_ANYDEPTH).astype(np.float32) / 64.0 + + match = re.search(r"(.*?/Q/)", filename) + if match: + prefix = match.group(1) # prefix + suffix = os.path.basename(filename) # file name + # AO path, aka confidence + ao_path = f"{prefix}AO/{suffix}" + # print("AO图路径:", ao_path) + else: + raise Exception("corrupted path for NerfStereo: {}".format(filename)) + valid = cv2.imread(ao_path, cv2.IMREAD_ANYDEPTH).astype(np.float32) / 65535 + return disp, valid + +def writeDispNerfS(filename, disp): + disp = np.round(disp * 64).astype(np.uint16) + # skimage.io.imsave(filename, disp) + cv2.imwrite(filename, disp) + +def readDispBooster(file_name): + disp = np.load(file_name, encoding='bytes', allow_pickle=True) + # mask_00 = os.path.join(os.path.split(file_name)[0], 'mask_00.png') + mask_cat_path = os.path.join(os.path.split(file_name)[0], 'mask_cat.png') + mask_cat = cv2.imread(mask_cat_path, cv2.IMREAD_ANYDEPTH).astype(np.float32) + valid = mask_cat + return disp, valid + +def writeDispBooster(filename, disp): + # disp = np.round(disp).astype(np.uint16) + # # skimage.io.imsave(filename, disp) + # filename = filename.replace(".npy", ".jpg") + # cv2.imwrite(filename, disp) + np.save(filename, disp) + +# Method taken from /n/fs/raft-depth/RAFT-Stereo/datasets/SintelStereo/sdk/python/sintel_io.py +def readDispSintelStereo(file_name): + a = np.array(Image.open(file_name)) + d_r, d_g, d_b = np.split(a, axis=2, indices_or_sections=3) + disp = (d_r * 4 + d_g / (2**6) + d_b / (2**14))[..., 0] + mask = np.array(Image.open(file_name.replace('disparities', 'occlusions'))) + valid = ((mask == 0) & (disp > 0)) + return disp, valid + +# Method taken from https://research.nvidia.com/sites/default/files/pubs/2018-06_Falling-Things/readme_0.txt +def readDispFallingThings(file_name): + a = np.array(Image.open(file_name)) + with open('/'.join(file_name.split('/')[:-1] + ['_camera_settings.json']), 'r') as f: + intrinsics = json.load(f) + fx = 
intrinsics['camera_settings'][0]['intrinsic_settings']['fx'] + disp = (fx * 6.0 * 100) / a.astype(np.float32) + valid = disp > 0 + return disp, valid + +# Method taken from https://github.com/castacks/tartanair_tools/blob/master/data_type.md +def readDispTartanAir(file_name): + depth = np.load(file_name) + disp = 80.0 / depth + valid = disp > 0 + return disp, valid + + +def readDispMiddlebury(file_name): + if basename(file_name) == 'disp0GT.pfm': + disp = readPFM(file_name).astype(np.float32) + assert len(disp.shape) == 2 + nocc_pix = file_name.replace('disp0GT.pfm', 'mask0nocc.png') + assert exists(nocc_pix) + nocc_pix = imageio.imread(nocc_pix) == 255 + assert np.any(nocc_pix) + return disp, nocc_pix + elif basename(file_name) == 'disp0.pfm': + disp = readPFM(file_name).astype(np.float32) + valid = disp < 1e3 + return disp, valid + +def writeDispMiddlebury(file_name, disp): + writePFM(file_name, disp) + +def writeFlowKITTI(filename, uv): + uv = 64.0 * uv + 2**15 + valid = np.ones([uv.shape[0], uv.shape[1], 1]) + uv = np.concatenate([uv, valid], axis=-1).astype(np.uint16) + cv2.imwrite(filename, uv[..., ::-1]) + + +def read_gen(file_name, pil=False): + ext = splitext(file_name)[-1] + if ext == '.png' or ext == '.jpeg' or ext == '.ppm' or ext == '.jpg': + return Image.open(file_name) + elif ext == '.bin' or ext == '.raw': + return np.load(file_name) + elif ext == '.flo': + return readFlow(file_name).astype(np.float32) + elif ext == '.pfm': + flow = readPFM(file_name).astype(np.float32) + if len(flow.shape) == 2: + return flow + else: + return flow[:, :, :-1] + return [] + +def write_gen(file_name, disp, pil=False): + ext = splitext(file_name)[-1] + if ext == '.png' or ext == '.jpeg' or ext == '.ppm' or ext == '.jpg': + raise Exception("no support for {} file".format(ext)) + elif ext == '.bin' or ext == '.raw': + np.save(disp, file_name) + elif ext == '.flo': + writeFlow(file_name, disp) + elif ext == '.pfm': + writePFM(file_name, disp) + else: + raise Exception("no support for {} file".format(ext)) \ No newline at end of file diff --git a/core/utils/plane.py b/core/utils/plane.py new file mode 100644 index 0000000000000000000000000000000000000000..9c4c9fa127aac559fbd414653b93c320d7330c5a --- /dev/null +++ b/core/utils/plane.py @@ -0,0 +1,289 @@ +import os +import sys +import time +import numpy as np + +import torch +from torch import nn +from torch.nn import functional as F +from PIL import Image + +import frame_utils +import vis + + +def get_pos(H,W,disp=None,slant="slant",slant_norm=False,patch_size=None,device=None): + if slant=="slant": + u,v = torch.arange(W,device=device), torch.arange(H,device=device) + grid_u, grid_v = torch.meshgrid(u, v, indexing="xy") + if slant_norm: + grid_u = grid_u/W + grid_v = grid_v/H + elif slant=="slant_local": + assert H%patch_size==0 and W%patch_size==0 + if not slant_norm: + u = torch.arange(-patch_size/2+0.5, patch_size/2-0.5 + 1, step=1, device=device) + v = torch.arange(-patch_size/2+0.5, patch_size/2-0.5 + 1, step=1, device=device) + else: + # restrict into (-1,1) + u = torch.arange(-1+1/patch_size, 1, step=2/patch_size, device=device) + v = torch.arange(-1+1/patch_size, 1, step=2/patch_size, device=device) + # print(u,v,sep="\r\n") + u = u.tile((W//patch_size)) + v = v.tile((H//patch_size)) + grid_u, grid_v = torch.meshgrid(u, v, indexing="xy") + # print(grid_u.shape, grid_v.shape) + # print(grid_u[0:2,:10], grid_v[0:2, :10], sep="\r\n") + grid_u = grid_u.view((1,1,H,W)) + grid_v = grid_v.view((1,1,H,W)) + if disp is not None: + pos = 
torch.cat([grid_u,grid_v,disp],dim=1) + else: + pos = torch.cat([grid_u,grid_v],dim=1) + return pos.float() + +def convert2patch(data, patch_size, div_last=False): + """ + data: B,C,H,W; + """ + B,C,H,W = data.shape + assert H%patch_size==0 and W%patch_size==0 + patch_data = F.unfold(data, kernel_size=patch_size, dilation=1, padding=0, stride=patch_size) + patch_data = patch_data.view((-1,C,patch_size*patch_size,H//patch_size,W//patch_size)) + if div_last: + patch_data[:,-1] /= patch_size + return patch_data + +def intra_dist4patch(patch_data, patch_size): + """ + patch_data: B,C,patch_size*patch_size,H,W + """ + src = patch_data.unsqueeze(3).tile((1,1,1,patch_size*patch_size,1,1)) + tar = patch_data.unsqueeze(2).tile((1,1,patch_size*patch_size,1,1,1)) + dist = torch.sqrt(torch.square(src-tar).sum(dim=1)) + return dist + +def get_adjacent_matrix(dist,patch_size,thold=3): + connect = (distbikhw', connect, connect) + connect = (connect>0).float() + connect = (connect>0).sum(dim=2) + return connect + +def reduce_noise(patch_coord, mask): + """ + patch_coord: B,C,patch_size*patch_size,H,W; + mask: B,patch_size*patch_size,H,W; + """ + # replace the other clique with center point of the largest clique + center_coord = (patch_coord*mask.unsqueeze(1)).sum(dim=2) / mask.sum(dim=1) + chs_coord = patch_coord*mask.unsqueeze(1) + (~mask.unsqueeze(1)) * center_coord.unsqueeze(2) + # print(mask.shape, coord.shape, patch_coord.shape, chs_coord.shape) + return chs_coord + +# def abs2relative(patch_coord): +# """ +# patch_coord: B,C,patch_size*patch_size,H,W; +# """ +# center_patch_coord = patch_coord.mean(dim=2) +# rel_patch_coord = patch_coord - center_patch_coord.unsqueeze(2) +# return rel_patch_coord, center_patch_coord + +def get_plane_lstsq(chs_coord, slant, patch_coord=None): + """ + chs_coord: B,C,patch_size*patch_size,H,W; + mask: B,patch_size*patch_size,H,W; + return: + cab: B,6,H,W; (disparity, a, b, g_uu, g_vv, g_uv) + """ + # "slant": get a*u + b*v - d + c = 0 through least squares + # "slant_local": a*(u-u_p) + b*(b-b_p) - (d-d_p) = 0 + B,C,L,H,W = chs_coord.shape + chs_coord = chs_coord.flatten(-2,-1).transpose(-2,-1) # (B,C,H*W,patch_size*patch_size) + u_coord = chs_coord[:,0] + v_coord = chs_coord[:,1] + d_coord = chs_coord[:,2] + A = torch.stack((torch.ones_like(u_coord), u_coord, v_coord, + u_coord*u_coord/2, v_coord*v_coord/2, u_coord*v_coord), dim=3) # (B,H*W,patch_size*patch_size,6) + + # print(chs_coord.shape, A.shape, d_coord.shape) + cab = torch.linalg.lstsq(A, d_coord).solution # B,H*W,C + cab = cab.transpose(1,2).view((-1,6,H,W)) + + # # A(B,N,P,C) X(B,N,C) Y(B,N,P) + # # print("-"*10, A.shape, d_coord.shape, abc.shape) + # left_top = torch.einsum('aijk,aikh->aijh', A.transpose(-1,-2), A) # (B,N,C,C) + # right_top = -torch.einsum('aijk,aikh->aijh', A.transpose(-1,-2), d_coord.unsqueeze(-1)) # (B,N,C,1) + # left_bottom = right_top.transpose(-1,-2) # (B,N,1,C) + # right_bottom = d_coord.square().sum(dim=-1,keepdim=True).unsqueeze(-1) # (B,N,1,1) + # top = torch.cat([left_top,right_top], dim=3) + # bottom = torch.cat([left_bottom,right_bottom], dim=3) + # B = torch.cat([top,bottom], dim=2) + # L, V = torch.linalg.eig(B) + # print(L, V.shape) + + return cab + +def extract_plane(disp,slant="slant", slant_norm=False, patch_size=4,thold=3,vis=False): + """ + disp: B,1,H,W; + return: + cab: B,6,H,W; (disparity, a, b, g_uu, g_vv, g_uv) + """ + # cluster through nearest search + patch_pos = convert2patch(disp, patch_size=patch_size) + dist = intra_dist4patch(patch_pos, 
patch_size=patch_size) + connect = get_adjacent_matrix(dist, patch_size=patch_size, thold=thold) + + # get the largest clique + mask = connect - torch.amax(connect,dim=1).unsqueeze(1) + mask = mask >= -0.0001 + # print((mask==0).sum(), (mask>0.5).sum(), mask.size()) + # print(disp[0,0,8:12,0:4], patch_pos[0,0,:,2,0], dist[0,:,:,2,0], connect[0,:,2,0], mask[0,:,2,0], sep="\r\n") + + # get the 3d coordinate (u,v,d) of each point + B,_,H,W = disp.shape + coord = get_pos(H,W,disp=disp,slant=slant,slant_norm=slant_norm,patch_size=patch_size) + patch_coord = convert2patch(coord, patch_size=patch_size, div_last=True) + + # replace the other clique with center point of the largest clique + chs_coord = reduce_noise(patch_coord, mask) + # print(coord[0,:,400:404,400:404], patch_coord[0,:,:,100,100], chs_coord[0,:,:,100,100], sep="\r\n") + + # "slant": get a*u + b*v - d + c = 0 through least squares + # "slant_local": a*(u-u_p) + b*(b-b_p) - (d-d_p) = 0 + cab = get_plane_lstsq(chs_coord, slant, patch_coord) + + if vis: + return cab, mask + return cab + +def predict_disp(cab, uv_coord, patch_size, mul_last=False): + """ + cab: B,6,H,W; (disparity, a, b, g_uu, g_vv, g_uv) + uv_coord: B,2,patch_size*patch_size,H,W; + """ + u_coord = uv_coord[:,0] + v_coord = uv_coord[:,1] + A = torch.stack((torch.ones_like(u_coord), u_coord, v_coord, + u_coord*u_coord/2, v_coord*v_coord/2, u_coord*v_coord), dim=1) # (B,6,patch_size*patch_size,H,W) + d_coord = (A * cab.unsqueeze(dim=2)).sum(dim=1) + if mul_last: + d_coord *= patch_size + # print(d_coord.shape) + return d_coord + +def compute_curvature(cab): + """ + cab: B,6,H,W; (disparity, a, b, g_uu, g_vv, g_uv) + + """ + B,C,H,W = cab.shape + hessian = torch.stack([cab[0,-3], cab[0,-1], cab[0,-1], cab[0,-2]],dim=-1).reshape(H,W,2,2) + eigen_val, eigen_vec = torch.linalg.eigh(hessian) + Gaussian_cur = eigen_val[...,0] * eigen_val[...,1] + mean_cur = (eigen_val[...,0] + eigen_val[...,1]) / 2 + + Gaussian_cur = Gaussian_cur.abs() + mean_cur = mean_cur.abs() + + Gaussian_cur[Gaussian_cur>0.03] = 0 + mean_cur[mean_cur>0.01] = 0 + + Gaussian_cur = (Gaussian_cur - Gaussian_cur.min()) / (Gaussian_cur.max()-Gaussian_cur.min()) + mean_cur = (mean_cur - mean_cur.min()) / (mean_cur.max()-mean_cur.min()) + + # print(Gaussian_cur[120, 170:180], mean_cur[120, 170:180], cab[0,-3:, 120, 170:180], sep="\r\n") + + return Gaussian_cur, mean_cur + + +if __name__ == '__main__': + # slant = "slant" + slant = "slant_local" + # slant_norm = True + slant_norm = False + patch_size = 4 + root = "/horizon-bucket/saturn_v_dev/01_users/chengtang.yao/Sceneflow" + disp_path = root+"/flyingthings3d/disparity/TRAIN/A/0717/left/0006.pfm" + left_path = root+"/flyingthings3d/frames_cleanpass/TRAIN/A/0717/left/0006.png" + sv_path = "./tmp.png" + + img0 = np.array(Image.open(left_path)) + disp = np.array(frame_utils.readPFM(disp_path)) + # disp = np.zeros((20,20)) + # disp[9:] = 10 + H,W = disp.shape + + start_time = time.time() + disp = torch.from_numpy(disp).unsqueeze(0).unsqueeze(0) + img0 = torch.from_numpy(img0).permute((2,0,1)).unsqueeze(0) + + # extract planes a*u + b*v - d + c = 0 + # (B,6,H,W) ~ [disparity, u_coord, v_coord, g_uu, g_vv, g_uv] + cab, mask = extract_plane(disp, + slant=slant, slant_norm=slant_norm, + patch_size=patch_size, thold=3, vis=True) + # print(cab.shape) + + uv_coord = get_pos(H,W, slant=slant, slant_norm=slant_norm, patch_size=patch_size) + patch_uv_coord = convert2patch(uv_coord, patch_size=patch_size) + d_coord = predict_disp(cab, patch_uv_coord, patch_size=patch_size, 
mul_last=True) + + patch_disp = convert2patch(disp, patch_size=patch_size, div_last=True) + rec_disp = F.fold(d_coord.flatten(-2,-1), disp.shape[-2:], kernel_size=patch_size, stride=patch_size).view(1,1,H,W) + rec_mask = F.fold(mask.flatten(-2,-1).float(), disp.shape[-2:], kernel_size=patch_size, stride=patch_size).view(1,1,H,W).bool() + # print(rec_disp.shape, patch_disp.shape, disp.shape[-2:]) + + # print(disp.shape, img0.shape, patch_pos.shape, dist.shape, connect.shape, mask.shape) + # test_v, test_u = 100,100 + # torch.set_printoptions(precision=2) + # print(src[0,:,0,:,test_v, test_u], tar[0,:,0,:,test_v, test_u], patch_pos[0,:,:,test_v, test_u], dist[0,:,:,test_v, test_u], sep="\r\n") + # print(connect[0,:,test_v, test_u], mask[0,:,test_v, test_u], sep="\r\n") + + end_time = time.time() + print("cost time: {}".format(end_time-start_time), cab.shape) + + disp = disp.squeeze(0).squeeze(0).cpu().data.numpy() + img0 = img0.squeeze(0).permute((1,2,0)).cpu().data.numpy() + patch_disp = patch_disp[0,0,0,...].cpu().data.numpy() + rec_disp = rec_disp[0,0,...].cpu().data.numpy() + rec_mask = rec_mask[0,0,...].cpu().data.numpy() + + error_map = np.abs(rec_disp-disp) + color_error_map = vis.colorize_error_map(error_map) + + # normals + degree = torch.atan(cab[0,1] / cab[0,2]) + + # curvatures + Gaussian_cur, mean_cur = compute_curvature(cab) + print("-"*10, Gaussian_cur.min(), Gaussian_cur.max(), Gaussian_cur.mean(), Gaussian_cur.median()) + print("-"*10, mean_cur.min(), mean_cur.max(), mean_cur.mean(), mean_cur.median()) + + atom_dict = [{"img":img0, "title":"Left Image", }, + {"img":disp, "title":"GT Disparity", "cmap":'jet', }, + {"img":patch_disp, "title":"GT Patch Disparity", "cmap":'jet', }, + {"img":rec_disp, "title":"GT recover Disparity", "cmap":'jet', }, + {"img":rec_mask, "title":"rec_mask", "cmap": "gray"}, + {"img":color_error_map, "title":"color_error_map", }, + {"img":degree, "title":"GT ab", "cmap":'jet', }, + {"img":cab[0,0], "title":"GT c", "cmap":'jet', }, + + {"img":Gaussian_cur.abs(), "title":"Gaussian curvature", "cmap":'jet', }, + {"img":mean_cur.abs(), "title":"mean curvature", "cmap":'jet', }, + ] + + if slant=="slant_local": + d_p = cab[0,0] + error_map = np.abs(d_p-patch_disp) + color_error_map = vis.colorize_error_map(error_map) + tmp_dict = [{"img":d_p, "title":"GT Disparity of Plane", "cmap":'jet', }, + {"img":color_error_map, "title":"color_error_map of Plane", },] + atom_dict += tmp_dict + + vis.show_imgs(atom_dict, + sv_img=True, save2where=sv_path, if_inter=False, + fontsize=20, szWidth=10, szHeight=5, group=2) \ No newline at end of file diff --git a/core/utils/utils.py b/core/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..16c1372becfe8e32f6a9765acf8503c1cdf5b126 --- /dev/null +++ b/core/utils/utils.py @@ -0,0 +1,351 @@ +import os +import sys +import time +import shutil +import logging +import numpy as np + +from scipy import interpolate +from datetime import datetime + +import torch +import torch.nn.functional as F + + + +class InputPadder: + """ Pads images such that dimensions are divisible by 8 """ + def __init__(self, dims, mode='sintel', divis_by=8): + self.ht, self.wd = dims[-2:] + pad_ht = (((self.ht // divis_by) + 1) * divis_by - self.ht) % divis_by + pad_wd = (((self.wd // divis_by) + 1) * divis_by - self.wd) % divis_by + if mode == 'sintel': + self._pad = [pad_wd//2, pad_wd - pad_wd//2, pad_ht//2, pad_ht - pad_ht//2] + else: + self._pad = [pad_wd//2, pad_wd - pad_wd//2, 0, pad_ht] + + def pad(self, *inputs): + 
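+        """Replicate-pad every 4-D (N, C, H, W) tensor in `inputs` by the margins computed in __init__."""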
assert all((x.ndim == 4) for x in inputs) + return [F.pad(x, self._pad, mode='replicate') for x in inputs] + + def pad_intrinsics(self, intrinsic): + intrinsic[:, 2] += self._pad[0] + intrinsic[:, 3] += self._pad[2] + return intrinsic + + def unpad(self, x): + assert x.ndim == 4 + ht, wd = x.shape[-2:] + c = [self._pad[2], ht-self._pad[3], self._pad[0], wd-self._pad[1]] + return x[..., c[0]:c[1], c[2]:c[3]] + +def forward_interpolate(flow): + flow = flow.detach().cpu().numpy() + dx, dy = flow[0], flow[1] + + ht, wd = dx.shape + x0, y0 = np.meshgrid(np.arange(wd), np.arange(ht)) + + x1 = x0 + dx + y1 = y0 + dy + + x1 = x1.reshape(-1) + y1 = y1.reshape(-1) + dx = dx.reshape(-1) + dy = dy.reshape(-1) + + valid = (x1 > 0) & (x1 < wd) & (y1 > 0) & (y1 < ht) + x1 = x1[valid] + y1 = y1[valid] + dx = dx[valid] + dy = dy[valid] + + flow_x = interpolate.griddata( + (x1, y1), dx, (x0, y0), method='nearest', fill_value=0) + + flow_y = interpolate.griddata( + (x1, y1), dy, (x0, y0), method='nearest', fill_value=0) + + flow = np.stack([flow_x, flow_y], axis=0) + return torch.from_numpy(flow).float() + + +def bilinear_sampler(img, coords, mode='bilinear', mask=False): + """ Wrapper for grid_sample, uses pixel coordinates """ + H, W = img.shape[-2:] + xgrid, ygrid = coords.split([1,1], dim=-1) + xgrid = 2*xgrid/(W-1) - 1 + if H > 1: + ygrid = 2*ygrid/(H-1) - 1 + + grid = torch.cat([xgrid, ygrid], dim=-1) + img = F.grid_sample(img, grid, align_corners=True) + + if mask: + mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1) + return img, mask.float() + + return img + + +def coords_grid(batch, ht, wd): + coords = torch.meshgrid(torch.arange(ht), torch.arange(wd)) + coords = torch.stack(coords[::-1], dim=0).float() + return coords[None].repeat(batch, 1, 1, 1) + +def hor_coords_grid(batch, ht, wd): + # (batch,1,H,W) + hor_coords = torch.arange(wd).float().repeat(batch, 1, ht, 1) + return hor_coords + + +def upflow8(flow, mode='bilinear'): + new_size = (8 * flow.shape[2], 8 * flow.shape[3]) + return 8 * F.interpolate(flow, size=new_size, mode=mode, align_corners=True) + +def gauss_blur(input, N=5, std=1): + B, D, H, W = input.shape + x, y = torch.meshgrid(torch.arange(N).float() - N//2, torch.arange(N).float() - N//2) + unnormalized_gaussian = torch.exp(-(x.pow(2) + y.pow(2)) / (2 * std ** 2)) + weights = unnormalized_gaussian / unnormalized_gaussian.sum().clamp(min=1e-4) + weights = weights.view(1,1,N,N).to(input) + output = F.conv2d(input.reshape(B*D,1,H,W), weights, padding=N//2) + return output.view(B, D, H, W) + +def disparity_computation(params, slant=None, slant_norm=False, coords0=None): + """ + args: + params: (B,C,...), C is the type of parameters. + coords0: (B,C,...), C is the number of coordinates' axis. 
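+
+    For slant == "slant", the per-pixel disparity is reconstructed from the plane
+    parameters as d = params[:,0]*u + params[:,1]*v + params[:,2], where (u, v) are
+    taken from coords0 and divided by (W, H) first when slant_norm is True.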
+ """ + if slant is None or len(slant)==0 : + offset = params + elif slant=="slant" : + # d = a*u + b*v + c + B,H,W = coords0.shape[0], coords0.shape[-2], coords0.shape[-1] + if slant_norm: + norm_range = torch.Tensor([W,H])[None,:,None,None].float().to(coords0.device) + offset = params[:,0] * coords0[:,0] / norm_range[:,0] + \ + params[:,1] * coords0[:,1] / norm_range[:,1] + \ + params[:,2] + else: + offset = params[:,0] * coords0[:,0] + \ + params[:,1] * coords0[:,1] + \ + params[:,2] + elif slant=="slant_local": + raise Exception("slant_local is not supported") + else: + raise Exception(f"{slant} is not supported") + return offset + + +def sv_intermediate_results(data, name, sv_path): + try: + sv_path = os.path.join(sv_path, "data") + if not os.path.exists(sv_path): + os.makedirs(sv_path) + + data_numpy = data.cpu().data.numpy() + np.save(os.path.join(sv_path, name+".npy"), data_numpy) + # print("saving to {}".format( os.path.join(sv_path, name+".npy") )) + except Exception as err: + raise Exception(err, data.shape, name, sv_path) + +def load_intermediate_results(name, sv_path): + sv_path = os.path.join(sv_path, "data") + data = np.load(os.path.join(sv_path, name+".npy")) + return data + + +def rescale_modulation(itr, iters, modulation_alg, modulation_ratio): + # we hope modulation has less effect at the first several iterations as the disp is unreliable and the lcoal LBP disp is unreliable + if modulation_alg == "linear": + ratio = modulation_ratio * itr / iters + elif modulation_alg == "sigmoid": + ratio = modulation_ratio * 1 / (1 + np.exp(-2 * (itr - 5))) + else: + raise Exception("Not supported modulation_alg: {}".format(modulation_alg)) + return ratio + + + +NODE_RANK = os.getenv('NODE_RANK', default=0) +LOCAL_RANK = os.getenv("LOCAL_RANK", default=0) +LOG_ROOT = os.getenv('LOG_ROOT', default="logs") +TB_ROOT = os.getenv('TB_ROOT', default="runs") + +class LoggerCommon: + def __init__(self, name): + self.name = name + self.log_name = '{}-{}.log'.format(name, datetime.now().strftime("%y%m%d_%H%M%S")) + self.log_path = os.path.join(LOG_ROOT, self.log_name) + if int(LOCAL_RANK)==0 and int(NODE_RANK)==0: + os.makedirs(LOG_ROOT, exist_ok=True) + logging.basicConfig(level=logging.INFO, + format='%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s', + handlers = [logging.FileHandler(self.log_path), + logging.StreamHandler()] + ) + self.logger = logging.getLogger(name) + self.logger.addHandler(logging.FileHandler(self.log_path)) + + def _set_handlers(self): + # 清除之前的所有处理器 + self.logger.handlers.clear() + + # 设置文件和控制台处理器 + file_handler = logging.FileHandler(self.log_path) + console_handler = logging.StreamHandler() + + # 设置处理器格式 + formatter = logging.Formatter('%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s') + file_handler.setFormatter(formatter) + console_handler.setFormatter(formatter) + + # 添加处理器 + self.logger.addHandler(file_handler) + self.logger.addHandler(console_handler) + + def set_log_path(self, new_log_path, name=None): + # 删除旧日志文件(如果存在) + if os.path.exists(self.log_path): + os.remove(self.log_path) + + # 更新路径并重设处理器 + if name is not None: + self.name = name + self.log_name = '{}-{}.log'.format(self.name, datetime.now().strftime("%y%m%d_%H%M%S")) + self.log_path = os.path.join(new_log_path, self.log_name) + os.makedirs(new_log_path, exist_ok=True) + self._set_handlers() + + + def info(self, message): + if int(LOCAL_RANK)==0 and int(NODE_RANK)==0: + self.logger.info(message) + + def warning(self, message): + if int(LOCAL_RANK)==0 and 
int(NODE_RANK)==0: + self.logger.warning(message) + + def error(self, message): + if int(LOCAL_RANK)==0 and int(NODE_RANK)==0: + self.logger.error(message) + + def exception(self, message): + if int(LOCAL_RANK)==0 and int(NODE_RANK)==0: + self.logger.exception(message) + + def print_args(self, args): + msg = "" + args_dict = vars(args) + max_arg_length = max(len(arg_name) for arg_name in args_dict.keys()) + for arg_name, arg_value in args_dict.items(): + arg_name_padded = arg_name.ljust(max_arg_length) + msg += f"{arg_name_padded}: {arg_value}\r\n" + self.info(msg) + + + +from torch.utils.tensorboard import SummaryWriter + +class LoggerTraining(LoggerCommon): + + SUM_FREQ = 100 + + def __init__(self, name, model=None, scheduler=None): + super(LoggerTraining, self).__init__(name) + + if int(LOCAL_RANK)==0 and int(NODE_RANK)==0: + os.makedirs(TB_ROOT, exist_ok=True) + + self.model = model + self.scheduler = scheduler + self.silence = False + self.total_steps = 0 + self.running_loss = {} + self.writer = SummaryWriter(log_dir=TB_ROOT) + + def set_training(self, model, scheduler): + self.model = model + self.scheduler = scheduler + + def _print_training_status(self): + metrics_data = [self.running_loss[k]/LoggerTraining.SUM_FREQ for k in sorted(self.running_loss.keys())] + training_str = "[{:6d}, {:10.7f}] ".format(self.total_steps+1, self.scheduler.get_last_lr()[0]) + metrics_str = ("{:10.4f}, "*len(metrics_data)).format(*metrics_data) + + # print the training status + self.info(f"Training Metrics ({self.total_steps}): {training_str + metrics_str}") + + if self.writer is None: + self.writer = SummaryWriter(log_dir=TB_ROOT) + + for k in self.running_loss: + self.writer.add_scalar(k, self.running_loss[k]/LoggerTraining.SUM_FREQ, self.total_steps) + self.running_loss[k] = 0.0 + + def push(self, metrics): + self.total_steps += 1 + + for key in metrics: + if key not in self.running_loss: + self.running_loss[key] = 0.0 + + self.running_loss[key] += metrics[key] + + if self.total_steps % LoggerTraining.SUM_FREQ == LoggerTraining.SUM_FREQ-1: + self._print_training_status() + self.running_loss = {} + + def write_dict(self, results): + if self.writer is None: + self.writer = SummaryWriter(log_dir=TB_ROOT) + + for key in results: + self.writer.add_scalar(key, results[key], self.total_steps) + + def close(self): + self.writer.close() + + + +def init_directories(directories): + if int(LOCAL_RANK)==0 and int(NODE_RANK)==0 : + for directory in directories: + os.makedirs(directory, exist_ok=True) + +def delete_directories_if_static(directories): + if int(LOCAL_RANK)==0 and int(NODE_RANK)==0 : + # 如果检测到文件大小有变化,终止删除操作 + if not is_any_folder_static(directories): + print("File sizes are changing in one of the directories {}.".format(directories) + \ + "No directories will be deleted.") + return + + # 如果所有文件都静止,删除目录 + for directory in directories: + if os.path.exists(directory): + shutil.rmtree(directory) + print(f"Directory {directory} deleted") + +def get_file_sizes(directories): + """返回多个目录中所有文件的大小字典""" + file_sizes = {} + for directory in directories: + if os.path.exists(directory): + for root, dirs, files in os.walk(directory): + for file in files: + filepath = os.path.join(root, file) + file_sizes[filepath] = os.path.getsize(filepath) + return file_sizes + +def is_any_folder_static(directories, check_interval=2): + """检测所有文件是否静止(没有变化)""" + # 获取所有文件初始大小 + initial_sizes = get_file_sizes(directories) + time.sleep(check_interval) # 等待一段时间,观察文件变化 + final_sizes = {filepath: os.path.getsize(filepath) for filepath 
in initial_sizes if os.path.exists(filepath)} + + # 如果文件大小一致,则所有文件静止 + return initial_sizes == final_sizes \ No newline at end of file diff --git a/core/utils/vis.py b/core/utils/vis.py new file mode 100644 index 0000000000000000000000000000000000000000..cae341aa1a85ff1b4b241f1e9c2c94d81937020c --- /dev/null +++ b/core/utils/vis.py @@ -0,0 +1,638 @@ +import os +import re +import sys +import cv2 +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.colors as mcolors +import frame_utils as frame_utils + + + +def show_imgs(param, sv_img=False, save2where=None, + fontsize=20, szWidth=10, szHeight=5, group=3, + if_inter=False, dpi=600): + """function: visualize the input data + args: + paras: [(img, title, colormap), ... ] or + [{"img":..., "title":..., "cmap":..., "point_x":..., "point_y":..., "point_s":..., "point_c":..., "point_m":..., "colorbar":...}, ... ] + sv_img: whether to save the visualization + fontsize : the size of font in title + szWidth, szHeight: width and height of each subfigure + group: the columns of the whole figure + """ + img_num = len(param) + cols = int(group) + rows = int(np.ceil(img_num/group)) + sv_title = "" + color_map = None + plt_par_list = [] +# plt.clf() + fig = plt.figure(figsize=(szWidth*cols, szHeight*rows)) + + for i in np.arange(img_num) : + if len(param[i])<2 : + raise Exception("note, each element should be (img, title, ...)") + + if isinstance(param[i], list) or isinstance(param[i], np.ndarray) or isinstance(param[i], tuple) : + name_list = ["img", "title", "cmap", "point_x", "point_y", "point_s", "point_c", "point_m", "point_alpha"] + plt_par = {} + for key_id, ele in enumerate(param[i]) : + plt_par[name_list[key_id]] = ele + elif isinstance(param[i], dict) : + plt_par = param[i] + else : + raise Exception("unrecognized type: {}, only recept element with type list, np.ndarray, tuple or dict".format(type(param[i]))) + plt_par_list.append(plt_par) + + plt.subplot(rows,cols,i+1) +# plt.subplots_adjust(wspace =0, hspace =0)#调整子图间距 + plt.title(plt_par.get("title").replace("\t"," "), fontsize=fontsize) + im = plt.imshow(plt_par.get("img"), cmap=plt_par.get("cmap"), alpha=plt_par.get("alpha"), + vmin=plt_par.get("vmin"), vmax=plt_par.get("vmax")) + + if plt_par.get("colorbar") == True : + # plt.colorbar(im, orientation='horizontal', fraction=0.02, pad=0.0004) + plt.colorbar(im, orientation='horizontal') + + if plt_par.get("point_x") is not None and plt_par.get("point_y") is not None : + plt.scatter(plt_par.get("point_x"), plt_par.get("point_y"), s=plt_par.get("point_s"), c=plt_par.get("point_c"), marker=plt_par.get("point_m"), alpha=plt_par.get("point_alpha")) + plt.axis("off") + +# plt.gca().xaxis.set_major_locator(plt.NullLocator()) +# plt.gca().yaxis.set_major_locator(plt.NullLocator()) +# plt.subplots_adjust(top=1,bottom=0,left=0,right=1,hspace=0,wspace=0) +# plt.margins(0,0) + fig.subplots_adjust(left=None, bottom=None, right=None, wspace=None, hspace=None) + + if sv_img is True : + if i>0 : + sv_title += "-" + sv_title += plt_par.get("title") + + if if_inter : + from ipywidgets import Output + output = Output() + display(output) + + @output.capture() + def onclick(event): + if event.button == 3 and event.ydata is not None and event.xdata is not None : + print_info = "" + for i in np.arange(img_num) : + img = plt_par_list[i].get("img") + title = plt_par_list[i].get("title") + print_info += "{}:\t({},{})-{}\r\n".format(title, int(np.round(event.ydata)), int(np.round(event.xdata)), 
img[int(np.round(event.ydata)),int(np.round(event.xdata))]) + print(print_info) + + cid = fig.canvas.mpl_connect('button_press_event', onclick) + plt.tight_layout() + + if sv_img is True and save2where is not None : + plt.savefig(os.path.join(save2where), dpi=dpi) + # plt.show(block=False) + plt.close() + + +def show_dis(param, sv_img=False, fontsize=20, szWidth=10, szHeight=5, group=3): + """function: visualize the input data + args: + paras: [([(x,y,label),(x,y,label),...], title), ... ] or + [{"x":...shape(num_type,inter), "y":...shape(num_type,inter), "label":...shape(batch,), "title":...}, ... ] + sv_img: whether to save the visualization + fontsize : the size of font in title + szWidth, szHeight: width and height of each subfigure + group: the columns of the whole figure + """ + fig_num = len(param) + cols = group + rows = np.ceil(fig_num/group) + sv_title = "" + color_map = None + plt.figure(figsize=(szWidth*cols, szHeight*rows)) + + for i in np.arange(fig_num) : + if len(param[i])<3 : + raise Exception("note, each element should be (x, y, title, ...)") + + if isinstance(param[i], list) or isinstance(param[i], np.ndarray) or isinstance(param[i], tuple) : + name_list = ["x", "y", "title", "cmap", "point_x", "point_y", "point_s", "point_c", "point_m"] + plt_par = {} + for key_id, ele in enumerate(param[i]) : + plt_par[name_list[key_id]] = ele + elif isinstance(param[i], dict) : + plt_par = param[i] + else : + raise Exception("unrecognized type: {}, only recept element with type list, np.ndarray, tuple or dict".format(type(param[i]))) + + plt.subplot(rows,cols,i+1) + plt.title(plt_par.get("title"), fontsize=fontsize) + plt.bar(plt_par.get("x"), plt_par.get("y"), color=plt_par.get("cmap")) +# plt.legend() + + if plt_par.get("point_x") is not None and plt_par.get("point_y") is not None : + plt.scatter(plt_par.get("point_x"), plt_par.get("point_y"), s=plt_par.get("point_s"), c=plt_par.get("point_c"), marker=plt_par.get("point_m")) +# plt.axis("off") + if sv_img is True : + if i>0 : + sv_title += "-" + sv_title += plt_par.get("title") + + if sv_img is True : + plt.savefig(os.path.join(args.save2where,sv_title+".png")) + # plt.show(block=False) + + +def compute_confidence(movement_cur, movement_pre): + # mask_forward = ((movement_cur<-1) & (movement_cur>=movement_pre-3)) | (movement_cur>=-1) + mask_forward = np.ones_like(movement_cur) + mask_direction = ((np.abs(movement_cur)>1) & (np.abs(movement_pre)>1) & (movement_cur*movement_pre>0)) | (np.abs(movement_cur)<=1) | (np.abs(movement_pre)<=1) + return mask_forward * mask_direction + + +class Visualizer: + def __init__(self, root, sv_root, dataset=None, scratch=True, args=None, logger=None): + self.root = root.rstrip("/") + self.sv_root = sv_root.rstrip("/") + self.dataset = dataset + self.scratch = scratch + self.args = args + + tmp_dir = self.args.dataset.lower() + self.sv_root = self.sv_root if self.sv_root[-(1+len(tmp_dir)):]=="/"+tmp_dir \ + else os.path.join(self.sv_root, tmp_dir) + self.vis_root = os.path.join(os.path.dirname(self.sv_root), "analysis", tmp_dir) + + self.my_print = print if logger is None else logger.info + self.my_print("saving prediction to {}, visualization to {}".format(self.sv_root, self.vis_root)) + + def save_pred_vis(self, flow_pr, imageGT_file): + assert self.root in imageGT_file, "{} not in {}".format(self.root, imageGT_file) + + # create saving path, /xxx/disp0GT.pfm -> /xxx/disp0GT-pred.pfm + sv_path = imageGT_file.replace(self.root, self.sv_root) + pre,lat = os.path.splitext(sv_path) + sv_path = pre + 
"-pred" + lat + if not self.scratch and os.path.exists(sv_path): + self.my_print("{} exists".format(sv_path)) + return True + + # build directory + sv_dir = os.path.dirname(sv_path) + os.makedirs(sv_dir, exist_ok=True) + + # write prediction + if self.dataset.lower()=="middlebury" : + frame_utils.writeDispMiddlebury(sv_path, flow_pr) + elif self.dataset.lower()=="kitti2015" : + frame_utils.writeDispKITTI(sv_path, flow_pr) + elif self.dataset.lower()=="eth3d" : + frame_utils.write_gen(sv_path, flow_pr) + elif self.dataset.lower()=="booster" : + frame_utils.writeDispBooster(sv_path, flow_pr) + elif self.dataset.lower()=="common": + frame_utils.writeDispKITTI(sv_path, flow_pr) + else: + raise Exception("such daatset is not supported: {}".format(dataset)) + return True + + def get_xpx(self, key_list): + pattern = re.compile(r'^\d+(\.\d+)?px_list$') + px_keys = [key for key in key_list if pattern.match(key)] + assert len(px_keys) <= 1, f"too many xpx in {key_list} ~ {px_keys}" + if len(px_keys)==0: + return "0px_list" + return px_keys[0] + + def get_error_map(self, pr_list, gt_list, stop_idx=-1): + error_map_list = [] + colored_error_map_list = [] + for idx in np.arange( len(pr_list) ): + if stop_idx>0 and idx>=stop_idx: + break + + gt = gt_list[0] if len(gt_list)==1 else gt_list[idx] + error_map = np.abs(pr_list[idx] - gt) + error_map[np.isinf(gt) | np.isnan(gt) | (gt==0)] = 0 + error_map_list.append(error_map) + + # colored_error_map = colorize_error_map(error_map, ver_hor="hor") + colored_error_map = colorize_error_map(error_map, ver_hor="ver") + colored_error_map_list.append(colored_error_map) + + return error_map_list, colored_error_map_list + + def get_imp_map(self, error_map_list, stop_idx=-1): + imp_map_list = [] + colored_imp_map_list = [] + for idx in np.arange( len(error_map_list) ): + if stop_idx>0 and idx>=stop_idx: + break + + imp_map = np.zeros_like(error_map_list[0]) if idx==0 else error_map_list[idx] - error_map_list[idx-1] + imp_map_list.append(imp_map) + + # colored_imp_map = colorize_improvement_map(imp_map, ver_hor="hor") + colored_imp_map = colorize_improvement_map(imp_map, ver_hor="ver") + colored_imp_map_list.append(colored_imp_map) + return imp_map_list, colored_imp_map_list + + def get_movement_map(self, pr_list, stop_idx=-1): + move_map_list = [] + colored_move_map_list = [] + for idx in range(0, len(pr_list)): + if stop_idx>0 and idx>=stop_idx: + break + + move_map = np.zeros_like(pr_list[idx]) if idx<1 else pr_list[idx] - pr_list[idx-1] + move_map_list.append(move_map) + + # colored_move_map = colorize_improvement_map(move_map, ver_hor="hor") + colored_move_map = colorize_improvement_map(move_map, ver_hor="ver") + colored_move_map_list.append(colored_move_map) + return move_map_list, colored_move_map_list + + def get_acceleration_map(self, move_map_list, stop_idx=-1): + # get the difference between movement vector + colored_acc_map_list =[] + for idx in range(0, len(move_map_list)): + if stop_idx>0 and idx>=stop_idx: + break + + acc_map = np.zeros_like(move_map_list[idx]) if idx<2 else move_map_list[idx] - move_map_list[idx-1] + + # colored_acc_map = colorize_improvement_map(acc_map, ver_hor="hor") + colored_acc_map = colorize_improvement_map(acc_map, ver_hor="ver") + colored_acc_map_list.append(colored_acc_map) + return colored_acc_map_list + + def get_mask(self, mask_list, binary_thold, stop_idx=-1): + colored_mask_list = [] + mask_binary_list = [] + for idx in range(0, len(mask_list)): + if stop_idx>0 and idx>=stop_idx: + break + + # colored_mask = 
colorize_confidence(mask, ver_hor="hor") + colored_mask = colorize_confidence(mask_list[idx], ver_hor="ver") + colored_mask_list.append(colored_mask) + + mask_binary = mask_list[idx] < binary_thold + mask_binary_list.append(mask_binary) + + return colored_mask_list, mask_binary_list + + def analyze(self, dict_list, imageGT_file, in_one_fig=False, group=2): + """ + dict_list: + [{"name": "disp", + "img_list": [...], + "cmap": "jet", + "epe_list": [...], + "xpx_list": [...], + "GT": [tensor], + "stop_idx": 20, + "improvement": False, + "movement": False, + "error_map": True, + "acceleration": False, + "mask": False, + "binary_thold": 0.5}, + ] + """ + # create saving path + file_name = "-".join(imageGT_file.replace(self.root, "").split("/"))[1:] + pre,lat = os.path.splitext(file_name) + file_name = pre+".png" + sv_path = os.path.join(self.vis_root, file_name) + + # build directory + sv_dir = os.path.dirname(sv_path) + os.makedirs(sv_dir, exist_ok=True) + + fig_data_list = [] + for vis_dict in dict_list : + vis_name = vis_dict.get("name", None) + assert vis_name is not None, "missing 'name' in vis_dict" + + GT = vis_dict.get("GT", None) + img_list = vis_dict.get("img_list", []) + cmap = vis_dict.get("cmap", None) + stop_idx = vis_dict.get("stop_idx", -1) + vmin = vis_dict.get("vmin", None) + vmax = vis_dict.get("vmax", None) + colorbar = vis_dict.get("colorbar", False) + + epe_list = vis_dict.get("epe_list", None) + xpx_name = self.get_xpx(vis_dict.keys()) + xpx_list = vis_dict.get(xpx_name, None) + + error_map_req = vis_dict.get("error_map", False) + movement_req = vis_dict.get("movement", False) + improvement_req = vis_dict.get("improvement", False) + acceleration_req = vis_dict.get("acceleration", False) + + binary_thold = vis_dict.get("binary_thold", 0.5) + mask_req = vis_dict.get("mask", False) + + if img_list is None or len(img_list)==0 : + continue + + # get the colored error maps for the prediction sequence + if error_map_req : + error_map_list, colored_error_map_list = self.get_error_map(img_list, GT, stop_idx) + + # get the colored improvement map between adjacent iterations, + # the improvement map of the first iteration is empty. 
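+            # (improvement[i] = error[i] - error[i-1], so negative values mean the error
+            # dropped at iteration i; the first entry is all zeros by construction.)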
+ if error_map_req and improvement_req : + imp_map_list, colored_imp_map_list = self.get_imp_map(error_map_list, stop_idx) + + # get the movement vector at each step + if movement_req : + move_map_list, colored_move_map_list = self.get_movement_map(img_list, stop_idx) + + # get the difference between movement vector + if acceleration_req : + colored_acc_map_list = self.get_acceleration_map(move_map_list, stop_idx) + + # get the colorized mask and binary mask + if mask_req : + colored_mask_list, mask_binary_list = self.get_mask(img_list, binary_thold, stop_idx) + + cnt = 0 + for idx in np.arange( len(img_list) ) : + if stop_idx>0 and idx>=stop_idx: + break + + info = "" + if epe_list is not None and len(epe_list) > 0 : + info = ": epe~{:.2f}".format(epe_list[idx]) + ", " + \ + "{}~{:.1f}".format(xpx_name[:-5], epe_list[idx]*100) + + idx_mark = f"" if len(img_list)==1 else f"-{idx}" + + if cmap is None or cmap.find("private") == -1 : + cnt += 1 + title = f"{vis_name}" + idx_mark + fig_data_list += [{"img" : img_list[idx], + "title" : title, + "cmap" : cmap, + "vmin" : vmin, + "vmax" : vmax, + "colorbar": colorbar},] + + if error_map_req : + cnt += 1 + title = f"{vis_name}-Error Map" + idx_mark + info + fig_data_list += [{"img" : colored_error_map_list[idx], + "title": title, + "cmap" : None, },] + + if error_map_req and improvement_req : + cnt += 1 + title = f"Improvement (err[i]-err[i-1])" + idx_mark + fig_data_list += [{"img" : colored_imp_map_list[idx], + "title": title, + "cmap" : None, },] + + if movement_req : + cnt += 1 + title = f"Movement (disp[i]-disp[i-1])" + idx_mark + fig_data_list += [{"img" : colored_move_map_list[idx], + "title": title, + "cmap" : None, },] + + if acceleration_req : + cnt += 1 + title = f"Acceleration (Move[i]-Move[i-1])" + idx_mark + fig_data_list += [{"img" : colored_acc_map_list[idx], + "title": title, + "cmap" : None, },] + + if mask_req: + cnt += 1 + title = f"Mask" + idx_mark + fig_data_list += [{"img" : colored_mask_list[idx], + "title": title, + "cmap" : None, },] + + cnt += 1 + title = f"Binary Mask" + idx_mark + fig_data_list += [{"img" : mask_binary_list[idx], + "title": title, + "cmap" : "gray", },] + if not in_one_fig: + tmp_group = cnt // (stop_idx if stop_idx>0 else len(img_list)) + H,W = img_list[0].shape + pre,lat = os.path.splitext(sv_path) + tmp_sv_path = pre + f"-sequence-{vis_name}" + lat + show_imgs(fig_data_list, + sv_img=True, save2where=tmp_sv_path, if_inter=False, + fontsize=20, szWidth=np.ceil(W/H)*5, szHeight=5, + group=tmp_group, dpi=300) + fig_data_list = [] + + if in_one_fig: + show_imgs(fig_data_list, + sv_img=True, save2where=sv_path, if_inter=False, + fontsize=20, szWidth=10, szHeight=5, group=group, dpi=300) + + +def colorize_error_map(error_map, ver_hor="hor"): + # Define a custom colormap for errors within 10 (shades of red) + num_colors = 10 + colors_map = [ + (255, 255, 255), # White + (255, 248, 220), # Brown + (255, 192, 203), # Pink + (128, 128, 128), # Gray + (128, 0, 128), # Purple + (64, 224, 208), # Turquoise + (255, 165, 0), # Orange + (255, 255, 0), # Yellow + (0, 128, 0), # Green + (0, 0, 255), # Blue + (255, 0, 0), # Red + ] + + # Create a blank colored map with the same dimensions as the error map + colored_map = np.zeros((error_map.shape[0], error_map.shape[1], 3), dtype=np.uint8) + + # Map error values within 10 to custom colors + for i in range(1, num_colors + 1): + colored_map[(error_map<i) & (error_map>=i-1)] = colors_map[i - 1] + colored_map[error_map>=num_colors] = colors_map[num_colors] + + # create color bar + font = 
cv2.FONT_HERSHEY_SIMPLEX + font_color = (0, 0, 0) # Black + if ver_hor=="hor": + bar_size = 15 + font_scale = 0.45 + font_thickness = 1 + color_bar = np.ones((bar_size, error_map.shape[1], 3))*255 + step = error_map.shape[1]//(num_colors+1) + for i in range(1+num_colors): + color_bar[bar_size//3:, i*step:(i+1)*step] = colors_map[i] + for i in range(1+num_colors): + x = i * step + step // 8 + y = bar_size//3*2 + cv2.putText(color_bar, str(i), (x, y), font, font_scale, font_color, font_thickness) + colored_map = np.vstack((colored_map, color_bar)) + + elif ver_hor=="ver": + bar_size = error_map.shape[1] // 10 + font_scale = 0.9 + font_thickness = 2 + color_bar = np.ones((error_map.shape[0], bar_size, 3))*255 + step = error_map.shape[0]//(num_colors+1) + for i in range(1+num_colors): + color_bar[i*step:(i+1)*step, bar_size//3:] = colors_map[i] + for i in range(1+num_colors): + y = i * step + step // 4 + x = bar_size//3*2 + cv2.putText(color_bar, str(i), (x, y), font, font_scale, font_color, font_thickness) + colored_map = np.hstack((colored_map, color_bar)) + + return colored_map.astype(np.uint8) + + +def colorize_confidence(confidence, ver_hor="hor"): + # Define a custom colormap for confidence values in [0, 1] (shades of brown) + colors_map = [ + (255, 219, 172), # Navajo White + (241, 194, 125), # Mellow Apricot + (233, 159, 51 ), + (224, 172, 105), # Fawn + (198, 134, 66 ), # Peru + (168, 112, 50 ), + (141, 85 , 36 ), # Russet + (121, 81 , 37 ), + (103, 63 , 27 ), + (53 , 32 , 13 ), + ] + num_colors = len(colors_map) + + # Create a blank colored map with the same dimensions as the confidence map + colored_map = np.zeros((confidence.shape[0], confidence.shape[1], 3), dtype=np.uint8) + + # Map confidence values in [0, 1] to custom colors + for i in range(1, num_colors+1): + colored_map[(confidence>=(i-1)/num_colors) & (confidence<=i/num_colors)] = colors_map[i-1] + + # create color bar + font = cv2.FONT_HERSHEY_SIMPLEX + font_color = (0, 0, 0) # Black + if ver_hor=="hor": + bar_size = 8 + font_scale = 0.35 + font_thickness = 1 + color_bar = np.ones((bar_size, confidence.shape[1], 3))*255 + step = confidence.shape[1]//num_colors + for i in range(1,1+num_colors): + color_bar[bar_size//3:, (i-1)*step:i*step] = colors_map[i-1] + for i in range(1+num_colors): + x = i * step + x = x + step // 8 if i=bound_val[idx][0]) & \ + (improvement_map=bound_val[idx][0]) & \ + (improvement_map<=bound_val[idx][1])] = colors_map[idx] + else : + colored_map[(improvement_map>bound_val[idx][0]) & \ + (improvement_map<=bound_val[idx][1])] = colors_map[idx] + + # create color bar + font = cv2.FONT_HERSHEY_SIMPLEX + font_color = (0, 0, 0) # Black + if ver_hor=="hor": + bar_size = 15 + font_scale = 0.45 + font_thickness = 1 + color_bar = np.ones((bar_size, improvement_map.shape[1], 3))*255 + step = improvement_map.shape[1]//(num_colors+1) + for i in range(1+num_colors): + color_bar[bar_size//3:, i*step:(i+1)*step] = colors_map[i] + for i in range(1+num_colors): + x = i * step + step // 8 + y = bar_size//3*2 + cv2.putText(color_bar, str(bound_val[i][0]), (x, y), font, font_scale, font_color, font_thickness) + colored_map = np.vstack((colored_map, color_bar)) + + elif ver_hor=="ver": + bar_size = improvement_map.shape[1] // 10 + font_scale = 0.9 + font_thickness = 2 + color_bar = np.ones((improvement_map.shape[0], bar_size, 3))*255 + step = improvement_map.shape[0]//(num_colors+1) + for i in range(1+num_colors): + color_bar[i*step:(i+1)*step, bar_size//3:] = colors_map[i] + for i in range(1+num_colors): + y = i * step + step // 4 + x = 
bar_size//3*2 + cv2.putText(color_bar, str(bound_val[i][0]), (x, y), font, font_scale, font_color, font_thickness) + colored_map = np.hstack((colored_map, color_bar)) + + return colored_map.astype(np.uint8) \ No newline at end of file diff --git a/envs/enviroment_nvidia-cuda.yaml b/envs/enviroment_nvidia-cuda.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a4f64994b0c382885ab70560e6b0b67f8ddc5be6 --- /dev/null +++ b/envs/enviroment_nvidia-cuda.yaml @@ -0,0 +1,14 @@ +nvidia-cuda-runtime-cu11 +nvidia-cuda-cupti-cu11 +nvidia-cuda-nvcc-cu11 +nvidia-nvml-dev-cu11 +nvidia-cuda-nvrtc-cu11 +nvidia-nvtx-cu11 +nvidia-cuda-sanitizer-api-cu11 +nvidia-cublas-cu11 +nvidia-cufft-cu11 +nvidia-curand-cu11 +nvidia-cusolver-cu11 +nvidia-cusparse-cu11 +# nvidia-npp-cu11 +nvidia-nvjpeg-cu11 \ No newline at end of file diff --git a/envs/environment.yaml b/envs/environment.yaml new file mode 100644 index 0000000000000000000000000000000000000000..653275552b9b4ca612973047d1ff65f8c180abfb --- /dev/null +++ b/envs/environment.yaml @@ -0,0 +1,19 @@ +name: raftstereo +channels: + - pytorch + - bioconda + - defaults +dependencies: + - python=3.7.6 + - pytorch=1.7.0 + - torchvision=0.8.1 + - cudatoolkit=10.2.89 + - matplotlib + - tensorboard + - scipy + - opencv + - tqdm + - opt_einsum + - imageio + - scikit-image + - p7zip diff --git a/envs/environment_GStereo.yml b/envs/environment_GStereo.yml new file mode 100644 index 0000000000000000000000000000000000000000..e8eb2b1e7f174688673e92d69c0e80ad2f0debca --- /dev/null +++ b/envs/environment_GStereo.yml @@ -0,0 +1,311 @@ +name: raftstereo +channels: + - pytorch + - nvidia + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - absl-py=2.1.0=py38h06a4308_0 + - aiohttp=3.9.5=py38h5eee18b_0 + - aiosignal=1.2.0=pyhd3eb1b0_0 + - aom=3.6.0=h6a678d5_0 + - async-timeout=4.0.3=py38h06a4308_0 + - attrs=23.1.0=py38h06a4308_0 + - blas=1.0=mkl + - blinker=1.6.2=py38h06a4308_0 + - blosc=1.21.3=h6a678d5_0 + - bottleneck=1.3.7=py38ha9d4c09_0 + - brotli=1.0.9=h5eee18b_8 + - brotli-bin=1.0.9=h5eee18b_8 + - brotli-python=1.0.9=py38h6a678d5_8 + - brunsli=0.1=h2531618_0 + - bzip2=1.0.8=h5eee18b_6 + - c-ares=1.19.1=h5eee18b_0 + - ca-certificates=2024.7.2=h06a4308_0 + - cachetools=5.3.3=py38h06a4308_0 + - cairo=1.16.0=hb05425b_5 + - certifi=2024.7.4=py38h06a4308_0 + - cffi=1.16.0=py38h5eee18b_1 + - cfitsio=3.470=h5893167_7 + - charls=2.2.0=h2531618_0 + - charset-normalizer=3.3.2=pyhd3eb1b0_0 + - click=8.1.7=py38h06a4308_0 + - cloudpickle=3.0.0=py38h06a4308_0 + - cmake=3.14.0=h52cb24c_0 + - contourpy=1.0.5=py38hdb19cb5_0 + - cryptography=42.0.5=py38hdda0065_1 + - cuda-cudart=12.1.105=0 + - cuda-cupti=12.1.105=0 + - cuda-libraries=12.1.0=0 + - cuda-nvrtc=12.1.105=0 + - cuda-nvtx=12.1.105=0 + - cuda-opencl=12.6.37=0 + - cuda-runtime=12.1.0=0 + - cuda-version=12.6=3 + - cycler=0.11.0=pyhd3eb1b0_0 + - cyrus-sasl=2.1.28=h52b45da_1 + - cytoolz=0.12.2=py38h5eee18b_0 + - dask-core=2023.4.1=py38h06a4308_0 + - dav1d=1.2.1=h5eee18b_0 + - dbus=1.13.18=hb2f20db_0 + - eigen=3.4.0=hdb19cb5_0 + - et_xmlfile=1.1.0=py38h06a4308_0 + - expat=2.6.2=h6a678d5_0 + - ffmpeg=4.2.2=h20bf706_0 + - filelock=3.13.1=py38h06a4308_0 + - fontconfig=2.14.1=h55d465d_3 + - fonttools=4.51.0=py38h5eee18b_0 + - freetype=2.12.1=h4a9f257_0 + - frozenlist=1.4.0=py38h5eee18b_0 + - fsspec=2024.3.1=py38h06a4308_0 + - future=0.18.3=py38h06a4308_0 + - giflib=5.2.1=h5eee18b_3 + - glib=2.78.4=h6a678d5_0 + - glib-tools=2.78.4=h6a678d5_0 + - gmp=6.2.1=h295c915_3 + - 
gmpy2=2.1.2=py38heeb90bb_0 + - gnutls=3.6.15=he1e5248_0 + - google-auth=2.29.0=py38h06a4308_0 + - google-auth-oauthlib=0.5.2=py38h06a4308_0 + - graphite2=1.3.14=h295c915_1 + - grpcio=1.62.2=py38h6a678d5_0 + - gst-plugins-base=1.14.1=h6a678d5_1 + - gstreamer=1.14.1=h5eee18b_1 + - harfbuzz=4.3.0=hf52aaf7_2 + - hdf5=1.12.1=h2b7332f_3 + - icu=73.1=h6a678d5_0 + - idna=3.7=py38h06a4308_0 + - imagecodecs=2023.1.23=py38hc4b7b5f_0 + - imageio=2.33.1=py38h06a4308_0 + - importlib-metadata=7.0.1=py38h06a4308_0 + - importlib_resources=6.4.0=py38h06a4308_0 + - intel-openmp=2021.4.0=h06a4308_3561 + - jinja2=3.1.4=py38h06a4308_0 + - jpeg=9e=h5eee18b_3 + - jxrlib=1.1=h7b6447c_2 + - kiwisolver=1.4.4=py38h6a678d5_0 + - krb5=1.20.1=h143b758_1 + - lame=3.100=h7b6447c_0 + - lcms2=2.12=h3be6417_0 + - ld_impl_linux-64=2.38=h1181459_1 + - lerc=3.0=h295c915_0 + - libabseil=20240116.2=cxx17_h6a678d5_0 + - libaec=1.0.4=he6710b0_1 + - libavif=0.11.1=h5eee18b_0 + - libbrotlicommon=1.0.9=h5eee18b_8 + - libbrotlidec=1.0.9=h5eee18b_8 + - libbrotlienc=1.0.9=h5eee18b_8 + - libclang=14.0.6=default_hc6dbbc7_1 + - libclang13=14.0.6=default_he11475f_1 + - libcublas=12.1.0.26=0 + - libcufft=11.0.2.4=0 + - libcufile=1.11.0.15=0 + - libcups=2.4.2=h2d74bed_1 + - libcurand=10.3.7.37=0 + - libcurl=8.7.1=h251f7ec_0 + - libcusolver=11.4.4.55=0 + - libcusparse=12.0.2.55=0 + - libdeflate=1.17=h5eee18b_1 + - libedit=3.1.20230828=h5eee18b_0 + - libev=4.33=h7f8727e_1 + - libffi=3.4.4=h6a678d5_1 + - libgcc-ng=11.2.0=h1234567_1 + - libgfortran-ng=11.2.0=h00389a5_1 + - libgfortran5=11.2.0=h1234567_1 + - libglib=2.78.4=hdc74915_0 + - libgomp=11.2.0=h1234567_1 + - libgrpc=1.62.2=h2d74bed_0 + - libiconv=1.16=h5eee18b_3 + - libidn2=2.3.4=h5eee18b_0 + - libjpeg-turbo=2.0.0=h9bf148f_0 + - libllvm14=14.0.6=hdb19cb5_3 + - libnghttp2=1.57.0=h2d74bed_0 + - libnpp=12.0.2.50=0 + - libnvjitlink=12.1.105=0 + - libnvjpeg=12.1.1.14=0 + - libopus=1.3.1=h7b6447c_0 + - libpng=1.6.39=h5eee18b_0 + - libpq=12.17=hdbd6064_0 + - libprotobuf=4.25.3=he621ea3_0 + - libssh2=1.11.0=h251f7ec_0 + - libstdcxx-ng=11.2.0=h1234567_1 + - libtasn1=4.19.0=h5eee18b_0 + - libtiff=4.5.1=h6a678d5_0 + - libunistring=0.9.10=h27cfd23_0 + - libuuid=1.41.5=h5eee18b_0 + - libvpx=1.7.0=h439df22_0 + - libwebp-base=1.3.2=h5eee18b_0 + - libxcb=1.15=h7f8727e_0 + - libxkbcommon=1.0.1=h097e994_2 + - libxml2=2.13.1=hfdd30dd_2 + - libzopfli=1.0.3=he6710b0_0 + - llvm-openmp=14.0.6=h9e868ea_0 + - locket=1.0.0=py38h06a4308_0 + - lz4-c=1.9.4=h6a678d5_1 + - markdown=3.4.1=py38h06a4308_0 + - markupsafe=2.1.3=py38h5eee18b_0 + - matplotlib=3.7.2=py38h06a4308_0 + - matplotlib-base=3.7.2=py38h1128e8f_0 + - mkl=2021.4.0=h06a4308_640 + - mkl-service=2.4.0=py38h7f8727e_0 + - mkl_fft=1.3.1=py38hd3c417c_0 + - mkl_random=1.2.2=py38h51133e4_0 + - mpc=1.1.0=h10f8cd9_1 + - mpfr=4.0.2=hb69a4c5_1 + - mpmath=1.3.0=py38h06a4308_0 + - multidict=6.0.4=py38h5eee18b_0 + - mysql=5.7.24=h721c034_2 + - ncurses=6.4=h6a678d5_0 + - nettle=3.7.3=hbbd107a_1 + - networkx=3.1=py38h06a4308_0 + - ninja=1.10.2=h06a4308_5 + - ninja-base=1.10.2=hd09550d_5 + - numexpr=2.8.4=py38he184ba9_0 + - numpy=1.24.3=py38h14f4228_0 + - numpy-base=1.24.3=py38h31eccc5_0 + - oauthlib=3.2.2=py38h06a4308_0 + - opencv=4.10.0=py38h0a8ef67_0 + - openh264=2.1.1=h4ff587b_0 + - openjpeg=2.5.2=he7f1fd0_0 + - openpyxl=3.1.5=py38h5eee18b_0 + - openssl=3.0.14=h5eee18b_0 + - opt_einsum=3.3.0=pyhd3eb1b0_1 + - p7zip=16.02=h6a678d5_0 + - packaging=24.1=py38h06a4308_0 + - pandas=2.0.3=py38h1128e8f_0 + - partd=1.4.1=py38h06a4308_0 + - pcre2=10.42=hebb0a14_1 + - 
pillow=10.4.0=py38h5eee18b_0 + - pip=24.2=py38h06a4308_0 + - pixman=0.40.0=h7f8727e_1 + - platformdirs=3.10.0=py38h06a4308_0 + - ply=3.11=py38_0 + - pooch=1.7.0=py38h06a4308_0 + - protobuf=4.25.3=py38h12ddb61_0 + - pyasn1=0.4.8=pyhd3eb1b0_0 + - pyasn1-modules=0.2.8=py_0 + - pycparser=2.21=pyhd3eb1b0_0 + - pyjwt=2.8.0=py38h06a4308_0 + - pyopenssl=24.2.1=py38h06a4308_0 + - pyparsing=3.0.9=py38h06a4308_0 + - pyqt=5.15.10=py38h6a678d5_0 + - pyqt5-sip=12.13.0=py38h5eee18b_0 + - pysocks=1.7.1=py38h06a4308_0 + - python=3.8.19=h955ad1f_0 + - python-dateutil=2.9.0post0=py38h06a4308_2 + - python-tzdata=2023.3=pyhd3eb1b0_0 + - pytorch=2.4.0=py3.8_cuda12.1_cudnn9.1.0_0 + - pytorch-cuda=12.1=ha16c6d3_5 + - pytorch-mutex=1.0=cuda + - pytz=2024.1=py38h06a4308_0 + - pywavelets=1.4.1=py38h5eee18b_0 + - pyyaml=6.0.1=py38h5eee18b_0 + - qt-main=5.15.2=h53bd1ea_10 + - re2=2022.04.01=h295c915_0 + - readline=8.2=h5eee18b_0 + - requests=2.32.3=py38h06a4308_0 + - requests-oauthlib=2.0.0=py38h06a4308_0 + - rhash=1.4.3=hdbd6064_0 + - rsa=4.7.2=pyhd3eb1b0_1 + - scikit-image=0.19.3=py38h6a678d5_1 + - scipy=1.10.1=py38h14f4228_0 + - setuptools=72.1.0=py38h06a4308_0 + - sip=6.7.12=py38h6a678d5_0 + - six=1.16.0=pyhd3eb1b0_1 + - snappy=1.2.1=h6a678d5_0 + - sqlite=3.45.3=h5eee18b_0 + - sympy=1.12=py38h06a4308_0 + - tensorboard=2.12.1=py38h06a4308_0 + - tensorboard-data-server=0.7.0=py38h52d8a92_0 + - tensorboard-plugin-wit=1.8.1=py38h06a4308_0 + - tifffile=2023.4.12=py38h06a4308_0 + - tk=8.6.14=h39e8969_0 + - tomli=2.0.1=py38h06a4308_0 + - toolz=0.12.0=py38h06a4308_0 + - torchaudio=2.4.0=py38_cu121 + - torchtriton=3.0.0=py38 + - torchvision=0.19.0=py38_cu121 + - tornado=6.4.1=py38h5eee18b_0 + - tqdm=4.66.4=py38h2f386ee_0 + - typing-extensions=4.11.0=py38h06a4308_0 + - typing_extensions=4.11.0=py38h06a4308_0 + - unicodedata2=15.1.0=py38h5eee18b_0 + - urllib3=2.2.2=py38h06a4308_0 + - werkzeug=3.0.3=py38h06a4308_0 + - wheel=0.43.0=py38h06a4308_0 + - x264=1!157.20191217=h7b6447c_0 + - xz=5.4.6=h5eee18b_1 + - yaml=0.2.5=h7b6447c_0 + - yarl=1.9.3=py38h5eee18b_0 + - zfp=1.0.0=h6a678d5_0 + - zipp=3.17.0=py38h06a4308_0 + - zlib=1.2.13=h5eee18b_1 + - zstd=1.5.5=hc292b87_2 + - pip: + - accelerate==0.33.0 + - aiofiles==23.2.1 + - annotated-types==0.7.0 + - antlr4-python3-runtime==4.9.3 + - anyio==4.4.0 + - asttokens==2.4.1 + - backcall==0.2.0 + - corr-sampler==0.0.0 + - decorator==5.1.1 + - diffusers==0.30.0 + - einops==0.8.0 + - exceptiongroup==1.2.2 + - executing==2.0.1 + - fastapi==0.112.1 + - ffmpy==0.4.0 + - gradio==4.41.0 + - gradio-client==1.3.0 + - gradio-imageslider==0.0.18 + - h11==0.14.0 + - httpcore==1.0.5 + - httpx==0.27.0 + - huggingface-hub==0.24.5 + - ipython==8.12.3 + - jedi==0.19.1 + - joblib==1.4.2 + - markdown-it-py==3.0.0 + - matplotlib-inline==0.1.7 + - mdurl==0.1.2 + - omegaconf==2.3.0 + - orjson==3.10.7 + - parso==0.8.4 + - pexpect==4.9.0 + - pickleshare==0.7.5 + - prompt-toolkit==3.0.47 + - psutil==6.0.0 + - ptyprocess==0.7.0 + - pure-eval==0.2.3 + - pydantic==2.8.2 + - pydantic-core==2.20.1 + - pydub==0.25.1 + - pyglet==1.5.29 + - pygments==2.18.0 + - python-multipart==0.0.9 + - regex==2024.7.24 + - rich==13.7.1 + - roma==1.5.0 + - ruff==0.6.1 + - safetensors==0.4.4 + - scikit-learn==1.3.2 + - semantic-version==2.10.0 + - shellingham==1.5.4 + - sniffio==1.3.1 + - stack-data==0.6.3 + - starlette==0.38.2 + - tabulate==0.9.0 + - threadpoolctl==3.5.0 + - tokenizers==0.19.1 + - tomlkit==0.12.0 + - traitlets==5.14.3 + - transformers==4.44.0 + - trimesh==4.4.6 + - typer==0.12.4 + - uvicorn==0.30.6 + - 
wcwidth==0.2.13 + - websockets==12.0 +prefix: /home/xxx/anaconda3/envs/raftstereo diff --git a/envs/environment_cuda11.yaml b/envs/environment_cuda11.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1c875cde945e3b647faac4aa09e4c1e4f25a01df --- /dev/null +++ b/envs/environment_cuda11.yaml @@ -0,0 +1,16 @@ +name: raftstereo +dependencies: + - python=3.8 + # - pytorch=1.11.0 + # - cudatoolkit=11.3 + # - torchvision + - matplotlib + # - tensorboard + - scipy + - opencv + - tqdm + - opt_einsum + - imageio + - scikit-image + - p7zip + - pandas diff --git a/envs/requirements.txt b/envs/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9a95b7a4f8b4ffd197f978737a9063ea926b104 --- /dev/null +++ b/envs/requirements.txt @@ -0,0 +1,5 @@ +torch==1.10.1+cu111 +torchvision==0.11.2+cu111 +torchaudio==0.10.1 +tensorboard +openpyxl \ No newline at end of file diff --git a/envs/requirements_mix.txt b/envs/requirements_mix.txt new file mode 100644 index 0000000000000000000000000000000000000000..99a00a412952ef0d9c3a1efcab549163636c28eb --- /dev/null +++ b/envs/requirements_mix.txt @@ -0,0 +1,24 @@ +accelerate>=0.22.0 +diffusers>=0.25.0 +matplotlib +scipy +torch>=2.0.1 +torchvision>=0.15.2 +transformers>=4.32.1 +omegaconf +pandas +tabulate +scikit-learn +torch +torchvision +roma +gradio +matplotlib +tqdm +opencv-python +scipy +einops +trimesh +tensorboard +pyglet<2 +huggingface-hub[torch]>=0.22 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..d147ed6109a3676ad607a21bdf00dc38e25c3f78 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,44 @@ +attrdict==2.0.1 +einops==0.8.1 +fire==0.7.0 +gradio==5.38.0 +gradio_imageslider==0.0.20 +h5py==3.14.0 +html4vision==0.5.2 +huggingface_hub==0.29.1 +imageio==2.37.0 +imgaug==0.4.0 +iopath==0.1.10 +ipywidgets==8.1.5 +kapture==1.1.10 +kapture_localization==1.1.10 +matplotlib==3.10.3 +numpy +onnxruntime==1.22.1 +open3d==0.19.0 +opencv_python==4.11.0.86 +opt_einsum==3.4.0 +packaging==25.0 +pandas==2.3.1 +Pillow==11.3.0 +pillow_heif==1.0.0 +plyfile==1.1.2 +poselib==2.0.4 +pycolmap==3.12.3 +pyrender==0.1.45 +quaternion==3.5.2.post4 +roma==1.5.3 +scikit_learn==1.7.1 +scipy==1.15.0 +setuptools==75.1.0 +scikit-image>=0.0 +tensorboardX==2.6.4 +tensorflow==2.19.0 +termcolor==3.1.0 +timm==1.0.17 +torch==2.5.1 +torchvision==0.20.1 +tqdm==4.67.1 +trimesh +xformers +huggingface_hub \ No newline at end of file diff --git a/sampler/__init__.py b/sampler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/sampler/sampler.cpp b/sampler/sampler.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b1ed09b9525ce32c2b342b53cf60182502c026e0 --- /dev/null +++ b/sampler/sampler.cpp @@ -0,0 +1,51 @@ +#include <torch/extension.h> + +#include <vector> + +// CUDA forward declarations + + +std::vector<torch::Tensor> sampler_cuda_forward( + torch::Tensor volume, + torch::Tensor coords, + int radius); + +std::vector<torch::Tensor> sampler_cuda_backward( + torch::Tensor volume, + torch::Tensor coords, + torch::Tensor corr_grad, + int radius); + + +#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +std::vector<torch::Tensor> sampler_forward( + torch::Tensor volume, + torch::Tensor coords, + int radius) { + CHECK_INPUT(volume); + CHECK_INPUT(coords); + + 
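+  // shapes (see sampler_kernel.cu): volume is (B, H, W, W2), coords is (B, 2, H, W); the kernel returns a (B, 2*radius+1, H, W) correlation tensor gathered with linear interpolation along the last dimension of volume.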
return sampler_cuda_forward(volume, coords, radius); +} + +std::vector<torch::Tensor> sampler_backward( + torch::Tensor volume, + torch::Tensor coords, + torch::Tensor corr_grad, + int radius) { + CHECK_INPUT(volume); + CHECK_INPUT(coords); + CHECK_INPUT(corr_grad); + + auto volume_grad = sampler_cuda_backward(volume, coords, corr_grad, radius); + return {volume_grad}; +} + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &sampler_forward, "SAMPLER forward"); + m.def("backward", &sampler_backward, "SAMPLER backward"); +} \ No newline at end of file diff --git a/sampler/sampler_kernel.cu b/sampler/sampler_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..7dee2a8d175b99c63f1428a7bb87b5071f935f96 --- /dev/null +++ b/sampler/sampler_kernel.cu @@ -0,0 +1,167 @@ +#include <torch/extension.h> +#include <ATen/ATen.h> +#include <ATen/NativeFunctions.h> +#include <ATen/Parallel.h> +#include <cuda.h> +#include <cuda_runtime.h> + + +#include <vector> +#include <iostream> +#include <math.h> + +#define BLOCK 16 + +__forceinline__ __device__ bool within_bounds(int h, int w, int H, int W) { + return h >= 0 && h < H && w >= 0 && w < W; +} + +template <typename scalar_t> +__global__ void sampler_forward_kernel( + const torch::PackedTensorAccessor32<scalar_t,4,torch::RestrictPtrTraits> volume, + const torch::PackedTensorAccessor32<scalar_t,4,torch::RestrictPtrTraits> coords, + torch::PackedTensorAccessor32<scalar_t,4,torch::RestrictPtrTraits> corr, + int r) +{ + // batch index + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int n = blockIdx.z; + + const int h1 = volume.size(1); + const int w1 = volume.size(2); + const int w2 = volume.size(3); + + if (!within_bounds(y, x, h1, w1)) { + return; + } + + float x0 = coords[n][0][y][x]; + float y0 = coords[n][1][y][x]; + + float dx = x0 - floor(x0); + float dy = y0 - floor(y0); + + int rd = 2*r + 1; + for (int i=0; i<rd+1; i++) { + int x1 = static_cast<int>(floor(x0)) - r + i; + + if (within_bounds(0, x1, 1, w2)) { + scalar_t s = volume[n][y][x][x1]; + + if (i > 0) + corr[n][i-1][y][x] += s * scalar_t(dx); + + if (i < rd) + corr[n][i][y][x] += s * scalar_t((1.0f-dx)); + + } + } +} + + +template <typename scalar_t> +__global__ void sampler_backward_kernel( + const torch::PackedTensorAccessor32<scalar_t,4,torch::RestrictPtrTraits> coords, + const torch::PackedTensorAccessor32<scalar_t,4,torch::RestrictPtrTraits> corr_grad, + torch::PackedTensorAccessor32<scalar_t,4,torch::RestrictPtrTraits> volume_grad, + int r) +{ + // batch index + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int n = blockIdx.z; + + const int h1 = volume_grad.size(1); + const int w1 = volume_grad.size(2); + const int w2 = volume_grad.size(3); + + if (!within_bounds(y, x, h1, w1)) { + return; + } + + float x0 = coords[n][0][y][x]; + float y0 = coords[n][1][y][x]; + + float dx = x0 - floor(x0); + float dy = y0 - floor(y0); + + int rd = 2*r + 1; + for (int i=0; i<rd+1; i++) { + int x1 = static_cast<int>(floor(x0)) - r + i; + + if (within_bounds(0, x1, 1, w2)) { + scalar_t g = 0.0; + + if (i > 0) + g += corr_grad[n][i-1][y][x] * scalar_t(dx); + + if (i < rd) + g += corr_grad[n][i][y][x] * scalar_t((1.0f-dx)); + + volume_grad[n][y][x][x1] += g; + } + } +} + +std::vector<torch::Tensor> sampler_cuda_forward( + torch::Tensor volume, + torch::Tensor coords, + int radius) +{ + const auto batch_size = volume.size(0); + const auto ht = volume.size(1); + const auto wd = volume.size(2); + + const dim3 blocks((wd + BLOCK - 1) / BLOCK, + (ht + BLOCK - 1) / BLOCK, + batch_size); + + const dim3 threads(BLOCK, BLOCK); + + auto opts = volume.options(); + torch::Tensor corr = torch::zeros( + {batch_size, 2*radius+1, ht, wd}, opts); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(volume.type(), "sampler_forward_kernel", ([&] { + sampler_forward_kernel<scalar_t><<<blocks, threads>>>( + volume.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(), + coords.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(), + 
corr.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(), + radius); + })); + + return {corr}; + +} + +std::vector<torch::Tensor> sampler_cuda_backward( + torch::Tensor volume, + torch::Tensor coords, + torch::Tensor corr_grad, + int radius) +{ + const auto batch_size = volume.size(0); + const auto ht = volume.size(1); + const auto wd = volume.size(2); + + auto volume_grad = torch::zeros_like(volume); + + const dim3 blocks((wd + BLOCK - 1) / BLOCK, + (ht + BLOCK - 1) / BLOCK, + batch_size); + + const dim3 threads(BLOCK, BLOCK); + + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(volume.type(), "sampler_backward_kernel", ([&] { + sampler_backward_kernel<scalar_t><<<blocks, threads>>>( + coords.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(), + corr_grad.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(), + volume_grad.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(), + radius); + })); + + return {volume_grad}; +} + diff --git a/sampler/setup.py b/sampler/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..51843eaca6ded572a76b1cceac37b3b4f290a9c7 --- /dev/null +++ b/sampler/setup.py @@ -0,0 +1,28 @@ +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +extra_compile_flags = {} +gencodes = ['-arch=sm_50', + '-gencode', 'arch=compute_50,code=sm_50', + '-gencode', 'arch=compute_52,code=sm_52', + '-gencode', 'arch=compute_60,code=sm_60', + '-gencode', 'arch=compute_61,code=sm_61', + '-gencode', 'arch=compute_70,code=sm_70', + '-gencode', 'arch=compute_75,code=sm_75', + '-gencode', 'arch=compute_75,code=compute_75',] + +# extra_compile_flags['nvcc'] = gencodes + +setup( + name='corr_sampler', + ext_modules=[ + CUDAExtension('corr_sampler', [ + 'sampler.cpp', 'sampler_kernel.cu', + ], + extra_compile_args=extra_compile_flags) + ], + cmdclass={ + 'build_ext': BuildExtension + }) + + diff --git a/script/demo_stereo_raftstereo.sh b/script/demo_stereo_raftstereo.sh new file mode 100644 index 0000000000000000000000000000000000000000..cc6e3e2f4fbfb0e572073513942e14c5b010703c --- /dev/null +++ b/script/demo_stereo_raftstereo.sh @@ -0,0 +1,11 @@ +#!/usr/bin/bash + +export LOG_ROOT="/data5/yao/runs/vis" +export TB_ROOT="/data5/yao/runs/tboard" +export CKPOINT_ROOT="/data5/yao/runs/ckpoint" + + + +# CUDA_VISIBLE_DEVICES=4 python3 demo_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32_20241024_172455/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_F --sv_root /data5/yao/runs/vis --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "RefineSigmoidPreMonoBatch32" + +CUDA_VISIBLE_DEVICES=4 python3 demo_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --sv_root /data5/yao/runs/vis/demo/ --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --is_test --test_exp_name "RefineSigmoidPreMonoBatch48ConfDim" diff --git a/script/download_datasets.sh b/script/download_datasets.sh new file mode 100644 index 0000000000000000000000000000000000000000..c121b75c78ef55ce31a5fec681772fc75dace49b --- /dev/null +++ b/script/download_datasets.sh @@ -0,0 +1,24 @@ +mkdir datasets/Middlebury -p +cd datasets/Middlebury/ +wget 
https://www.dropbox.com/s/fn8siy5muak3of3/official_train.txt -P MiddEval3/ +wget https://vision.middlebury.edu/stereo/submit3/zip/MiddEval3-data-Q.zip +unzip MiddEval3-data-Q.zip +wget https://vision.middlebury.edu/stereo/submit3/zip/MiddEval3-GT0-Q.zip +unzip MiddEval3-GT0-Q.zip +wget https://vision.middlebury.edu/stereo/submit3/zip/MiddEval3-data-H.zip +unzip MiddEval3-data-H.zip +wget https://vision.middlebury.edu/stereo/submit3/zip/MiddEval3-GT0-H.zip +unzip MiddEval3-GT0-H.zip +wget https://vision.middlebury.edu/stereo/submit3/zip/MiddEval3-data-F.zip +unzip MiddEval3-data-F.zip +wget https://vision.middlebury.edu/stereo/submit3/zip/MiddEval3-GT0-F.zip +unzip MiddEval3-GT0-F.zip +rm *.zip +cd ../.. + +mkdir datasets/ETH3D/two_view_testing -p +cd datasets/ETH3D/two_view_testing +wget https://www.eth3d.net/data/two_view_test.7z +echo "Unzipping two_view_test.7z using p7zip (installed from environment.yaml)" +7za x two_view_test.7z +cd ../../.. \ No newline at end of file diff --git a/script/download_middlebury_2014.sh b/script/download_middlebury_2014.sh new file mode 100644 index 0000000000000000000000000000000000000000..009c8c274befebb2549a8f3397e45f3a7956771f --- /dev/null +++ b/script/download_middlebury_2014.sh @@ -0,0 +1,50 @@ +mkdir datasets/Middlebury/2014 -p +cd datasets/Middlebury/2014 +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Adirondack-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Adirondack-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Backpack-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Backpack-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Bicycle1-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Bicycle1-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Cable-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Cable-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Classroom1-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Classroom1-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Couch-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Couch-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Flowers-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Flowers-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Jadeplant-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Jadeplant-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Mask-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Mask-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Motorcycle-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Motorcycle-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Piano-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Piano-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Pipes-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Pipes-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Playroom-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Playroom-perfect.zip +wget 
https://vision.middlebury.edu/stereo/data/scenes2014/zip/Playtable-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Playtable-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Recycle-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Recycle-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Shelves-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Shelves-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Shopvac-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Shopvac-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Sticks-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Sticks-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Storage-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Storage-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Sword1-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Sword1-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Sword2-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Sword2-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Umbrella-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Umbrella-perfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Vintage-imperfect.zip +wget https://vision.middlebury.edu/stereo/data/scenes2014/zip/Vintage-perfect.zip +unzip \*.zip +rm -vf *.zip \ No newline at end of file diff --git a/script/download_models.sh b/script/download_models.sh new file mode 100644 index 0000000000000000000000000000000000000000..81582c289cd8b6f1183e231cb8de7fcb9c9c3ca9 --- /dev/null +++ b/script/download_models.sh @@ -0,0 +1,6 @@ +#!/bin/bash +mkdir models -p +cd models +wget https://www.dropbox.com/s/ftveifyqcomiwaq/models.zip +unzip models.zip +rm models.zip -f diff --git a/script/evaluate_stereo_raftstereo.sh b/script/evaluate_stereo_raftstereo.sh new file mode 100644 index 0000000000000000000000000000000000000000..f931e5d7e195f19899a30436ce6985923ef0bedd --- /dev/null +++ b/script/evaluate_stereo_raftstereo.sh @@ -0,0 +1,233 @@ +# /usr/bin/bash + +export LOG_ROOT="/data5/yao/runs/log" +export TB_ROOT="/data5/yao/runs/tboard" +export CKPOINT_ROOT="/data5/yao/runs/ckpoint" + +export CUDA_VISIBLE_DEVICES=3 + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/raftstereo_20240821_142156/raftstereo.pth --dataset middlebury_H --model_name "RaftStereo" --test_exp_name "final" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/raftstereo_20240821_142156/90000_raftstereo.pth --dataset middlebury_H --model_name "RaftStereo" --test_exp_name "90000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/raftstereo_20240821_142156/80000_raftstereo.pth --dataset middlebury_H --model_name "RaftStereo" --test_exp_name "80000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/raftstereo_20240821_142156/70000_raftstereo.pth --dataset middlebury_H --model_name "RaftStereo" --test_exp_name "70000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt pretrained/rebuild-raft2-noSF-singleMask.pth --slant "slant_local" --slant_norm --geo_estimator "geometry_conv" --dataset kitti +# 
python3 evaluate_stereo_raftstereo.py --restore_ckpt pretrained/rebuild-raft2-noSF-singleMask.pth --slant "slant_local" --slant_norm --geo_estimator "geometry_conv" --dataset eth3d + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDisp_20240821_142314/RaftStereoDisp.pth --dataset middlebury_H --model_name "RaftStereoDisp" --test_exp_name "final" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDisp_20240821_142314/90000_RaftStereoDisp.pth --dataset middlebury_H --model_name "RaftStereoDisp" --test_exp_name "90000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDisp_20240821_142314/80000_RaftStereoDisp.pth --dataset middlebury_H --model_name "RaftStereoDisp" --test_exp_name "80000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDisp_20240821_142314/70000_RaftStereoDisp.pth --dataset middlebury_H --model_name "RaftStereoDisp" --test_exp_name "70000itr" + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoMast3r_deep3_20240902_201411/RaftStereoMast3r_deep3.pth --mast3r_model_path "/data5/yao/pretrained/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth" --dataset middlebury_H --model_name "RAFTStereoMast3r" --test_exp_name "final" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoMast3r_deep3_20240902_201411/90000_RaftStereoMast3r_deep3.pth --mast3r_model_path "/data5/yao/pretrained/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth" --dataset middlebury_H --model_name "RAFTStereoMast3r" --test_exp_name "90000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoMast3r_deep3_20240902_201411/80000_RaftStereoMast3r_deep3.pth --mast3r_model_path "/data5/yao/pretrained/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth" --dataset middlebury_H --model_name "RAFTStereoMast3r" --test_exp_name "80000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoMast3r_deep3_20240902_201411/70000_RaftStereoMast3r_deep3.pth --mast3r_model_path "/data5/yao/pretrained/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth" --dataset middlebury_H --model_name "RAFTStereoMast3r" --test_exp_name "70000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoMast3r_deep3_20240902_201411/60000_RaftStereoMast3r_deep3.pth --mast3r_model_path "/data5/yao/pretrained/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth" --dataset middlebury_H --model_name "RAFTStereoMast3r" --test_exp_name "60000itr" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthAny_20240908_125231/RaftStereoDepthAny.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthAny" --test_exp_name "final" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthAny_20240908_125231/90000_RaftStereoDepthAny.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthAny" --test_exp_name "90000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthAny_20240908_125231/80000_RaftStereoDepthAny.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthAny" --test_exp_name "80000itr" +# python3 evaluate_stereo_raftstereo.py 
--restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthAny_20240908_125231/70000_RaftStereoDepthAny.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthAny" --test_exp_name "70000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthAny_20240908_125231/60000_RaftStereoDepthAny.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthAny" --test_exp_name "60000itr" + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthAny_20240908_125231/RaftStereoDepthAny.pth --depthany_model_dir "/data5/yao/pretrained" --dataset booster --model_name "RAFTStereoDepthAny" --test_exp_name "booster-final" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthAny_20240908_125231/90000_RaftStereoDepthAny.pth --depthany_model_dir "/data5/yao/pretrained" --dataset booster --model_name "RAFTStereoDepthAny" --test_exp_name "booster-90000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthAny_20240908_125231/80000_RaftStereoDepthAny.pth --depthany_model_dir "/data5/yao/pretrained" --dataset booster --model_name "RAFTStereoDepthAny" --test_exp_name "booster-80000itr" + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthAny_20240908_125231/RaftStereoDepthAny.pth --depthany_model_dir "/data5/yao/pretrained" --dataset eth3d --model_name "RAFTStereoDepthAny" --test_exp_name "eth3d-final" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthAny_20240908_125231/RaftStereoDepthAny.pth --depthany_model_dir "/data5/yao/pretrained" --dataset kitti2015 --model_name "RAFTStereoDepthAny" --test_exp_name "kitti2015-final" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthAny_20240908_125231/RaftStereoDepthAny.pth --depthany_model_dir "/data5/yao/pretrained" --dataset kitti --model_name "RAFTStereoDepthAny" --test_exp_name "kitti-final" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoMast3r_fineNoPadding_20240908_133009/RaftStereoMast3r_fineNoPadding.pth --mast3r_model_path "/data5/yao/pretrained/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth" --dataset middlebury_H --model_name "RAFTStereoMast3r" --test_exp_name "final" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoMast3r_fineNoPadding_20240908_133009/150000_RaftStereoMast3r_fineNoPadding.pth --mast3r_model_path "/data5/yao/pretrained/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth" --dataset middlebury_H --model_name "RAFTStereoMast3r" --test_exp_name "150000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoMast3r_fineNoPadding_20240908_133009/100000_RaftStereoMast3r_fineNoPadding.pth --mast3r_model_path "/data5/yao/pretrained/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth" --dataset middlebury_H --model_name "RAFTStereoMast3r" --test_exp_name "100000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoMast3r_fineNoPadding_20240908_133009/50000_RaftStereoMast3r_fineNoPadding.pth --mast3r_model_path "/data5/yao/pretrained/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth" --dataset middlebury_H --model_name "RAFTStereoMast3r" --test_exp_name "50000itr" + + +# python3 
evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoNoCTX_20240911_155418/RaftStereoNoCTX.pth --dataset middlebury_H --model_name "RaftStereoNoCTX" --test_exp_name "final" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoNoCTX_20240911_155418/90000_RaftStereoNoCTX.pth --dataset middlebury_H --model_name "RaftStereoNoCTX" --test_exp_name "90000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoNoCTX_20240911_155418/80000_RaftStereoNoCTX.pth --dataset middlebury_H --model_name "RaftStereoNoCTX" --test_exp_name "80000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoNoCTX_20240911_155418/70000_RaftStereoNoCTX.pth --dataset middlebury_H --model_name "RaftStereoNoCTX" --test_exp_name "70000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoNoCTX_20240911_155418/60000_RaftStereoNoCTX.pth --dataset middlebury_H --model_name "RaftStereoNoCTX" --test_exp_name "60000itr" + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoNoCTX_20240911_155418/RaftStereoNoCTX.pth --dataset booster --model_name "RaftStereoNoCTX" --test_exp_name "booster-final" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoNoCTX_20240911_155418/90000_RaftStereoNoCTX.pth --dataset booster --model_name "RaftStereoNoCTX" --test_exp_name "booster-90000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoNoCTX_20240911_155418/80000_RaftStereoNoCTX.pth --dataset booster --model_name "RaftStereoNoCTX" --test_exp_name "booster-80000itr" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthFusion_20240913_234716/RaftStereoDepthFusion.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthFusion" --test_exp_name "final" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthFusion_20240913_234716/90000_RaftStereoDepthFusion.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthFusion" --test_exp_name "90000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthFusion_20240913_234716/80000_RaftStereoDepthFusion.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthFusion" --test_exp_name "80000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthFusion_20240913_234716/70000_RaftStereoDepthFusion.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthFusion" --test_exp_name "70000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthFusion_20240913_234716/60000_RaftStereoDepthFusion.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthFusion" --test_exp_name "60000itr" + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBeta_20240917_122247/RaftStereoDepthBeta.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-1,-1), (1,1), (1,-1), (-1,1)" --modulation_ratio 1.0 --test_exp_name "final" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt 
/data5/yao/runs/ckpoint/RaftStereoDepthBeta_20240917_122247/90000_RaftStereoDepthBeta.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-1,-1), (1,1), (1,-1), (-1,1)" --modulation_ratio 1.0 --test_exp_name "90000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBeta_20240917_122247/80000_RaftStereoDepthBeta.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-1,-1), (1,1), (1,-1), (-1,1)" --modulation_ratio 1.0 --test_exp_name "80000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBeta_20240917_122247/70000_RaftStereoDepthBeta.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-1,-1), (1,1), (1,-1), (-1,1)" --modulation_ratio 1.0 --test_exp_name "70000itr" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBeta_20240917_122247/60000_RaftStereoDepthBeta.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-1,-1), (1,1), (1,-1), (-1,1)" --modulation_ratio 1.0 --test_exp_name "60000itr" + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3_20240917_122628/RaftStereoDepthBetaK3.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --test_exp_name "final-N3" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3_20240917_122628/90000_RaftStereoDepthBetaK3.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --test_exp_name "90000itr-N3" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3_20240917_122628/80000_RaftStereoDepthBetaK3.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --test_exp_name "80000itr-N3" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3_20240917_122628/70000_RaftStereoDepthBetaK3.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --test_exp_name "70000itr-N3" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3_20240917_122628/60000_RaftStereoDepthBetaK3.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --test_exp_name "60000itr-N3" + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3Sigmoid_20240920_170304/RaftStereoDepthBetaK3Sigmoid.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --modulation_alg "sigmoid" --test_exp_name 
"final-N3-Sigmoid" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3Sigmoid_20240920_170304/90000_RaftStereoDepthBetaK3Sigmoid.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --modulation_alg "sigmoid" --test_exp_name "90000itr-N3-Sigmoid" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3Sigmoid_20240920_170304/80000_RaftStereoDepthBetaK3Sigmoid.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --modulation_alg "sigmoid" --test_exp_name "80000itr-N3-Sigmoid" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3Sigmoid_20240920_170304/70000_RaftStereoDepthBetaK3Sigmoid.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --modulation_alg "sigmoid" --test_exp_name "70000itr-N3-Sigmoid" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3Sigmoid_20240920_170304/60000_RaftStereoDepthBetaK3Sigmoid.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --modulation_alg "sigmoid" --test_exp_name "60000itr-N3-Sigmoid" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53_20240920_165346/RaftStereoDepthBetaK53.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "final-N53" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53_20240920_165346/90000_RaftStereoDepthBetaK53.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "90000itr-N53" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53_20240920_165346/80000_RaftStereoDepthBetaK53.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "80000itr-N53" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53_20240920_165346/70000_RaftStereoDepthBetaK53.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "70000itr-N53" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53_20240920_165346/60000_RaftStereoDepthBetaK53.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), 
(-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "60000itr-N53" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK9753_20240923_221809/RaftStereoDepthBetaK9753.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-9,-9), (9,9), (9,-9), (-9,9), (-7,0), (7,0), (0,-7), (0,7), (-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "final-N9753" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK9753_20240923_221809/90000_RaftStereoDepthBetaK9753.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-9,-9), (9,9), (9,-9), (-9,9), (-7,0), (7,0), (0,-7), (0,7), (-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "90000itr-N9753" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK9753_20240923_221809/80000_RaftStereoDepthBetaK9753.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-9,-9), (9,9), (9,-9), (-9,9), (-7,0), (7,0), (0,-7), (0,7), (-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "80000itr-N9753" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK9753_20240923_221809/70000_RaftStereoDepthBetaK9753.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-9,-9), (9,9), (9,-9), (-9,9), (-7,0), (7,0), (0,-7), (0,7), (-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "70000itr-N9753" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK9753_20240923_221809/60000_RaftStereoDepthBetaK9753.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-9,-9), (9,9), (9,-9), (-9,9), (-7,0), (7,0), (0,-7), (0,7), (-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "60000itr-N9753" + + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3NoLBP_20240923_223033/RaftStereoDepthBetaK3NoLBP.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaNoLBP" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --test_exp_name "final-NoLBP" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3NoLBP_20240923_223033/90000_RaftStereoDepthBetaK3NoLBP.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaNoLBP" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --test_exp_name "90000itr-NoLBP" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3NoLBP_20240923_223033/80000_RaftStereoDepthBetaK3NoLBP.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaNoLBP" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --test_exp_name "80000itr-NoLBP" +# python3 
evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3NoLBP_20240923_223033/70000_RaftStereoDepthBetaK3NoLBP.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaNoLBP" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --test_exp_name "70000itr-NoLBP" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3NoLBP_20240923_223033/60000_RaftStereoDepthBetaK3NoLBP.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaNoLBP" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --test_exp_name "60000itr-NoLBP" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3NoLBPDim32_20240927_005940/RaftStereoDepthBetaK3NoLBPDim32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaNoLBP" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --noLBP_hidden_dim 32 --test_exp_name "final-NoLBPDim32" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3NoLBPDim32_20240927_005940/90000_RaftStereoDepthBetaK3NoLBPDim32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaNoLBP" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --noLBP_hidden_dim 32 --test_exp_name "90000itr-NoLBPDim32" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3NoLBPDim32_20240927_005940/80000_RaftStereoDepthBetaK3NoLBPDim32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaNoLBP" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --noLBP_hidden_dim 32 --test_exp_name "80000itr-NoLBPDim32" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3NoLBPDim32_20240927_005940/70000_RaftStereoDepthBetaK3NoLBPDim32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaNoLBP" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --noLBP_hidden_dim 32 --test_exp_name "70000itr-NoLBPDim32" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3NoLBPDim32_20240927_005940/60000_RaftStereoDepthBetaK3NoLBPDim32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaNoLBP" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --noLBP_hidden_dim 32 --test_exp_name "60000itr-NoLBPDim32" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK131197531_20240927_005250/RaftStereoDepthBetaK131197531.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-13,0), (13,0), (0,-13), (0,13), (-11,-11), (11,11), (11,-11), (-11,11), (-9,0), (9,0), (0,-9), (0,9), (-7,-7), (7,7), (7,-7), (-7,7), (-5,0), (5,0), (0,-5), (0,5), (-3,-3), (3,3), (3,-3), (-3,3), (-1,0), (1,0), (0,-1), (0,1)" --modulation_ratio 1.0 --test_exp_name "final-N131197531" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt 
/data5/yao/runs/ckpoint/RaftStereoDepthBetaK131197531_20240927_005250/90000_RaftStereoDepthBetaK131197531.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-13,0), (13,0), (0,-13), (0,13), (-11,-11), (11,11), (11,-11), (-11,11), (-9,0), (9,0), (0,-9), (0,9), (-7,-7), (7,7), (7,-7), (-7,7), (-5,0), (5,0), (0,-5), (0,5), (-3,-3), (3,3), (3,-3), (-3,3), (-1,0), (1,0), (0,-1), (0,1)" --modulation_ratio 1.0 --test_exp_name "90000itr-N131197531" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK131197531_20240927_005250/80000_RaftStereoDepthBetaK131197531.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-13,0), (13,0), (0,-13), (0,13), (-11,-11), (11,11), (11,-11), (-11,11), (-9,0), (9,0), (0,-9), (0,9), (-7,-7), (7,7), (7,-7), (-7,7), (-5,0), (5,0), (0,-5), (0,5), (-3,-3), (3,3), (3,-3), (-3,3), (-1,0), (1,0), (0,-1), (0,1)" --modulation_ratio 1.0 --test_exp_name "80000itr-N131197531" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK131197531_20240927_005250/70000_RaftStereoDepthBetaK131197531.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-13,0), (13,0), (0,-13), (0,13), (-11,-11), (11,11), (11,-11), (-11,11), (-9,0), (9,0), (0,-9), (0,9), (-7,-7), (7,7), (7,-7), (-7,7), (-5,0), (5,0), (0,-5), (0,5), (-3,-3), (3,3), (3,-3), (-3,3), (-1,0), (1,0), (0,-1), (0,1)" --modulation_ratio 1.0 --test_exp_name "70000itr-N131197531" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK131197531_20240927_005250/60000_RaftStereoDepthBetaK131197531.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-13,0), (13,0), (0,-13), (0,13), (-11,-11), (11,11), (11,-11), (-11,11), (-9,0), (9,0), (0,-9), (0,9), (-7,-7), (7,7), (7,-7), (-7,7), (-5,0), (5,0), (0,-5), (0,5), (-3,-3), (3,3), (3,-3), (-3,3), (-1,0), (1,0), (0,-1), (0,1)" --modulation_ratio 1.0 --test_exp_name "60000itr-N131197531" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53Ratio2_20240930_022616/RaftStereoDepthBetaK53Ratio2.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 2.0 --test_exp_name "final-N53Ratio2" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53Ratio2_20240930_022616/90000_RaftStereoDepthBetaK53Ratio2.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 2.0 --test_exp_name "90000itr-N53Ratio2" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53Ratio2_20240930_022616/80000_RaftStereoDepthBetaK53Ratio2.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 2.0 --test_exp_name "80000itr-N53Ratio2" +# python3 evaluate_stereo_raftstereo.py 
--restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53Ratio2_20240930_022616/70000_RaftStereoDepthBetaK53Ratio2.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 2.0 --test_exp_name "70000itr-N53Ratio2" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53Ratio2_20240930_022616/60000_RaftStereoDepthBetaK53Ratio2.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 2.0 --test_exp_name "60000itr-N53Ratio2" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53Ratio3_20240930_022650/RaftStereoDepthBetaK53Ratio3.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 3.0 --test_exp_name "final-N53Ratio3" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53Ratio3_20240930_022650/90000_RaftStereoDepthBetaK53Ratio3.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 2.0 --test_exp_name "90000itr-N53Ratio3" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53Ratio3_20240930_022650/80000_RaftStereoDepthBetaK53Ratio3.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 2.0 --test_exp_name "80000itr-N53Ratio3" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53Ratio3_20240930_022650/70000_RaftStereoDepthBetaK53Ratio3.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 2.0 --test_exp_name "70000itr-N53Ratio3" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53Ratio3_20240930_022650/60000_RaftStereoDepthBetaK53Ratio3.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 2.0 --test_exp_name "60000itr-N53Ratio3" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthMatch_20241007_020814/RaftStereoDepthMatch.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthMatch" --test_exp_name "final-DepthMatch" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthMatch_20241007_020814/90000_RaftStereoDepthMatch.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthMatch" --test_exp_name "90000itr-DepthMatch" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt 
/data5/yao/runs/ckpoint/RaftStereoDepthMatch_20241007_020814/80000_RaftStereoDepthMatch.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthMatch" --test_exp_name "80000itr-DepthMatch" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthMatch_20241007_020814/70000_RaftStereoDepthMatch.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthMatch" --test_exp_name "70000itr-DepthMatch" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthMatch_20241007_020814/60000_RaftStereoDepthMatch.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthMatch" --test_exp_name "60000itr-DepthMatch" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53Disp_20241011_220622/RaftStereoDepthBetaK53Disp.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "final-DepthBetaK53Disp" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53Disp_20241011_220622/90000_RaftStereoDepthBetaK53Disp.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "90000itr-DepthBetaK53Disp" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53Disp_20241011_220622/80000_RaftStereoDepthBetaK53Disp.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "80000itr-DepthBetaK53Disp" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53Disp_20241011_220622/70000_RaftStereoDepthBetaK53Disp.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "70000itr-DepthBetaK53Disp" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53Disp_20241011_220622/60000_RaftStereoDepthBetaK53Disp.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "60000itr-DepthBetaK53Disp" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoid_20241017_205931/RaftStereoDepthBetaK53DispRefineSigmoid.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "final-DepthBetaK53DispRefineSigmoid" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoid_20241017_205931/90000_RaftStereoDepthBetaK53DispRefineSigmoid.pth --depthany_model_dir 
"/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "90000itr-DepthBetaK53DispRefineSigmoid" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoid_20241017_205931/80000_RaftStereoDepthBetaK53DispRefineSigmoid.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "80000itr-DepthBetaK53DispRefineSigmoid" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoid_20241017_205931/70000_RaftStereoDepthBetaK53DispRefineSigmoid.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "70000itr-DepthBetaK53DispRefineSigmoid" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoid_20241017_205931/60000_RaftStereoDepthBetaK53DispRefineSigmoid.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "60000itr-DepthBetaK53DispRefineSigmoid" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispMonoDepthBatch16_20241021_222325/RaftStereoDepthBetaK53DispMonoDepthBatch16.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "final-DepthBetaK53DispMonDepth" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispMonoDepthBatch16_20241021_222325/90000_RaftStereoDepthBetaK53DispMonoDepthBatch16.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "90000itr-DepthBetaK53DispMonDepth" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispMonoDepthBatch16_20241021_222325/80000_RaftStereoDepthBetaK53DispMonoDepthBatch16.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "80000itr-DepthBetaK53DispMonDepth" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32_20241024_172455/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "final-RefineSigmoidPreMonoBatch32" +# python3 evaluate_stereo_raftstereo.py 
--restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32_20241024_172455/90000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "90000itr-RefineSigmoidPreMonoBatch32" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32_20241024_172455/80000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "80000itr-RefineSigmoidPreMonoBatch32" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32_20241024_172455/70000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "70000itr-RefineSigmoidPreMonoBatch32" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32_20241024_172455/60000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "60000itr-RefineSigmoidPreMonoBatch32" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthPostFusionBatch32_20241028_215503/RaftStereoDepthPostFusionBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthPostFusion" --test_exp_name "final-RaftStereoDepthPostFusionBatch32" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthPostFusionBatch32_20241028_215503/90000_RaftStereoDepthPostFusionBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthPostFusion" --test_exp_name "90000itr-RaftStereoDepthPostFusionBatch32" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthPostFusionBatch32_20241028_215503/80000_RaftStereoDepthPostFusionBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthPostFusion" --test_exp_name "80000itr-RaftStereoDepthPostFusionBatch32" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthPostFusionBatch32_20241028_215503/70000_RaftStereoDepthPostFusionBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthPostFusion" --test_exp_name "70000itr-RaftStereoDepthPostFusionBatch32" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthPostFusionBatch32_20241028_215503/60000_RaftStereoDepthPostFusionBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthPostFusion" 
--test_exp_name "60000itr-RaftStereoDepthPostFusionBatch32" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "50000itr-RefineSigmoidPreMonoBatch48ConfDim" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/40000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "40000itr-RefineSigmoidPreMonoBatch48ConfDim" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/30000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "30000itr-RefineSigmoidPreMonoBatch48ConfDim" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/20000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "20000itr-RefineSigmoidPreMonoBatch48ConfDim" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/10000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "10000itr-RefineSigmoidPreMonoBatch48ConfDim" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset booster --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "50000itr-RefineSigmoidPreMonoBatch48ConfDim" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/40000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset booster --model_name "RAFTStereoDepthBetaRefine" 
--lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "40000itr-RefineSigmoidPreMonoBatch48ConfDim" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/30000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset booster --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "30000itr-RefineSigmoidPreMonoBatch48ConfDim" + + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32_20241024_172455/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_F --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "middlebury_F - final-RefineSigmoidPreMonoBatch32" + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53_20240920_165346/80000_RaftStereoDepthBetaK53.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_F --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "middlebury_F - 80000itr-N53" + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53_20240920_165346/80000_RaftStereoDepthBetaK53.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_F --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-10,-10), (10,10), (10,-10), (-10,10), (-6,0), (6,0), (0,-6), (0,6)" --modulation_ratio 1.0 --test_exp_name "middlebury_F - 80000itr-N53" + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32_20241024_172455/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_F --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-10,-10), (10,10), (10,-10), (-10,10), (-6,0), (6,0), (0,-6), (0,6)" --modulation_ratio 3.0 --test_exp_name "middlebury_F - final-RefineSigmoidPreMonoBatch32" + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/raftstereo_20240821_142156/raftstereo.pth --dataset middlebury_F --model_name "RaftStereo" --test_exp_name "middlebury_F - final-raftstereo" + + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32_20241024_172455/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset kitti --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "middlebury_F - final-RefineSigmoidPreMonoBatch32" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32_20241024_172455/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset kitti2012 --model_name 
"RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "kitti2012 - final-RefineSigmoidPreMonoBatch32" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32_20241024_172455/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset eth3d --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "kitti2012 - final-RefineSigmoidPreMonoBatch32" + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32_20241024_172455/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset booster --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "booster - final-RefineSigmoidPreMonoBatch32" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/raftstereo_20240821_142156/raftstereo.pth --dataset booster --model_name "RaftStereo" --test_exp_name "booster - final-raftstereo" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_F --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "MiddleburyF-50000itr-RefineSigmoidPreMonoBatch48ConfDim" + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_F --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "MiddleburyF-50000itr-RefineSigmoidPreMonoBatch48ConfDim-median" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/fintune_CRE_20241105_104054/fintune_CRE.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "middlebury_H - fintune_CRE" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/fintune_CRE_20241105_104054/90000_fintune_CRE.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "middlebury_H - 90000itr_CRE" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/fintune_CRE_20241105_104054/80000_fintune_CRE.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets 
"(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "middlebury_H - 80000itr_CRE" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/fintune_CRE_20241105_104054/70000_fintune_CRE.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "middlebury_H - 70000itr_CRE" +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/fintune_CRE_20241105_104054/60000_fintune_CRE.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "middlebury_H - 60000itr_CRE" + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/fintune_NerfStereo_20241108_225550/70000_fintune_NerfStereo.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "middlebury_H - 70000itr_NerfStereo" + + + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt ./clouds/ckpoint/Trans_RAFTStereoDepthBetaRefine_tuneRefine_20250322_093723/Trans_RAFTStereoDepthBetaRefine_tuneRefine.pth --depthany_model_dir "/data5/yao/pretrained" --dataset booster --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "Trans_RAFTStereoDepthBetaRefine_tuneRefine" + +python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset booster --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "boosterF-50000itr-RefineSigmoidPreMonoBatch48ConfDim" + +# python3 evaluate_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/raftstereo_20240821_142156/raftstereo.pth --dataset booster --model_name "RaftStereo" --test_exp_name "booster_H - final-raftstereo" + + +# python3 merge_sheet.py diff --git a/script/finetune.sh b/script/finetune.sh new file mode 100644 index 0000000000000000000000000000000000000000..1c7e639efd3492eb3f6f9340291fbc20c04a5f61 --- /dev/null +++ b/script/finetune.sh @@ -0,0 +1,43 @@ +# /usr/bin/bash + + +# 获取当前 shell 文件名(不包含路径和扩展名) +SCRIPT_NAME=$(basename "$0" .sh) + +# 获取当前时间 +CURRENT_TIME=$(date +"%Y%m%d_%H%M%S") + +# 如果有参数,使用参数作为文件夹名,否则使用脚本名加时间 +if [ -n "$1" ]; then + FOLDER_NAME="${1}_${CURRENT_TIME}" + EXP_NAME="${1}" +else + FOLDER_NAME="${SCRIPT_NAME}_${CURRENT_TIME}" + EXP_NAME="${SCRIPT_NAME}" +fi + + +# export NCCL_DEBUG=WARN +export NCCL_P2P_DISABLE=1 +# export NCCL_SOCKET_IFNAME=eth0 # 设置正确的网络接口 +# export MASTER_ADDR=127.0.0.1 +# export MASTER_PORT=29501 +# export CUDA_VISIBLE_DEVICES=0,1,2,3 +export CUDA_VISIBLE_DEVICES=4,5,6,7 + +# export DATASET_ROOT="/data6/sceneflow/sceneflow" + +export LOG_ROOT="/data5/yao/runs/log/${FOLDER_NAME}" 
+export TB_ROOT="/data5/yao/runs/tboard/${FOLDER_NAME}" +export CKPOINT_ROOT="/data5/yao/runs/ckpoint/${FOLDER_NAME}" + +# 输出新的路径,确认设置正确 +echo "LOG_ROOT is set to: $LOG_ROOT" +echo "TB_ROOT is set to: $TB_ROOT" +echo "CKPOINT_ROOT is set to: $CKPOINT_ROOT" + + + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29301 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBetaRefine" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --restore_ckpt "/data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth" --lr 0.0001 --finetune --train_datasets "crestereo" --exp_name "$EXP_NAME" + +torchrun --nnode 1 --nproc_per_node 4 --master_port 29301 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBetaRefine" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --restore_ckpt "/data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth" --lr 0.0001 --finetune --train_datasets "nerfstereo" --exp_name "$EXP_NAME" \ No newline at end of file diff --git a/script/gen_sample_stereo_raftstereo.sh b/script/gen_sample_stereo_raftstereo.sh new file mode 100644 index 0000000000000000000000000000000000000000..6dde6668dae5bf3dabf49e54de4d9e4c88523949 --- /dev/null +++ b/script/gen_sample_stereo_raftstereo.sh @@ -0,0 +1,42 @@ +# /usr/bin/bash + +export LOG_ROOT="/data5/yao/runs/vis" +export TB_ROOT="/data5/yao/runs/tboard" +export CKPOINT_ROOT="/data5/yao/runs/ckpoint" + + + +# CUDA_VISIBLE_DEVICES=0 python3 gen_sample_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK3_20240917_122628/RaftStereoDepthBetaK3.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --sv_root /data5/yao/runs/vis --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --test_exp_name "final-N3" + +# CUDA_VISIBLE_DEVICES=1 python3 gen_sample_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthAny_20240908_125231/RaftStereoDepthAny.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --sv_root /data5/yao/runs/vis --model_name "RAFTStereoDepthAny" --test_exp_name "final" + +# CUDA_VISIBLE_DEVICES=2 python3 gen_sample_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53_20240920_165346/80000_RaftStereoDepthBetaK53.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --sv_root /data5/yao/runs/vis --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "8000-N53" + +# CUDA_VISIBLE_DEVICES=3 python3 gen_sample_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53_20240920_165346/80000_RaftStereoDepthBetaK53.pth --depthany_model_dir 
"/data5/yao/pretrained" --dataset middlebury_H --sv_root /data5/yao/runs/vis --model_name "RAFTStereoDepthBeta" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "8000-N53" --mask + +# CUDA_VISIBLE_DEVICES=3 python3 gen_sample_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32_20241024_172455/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_F --sv_root /data5/yao/runs/vis --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-10,-10), (10,10), (10,-10), (-10,10), (-6,0), (6,0), (0,-6), (0,6)" --modulation_ratio 1.0 --test_exp_name "MF - final-RefineSigmoidPreMonoBatch32" --improvement --movement + + +# CUDA_VISIBLE_DEVICES=3 python3 gen_sample_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32_20241024_172455/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset booster --sv_root /data5/yao/runs/vis --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "booster-mono-final-RefineSigmoidPreMonoBatch32" + +# CUDA_VISIBLE_DEVICES=3 python3 gen_sample_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32_20241024_172455/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_F --sv_root /data5/yao/runs/vis --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --test_exp_name "middlebury_F-mono-final-RefineSigmoidPreMonoBatch32" + + +# CUDA_VISIBLE_DEVICES=4 python3 gen_sample_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset booster --sv_root /data5/yao/runs/vis --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "SUPP" + +# CUDA_VISIBLE_DEVICES=4 python3 gen_sample_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --sv_root /data5/yao/runs/vis --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "SUPP" + +# CUDA_VISIBLE_DEVICES=4 python3 gen_sample_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset eth3d --sv_root /data5/yao/runs/vis --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "SUPP" + 
+CUDA_VISIBLE_DEVICES=4 python3 gen_sample_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset kitti --sv_root /data5/yao/runs/vis --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "SUPP" + +CUDA_VISIBLE_DEVICES=4 python3 gen_sample_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset kitti2012 --sv_root /data5/yao/runs/vis --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --test_exp_name "SUPP" + + +# CUDA_VISIBLE_DEVICES=1 python3 gen_sample_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/raftstereo_20240821_142156/raftstereo.pth --dataset booster --sv_root /data5/yao/runs/vis --model_name "RaftStereo" --test_exp_name "booster - final-raftstereo" + +# CUDA_VISIBLE_DEVICES=6 python3 gen_sample_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/raftstereo_20240821_142156/raftstereo.pth --dataset booster --sv_root /data5/yao/runs/vis --model_name "RaftStereo" --test_exp_name "booster-final-raftstereo-visRange" + + + +# CUDA_VISIBLE_DEVICES=6 python3 gen_sample_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthAny_20240908_125231/RaftStereoDepthAny.pth --depthany_model_dir "/data5/yao/pretrained" --dataset booster --sv_root /data5/yao/runs/vis --model_name "RAFTStereoDepthAny" --test_exp_name "booster-final-RAFTStereoDepthAny-visRange" \ No newline at end of file diff --git a/script/infer_stereo_raftstereo.sh b/script/infer_stereo_raftstereo.sh new file mode 100644 index 0000000000000000000000000000000000000000..c65fac384dbb9e361a04cd0e5aa65e3608a37708 --- /dev/null +++ b/script/infer_stereo_raftstereo.sh @@ -0,0 +1,22 @@ +# /usr/bin/bash + +export LOG_ROOT="/data5/yao/runs/vis" +export TB_ROOT="/data5/yao/runs/tboard" +export CKPOINT_ROOT="/data5/yao/runs/ckpoint" + + + +# CUDA_VISIBLE_DEVICES=4 python3 infer_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" \ +# --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea \ +# --dataset flicker1024_val --root /data6/Flickr1024 --img_path_txt /home/yao/Document/OpenStereo/OpenStereo-1f93c822e9c9d571a5238c813560478bc4c18662/data/Flicker1024/Flicker1024_validation.txt \ +# --sv_root /data5/yao/runs/vis --test_exp_name "ICCV-Rebuttal" + +# CUDA_VISIBLE_DEVICES=5 python3 infer_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" \ +# --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), 
(0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea \ +# --dataset flicker1024_train --root /data6/Flickr1024 --img_path_txt /home/yao/Document/OpenStereo/OpenStereo-1f93c822e9c9d571a5238c813560478bc4c18662/data/Flicker1024/Flicker1024_train.txt \ +# --sv_root /data5/yao/runs/vis --test_exp_name "ICCV-Rebuttal" + +# CUDA_VISIBLE_DEVICES=6 python3 infer_stereo_raftstereo.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" \ +# --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea \ +# --dataset flicker1024_test --root /data6/Flickr1024 --img_path_txt /home/yao/Document/OpenStereo/OpenStereo-1f93c822e9c9d571a5238c813560478bc4c18662/data/Flicker1024/Flicker1024_test.txt \ +# --sv_root /data5/yao/runs/vis --test_exp_name "ICCV-Rebuttal" \ No newline at end of file diff --git a/script/train_stereo_raftstereo.sh b/script/train_stereo_raftstereo.sh new file mode 100644 index 0000000000000000000000000000000000000000..0b6a71c55445282f98a1304891bd3b7a3defb555 --- /dev/null +++ b/script/train_stereo_raftstereo.sh @@ -0,0 +1,57 @@ +# /usr/bin/bash + + +# 获取当前 shell 文件名(不包含路径和扩展名) +SCRIPT_NAME=$(basename "$0" .sh) + +# 获取当前时间 +CURRENT_TIME=$(date +"%Y%m%d_%H%M%S") + +# 如果有参数,使用参数作为文件夹名,否则使用脚本名加时间 +if [ -n "$1" ]; then + FOLDER_NAME="${1}_${CURRENT_TIME}" + EXP_NAME="${1}" +else + FOLDER_NAME="${SCRIPT_NAME}_${CURRENT_TIME}" + EXP_NAME="${SCRIPT_NAME}" +fi + + +# export NCCL_DEBUG=WARN +export NCCL_P2P_DISABLE=1 +# export NCCL_SOCKET_IFNAME=eth0 # 设置正确的网络接口 +# export MASTER_ADDR=127.0.0.1 +# export MASTER_PORT=29501 +# export CUDA_VISIBLE_DEVICES=0,1,2,3 +export CUDA_VISIBLE_DEVICES=4,5,6,7 + +# "/horizon-bucket/saturn_v_dev/01_users/chengtang.yao/Sceneflow" +# "/horizon-bucket/saturn_v_dev/01_users/chengtang.yao/Middlebury" +# "/horizon-bucket/saturn_v_dev/01_users/chengtang.yao/KITTI2015" +# "/horizon-bucket/saturn_v_dev/01_users/chengtang.yao/ETH3D" +export DATASET_ROOT="/data6/sceneflow/sceneflow" + +export LOG_ROOT="/data5/yao/runs/log/${FOLDER_NAME}" +export TB_ROOT="/data5/yao/runs/tboard/${FOLDER_NAME}" +export CKPOINT_ROOT="/data5/yao/runs/ckpoint/${FOLDER_NAME}" + +# 输出新的路径,确认设置正确 +echo "LOG_ROOT is set to: $LOG_ROOT" +echo "TB_ROOT is set to: $TB_ROOT" +echo "CKPOINT_ROOT is set to: $CKPOINT_ROOT" + + + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29400 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RaftStereo" --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29501 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RaftStereoDisp" --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29501 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoMast3r" --mast3r_model_path "/data5/yao/pretrained/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth" --exp_name "$EXP_NAME" + +# torchrun --nnode 1 
--nproc_per_node 4 --master_port 29300 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoMast3r" --mast3r_model_path "/data5/yao/pretrained/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth" --corr_implementation "abs_alt" --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29300 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoMast3r" --mast3r_model_path "/data5/yao/pretrained/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth" --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29501 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 200000 --mixed_precision --model_name "RAFTStereoMast3r" --mast3r_model_path "/data5/yao/pretrained/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth" --exp_name "$EXP_NAME" --lr 0.001 + +torchrun --nnode 1 --nproc_per_node 4 --master_port 29501 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RaftStereoNoCTX" --exp_name "$EXP_NAME" diff --git a/script/train_stereo_raftstereo_depthany.sh b/script/train_stereo_raftstereo_depthany.sh new file mode 100644 index 0000000000000000000000000000000000000000..a90153a54ecf78819a4a1db08813ff0162c19b49 --- /dev/null +++ b/script/train_stereo_raftstereo_depthany.sh @@ -0,0 +1,113 @@ +# /usr/bin/bash + + +# Get the current shell script name (without path or extension) +SCRIPT_NAME=$(basename "$0" .sh) + +# Get the current timestamp +CURRENT_TIME=$(date +"%Y%m%d_%H%M%S") + +# If an argument is given, use it as the folder name; otherwise use the script name plus the timestamp +if [ -n "$1" ]; then + FOLDER_NAME="${1}_${CURRENT_TIME}" + EXP_NAME="${1}" +else + FOLDER_NAME="${SCRIPT_NAME}_${CURRENT_TIME}" + EXP_NAME="${SCRIPT_NAME}" +fi + + +# export NCCL_DEBUG=WARN +export NCCL_P2P_DISABLE=1 +# export NCCL_SOCKET_IFNAME=eth0 # set the correct network interface +# export MASTER_ADDR=127.0.0.1 +# export MASTER_PORT=29501 +# export CUDA_VISIBLE_DEVICES=0,1,2,3 +# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +# export CUDA_VISIBLE_DEVICES=0,6 +# export CUDA_VISIBLE_DEVICES=0 +export CUDA_VISIBLE_DEVICES=2,3,4,5,6,7 + +# export DATASET_ROOT="/data6/sceneflow/sceneflow" +export DATASET_ROOT="./datasets/Trans" + +export LOG_ROOT="/data5/yao/runs/log/${FOLDER_NAME}" +export TB_ROOT="/data5/yao/runs/tboard/${FOLDER_NAME}" +export CKPOINT_ROOT="/data5/yao/runs/ckpoint/${FOLDER_NAME}" + +# Print the new paths to confirm they are set correctly +echo "LOG_ROOT is set to: $LOG_ROOT" +echo "TB_ROOT is set to: $TB_ROOT" +echo "CKPOINT_ROOT is set to: $CKPOINT_ROOT" + + +nproc_per_node=$(echo $CUDA_VISIBLE_DEVICES | tr ',' '\n' | wc -l) # Count the number of GPUs + + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29501 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthAny" --depthany_model_dir "/data5/yao/pretrained" --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29400 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps
100000 --mixed_precision --model_name "RAFTStereoDepthFusion" --depthany_model_dir "/data5/yao/pretrained" --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29400 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBeta" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-1,-1), (1,1), (1,-1), (-1,1)" --modulation_ratio 1.0 --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29501 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBeta" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29501 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBeta" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29400 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBeta" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_alg "sigmoid" --modulation_ratio 1.0 --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29501 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBeta" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-9,-9), (9,9), (9,-9), (-9,9), (-7,0), (7,0), (0,-7), (0,7), (-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29400 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBetaNoLBP" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-3,-3), (3,3), (3,-3), (-3,3)" --modulation_ratio 1.0 --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29501 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBeta" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-13,0), (13,0), (0,-13), (0,13), (-11,-11), (11,11), (11,-11), (-11,11), (-9,0), (9,0), (0,-9), (0,9), (-7,-7), (7,7), (7,-7), (-7,7), (-5,0), (5,0), (0,-5), (0,5), (-3,-3), (3,3), (3,-3), (-3,3), (-1,0), (1,0), (0,-1), (0,1)" --modulation_ratio 1.0 --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29400 train_stereo_raftstereo.py 
--batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBetaNoLBP" --depthany_model_dir "/data5/yao/pretrained" --noLBP_hidden_dim 32 --modulation_ratio 1.0 --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29501 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBeta" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 2.0 --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29400 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBeta" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 3.0 --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29400 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthMatch" --depthany_model_dir "/data5/yao/pretrained" --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29501 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBeta" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --exp_name "$EXP_NAME"\ + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29501 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBetaRefine" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --restore_ckpt "/data5/yao/runs/ckpoint/RaftStereoDepthBetaK53_20240920_165346/80000_RaftStereoDepthBetaK53.pth" --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29501 train_stereo_raftstereo.py --batch_size 16 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBetaRefine" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --restore_ckpt "/data5/yao/runs/ckpoint/RaftStereoDepthBetaK53_20240920_165346/80000_RaftStereoDepthBetaK53.pth" --lr 0.0005 --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29501 train_stereo_raftstereo.py --batch_size 32 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBetaRefine" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-5,-5), (5,5), 
(5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --restore_ckpt "/data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispMonoDepthBatch16_20241021_222325/RaftStereoDepthBetaK53DispMonoDepthBatch16.pth" --lr 0.0005 --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29501 train_stereo_raftstereo.py --batch_size 32 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthPostFusion" --depthany_model_dir "/data5/yao/pretrained" --restore_ckpt "/data5/yao/runs/ckpoint/RaftStereoDepthAny_20240908_125231/RaftStereoDepthAny.pth" --lr 0.0005 --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29501 train_stereo_raftstereo.py --batch_size 48 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBetaRefine" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --restore_ckpt "/data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispMonoDepthBatch16_20241021_222325/RaftStereoDepthBetaK53DispMonoDepthBatch16.pth" --lr 0.0005 --exp_name "$EXP_NAME" + + + +# # mono pooling +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29501 train_stereo_raftstereo.py --batch_size 48 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBetaRefine" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --refine_pool --restore_ckpt "/data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth" --lr 0.0005 --train_refine_mono --exp_name "$EXP_NAME" + +# # w/o mono pooling +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29401 train_stereo_raftstereo.py --batch_size 48 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBetaRefine" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --restore_ckpt "/data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth" --lr 0.0005 --train_refine_mono --exp_name "$EXP_NAME" + +# mono EfficientUnet +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29301 train_stereo_raftstereo.py --batch_size 32 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBetaRefine" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --refine_unet --restore_ckpt "/data5/yao/runs/ckpoint/RaftStereoDepthBetaK53_20240920_165346/80000_RaftStereoDepthBetaK53.pth" --lr 0.0005 --train_refine_mono --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29301 
train_stereo_raftstereo.py --batch_size 32 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBetaRefine" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --refine_unet --restore_ckpt "/data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32Unet_20241112_132207/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32Unet.pth" --lr 0.0005 --train_refine_mono --exp_name "$EXP_NAME" + +# torchrun --nnode 1 --nproc_per_node 4 --master_port 29301 train_stereo_raftstereo.py --batch_size 48 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoDepthBetaRefine" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --refine_unet --restore_ckpt "/data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32Unet_20241114_125423/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32Unet.pth" --lr 0.0003 --train_refine_mono --exp_name "$EXP_NAME" + + + +# torchrun --nnode 1 --nproc_per_node 2 --master_port 29301 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoMetric3D" --lr 0.0002 --exp_name "RAFTStereoMetric3D" + +# torchrun --nnode 1 --nproc_per_node 8 --master_port 29301 train_stereo_raftstereo.py --batch_size 6 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 100000 --mixed_precision --model_name "RAFTStereoMetric3D" --lr 0.0001 --restore_ckpt "/data5/yao/runs/ckpoint/RAFTStereoMetric3D_20250305_043320/20000_RAFTStereoMetric3D.pth" --exp_name "RAFTStereoMetric3D_ConfLoss_Lr0001" + + + + + +# On Trans dataset +# torchrun --nnode 1 --nproc_per_node $nproc_per_node --master_port 29501 train_stereo_raftstereo.py --batch_size 8 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 10000 --mixed_precision --model_name "RAFTStereoDepthBetaRefine" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --restore_ckpt "/data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth" --lr 0.0005 --fintune_info "tune_raft" --train_datasets "Trans" --exp_name "Trans_RAFTStereoDepthBetaRefine" + +torchrun --nnode 1 --nproc_per_node $nproc_per_node --master_port 29501 train_stereo_raftstereo.py --batch_size 32 --train_iters 22 --valid_iters 32 --spatial_scale -0.2 0.4 --saturation_range 0 1.4 --n_downsample 2 --num_steps 10000 --mixed_precision --model_name "RAFTStereoDepthBetaRefine" --depthany_model_dir "/data5/yao/pretrained" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --restore_ckpt "/home/yao/Document/GeneralizedStereoMatching/clouds/ckpoint/Trans_RAFTStereoDepthBetaRefine_20250321_180351/Trans_RAFTStereoDepthBetaRefine.pth" --lr 0.0005 
--fintune_info "tune_refine" --train_datasets "Trans" --exp_name "Trans_RAFTStereoDepthBetaRefine_tuneRefine" diff --git a/script/vis_inter_stereo_raftstereo.sh b/script/vis_inter_stereo_raftstereo.sh new file mode 100644 index 0000000000000000000000000000000000000000..6f0d3c433731c1d10ae262362c78902e589c7fa9 --- /dev/null +++ b/script/vis_inter_stereo_raftstereo.sh @@ -0,0 +1,22 @@ +# /usr/bin/bash + +export LOG_ROOT="/data5/yao/runs/vis" +export TB_ROOT="/data5/yao/runs/tboard" +export CKPOINT_ROOT="/data5/yao/runs/ckpoint" + + +CUDA_VISIBLE_DEVICES=6 python3 vis_intermediate_results.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset booster --sv_root /data5/yao/runs/vis/inter_results --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --vis_inter --test_exp_name "SUPP" + +CUDA_VISIBLE_DEVICES=6 python3 vis_intermediate_results.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset middlebury_H --sv_root /data5/yao/runs/vis/inter_results --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --vis_inter --test_exp_name "SUPP" + +CUDA_VISIBLE_DEVICES=6 python3 vis_intermediate_results.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset eth3d --sv_root /data5/yao/runs/vis/inter_results --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --vis_inter --test_exp_name "SUPP" + +CUDA_VISIBLE_DEVICES=6 python3 vis_intermediate_results.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset kitti --sv_root /data5/yao/runs/vis/inter_results --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --vis_inter --test_exp_name "SUPP" + +CUDA_VISIBLE_DEVICES=6 python3 vis_intermediate_results.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/50000_RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim.pth --depthany_model_dir "/data5/yao/pretrained" --dataset kitti2012 --sv_root /data5/yao/runs/vis/inter_results --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --conf_from_fea --vis_inter --test_exp_name "SUPP" + + +# CUDA_VISIBLE_DEVICES=4 python3 vis_intermediate_results.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32_20241024_172455/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32.pth 
--depthany_model_dir "/data5/yao/pretrained" --dataset booster --sv_root /data5/yao/runs/vis/inter_results --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --vis_inter --test_exp_name "50000itr-RefineSigmoidPreMonoBatch32" + + +# CUDA_VISIBLE_DEVICES=4 python3 vis_intermediate_results.py --restore_ckpt /data5/yao/runs/ckpoint/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32_20241024_172455/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch32.pth --depthany_model_dir "/data5/yao/pretrained" --dataset booster --sv_root /data5/yao/runs/vis/inter_results --model_name "RAFTStereoDepthBetaRefine" --lbp_neighbor_offsets "(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)" --modulation_ratio 1.0 --vis_inter --test_exp_name "final-RefineSigmoidPreMonoBatch32" \ No newline at end of file diff --git a/tools/anonymous.py b/tools/anonymous.py new file mode 100644 index 0000000000000000000000000000000000000000..e1c758a2d5140012f73b0b041282df5361d47432 --- /dev/null +++ b/tools/anonymous.py @@ -0,0 +1,58 @@ +import os +import re +import argparse +import shutil + + +def delete_pycache(root_dir): + for dirpath, dirnames, filenames in os.walk(root_dir): + # Check if __pycache__ is in the current directory + if "__pycache__" in dirnames: + pycache_path = os.path.join(dirpath, "__pycache__") + try: + shutil.rmtree(pycache_path) # Recursively delete the directory + print(f"Deleted: {pycache_path}") + except Exception as e: + print(f"Failed to delete {pycache_path}: {e}") + +def replace_keyword_in_files(root_dir, keyword, replacement): + # Compile the keyword for performance if using regular expressions + keyword_pattern = re.compile(re.escape(keyword)) # Escapes special regex characters in keyword + + for dirpath, dirnames, filenames in os.walk(root_dir): + # if "__pycache__" in dirpath: + # continue + for file in filenames: + file_path = os.path.join(dirpath, file) + + try: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Replace all occurrences of the keyword + new_content = keyword_pattern.sub(replacement, content) + + # Write back the modified content only if changes were made + if content != new_content: + with open(file_path, 'w', encoding='utf-8') as f: + f.write(new_content) + print(f"Modified: {file_path}") + # exit(0) + + except (UnicodeDecodeError, PermissionError, FileNotFoundError) as e: + print(f"Skipping file {file_path} due to error: {e}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Replace all occurrences of a keyword in files under a directory.") + parser.add_argument("-k", "--target_keyword", type=str, help="The keyword to search for.") + parser.add_argument("-r", "--replacement_string", type=str, help="The string to replace the keyword with.") + parser.add_argument("-d", "--root_directory", type=str, help="The root directory to search in.") + + args = parser.parse_args() + + target_keyword = args.target_keyword + replacement_string = args.replacement_string + root_directory = args.root_directory + + delete_pycache(root_directory) + replace_keyword_in_files(root_directory, target_keyword, replacement_string) diff --git a/tools/demo_matching.py b/tools/demo_matching.py new file mode 100644 index 0000000000000000000000000000000000000000..cfb0739fb311b8b18c377450262ba0d2eb284785 --- /dev/null +++ b/tools/demo_matching.py @@ -0,0 +1,81 @@ +import os +import sys +sys.path.insert(0,'mast3r') + +from mast3r.model import 
AsymmetricMASt3R +from mast3r.fast_nn import fast_reciprocal_NNs + +import mast3r.utils.path_to_dust3r +from dust3r.inference import inference +from dust3r.utils.image import load_images + +if __name__ == '__main__': + device = 'cuda' + schedule = 'cosine' + lr = 0.01 + niter = 300 + + # Namespace(model="AsymmetricMASt3R(enc_depth=24, dec_depth=12, enc_embed_dim=1024, dec_embed_dim=768, enc_num_heads=16, dec_num_heads=12, pos_embed='RoPE100',img_size=(512, 512), head_type='catmlp+dpt', output_mode='pts3d+desc24', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), patch_embed_cls='PatchEmbedDust3R', two_confs=True, desc_conf_mode=('exp', 0, inf))") + model_name = "/data5/yao/pretrained/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth" + # you can put the path to a local checkpoint in model_name if needed + model = AsymmetricMASt3R.from_pretrained(model_name).to(device) + print(next(model.parameters()).device) + images = load_images(['/data5/yao/tmp/Piano/im0.png', '/data5/yao/tmp/Piano/im1.png'], size=512) + output = inference([tuple(images)], model, device, batch_size=1, verbose=False) + + # from IPython import embed + # embed() + + # at this stage, you have the raw dust3r predictions + view1, pred1 = output['view1'], output['pred1'] + view2, pred2 = output['view2'], output['pred2'] + + desc1, desc2 = pred1['desc'].squeeze(0).detach(), pred2['desc'].squeeze(0).detach() + + # find 2D-2D matches between the two images + matches_im0, matches_im1 = fast_reciprocal_NNs(desc1, desc2, subsample_or_initxy1=8, + device=device, dist='dot', block_size=2**13) + + # ignore small border around the edge + H0, W0 = view1['true_shape'][0] + valid_matches_im0 = (matches_im0[:, 0] >= 3) & (matches_im0[:, 0] < int(W0) - 3) & ( + matches_im0[:, 1] >= 3) & (matches_im0[:, 1] < int(H0) - 3) + + H1, W1 = view2['true_shape'][0] + valid_matches_im1 = (matches_im1[:, 0] >= 3) & (matches_im1[:, 0] < int(W1) - 3) & ( + matches_im1[:, 1] >= 3) & (matches_im1[:, 1] < int(H1) - 3) + + valid_matches = valid_matches_im0 & valid_matches_im1 + matches_im0, matches_im1 = matches_im0[valid_matches], matches_im1[valid_matches] + + # visualize a few matches + import numpy as np + import torch + import torchvision.transforms.functional + from matplotlib import pyplot as pl + + n_viz = 20 + num_matches = matches_im0.shape[0] + match_idx_to_viz = np.round(np.linspace(0, num_matches - 1, n_viz)).astype(int) + viz_matches_im0, viz_matches_im1 = matches_im0[match_idx_to_viz], matches_im1[match_idx_to_viz] + + image_mean = torch.as_tensor([0.5, 0.5, 0.5], device='cpu').reshape(1, 3, 1, 1) + image_std = torch.as_tensor([0.5, 0.5, 0.5], device='cpu').reshape(1, 3, 1, 1) + + viz_imgs = [] + for i, view in enumerate([view1, view2]): + rgb_tensor = view['img'] * image_std + image_mean + viz_imgs.append(rgb_tensor.squeeze(0).permute(1, 2, 0).cpu().numpy()) + + H0, W0, H1, W1 = *viz_imgs[0].shape[:2], *viz_imgs[1].shape[:2] + img0 = np.pad(viz_imgs[0], ((0, max(H1 - H0, 0)), (0, 0), (0, 0)), 'constant', constant_values=0) + img1 = np.pad(viz_imgs[1], ((0, max(H0 - H1, 0)), (0, 0), (0, 0)), 'constant', constant_values=0) + img = np.concatenate((img0, img1), axis=1) + pl.figure() + pl.imshow(img) + cmap = pl.get_cmap('jet') + for i in range(n_viz): + (x0, y0), (x1, y1) = viz_matches_im0[i].T, viz_matches_im1[i].T + pl.plot([x0, x1 + W0], [y0, y1], '-+', color=cmap(i / (n_viz - 1)), scalex=False, scaley=False) + # pl.show(block=True) + pl.savefig('./plot_image.png') \ No newline at end of file diff --git a/tools/filter_noise_cres.py 
b/tools/filter_noise_cres.py new file mode 100644 index 0000000000000000000000000000000000000000..0fbb838431ecdba78622ed965ebd28be53a0689a --- /dev/null +++ b/tools/filter_noise_cres.py @@ -0,0 +1,93 @@ +import os +import cv2 +import numpy as np +from glob import glob +from tqdm import tqdm +from PIL import Image +from multiprocessing import Pool + +root = './datasets/CREStereo_dataset' + +# Assume image1_list, image2_list, disp_list are your file path lists +image1_list = sorted(glob(os.path.join(root, "**/*_left.jpg"), recursive=True)) +image2_list = sorted(glob(os.path.join(root, "**/*_right.jpg"), recursive=True)) +disp_list = sorted(glob(os.path.join(root, "**/*_left.disp.png"), recursive=True)) +print("Original number of files:", len(image1_list)) + +# Create a set of scene names to ensure data in each list matches +image1_scenes = {os.path.basename(path).replace("_left.jpg", "") for path in image1_list} +image2_scenes = {os.path.basename(path).replace("_right.jpg", "") for path in image2_list} +disp_scenes = {os.path.basename(path).replace("_left.disp.png", "") for path in disp_list} + +# Find scene names that exist in all three lists +valid_scenes = image1_scenes & image2_scenes & disp_scenes + +# Filter out valid file paths +valid_image1_list = [path for path in image1_list if os.path.basename(path).replace("_left.jpg", "") in valid_scenes] +valid_image2_list = [path for path in image2_list if os.path.basename(path).replace("_right.jpg", "") in valid_scenes] +valid_disp_list = [path for path in disp_list if os.path.basename(path).replace("_left.disp.png", "") in valid_scenes] + +# Update the original lists +image1_list, image2_list, disp_list = valid_image1_list, valid_image2_list, valid_disp_list + +# Print the number of valid files +print("Number of valid files:", len(image1_list)) + +# Split the data and set the number of processes +num_processes = 50 + +# Use numpy.array_split to ensure relatively balanced chunk sizes +image1_chunks = np.array_split(image1_list, num_processes) +image2_chunks = np.array_split(image2_list, num_processes) +disp_chunks = np.array_split(disp_list, num_processes) + +def check_validity(img1_chunk, img2_chunk, disp_chunk): + valid_img1, valid_img2, valid_disp = [], [], [] + + for img1_path, img2_path, disp_path in zip(img1_chunk, img2_chunk, disp_chunk): + try: + img1 = Image.open(img1_path) + img2 = Image.open(img2_path) + disp = cv2.imread(disp_path, cv2.IMREAD_ANYDEPTH).astype(np.float32) / 64.0 + + img1 = np.array(img1).astype(np.uint8) + img2 = np.array(img2).astype(np.uint8) + disp = np.array(disp).astype(np.float32) + except Exception as err: + print(err) + print(f"Invalid file: {img1_path} {img2_path} {disp_path}") + continue + + if img1 is not None and img2 is not None and disp is not None: + valid_img1.append(img1_path) + valid_img2.append(img2_path) + valid_disp.append(disp_path) + + return valid_img1, valid_img2, valid_disp + +# Use a process pool to check file validity in parallel +with Pool(processes=num_processes) as pool: + # Show overall progress in the main process with tqdm + results = list(tqdm(pool.starmap(check_validity, zip(image1_chunks, image2_chunks, disp_chunks)), total=num_processes)) + +# Combine results +valid_image1_list = [img for result in results for img in result[0]] +valid_image2_list = [img for result in results for img in result[1]] +valid_disp_list = [img for result in results for img in result[2]] + +# Update the lists +image1_list, 
image2_list, disp_list = valid_image1_list, valid_image2_list, valid_disp_list + +print("Final number of valid files:", len(image1_list)) + +# Convert the lists to numpy arrays +image1_array = np.array(image1_list) +image2_array = np.array(image2_list) +disp_array = np.array(disp_list) + +# Save them as .npy files +np.save('image1_list.npy', image1_array) +np.save('image2_list.npy', image2_array) +np.save('disp_list.npy', disp_array) + +print("Data has been saved as .npy files") diff --git a/tools/filter_noise_nerfstereo.py b/tools/filter_noise_nerfstereo.py new file mode 100644 index 0000000000000000000000000000000000000000..7fe2bddc9931f55ae78aa899b7b492c03bbec497 --- /dev/null +++ b/tools/filter_noise_nerfstereo.py @@ -0,0 +1,138 @@ +import os +import re +import cv2 +import numpy as np +from glob import glob +from tqdm import tqdm +from PIL import Image +from multiprocessing import Pool + +root = './datasets/NerfStereo' + +# Assume image1_list, image2_list, disp_list are your file path lists +left_list = sorted(glob(os.path.join(root, "*/*/baseline_*/left/*.jpg"), recursive=True)) +image1_list = [] +vad_list = [] +for path in left_list: + match = re.search(r"(.*?/Q/)", path) + prefix = match.group(1) # prefix + suffix = os.path.basename(path) # file name + image1_list.append(f"{prefix}center/{suffix}") + suffix = suffix.replace(".jpg", ".png") + vad_list.append(f"{prefix}AO/{suffix}") +image2_list = sorted(glob(os.path.join(root, "*/*/baseline_*/right/*.jpg"), recursive=True)) +disp_list = sorted(glob(os.path.join(root, "*/*/baseline_*/disparity/*.png"), recursive=True)) +print("Original number of files:", len(image1_list), len(image2_list), len(disp_list), len(vad_list)) +# print(image1_list[10], image2_list[10], disp_list[10], vad_list[10], sep="\r\n") + + +# A function to remove duplicates while maintaining order +def remove_duplicates(paths): + seen = set() + unique_paths = [] + for path in paths: + basename = os.path.basename(path).replace(".jpg", "").replace(".png", "") + if basename not in seen: + unique_paths.append(path) + seen.add(basename) + return unique_paths + +# Path lists after removing duplicates +unique_image1_list = remove_duplicates(image1_list) +unique_image2_list = remove_duplicates(image2_list) +unique_disp_list = remove_duplicates(disp_list) +unique_vad_list = remove_duplicates(vad_list) + +# Create sets of scene names to ensure matching data in each list +image1_scenes = {os.path.basename(path).replace(".jpg", "") for path in unique_image1_list} +image2_scenes = {os.path.basename(path).replace(".jpg", "") for path in unique_image2_list} +disp_scenes = {os.path.basename(path).replace(".png", "") for path in unique_disp_list} +vad_scenes = {os.path.basename(path).replace(".png", "") for path in unique_vad_list} +# print(sorted(list(image1_scenes))[0], +# sorted(list(image2_scenes))[0], +# sorted(list(disp_scenes))[0], +# sorted(list(vad_scenes))[0]) + +# Find scene names that exist in all four lists +valid_scenes = image1_scenes & image2_scenes & disp_scenes & vad_scenes + +# Filter valid file paths +valid_image1_list = [path for path in image1_list if os.path.basename(path).replace(".jpg", "") in valid_scenes] +valid_image2_list = [path for path in image2_list if os.path.basename(path).replace(".jpg", "") in valid_scenes] +valid_disp_list = [path for path in disp_list if os.path.basename(path).replace(".png", "") in valid_scenes] +valid_vad_list = [path for path in vad_list if os.path.basename(path).replace(".png", "") in valid_scenes] + +# Update the original 
lists +image1_list, image2_list, disp_list, vad_list = valid_image1_list, valid_image2_list, valid_disp_list, valid_vad_list + +# Print the number of valid files +print("Number of valid files:", len(image1_list), len(image2_list), len(disp_list), len(vad_list)) + +# print(image1_list[10], image2_list[10], disp_list[10], vad_list[10], sep="\r\n") + + +# Split the data and set the number of processes +num_processes = 50 + +# Use numpy.array_split to ensure relatively balanced chunk sizes +image1_chunks = np.array_split(image1_list, num_processes) +image2_chunks = np.array_split(image2_list, num_processes) +disp_chunks = np.array_split(disp_list, num_processes) +vad_chunks = np.array_split(vad_list, num_processes) + +def check_validity(img1_chunk, img2_chunk, disp_chunk, vad_chunks): + valid_img1, valid_img2, valid_disp, valid_vad = [], [], [], [] + + for img1_path, img2_path, disp_path, vad_path in zip(img1_chunk, img2_chunk, disp_chunk, vad_chunks): + try: + img1 = Image.open(img1_path) + img2 = Image.open(img2_path) + disp = cv2.imread(disp_path, cv2.IMREAD_ANYDEPTH).astype(np.float32) / 64.0 + vad = cv2.imread(vad_path, cv2.IMREAD_ANYDEPTH).astype(np.float32) / 65535 + + img1 = np.array(img1).astype(np.uint8) + img2 = np.array(img2).astype(np.uint8) + disp = np.array(disp).astype(np.float32) + vad = np.array(vad).astype(np.float32) + except Exception as err: + print(err) + print(f"Invalid file: {img1_path} {img2_path} {disp_path} {vad_path}") + continue + + if img1 is not None and img2 is not None and disp is not None and vad is not None: + if img1.shape[:2] == img2.shape[:2] and img2.shape[:2] == disp.shape[:2] and disp.shape[:2] == vad.shape[:2]: + valid_img1.append(img1_path) + valid_img2.append(img2_path) + valid_disp.append(disp_path) + valid_vad.append(vad_path) + + return valid_img1, valid_img2, valid_disp, valid_vad + +# Use a process pool to check file validity in parallel +with Pool(processes=num_processes) as pool: + # Show overall progress in the main process with tqdm + results = list(tqdm(pool.starmap(check_validity, zip(image1_chunks, image2_chunks, disp_chunks, vad_chunks)), total=num_processes)) + +# Combine results +valid_image1_list = [img for result in results for img in result[0]] +valid_image2_list = [img for result in results for img in result[1]] +valid_disp_list = [img for result in results for img in result[2]] +valid_vad_list = [img for result in results for img in result[3]] + +# Update the lists +image1_list, image2_list, disp_list, vad_list = valid_image1_list, valid_image2_list, valid_disp_list, valid_vad_list + +print("Final number of valid files:", len(image1_list), len(image2_list), len(disp_list), len(vad_list)) + +# Convert the lists to numpy arrays +image1_array = np.array(image1_list) +image2_array = np.array(image2_list) +disp_array = np.array(disp_list) +vad_list = np.array(vad_list) + +# Save them as .npy files +np.save('image1_list.npy', image1_array) +np.save('image2_list.npy', image2_array) +np.save('disp_list.npy', disp_array) +np.save('vad_list.npy', vad_list) + +print("Data has been saved as .npy files") diff --git a/tools/get_statistics.py b/tools/get_statistics.py new file mode 100644 index 0000000000000000000000000000000000000000..be7a2692e7b73c50b17099802741b0b917734253 --- /dev/null +++ b/tools/get_statistics.py @@ -0,0 +1,32 @@ +import pandas as pd +import argparse + +def calculate_metrics(file_path, start_row, end_row): + # Read the Excel file + df = pd.read_excel(file_path) + + 
# Select the rows within the given range (note: start_row and end_row are 1-based row numbers) + data = df.iloc[start_row-1:end_row] + + # Calculate mean and standard deviation + epe_mean = data['middleburyH-epe'].mean() + epe_std = data['middleburyH-epe'].std() + d1_mean = data['middleburyH-d1'].mean() + d1_std = data['middleburyH-d1'].std() + + # Print the results + print(f"middleburyH-epe Mean: {epe_mean:.2f}, Standard Deviation: {epe_std:.2f}") + print(f"middleburyH-d1 Mean: {d1_mean:.2f}, Standard Deviation: {d1_std:.2f}") + print(f"{epe_mean:.2f}+-{epe_std:.2f} {d1_mean:.2f}+-{d1_std:.2f}") + +if __name__ == "__main__": + # Set up command-line arguments + parser = argparse.ArgumentParser(description="Calculate metrics for specified rows in an Excel file.") + parser.add_argument("file_path", type=str, help="Path to the Excel file") + parser.add_argument("start_row", type=int, help="Start row (1-based index)") + parser.add_argument("end_row", type=int, help="End row (1-based index)") + + args = parser.parse_args() + + # Call the function and pass the parameters + calculate_metrics(args.file_path, args.start_row, args.end_row) diff --git a/tools/print_limited_tree.sh b/tools/print_limited_tree.sh new file mode 100644 index 0000000000000000000000000000000000000000..5c779eca0789e4a01e4c78ada0943c477291206b --- /dev/null +++ b/tools/print_limited_tree.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +if [ -z "$1" ]; then + echo "Usage: $0 /path/to/folder" + exit 1 +fi + +folder="$1" + +find "$folder" -type l -exec sh -c ' + echo "Link: {} -> $(readlink -f {})" + readlink -f {} | xargs -I{} tree -L 2 {} | head -n 15 +' \; diff --git a/tools/regitser.py b/tools/regitser.py new file mode 100644 index 0000000000000000000000000000000000000000..52049fea96fa55439e9db2424f4c6fa0a5993db6 --- /dev/null +++ b/tools/regitser.py @@ -0,0 +1,83 @@ +import numpy as np +import random +import os +import sys +import subprocess +import cv2 +from sklearn.linear_model import RANSACRegressor, LinearRegression + +sys.path.insert(0,'core') +sys.path.insert(0,'core/utils') + +from core.utils import frame_utils +from core.utils import vis + +gt_path = "/home/yao/Document/GeneralizedStereoMatching/datasets/Middlebury/MiddEval3/trainingF/Jadeplant/disp0GT.pfm" +pse_gt_path = "/data5/yao/runs/vis/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/middlebury_F-50000itr-RefineSigmoidPreMonoBatch48ConfDim/middlebury_f/MiddEval3/trainingF/Jadeplant/disp0GT-pred.pfm" +pred_path = "/data5/yao/tmp/Jadeplant-depthany.png" + + +gt, valid = frame_utils.readDispMiddlebury(gt_path) +pse_gt = frame_utils.readPFM(pse_gt_path) +
+pred = cv2.imread(pred_path, cv2.IMREAD_ANYDEPTH) +print(gt.shape, pred.shape, pse_gt.shape, gt.max()) + + +class Args: + def __init__(self, dataset): + self.dataset = dataset +args = Args(dataset="middlebury_F") +viser = vis.Visualizer(root="/data5/yao/tmp/register", sv_root="/data5/yao/tmp/register", dataset="middlebury", scratch=True, args=args, logger=None) + + +Y = gt[valid].reshape((-1,1)) +X = pred[valid].reshape((-1,1)) +X = np.hstack([X, np.ones_like(X)]) +print(X.shape, Y.shape, X.max(), Y.max()) + + +model = LinearRegression(fit_intercept=False) +model.fit(X, Y) + + +a, b = model.coef_[0][0], model.coef_[0][1] +reg_pred = pred * a + b +epe = np.abs(reg_pred[valid]-gt[valid]).mean() +bad3 = (np.abs(reg_pred[valid]-gt[valid])>3).mean() +print(f"{a}, {b}: ", epe, bad3) + +# vis1 = [{"name": "GT Disp", "img_list": [gt], "cmap": "jet"}, +# {"name": "PSE GT Disp", "img_list": [pse_gt], "cmap": "jet"}, +# {"name": "Pred 
Disp", "img_list": [pred], "cmap": "jet"}, +# {"name": "Reg Disp", +# "img_list": [reg_pred], +# "cmap": "jet", +# "GT": [gt], +# "error_map": True,}, +# ] +# viser.analyze(vis1, os.path.basename(gt_path), in_one_fig=True) + + + +ransac = RANSACRegressor(estimator=LinearRegression(), max_trials=1000, min_samples=500, residual_threshold=1.0) +ransac.fit(X, Y) + + +a = ransac.estimator_.coef_[0][0] +b = ransac.estimator_.intercept_ +reg_pred = pred * a + b +epe = np.abs(reg_pred[valid]-gt[valid]).mean() +bad3 = (epe>3).mean() +print(f"{a}, {b}: ", epe, bad3) + +# vis1 = [{"name": "GT Disp", "img_list": [gt], "cmap": "jet"}, +# {"name": "PSE GT Disp", "img_list": [pse_gt], "cmap": "jet"}, +# {"name": "Pred Disp", "img_list": [pred], "cmap": "jet"}, +# {"name": "Reg Disp", +# "img_list": [reg_pred], +# "cmap": "jet", +# "GT": [gt], +# "error_map": True,}, +# ] +# viser.analyze(vis1, os.path.basename(gt_path), in_one_fig=True) \ No newline at end of file diff --git a/tools/split_vis_image.py b/tools/split_vis_image.py new file mode 100644 index 0000000000000000000000000000000000000000..9f2cca6ec315714a4fe498a420ce4750d661614d --- /dev/null +++ b/tools/split_vis_image.py @@ -0,0 +1,195 @@ +import cv2 +import numpy as np +import os +from multiprocessing import Pool + + + +def pad_to_match_height(image, target_height, color=(255, 255, 255)): + h, w, c = image.shape + if h >= target_height: + return image # No padding needed + top_pad = 0 + bottom_pad = target_height - h - top_pad + return cv2.copyMakeBorder(image, top_pad, bottom_pad, 0, 0, cv2.BORDER_CONSTANT, value=color) + +def compress(img, resize_factor, path): + if image_path.find("kitti")!=-1: + return img + if image_path.find("eth3d")!=-1: + return img + if image_path.find("middlebury")!=-1: + return np.hstack([img[:, 100:650, :], + img[:, 850:1400, :], + img[:, 1600:2150, :]]) + elif image_path.find("booster")!=-1: + return np.hstack([img[:, int(590 * resize_factor):int(2550 * resize_factor), :], + img[:, int(3500 * resize_factor):int(5550 * resize_factor), :], + img[:, int(6450 * resize_factor):int(8450 * resize_factor), :]]) + else: + raise Exception("Not supported: ", path) + +def split_image_with_custom_grouping(image_path, output_dir, num_rows, resize_factor=None, compression_params=None, gap=10, destination_path=None): + # Read the image + img = cv2.imread(image_path) + if img is None: + raise ValueError(f"Cannot read the image from {image_path}") + + if resize_factor is not None: + img = cv2.resize(img, (0, 0), fx=resize_factor, fy=resize_factor, interpolation=cv2.INTER_AREA) + # cv2.imwrite(os.path.join(output_dir, "resized_image.png"), img) + + # Get the height and width of the image + height, width, _ = img.shape + + # Determine the height of each row + row_height = height // num_rows + + # Create the output directory + os.makedirs(output_dir, exist_ok=True) + + # Combine the first five rows + first_five_height = 5 * row_height + first_five_rows = img[:first_five_height, :, :] + if image_path.find("kitti")!=-1: + group = [] + for i in range(0, 5): + upper = i * row_height + 50 + lower = (i + 1) * row_height - 40 + group.append(img[upper:lower, :, :]) + first_five_rows = np.vstack(group) + compressed_first_five_rows = compress(first_five_rows, resize_factor, image_path) + # cv2.imwrite(os.path.join(output_dir, "first_five_rows.jpg"), compressed_first_five_rows, compression_params) + + # Initialize groups + group_0 = [] # Rows divisible by 3 + group_1 = [] # Rows with remainder 1 when divided by 3 + group_2 = [] # Rows with 
remainder 2 when divided by 3 + group_3 = [] # Additional rows with remainder 2 + + upper_shift, lower_shit = 0, 0 + if image_path.find("kitti")!=-1: + upper_shift = 50 + lower_shit = -40 + + # Process the remaining rows + for i in range(5, num_rows): + if i < 29: + upper = i * row_height + upper_shift + lower = (i + 1) * row_height + lower_shit if i < num_rows - 1 else height + else: + upper = 29 * row_height + (i - 29) * (row_height - 1) + upper_shift + lower = 29 * row_height + (i - 28) * (row_height - 1) + lower_shit if i < num_rows - 1 else height + + row_img = img[upper:lower, :, :] + + compressed_row_img = compress(row_img, resize_factor, image_path) + + if i < num_rows - 2: + # Group rows based on modulo 3 + if (i - 5) % 3 == 0: + group_0.append(compressed_row_img) + elif (i - 5) % 3 == 1: + group_1.append(compressed_row_img) + else: + group_2.append(compressed_row_img) + else: + group_3.append(compressed_row_img) + + # Concatenate and save each group + if group_0: + group_0_img = np.vstack(group_0) # Concatenate vertically + # cv2.imwrite(os.path.join(output_dir, "group_0.jpg"), group_0_img, compression_params) + + if group_1: + group_1_img = np.vstack(group_1) + # cv2.imwrite(os.path.join(output_dir, "group_1.jpg"), group_1_img, compression_params) + + if group_2: + group_2_img = np.vstack(group_2) + # cv2.imwrite(os.path.join(output_dir, "group_2.jpg"), group_2_img, compression_params) + + if group_3: + group_3_img = np.vstack(group_3) + # cv2.imwrite(os.path.join(output_dir, "group_3.jpg"), group_3_img, compression_params) + + + if destination_path is not None: + group_images = [] + group_images.append(np.vstack(group_1)) + group_images.append(np.vstack(group_0)) + group_images.append(np.vstack(group_2)) + + # Create the gap as a black (zeros) image + gap_img = np.ones((gap, group_3_img.shape[1], 3), dtype=np.uint8) * 255 + left_img = np.vstack([compressed_first_five_rows, gap_img, group_3_img]) + + # Find the maximum height among all images + max_height = max(left_img.shape[0], *(img.shape[0] for img in group_images)) + + # Pad the first image to match the maximum height + padded_image1 = pad_to_match_height(left_img, max_height) + padded_images = [pad_to_match_height(img, max_height) for img in group_images] + + gap_img = np.ones((max_height, gap, 3), dtype=np.uint8) * 255 + concatenated_img = padded_image1 + for img in padded_images: + concatenated_img = np.hstack([concatenated_img, gap_img, img]) + + # Save the result + cv2.imwrite(destination_path.replace("png","jpg"), concatenated_img, compression_params) + + +def process_file(args): + source_path, output_dir, num_rows, resize_factor, compression_params, gap, destination_path = args + split_image_with_custom_grouping(source_path, output_dir, num_rows, resize_factor, compression_params, gap, destination_path) + print(f"Processed {source_path} to {destination_path}") + + +def process(input_dir, output_dir, num_rows, resize_factor=None, compression_params=None, gap = 10): + # Ensure the output directory exists + os.makedirs(output_dir, exist_ok=True) + + tasks = [] + + # Walk through the directory tree + for root, dirs, files in os.walk(input_dir): + for file in files: + # Check if the file is an image based on extension + if file.lower().endswith(('.png',)): + # Determine the relative path of the current directory + relative_path = os.path.relpath(root, input_dir) + + # Create the corresponding output subdirectory + output_subdir = os.path.join(output_dir, relative_path) + os.makedirs(output_subdir, exist_ok=True) + + # 
Copy the file to the output directory + source_path = os.path.join(root, file) + destination_path = os.path.join(output_subdir, file.replace(".png", ".jpg")) + + tasks.append((source_path, output_dir, num_rows, resize_factor, compression_params, gap, destination_path)) + + # split_image_with_custom_grouping(source_path, output_dir, num_rows, resize_factor, compression_params, gap, destination_path) + # print(f"process {source_path} to {destination_path}") + + with Pool(processes=(os.cpu_count())//2) as pool: # Use as many processes as available CPU cores + pool.map(process_file, tasks) + + +image_path = "/data5/yao/runs/vis/inter_results/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/SUPP/analysis/booster/train-balanced-Case-disp_00.png" +# image_path = "/data5/yao/runs/vis/inter_results/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/SUPP/analysis/eth3d/two_view_training-delivery_area_1l-disp0GT.png" +# image_path = "/data5/yao/runs/vis/inter_results/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/SUPP/analysis/kitti/training-disp_occ_0-000000_10.png" +# image_path = "/data5/yao/runs/vis/inter_results/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/SUPP/analysis/middlebury_h/MiddEval3-trainingH-Adirondack-disp0GT.png" +output_dir = "./output" +num_rows = 43 +resize_factor = 0.25 +compression_params = [cv2.IMWRITE_JPEG_QUALITY, 75] +destination_path = "./output/res.png" + +# split_image_with_custom_grouping(image_path, output_dir, num_rows, resize_factor, compression_params, 10, destination_path) + +input_dir = "/data5/yao/runs/vis/inter_results/RaftStereoDepthBetaK53DispRefineSigmoidPreMonoBatch48ConfDim_20241102_014050/SUPP/analysis/" +output_dir = "./output" +process(input_dir, output_dir, num_rows, resize_factor, compression_params, gap=10) + diff --git a/tools/vis_cre.py b/tools/vis_cre.py new file mode 100644 index 0000000000000000000000000000000000000000..3e590993bec6d9279affec7adb29897b1ce47fd5 --- /dev/null +++ b/tools/vis_cre.py @@ -0,0 +1,42 @@ +import numpy as np +import random +import os +import subprocess + +# Define the root path where the .npy files are stored +txt_root = './datasets/CREStereo_dataset' +image1_path = os.path.join(txt_root, "image1_list.npy") +image2_path = os.path.join(txt_root, "image2_list.npy") +disp_path = os.path.join(txt_root, "disp_list.npy") + +# Load file paths from the .npy files +image1_list = np.load(image1_path) +image2_list = np.load(image2_path) +disp_list = np.load(disp_path) + +# Set the number of random samples to select +num_samples = 10 # Change this number as needed +selected_indices = random.sample(range(len(image1_list)), num_samples) + +# Collect paths of selected files +selected_files = [] +for i in selected_indices: + selected_files.extend([image1_list[i], image2_list[i], disp_list[i]]) + # print(selected_files[-1]) + +# Upload selected files to cloud storage +# Ensure 'rclone' remote storage is configured, e.g., 'my_remote' +remote_path = "alist:/xunlei_private/Vis/CREStereo" + +for file_path in selected_files: + # Extract the parent directory name and the file name + parent_dir = os.path.basename(os.path.dirname(file_path)) + file_name = os.path.basename(file_path) + + # Create a new file name by concatenating parent directory name and file name + new_file_name = f"{parent_dir}-{file_name}" + print("copy {} to {}".format(file_path, f"{remote_path}/{new_file_name}")) + + # Upload to cloud with the new name + 
subprocess.run(["rclone", "copyto", file_path, f"{remote_path}/{new_file_name}"]) +