
Commit 203d866

Merge pull request #104 from openclimatefix/docs/pvnet-instructions (Docs/pvnet instructions)
2 parents 89cd8e0 + 07569f7

7 files changed: 127 additions & 6 deletions

File tree:
.gitignore
docs/getting_started.md
run.py
src/open_data_pvnet/configs/PVNet_configs/config.yaml
src/open_data_pvnet/configs/PVNet_configs/datamodule/premade_batches.yaml
src/open_data_pvnet/configs/PVNet_configs/datamodule/streamed_batches.yaml
src/open_data_pvnet/configs/PVNet_configs/logger/wandb.yaml

.gitignore

Lines changed: 4 additions & 1 deletion

@@ -102,5 +102,8 @@ data/
 # custom
 config_tree.txt
 GFS_samples/
+GFS_TEST_RUN/
 PLACEHOLDER/
-*.zarr
+*.zarr
+example_configuration.yaml # config file for samples adjusted for local paths
+experiment_config.yaml

docs/getting_started.md

Lines changed: 47 additions & 0 deletions

@@ -20,6 +20,7 @@ Welcome to the Solar Forecasting project! This document will introduce you to th
 15. [How This Project Fits into Renewable Energy](#how-this-project-fits-into-renewable-energy)
 16. [Development and Testing Guide](#development-and-testing-guide)
 17. [Command Line Interface (CLI)](#command-line-interface-cli)
+18. [Running PVNet Model](#running-pvnet-model)
 
 ---
 
@@ -722,6 +723,52 @@ Common error messages and their solutions:
 - "Error loading dataset": Verify your internet connection and credentials
 - "Invalid chunks specification": Ensure chunk string follows the format "dim1:size1,dim2:size2"
 
+
+## Running PVNet Model
+
+1. Update the configuration file
+   Go to src/open_data_pvnet/configs/PVNet_configs/datamodule/streamed_batches.yaml
+
+   Change these values if desired (increase at your discretion):
+   num_train_samples: 5
+   num_val_samples: 5
+
+2. Update src/open_data_pvnet/configs/PVNet_configs/datamodule/premade_batches.yaml
+   Change this line to configuration: <your_directory...open-data-pvnet/src/open_data_pvnet/configs/PVNet_configs/datamodule/configuration/example_configuration.yaml>
+
+3. Update src/open_data_pvnet/configs/PVNet_configs/config.yaml
+   Change the line to - datamodule: premade_batches.yaml
+
+4. Open a Weights & Biases account at https://wandb.ai/
+   Go to src/open_data_pvnet/configs/PVNet_configs/logger/wandb.yaml
+   Change to project: "GFS_TEST_RUN"
+   Change to save_dir: "GFS_TEST_RUN"
+
+5. Run the samples
+   We recommend you save the samples locally for faster processing.
+   In your main open-data-pvnet directory, run the following commands (assumes the AWS CLI is installed locally):
+   aws s3 sync s3://ocf-open-data-pvnet/data/gfs/v4/2023.zarr/ ./gfs_2023.zarr --no-sign-request
+   aws s3 sync s3://ocf-open-data-pvnet/data/uk/pvlive/v2/combined_2023_gsp.zarr ./gsp_2023.zarr --no-sign-request
+   Change the example_configuration.yaml `zarr_path` attributes to the local paths you made above.
+   Comment out both of these lines:
+   `public: True` # If you are going to use the actual s3 buckets then leave this alone; however, it may be really slow
+   In streamed_batches.yaml change this line:
+   `configuration: null` to the actual path of your example_configuration.yaml file
+
+   # If running in a virtual environment, be sure to activate it: `source ./venv/bin/activate`
+   `rm -rf GFS_samples PLACEHOLDER` # to remove previous sample runs
+   `python src/open_data_pvnet/scripts/save_samples.py`
+
+6. Run the training
+   Go to config.yaml and change this line
+   `- datamodule: streamed_batches.yaml` to `- datamodule: premade_batches.yaml`
+   `python run.py`
+
 ---
 
 Thank you for joining us on this journey to advance solar forecasting and renewable energy solutions!
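Step 5 of the new guide has you retarget every `zarr_path` in example_configuration.yaml at the locally synced stores. A minimal sketch of doing that substitution programmatically on a loaded config; the helper name, the config shape, and the prefix-to-path mapping are illustrative assumptions, not part of the repo:

```python
def localize_zarr_paths(node, mapping):
    """Recursively replace any 'zarr_path' value that matches a mapped
    s3 prefix with its local equivalent. Mutates and returns the node."""
    if isinstance(node, dict):
        for key, value in node.items():
            if key == "zarr_path" and isinstance(value, str):
                for s3_prefix, local in mapping.items():
                    if value.startswith(s3_prefix):
                        node[key] = local
            else:
                localize_zarr_paths(value, mapping)
    elif isinstance(node, list):
        for item in node:
            localize_zarr_paths(item, mapping)
    return node


# Illustrative config shape; the real example_configuration.yaml layout may differ.
config = {
    "input_data": {
        "nwp": {"gfs": {"zarr_path": "s3://ocf-open-data-pvnet/data/gfs/v4/2023.zarr"}},
        "gsp": {"zarr_path": "s3://ocf-open-data-pvnet/data/uk/pvlive/v2/combined_2023_gsp.zarr"},
    }
}
mapping = {
    "s3://ocf-open-data-pvnet/data/gfs/v4/2023.zarr": "./gfs_2023.zarr",
    "s3://ocf-open-data-pvnet/data/uk/pvlive/v2/combined_2023_gsp.zarr": "./gsp_2023.zarr",
}
localize_zarr_paths(config, mapping)
print(config["input_data"]["gsp"]["zarr_path"])  # → ./gsp_2023.zarr
```

The same local paths then need to be reflected wherever the configuration is referenced (streamed_batches.yaml's `configuration:` key).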

run.py

Lines changed: 70 additions & 0 deletions

@@ -0,0 +1,70 @@
+"""Run training
+"""
+
+import os
+
+import torch
+
+try:
+    torch.multiprocessing.set_start_method("spawn")
+    import torch.multiprocessing as mp
+
+    mp.set_start_method("spawn")
+except RuntimeError:
+    pass
+
+import logging
+import sys
+
+# Tired of seeing these warnings
+import warnings
+from datetime import datetime
+
+import hydra
+from omegaconf import DictConfig
+from sqlalchemy import exc as sa_exc
+
+warnings.filterwarnings("ignore", category=sa_exc.SAWarning)
+
+logging.basicConfig(stream=sys.stdout, level=logging.ERROR)
+
+os.environ["HYDRA_FULL_ERROR"] = "1"
+
+if "WANDB_RUN_ID" not in os.environ:
+    os.environ["WANDB_RUN_ID"] = datetime.now().strftime("%y%m%d%H%M%S")
+
+
+# this file can be run for example using
+# python run.py experiment=example_simple
+
+
+@hydra.main(
+    config_path="src/open_data_pvnet/configs/PVNet_configs",
+    config_name="config.yaml",
+    version_base="1.2",
+)
+def main(config: DictConfig):
+    """Runs training"""
+    # Imports should be nested inside @hydra.main to optimize tab completion
+    # Read more here: https://github.com/facebookresearch/hydra/issues/934
+    from pvnet.training import train
+    from pvnet.utils import extras, print_config
+
+    # A couple of optional utilities:
+    # - disabling python warnings
+    # - easier access to debug mode
+    # - forcing debug friendly configuration
+    # - forcing multi-gpu friendly configuration
+    # You can safely get rid of this line if you don't want those
+    extras(config)
+
+    # Pretty print config using Rich library
+    if config.get("print_config"):
+        print_config(config, resolve=True)
+
+    # Train model
+    return train(config)
+
+
+if __name__ == "__main__":
+    main()
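The new run.py seeds WANDB_RUN_ID with a timestamp only when the variable is unset, so a caller-supplied id (used to resume a run) survives. The same guard can be exercised against a plain dict; the helper below is an illustrative rephrasing, not a function in the repo:

```python
from datetime import datetime


def default_run_id(env):
    # Mirrors the guard in run.py: set a timestamp id only when absent.
    if "WANDB_RUN_ID" not in env:
        env["WANDB_RUN_ID"] = datetime.now().strftime("%y%m%d%H%M%S")
    return env["WANDB_RUN_ID"]


env = {}
run_id = default_run_id(env)
print(run_id)  # 12-digit yymmddHHMMSS timestamp

# An id supplied by the caller (e.g. to resume a run) is left untouched.
assert default_run_id({"WANDB_RUN_ID": "resume-me"}) == "resume-me"
```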

src/open_data_pvnet/configs/PVNet_configs/config.yaml

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@ defaults:
 - _self_
 - trainer: default.yaml
 - model: multimodal.yaml
-- datamodule: streamed_batches.yaml
+- datamodule: streamed_batches.yaml #
 - callbacks: default.yaml # set this to null if you don't want to use callbacks
 # - logger: null
 - logger: wandb.yaml # set logger here or use command line (e.g. `python run.py logger=wandb`)

src/open_data_pvnet/configs/PVNet_configs/datamodule/premade_batches.yaml

Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@ configuration: null
 # The sample_dir is the location batches were saved to using the save_batches.py script
 # The sample_dir should contain train and val subdirectories with batches
 
-sample_output_dir: "GFS_samples"
+sample_dir: "GFS_samples"
 num_workers: 8
 prefetch_factor: 2
 batch_size: 8

src/open_data_pvnet/configs/PVNet_configs/datamodule/streamed_batches.yaml

Lines changed: 2 additions & 2 deletions

@@ -8,8 +8,8 @@ prefetch_factor: 2
 batch_size: 8
 
 sample_output_dir: "GFS_samples"
-num_train_samples: 1000
-num_val_samples: 1000
+num_train_samples: 5 #1000 Increase at your discretion
+num_val_samples: 5 #1000 Increase at your discretion
 
 train_period:
 - null

src/open_data_pvnet/configs/PVNet_configs/logger/wandb.yaml

Lines changed: 2 additions & 1 deletion

@@ -3,12 +3,13 @@
 wandb:
   _target_: lightning.pytorch.loggers.wandb.WandbLogger
   # wandb project to log to
-  project: "PLACEHOLDER"
+  project: "GFS_TEST_RUN"
   name: "${model_name}"
   # location to store the wandb local logs
   save_dir: "PLACEHOLDER"
   offline: False # set True to store all logs only locally
   id: null # pass correct id to resume experiment!
+  id: "${oc.env:WANDB_RUN_ID}"
   # entity: "" # set to name of your wandb team or just remove it
   log_model: True
   prefix: ""
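Note that this diff adds a second `id` key while keeping `id: null`. PyYAML-style loaders typically keep the last occurrence (strict parsers may reject duplicate keys outright), so under that assumption the effective logger block after this change looks roughly like the following sketch; the resolved values are inferred, not part of the commit:

```yaml
# Effective logger config, assuming a last-key-wins YAML loader.
wandb:
  _target_: lightning.pytorch.loggers.wandb.WandbLogger
  project: "GFS_TEST_RUN"
  save_dir: "PLACEHOLDER"       # still a placeholder; step 4 of the guide sets it to "GFS_TEST_RUN"
  id: "${oc.env:WANDB_RUN_ID}"  # resolved from the env var that run.py seeds with a timestamp
```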
