Source code for satellitetools.common.timeseries

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Functions for timeseries data handling.

@author: Olli Nevalainen (Finnish Meteorological Institute)

"""
import logging
from typing import List, Union

try:
    # breaking change introduced in python 3.11
    from enum import StrEnum
except ImportError:
    from enum import Enum

[docs] class StrEnum(str, Enum): pass
import numpy as np import pandas as pd import xarray as xr from satellitetools.biophys.biophys import ( SNAP_BIO_RMSE, BiophysVariable, VegetationIndex, )
[docs] logger = logging.getLogger(__name__)
[docs] class ConfidenceLevel(StrEnum): """Confidence level for calculating the confidence interval bounds."""
[docs] C90 = "90"
[docs] C95 = "95"
[docs] C99 = "99"
[docs] def xr_dataset_to_timeseries( xr_dataset: xr.Dataset, variables: List[Union[BiophysVariable, VegetationIndex]], add_uncertainty: bool = False, add_confidence_intervals: bool = False, confidence_level: ConfidenceLevel = ConfidenceLevel.C95, ) -> pd.DataFrame: """Compute timeseries dataframe from xarray dataset. Parameters ---------- xr_dataset : xr.Dataset xarray dataset with Sentinel-2 data. variables : List[BiophysVariable] List of variables to compute timeseries for. add_uncertainty : bool, default False Adds variable {variable}_uncertainty and confidence intervals to dataframe. Currently, uncertainty is equal to standar error (se) or if variable is biophysical variable from biophys_xarray, it sqrt(se^2 + RMSE_mean^2) where RMSE_mean is propagated uncertainty for the individual observations/pixels uncertainties. Uncertainty for the individual pixels is considered to be the variable RMSE from the SNAP biophysical processor developers (see biophys_xarray.py and linked ATBD) (i.e. same for all pixels). confidence_level : ConfidenceLevel, default ConfidenceLevel.C95 Confidence level (%) for calculating the confidence interval bounds. Options "90", "95" & "99" Returns ------- df : pandas dataframe Pandas dataframe with mean, std, se and percentage of NaNs inside AOI. """ df = pd.DataFrame(index=pd.to_datetime(xr_dataset.time.values)) for var in variables: # nans occure due to missging data from 1D to 2D array # (pixels outside the polygon), # from snap algorihtm nans occure due to input/output ouf of bounds # checking. # TODO: flaggging with snap biophys algorith or some other solution to # check which nan are from snap algorithm and which from 1d to 2d transformation nans = np.isnan(xr_dataset[var]).sum(dim=["x", "y"]) sample_n = len(xr_dataset[var].x) * len(xr_dataset[var].y) - nans # Filter dates with all pixels equal to nan (i.e. sample n = 0) non_zero_n = np.where(sample_n != 0)[0] xr_dataset = xr_dataset.isel(time=non_zero_n) sample_n = sample_n.isel(time=non_zero_n) nans = nans.isel(time=non_zero_n) # Drop also from df df = df.iloc[non_zero_n] # adjust n if data is resampled if var in SNAP_BIO_RMSE: # if data is upsampled, take this into account in # uncertainty (n "artificially increased") sample_n = _adjust_sample_size(xr_dataset, sample_n) df[var] = xr_dataset[var].mean(dim=["x", "y"]) df[var + "_F050"] = xr_dataset[var].median(dim=["x", "y"]) df[var + "_std"] = xr_dataset[var].std(dim=["x", "y"]) df[var + "_se"] = df[var + "_std"] / np.sqrt(sample_n) if add_uncertainty or add_confidence_intervals: df = compute_uncertainty(df, xr_dataset, var, sample_n) if add_confidence_intervals: df = compute_confidence_intervals( df, var, confidence_level=confidence_level ) if hasattr(xr_dataset, "aoi_pixels"): # compute how many of the nans are inside aoi (due to snap algorithm) out_of_aoi_pixels = ( len(xr_dataset[var].x) * len(xr_dataset[var].y) - xr_dataset.aoi_pixels ) nans_inside_aoi = nans - out_of_aoi_pixels df[var + "_aoi_nan_percentage"] = nans_inside_aoi / xr_dataset.aoi_pixels return df
[docs] def _adjust_sample_size(xr_dataset: xr.Dataset, sample_n: float) -> float: """Get sample size for uncertainty calculation. Parameters ---------- xr_dataset : xr.Dataset xarray dataset with Sentinel-2 data. sample_n : float Original sample size. Returns ------- sample_n : float Adjusted sample size. """ # 20 = 20 m which is the SNAP_BIO function standard resolution try: resampling_ratio = 20 / np.abs(xr_dataset.x[1] - xr_dataset.x[0]) except IndexError: try: resampling_ratio = 20 / np.abs(xr_dataset.y[1] - xr_dataset.y[0]) except IndexError: # Assume that the data is not resampled logger.warning( "Could not determine resampling ratio, assuming no resampling" ) resampling_ratio = 1 pixel_multiplier = ( resampling_ratio**2 ) # doubling resolution quadruples amount of pixels etc. if resampling_ratio > 1: sample_n = sample_n / pixel_multiplier sample_n[sample_n < 1] = ( 1 # with small areas the adjusted sample_n might be < 1 ) return sample_n
[docs] def propagate_rmse(n: int, rmse: float) -> float: """Propagate RMSE for the mean value. Parameters ---------- n : int Sample size. rmse : float Root mean square error. Returns ------- propagated_rmse : float Propagated RMSE for the mean value. """ propagated_rmse = np.sqrt(np.sum([rmse**2] * int(n))) / n return propagated_rmse
[docs] def compute_uncertainty( df: pd.DataFrame, xr_dataset: xr.Dataset, var: BiophysVariable, sample_n: float ) -> pd.DataFrame: """Compute uncertainty for the variable. Parameters ---------- df : pd.DataFrame Dataframe with the variable. xr_dataset : xr.Dataset xarray dataset with Sentinel-2 data. var : BiophysVariable Biophysical variable. sample_n : float Sample size for uncertainty calculation. Returns ------- df : pd.DataFrame Dataframe with the variable and uncertainty. """ if var in SNAP_BIO_RMSE: sample_n = _adjust_sample_size(xr_dataset, sample_n) # propagated RMSE for the # mean value (RMSE as uncertainty for individual observations/pixels) rmse_mean = xr.apply_ufunc( propagate_rmse, sample_n, kwargs={"rmse": SNAP_BIO_RMSE[var]}, vectorize=True, ) # rmse_means = np.sqrt(np.sum([SNAP_BIO_RMSE[var] ** 2 ] * n)) / n # if data is upsampled, take this into account in # uncertainty (n "artificially increased") # 20 = 20 m which is the SNAP_BIO function standard resolution # resampling_ratio = 20 / np.abs(xr_dataset.x[1] - xr_dataset.x[0]) # pixel_multiplier = ( # resampling_ratio ** 2 # ) # doubling resolution quadruples amount of pixels etc. # if resampling_ratio > 1: # rmse_mean = rmse_mean * pixel_multiplier / np.sqrt(pixel_multiplier) df[var + "_uncertainty"] = np.sqrt(df[var + "_std"] ** 2 + rmse_mean**2) else: df[var + "_uncertainty"] = df[var + "_std"] return df
[docs] def compute_confidence_intervals( df: pd.DataFrame, var: BiophysVariable, confidence_level=ConfidenceLevel.C95 ) -> pd.DataFrame: """Compute confidence intervals for the variable. Parameters ---------- df : pd.DataFrame Dataframe with the variable. var : BiophysVariable Biophysical variable. confidence_level : ConfidenceLevel, default ConfidenceLevel.C95 Confidence level (%) for calculating the confidence interval bounds. Options "90", "95" & "99" Returns ------- df : pd.DataFrame Dataframe with the variable and confidence intervals. """ if confidence_level == ConfidenceLevel.C90: z_score = 1.645 ci_min = "_F005" ci_max = "_F095" elif confidence_level == ConfidenceLevel.C95: z_score = 1.96 ci_min = "_F0025" ci_max = "_F0975" elif confidence_level == ConfidenceLevel.C99: z_score = 2.576 ci_min = "_F0005" ci_max = "_F0995" else: logger.error("Unknown confidence level") raise ValueError("Unknown confidence level") # uncertainty to confidence intervals df[var + ci_min] = df[var] - z_score * df[var + "_uncertainty"] df[var + ci_max] = df[var] + z_score * df[var + "_uncertainty"] if var in SNAP_BIO_RMSE: # Cap unrealistic lower bound negative values to 0 df.loc[df[var + ci_min] < 0, var + ci_min] = 0 return df