Commit 8f2fe4e3 authored by JuanPi Carbajal

Laura example: drying slopes

parent fbb132cf
Pipeline #59825 failed
@@ -25,10 +25,14 @@ import matplotlib
matplotlib.rcParams['font.size'] = 16
matplotlib.rcParams['figure.figsize'] = (15, 10)
from statsmodels.graphics.tsaplots import plot_pacf
from scipy import signal
import scipy.interpolate as interp
sys.path.insert(0, os.path.abspath('..'))
from swa_models.datahandling import read_config
from swa_models.dbtools import swaDBContext as swaDB, read_data, flux_filter_arg
from swa_models.utils import find_zero_crossing_by_spline as find_zeros
from swa_models.utils import (true_interval, range_score)
# %%
# Load configuration
@@ -81,6 +85,17 @@ if not filepath.exists():
    data.drop(columns=["_measurement", "weather_source"], inplace=True)
    data.rename(columns={'_field': 'signal'}, inplace=True)
    data = data.set_index('datetime')
    # Massage the data to make all analyses simpler
    data_ = []
    for g, d in data.groupby('object'):
        d_ = d.pivot(columns=['signal'], values=['value']).droplevel(0, axis='columns')
        # resample to target signal frequency
        d_ = d_.resample('1h').mean()
        d_['object'] = g
        data_.append(d_.melt(id_vars=['object'], ignore_index=False))
    data = pd.concat(data_, axis=0)
    data.to_pickle(filepath)
    print(f'Saved to {filepath}')
else:
@@ -94,28 +109,137 @@ else:
# model parameters.
#
# Group data by object then by signal
# gives a multi-index column
data_g = data.pivot(columns=['object', 'signal'], values=['value'])
data_g = data_g.droplevel(0, axis='columns')
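# %%
# As a minimal illustration (synthetic toy data, not part of the dataset):
# pivoting on ['object', 'signal'] produces one column per (object, signal)
# pair under a two-level column index.
toy = pd.DataFrame({'object': ['a', 'a', 'b', 'b'],
                    'signal': ['sm', 'rain', 'sm', 'rain'],
                    'value': [0.1, 0.0, 0.2, 1.0]})
toy.pivot(columns=['object', 'signal'], values=['value']).droplevel(0, axis='columns')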
# %%
# Time-series plot
# ^^^^^^^^^^^^^^^^
# We show the data as time series.
#
# We are interested in predicting when the soil moisture will drop.
# These "drying" events are visible in the time series plots.
# You can also see that the moisture recovers abruptly when water enters
# the soil, whether from watering, rain, or snowmelt.
#
objects = data.object.unique()
data_available = pd.Series(index=objects, dtype=data.index.dtype)
-for obj in objects:
-    d = data.loc[data.object == obj, ['signal', 'value']]
-    d = d.pivot(columns=['signal'], values=['value']).droplevel(0, axis='columns')
-    # resample to target signal frequency
-    d = d[inputs].resample('1h').mean()
+for obj in data_g.columns.levels[0]:
+    d = data_g[obj]
    # Earliest non-nan date
    data_available[obj] = d.dropna().index.min()
-    axs = d.plot(subplots=True, title=obj)
-    [x.legend(loc='upper right') for x in axs]
+    axs = d[inputs].plot(subplots=True, title=obj)
+    [x.legend(loc='lower left') for x in axs]
    fig = plt.gcf()
    plt.xlabel('')
    fig.tight_layout()
    if 'handles' not in locals():
        handles, _ = list(zip(*[x.get_legend_handles_labels() for x in axs]))
# %%
-# The data availablilty
-data_available
+# The data availability
+#
+data_available
# %%
# Characteristic time of drying events
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# We explore the characteristic times of the "drying" events identified in the
# time series plots above.
# We first align the different events (signal registration) to see how different
# or similar their dynamics are.
#
# %%
# We first pre-process the data for the alignment
#
# Collect soil_moisture signals
idx = pd.IndexSlice
sm_raw = data_g.loc[:, idx[:, 'soil_moisture']].droplevel('signal',
                                                          axis='columns')
# Fill missing values
sm_ = sm_raw.fillna(method='ffill').fillna(method='bfill')
# De-noise: a Savitzky-Golay filter with polyorder=0 acts as a centered
# moving average over the given window
sm = sm_.transform(signal.savgol_filter, window_length=41, polyorder=0)
axs = sm_raw.plot(subplots=True, style='.', alpha=0.25)
sm.plot(subplots=True, ax=axs, legend=False)
[x.legend(loc='lower left') for x in axs]
fig = plt.gcf()
plt.xlabel('')
fig.tight_layout()
# Find alignment points
x = (sm.index - sm.index[0]).total_seconds() / 3600 / 24  # days since start
lvl = 0.80


def zero_selector(x, s):
    # Keep only crossings where the spline slope is below a small tolerance,
    # i.e. the drying side of the curve
    return s.derivative()(x) < 1e-3


fig, axs = plt.subplots(nrows=sm.columns.size, ncols=2,
                        sharex='col', sharey='row',
                        gridspec_kw={'width_ratios': [3, 1]})
slopes = []
for c, ax in zip(sm, axs):
    y_, ym, yr = range_score(sm_[c].to_numpy(), return_params=True)
    y_ = y_.ravel()
    y = (sm[c].to_numpy() - ym[0]) / yr[0]
    x0, y0, spl = find_zeros(y - lvl, x=x, s=0, selector=zero_selector,
                             return_spline=True)
    dy0 = spl.derivative()(x0) * yr[0]
    ax[0].plot(x, y_, '.k', ms=5, alpha=0.1, label=c)
    ax[0].axhline(lvl, ls='--', c='C1')
    ax[0].set_ylabel(c)
    # Plot aligned events
    for n, (z, dz) in enumerate(zip(x0, dy0)):
        d0 = sm.index[np.argmin(np.abs(x - z))]
        slopes.append(dict(slope=dz, object=c, event=n, datetime=d0))
        msk = (x > (z - 3)) & (x < (z + 5))
        h = ax[1].plot(x[msk] - z, y[msk])[0]
        ax[0].plot(x[msk], y[msk], c=h.get_color(), zorder=2.5)
        ax[0].plot(z, lvl, c=h.get_color(), zorder=3.5)
slopes = pd.DataFrame.from_records(slopes)
fig.suptitle('soil moisture')
fig.supxlabel(f'days since {sm.index[0]}')
fig.tight_layout()
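# %%
# A minimal sketch of the crossing detection on synthetic data (illustrative
# only; ``t`` and ``decay`` are made-up names): an exponential "drying" curve
# crosses the 0.8 level with a negative slope, which the same ``find_zeros``
# and ``zero_selector`` combination used above picks up.
t = np.linspace(0, 10, 200)
decay = np.exp(-t / 4)  # already normalized to [0, 1]
t0, v0, spl_demo = find_zeros(decay - lvl, x=t, s=0,
                              selector=zero_selector, return_spline=True)
print(t0, spl_demo.derivative()(t0))  # crossing time [days] and local slope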
# %%
# Plot the local drying slope for each object
#
fig, ax = plt.subplots()
sns.swarmplot(data=slopes, x='object', y='slope', size=25, ax=ax)
plt.xlabel('object')
plt.ylabel('drying slope [1/day]')
plt.tight_layout()
vals = []
for obj, d in slopes.groupby('object'):
    d_ = data_g.loc[:, idx[obj, :]]
    for st, s in zip(d.datetime, d.slope):
        tmp = d_.loc[st - pd.Timedelta('6h'):st, :].mean().droplevel(0)
        tmp['object'] = obj
        tmp['drying slope'] = s
        vals.append(tmp.to_dict())
vals = pd.DataFrame.from_records(vals)
g = sns.pairplot(data=vals, y_vars='drying slope', x_vars=inputs)
g.map_offdiag(sns.regplot)
# %%
# Dynamical model
# ---------------
#
@@ -17,11 +17,208 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Author: Juan Pablo Carbajal <ajuanpi+dev@gmail.com>
import warnings
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.interpolate import UnivariateSpline


def utc_now():
    """UTC now as :class:`pandas.Timestamp`."""
    return pd.Timestamp.utcnow()


def std_score(df):
    """Remove mean and normalize by standard deviation.

    The mean value is subtracted and the data is scaled by its
    standard deviation. Also known as the z-score.

    Parameters
    ----------
    df: :class:`pandas.DataFrame` or :class:`pandas.Series`

    Returns
    -------
    Same as ``df``
        Scored input
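
    Examples
    --------
    >>> import pandas as pd
    >>> std_score(pd.Series([1.0, 2.0, 3.0]))
    0   -1.0
    1    0.0
    2    1.0
    dtype: float64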
"""
return (df - df.mean()) / df.std()


def range_score(df, *, return_params=False, tol=1e-16):
    """Remove minimum and normalize by range.

    The columns are rescaled to the [0, 1] interval by subtracting their
    minimum value and scaling by their range.

    Parameters
    ----------
    df: :class:`numpy.ndarray`, shape (n, m) or (n,)
    return_params: bool
        Whether to return the parameters used for scoring.
    tol: float
        If the range of a column is lower than this value the column is
        considered constant. In that case the column is set to 1.

    Returns
    -------
    scored_df: Same as ``df``
        All columns mapped to [0, 1]
    min, range: :class:`numpy.ndarray`
        Only returned if ``return_params`` is True.
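
    Examples
    --------
    >>> import numpy as np
    >>> range_score(np.array([0.0, 2.0, 4.0]))
    array([[0. ],
           [0.5],
           [1. ]])
    >>> _, ymin, yrng = range_score(np.array([0.0, 2.0, 4.0]), return_params=True)
    >>> ymin, yrng
    (array([0.]), array([4.]))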
"""
df = np.atleast_2d(df).T if df.ndim <= 1 else df
dfmin = df.min(axis=0)
dfrng = (df.max(axis=0) - dfmin)
msk = dfrng > 1e-16 # do not divide if range is too low (zero)
df[:, msk] = (df - dfmin)[:, msk] / dfrng[msk]
df[:, ~msk] = 1.0
if return_params:
return df, dfmin, dfrng
else:
return df
def iqr(y):
""" Inter-quantile range """
    return np.diff(np.quantile(y, [0.25, 0.75]))


def find_zero_crossing_by_interval(y, *, x=None, dy=None, n_roots=1,
                                   loc=None, spread=None, selector=None,
                                   **kw_cluster):
    """Find zero crossings of the given signal by thresholding its values.

    Samples with ``|y| < dy`` are taken as zero-crossing candidates. If
    ``n_roots > 1`` the candidates are grouped with k-means and each group
    is summarized by a location (default: median) and a spread
    (default: interquartile range).

    Parameters
    ----------
    y: array_like
        Signal values.
    x: array_like, optional
        Sample locations; defaults to the sample indices.
    dy: float, optional
        Tolerance below which ``|y|`` is considered zero. Defaults to
        1/255 of the signal range.
    n_roots: int
        Number of zero crossings expected in the signal.
    loc, spread: callable, optional
        Statistics used to summarize each group of candidates.
    selector: callable, optional
        Predicate ``selector(x_candidates, y_candidates)`` used to filter
        the candidates.
    kw_cluster:
        Extra keyword arguments passed to :class:`sklearn.cluster.KMeans`.

    Returns
    -------
    location, loc_spread: :class:`numpy.ndarray`
        Sorted crossing locations and their spreads.
"""
loc = np.median if loc is None else loc
spread = iqr if spread is None else spread
if n_roots > 1:
cluster = KMeans(**kw_cluster)
y = np.asarray(y)
x = np.arange(y.size) if x is None else x
dy = (y.max() - y.min()) / 255 if dy is None else dy
# Select y-values within tolerance
zc = np.abs(y) < dy
x_zc = x[zc]
# Filter the found values by selector
if selector is not None:
x_zc = x_zc[selector(x_zc, y[zc])]
if n_roots > 1:
# TODO determine optimal number of roots/clusters
nc = min(n_roots, x_zc.size)
if nc < n_roots:
warnings.warn(f'Found fewer zero crossings {nc} than requested {n_roots}. '
f'Will return only {nc}. '
'Try reducing n_roots or increasing dy.', RuntimeWarning,
stacklevel=2)
cluster.set_params(n_clusters=nc)
x_zc = np.atleast_2d(x_zc).reshape(-1, 1)
labels = cluster.fit(x_zc).labels_
location = []
loc_spread = []
for ll in np.unique(labels):
x0 = x_zc[labels == ll]
location.append(loc(x0))
loc_spread.append(spread(x0))
else:
location, loc_spread = loc(x_zc), spread(x_zc)
location, loc_spread = np.atleast_1d(location), np.atleast_1d(loc_spread)
order = np.argsort(location)
return location[order], loc_spread[order]


def find_zero_crossing_by_spline(y, *, x=None, dy=None, selector=None,
                                 return_spline=False, **kw_spline):
    """Find zero crossings of the given signal using a smoothing spline.

    A :class:`scipy.interpolate.UnivariateSpline` is fitted to the signal
    and its roots are returned. If ``dy`` is given it is used as a weight
    (``w = 1/dy``); otherwise the spline interpolates the data (``s = 0``).

    Parameters
    ----------
    y: array_like
        Signal values.
    x: array_like, optional
        Sample locations; defaults to the sample indices.
    dy: float, optional
        Signal tolerance used to weight the spline fit.
    selector: callable, optional
        Predicate ``selector(x0, spl)`` used to filter the roots.
    return_spline: bool
        Whether to also return the fitted spline.
    kw_spline:
        Extra keyword arguments passed to the spline constructor.

    Returns
    -------
    x0, y0: :class:`numpy.ndarray`
        Root locations and the spline values there.
    spl: :class:`scipy.interpolate.UnivariateSpline`
        Only returned if ``return_spline`` is True.
"""
y = np.asarray(y)
x = np.arange(y.size) if x is None else x
if 'w' not in kw_spline:
kw_spline['w'] = None if dy is None else 1 / dy
if 's' not in kw_spline:
kw_spline['s'] = 0 if dy is None else None
spl = UnivariateSpline(x, y, **kw_spline)
x0 = spl.roots()
# Filter the found values by selector
if selector is not None:
x0 = x0[selector(x0, spl)]
y0 = spl(x0)
if return_spline:
return x0, y0, spl
else:
return x0, y0


def true_interval(booliter):
    """Identify contiguous intervals of True values.

    The function returns a list of tuples with the start and one-past-the-end
    index of all intervals of contiguous True values. If no True value is
    found the result is an empty list.

    Parameters
    ----------
    booliter : Iterable
        An iterable with boolean values.

    Returns
    -------
    start_end: list of tuples
        Each tuple contains the start and end+1 index of an interval of
        contiguous True values.

    Examples
    --------
    >>> import numpy as np
    >>> x = [True, True, True]
    >>> true_interval(x)
    [(0, 3)]
    >>> x = [False, True, False, True, True, False]
    >>> true_interval(x)
    [(1, 2), (3, 5)]

    Here is an example of a use case

    >>> y = np.array([3, 2, 2, 3, 0, 1, 2, 4, 3, 4])
    >>> ranges = true_interval(y < 3)
    >>> y[slice(*ranges[1])]
    array([0, 1, 2])
    """
    if not np.any(booliter):
        return []
    foundstart = False
    start = []
    stop = []
    for idx, v in enumerate(booliter):
        if v and not foundstart:  # first True of a segment
            foundstart = True
            start.append(idx)
        elif not v and foundstart:  # first False after a segment
            foundstart = False
            stop.append(idx)
    # Case: the last segment runs to the end of the iterable
    if foundstart:
        stop.append(len(booliter))
    return list(zip(start, stop))