Commit 8f2fe4e3 authored by JuanPi Carbajal

Laura example: drying slopes

parent fbb132cf
Pipeline #59825 failed
@@ -25,10 +25,14 @@ import matplotlib
matplotlib.rcParams['font.size'] = 16
matplotlib.rcParams['figure.figsize'] = (15, 10)
from statsmodels.graphics.tsaplots import plot_pacf
from scipy import signal
import scipy.interpolate as interp
sys.path.insert(0, os.path.abspath('..'))
from swa_models.datahandling import read_config
from swa_models.dbtools import swaDBContext as swaDB, read_data, flux_filter_arg
from swa_models.utils import find_zero_crossing_by_spline as find_zeros
from swa_models.utils import (true_interval, range_score)
# %%
# Load configuration
@@ -81,6 +85,17 @@ if not filepath.exists():
    data.drop(columns=["_measurement", "weather_source"], inplace=True)
    data.rename(columns={'_field': 'signal'}, inplace=True)
    data = data.set_index('datetime')
    # Massage the data to make all analyses simpler
    data_ = []
    for g, d in data.groupby('object'):
        d_ = d.pivot(columns=['signal'], values=['value']).droplevel(0, axis='columns')
        # resample to target signal frequency
        d_ = d_.resample('1h').mean()
        d_['object'] = g
        data_.append(d_.melt(id_vars=['object'], ignore_index=False))
    data = pd.concat(data_, axis=0)
    data.to_pickle(filepath)
    print(f'Saved to {filepath}')
else:
@@ -94,28 +109,137 @@ else:
# model parameters.
#
# Group data by object then by signal
# gives a multi-index column
data_g = data.pivot(columns=['object', 'signal'], values=['value'])
data_g = data_g.droplevel(0, axis='columns')
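# %%
# As a minimal illustration (synthetic toy data, not part of the dataset):
# pivoting on ['object', 'signal'] produces one column per (object, signal)
# pair under a two-level column index.
toy = pd.DataFrame({'object': ['a', 'a', 'b', 'b'],
                    'signal': ['sm', 'rain', 'sm', 'rain'],
                    'value': [0.1, 0.0, 0.2, 1.0]})
toy.pivot(columns=['object', 'signal'], values=['value']).droplevel(0, axis='columns')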
# %%
# Time-series plot
# ^^^^^^^^^^^^^^^^
# We show the data as time series.
#
# We are interested in predicting when the soil moisture will drop.
# These "drying" events are visible in the time series plots.
# You can also see that the moisture recovers abruptly when water enters
# the soil, whether from watering, rain, or snowmelt.
#
objects = data.object.unique()
data_available = pd.Series(index=objects, dtype=data.index.dtype)
-for obj in objects:
-    d = data.loc[data.object == obj, ['signal', 'value']]
-    d = d.pivot(columns=['signal'], values=['value']).droplevel(0, axis='columns')
-    # resample to target signal frequency
-    d = d[inputs].resample('1h').mean()
+for obj in data_g.columns.levels[0]:
+    d = data_g[obj]
    # Earliest non-nan date
    data_available[obj] = d.dropna().index.min()
-    axs = d.plot(subplots=True, title=obj)
-    [x.legend(loc='upper right') for x in axs]
+    axs = d[inputs].plot(subplots=True, title=obj)
+    [x.legend(loc='lower left') for x in axs]
    fig = plt.gcf()
    plt.xlabel('')
    fig.tight_layout()
    if 'handles' not in locals():
        handles, _ = list(zip(*[x.get_legend_handles_labels() for x in axs]))
# %%
-# The data availablilty
-data_available
+# The data availability
+#
+data_available
# %%
# Characteristic time of drying events
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# We explore the characteristic times of the "drying" events identified in the
# time series plots above.
# We first align the different events (signal registration) to see how different
# or similar their dynamics are.
#
# %%
# We first pre-process the data for the alignment
#
# Collect soil_moisture signals
idx = pd.IndexSlice
sm_raw = data_g.loc[:, idx[:, 'soil_moisture']].droplevel('signal',
                                                          axis='columns')
# Fill missing values
sm_ = sm_raw.fillna(method='ffill').fillna(method='bfill')
# De-noise: a Savitzky-Golay filter with polyorder=0 acts as a centered
# moving average over the given window
sm = sm_.transform(signal.savgol_filter, window_length=41, polyorder=0)
axs = sm_raw.plot(subplots=True, style='.', alpha=0.25)
sm.plot(subplots=True, ax=axs, legend=False)
[x.legend(loc='lower left') for x in axs]
fig = plt.gcf()
plt.xlabel('')
fig.tight_layout()
# Find alignment points
x = (sm.index - sm.index[0]).total_seconds() / 3600 / 24  # days since start
lvl = 0.80


def zero_selector(x, s):
    # Keep only crossings where the spline slope is below a small tolerance,
    # i.e. the drying side of the curve
    return s.derivative()(x) < 1e-3


fig, axs = plt.subplots(nrows=sm.columns.size, ncols=2,
                        sharex='col', sharey='row',
                        gridspec_kw={'width_ratios': [3, 1]})
slopes = []
for c, ax in zip(sm, axs):
    y_, ym, yr = range_score(sm_[c].to_numpy(), return_params=True)
    y_ = y_.ravel()
    y = (sm[c].to_numpy() - ym[0]) / yr[0]
    x0, y0, spl = find_zeros(y - lvl, x=x, s=0, selector=zero_selector,
                             return_spline=True)
    dy0 = spl.derivative()(x0) * yr[0]
    ax[0].plot(x, y_, '.k', ms=5, alpha=0.1, label=c)
    ax[0].axhline(lvl, ls='--', c='C1')
    ax[0].set_ylabel(c)
    # Plot aligned events
    for n, (z, dz) in enumerate(zip(x0, dy0)):
        d0 = sm.index[np.argmin(np.abs(x - z))]
        slopes.append(dict(slope=dz, object=c, event=n, datetime=d0))
        msk = (x > (z - 3)) & (x < (z + 5))
        h = ax[1].plot(x[msk] - z, y[msk])[0]
        ax[0].plot(x[msk], y[msk], c=h.get_color(), zorder=2.5)
        ax[0].plot(z, lvl, c=h.get_color(), zorder=3.5)
slopes = pd.DataFrame.from_records(slopes)
fig.suptitle('soil moisture')
fig.supxlabel(f'days since {sm.index[0]}')
fig.tight_layout()
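# %%
# A minimal sketch of the crossing detection on synthetic data (illustrative
# only; ``t`` and ``decay`` are made-up names): an exponential "drying" curve
# crosses the 0.8 level with a negative slope, which the same ``find_zeros``
# and ``zero_selector`` combination used above picks up.
t = np.linspace(0, 10, 200)
decay = np.exp(-t / 4)  # already normalized to [0, 1]
t0, v0, spl_demo = find_zeros(decay - lvl, x=t, s=0,
                              selector=zero_selector, return_spline=True)
print(t0, spl_demo.derivative()(t0))  # crossing time [days] and local slope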
# %%
# Plot the local drying slope for each object
#
fig, ax = plt.subplots()
sns.swarmplot(data=slopes, x='object', y='slope', size=25, ax=ax)
plt.xlabel('object')
plt.ylabel('drying slope [1/day]')
plt.tight_layout()
vals = []
for obj, d in slopes.groupby('object'):
    d_ = data_g.loc[:, idx[obj, :]]
    for st, s in zip(d.datetime, d.slope):
        tmp = d_.loc[st - pd.Timedelta('6h'):st, :].mean().droplevel(0)
        tmp['object'] = obj
        tmp['drying slope'] = s
        vals.append(tmp.to_dict())
vals = pd.DataFrame.from_records(vals)
g = sns.pairplot(data=vals, y_vars='drying slope', x_vars=inputs)
g.map_offdiag(sns.regplot)
# %%
# Dynamical model
# ---------------
#
@@ -17,11 +17,208 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Author: Juan Pablo Carbajal <ajuanpi+dev@gmail.com>
import warnings
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.interpolate import UnivariateSpline


def utc_now():
    """UTC now as :class:`pandas.Timestamp`."""
    return pd.Timestamp.utcnow()


def std_score(df):
    """Remove mean and normalize by standard deviation.

    The mean value is subtracted and the data is scaled by its
    standard deviation. Also known as the z-score.

    Parameters
    ----------
    df: :class:`pandas.DataFrame` or :class:`pandas.Series`

    Returns
    -------
    Same as ``df``
        Scored input
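
    Examples
    --------
    >>> import pandas as pd
    >>> std_score(pd.Series([1.0, 2.0, 3.0]))
    0   -1.0
    1    0.0
    2    1.0
    dtype: float64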
"""
return (df - df.mean()) / df.std()


def range_score(df, *, return_params=False, tol=1e-16):
    """Remove minimum and normalize by range.

    The columns are rescaled to the [0, 1] interval by subtracting their
    minimum value and scaling by their range.

    Parameters
    ----------
    df: :class:`numpy.ndarray`, shape (n, m) or (n,)
    return_params: bool
        Whether to return the parameters used for scoring.
    tol: float
        If the range of a column is lower than this value the column is
        considered constant. In that case the column is set to 1.

    Returns
    -------
    scored_df: Same as ``df``
        All columns mapped to [0, 1]
    min, range: :class:`numpy.ndarray`
        Only returned if ``return_params`` is True.
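
    Examples
    --------
    >>> import numpy as np
    >>> range_score(np.array([0.0, 2.0, 4.0]))
    array([[0. ],
           [0.5],
           [1. ]])
    >>> _, ymin, yrng = range_score(np.array([0.0, 2.0, 4.0]), return_params=True)
    >>> ymin, yrng
    (array([0.]), array([4.]))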
"""
df = np.atleast_2d(df).T if df.ndim <= 1 else df
dfmin = df.min(axis=0)
dfrng = (df.max(axis=0) - dfmin)
msk = dfrng > 1e-16 # do not divide if range is too low (zero)
df[:, msk] = (df - dfmin)[:, msk] / dfrng[msk]
df[:, ~msk] = 1.0
if return_params:
return df, dfmin, dfrng
else:
return df
def iqr(y):
""" Inter-quantile range """
    return np.diff(np.quantile(y, [0.25, 0.75]))


def find_zero_crossing_by_interval(y, *, x=None, dy=None, n_roots=1,
                                   loc=None, spread=None, selector=None,
                                   **kw_cluster):
    """Find zero crossings of the given signal by thresholding its values.

    Samples with ``|y| < dy`` are taken as zero-crossing candidates. If
    ``n_roots > 1`` the candidates are grouped with k-means and each group
    is summarized by a location (default: median) and a spread
    (default: interquartile range).

    Parameters
    ----------
    y: array_like
        Signal values.
    x: array_like, optional
        Sample locations; defaults to the sample indices.
    dy: float, optional
        Tolerance below which ``|y|`` is considered zero. Defaults to
        1/255 of the signal range.
    n_roots: int
        Number of zero crossings expected in the signal.
    loc, spread: callable, optional
        Statistics used to summarize each group of candidates.
    selector: callable, optional
        Predicate ``selector(x_candidates, y_candidates)`` used to filter
        the candidates.
    kw_cluster:
        Extra keyword arguments passed to :class:`sklearn.cluster.KMeans`.

    Returns
    -------
    location, loc_spread: :class:`numpy.ndarray`
        Sorted crossing locations and their spreads.
"""
loc = np.median if loc is None else loc
spread = iqr if spread is None else spread
if n_roots > 1:
cluster = KMeans(**kw_cluster)
y = np.asarray(y)
x = np.arange(y.size) if x is None else x
dy = (y.max() - y.min()) / 255 if dy is None else dy
# Select y-values within tolerance
zc = np.abs(y) < dy
x_zc = x[zc]
# Filter the found values by selector
if selector is not None:
x_zc = x_zc[selector(x_zc, y[zc])]
if n_roots > 1:
# TODO determine optimal number of roots/clusters
nc = min(n_roots, x_zc.size)
if nc < n_roots:
warnings.warn(f'Found fewer zero crossings {nc} than requested {n_roots}. '
f'Will return only {nc}. '
'Try reducing n_roots or increasing dy.', RuntimeWarning,
stacklevel=2)
cluster.set_params(n_clusters=nc)
x_zc = np.atleast_2d(x_zc).reshape(-1, 1)
labels = cluster.fit(x_zc).labels_
location = []
loc_spread = []
for ll in np.unique(labels):
x0 = x_zc[labels == ll]
location.append(loc(x0))
loc_spread.append(spread(x0))
else:
location, loc_spread = loc(x_zc), spread(x_zc)
location, loc_spread = np.atleast_1d(location), np.atleast_1d(loc_spread)
order = np.argsort(location)
return location[order], loc_spread[order]


def find_zero_crossing_by_spline(y, *, x=None, dy=None, selector=None,
                                 return_spline=False, **kw_spline):
    """Find zero crossings of the given signal using a smoothing spline.

    A :class:`scipy.interpolate.UnivariateSpline` is fitted to the signal
    and its roots are returned. If ``dy`` is given it is used as a weight
    (``w = 1/dy``); otherwise the spline interpolates the data (``s = 0``).

    Parameters
    ----------
    y: array_like
        Signal values.
    x: array_like, optional
        Sample locations; defaults to the sample indices.
    dy: float, optional
        Signal tolerance used to weight the spline fit.
    selector: callable, optional
        Predicate ``selector(x0, spl)`` used to filter the roots.
    return_spline: bool
        Whether to also return the fitted spline.
    kw_spline:
        Extra keyword arguments passed to the spline constructor.

    Returns
    -------
    x0, y0: :class:`numpy.ndarray`
        Root locations and the spline values there.
    spl: :class:`scipy.interpolate.UnivariateSpline`
        Only returned if ``return_spline`` is True.
"""
y = np.asarray(y)
x = np.arange(y.size) if x is None else x
if 'w' not in kw_spline:
kw_spline['w'] = None if dy is None else 1 / dy
if 's' not in kw_spline:
kw_spline['s'] = 0 if dy is None else None
spl = UnivariateSpline(x, y, **kw_spline)
x0 = spl.roots()
# Filter the found values by selector
if selector is not None:
x0 = x0[selector(x0, spl)]
y0 = spl(x0)
if return_spline:
return x0, y0, spl
else:
return x0, y0


def true_interval(booliter):
    """Identify contiguous intervals of True values.

    The function returns a list of tuples with the start and one-past-the-end
    index of all intervals of contiguous True values. If no True value is
    found the result is an empty list.

    Parameters
    ----------
    booliter : Iterable
        An iterable with boolean values.

    Returns
    -------
    start_end: list of tuples
        Each tuple contains the start and end+1 index of an interval of
        contiguous True values.

    Examples
    --------
    >>> import numpy as np
    >>> x = [True, True, True]
    >>> true_interval(x)
    [(0, 3)]
    >>> x = [False, True, False, True, True, False]
    >>> true_interval(x)
    [(1, 2), (3, 5)]

    Here is an example of a use case

    >>> y = np.array([3, 2, 2, 3, 0, 1, 2, 4, 3, 4])
    >>> ranges = true_interval(y < 3)
    >>> y[slice(*ranges[1])]
    array([0, 1, 2])
    """
    if not np.any(booliter):
        return []
    foundstart = False
    start = []
    stop = []
    for idx, v in enumerate(booliter):
        if v and not foundstart:  # first True of a segment
            foundstart = True
            start.append(idx)
        elif not v and foundstart:  # first False after a segment
            foundstart = False
            stop.append(idx)
    # Case: the last segment runs to the end of the iterable
    if foundstart:
        stop.append(len(booliter))
    return list(zip(start, stop))