Commit 8f2fe4e3, authored 3 years ago by JuanPi Carbajal

    Laura example: drying slopes
parent fbb132cf
Pipeline #59825: failed (3 years ago), stage: test
Showing 2 changed files with 331 additions and 10 deletions:

  examples/plot_explore_data_Laura.py  (+133, −9)
  swa_models/utils.py                  (+198, −1)
examples/plot_explore_data_Laura.py  (+133, −9)
@@ -25,10 +25,14 @@ import matplotlib
matplotlib.rcParams['font.size'] = 16
matplotlib.rcParams['figure.figsize'] = (15, 10)

from statsmodels.graphics.tsaplots import plot_pacf
from scipy import signal
import scipy.interpolate as interp

sys.path.insert(0, os.path.abspath('..'))

from swa_models.datahandling import read_config
from swa_models.dbtools import swaDBContext as swaDB, read_data, flux_filter_arg
from swa_models.utils import find_zero_crossing_by_spline as find_zeros
from swa_models.utils import (true_interval, range_score)

# %%
# Load configuration
@@ -81,6 +85,17 @@ if not filepath.exists():
    data.drop(columns=["_measurement", "weather_source"], inplace=True)
    data.rename(columns={'_field': 'signal'}, inplace=True)
    data = data.set_index('datetime')

    # Massage the data to make all analyses simpler
    data_ = []
    for g, d in data.groupby('object'):
        d_ = d.pivot(columns=['signal'], values=['value']).droplevel(0, axis='columns')
        # resample to target signal frequency
        d_ = d_.resample('1h').mean()
        d_['object'] = g
        data_.append(d_.melt(id_vars=['object'], ignore_index=False))
    data = pd.concat(data_, axis=0)

    data.to_pickle(filepath)
    print(f'Saved to {filepath}')
else:
@@ -94,28 +109,137 @@ else:
# model parameters.
#

# Group data by object then by signal
# gives a multi-index column
data_g = data.pivot(columns=['object', 'signal'], values=['value'])
data_g = data_g.droplevel(0, axis='columns')
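# A toy sketch (editor's illustration, not from the repository) of the layout
# this produces: pivoting on both keys turns the long table into a frame whose
# columns are an (object, signal) MultiIndex, which is what the pd.IndexSlice
# selections further below rely on.
import pandas as pd

toy = pd.DataFrame({'object': ['a', 'a', 'b', 'b'],
                    'signal': ['sm', 'temp', 'sm', 'temp'],
                    'value': [0.10, 20.0, 0.20, 21.0]},
                   index=pd.to_datetime(['2022-01-01'] * 4))
toy_g = toy.pivot(columns=['object', 'signal'], values=['value'])
toy_g = toy_g.droplevel(0, axis='columns')
# toy_g columns are now a 2-level MultiIndex: (a, sm), (a, temp), (b, sm), (b, temp)
print(toy_g.loc[:, pd.IndexSlice[:, 'sm']])  # the 'sm' column of every object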
# %%
# Time-series plot
# ^^^^^^^^^^^^^^^^
# We show the data as time series.
#
# We are interested in predicting when the moisture in the soil will drop.
# These "drying" events are visible in the time series plots.
# You can also see that the moisture recovers abruptly when water comes into
# the soil either by watering, rain, or melted snow.
#
objects = data.object.unique()
data_available = pd.Series(index=objects, dtype=data.index.dtype)
for obj in objects:
    d = data.loc[data.object == obj, ['signal', 'value']]
    d = d.pivot(columns=['signal'], values=['value']).droplevel(0, axis='columns')
    # resample to target signal frequency
    d = d[inputs].resample('1h').mean()
for obj in data_g.columns.levels[0]:
    d = data_g[obj]
    # Earliest non-nan date
    data_available[obj] = d.dropna().index.min()

    axs = d.plot(subplots=True, title=obj)
    [x.legend(loc='upper right') for x in axs]
    axs = d[inputs].plot(subplots=True, title=obj)
    [x.legend(loc='lower left') for x in axs]

    fig = plt.gcf()
    plt.xlabel('')
    fig.tight_layout()

    if 'handles' not in locals():
        handles, _ = list(zip(*[x.get_legend_handles_labels() for x in axs]))
# %%
# The data availability
#
data_available
# %%
# Characteristic time of drying events
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# We explore the characteristic times of the "drying" events identified in the
# time series plots above.
# We first align the different events (signal registration) to see how different
# or similar their dynamics are.
#

# %%
# We first pre-process the data for the alignment
#
# Collect soil_moisture signals
idx = pd.IndexSlice
sm_raw = data_g.loc[:, idx[:, 'soil_moisture']].droplevel('signal', axis='columns')
# Fill missing values
sm_ = sm_raw.fillna(method='ffill').fillna(method='bfill')
# De-noise
sm = sm_.transform(signal.savgol_filter, window_length=41, polyorder=0)

axs = sm_raw.plot(subplots=True, style='.', alpha=0.25)
sm.plot(subplots=True, ax=axs, legend=False)
[x.legend(loc='lower left') for x in axs]
fig = plt.gcf()
plt.xlabel('')
fig.tight_layout()
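# A side note on the de-noising step above: with polyorder=0 the Savitzky-Golay
# filter fits a constant in each window, so it acts as a centred moving average.
# A minimal check of that equivalence (synthetic data, for intuition only):
import numpy as np
from scipy import signal

rng = np.random.default_rng(0)
noise = rng.normal(size=200)
sg = signal.savgol_filter(noise, window_length=41, polyorder=0)
ma = np.convolve(noise, np.ones(41) / 41, mode='same')
# away from the boundaries the two filters agree
assert np.allclose(sg[20:-20], ma[20:-20])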
# Find alignment points
x = (sm.index - sm.index[0]).total_seconds() / 3600 / 24
lvl = 0.80


def zero_selector(x, s):
    return s.derivative()(x) < 1e-3
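# A minimal sketch (synthetic signal, assumed values) of what find_zeros does
# with this selector: roots of (y - lvl) are level crossings, and the
# derivative test keeps only those on the falling, i.e. drying, side.
t_toy = np.linspace(0, 10, 500)                      # days
y_toy = 0.5 + 0.5 * np.cos(2 * np.pi * t_toy / 10)   # one dry-out, one recovery
x0_toy, _ = find_zeros(y_toy - lvl, x=t_toy, s=0, selector=zero_selector)
# only the crossing where the signal decreases survives (t ≈ 1.5 here);
# the rising crossing near t ≈ 8.5 is rejected by zero_selector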
fig, axs = plt.subplots(nrows=sm.columns.size, ncols=2, sharex='col', sharey='row',
                        gridspec_kw={'width_ratios': [3, 1]})
slopes = []
for c, ax in zip(sm, axs):
    y_, ym, yr = range_score(sm_[c].to_numpy(), return_params=True)
    y_ = y_.ravel()
    y = (sm[c].to_numpy() - ym[0]) / yr[0]
    x0, y0, spl = find_zeros(y - lvl, x=x, s=0, selector=zero_selector, return_spline=True)
    dy0 = spl.derivative()(x0) * yr[0]

    ax[0].plot(x, y_, '.k', ms=5, alpha=0.1, label=c)
    ax[0].axhline(lvl, ls='--', c='C1')
    ax[0].set_ylabel(c)

    # Plot aligned events
    for n, zdz in enumerate(zip(x0, dy0)):
        z, dz = zdz
        d0 = sm.index[np.argmin(np.abs(x - z))]
        slopes.append(dict(slope=dz, object=c, event=n, datetime=d0))
        msk = (x > (z - 3)) & (x < (z + 5))
        #h = ax[1].plot(x[msk] - z, spl(x[msk]) + lvl)
        h = ax[1].plot(x[msk] - z, y[msk])[0]
        ax[0].plot(x[msk], y[msk], c=h.get_color(), zorder=2.5)
        ax[0].plot(z, lvl, c=h.get_color(), zorder=3.5)

slopes = pd.DataFrame.from_records(slopes)
fig.suptitle('soil moisture')
fig.supxlabel(f'days since {sm.index[0]}')
fig.tight_layout()
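# Aside on units (inferred from the code, not stated in the commit): x is in
# days and y was normalized by the range yr[0], so
#     d(sm)/dt = d(y_norm)/dt * (sm_max - sm_min)
# makes dy0 a slope on the original soil-moisture scale per day, which matches
# the '[1/day]' label on the swarm plot below.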
# %%
# Plot the local drying slope for each object
#
fig, ax = plt.subplots()
sns.swarmplot(data=slopes, x='object', y='slope', size=25, ax=ax)
plt.xlabel('object')
plt.ylabel('drying slope [1/day]')
plt.tight_layout()
vals = []
for obj, d in slopes.groupby('object'):
    d_ = data_g.loc[:, idx[obj, :]]
    for st, s in zip(d.datetime, d.slope):
        tmp = d_.loc[st - pd.Timedelta('6h'):st, :].mean().droplevel(0)
        tmp['object'] = obj
        tmp['drying slope'] = s
        vals.append(tmp.to_dict())
vals = pd.DataFrame.from_records(vals)

#sns.pairplot(data=vals, y_vars='drying slope', x_vars=inputs, hue='object')
g = sns.pairplot(data=vals, y_vars='drying slope', x_vars=inputs)
g.map_offdiag(sns.regplot)
# %%
# Dynamical model
# ---------------
#
swa_models/utils.py  (+198, −1)
@@ -17,11 +17,208 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Author: Juan Pablo Carbajal <ajuanpi+dev@gmail.com>
import warnings

import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from scipy.interpolate import UnivariateSpline
def utc_now():
    """ UTC now as :class:`pandas.Timestamp`. """
    return pd.Timestamp.utcnow()
def std_score(df):
    """ Remove mean and normalize by standard deviation.

    The mean value is subtracted and the data is scaled by its
    standard deviation. Also known as z-score.

    Parameters
    ----------
    df: :class:`pandas.DataFrame` or :class:`pandas.Series`

    Returns
    -------
    Same as ``df``
        Scored input
    """
    return (df - df.mean()) / df.std()
def range_score(df, *, return_params=False, tol=1e-16):
    """ Remove minimum and normalize by range.

    The columns are rescaled to the [0, 1] interval by subtracting its
    minimum value and scaling by its range.

    Parameters
    ----------
    df: :class:`numpy.ndarray`, shape (n, m) or (n,)
    return_params: bool
        Whether to return the parameters used for scoring.
    tol: float
        If the range of the column is lower than this value then it is
        considered a constant column. In the latter case the column is set
        to 1.

    Returns
    -------
    scored_df: Same as ``df``
        All columns mapped to [0, 1]
    min, range: float or :class:`pandas.Series`
        Only returned if ``return_params`` is True.
    """
    df = np.atleast_2d(df).T if df.ndim <= 1 else df
    dfmin = df.min(axis=0)
    dfrng = df.max(axis=0) - dfmin
    # do not divide if range is too low (constant column)
    msk = dfrng > tol
    df[:, msk] = (df - dfmin)[:, msk] / dfrng[msk]
    df[:, ~msk] = 1.0
    if return_params:
        return df, dfmin, dfrng
    else:
        return df
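# A short usage sketch for range_score (hypothetical values; note that the
# input array is modified in place):
import numpy as np
from swa_models.utils import range_score

a = np.array([[1.0, 5.0],
              [2.0, 5.0],
              [3.0, 5.0]])
scored, amin, arng = range_score(a, return_params=True)
# first column is mapped to [0, 1]: [0.0, 0.5, 1.0]
# second column has zero range, so it is set to the constant 1.0
# beware: `a` itself now holds the scored values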
def iqr(y):
    """ Inter-quartile range """
    return np.diff(np.quantile(y, [0.25, 0.75]))
def find_zero_crossing_by_interval(y, *, x=None, dy=None, n_roots=1, loc=None,
                                   spread=None, selector=None, **kw_cluster):
    """ Find zero crossings of the given signal """
    loc = np.median if loc is None else loc
    spread = iqr if spread is None else spread
    if n_roots > 1:
        cluster = KMeans(**kw_cluster)

    y = np.asarray(y)
    x = np.arange(y.size) if x is None else x
    dy = (y.max() - y.min()) / 255 if dy is None else dy

    # Select y-values within tolerance
    zc = np.abs(y) < dy
    x_zc = x[zc]

    # Filter the found values by selector
    if selector is not None:
        x_zc = x_zc[selector(x_zc, y[zc])]

    if n_roots > 1:
        # TODO determine optimal number of roots/clusters
        nc = min(n_roots, x_zc.size)
        if nc < n_roots:
            warnings.warn(f'Found fewer zero crossings {nc} than requested {n_roots}. '
                          f'Will return only {nc}. '
                          'Try reducing n_roots or increasing dy.',
                          RuntimeWarning, stacklevel=2)
        cluster.set_params(n_clusters=nc)
        x_zc = np.atleast_2d(x_zc).reshape(-1, 1)
        labels = cluster.fit(x_zc).labels_
        location = []
        loc_spread = []
        for ll in np.unique(labels):
            x0 = x_zc[labels == ll]
            location.append(loc(x0))
            loc_spread.append(spread(x0))
    else:
        location, loc_spread = loc(x_zc), spread(x_zc)
    location, loc_spread = np.atleast_1d(location), np.atleast_1d(loc_spread)
    order = np.argsort(location)
    return location[order], loc_spread[order]
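# A usage sketch (synthetic sine, assumed tolerance) for the interval-based
# finder: samples with |y| < dy are clustered along x, and each cluster is
# summarized by its median location and IQR spread.
import numpy as np
from swa_models.utils import find_zero_crossing_by_interval

t = np.linspace(0.5, 11.5, 1000)
locs, spreads = find_zero_crossing_by_interval(np.sin(t), x=t, dy=0.05, n_roots=3)
# locs is sorted and lands near the true zeros [pi, 2*pi, 3*pi];
# spreads is the inter-quartile range of the samples in each cluster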
def find_zero_crossing_by_spline(y, *, x=None, dy=None, selector=None,
                                 return_spline=False, **kw_spline):
    """ Find zero crossings of the given signal """
    y = np.asarray(y)
    x = np.arange(y.size) if x is None else x
    if 'w' not in kw_spline:
        kw_spline['w'] = None if dy is None else 1 / dy
    if 's' not in kw_spline:
        kw_spline['s'] = 0 if dy is None else None
    spl = UnivariateSpline(x, y, **kw_spline)
    x0 = spl.roots()
    # Filter the found values by selector
    if selector is not None:
        x0 = x0[selector(x0, spl)]
    y0 = spl(x0)
    if return_spline:
        return x0, y0, spl
    else:
        return x0, y0
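# A matching sketch for the spline-based finder (synthetic sine again): with no
# smoothing the spline interpolates the samples, roots() returns its zero
# crossings, and the selector can prune them using the spline itself.
import numpy as np
from swa_models.utils import find_zero_crossing_by_spline

t = np.linspace(0.5, 11.5, 300)
x0, y0 = find_zero_crossing_by_spline(np.sin(t), x=t)
# x0 ≈ [pi, 2*pi, 3*pi] and y0 ≈ 0

# keep only the falling crossings by inspecting the spline's derivative
x0_fall, _ = find_zero_crossing_by_spline(
    np.sin(t), x=t, selector=lambda x, spl: spl.derivative()(x) < 0)
# x0_fall ≈ [pi, 3*pi]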
def true_interval(booliter):
    """ Identify contiguous intervals of True values

    The function returns a list of tuples with the start and one-past-the-end
    index of all intervals of contiguous True values. If no True value is
    found the result is an empty list.

    Parameters
    ----------
    booliter : Iterable
        An iterable with boolean values.

    Returns
    -------
    start_end: list of tuples
        Each tuple contains the start and end+1 index of an interval of
        contiguous True values.

    Examples
    --------
    >>> import numpy as np
    >>> x = [True, True, True]
    >>> true_interval(x)
    [(0, 3)]
    >>> x = [False, True, False, True, True, False]
    >>> true_interval(x)
    [(1, 2), (3, 5)]

    Here is an example of a use case

    >>> y = np.array([3, 2, 2, 3, 0, 1, 2, 4, 3, 4])
    >>> ranges = true_interval(y < 3)
    >>> y[slice(*ranges[1])]
    array([0, 1, 2])
    """
    if not np.any(booliter):
        return []
    foundstart = False
    start = []
    stop = []
    length = []
    for idx, v in enumerate(booliter):
        if v and not foundstart:
            # first encounter of True
            foundstart = True
            start.append(idx)
        elif not v and foundstart:
            # last True in segment
            foundstart = False
            stop.append(idx)
            length.append(stop[-1] - start[-1])
    # Case no stop was found
    if foundstart:
        stop.append(len(booliter))
    return list(zip(start, stop))
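# Although the example script only imports true_interval, a natural tie-in with
# the drying analysis would be extracting dry spells directly (hypothetical
# threshold and values):
import numpy as np
from swa_models.utils import true_interval

moisture = np.array([0.9, 0.8, 0.6, 0.4, 0.3, 0.7, 0.9, 0.5, 0.4])
dry_spells = true_interval(moisture < 0.5)
# [(3, 5), (8, 9)] : index ranges where the soil is below the threshold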