-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclear-invalid-env-Miedes.Rmd
124 lines (92 loc) · 4.41 KB
/
clear-invalid-env-Miedes.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
In this notebook we identify the periods during which the sensors were moved and, therefore, the logged data is not valid.
```{r}
# import libraries
source('lib-dendro.R')
library(plotly)
library(datacleanr) # to find invalid periods (interactively)
library(readr) # for write_csv() function
library(tidyverse) # for %>%
library(glue)
# Global variables ----
# Directory that contains this notebook. `rstudioapi` can only report the
# source editor path from inside an interactive RStudio session; anywhere else
# (e.g. when the document is knitted in a separate R process) the call fails,
# so we fall back to the current working directory.
PATH <- if (rstudioapi::isAvailable()) {
  dirname(rstudioapi::getSourceEditorContext()$path)
} else {
  getwd()
}
setwd(PATH)
# Site name; it is interpolated into the input and output directory names.
PLACE <- "Miedes"
# Input: environmental data that still has to be cleared of invalid periods.
BUFFER_ENV_DIR <- glue("processed/{PLACE}-env-buffer-toclear")
# Output: environmental data after the invalid periods have been cleared.
OUTPUT_ENV_DIR <- glue("processed/{PLACE}-env-processed")
```
```{r}
# Import the processed environmental data ----
# `pattern` is a regular expression, not a glob: "\\.csv$" keeps only the file
# names that end in the literal extension ".csv".
list_files <- list.files(
  file.path(PATH, BUFFER_ENV_DIR),
  pattern = "\\.csv$",
  full.names = TRUE
)
# read.all.env.processed() is not defined in this notebook (presumably it
# comes from lib-dendro.R, sourced above); it receives the full file paths.
db.env <- read.all.env.processed(list_files)
str(db.env)
```
The VWC series shows most clearly when a sensor has been moved (VWC equals 0 for a long period):
```{r}
# Line plot of VWC against time, with one coloured trace per sensor series.
f <- db.env %>%
  plot_ly(
    x = ~ts,
    y = ~vwc,
    color = ~series,
    type = "scatter",
    mode = "lines"
  )
f
# To locate the invalid periods interactively instead:
# library(datacleanr); dcr_app(db.env)
```
Next, we define and clear the periods of invalid data. There is a fair amount of it (time frames in which a sensor had been removed by an animal), and it has to be removed.
First, we remove all the data prior to installation:
```{r}
# Timestamp from which the data is kept; earlier rows are discarded.
ts_start <- "2022-03-10 13:00:00"
head(db.env)
# which() turns the comparison into row indices, so rows whose comparison
# result is NA are dropped as well.
rows_after_install <- which(db.env$ts >= ts_start)
db.env <- db.env[rows_after_install, ]
head(db.env)
```
Then, we set the measurements to NA for the periods in which there is no valid data:
```{r}
# Clear the invalid data of each series. For every sensor (`series`) we build
# logical row masks for its invalid time windows and set the measured
# variables to NA on the matching rows; `ts` and `series` are left untouched.
# NOTE(review): `ts` is compared against plain strings; how those strings are
# interpreted depends on the type of `ts`, which is not visible here - confirm.
measured_cols <- c("vwc", "soil.temp", "surface.temp", "air.temp")

# series "94231939" ----
series1 <- with(db.env, series == "94231939")
# first interval (both bounds inclusive)
interval1.1 <- with(
  db.env,
  ts >= "2022-06-28 22:45:00" & ts <= "2022-08-19 10:45:00"
)
# equivalent to: (adjust to corresponding series and interval)
#db.2 = db.env %>% filter((series == "94231939") & between(ts, ymd_hms("2022-05-14 21:00:00", tz = "Europe/Madrid"), ymd_hms("2022-05-19 11:45:00", tz = "Europe/Madrid") ))
# second interval (both bounds exclusive)
# NOTE(review): an earlier comment gave this interval as 2022-09-11T08:45:00Z
# to 2022-11-24T14:45:00Z, which differs from the bounds applied here - confirm.
interval1.2 <- with(
  db.env,
  ts > "2022-09-08 22:15:00" & ts < "2022-11-18 10:15:00"
)
# third interval (both bounds exclusive)
interval1.3 <- with(
  db.env,
  ts > "2023-07-28 08:15:00" & ts < "2023-09-26 18:00:00"
)
# fourth interval: everything after this timestamp
interval1.4 <- with(db.env, ts > "2024-01-14 08:15:00")
db.env[
  series1 & (interval1.1 | interval1.2 | interval1.3 | interval1.4),
  measured_cols
] <- NA

# series "94231938" ----
series2 <- with(db.env, series == "94231938")
# first and second intervals (both bounds inclusive)
interval2.1 <- with(
  db.env,
  ts >= "2022-06-11 20:15:00" & ts <= "2022-08-19 10:45:00"
)
interval2.2 <- with(
  db.env,
  ts >= "2022-09-08 22:15:00" & ts <= "2022-11-24 11:30:00"
)
# third interval (lower bound inclusive, upper bound exclusive)
# NOTE(review): an earlier comment described this interval as running from
# 2023-09-02T03:45:00Z "to the end", but an upper bound is applied - confirm.
interval2.3 <- with(
  db.env,
  ts >= "2023-09-02 03:45:00" & ts < "2023-09-26 15:30:00"
)
db.env[
  series2 & (interval2.1 | interval2.2 | interval2.3),
  measured_cols
] <- NA

# series "94231936" ----
series3 <- with(db.env, series == "94231936")
# first interval (lower bound inclusive, upper bound exclusive)
# NOTE(review): an earlier comment labelled this "third interval" although it
# is the first one defined for this series - confirm.
interval3.1 <- with(
  db.env,
  (ts >= "2023-07-11 19:30:00") & (ts < "2023-09-13 14:45:00")
)
# second interval: everything after this timestamp
interval3.2 <- with(db.env, ts > "2024-03-19 21:15:00")
db.env[series3 & (interval3.1 | interval3.2), measured_cols] <- NA

# series "94231940" ----
series4 <- with(db.env, series == "94231940")
# first interval (both bounds exclusive)
interval4.1 <- with(
  db.env,
  ts > "2023-12-24 00:45:00" & ts < "2023-12-24 02:30:00"
)
# second interval: everything after this timestamp
interval4.2 <- with(db.env, ts > "2024-03-09 18:15:00")
db.env[series4 & (interval4.1 | interval4.2), measured_cols] <- NA
```
This is the result after setting the invalid periods to NA:
```{r}
# Same VWC-over-time line plot as before (one coloured trace per sensor
# series), now drawn from the data with the invalid periods set to NA.
f <- db.env %>%
  plot_ly(
    x = ~ts,
    y = ~vwc,
    color = ~series,
    type = "scatter",
    mode = "lines"
  )
f
```
Finally, we save the cleaned data, together with the mean of all sensors for each timestamp:
```{r}
# Save the cleaned per-sensor data ----
OUTPUT_PATH <- file.path(PATH, OUTPUT_ENV_DIR)
# recursive = TRUE also creates any missing parent directory.
if (!dir.exists(OUTPUT_PATH)) {
  dir.create(OUTPUT_PATH, recursive = TRUE)
}
write_csv(
  db.env,
  file.path(OUTPUT_PATH, "proc-env.csv"),
  append = FALSE,
  col_names = TRUE
)

# Mean of all sensors for each timestamp ----
# na.rm = TRUE ignores the values that were set to NA; a timestamp for which
# every sensor is NA therefore yields NaN (the mean of no values).
db.agg <- db.env %>%
  # filter(ts < ymd_hms("2022-12-31 00:00:00")) %>%
  group_by(ts) %>%
  dplyr::summarise(
    soil.temp = mean(soil.temp, na.rm = TRUE),
    surface.temp = mean(surface.temp, na.rm = TRUE),
    air.temp = mean(air.temp, na.rm = TRUE),
    vwc = mean(vwc, na.rm = TRUE)
  )
summary(db.agg)

# Write the aggregated data to its own sub-directory.
agg_dir <- file.path(OUTPUT_PATH, "aggregated")
if (!dir.exists(agg_dir)) {
  dir.create(agg_dir, recursive = TRUE)
}
write_csv(
  db.agg,
  file.path(agg_dir, "proc-agg-env.csv"),
  append = FALSE,
  col_names = TRUE
)
```