library(tidyverse)
## -- Attaching packages -------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 2.0.0 v dplyr 0.7.8
## v tidyr 0.8.2 v stringr 1.3.1
## v readr 1.3.1 v forcats 0.3.0
## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Gathering
ForestGEO datasets encode the status of each stem in the values of the column status
.
select(fgeo.data::luquillo_stem5_random, status)
## # A tibble: 1,320 x 1
## status
## <chr>
## 1 A
## 2 A
## 3 G
## 4 G
## 5 G
## 6 A
## 7 A
## 8 A
## 9 A
## 10 A
## # ... with 1,310 more rows
In raw data, however, the values of status
are sometimes spread across multiple columns. For example:
wide <- tibble::tribble(
~treeID, ~stemID, ~gx, ~gy, ~dbh, ~A, ~G, ~P,
104L, 143L, 10.31, 245.36, 193, "A", NA, NA,
119L, 158L, 182.89, 410.15, 40.4, "A", NA, NA,
180L, 222L, 164.61, 409.5, NA, NA, "G", NA,
180L, 223L, 164.61, 409.5, NA, NA, "G", NA,
180L, 224L, 164.61, 409.5, NA, NA, "G", NA,
180L, 225L, 164.61, 409.5, 45, "A", NA, NA,
602L, 736L, 148.96, 414.44, 33, "A", NA, NA,
631L, 775L, 38.3, 245.3, 140, "A", NA, NA,
647L, 793L, 143.24, 410.91, 246, "A", NA, NA,
1086L, 1339L, 68.89, 253.04, 176, "A", NA, NA
)
wide
## # A tibble: 10 x 8
## treeID stemID gx gy dbh A G P
## <int> <int> <dbl> <dbl> <dbl> <chr> <chr> <lgl>
## 1 104 143 10.3 245. 193 A <NA> NA
## 2 119 158 183. 410. 40.4 A <NA> NA
## 3 180 222 165. 410. NA <NA> G NA
## 4 180 223 165. 410. NA <NA> G NA
## 5 180 224 165. 410. NA <NA> G NA
## 6 180 225 165. 410. 45 A <NA> NA
## 7 602 736 149. 414. 33 A <NA> NA
## 8 631 775 38.3 245. 140 A <NA> NA
## 9 647 793 143. 411. 246 A <NA> NA
## 10 1086 1339 68.9 253. 176 A <NA> NA
We can gather the multiple status
columns with tidyr::gather()
. See ?gather()
for a definition of each argument.
long <- gather(wide, key = "status", "value", A, G, P, na.rm = TRUE)
long
## # A tibble: 10 x 7
## treeID stemID gx gy dbh status value
## <int> <int> <dbl> <dbl> <dbl> <chr> <chr>
## 1 104 143 10.3 245. 193 A A
## 2 119 158 183. 410. 40.4 A A
## 3 180 225 165. 410. 45 A A
## 4 602 736 149. 414. 33 A A
## 5 631 775 38.3 245. 140 A A
## 6 647 793 143. 411. 246 A A
## 7 1086 1339 68.9 253. 176 A A
## 8 180 222 165. 410. NA G G
## 9 180 223 165. 410. NA G G
## 10 180 224 165. 410. NA G G
# In this case the column `value` is useless
select(long ,-value)
## # A tibble: 10 x 6
## treeID stemID gx gy dbh status
## <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 104 143 10.3 245. 193 A
## 2 119 158 183. 410. 40.4 A
## 3 180 225 165. 410. 45 A
## 4 602 736 149. 414. 33 A
## 5 631 775 38.3 245. 140 A
## 6 647 793 143. 411. 246 A
## 7 1086 1339 68.9 253. 176 A
## 8 180 222 165. 410. NA G
## 9 180 223 165. 410. NA G
## 10 180 224 165. 410. NA G
The raw data may also look like this.
wide2 <- tibble::tribble(
~treeID, ~stemID, ~gx, ~gy, ~dbh, ~A, ~G, ~P,
104L, 143L, 10.31, 245.36, 193, 1, 0, 0,
119L, 158L, 182.89, 410.15, 40.4, 1, 0, 0,
180L, 222L, 164.61, 409.5, NA, 0, 1, 0,
180L, 223L, 164.61, 409.5, NA, 0, 1, 0,
180L, 224L, 164.61, 409.5, NA, 0, 1, 0,
180L, 225L, 164.61, 409.5, 45, 1, 0, 0,
602L, 736L, 148.96, 414.44, 33, 1, 0, 0,
631L, 775L, 38.3, 245.3, 140, 1, 0, 0,
647L, 793L, 143.24, 410.91, 246, 1, 0, 0,
1086L, 1339L, 68.89, 253.04, 176, 1, 0, 0
)
wide2
## # A tibble: 10 x 8
## treeID stemID gx gy dbh A G P
## <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 104 143 10.3 245. 193 1 0 0
## 2 119 158 183. 410. 40.4 1 0 0
## 3 180 222 165. 410. NA 0 1 0
## 4 180 223 165. 410. NA 0 1 0
## 5 180 224 165. 410. NA 0 1 0
## 6 180 225 165. 410. 45 1 0 0
## 7 602 736 149. 414. 33 1 0 0
## 8 631 775 38.3 245. 140 1 0 0
## 9 647 793 143. 411. 246 1 0 0
## 10 1086 1339 68.9 253. 176 1 0 0
long2 <- gather(wide2, key = "status", value = "value", A, G, P)
# Sorting to better understand the point that follows
arrange(long2, treeID, stemID)
## # A tibble: 30 x 7
## treeID stemID gx gy dbh status value
## <int> <int> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 104 143 10.3 245. 193 A 1
## 2 104 143 10.3 245. 193 G 0
## 3 104 143 10.3 245. 193 P 0
## 4 119 158 183. 410. 40.4 A 1
## 5 119 158 183. 410. 40.4 G 0
## 6 119 158 183. 410. 40.4 P 0
## 7 180 222 165. 410. NA A 0
## 8 180 222 165. 410. NA G 1
## 9 180 222 165. 410. NA P 0
## 10 180 223 165. 410. NA A 0
## # ... with 20 more rows
We need to remove duplicates. For each stem the number of rows should be unique but instead we have 3 rows – one for each of the columns we gathered. Above we could fix this by using na.rm
. Now we have no missing values but we can get the same result by excluding the rows where value
is zero.
filter(long2, value != 0)
## # A tibble: 10 x 7
## treeID stemID gx gy dbh status value
## <int> <int> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 104 143 10.3 245. 193 A 1
## 2 119 158 183. 410. 40.4 A 1
## 3 180 225 165. 410. 45 A 1
## 4 602 736 149. 414. 33 A 1
## 5 631 775 38.3 245. 140 A 1
## 6 647 793 143. 411. 246 A 1
## 7 1086 1339 68.9 253. 176 A 1
## 8 180 222 165. 410. NA G 1
## 9 180 223 165. 410. NA G 1
## 10 180 224 165. 410. NA G 1
# We no longer need `value` so we can remove it
single_rows <- filter(long2, value != 0)
single_rows <- select(single_rows, -value)
single_rows
## # A tibble: 10 x 6
## treeID stemID gx gy dbh status
## <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 104 143 10.3 245. 193 A
## 2 119 158 183. 410. 40.4 A
## 3 180 225 165. 410. 45 A
## 4 602 736 149. 414. 33 A
## 5 631 775 38.3 245. 140 A
## 6 647 793 143. 411. 246 A
## 7 1086 1339 68.9 253. 176 A
## 8 180 222 165. 410. NA G
## 9 180 223 165. 410. NA G
## 10 180 224 165. 410. NA G
Spreading
The opposite of gathering is spreading. This is not a common operation in ForestGEO data but I’ll it for completeness.
spread(long2, key = "status", value = "value")
## # A tibble: 10 x 8
## treeID stemID gx gy dbh A G P
## <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 104 143 10.3 245. 193 1 0 0
## 2 119 158 183. 410. 40.4 1 0 0
## 3 180 222 165. 410. NA 0 1 0
## 4 180 223 165. 410. NA 0 1 0
## 5 180 224 165. 410. NA 0 1 0
## 6 180 225 165. 410. 45 1 0 0
## 7 602 736 149. 414. 33 1 0 0
## 8 631 775 38.3 245. 140 1 0 0
## 9 647 793 143. 411. 246 1 0 0
## 10 1086 1339 68.9 253. 176 1 0 0