library(tidyverse)
## -- Attaching packages -------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0     v purrr   0.2.5
## v tibble  2.0.0     v dplyr   0.7.8
## v tidyr   0.8.2     v stringr 1.3.1
## v readr   1.3.1     v forcats 0.3.0
## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Gathering

ForestGEO datasets encode the status of each stem in the values of the column status.

select(fgeo.data::luquillo_stem5_random, status)
## # A tibble: 1,320 x 1
##    status
##    <chr> 
##  1 A     
##  2 A     
##  3 G     
##  4 G     
##  5 G     
##  6 A     
##  7 A     
##  8 A     
##  9 A     
## 10 A     
## # ... with 1,310 more rows

In raw data, however, the values of status are sometimes spread across multiple columns. For example:

wide <- tibble::tribble(
  ~treeID, ~stemID,    ~gx,    ~gy, ~dbh,  ~A,  ~G, ~P,
     104L,    143L,  10.31, 245.36,  193, "A",  NA, NA,
     119L,    158L, 182.89, 410.15, 40.4, "A",  NA, NA,
     180L,    222L, 164.61,  409.5,   NA,  NA, "G", NA,
     180L,    223L, 164.61,  409.5,   NA,  NA, "G", NA,
     180L,    224L, 164.61,  409.5,   NA,  NA, "G", NA,
     180L,    225L, 164.61,  409.5,   45, "A",  NA, NA,
     602L,    736L, 148.96, 414.44,   33, "A",  NA, NA,
     631L,    775L,   38.3,  245.3,  140, "A",  NA, NA,
     647L,    793L, 143.24, 410.91,  246, "A",  NA, NA,
    1086L,   1339L,  68.89, 253.04,  176, "A",  NA, NA
)

wide
## # A tibble: 10 x 8
##    treeID stemID    gx    gy   dbh A     G     P    
##     <int>  <int> <dbl> <dbl> <dbl> <chr> <chr> <lgl>
##  1    104    143  10.3  245. 193   A     <NA>  NA   
##  2    119    158 183.   410.  40.4 A     <NA>  NA   
##  3    180    222 165.   410.  NA   <NA>  G     NA   
##  4    180    223 165.   410.  NA   <NA>  G     NA   
##  5    180    224 165.   410.  NA   <NA>  G     NA   
##  6    180    225 165.   410.  45   A     <NA>  NA   
##  7    602    736 149.   414.  33   A     <NA>  NA   
##  8    631    775  38.3  245. 140   A     <NA>  NA   
##  9    647    793 143.   411. 246   A     <NA>  NA   
## 10   1086   1339  68.9  253. 176   A     <NA>  NA

We can gather the multiple status columns with tidyr::gather(). See ?gather() for a definition of each argument.

long <- gather(wide, key = "status", "value", A, G, P, na.rm = TRUE)
long
## # A tibble: 10 x 7
##    treeID stemID    gx    gy   dbh status value
##     <int>  <int> <dbl> <dbl> <dbl> <chr>  <chr>
##  1    104    143  10.3  245. 193   A      A    
##  2    119    158 183.   410.  40.4 A      A    
##  3    180    225 165.   410.  45   A      A    
##  4    602    736 149.   414.  33   A      A    
##  5    631    775  38.3  245. 140   A      A    
##  6    647    793 143.   411. 246   A      A    
##  7   1086   1339  68.9  253. 176   A      A    
##  8    180    222 165.   410.  NA   G      G    
##  9    180    223 165.   410.  NA   G      G    
## 10    180    224 165.   410.  NA   G      G

# In this case the column `value` is useless
select(long ,-value)
## # A tibble: 10 x 6
##    treeID stemID    gx    gy   dbh status
##     <int>  <int> <dbl> <dbl> <dbl> <chr> 
##  1    104    143  10.3  245. 193   A     
##  2    119    158 183.   410.  40.4 A     
##  3    180    225 165.   410.  45   A     
##  4    602    736 149.   414.  33   A     
##  5    631    775  38.3  245. 140   A     
##  6    647    793 143.   411. 246   A     
##  7   1086   1339  68.9  253. 176   A     
##  8    180    222 165.   410.  NA   G     
##  9    180    223 165.   410.  NA   G     
## 10    180    224 165.   410.  NA   G

The raw data may also look like this.

wide2 <- tibble::tribble(
  ~treeID, ~stemID,    ~gx,    ~gy, ~dbh, ~A, ~G, ~P,
     104L,    143L,  10.31, 245.36,  193,  1,  0,  0,
     119L,    158L, 182.89, 410.15, 40.4,  1,  0,  0,
     180L,    222L, 164.61,  409.5,   NA,  0,  1,  0,
     180L,    223L, 164.61,  409.5,   NA,  0,  1,  0,
     180L,    224L, 164.61,  409.5,   NA,  0,  1,  0,
     180L,    225L, 164.61,  409.5,   45,  1,  0,  0,
     602L,    736L, 148.96, 414.44,   33,  1,  0,  0,
     631L,    775L,   38.3,  245.3,  140,  1,  0,  0,
     647L,    793L, 143.24, 410.91,  246,  1,  0,  0,
    1086L,   1339L,  68.89, 253.04,  176,  1,  0,  0
)

wide2
## # A tibble: 10 x 8
##    treeID stemID    gx    gy   dbh     A     G     P
##     <int>  <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1    104    143  10.3  245. 193       1     0     0
##  2    119    158 183.   410.  40.4     1     0     0
##  3    180    222 165.   410.  NA       0     1     0
##  4    180    223 165.   410.  NA       0     1     0
##  5    180    224 165.   410.  NA       0     1     0
##  6    180    225 165.   410.  45       1     0     0
##  7    602    736 149.   414.  33       1     0     0
##  8    631    775  38.3  245. 140       1     0     0
##  9    647    793 143.   411. 246       1     0     0
## 10   1086   1339  68.9  253. 176       1     0     0
long2 <- gather(wide2, key = "status", value = "value", A, G, P)
# Sorting to better understand the point that follows
arrange(long2, treeID, stemID)
## # A tibble: 30 x 7
##    treeID stemID    gx    gy   dbh status value
##     <int>  <int> <dbl> <dbl> <dbl> <chr>  <dbl>
##  1    104    143  10.3  245. 193   A          1
##  2    104    143  10.3  245. 193   G          0
##  3    104    143  10.3  245. 193   P          0
##  4    119    158 183.   410.  40.4 A          1
##  5    119    158 183.   410.  40.4 G          0
##  6    119    158 183.   410.  40.4 P          0
##  7    180    222 165.   410.  NA   A          0
##  8    180    222 165.   410.  NA   G          1
##  9    180    222 165.   410.  NA   P          0
## 10    180    223 165.   410.  NA   A          0
## # ... with 20 more rows

We need to remove duplicates. For each stem the number of rows should be unique but instead we have 3 rows – one for each of the columns we gathered. Above we could fix this by using na.rm. Now we have no missing values but we can get the same result by excluding the rows where value is zero.

filter(long2, value != 0)
## # A tibble: 10 x 7
##    treeID stemID    gx    gy   dbh status value
##     <int>  <int> <dbl> <dbl> <dbl> <chr>  <dbl>
##  1    104    143  10.3  245. 193   A          1
##  2    119    158 183.   410.  40.4 A          1
##  3    180    225 165.   410.  45   A          1
##  4    602    736 149.   414.  33   A          1
##  5    631    775  38.3  245. 140   A          1
##  6    647    793 143.   411. 246   A          1
##  7   1086   1339  68.9  253. 176   A          1
##  8    180    222 165.   410.  NA   G          1
##  9    180    223 165.   410.  NA   G          1
## 10    180    224 165.   410.  NA   G          1

# We no longer need `value` so we can remove it

single_rows <- filter(long2, value != 0)
single_rows <- select(single_rows, -value)
single_rows
## # A tibble: 10 x 6
##    treeID stemID    gx    gy   dbh status
##     <int>  <int> <dbl> <dbl> <dbl> <chr> 
##  1    104    143  10.3  245. 193   A     
##  2    119    158 183.   410.  40.4 A     
##  3    180    225 165.   410.  45   A     
##  4    602    736 149.   414.  33   A     
##  5    631    775  38.3  245. 140   A     
##  6    647    793 143.   411. 246   A     
##  7   1086   1339  68.9  253. 176   A     
##  8    180    222 165.   410.  NA   G     
##  9    180    223 165.   410.  NA   G     
## 10    180    224 165.   410.  NA   G

Spreading

The opposite of gathering is spreading. This is not a common operation in ForestGEO data but I’ll it for completeness.

spread(long2, key = "status", value = "value")
## # A tibble: 10 x 8
##    treeID stemID    gx    gy   dbh     A     G     P
##     <int>  <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1    104    143  10.3  245. 193       1     0     0
##  2    119    158 183.   410.  40.4     1     0     0
##  3    180    222 165.   410.  NA       0     1     0
##  4    180    223 165.   410.  NA       0     1     0
##  5    180    224 165.   410.  NA       0     1     0
##  6    180    225 165.   410.  45       1     0     0
##  7    602    736 149.   414.  33       1     0     0
##  8    631    775  38.3  245. 140       1     0     0
##  9    647    793 143.   411. 246       1     0     0
## 10   1086   1339  68.9  253. 176       1     0     0