Aggregating observations on some variables

aggregate_by aggregates some variables of a data frame according to the values of a categorical variable of the same data frame and using a given function for the aggregation.

aggregate_by(df, col_name, ..., .funs = sum)

Arguments

df	the data frame on which to perform the aggregation.
col_name	the variable on which to perform the aggregation
...	other variables on which to perform the aggregation
.funs	the function used to perform the aggregation. By default `sum`. Inputed as a function name, string vector or a list of names.

Value

A data frame with the same variables as `df` but for which some of the observation have been aggregated (i.e. less rows than in `df`).

Details

On a data frame that contains the values of the categorical variable to aggregate (`col_name`, `...`), the function performs a group_by followed by a summarise with the function(s) .funs.

Examples


library(dplyr)

## A toy data frame:
set.seed(30101976)
data <- expand.grid(letters[1:3], 1:3, 4:6)
data$Var4 <- sample(1:100, nrow(data), TRUE)
data$Var5 <- sample(1:100, nrow(data), TRUE)
data$Var6 <- sample(1:100, nrow(data), TRUE)

## Aggregating the values "a" and "b" of the categorical variable Var1,
## summing the values of variables Var4, Var5, Var6 (i.e. all the variables
## that are not in the arguments of the function call):
data %>%
  mutate(Var1 = recode(Var1, a = "b")) %>%
  aggregate_by(Var1, Var2, Var3)
#> [90m# A tibble: 18 x 6[39m
#> [90m# Groups:   Var1, Var2 [?][39m
#>    Var1   Var2  Var3  Var4  Var5  Var6
#>    [3m[90m<fct>[39m[23m [3m[90m<int>[39m[23m [3m[90m<int>[39m[23m [3m[90m<int>[39m[23m [3m[90m<int>[39m[23m [3m[90m<int>[39m[23m
#> [90m 1[39m b         1     4    77   136   142
#> [90m 2[39m b         1     5   119   116   106
#> [90m 3[39m b         1     6    14   111   107
#> [90m 4[39m b         2     4    77   122   117
#> [90m 5[39m b         2     5    78    33    98
#> [90m 6[39m b         2     6   113   144   171
#> [90m 7[39m b         3     4    54    81    82
#> [90m 8[39m b         3     5   139   138   123
#> [90m 9[39m b         3     6    49    47    58
#> [90m10[39m c         1     4    49    12    62
#> [90m11[39m c         1     5     5    27    17
#> [90m12[39m c         1     6    41    10    43
#> [90m13[39m c         2     4    42     9     6
#> [90m14[39m c         2     5    27    35    52
#> [90m15[39m c         2     6    35    38     3
#> [90m16[39m c         3     4    16     2    96
#> [90m17[39m c         3     5    21     5    10
#> [90m18[39m c         3     6    28    27     7

## To calculate the mean value:
data %>%
  mutate(Var1 = recode(Var1, a = "b")) %>%
  aggregate_by(Var1, Var2, Var3, .funs = mean)
#> [90m# A tibble: 18 x 6[39m
#> [90m# Groups:   Var1, Var2 [?][39m
#>    Var1   Var2  Var3  Var4  Var5  Var6
#>    [3m[90m<fct>[39m[23m [3m[90m<int>[39m[23m [3m[90m<int>[39m[23m [3m[90m<dbl>[39m[23m [3m[90m<dbl>[39m[23m [3m[90m<dbl>[39m[23m
#> [90m 1[39m b         1     4  38.5  68    71  
#> [90m 2[39m b         1     5  59.5  58    53  
#> [90m 3[39m b         1     6   7    55.5  53.5
#> [90m 4[39m b         2     4  38.5  61    58.5
#> [90m 5[39m b         2     5  39    16.5  49  
#> [90m 6[39m b         2     6  56.5  72    85.5
#> [90m 7[39m b         3     4  27    40.5  41  
#> [90m 8[39m b         3     5  69.5  69    61.5
#> [90m 9[39m b         3     6  24.5  23.5  29  
#> [90m10[39m c         1     4  49    12    62  
#> [90m11[39m c         1     5   5    27    17  
#> [90m12[39m c         1     6  41    10    43  
#> [90m13[39m c         2     4  42     9     6  
#> [90m14[39m c         2     5  27    35    52  
#> [90m15[39m c         2     6  35    38     3  
#> [90m16[39m c         3     4  16     2    96  
#> [90m17[39m c         3     5  21     5    10  
#> [90m18[39m c         3     6  28    27     7  

## or to apply it to all the columns:
data %>%
  mutate(Var1 = recode(Var1, a = "b")) %>%
  aggregate_by(Var1, Var2, Var3, .funs = list(sum, mean))
#> [90m# A tibble: 18 x 9[39m
#> [90m# Groups:   Var1, Var2 [?][39m
#>    Var1   Var2  Var3 Var4_sum Var5_sum Var6_sum Var4_mean Var5_mean Var6_mean
#>    [3m[90m<fct>[39m[23m [3m[90m<int>[39m[23m [3m[90m<int>[39m[23m    [3m[90m<int>[39m[23m    [3m[90m<int>[39m[23m    [3m[90m<int>[39m[23m     [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m
#> [90m 1[39m b         1     4       77      136      142      38.5      68        71  
#> [90m 2[39m b         1     5      119      116      106      59.5      58        53  
#> [90m 3[39m b         1     6       14      111      107       7        55.5      53.5
#> [90m 4[39m b         2     4       77      122      117      38.5      61        58.5
#> [90m 5[39m b         2     5       78       33       98      39        16.5      49  
#> [90m 6[39m b         2     6      113      144      171      56.5      72        85.5
#> [90m 7[39m b         3     4       54       81       82      27        40.5      41  
#> [90m 8[39m b         3     5      139      138      123      69.5      69        61.5
#> [90m 9[39m b         3     6       49       47       58      24.5      23.5      29  
#> [90m10[39m c         1     4       49       12       62      49        12        62  
#> [90m11[39m c         1     5        5       27       17       5        27        17  
#> [90m12[39m c         1     6       41       10       43      41        10        43  
#> [90m13[39m c         2     4       42        9        6      42         9         6  
#> [90m14[39m c         2     5       27       35       52      27        35        52  
#> [90m15[39m c         2     6       35       38        3      35        38         3  
#> [90m16[39m c         3     4       16        2       96      16         2        96  
#> [90m17[39m c         3     5       21        5       10      21         5        10  
#> [90m18[39m c         3     6       28       27        7      28        27         7  

## To calculate the mean and the sum:
data %>%
  mutate(Var1 = recode(Var1, a = "b")) %>%
  aggregate_by(Var1, Var2, Var3, .funs = list(sum(Var6), mean(Var4)))
#> [90m# A tibble: 18 x 5[39m
#> [90m# Groups:   Var1, Var2 [?][39m
#>    Var1   Var2  Var3 `sum(Var6)` `mean(Var4)`
#>    [3m[90m<fct>[39m[23m [3m[90m<int>[39m[23m [3m[90m<int>[39m[23m       [3m[90m<int>[39m[23m        [3m[90m<dbl>[39m[23m
#> [90m 1[39m b         1     4         142         38.5
#> [90m 2[39m b         1     5         106         59.5
#> [90m 3[39m b         1     6         107          7  
#> [90m 4[39m b         2     4         117         38.5
#> [90m 5[39m b         2     5          98         39  
#> [90m 6[39m b         2     6         171         56.5
#> [90m 7[39m b         3     4          82         27  
#> [90m 8[39m b         3     5         123         69.5
#> [90m 9[39m b         3     6          58         24.5
#> [90m10[39m c         1     4          62         49  
#> [90m11[39m c         1     5          17          5  
#> [90m12[39m c         1     6          43         41  
#> [90m13[39m c         2     4           6         42  
#> [90m14[39m c         2     5          52         27  
#> [90m15[39m c         2     6           3         35  
#> [90m16[39m c         3     4          96         16  
#> [90m17[39m c         3     5          10         21  
#> [90m18[39m c         3     6           7         28

Aggregating observations on some variables

Arguments

Value

Details

Examples

Contents

Author