tidyverse常用分组统计

R语言 tidyverse包常用分组统计方法汇总

模拟数据

library(tidyverse)
#create data frame
df <- data.frame(team=c('A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'),
                 points=c(99, 68, 86, 88, 95, 74, 78, 93),
                 assists=c(22, 28, 31, 35, 34, 45, 28, 31),
                 rebounds=c(30, 28, 24, 24, 30, 36, 30, 29),
                 pts=c(5, 8, 14, 18, 5, 7, 7,10),
                 rebs=c(8, 8, 9, 3, 8, 7, 4,5),
                           position=c('Guard', 'Guard', 'Forward','Forward',
                            'Guard', 'Forward', 'Center', 'Center')
)

分组摘要 Calculate Summary Statistics by Group

df %>%
  group_by(team) %>% 
  summarize(min = min(points),
            q1 = quantile(points, 0.25),
            median = median(points),
            mean = mean(points),
            q3 = quantile(points, 0.75),
            max = max(points))

分组均值 Calculate the Mean by Group

df %>%
  group_by(team) %>%
  summarise_at(vars(pts), list(name = mean))

分组求和 Calculate the Sum by Group

df %>%
  group_by(team) %>%
  summarise(Freq = sum(pts))

分组相关性 Calculate Correlation By Group

df %>%
  group_by(team) %>%
  summarize(cor=cor(points, assists))

分组百分比 Calculate Percentage by Group

df %>%
  group_by(team) %>%
  mutate(percent = points/sum(points))

分组标准差 Calculate Standard Deviation

df %>%
  group_by(team) %>%
  summarise_at(vars(points), list(name=sd))

分组众数 Calculate Mode by Group

find_mode <- function(x) {
  u <- unique(x)
  tab <- tabulate(match(x, u))
  u[tab == max(tab)]
}

df %>%
  group_by(team) %>%
  summarize(mode_points = find_mode(points))

分组分位数 Quantiles by Group

df <- data.frame(team=c('A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
                        'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
                        'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'),
                 wins=c(2, 4, 4, 5, 7, 9, 13, 13, 15, 15, 14, 13,
                        11, 9, 9, 8, 8, 16, 19, 21, 24, 20, 19, 18))


#define quantiles of interest
q = c(.25, .5, .75)

df %>%
  group_by(team) %>%
  summarize(quant25 = quantile(wins, probs = q[1]), 
            quant50 = quantile(wins, probs = q[2]),
            quant75 = quantile(wins, probs = q[3]))

分组唯一值 Count Unique Values by Group

df %>%
  group_by(team) %>%
  summarize(count = n_distinct(points))

分组条件计数 Group By and Count with Condition

df <- data.frame(team=c('A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'),
                 pos=c('Gu', 'Fo', 'Fo', 'Fo', 'Gu', 'Gu', 'Fo', 'Fo'),
                 points=c(18, 22, 19, 14, 14, 11, 20, 28))

df %>%
  group_by(team) %>%
  summarize(count = sum(pos == 'Gu'))

df %>%
  group_by(team) %>%
  summarize(count = sum(points > 15))

分组排序 Arrange Rows by Group

df %>%
  group_by(team) %>%
  arrange(points, .by_group=TRUE)

df %>%
  group_by(team) %>%
  arrange(desc(points), .by_group=TRUE)

df %>%
  group_by(team, position) %>%
  arrange(points, .by_group=TRUE)

分组计数 Count Observations by Group

df %>% count(team, sort=TRUE)
df %>% count(team, position)

分组过滤 Group By and Filter Data

df %>%
  group_by(team) %>%
  filter(any(points == 88))

df %>%
  group_by(team) %>%
  filter(any(points > 95))

分组取第一或最后一行 Select the First or the last Row by Group

df %>%
  group_by(team) %>%
  arrange(desc(points)) %>%
  filter(row_number()==1) # first row

df %>%
  group_by(team) %>%
  arrange(desc(points)) %>%
  filter(row_number()==n()) # last row

分组求秩 Rank Variables by Group

df %>% arrange(team, points) %>%
    group_by(team) %>% 
    mutate(rank = rank(points))  # Ascending

df %>% arrange(team, points) %>%
    group_by(team) %>% 
    mutate(rank = rank(-points)) # Descending 

分组滞后值 Calculate Lag by Group

df <- data.frame(store=c('A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'),
                 sales=c(7, 12, 10, 9, 9, 11, 18, 23))

df %>%
  group_by(store) %>%
  mutate(lag1_sales = lag(sales, n=1, order_by=store))

分组移动平均值 Calculate a Moving Average by Group

library(zoo)
df <- data.frame(store=rep(c('A', 'B'), each=7),
                 sales=c(4, 4, 3, 5, 6, 5, 7, 4, 8, 7, 2, 5, 4, 6))

df %>%
  group_by(store) %>%
  mutate(moving_avg3 = rollmean(sales, k=3, fill=NA, align='right'))

分组日期处理 Group Data by Week or Month

df <- data.frame(date=as.Date(c('1/8/2022', '1/9/2022', '2/10/2022', '2/15/2022',
                                '3/5/2022', '3/22/2022', '3/27/2022'), '%m/%d/%Y'),
                 sales=c(8, 14, 22, 23, 16, 17, 23))

df$week_num <- strftime(df$date, format = "%V")
df %>%
  group_by(week_num) %>%
  summarize(total_sales = sum(sales))

df %>% 
    group_by(month = lubridate::floor_date(date, 'month')) %>%
    summarize(sum_of_sales = sum(sales))  

分组前N行 Select Top N Values by Group

df %>% 
  arrange(desc(points)) %>% 
  group_by(team) %>%
  slice(1:2)

df %>%
  group_by(team) %>%
  top_n(2, points)

分组合并字符串 Collapse Text by Group

df %>%
  group_by(team) %>%
  summarise(text=paste(team,position, collapse=''))

留下评论