模拟数据
library(tidyverse)
#create data frame
df <- data.frame(team=c('A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'),
points=c(99, 68, 86, 88, 95, 74, 78, 93),
assists=c(22, 28, 31, 35, 34, 45, 28, 31),
rebounds=c(30, 28, 24, 24, 30, 36, 30, 29),
pts=c(5, 8, 14, 18, 5, 7, 7,10),
rebs=c(8, 8, 9, 3, 8, 7, 4,5),
position=c('Guard', 'Guard', 'Forward','Forward',
'Guard', 'Forward', 'Center', 'Center')
)
分组摘要 Calculate Summary Statistics by Group
df %>%
group_by(team) %>%
summarize(min = min(points),
q1 = quantile(points, 0.25),
median = median(points),
mean = mean(points),
q3 = quantile(points, 0.75),
max = max(points))
分组均值 Calculate the Mean by Group
df %>%
group_by(team) %>%
summarise_at(vars(pts), list(name = mean))
分组求和 Calculate the Sum by Group
df %>%
group_by(team) %>%
summarise(Freq = sum(pts))
分组相关性 Calculate Correlation By Group
df %>%
group_by(team) %>%
summarize(cor=cor(points, assists))
分组百分比 Calculate Percentage by Group
df %>%
group_by(team) %>%
mutate(percent = points/sum(points))
分组标准差 Calculate Standard Deviation
df %>%
group_by(team) %>%
summarise_at(vars(points), list(name=sd))
分组众数 Calculate Mode by Group
find_mode <- function(x) {
u <- unique(x)
tab <- tabulate(match(x, u))
u[tab == max(tab)]
}
df %>%
group_by(team) %>%
summarize(mode_points = find_mode(points))
分组分位数 Quantiles by Group
df <- data.frame(team=c('A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'),
wins=c(2, 4, 4, 5, 7, 9, 13, 13, 15, 15, 14, 13,
11, 9, 9, 8, 8, 16, 19, 21, 24, 20, 19, 18))
#define quantiles of interest
q = c(.25, .5, .75)
df %>%
group_by(team) %>%
summarize(quant25 = quantile(wins, probs = q[1]),
quant50 = quantile(wins, probs = q[2]),
quant75 = quantile(wins, probs = q[3]))
分组唯一值 Count Unique Values by Group
df %>%
group_by(team) %>%
summarize(count = n_distinct(points))
分组条件计数 Group By and Count with Condition
df <- data.frame(team=c('A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'),
pos=c('Gu', 'Fo', 'Fo', 'Fo', 'Gu', 'Gu', 'Fo', 'Fo'),
points=c(18, 22, 19, 14, 14, 11, 20, 28))
df %>%
group_by(team) %>%
summarize(count = sum(pos == 'Gu'))
df %>%
group_by(team) %>%
summarize(count = sum(points > 15))
分组排序 Arrange Rows by Group
df %>%
group_by(team) %>%
arrange(points, .by_group=TRUE)
df %>%
group_by(team) %>%
arrange(desc(points), .by_group=TRUE)
df %>%
group_by(team, position) %>%
arrange(points, .by_group=TRUE)
分组计数 Count Observations by Group
df %>% count(team, sort=TRUE)
df %>% count(team, position)
分组过滤 Group By and Filter Data
df %>%
group_by(team) %>%
filter(any(points == 88))
df %>%
group_by(team) %>%
filter(any(points > 95))
分组取第一或最后一行 Select the First or the last Row by Group
df %>%
group_by(team) %>%
arrange(desc(points)) %>%
filter(row_number()==1) # first row
df %>%
group_by(team) %>%
arrange(desc(points)) %>%
filter(row_number()==n()) # last row
分组求秩 Rank Variables by Group
df %>% arrange(team, points) %>%
group_by(team) %>%
mutate(rank = rank(points)) # Ascending
df %>% arrange(team, points) %>%
group_by(team) %>%
mutate(rank = rank(-points)) # Descending
分组滞后值 Calculate Lag by Group
df <- data.frame(store=c('A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'),
sales=c(7, 12, 10, 9, 9, 11, 18, 23))
df %>%
group_by(store) %>%
mutate(lag1_sales = lag(sales, n=1, order_by=store))
分组移动平均值 Calculate a Moving Average by Group
library(zoo)
df <- data.frame(store=rep(c('A', 'B'), each=7),
sales=c(4, 4, 3, 5, 6, 5, 7, 4, 8, 7, 2, 5, 4, 6))
df %>%
group_by(store) %>%
mutate(moving_avg3 = rollmean(sales, k=3, fill=NA, align='right'))
分组日期处理 Group Data by Week or Month
df <- data.frame(date=as.Date(c('1/8/2022', '1/9/2022', '2/10/2022', '2/15/2022',
'3/5/2022', '3/22/2022', '3/27/2022'), '%m/%d/%Y'),
sales=c(8, 14, 22, 23, 16, 17, 23))
df$week_num <- strftime(df$date, format = "%V")
df %>%
group_by(week_num) %>%
summarize(total_sales = sum(sales))
df %>%
group_by(month = lubridate::floor_date(date, 'month')) %>%
summarize(sum_of_sales = sum(sales))
分组前N行 Select Top N Values by Group
df %>%
arrange(desc(points)) %>%
group_by(team) %>%
slice(1:2)
df %>%
group_by(team) %>%
top_n(2, points)
分组合并字符串 Collapse Text by Group
df %>%
group_by(team) %>%
summarise(text=paste(team,position, collapse=''))