R语言抽样方法

R语言中进行抽样的三种方法

一、分层抽样 Stratified Sampling

library(tidyverse)
# Fix the RNG seed so the simulated data and the draws are reproducible
set.seed(1)

# Simulated population: 400 students, 100 in each of four grades (the
# strata), with normally distributed GPAs
df <- data.frame(
  grade = rep(c("Freshman", "Sophomore", "Junior", "Senior"), each = 100),
  gpa = rnorm(400, mean = 85, sd = 3)
)

# Option 1: sample a fixed number of rows (10) from every stratum.
# slice_sample() replaces the superseded sample_n(); ungroup() prevents the
# grouping from leaking into later operations on the result.
strat_sample <- df %>%
  group_by(grade) %>%
  slice_sample(n = 10) %>%
  ungroup()

# Option 2: sample a fixed proportion (15%) of every stratum.
# slice_sample(prop = ) replaces the superseded sample_frac().
# Note: this overwrites the fixed-size sample created above.
strat_sample <- df %>%
  group_by(grade) %>%
  slice_sample(prop = 0.15) %>%
  ungroup()

# Check how many rows were drawn from each grade
table(strat_sample$grade)

二、整群抽样 Cluster Sampling

library(tidyverse)
# Fix the RNG seed so the simulated data and the cluster draw are reproducible
set.seed(1)

# Simulated data: 10 tours (the clusters) with 20 rows each and a normally
# distributed experience score per row
df <- data.frame(
  tour = rep(1:10, each = 20),
  experience = rnorm(200, mean = 7, sd = 1)
)

# Randomly pick 4 of the tours without replacement.
# Spell out FALSE: the shorthand F is an ordinary variable that can be
# reassigned, which would silently change the sampling behaviour.
clusters <- sample(unique(df$tour), size = 4, replace = FALSE)

# Keep every row that belongs to one of the selected tours
cluster_sample <- df[df$tour %in% clusters, ]

# Count the rows retained per selected tour
table(cluster_sample$tour)

三、系统抽样 Systematic Sampling

library(tidyverse)
# Fix the RNG seed so the random draws made below are reproducible
set.seed(1)

# Generate `n` random five-letter upper-case strings (used as fake surnames).
#
# n: number of strings to generate (default 5000).
#
# Returns a character vector of length `n`. Each of the five letter positions
# is drawn independently, with replacement, from LETTERS.
randomNames <- function(n = 5000) {
  # One vector of n random letters per character position
  letter_columns <- lapply(
    seq_len(5),
    function(position) sample(LETTERS, size = n, replace = TRUE)
  )
  # Glue the five position vectors together element-wise
  do.call(paste0, letter_columns)
}

# Simulated roster of 500 rows: a random five-letter last name and a
# normally distributed GPA for each row
df <- data.frame(
  last_name = randomNames(500),
  gpa = rnorm(500, mean = 82, sd = 3)
)

# Compute the row indices of a systematic sample.
#
# N: population size (number of rows available); a single number.
# n: number of units to select; a single number with 1 <= n <= N.
#
# Returns `n` equally spaced indices, all within 1..N: a random start `r`
# drawn from 1..k followed by every k-th position, where k is the sampling
# interval. Stops with an error if the arguments are invalid.
obtain_sys <- function(N, n) {
  stopifnot(
    is.numeric(N), length(N) == 1L,
    is.numeric(n), length(n) == 1L,
    n >= 1, n <= N
  )
  # Round the interval DOWN. Rounding up (ceiling) lets the last index run
  # past N whenever n does not divide N (e.g. N = 10, n = 3 could yield
  # 3, 7, 11), which produces NA rows when used to subset a data frame.
  # Trade-off: when n does not divide N, positions beyond k * n are never
  # selected. When n divides N the result is the same as with ceiling.
  k <- N %/% n
  # Random starting position within the first interval
  r <- sample.int(k, 1)
  seq(r, by = k, length.out = n)
}

# Draw a systematic sample of 100 rows from the roster by subsetting it with
# the indices returned by obtain_sys()
sys_sample_df <- df[obtain_sys(N = nrow(df), n = 100), , drop = FALSE]

# Preview the first rows of the sample
head(sys_sample_df)

留下评论