1. Check call rate per SNP and per sample (both SNP and Sample-level)
The call rate for a given SNP is defined as the proportion of individuals in the study for which the corresponding SNP information is not missing
The call rate for a given sample is defined as the proportion of SNPs that is missing for the sample.
Call rate per sample (imiss) and per SNP (lmiss)
runPLINK("--ped file.ped --map file.map --missing --out file_call_rate")
Change format to make it easier for R data frame imiss
system("cat file_call_rate.imiss | tr -s ' ' '\t' > call_rate_sample.tabs")
Change format lmiss
system("cat file_call_rate.lmiss | tr -s ' ' '\t' > call_rate_SNP.tabs")
Plot histograms of call rate per sample
Load call rate SNP and sample.
call_rate_SNP <- read.delim("call_rate_SNP.tabs") %>%
select (-X)
call_rate_samples <- read.delim("call_rate_sample.tabs") %>%
select (-X)
Calculate n of filtered out samples with missing rate < 0.1
n_filtered_out <- sum(call_rate_samples$F_MISS > 0.1)
ggplot(call_rate_samples, aes((1-F_MISS)*100)) +
geom_histogram(bins = 50) +
theme_bw() +
ylab ('Sample number') +
xlab ('Call Rate(%)') +
geom_vline(xintercept = 90, color = 'red', linetype="dashed") +
annotate(geom="text", x=70, y=200, label=paste0("Number of samples to exclude = ", n_filtered_out),
color="red", size = 4)
Plot histograms of call rate per snp
Calculate n of filtered out SNPs with missing rate < 0.1
n_filtered_out <- sum(call_rate_SNP$F_MISS >=0.1)
ggplot(call_rate_SNP, aes((1-F_MISS)*100)) +
geom_histogram(bins = 50) +
theme_bw() +
ylab ('SNP number') +
xlab ('Call Rate(%)') +
geom_vline(xintercept = 90, color = 'red', linetype="dashed") +
annotate(geom = "text", x = 70, y = 100000, label = paste0("Number of SNPs to exclude = ", n_filtered_out),
color="red", size = 4)
Exclude SNPs with call rate < 90% and check again sample call rate
runPLINK("--ped file.ped --map file.map --geno 0.1 --recode --out file_filtered_snps")
Check again sample call rate
runPLINK("--ped file_filtered_snps.ped --map file_filtered_snps.map --missing --out file_call_rate_filtered_snps")
system("cat file_call_rate_filtered_snps.imiss | tr -s ' ' '\t' > file_call_rate_filtered_snps_imiss.tabs")
call_rate_samples_after_filter <- read.delim("file_call_rate_filtered_snps_imiss.tabs") %>%
select (-X)
Plot histograms of call rate per sample after filtered SNPs
n_filtered_out <- sum(call_rate_samples_after_filter$F_MISS >0.1)
ggplot(call_rate_samples_after_filter, aes((1-F_MISS)*100)) +
geom_histogram(bins = 50) +
theme_bw() +
ylab ('Sample number') +
xlab ('Call Rate(%)') +
geom_vline(xintercept = 90, color = 'red', linetype="dashed") +
annotate(geom="text", x = 70, y = 200, label = paste0("Number of samples to exclude = ", n_filtered_out),
color="red", size = 4)
Note
This iteration can help you decide which threshold to choose to exclude low quality samples and SNPs. The threshold depends on the sample size, usually it varies from 95% to 98%.