setwd("~/Documents/PRS_MASH")


fam_UKB <- read.table("UKBB_broad/ukbb_merged.fam", header = F, stringsAsFactors = F)
colnames(fam_UKB)=c("FID","IID","paternal.ID","maternal.ID","sex","affection")
covariates <- read.csv("7089-27626.csv", header = T, stringsAsFactors = F)
rownames(covariates)=covariates$eid

agecov <- read.table("UKBB_broad/ukbcov.txt", sep="\t", stringsAsFactors = FALSE,header=T)
rownames(agecov)=agecov$id
i=intersect(agecov$id,covariates$eid)
i=as.character(i)
length(i)
cov2=covariates[i,]
agecov=agecov[i,]
dim(cov2)
dim(agecov)
dim(covariates)#HDL is 30760
#LDL column name is 30780
#TG column names i 30871
## TC is 30690
## SEc is 31
## age is 21011

cov=data.frame("FID"=cov2$eid,"HDL"=cov2$X30760.0.0,"LDL"=cov2$X30780.0.0,"TG"=cov2$X30870.0.0,"TC"=cov2$X30690.0.0,"AGE"=agecov$age,"SEX"=agecov$Sex_numeric)

suppressMessages(library(tidyverse))

set.seed(2021)
covariate = cov %>%
    drop_na() %>%
    filter(FID %in% sample(FID,5000))
    
covariate = covariate[order(covariate[,1]),]

fam_UKB = fam_UKB %>% filter(FID %in% covariate$FID)
fam_UKB = fam_UKB[order(covariate[,1]),]
colnames(fam_UKB)<-NULL

write.table(covariate, file = "UKBB_broad/UKB.cov", sep = " ", 
            row.names = F, col.names = T)
write.table(fam_UKB, file = "UKBB_broad/UKB.QC.fam", sep = " ", 
            row.names = FALSE, col.names = FALSE)


sos run ldpred.ipynb snp_qc \
    --cwd UKBB_broad \
    --genoFiles UKBB_broad/ukbb_merged.bed \
    --keep-samples UKBB_broad/UKB.QC.fam \
    --geno-filter 0.05 \
    --mind-filter 0.05 \
    --name 5000_subset -s force

INFO: Running basic QC filters: Filter SNPs and select individuals
INFO: basic QC filters is completed.
INFO: basic QC filters output:   /home/surbut/Documents/PRS_MASH/UKBB_broad/ukbb_merged.5000_subset.bed
INFO: Workflow snp_qc (ID=wd7f97a4d9651d36f) is executed successfully with 1 completed step.


cat UKBB_broad/ukbb_merged.5000_subset.log

PLINK v1.90b6.21 64-bit (19 Oct 2020)
Options in effect:
  --bfile UKBB_broad/ukbb_merged
  --geno 0.05
  --hwe 5e-08
  --keep UKBB_broad/UKB.QC.fam
  --maf 0.01
  --make-bed
  --memory 14400000000
  --mind 0.05
  --out /home/surbut/Documents/PRS_MASH/UKBB_broad/ukbb_merged.5000_subset
  --threads 20

Hostname: baymax
Working directory: /home/surbut/Documents/PRS_MASH
Start time: Fri Jun 11 16:31:19 2021

Random number seed: 1623443479
64294 MB RAM detected; reserving 14400000000 MB for main workspace.
Allocated 61092 MB successfully, after larger attempt(s) failed.
784256 variants loaded from .bim file.
488377 people (223511 males, 264863 females, 3 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
/home/surbut/Documents/PRS_MASH/UKBB_broad/ukbb_merged.5000_subset.nosex .
--keep: 4955 people remaining.
220 people removed due to missing genotype data (--mind).
IDs written to
/home/surbut/Documents/PRS_MASH/UKBB_broad/ukbb_merged.5000_subset.irem .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 4735 founders and 0 nonfounders present.
Calculating allele frequencies... done.
Total genotyping rate in remaining samples is 0.970401.
71019 variants removed due to missing genotype data (--geno).
--hwe: 3502 variants removed due to Hardy-Weinberg exact test.
103253 variants removed due to minor allele threshold(s)
(--maf/--max-maf/--mac/--max-mac).
606482 variants and 4735 people pass filters and QC.
Note: No phenotypes present.
--make-bed to
/home/surbut/Documents/PRS_MASH/UKBB_broad/ukbb_merged.5000_subset.bed +
/home/surbut/Documents/PRS_MASH/UKBB_broad/ukbb_merged.5000_subset.bim +
/home/surbut/Documents/PRS_MASH/UKBB_broad/ukbb_merged.5000_subset.fam ...
done.

End time: Fri Jun 11 16:48:16 2021


cov2=read.table("UKBB_broad/UKB.cov",header = T, stringsAsFactors = F)
qcfam=read.table("UKBB_broad/ukbb_merged.5000_subset.fam",header = F, stringsAsFactors = F)

i=as.character(qcfam[,2])
rownames(cov2)=cov2$"FID"

cov=cov2[i,]
all.equal(cov$FID,qcfam$V2)
dim(cov)
write.table(cov, file = "UKBB_broad/UKB.4700.cov", sep = " ", 
            row.names = F, col.names = T)


cd ~/Documents/PRS_MASH/UKBB_broad

awk '{print $6, $7}' UKB.4700.cov > UKB.ind.cov
awk '{print $3}' UKB.4700.cov > UKB.ldl.cov
awk '{print $2}' UKB.4700.cov > UKB.hdl.cov
awk '{print $4}' UKB.4700.cov > UKB.tg.cov
awk '{print $5}' UKB.4700.cov > UKB.tc.cov

cd ..


tree

.
├── 1000G.EUR
│   ├── 1000G.EUR.bed
│   ├── 1000G.EUR.bim
│   └── 1000G.EUR.fam
├── 7089-27626.csv
├── ldpred.html
├── ldpred.ipynb -> /home/surbut/Projects/bioworkflows/ldpred/ldpred.ipynb
├── mvpdata
│   ├── chr_pos_allele2_lfsr.txt
│   ├── gwas_hdl.rds
│   ├── gwas_ldl.rds
│   ├── gwas_tc.rds
│   ├── gwas_tg.rds
│   ├── mash_hdl.rds
│   ├── mash_ldl.rds
│   ├── mash_tc.rds
│   ├── mash_tg.rds
│   ├── Merged_MVP_Full_se_raw.txt
│   ├── MVP_all_beta_posterior_beta.txt
│   ├── posterior_beta_se.txt
│   └── zmash_raw_univariate_MVP.rds
├── mvp_gwas
│   ├── 1000G.EUR.mvp_gwas.bed
│   ├── 1000G.EUR.mvp_gwas.bim
│   ├── 1000G.EUR.mvp_gwas.fam
│   ├── 1000G.EUR.mvp_gwas.log
│   ├── 1000G.EUR.mvp_gwas.snp_intersect.extracted.bed
│   ├── 1000G.EUR.mvp_gwas.snp_intersect.extracted.bim
│   ├── 1000G.EUR.mvp_gwas.snp_intersect.extracted.bk
│   ├── 1000G.EUR.mvp_gwas.snp_intersect.extracted.fam
│   ├── 1000G.EUR.mvp_gwas.snp_intersect.extracted.log
│   ├── 1000G.EUR.mvp_gwas.snp_intersect.extracted.rds
│   ├── 1000G.EUR.mvp_gwas.snp_intersect.extracted.stderr
│   ├── 1000G.EUR.mvp_gwas.snp_intersect.extracted.stdout
│   ├── 1000G.EUR.mvp_gwas.stdout
│   ├── chr10.OMNI.interpolated_genetic_map
│   ├── chr11.OMNI.interpolated_genetic_map
│   ├── chr12.OMNI.interpolated_genetic_map
│   ├── chr13.OMNI.interpolated_genetic_map
│   ├── chr14.OMNI.interpolated_genetic_map
│   ├── chr15.OMNI.interpolated_genetic_map
│   ├── chr16.OMNI.interpolated_genetic_map
│   ├── chr17.OMNI.interpolated_genetic_map
│   ├── chr18.OMNI.interpolated_genetic_map
│   ├── chr19.OMNI.interpolated_genetic_map
│   ├── chr1.OMNI.interpolated_genetic_map
│   ├── chr20.OMNI.interpolated_genetic_map
│   ├── chr21.OMNI.interpolated_genetic_map
│   ├── chr22.OMNI.interpolated_genetic_map
│   ├── chr2.OMNI.interpolated_genetic_map
│   ├── chr3.OMNI.interpolated_genetic_map
│   ├── chr4.OMNI.interpolated_genetic_map
│   ├── chr5.OMNI.interpolated_genetic_map
│   ├── chr6.OMNI.interpolated_genetic_map
│   ├── chr7.OMNI.interpolated_genetic_map
│   ├── chr8.OMNI.interpolated_genetic_map
│   ├── chr9.OMNI.interpolated_genetic_map
│   ├── gwas_hdl.intersect.rds
│   ├── gwas_hdl.intersect.snplist
│   ├── gwas_hdl.intersect.stdout
│   ├── gwas_hdl.snp_matched.auto_prs.stderr
│   ├── gwas_hdl.snp_matched.auto_prs.stdout
│   ├── gwas_hdl.snp_matched.inf_prs.rds
│   ├── gwas_hdl.snp_matched.inf_prs.stderr
│   ├── gwas_hdl.snp_matched.inf_prs.stdout
│   ├── gwas_hdl.snp_matched.ld.rds
│   ├── gwas_hdl.snp_matched.ld.stderr
│   ├── gwas_hdl.snp_matched.rds
│   ├── gwas_hdl.snp_matched.snplist
│   ├── gwas_hdl.snp_matched.stderr
│   ├── gwas_ldl.intersect.rds
│   ├── gwas_ldl.intersect.snplist
│   ├── gwas_ldl.intersect.stdout
│   ├── gwas_ldl.snp_matched.inf_prs.rds
│   ├── gwas_ldl.snp_matched.inf_prs.stderr
│   ├── gwas_ldl.snp_matched.inf_prs.stdout
│   ├── gwas_ldl.snp_matched.ld.rds
│   ├── gwas_ldl.snp_matched.ld.stderr
│   ├── gwas_ldl.snp_matched.qc.png
│   ├── gwas_ldl.snp_matched.qc.rds
│   ├── gwas_ldl.snp_matched.qc.snplist
│   ├── gwas_ldl.snp_matched.qc.stderr
│   ├── gwas_ldl.snp_matched.qc.stdout
│   ├── gwas_ldl.snp_matched.rds
│   ├── gwas_ldl.snp_matched.snplist
│   ├── gwas_ldl.snp_matched.stderr
│   ├── gwas_tg.intersect.rds
│   ├── gwas_tg.intersect.snplist
│   ├── gwas_tg.intersect.stdout
│   ├── ld-cache
│   │   ├── filea9eb7da8e69.sbk
│   │   ├── filec69c78a6fb03.sbk
│   │   └── filed06c176aa005.sbk
│   ├── UKB.2000_subset.snp_intersect.extracted.bed
│   ├── UKB.2000_subset.snp_intersect.extracted.bim
│   ├── UKB.2000_subset.snp_intersect.extracted.bk
│   ├── UKB.2000_subset.snp_intersect.extracted.fam
│   ├── UKB.2000_subset.snp_intersect.extracted.log
│   ├── UKB.2000_subset.snp_intersect.extracted.rds
│   ├── UKB.2000_subset.snp_intersect.extracted.stderr
│   ├── UKB.2000_subset.snp_intersect.extracted.stdout
│   ├── ukbb_merged.5000_subset.snp_intersect.extracted.bed
│   ├── ukbb_merged.5000_subset.snp_intersect.extracted.bim
│   ├── ukbb_merged.5000_subset.snp_intersect.extracted.bk
│   ├── ukbb_merged.5000_subset.snp_intersect.extracted.fam
│   ├── ukbb_merged.5000_subset.snp_intersect.extracted.log
│   ├── ukbb_merged.5000_subset.snp_intersect.extracted.rds
│   ├── ukbb_merged.5000_subset.snp_intersect.extracted.stderr
│   ├── ukbb_merged.5000_subset.snp_intersect.extracted.stdout
│   ├── UKB.hdl.gwas_hdl.snp_matched.inf_prs.stderr
│   ├── UKB.ldl.baseline.pdf
│   ├── UKB.ldl.baseline.rds
│   ├── UKB.ldl.baseline.stderr
│   ├── UKB.ldl.baseline.stdout
│   ├── UKB.ldl.gwas_ldl.snp_matched.inf_prs.pdf
│   ├── UKB.ldl.gwas_ldl.snp_matched.inf_prs.rds
│   ├── UKB.ldl.gwas_ldl.snp_matched.inf_prs.stderr
│   └── UKB.ldl.gwas_ldl.snp_matched.inf_prs.stdout
├── plink.log
├── UKBB
│   ├── lipid_demographics.csv
│   └── READ_ME_fam_files.txt
├── UKBB_broad
│   ├── all_files.txt
│   ├── all_files.txt~
│   ├── big_ukb_file.txt
│   ├── newfamdir
│   │   ├── mymerged.bed
│   │   ├── mymerged.bim
│   │   └── mymerged.fam
│   ├── pheno_ukbb.rdata
│   ├── UKB.4700.cov
│   ├── UKB.4747.cov
│   ├── ukb708_cal_chr10_v2_s488374.fam
│   ├── ukb708_cal_chr11_v2_s488374.fam
│   ├── ukb708_cal_chr12_v2_s488374.fam
│   ├── ukb708_cal_chr13_v2_s488374.fam
│   ├── ukb708_cal_chr14_v2_s488374.fam
│   ├── ukb708_cal_chr15_v2_s488374.fam
│   ├── ukb708_cal_chr16_v2_s488374.fam
│   ├── ukb708_cal_chr17_v2_s488374.fam
│   ├── ukb708_cal_chr18_v2_s488374.fam
│   ├── ukb708_cal_chr19_v2_s488374.fam
│   ├── ukb708_cal_chr1_v2_s488374.fam
│   ├── ukb708_cal_chr20_v2_s488374.fam
│   ├── ukb708_cal_chr21_v2_s488374.fam
│   ├── ukb708_cal_chr22_v2_s488374.fam
│   ├── ukb708_cal_chr2_v2_s488374.fam
│   ├── ukb708_cal_chr3_v2_s488374.fam
│   ├── ukb708_cal_chr4_v2_s488374.fam
│   ├── ukb708_cal_chr5_v2_s488374.fam
│   ├── ukb708_cal_chr6_v2_s488374.fam
│   ├── ukb708_cal_chr7_v2_s488374.fam
│   ├── ukb708_cal_chr8_v2_s488374.fam
│   ├── ukb708_cal_chr9_v2_s488374.fam
│   ├── ukb708_cal_chrMT_v2_s488374.fam
│   ├── ukb708_cal_chrX_v2_s488374.fam
│   ├── ukb708_cal_chrXY_v2_s488374.fam
│   ├── ukb708_cal_chrY_v2_s488374.fam
│   ├── ukbb_merged.2000_subset.irem
│   ├── ukbb_merged.2000_subset.log
│   ├── ukbb_merged.2000_subset.nosex
│   ├── ukbb_merged.2000_subset.stderr
│   ├── ukbb_merged.2000_subset.stdout
│   ├── ukbb_merged.5000_subset.bed
│   ├── ukbb_merged.5000_subset.bim
│   ├── ukbb_merged.5000_subset.fam
│   ├── ukbb_merged.5000_subset.irem
│   ├── ukbb_merged.5000_subset.log
│   ├── ukbb_merged.5000_subset.nosex
│   ├── ukbb_merged.5000_subset.stdout
│   ├── ukbb_merged.bed
│   ├── ukbb_merged.bim
│   ├── ukbb_merged.fam
│   ├── ukbb_merged.log
│   ├── ukbb_merged.nosex
│   ├── ukbb_merged.UKBB_broad
│   │   ├── ukbb_merged_qced.irem
│   │   ├── ukbb_merged_qced.log
│   │   ├── ukbb_merged_qced.nosex
│   │   ├── ukbb_merged_qced.stderr
│   │   └── ukbb_merged_qced.stdout
│   ├── ukb_cal_chr10_v2.bed
│   ├── ukb_cal_chr11_v2.bed
│   ├── ukb_cal_chr12_v2.bed
│   ├── ukb_cal_chr13_v2.bed
│   ├── ukb_cal_chr14_v2.bed
│   ├── ukb_cal_chr15_v2.bed
│   ├── ukb_cal_chr16_v2.bed
│   ├── ukb_cal_chr17_v2.bed
│   ├── ukb_cal_chr18_v2.bed
│   ├── ukb_cal_chr19_v2.bed
│   ├── ukb_cal_chr1_v2.bed
│   ├── ukb_cal_chr20_v2.bed
│   ├── ukb_cal_chr21_v2.bed
│   ├── ukb_cal_chr22_v2.bed
│   ├── ukb_cal_chr2_v2.bed
│   ├── ukb_cal_chr3_v2.bed
│   ├── ukb_cal_chr4_v2.bed
│   ├── ukb_cal_chr5_v2.bed
│   ├── ukb_cal_chr6_v2.bed
│   ├── ukb_cal_chr7_v2.bed
│   ├── ukb_cal_chr8_v2.bed
│   ├── ukb_cal_chr9_v2.bed
│   ├── ukb_cal_chrMT_v2.bed
│   ├── ukb_cal_chrX_v2.bed
│   ├── ukb_cal_chrXY_v2.bed
│   ├── ukb_cal_chrY_v2.bed
│   ├── UKB.cov
│   ├── ukbcov.txt
│   ├── UKB.hdl.cov
│   ├── UKB.ind.cov
│   ├── UKB.ldl.cov
│   ├── UKB.QC.fam
│   ├── ukb_snp_chr10_v2.bim
│   ├── ukb_snp_chr11_v2.bim
│   ├── ukb_snp_chr12_v2.bim
│   ├── ukb_snp_chr13_v2.bim
│   ├── ukb_snp_chr14_v2.bim
│   ├── ukb_snp_chr15_v2.bim
│   ├── ukb_snp_chr16_v2.bim
│   ├── ukb_snp_chr17_v2.bim
│   ├── ukb_snp_chr18_v2.bim
│   ├── ukb_snp_chr19_v2.bim
│   ├── ukb_snp_chr1_v2.bim
│   ├── ukb_snp_chr20_v2.bim
│   ├── ukb_snp_chr21_v2.bim
│   ├── ukb_snp_chr22_v2.bim
│   ├── ukb_snp_chr2_v2.bim
│   ├── ukb_snp_chr3_v2.bim
│   ├── ukb_snp_chr4_v2.bim
│   ├── ukb_snp_chr5_v2.bim
│   ├── ukb_snp_chr6_v2.bim
│   ├── ukb_snp_chr7_v2.bim
│   ├── ukb_snp_chr8_v2.bim
│   ├── ukb_snp_chr9_v2.bim
│   ├── ukb_snp_chrMT_v2.bim
│   ├── ukb_snp_chrX_v2.bim
│   ├── ukb_snp_chrXY_v2.bim
│   ├── ukb_snp_chrY_v2.bim
│   ├── UKB.tc.cov
│   └── UKB.tg.cov
└── ukbiobank
    ├── UKB.2000_subset.bed
    ├── UKB.2000_subset.bim
    ├── UKB.2000_subset.fam
    ├── UKB.cov
    ├── UKB.hdl.cov
    ├── UKB.ind.cov
    └── UKB.QC.fam

9 directories, 241 files


work_dir=mvp_gwas
cd ~/Documents/PRS_MASH


sos run ldpred.ipynb snp_qc \
    --cwd $work_dir \
    --genoFiles 1000G.EUR/1000G.EUR.bed


work_dir=mvp_gwas
lipid=hdl
data=gwas
cd ~/Documents/PRS_MASH


sos run ldpred.ipynb snp_intersect \
    --cwd $work_dir \
    --ss mvpdata/$data"_"$lipid.rds \
    --genoFiles $work_dir/1000G.EUR.$work_dir.bed UKBB_broad/ukbb_merged.5000_subset.bed -s force

INFO: Running snp_intersect_1: SNP intersect of summary stats and genotype data
INFO: snp_intersect_1 is completed.
INFO: snp_intersect_1 output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/gwas_hdl.intersect.rds /home/surbut/Documents/PRS_MASH/mvp_gwas/gwas_hdl.intersect.snplist
INFO: Running snp_intersect_2: 
INFO: snp_intersect_2 is completed (pending nested workflow).
INFO: Running preprocess_1: Filter SNPs and select individuals
INFO: preprocess_1 (index=0) is completed.
INFO: preprocess_1 (index=1) is completed.
INFO: preprocess_1 output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/1000G.EUR.mvp_gwas.snp_intersect.extracted.bed /home/surbut/Documents/PRS_MASH/mvp_gwas/ukbb_merged.5000_subset.snp_intersect.extracted.bed in 2 groups
INFO: Running convert PLNIK to bigsnpr format with missing data mean imputed: 
INFO: convert PLNIK to bigsnpr format with missing data mean imputed (index=0) is completed.
INFO: convert PLNIK to bigsnpr format with missing data mean imputed (index=1) is completed.
INFO: convert PLNIK to bigsnpr format with missing data mean imputed output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/1000G.EUR.mvp_gwas.snp_intersect.extracted.bk /home/surbut/Documents/PRS_MASH/mvp_gwas/1000G.EUR.mvp_gwas.snp_intersect.extracted.rds... (4 items in 2 groups)
INFO: snp_intersect_2 output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/1000G.EUR.mvp_gwas.snp_intersect.extracted.bed /home/surbut/Documents/PRS_MASH/mvp_gwas/1000G.EUR.mvp_gwas.snp_intersect.extracted.rds... (4 items)
INFO: Workflow snp_intersect (ID=wffa5f5d1d8100e11) is executed successfully with 4 completed steps and 6 completed substeps.


tail -1 $work_dir/$data"_"$lipid.intersect.stdout

[1] "There are 448077 shared SNPs."


sos run ldpred.ipynb snp_match \
    --cwd $work_dir \
    --reference_geno $work_dir/1000G.EUR.$work_dir.snp_intersect.extracted.rds \
    --ss mvpdata/$data"_"$lipid.rds -s force

INFO: Running snp_match: 
INFO: snp_match is completed.
INFO: snp_match output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/gwas_hdl.snp_matched.rds /home/surbut/Documents/PRS_MASH/mvp_gwas/gwas_hdl.snp_matched.snplist
INFO: Workflow snp_match (ID=w8255e53608faedab) is executed successfully with 1 completed step.


sos run ldpred.ipynb ldsc \
    --cwd $work_dir \
    --ss $work_dir/$data"_"$lipid.snp_matched.rds \
    --reference-geno $work_dir/1000G.EUR.$work_dir.snp_intersect.extracted.rds -s force

INFO: Running ldsc: 
INFO: ldsc is completed.
INFO: ldsc output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/gwas_hdl.snp_matched.ld.rds
INFO: Workflow ldsc (ID=w074f16b85a5b6dd6) is executed successfully with 1 completed step.


sos run ldpred.ipynb inf_prs \
    --cwd $work_dir \
    --ss $work_dir/$data"_"$lipid.snp_matched.rds \
    --target-geno $work_dir/ukbb_merged.5000_subset.snp_intersect.extracted.rds \
    --ldsc $work_dir/$data"_"$lipid.snp_matched.ld.rds -s force

INFO: Running inf_prs: 
INFO: inf_prs is completed (pending nested workflow).
INFO: Running prs_core: 
INFO: prs_core is completed.
INFO: prs_core output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/gwas_hdl.snp_matched.inf_prs.rds
INFO: inf_prs output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/gwas_hdl.snp_matched.inf_prs.rds
INFO: Workflow inf_prs (ID=wbdd86af79f964fbe) is executed successfully with 2 completed steps.


tail -1 mvp_gwas/$data"_"$lipid.snp_matched.inf_prs.stdout

[1] "422921 SNPs are used for PRS calculations"


sos run ldpred.ipynb auto_prs \
    --cwd $work_dir \
    --ss $work_dir/$data"_"$lipid.snp_matched.rds \
    --target-geno $work_dir/ukbb_merged.5000_subset.snp_intersect.extracted.rds \
    --ldsc $work_dir/$data"_"$lipid.snp_matched.ld.rds -s force


sos run ldpred.ipynb grid_prs \
    --cwd $work_dir \
    --ss $work_dir/$data"_"$lipid.snp_matched.rds \
    --target-geno $work_dir/ukbb_merged.5000_subset.snp_intersect.extracted.rds \
    --ldsc $work_dir/$data"_"$lipid.snp_matched.ld.rds \
    --phenoFile UKBB_broad/UKB.$lipid.cov \
    --covFile UKBB_broad/UKB.ind.cov \
    --response continuous -s force


echo $lipid

hdl


sos run ldpred.ipynb pred_eval \
    --cwd $work_dir \
    --phenoFile UKBB_broad/UKB.$lipid.cov \
    --covFile UKBB_broad/UKB.ind.cov \
    --response continuous -s force

INFO: Running pred_eval: 
INFO: pred_eval is completed.
INFO: pred_eval output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/UKB.hdl.baseline.rds
INFO: Workflow pred_eval (ID=wa01fb80ead946f53) is executed successfully with 1 completed step.


lipid="hdl"
res = readRDS(paste0("mvp_gwas/UKB.",lipid,".baseline.rds"))
summary(res$fitted)
res$summary

Call:
lm(formula = ., data = dat[train.ind, ])

Residuals:
     Min       1Q   Median       3Q      Max 
-0.91299 -0.23786 -0.03966  0.20001  1.94664 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.4569056  0.0389891  37.367  < 2e-16 ***
AGE          0.0028108  0.0006783   4.144 3.49e-05 ***
SEX         -0.3451011  0.0111693 -30.897  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3427 on 3785 degrees of freedom
Multiple R-squared:  0.2038,	Adjusted R-squared:  0.2033 
F-statistic: 484.3 on 2 and 3785 DF,  p-value: < 2.2e-16


sos run ldpred.ipynb pred_eval \
    --cwd $work_dir \
    --prs $work_dir/$data"_"$lipid.snp_matched.inf_prs.rds \
    --phenoFile UKBB_broad/UKB.$lipid.cov \
    --covFile UKBB_broad/UKB.ind.cov \
    --response continuous -s force

INFO: Running pred_eval: 
INFO: pred_eval is completed.
INFO: pred_eval output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/UKB.hdl.gwas_hdl.snp_matched.inf_prs.rds
INFO: Workflow pred_eval (ID=w37a2d3f916caa7d6) is executed successfully with 1 completed step.


lipid="hdl"
data="gwas"
res = readRDS(paste0("mvp_gwas/UKB.",lipid,".",data,"_",lipid,".snp_matched.inf_prs.rds"))
summary(res$fitted)
res$summary

Call:
lm(formula = ., data = dat[train.ind, ])

Residuals:
     Min       1Q   Median       3Q      Max 
-0.89400 -0.23577 -0.04033  0.19628  1.91919 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.4152391  0.0383916  36.863  < 2e-16 ***
AGE          0.0022965  0.0006666   3.445 0.000578 ***
SEX         -0.3477698  0.0109571 -31.739  < 2e-16 ***
PRS         -0.1329065  0.0108331 -12.269  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3362 on 3784 degrees of freedom
Multiple R-squared:  0.2342,	Adjusted R-squared:  0.2336 
F-statistic: 385.8 on 3 and 3784 DF,  p-value: < 2.2e-16


sos run ldpred.ipynb pred_eval \
    --cwd $work_dir \
    --prs $work_dir/$data"_"$lipid.snp_matched.grid_prs.rds \
    --phenoFile ukbiobank/UKB.$lipid.cov \
    --covFile ukbiobank/UKB.ind.cov \
    --response continuous -s force


lipid="hdl"
data="gwas"
res = readRDS(paste0("mvp_gwas/UKB.",lipid,".",data,"_",lipid,".snp_matched.grid_prs.rds"))
summary(res$fitted)
res$summary


sos run ldpred.ipynb pred_eval \
    --cwd $work_dir \
    --prs $work_dir/$data"_"$lipid.snp_matched.auto_prs.rds \
    --phenoFile ukbiobank/UKB.$lipid.cov \
    --covFile ukbiobank/UKB.ind.cov \
    --response continuous -s force


lipid="hdl"
data="gwas"
res = readRDS(paste0("mvp_gwas/UKB.",lipid,".",data,"_",lipid,".snp_matched.auto_prs.rds"))
summary(res$fitted)
res$summary

model	R2	MSE
<chr>	<dbl>	<dbl>
model	0.20333	0.1299

model	R2	MSE
<chr>	<dbl>	<dbl>
model.inf_prs	0.23361	0.12433

HDL PRS example¶

Data used¶

Reference panel¶

GWAS summary statistics data¶

Target test data: UK biobank¶

PRS Models¶

Test genotype data preparation¶

Analysis of MVP GWAS data¶

Step 1: QC on reference panel¶

Step 2: Intersect SNPs among summary stats, reference panel and target data¶

Step 3: Harmonize alleles for shared SNPs¶

Step 4: Calculate LD matrix and fit LDSC model¶

Step 6: Estimate posterior effect sizes and PRS¶

Step 7: predict phenotypes¶