work_dir=mvp_gwas
cd ~/Documents/PRS_MASH


sos run ldpred.ipynb snp_qc \
    --cwd $work_dir \
    --genoFiles 1000G.EUR/1000G.EUR.bed


work_dir=mvp_gwas
lipid=hdl
data=mash
cd ~/Documents/PRS_MASH


sos run ldpred.ipynb snp_intersect \
    --cwd $work_dir \
    --ss mvpdata/$data"_"$lipid.rds \
    --genoFiles $work_dir/1000G.EUR.$work_dir.bed UKBB_broad/ukbb_merged.5000_subset.bed -s force

INFO: Running snp_intersect_1: SNP intersect of summary stats and genotype data
INFO: snp_intersect_1 is completed.
INFO: snp_intersect_1 output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/mash_hdl.intersect.rds /home/surbut/Documents/PRS_MASH/mvp_gwas/mash_hdl.intersect.snplist
INFO: Running snp_intersect_2: 
INFO: snp_intersect_2 is completed (pending nested workflow).
INFO: Running preprocess_1: Filter SNPs and select individuals
INFO: preprocess_1 (index=0) is completed.
INFO: preprocess_1 (index=1) is completed.
INFO: preprocess_1 output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/1000G.EUR.mvp_gwas.snp_intersect.extracted.bed /home/surbut/Documents/PRS_MASH/mvp_gwas/ukbb_merged.5000_subset.snp_intersect.extracted.bed in 2 groups
INFO: Running convert PLNIK to bigsnpr format with missing data mean imputed: 
INFO: convert PLNIK to bigsnpr format with missing data mean imputed (index=0) is completed.
INFO: convert PLNIK to bigsnpr format with missing data mean imputed (index=1) is completed.
INFO: convert PLNIK to bigsnpr format with missing data mean imputed output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/1000G.EUR.mvp_gwas.snp_intersect.extracted.bk /home/surbut/Documents/PRS_MASH/mvp_gwas/1000G.EUR.mvp_gwas.snp_intersect.extracted.rds... (4 items in 2 groups)
INFO: snp_intersect_2 output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/1000G.EUR.mvp_gwas.snp_intersect.extracted.bed /home/surbut/Documents/PRS_MASH/mvp_gwas/1000G.EUR.mvp_gwas.snp_intersect.extracted.rds... (4 items)
INFO: Workflow snp_intersect (ID=w0b428252be760376) is executed successfully with 4 completed steps and 6 completed substeps.


tail -1 $work_dir/$data"_"$lipid.intersect.stdout

[1] "There are 448077 shared SNPs."


sos run ldpred.ipynb snp_match \
    --cwd $work_dir \
    --reference_geno $work_dir/1000G.EUR.$work_dir.snp_intersect.extracted.rds \
    --ss mvpdata/$data"_"$lipid.rds -s force

INFO: Running snp_match: 
INFO: snp_match is completed.
INFO: snp_match output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/mash_hdl.snp_matched.rds /home/surbut/Documents/PRS_MASH/mvp_gwas/mash_hdl.snp_matched.snplist
INFO: Workflow snp_match (ID=wcbfdc428c85a1cf7) is executed successfully with 1 completed step.


sos run ldpred.ipynb ldsc \
    --cwd $work_dir \
    --ss $work_dir/$data"_"$lipid.snp_matched.rds \
    --reference-geno $work_dir/1000G.EUR.$work_dir.snp_intersect.extracted.rds -s force

INFO: Running ldsc: 
INFO: ldsc is completed.
INFO: ldsc output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/mash_hdl.snp_matched.ld.rds
INFO: Workflow ldsc (ID=w9fd24285cc7bb514) is executed successfully with 1 completed step.


sos run ldpred.ipynb inf_prs \
    --cwd $work_dir \
    --ss $work_dir/$data"_"$lipid.snp_matched.rds \
    --target-geno $work_dir/ukbb_merged.5000_subset.snp_intersect.extracted.rds \
    --ldsc $work_dir/$data"_"$lipid.snp_matched.ld.rds -s force

INFO: Running inf_prs: 
INFO: inf_prs is completed (pending nested workflow).
INFO: Running prs_core: 
INFO: prs_core is completed.
INFO: prs_core output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/mash_hdl.snp_matched.inf_prs.rds
INFO: inf_prs output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/mash_hdl.snp_matched.inf_prs.rds
INFO: Workflow inf_prs (ID=wb216647b5431ecc9) is executed successfully with 2 completed steps.


tail -1 mvp_gwas/$data"_"$lipid.snp_matched.inf_prs.stdout

[1] "422921 SNPs are used for PRS calculations"


sos run ldpred.ipynb auto_prs \
    --cwd $work_dir \
    --ss $work_dir/$data"_"$lipid.snp_matched.rds \
    --target-geno $work_dir/ukbb_merged.5000_subset.snp_intersect.extracted.rds \
    --ldsc $work_dir/$data"_"$lipid.snp_matched.ld.rds -s force


sos run ldpred.ipynb grid_prs \
    --cwd $work_dir \
    --ss $work_dir/$data"_"$lipid.snp_matched.rds \
    --target-geno $work_dir/ukbb_merged.5000_subset.snp_intersect.extracted.rds \
    --ldsc $work_dir/$data"_"$lipid.snp_matched.ld.rds \
    --phenoFile UKBB_broad/UKB.$lipid.cov \
    --covFile UKBB_broad/UKB.ind.cov \
    --response continuous -s force


echo $lipid

hdl


sos run ldpred.ipynb pred_eval \
    --cwd $work_dir \
    --phenoFile UKBB_broad/UKB.$lipid.cov \
    --covFile UKBB_broad/UKB.ind.cov \
    --response continuous -s force

INFO: Running pred_eval: 
INFO: pred_eval is completed.
INFO: pred_eval output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/UKB.hdl.baseline.rds
INFO: Workflow pred_eval (ID=wa01fb80ead946f53) is executed successfully with 1 completed step.


setwd("~/Documents/PRS_MASH")
lipid="hdl"
res = readRDS(paste0("mvp_gwas/UKB.",lipid,".baseline.rds"))
summary(res$fitted)
res$summary

Call:
lm(formula = ., data = dat[train.ind, ])

Residuals:
     Min       1Q   Median       3Q      Max 
-0.91299 -0.23786 -0.03966  0.20001  1.94664 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.4569056  0.0389891  37.367  < 2e-16 ***
AGE          0.0028108  0.0006783   4.144 3.49e-05 ***
SEX         -0.3451011  0.0111693 -30.897  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3427 on 3785 degrees of freedom
Multiple R-squared:  0.2038,	Adjusted R-squared:  0.2033 
F-statistic: 484.3 on 2 and 3785 DF,  p-value: < 2.2e-16


sos run ldpred.ipynb pred_eval \
    --cwd $work_dir \
    --prs $work_dir/$data"_"$lipid.snp_matched.inf_prs.rds \
    --phenoFile UKBB_broad/UKB.$lipid.cov \
    --covFile UKBB_broad/UKB.ind.cov \
    --response continuous -s force

INFO: Running pred_eval: 
INFO: pred_eval is completed.
INFO: pred_eval output:   /home/surbut/Documents/PRS_MASH/mvp_gwas/UKB.hdl.mash_hdl.snp_matched.inf_prs.rds
INFO: Workflow pred_eval (ID=w92e86ff4b7721972) is executed successfully with 1 completed step.


lipid="hdl"
data="mash"
res = readRDS(paste0("mvp_gwas/UKB.",lipid,".",data,"_",lipid,".snp_matched.inf_prs.rds"))
summary(res$fitted)
res$summary

Call:
lm(formula = ., data = dat[train.ind, ])

Residuals:
     Min       1Q   Median       3Q      Max 
-0.92072 -0.22645 -0.03424  0.18333  1.80306 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.4028641  0.0369171  38.000  < 2e-16 ***
AGE          0.0022624  0.0006413   3.528 0.000424 ***
SEX         -0.3444224  0.0105510 -32.644  < 2e-16 ***
PRS         -0.8366674  0.0391098 -21.393  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3238 on 3784 degrees of freedom
Multiple R-squared:  0.2897,	Adjusted R-squared:  0.2891 
F-statistic: 514.4 on 3 and 3784 DF,  p-value: < 2.2e-16


sos run ldpred.ipynb pred_eval \
    --cwd $work_dir \
    --prs $work_dir/$data"_"$lipid.snp_matched.grid_prs.rds \
    --phenoFile ukbiobank/UKB.$lipid.cov \
    --covFile ukbiobank/UKB.ind.cov \
    --response continuous -s force


lipid="hdl"
data="gwas"
res = readRDS(paste0("mvp_gwas/UKB.",lipid,".",data,"_",lipid,".snp_matched.grid_prs.rds"))
summary(res$fitted)
res$summary


sos run ldpred.ipynb pred_eval \
    --cwd $work_dir \
    --prs $work_dir/$data"_"$lipid.snp_matched.auto_prs.rds \
    --phenoFile ukbiobank/UKB.$lipid.cov \
    --covFile ukbiobank/UKB.ind.cov \
    --response continuous -s force


lipid="hdl"
data="gwas"
res = readRDS(paste0("mvp_gwas/UKB.",lipid,".",data,"_",lipid,".snp_matched.auto_prs.rds"))
summary(res$fitted)
res$summary

model	R2	MSE
<chr>	<dbl>	<dbl>
model	0.20333	0.1299

model	R2	MSE
<chr>	<dbl>	<dbl>
model.inf_prs	0.2891	0.1132

HDL PRS example¶

Data used¶

Reference panel¶

GWAS summary statistics data¶

Target test data: UK biobank¶

PRS Models¶

Analysis of MVP GWAS data¶

Step 1: QC on reference panel¶

Step 2: Intersect SNPs among summary stats, reference panel and target data¶

Step 3: Harmonize alleles for shared SNPs¶

Step 4: Calculate LD matrix and fit LDSC model¶

Step 6: Estimate posterior effect sizes and PRS¶

Step 7: predict phenotypes¶