This R
script links together the most relevant functions for preprocessing data, running cNMTF and prioritising variants.
###############################################################
# Full executable script for cNMTF
#
# Corresponding author:
# Luis Leal, Imperial College London, email: lgl15@imperial.ac.uk
###############################################################
#-------------------------------------------------
# 1. Installation
#-------------------------------------------------
# Clean your current workspace
rm(list=ls()) ; gc()
# Load libraries
library("devtools") #For the installation of cnmtf package
library("biomaRt") #For querying SNV functional consequences from ENSEMBL
library("LDcorSV") #For Linkage disequilibrium assessment
library("doParallel") #For parallel computing
library("igraph") #For SNV-SNV network construction
library("ade4") #For Principal components analysis of patients genotypes
library("VennDiagram") #For plotting Venn diagrams of prioritised variatns
library("snpStats") #For plotting QQplots in genomic control of population structure
library("ComplexHeatmap") #For plotting heatmaps of cluster membership
library("RDAVIDWebService") #For annotating the SNVs and genes with functional data
# Installation of cnmtf package from our github repository
devtools::install_github("lgl15/cnmtf")
# Create a directory for your analysis
dir.create("./test/")
#-------------------------------------------------
# 2. Preprocessing the data
#-------------------------------------------------
# Find variants in high Linkage-disequilibrium
find.snps.ld( file.LD = "./test/fileLD.RData", #File to save results
type.ld = "gene", #Find LD within the same gene region
tmap = tmap, #Mapping of SNPs to genes, chr and genomic position
R = R) #Relationship matrix
# Create SNV-SNV network
create.network( net.type = "ppi", #Type of reference network
dedges = dedges, #Object with edges from reference network
#Parameters for Linkage Disequilibrium
remove.highLD = TRUE,
ld.tao = 0.8, #Treshold of LD
res.ld = "./test/fileLD.RData", #Table of LD
#Parameters to construct Wu
R.snps = rownames(R), #List of SNPs in R
work.dat = "./test/", #Working directory
trait.project = "test", #Trait
n.cores = 3, #Number of cores for parallel computing
tmap = tmap, #Mapping of SNPs to genes
plot.file = "Gu_ppi_test_venn.pdf" )
#-------------------------------------------------
# 3. Running cNMTF
#-------------------------------------------------
score.cnmtf( R = R, #Relationship matrix
out = out, #Categorical outcome variable
pop = pop, #Population variable
log.file = "logfile_my_experiment", #Log-file to track the performance
#Variables to Save/Load data and workspaces
name.exp = "my_experiment", #Name of experiment to save files
work.dat = "./test/", #Folder to save and load workspaces
#Number of clusters
define.k = "user", #Method to define k1
k1 = 10, #Number of clusters of SNVs
k2 = nlevels(out), #Number of clusters of patients
#Penalisation parameters
wparameters = list( gamma1 = 0.5, #Weight for the SNV network, Lu
gamma2 = 1, #Weight for the outcome matrix, Vo
gamma3 = 1), #Weight for the kernel matrix, A
save.parameters = TRUE, #Save parameters to file
run.t.par = 4, #Number of repetitions for parameters fitting
max.try0 = 4, #Maximum number of tries to fit the parameter
snps.known = NULL, #List of known SNV associated with the trait
#Variables to control performance of the algorithm
parallel.opt = TRUE, #Run some instances of the algorithm in parallel
n.cores = 3, #Number of cores to use in the parallel processing
init = 0, #Type of seeding/initialisation of matrices in the algorithm
do.U = TRUE, #Perform clustering of SNPs
calcObj = 20, #Check convergency every 20 iterations
calcObj2 = 40, #Start checking convergency after first 40 iterations
iters = 300, #Number of iterations
run.t.exp = 10, #Number of repetitions for the experiment
display.iters = FALSE, #Display the iterations of function cnmtf
#Randomisations
score.pvalues = TRUE, #Estimate p-values for the scores
random.parallel = TRUE, #Run each randomisation in parallel
randomisations = 100, #Number of randomisations
#Construction of penalisation terms
file.Gu = "./test/Gu_ppi_test.RData" #Workspace with SNV-SNV network
)
#-------------------------------------------------
# 4. Prioritising SNVs
#-------------------------------------------------
# Calculate the delta score and create Manhattan plots
analyze.cnmtf ( trait.project = "test", #Trait under analysis
name.exp = "my_experiment", #Experiment under analysis
work.dat = "./test/", #Folder with the results of score.cnmtf
alpha.cnmtf = 0.005, #Significance level of the scores
d.conf = NULL, #Optional. Confounder variables.
snps.known = NULL, #Optional. List of SNVs associated with the disease
snps.known2 = NULL, #Optional. A second list of SNVs.
tmap = tmap) #Mapping of SNPs to genes, chr and genomic position
# Map variants to genes and add functional annotations
# Your email account must be registered in the The Database for Annotation, Visualization and Integrated Discovery (DAVID ) [https://david.ncifcrf.gov/webservice/register.htm]
t.res = annotate.results( trait.project = "test", #Trait under analysis
name.exp = "my_experiment", #Experiment under analysis
work.dat = "./test/", #Working directory
add.david.annotations = TRUE, #Use DAVID web service
email.david = "email@example.com", #Email registered in DAVID.
add.ensemble.conseq = FALSE, #Add SNV consequences from ENSEMBL
tmap = tmap, #Mapping of SNPs to genes, chr and genomic position
file.LD = "./test/fileLD.RData", #Workspace with pairwise LD
ld.tao = 0.8, #Treshold of LD
snps.known = NULL, #Optional. List of known associations.
snps.known2 = NULL) #Optional. A second list of SNVs.
#Extract table of prioritised variants
(t.snvs = t.res[[1]])